Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)

RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00
commit ec82764bef
2462 changed files with 2174303 additions and 0 deletions
@@ -0,0 +1,44 @@
+# rtl/debug
+
+Trace taps, observability modules, and first-class debug infrastructure.
+
+This directory is intentionally first-class per the debug/validation strategy
+in `docs/contracts/validation.md`. Nothing here is ornamental; stubs and real
+blocks alike depend on it.
+
+## Wave 1 contents
+
+- `trace_pkg.sv` — shared types (`subsys_e`, `event_e`) and string renderers
+  used by all trace producers and `trace_sink_stub`.
+- `trace_sink_stub.sv` — simulation-only text trace writer. One instance per
+  output file; each Wave 1 stub wires its event port to its own sink. See
+  `docs/decisions/0000-trace-format.md` for format rationale.
+
+## Usage pattern for Wave 1 testbenches
+
+```systemverilog
+trace_sink_stub #(
+    .FILENAME  ("ee.trace"),
+    .SINK_LABEL("ee_fetch")
+) u_trace_ee (
+    .clk      (clk),
+    .rst_n    (rst_n),
+    .ev_valid (dut_ev_valid),
+    .ev_subsys(dut_ev_subsys),
+    .ev_event (dut_ev_event),
+    .ev_arg0  (dut_ev_arg0),
+    .ev_arg1  (dut_ev_arg1),
+    .ev_arg2  (dut_ev_arg2),
+    .ev_arg3  (dut_ev_arg3),
+    .ev_flags (dut_ev_flags)
+);
+```
+
+## Notes
+
+- Cycle counter is internal to the sink and advances on `clk` while `rst_n`
+  is high. Cross-clock correlation is a later-wave concern.
+- `ev_flags == 0` renders as `-` in the trace line; any non-zero value is
+  printed as an 8-hex-digit field.
+- Event codes share a single global namespace in Wave 1 (they are not
+  scoped per subsystem). Revisit if the namespace gets crowded.
@@ -0,0 +1,166 @@
+// retroDE_ps2 — trace package
+//
+// Defines the shared trace vocabulary used by Wave 1 stubs. Kept small on
+// purpose: subsystem IDs, event codes, and string renderers only. See
+// docs/decisions/0000-trace-format.md and docs/stub_module_plan.md.
+//
+// Contract:
+//   trace line = cycle subsystem event arg0 arg1 arg2 arg3 flags
+//
+// Event codes are globally unique (not per-subsystem) in Wave 1 to keep the
+// renderer trivial. Revisit if the namespace gets crowded.
+
+`ifndef RETRODE_PS2_TRACE_PKG_SV
+`define RETRODE_PS2_TRACE_PKG_SV
+
+`timescale 1ns/1ps
+
+package trace_pkg;
+
+    // Subsystem identifier: the "subsystem" column of a trace line.
+    // 4-bit encoding; 4'hF is the catch-all value.
+    typedef enum logic [3:0] {
+        SUBSYS_EE    = 4'h0,
+        SUBSYS_MEM   = 4'h1,
+        SUBSYS_GS    = 4'h2,
+        SUBSYS_INTC  = 4'h3,
+        SUBSYS_SIF   = 4'h4,
+        SUBSYS_DMAC  = 4'h5,
+        SUBSYS_IOP   = 4'h6,
+        SUBSYS_GIF   = 4'h7,
+        SUBSYS_PLAT  = 4'h8,
+        SUBSYS_OTHER = 4'hF
+    } subsys_e;
+
+    // Event code: the "event" column of a trace line. One flat 8-bit
+    // namespace shared by every subsystem; 8'hFF is the catch-all value.
+    typedef enum logic [7:0] {
+        // Wave 1 base set.
+        EV_RESET     = 8'h00,
+        EV_IFETCH    = 8'h01,
+        EV_READ      = 8'h02,
+        EV_WRITE     = 8'h03,
+        EV_UNMAPPED  = 8'h04,
+        EV_IRQ       = 8'h05,
+        EV_MODE      = 8'h06,
+        EV_BGCOLOR   = 8'h07,
+        // Wave 2: visibility into the DMAC / GIF / GS write path
+        // (docs/wave2_dma_gif_plan.md).
+        EV_DMA_CFG   = 8'h08,
+        EV_DMA_START = 8'h09,
+        EV_DMA_BEAT  = 8'h0A,
+        EV_DMA_DONE  = 8'h0B,
+        EV_GIFTAG    = 8'h0C,
+        EV_GS_WRITE  = 8'h0D,
+        // Ch76: "primitive complete" pulse from the GS primitive
+        // observer. gs_stub fires it when an XYZ2/XYZF2 vertex commit
+        // closes a discrete primitive (POINT / LINE / TRI / SPRITE)
+        // according to the currently-latched PRIM[2:0].
+        //   arg0 = prim_type
+        //   arg1 = vertex threshold
+        //   arg2 = cumulative primitive count after this draw
+        //   arg3 = closing vertex data
+        // Observation only; no rasterization yet.
+        EV_PRIM_DRAW = 8'h0E,
+        EV_OTHER     = 8'hFF
+    } event_e;
+
+    // ------------------------------------------------------------------
+    // Ch81: structured GIF/GS field decoders
+    //
+    // Raw 64-bit XYZ2 / XYZF2 / RGBAQ payloads are an awkward contract
+    // for the next pipeline layer (rasterizer or pixel emit). The
+    // struct types below carry the same data already split into
+    // channel components, so a consumer does not re-derive bit slices.
+    //
+    // Payload layouts (XYZ2 / XYZF2 per PCSX2 GSRegs.h):
+    //   XYZ2  : [15:0]=X  [31:16]=Y  [63:32]=Z (32-bit)
+    //   XYZF2 : [15:0]=X  [31:16]=Y  [55:32]=Z (24-bit)  [63:56]=F (fog)
+    //   RGBAQ : [7:0]=R   [15:8]=G   [23:16]=B  [31:24]=A  [63:32]=Q
+    //
+    // X and Y are PS2 12.4 fixed-point screen coordinates: the top 12
+    // bits are the integer pixel, the low 4 bits the sub-pixel. Z is
+    // opaque to this package, because depth interpretation depends on
+    // GS framebuffer/zbuffer configuration that the recognition layer
+    // does not model. Q is the texture-coordinate divisor (IEEE
+    // single-precision float) and is carried through verbatim.
+    //
+    // is_xyzf2 records which source format was decoded, so a consumer
+    // can tell the 24-bit-Z + fog packing from the full 32-bit-Z
+    // packing without re-reading the original register number.
+    // ------------------------------------------------------------------
+
+    // Decoded vertex. Packed MSB-first: {is_xyzf2, fog, z, y, x}.
+    typedef struct packed {
+        logic        is_xyzf2;   // 1 = decoded from XYZF2, 0 = from XYZ2
+        logic [7:0]  fog;        // fog byte; 0 unless is_xyzf2
+        logic [31:0] z;          // XYZ2: full 32 bits; XYZF2: 24 bits zero-extended
+        logic [15:0] y;          // screen Y, 12.4 fixed-point
+        logic [15:0] x;          // screen X, 12.4 fixed-point
+    } vertex_t;
+
+    // Decoded RGBAQ colour. Packed MSB-first: {q, a, b, g, r}.
+    typedef struct packed {
+        logic [31:0] q;          // texture-coordinate divisor (IEEE float bits)
+        logic [7:0]  a;
+        logic [7:0]  b;
+        logic [7:0]  g;
+        logic [7:0]  r;
+    } color_t;
+
+    // Unpack a raw 64-bit XYZ2 / XYZF2 payload into a vertex_t.
+    //   data     : raw register payload
+    //   is_xyzf2 : 1 selects the XYZF2 layout (24-bit Z + fog byte),
+    //              anything else selects the XYZ2 layout (32-bit Z)
+    function automatic vertex_t decode_vertex(input logic [63:0] data,
+                                              input logic        is_xyzf2);
+        vertex_t vtx;
+
+        // Start from the XYZ2 interpretation ...
+        vtx.is_xyzf2 = is_xyzf2;
+        vtx.fog      = 8'd0;
+        vtx.z        = data[63:32];
+        vtx.y        = data[31:16];
+        vtx.x        = data[15:0];
+
+        // ... and re-slice the upper word when the source is XYZF2.
+        if (is_xyzf2) begin
+            vtx.fog = data[63:56];
+            vtx.z   = {8'd0, data[55:32]};   // 24-bit Z, zero-extended
+        end
+
+        return vtx;
+    endfunction
+
+    // Unpack a raw 64-bit RGBAQ payload into a color_t.
+    function automatic color_t decode_color(input logic [63:0] data);
+        color_t rgbaq;
+        rgbaq.q = data[63:32];
+        rgbaq.a = data[31:24];
+        rgbaq.b = data[23:16];
+        rgbaq.g = data[15:8];
+        rgbaq.r = data[7:0];
+        return rgbaq;
+    endfunction
+
+    // Render a subsystem ID as its trace-column text. Any value without
+    // its own entry (including SUBSYS_OTHER) renders as "OTHER".
+    function automatic string subsys_str(input subsys_e s);
+        string text;
+        case (s)
+            SUBSYS_EE:   text = "EE";
+            SUBSYS_MEM:  text = "MEM";
+            SUBSYS_GS:   text = "GS";
+            SUBSYS_INTC: text = "INTC";
+            SUBSYS_SIF:  text = "SIF";
+            SUBSYS_DMAC: text = "DMAC";
+            SUBSYS_IOP:  text = "IOP";
+            SUBSYS_GIF:  text = "GIF";
+            SUBSYS_PLAT: text = "PLAT";
+            default:     text = "OTHER";
+        endcase
+        return text;
+    endfunction
+
+    // Render an event code as its trace-column text. Any value without
+    // its own entry (including EV_OTHER) renders as "OTHER".
+    function automatic string event_str(input event_e e);
+        string text;
+        case (e)
+            EV_RESET:     text = "RESET";
+            EV_IFETCH:    text = "IFETCH";
+            EV_READ:      text = "READ";
+            EV_WRITE:     text = "WRITE";
+            EV_UNMAPPED:  text = "UNMAPPED";
+            EV_IRQ:       text = "IRQ";
+            EV_MODE:      text = "MODE";
+            EV_BGCOLOR:   text = "BGCOLOR";
+            EV_DMA_CFG:   text = "DMA_CFG";
+            EV_DMA_START: text = "DMA_START";
+            EV_DMA_BEAT:  text = "DMA_BEAT";
+            EV_DMA_DONE:  text = "DMA_DONE";
+            EV_GIFTAG:    text = "GIFTAG";
+            EV_GS_WRITE:  text = "GS_WRITE";
+            EV_PRIM_DRAW: text = "PRIM_DRAW";
+            default:      text = "OTHER";
+        endcase
+        return text;
+    endfunction
+
+endpackage : trace_pkg
+
+`endif // RETRODE_PS2_TRACE_PKG_SV
@@ -0,0 +1,88 @@
+// retroDE_ps2 — trace_sink_stub
+//
+// Simulation-only text trace writer for Wave 1 stubs.
+//
+// Purpose, owns, success condition, replacement path: see
+//   docs/stub_module_plan.md  (Wave 1, item 1)
+//   docs/contracts/validation.md
+//   docs/decisions/0000-trace-format.md
+//
+// Interface shape:
+//   - One sink instance per output file. A testbench instantiates multiple
+//     sinks (one per stub under test) and wires each stub's event port to
+//     its own sink. Offline tooling merges files by cycle when needed.
+//   - The cycle counter is internal and advances on clk while rst_n is high.
+//     Multi-clock-domain correlation is a later-wave concern.
+//
+// Line format (docs/decisions/0000):
+//     cycle  subsys  event  arg0  arg1  arg2  arg3  flags
+//   where flags is rendered as `-` when zero.
+
+`timescale 1ns/1ps
+
+module trace_sink_stub
+    import trace_pkg::*;
+#(
+    parameter string FILENAME       = "trace.txt",   // output file, opened with mode "w"
+    parameter int    SCHEMA_VERSION = 1,             // printed in the file header line
+    parameter string SINK_LABEL     = "trace"        // printed in the header and in the open-failure message
+) (
+    input  logic          clk,
+    input  logic          rst_n,      // active-low; while low the cycle counter is held at 0 and no lines are written
+
+    // Event port: one trace line is written for every clk edge on which
+    // ev_valid is high (and rst_n is high).
+    input  logic          ev_valid,
+    input  subsys_e       ev_subsys,
+    input  event_e        ev_event,
+    input  logic [63:0]   ev_arg0,
+    input  logic [63:0]   ev_arg1,
+    input  logic [63:0]   ev_arg2,
+    input  logic [63:0]   ev_arg3,
+    input  logic [31:0]   ev_flags
+);
+
+    integer          fd;
+
+    // Owned exclusively by the always_ff below. A variable assigned in an
+    // always_ff may not be written by any other process, so there is no
+    // initial-block assignment here; `longint unsigned` is a 2-state type
+    // and therefore already starts at 0.
+    longint unsigned cycle_count;
+
+    // Open the trace file and emit the two '#' header lines. Failing to
+    // open the file is fatal: a silently missing trace is worse than a
+    // dead simulation.
+    initial begin
+        fd = $fopen(FILENAME, "w");
+        if (fd == 0) begin
+            $fatal(1, "[trace_sink_stub %0s] cannot open %0s", SINK_LABEL, FILENAME);
+        end
+        $fdisplay(fd, "# retroDE_ps2 trace, schema v%0d, sink=%0s",
+                  SCHEMA_VERSION, SINK_LABEL);
+        $fdisplay(fd, "# columns: cycle subsystem event arg0 arg1 arg2 arg3 flags");
+    end
+
+    // Cycle counter + line writer. The cycle column is the counter value
+    // seen on the edge that samples the event (pre-increment value).
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            cycle_count <= 64'd0;
+        end else begin
+            cycle_count <= cycle_count + 64'd1;
+
+            if (ev_valid) begin
+                if (ev_flags == 32'd0) begin
+                    // Zero flags are rendered as a bare '-' column.
+                    $fdisplay(fd,
+                              "%0d %0s %0s 0x%016h 0x%016h 0x%016h 0x%016h -",
+                              cycle_count,
+                              subsys_str(ev_subsys),
+                              event_str(ev_event),
+                              ev_arg0, ev_arg1, ev_arg2, ev_arg3);
+                end else begin
+                    // Any other flags value is rendered as 0x + 8 hex digits.
+                    $fdisplay(fd,
+                              "%0d %0s %0s 0x%016h 0x%016h 0x%016h 0x%016h 0x%08h",
+                              cycle_count,
+                              subsys_str(ev_subsys),
+                              event_str(ev_event),
+                              ev_arg0, ev_arg1, ev_arg2, ev_arg3,
+                              ev_flags);
+                end
+            end
+        end
+    end
+
+    // Close the file at end of simulation if it was opened.
+    final begin
+        if (fd != 0) $fclose(fd);
+    end
+
+endmodule : trace_sink_stub
@@ -0,0 +1,40 @@
+# rtl/dmac
+
+EE DMAC. Matches `docs/contracts/dmac.md`.
+
+## Wave 2 / Wave 2.5 contents
+
+- `dmac_reg_stub.sv` — channel-2-focused register shell + single-transfer
+  state machine. Wave 2.5 revision is memory-backed: DMAC now issues real
+  memory reads via the `mem_rd_*` port (connected directly to
+  `ee_ram_stub` in the current topology; routing through
+  `ee_memory_map_stub` is deferred). State flow: IDLE → FETCH_WAIT →
+  ACTIVE_SEND → DONE. MADR is the real fetch source address.
+  See `docs/wave25_memory_backed_dma_plan.md`.
+
+  **EE-core chapter 3** added a CPU write path: the EE memory map's
+  new `ee_dmac_ch2_wr_*` port drives `reg_wr_en` / `reg_offset` /
+  `reg_wr_data`, so the EE core can program MADR/QWC/CHCR from a
+  MIPS bootstrap via `SW`.
+
+  **EE-core chapter 4** added a CPU read path (`reg_rd_en` /
+  `reg_rd_data` / `reg_rd_valid`, 1-cycle latency) plus a DONE_COUNT
+  monotonic counter at offset 0x40. CHCR/MADR/QWC/TADR read back
+  their stored values; DONE_COUNT increments each time the state
+  machine enters S_DONE. The EE map forwards CPU reads in the same
+  DMAC window through a new `ee_dmac_ch2_rd_*` pair, so software
+  can now poll CHCR.start or compare DONE_COUNT before/after a
+  transfer without needing INTC.
+
+## Explicit non-goals (Wave 2 / 2.5)
+
+- Multi-channel arbitration or fairness.
+- Chain mode (normal / chain / interleaved transfer modes).
+- Stall / ring / suspend semantics.
+- Interrupt routing to INTC.
+- QWC > 1 multi-beat transfers (state machine is shaped for it; initial
+  signoff is QWC == 1 per Wave 2.5 plan).
+- Routing through `ee_memory_map_stub` (current topology is direct to
+  `ee_ram_stub`).
+
+Each of these is a future-wave concern, not a stub-plan shortcut.
@@ -0,0 +1,355 @@
+// retroDE_ps2 — dmac_reg_stub
+//
+// EE DMAC stub. Channel-agnostic: the module's behaviour is generic across
+// PS2 DMA channels and downstream endpoints. The specific channel and path
+// id are set via parameters; the downstream endpoint wires (ep_*) are
+// valid/data/last/ready regardless of what consumer is connected. Current
+// uses: CHANNEL=2 (GIF path), CHANNEL=5 (SIF0 path).
+//
+// Payload source: memory-backed via the `mem_rd_*` master port, typically
+// routed through `ee_memory_map_stub` to `ee_ram_stub`. MADR is the real
+// fetch source address.
+//
+// Contract refs:
+//   docs/stub_module_plan.md             (Wave 2, item 8)
+//   docs/wave2_dma_gif_plan.md           (Wave 2 scope)
+//   docs/wave25_memory_backed_dma_plan.md (Wave 2.5 scope — THIS REVISION)
+//   docs/contracts/dmac.md
+//
+// Register surface (single channel, selected by CHANNEL parameter):
+//   offset 0x00  CHCR  — start bit at [0], other bits recorded
+//   offset 0x10  MADR  — real fetch source address (Wave 2.5)
+//   offset 0x20  QWC   — transfer length in 128-bit qwords (first sign-off
+//                        path requires QWC == 1; state machine is QWC-
+//                        generic for a future Wave 2.6 extension)
+//   offset 0x30  TADR  — recorded for future chain-mode use
+//   offset 0x40  DONE_COUNT — monotonic completion counter (read-only;
+//                        writes are accepted but ignored). Software reads
+//                        this to distinguish "nth completion" without
+//                        counting interrupts externally. EE-core chapter 4
+//                        addition; mirrors iop_dmac_reg_stub's DONE_COUNT
+//                        but at a new slot (0x0C is occupied on the IOP
+//                        stub; EE stub's 16-byte register spacing puts
+//                        DONE_COUNT at 0x40).
+//
+// Register reads (EE-core chapter 4, added alongside the original write
+// surface): reg_rd_en / reg_rd_data / reg_rd_valid with 1-cycle latency,
+// matching the rest of the stub ecosystem. All four config registers plus
+// DONE_COUNT are readable; all other offsets return 0.
+//
+// Memory master interface (to ee_ram_stub in Wave 2.5):
+//   mem_rd_en / mem_rd_addr drive the request
+//   mem_rd_valid / mem_rd_data return data one cycle later
+//
+// Downstream endpoint: ep_{valid,data,last,ready}. The port names are
+// channel-agnostic because the DMAC's behaviour is generic across PS2
+// channels (ch2 = GIF, ch5 = SIF0, etc.). Connect the endpoint side to
+// whichever consumer matches the instantiated CHANNEL/PATH_ID.
+//
+// State machine:
+//   IDLE         → FETCH_WAIT on CHCR start
+//   FETCH_WAIT   → ACTIVE_SEND on mem_rd_valid (data latched)
+//   ACTIVE_SEND  → FETCH_WAIT on endpoint accept with more beats pending
+//                → DONE on endpoint accept for the final beat
+//   DONE         → IDLE next cycle (clears CHCR.start)
+//
+// Trace payload schemas (per wave25_memory_backed_dma_plan.md):
+//   DMAC DMA_CFG   arg0=channel arg1=chcr  arg2=madr arg3=qwc
+//                  flags=reg_offset (which reg was written)
+//   DMAC DMA_START arg0=channel arg1=qwc   arg2=MADR arg3=path_id
+//   DMAC DMA_BEAT  arg0=channel arg1=beat  arg2=src_addr arg3=remaining
+//   DMAC DMA_DONE  arg0=channel arg1=beats arg2=completion arg3=path_id
+//     completion code: 0 = OK
+
+`timescale 1ns/1ps
+
+module dmac_reg_stub
+    import trace_pkg::*;
+#(
+    parameter logic [3:0] CHANNEL = 4'd2,   // reported as arg0 of every trace event
+    parameter logic [3:0] PATH_ID = 4'd2    // reported as arg3 of DMA_START / DMA_DONE
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // CPU / testbench register write port (single-channel, see CHANNEL).
+    // reg_offset is shared by read and write; callers must not assert both
+    // enables in the same cycle (the map ensures this because the EE CPU
+    // emits either rd or wr per transaction, never both).
+    input  logic          reg_wr_en,
+    input  logic [7:0]    reg_offset,
+    input  logic [31:0]   reg_wr_data,
+
+    // Register read port (EE-core chapter 4). reg_rd_valid is reg_rd_en
+    // delayed by one clock; reg_rd_data is updated on the same edge.
+    input  logic          reg_rd_en,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Memory read master. mem_rd_en pulses for one cycle together with
+    // mem_rd_addr on entry to S_FETCH_WAIT; the FSM then waits for
+    // mem_rd_valid and latches mem_rd_data as the beat payload.
+    output logic          mem_rd_en,
+    output logic [31:0]   mem_rd_addr,
+    input  logic [127:0]  mem_rd_data,
+    input  logic          mem_rd_valid,
+
+    // Downstream endpoint (channel-agnostic valid/data/last/ready). A beat
+    // is transferred on a cycle where ep_valid && ep_ready.
+    output logic          ep_valid,
+    output logic [127:0]  ep_data,
+    output logic          ep_last,
+    input  logic          ep_ready,
+
+    // Completion pulse — one cycle high when the transfer reaches S_DONE.
+    // Intended as an INTC source; level-held bit latching happens in the
+    // interrupt controller, not here.
+    output logic          irq_completion_o,
+
+    // Trace event port (registered; at most one event per cycle).
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam logic [7:0] CHCR_OFFSET       = 8'h00;
+    localparam logic [7:0] MADR_OFFSET       = 8'h10;
+    localparam logic [7:0] QWC_OFFSET        = 8'h20;
+    localparam logic [7:0] TADR_OFFSET       = 8'h30;
+    localparam logic [7:0] DONE_COUNT_OFFSET = 8'h40;
+
+    // ------------------------------------------------------------------
+    // Transfer state — type and register
+    //
+    // Declared ahead of the register file because the CHCR auto-clear
+    // and DONE_COUNT logic below both read `state` / `S_DONE`, and
+    // SystemVerilog requires a variable or type to be declared before
+    // its first use in the scope.
+    // ------------------------------------------------------------------
+
+    typedef enum logic [1:0] {
+        S_IDLE        = 2'd0,
+        S_FETCH_WAIT  = 2'd1,
+        S_ACTIVE_SEND = 2'd2,
+        S_DONE        = 2'd3
+    } state_e;
+
+    state_e        state;
+
+    // ------------------------------------------------------------------
+    // Register file (one channel, identified by CHANNEL)
+    // ------------------------------------------------------------------
+
+    logic [31:0] chcr;
+    logic [31:0] madr;
+    logic [31:0] qwc;
+    logic [31:0] tadr;
+    logic [31:0] done_count;
+
+    // A transfer is requested by a CHCR write that sets bit 0 while the
+    // stored bit 0 is still clear (0 -> 1 edge as seen by software).
+    logic        start_pulse;
+    assign start_pulse = reg_wr_en && (reg_offset == CHCR_OFFSET) &&
+                         reg_wr_data[0] && !chcr[0];
+
+    // Single owner for the config regs: software writes win over the
+    // S_DONE auto-clear on CHCR[0] in the unlikely same-cycle case
+    // (the NBA queue lets the case-statement full-width assign
+    // override the partial bit-0 clear). Software writing CHCR while
+    // the DMA is completing is not part of any sane flow, so this
+    // ordering is defensive — the point is: chcr has one procedural
+    // driver, not two.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            chcr <= 32'd0;
+            madr <= 32'd0;
+            qwc  <= 32'd0;
+            tadr <= 32'd0;
+        end else begin
+            if (state == S_DONE) chcr[0] <= 1'b0;
+            if (reg_wr_en) begin
+                case (reg_offset)
+                    CHCR_OFFSET: chcr <= reg_wr_data;
+                    MADR_OFFSET: madr <= reg_wr_data;
+                    QWC_OFFSET:  qwc  <= reg_wr_data;
+                    TADR_OFFSET: tadr <= reg_wr_data;
+                    default: ;   // includes DONE_COUNT_OFFSET: write dropped
+                endcase
+            end
+        end
+    end
+
+    // DONE_COUNT: monotonic completion counter. Increments once per
+    // cycle spent in S_DONE (S_DONE lasts exactly one cycle). Reset-only
+    // clear path; writes at the DONE_COUNT offset are silently dropped
+    // by the write always_ff above (read-only register).
+    always_ff @(posedge clk) begin
+        if (!rst_n)                 done_count <= 32'd0;
+        else if (state == S_DONE)   done_count <= done_count + 32'd1;
+    end
+
+    // Register read (1-cycle latency, matches rest of stub ecosystem).
+    // reg_rd_data holds its previous value on cycles without a read;
+    // offsets without a register read back as 0.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) begin
+                case (reg_offset)
+                    CHCR_OFFSET:       reg_rd_data <= chcr;
+                    MADR_OFFSET:       reg_rd_data <= madr;
+                    QWC_OFFSET:        reg_rd_data <= qwc;
+                    TADR_OFFSET:       reg_rd_data <= tadr;
+                    DONE_COUNT_OFFSET: reg_rd_data <= done_count;
+                    default:           reg_rd_data <= 32'd0;
+                endcase
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Transfer state machine
+    // ------------------------------------------------------------------
+
+    logic [31:0]   madr_latched;   // MADR snapshot taken at start
+    logic [31:0]   qwc_latched;    // QWC snapshot taken at start
+    logic [31:0]   beat_index;     // 0-based index of the beat in flight
+    logic [127:0]  beat_payload;   // qword captured from mem_rd_data
+
+    logic [31:0] src_addr;
+    assign src_addr = madr_latched + (beat_index << 4);   // beat * 16 bytes
+
+    logic beat_accepted;
+    assign beat_accepted = ep_valid && ep_ready;
+
+    // Pulse mem_rd_en for one cycle whenever we first enter FETCH_WAIT.
+    logic prev_state_fw;
+    always_ff @(posedge clk) begin
+        if (!rst_n) prev_state_fw <= 1'b0;
+        else        prev_state_fw <= (state == S_FETCH_WAIT);
+    end
+    logic entering_fw;
+    assign entering_fw = (state == S_FETCH_WAIT) && !prev_state_fw;
+
+    assign mem_rd_en   = entering_fw;
+    assign mem_rd_addr = src_addr;
+
+    // Drive endpoint only in ACTIVE_SEND with the latched payload.
+    assign ep_valid = (state == S_ACTIVE_SEND);
+    assign ep_data  = beat_payload;
+    assign ep_last  = (state == S_ACTIVE_SEND) &&
+                           (beat_index + 32'd1 == qwc_latched);
+
+    assign irq_completion_o = (state == S_DONE);
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state        <= S_IDLE;
+            madr_latched <= 32'd0;
+            qwc_latched  <= 32'd0;
+            beat_index   <= 32'd0;
+            beat_payload <= 128'd0;
+        end else begin
+            unique case (state)
+                S_IDLE: begin
+                    if (start_pulse) begin
+                        // start_pulse is gated by reg_wr_en && reg_offset ==
+                        // CHCR_OFFSET, so a same-cycle QWC write is
+                        // structurally impossible through this interface.
+                        // Latch the currently-visible register state.
+                        //
+                        // NOTE(review): a latched QWC of 0 is not guarded.
+                        // The final-beat compare below
+                        // (beat_index + 1 == qwc_latched) would then not
+                        // match until beat_index reaches 32'hFFFF_FFFF.
+                        // Sign-off is restricted to QWC == 1, so this is
+                        // left as-is; confirm intended QWC == 0 behaviour
+                        // before widening the supported range.
+                        state        <= S_FETCH_WAIT;
+                        madr_latched <= madr;
+                        qwc_latched  <= qwc;
+                        beat_index   <= 32'd0;
+                    end
+                end
+
+                S_FETCH_WAIT: begin
+                    // Hold here until the memory side returns the qword.
+                    if (mem_rd_valid) begin
+                        beat_payload <= mem_rd_data;
+                        state        <= S_ACTIVE_SEND;
+                    end
+                end
+
+                S_ACTIVE_SEND: begin
+                    if (beat_accepted) begin
+                        if (beat_index + 32'd1 == qwc_latched) begin
+                            // Final beat accepted.
+                            state <= S_DONE;
+                        end else begin
+                            // More beats pending: advance and refetch.
+                            beat_index <= beat_index + 32'd1;
+                            state      <= S_FETCH_WAIT;
+                        end
+                    end
+                end
+
+                S_DONE: begin
+                    state <= S_IDLE;
+                    // chcr[0] auto-clear on S_DONE lives in the
+                    // register-ownership always_ff above (single
+                    // procedural driver for chcr).
+                end
+
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace emission — one event per cycle; priority:
+    //   DONE pulse > BEAT accept > START on transition > CFG on write
+    // A lower-priority event that coincides with a higher-priority one
+    // is not emitted.
+    // ------------------------------------------------------------------
+
+    logic prev_state_fetch_or_later;
+    always_ff @(posedge clk) begin
+        if (!rst_n) prev_state_fetch_or_later <= 1'b0;
+        else        prev_state_fetch_or_later <= (state != S_IDLE);
+    end
+
+    logic enter_start;   // transitioning from IDLE into the transfer
+    assign enter_start = (state == S_FETCH_WAIT) && !prev_state_fetch_or_later;
+
+    logic enter_done;    // S_DONE is a single-cycle state, so level == pulse
+    assign enter_done = (state == S_DONE);
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_CFG;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (enter_done) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_DONE;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, beat_index + 32'd1};   // beats completed
+            ev_arg2   <= 64'd0;                         // completion: OK
+            ev_arg3   <= {60'd0, PATH_ID};
+            ev_flags  <= 32'd0;
+        end else if (beat_accepted) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_BEAT;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, beat_index};
+            ev_arg2   <= {32'd0, src_addr};             // this beat's source
+            ev_arg3   <= {32'd0, qwc_latched - beat_index - 32'd1};   // beats remaining
+            ev_flags  <= 32'd0;
+        end else if (enter_start) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_START;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, qwc_latched};
+            ev_arg2   <= {32'd0, madr_latched};         // MADR is the source
+            ev_arg3   <= {60'd0, PATH_ID};
+            ev_flags  <= 32'd0;
+        end else if (reg_wr_en) begin
+            // Report the post-write view: the register being written shows
+            // the incoming data, the others show their stored values.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_CFG;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, (reg_offset == CHCR_OFFSET) ? reg_wr_data : chcr};
+            ev_arg2   <= {32'd0, (reg_offset == MADR_OFFSET) ? reg_wr_data : madr};
+            ev_arg3   <= {32'd0, (reg_offset == QWC_OFFSET)  ? reg_wr_data : qwc};
+            ev_flags  <= {24'd0, reg_offset};
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : dmac_reg_stub
@@ -0,0 +1,177 @@
+// retroDE_ps2 — ee_dmac_ctrl_stub
+//
+// Ch287 — EE DMAC global control/status registers at
+// 0x1000_E000..0x1000_E0FF (256 bytes). NOT the per-channel registers
+// (those live in dmac_reg_stub at 0x1000_A000+ for channel 2; per-
+// channel registers for other channels are not modelled yet).
+//
+// Surface modelled here (R5900 DMAC global):
+//   offset 0x00  D_CTRL  — DMAC enable / cycle-stealing / RELE / etc.
+//                          Latched write, read returns last-written.
+//   offset 0x10  D_STAT  — Per-channel interrupt status (CIS) + per-
+//                          channel interrupt mask (CIM) + stall / MEIS.
+//                          Read returns current latch (reset = 0 = no
+//                          pending interrupts). Writes are W1C against
+//                          the CIS/MEIS half (bits where write_data has
+//                          a 1 are cleared); CIM half is NOT W1C — bits
+//                          are unconditionally written. Real R5900
+//                          splits the word: bits[15:0] = CIS (W1C), bits
+//                          [31:16] = CIM (write). With nothing in the
+//                          stub yet setting bits, qbert sees "no
+//                          interrupts pending" on every read, which is
+//                          exactly the wait-for-quiet pattern its init
+//                          loop polls for.
+//   offset 0x20  D_PCR   — Per-channel priority + W1C enables. Latched
+//                          write, read returns last-written.
+//   offset 0x30  D_SQWC  — Stall/skip cycles. Latched.
+//   offset 0x40  D_RBSR  — Ring-buffer size. Latched.
+//   offset 0x50  D_RBOR  — Ring-buffer base. Latched.
+//   any other offset      — write traced + dropped; read returns 0.
+//
+// Codex framing: "If the hot PC is truly a D_STAT poll, read-as-zero
+// may or may not be the right 'ready' value. Let the next run tell us.
+// If it still loops, the next chapter should decode the branch
+// condition and choose the exact D_STAT bit semantics, not guess the
+// whole region." The implementation honors that — every offset has
+// minimal-sufficient behavior; future chapters can refine specific
+// bits once a real ELF surfaces a divergence.
+//
+// Port interface mirrors the dmac_reg_stub / intc_stub conventions:
+//   reg_wr_en / reg_offset / reg_wr_data : write port
+//   reg_rd_en / reg_offset / reg_rd_data / reg_rd_valid : read port,
+//                                                          1-cycle latency
+//   trace_pkg::* : ev_* events tagged SUBSYS_DMAC + EV_READ/EV_WRITE
+//                  with arg0 = offset, arg1 = data.
+
+`timescale 1ns/1ps
+
+module ee_dmac_ctrl_stub
+    import trace_pkg::*;
+(
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Write port (single-cycle, shared offset with read).
+    input  logic          reg_wr_en,
+    input  logic [7:0]    reg_offset,
+    input  logic [31:0]   reg_wr_data,
+
+    // Read port (1-cycle latency).
+    input  logic          reg_rd_en,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Trace: one registered event per accepted access.
+    //   ev_arg0  = register offset
+    //   ev_arg1  = data (value written, or value returned by the read)
+    //   ev_flags = 1 for writes, 0 for reads
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam logic [7:0] D_CTRL_OFFSET = 8'h00;
+    localparam logic [7:0] D_STAT_OFFSET = 8'h10;
+    localparam logic [7:0] D_PCR_OFFSET  = 8'h20;
+    localparam logic [7:0] D_SQWC_OFFSET = 8'h30;
+    localparam logic [7:0] D_RBSR_OFFSET = 8'h40;
+    localparam logic [7:0] D_RBOR_OFFSET = 8'h50;
+
+    // ------------------------------------------------------------------
+    // Register file
+    // ------------------------------------------------------------------
+    logic [31:0] d_ctrl;
+    logic [31:0] d_stat;     // CIS in low half (W1C), CIM in high half (W)
+    logic [31:0] d_pcr;
+    logic [31:0] d_sqwc;
+    logic [31:0] d_rbsr;
+    logic [31:0] d_rbor;
+
+    // Write path. Synchronous reset clears every register; writes are
+    // decoded on an exact offset match, anything else is dropped (the
+    // trace block below still records the attempt).
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            d_ctrl <= 32'd0;
+            d_stat <= 32'd0;
+            d_pcr  <= 32'd0;
+            d_sqwc <= 32'd0;
+            d_rbsr <= 32'd0;
+            d_rbor <= 32'd0;
+        end else if (reg_wr_en) begin
+            unique case (reg_offset)
+                D_CTRL_OFFSET: d_ctrl <= reg_wr_data;
+                D_STAT_OFFSET: begin
+                    // W1C on the low half (interrupt-status bits): a 1
+                    // in reg_wr_data clears that bit; a 0 leaves it.
+                    // Direct-write on the high half (mask bits).
+                    // NOTE(review): public EE DMAC references describe
+                    // the CIM half as "write 1 to reverse" (toggle)
+                    // rather than direct-write — confirm before any
+                    // software path depends on mask read-back.
+                    d_stat[15:0]  <= d_stat[15:0]  & ~reg_wr_data[15:0];
+                    d_stat[31:16] <= reg_wr_data[31:16];
+                end
+                D_PCR_OFFSET:  d_pcr  <= reg_wr_data;
+                D_SQWC_OFFSET: d_sqwc <= reg_wr_data;
+                D_RBSR_OFFSET: d_rbsr <= reg_wr_data;
+                D_RBOR_OFFSET: d_rbor <= reg_wr_data;
+                default: ;   // unknown offsets: write dropped (traced)
+            endcase
+        end
+    end
+
+    // Combinational read mux over the current (pre-edge) register
+    // values. Shared by the registered read port and by the read trace
+    // so the traced value is exactly the value handed back on
+    // reg_rd_data. Unknown offsets read as 0.
+    logic [31:0] rd_mux;
+    always_comb begin
+        unique case (reg_offset)
+            D_CTRL_OFFSET: rd_mux = d_ctrl;
+            D_STAT_OFFSET: rd_mux = d_stat;
+            D_PCR_OFFSET:  rd_mux = d_pcr;
+            D_SQWC_OFFSET: rd_mux = d_sqwc;
+            D_RBSR_OFFSET: rd_mux = d_rbsr;
+            D_RBOR_OFFSET: rd_mux = d_rbor;
+            default:       rd_mux = 32'd0;
+        endcase
+    end
+
+    // Registered read port (1-cycle latency to match the stub
+    // ecosystem). reg_rd_data holds its previous value when no read is
+    // requested.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) begin
+                reg_rd_data <= rd_mux;
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace — one event per cycle, write priority over read (consistent
+    // with the rest of the stub ecosystem). Read events carry the value
+    // being returned in arg1, honouring the "arg1 = data" contract in
+    // the file header for both directions.
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (reg_wr_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= {56'd0, reg_offset};
+            ev_arg1   <= {32'd0, reg_wr_data};
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'h0000_0001;   // write
+        end else if (reg_rd_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_READ;
+            ev_arg0   <= {56'd0, reg_offset};
+            ev_arg1   <= {32'd0, rd_mux};  // value returned on reg_rd_data
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else begin
+            // Idle cycle: drop valid, leave the payload fields as-is.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : ee_dmac_ctrl_stub
@@ -0,0 +1,181 @@
+// retroDE_ps2 — ee_dmac_passive_chan_stub
+//
+// Ch288 — Lightweight per-channel register surface for the EE DMAC
+// channels NOT covered by a dedicated transfer-FSM stub. Hosts the
+// four standard per-channel registers (CHCR/MADR/QWC/TADR) for each
+// covered channel; reset to zero, writes latch, reads return the
+// latched value. NO transfer FSM, NO start-bit side effects, NO
+// D_STAT interaction. This is the "init-time channel-clear / quiet
+// register surface" Codex framed for Ch288.
+//
+// Channels covered (4 KiB window each, starting at 0x1000_8000):
+//   ch0 (VIF0)     0x1000_8000-0x1000_8FFF
+//   ch1 (VIF1)     0x1000_9000-0x1000_9FFF
+//   ch3 (IPU_FROM) 0x1000_B000-0x1000_BFFF
+//   ch4 (IPU_TO)   0x1000_C000-0x1000_CFFF   ← qbert's first hit
+//   ch5 (SIF0)     0x1000_D000-0x1000_DFFF
+//
+// SKIPPED:
+//   ch2 (GIF)      0x1000_A000-0x1000_AFFF — already routed
+//                  externally to dmac_reg_stub via the map's
+//                  ee_dmac_ch2_* ports. Do NOT shadow it here.
+//
+// Channel index extracted from chan_addr[15:12]:
+//   0x8 → ch0, 0x9 → ch1, 0xB → ch3, 0xC → ch4, 0xD → ch5
+//   (0xA / ch2 is filtered by the caller; if chan_addr[15:12]==0xA
+//    arrives here the module silently drops it.)
+//
+// Register offsets (chan_addr[11:0], matches dmac_reg_stub layout):
+//   0x00 CHCR  — control (start bit at [0]); latched, no FSM
+//   0x10 MADR  — main address
+//   0x20 QWC   — quadword count
+//   0x30 TADR  — tag address
+//   any other offset: read = 0, write dropped + traced
+
+`timescale 1ns/1ps
+
+module ee_dmac_passive_chan_stub
+    import trace_pkg::*;
+(
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Write port. chan_addr is the 16-bit offset into the entire
+    // 0x1000_8000-base window: chan_addr[15:12] = channel selector,
+    // chan_addr[11:0] = register offset within that channel.
+    input  logic          reg_wr_en,
+    input  logic [15:0]   chan_addr,
+    input  logic [31:0]   reg_wr_data,
+
+    // Read port (1-cycle latency). Shares chan_addr with the write port.
+    input  logic          reg_rd_en,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Trace: one registered event per access.
+    //   ev_arg0  = chan_nibble (chan_addr[15:12])
+    //   ev_arg1  = data (value written, or value returned by the read)
+    //   ev_arg2  = register offset (chan_addr[11:0])
+    //   ev_arg3  = packed channel index (0..4)
+    //   ev_flags = chan_valid in bit 0
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam logic [11:0] CHCR_OFFSET = 12'h000;
+    localparam logic [11:0] MADR_OFFSET = 12'h010;
+    localparam logic [11:0] QWC_OFFSET  = 12'h020;
+    localparam logic [11:0] TADR_OFFSET = 12'h030;
+
+    // Channel index from the high nibble of chan_addr. Out-of-range
+    // nibbles (anything outside 0x8/0x9/0xB/0xC/0xD) get
+    // chan_valid=0 and the access is dropped.
+    logic [3:0]  chan_nibble;
+    logic [2:0]  chan_idx;     // 0..4 packed: 0=ch0, 1=ch1, 2=ch3, 3=ch4, 4=ch5
+    logic        chan_valid;
+    always_comb begin
+        chan_nibble = chan_addr[15:12];
+        chan_idx    = 3'd0;
+        chan_valid  = 1'b0;
+        unique case (chan_nibble)
+            4'h8: begin chan_idx = 3'd0; chan_valid = 1'b1; end // ch0
+            4'h9: begin chan_idx = 3'd1; chan_valid = 1'b1; end // ch1
+            4'hB: begin chan_idx = 3'd2; chan_valid = 1'b1; end // ch3
+            4'hC: begin chan_idx = 3'd3; chan_valid = 1'b1; end // ch4
+            4'hD: begin chan_idx = 3'd4; chan_valid = 1'b1; end // ch5
+            default: ;
+        endcase
+    end
+
+    logic [11:0] reg_offset;
+    assign reg_offset = chan_addr[11:0];
+
+    // ------------------------------------------------------------------
+    // Register file: 5 channels × 4 registers
+    // ------------------------------------------------------------------
+    logic [31:0] chcr [0:4];
+    logic [31:0] madr [0:4];
+    logic [31:0] qwc  [0:4];
+    logic [31:0] tadr [0:4];
+
+    // Write path. Synchronous reset zeroes every register; a write is
+    // latched only for a valid channel and an exact offset match.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            for (int i = 0; i < 5; i++) begin
+                chcr[i] <= 32'd0;
+                madr[i] <= 32'd0;
+                qwc[i]  <= 32'd0;
+                tadr[i] <= 32'd0;
+            end
+        end else if (reg_wr_en && chan_valid) begin
+            unique case (reg_offset)
+                CHCR_OFFSET: chcr[chan_idx] <= reg_wr_data;
+                MADR_OFFSET: madr[chan_idx] <= reg_wr_data;
+                QWC_OFFSET:  qwc[chan_idx]  <= reg_wr_data;
+                TADR_OFFSET: tadr[chan_idx] <= reg_wr_data;
+                default: ;
+            endcase
+        end
+    end
+
+    // Combinational read mux over the current (pre-edge) register
+    // values. Yields 0 for an invalid channel or an unknown offset.
+    // Shared by the registered read port and by the read trace so the
+    // traced value is exactly what reg_rd_data will present.
+    logic [31:0] rd_mux;
+    always_comb begin
+        rd_mux = 32'd0;
+        if (chan_valid) begin
+            unique case (reg_offset)
+                CHCR_OFFSET: rd_mux = chcr[chan_idx];
+                MADR_OFFSET: rd_mux = madr[chan_idx];
+                QWC_OFFSET:  rd_mux = qwc[chan_idx];
+                TADR_OFFSET: rd_mux = tadr[chan_idx];
+                default:     rd_mux = 32'd0;
+            endcase
+        end
+    end
+
+    // Registered read port (1-cycle latency). reg_rd_data holds its
+    // previous value when no read is requested.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) begin
+                reg_rd_data <= rd_mux;
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace — write priority over read; tagged SUBSYS_DMAC with
+    // arg0 = chan_nibble (0x8/0x9/0xB/0xC/0xD = phys channel), arg1
+    // = data, arg2 = reg_offset, arg3 = chan_idx (packed 0..4).
+    // Read events carry the value being returned in arg1 so the
+    // "arg1 = data" contract holds for both directions.
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (reg_wr_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= {60'd0, chan_nibble};
+            ev_arg1   <= {32'd0, reg_wr_data};
+            ev_arg2   <= {52'd0, reg_offset};
+            ev_arg3   <= {61'd0, chan_idx};
+            ev_flags  <= {31'd0, chan_valid};
+        end else if (reg_rd_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_READ;
+            ev_arg0   <= {60'd0, chan_nibble};
+            ev_arg1   <= {32'd0, rd_mux};   // value returned on reg_rd_data
+            ev_arg2   <= {52'd0, reg_offset};
+            ev_arg3   <= {61'd0, chan_idx};
+            ev_flags  <= {31'd0, chan_valid};
+        end else begin
+            // Idle cycle: drop valid, leave the payload fields as-is.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : ee_dmac_passive_chan_stub
@@ -0,0 +1,222 @@
+# rtl/ee
+
+Emotion Engine-side RTL. Matches `docs/contracts/ee.md`.
+
+## Current contents
+
+- `ee_fetch_stub.sv` — minimal sequential fetcher from the early waves.
+  On reset, PC = BIOS reset vector (0xBFC00000). Each cycle while
+  `enable` is high, issues a read at PC and advances PC += 4. No
+  decode, no branches, no exceptions. Emits `EV_RESET` once at reset
+  exit and `EV_IFETCH` for each returned response. Retained for the
+  Milestone-B golden-reference comparison.
+- `ee_core_stub.sv` — **first real EE instruction-decoding core.**
+  Structural mirror of `iop_core_stub`: same multi-cycle FSM, same
+  R3000 subset (LUI/ORI/ADDIU/LW/SW/BEQ/BNE/J/JR/NOP/SYSCALL/MFC0/MTC0/
+  RFE), same branch-delay-slot discipline, same minimal COP0 +
+  exception entry, same `STRICT_UNSUPPORTED` trap gate. Separate file
+  from the IOP core because the EE is fundamentally an R5900 and will
+  eventually need 64-bit registers, COP1/COP2, VU-side plumbing the
+  IOP will never grow. Emits traces under `SUBSYS_EE` (vs.
+  `SUBSYS_IOP` for the IOP core).
+
+## Current status
+
+The EE side has a first real execution primitive (`ee_core_stub`) and
+runs hand-assembled bootstraps from the shared BIOS ROM window. The
+IOP side is ahead — it has DMAC ch9 data path, real interrupt
+exception entry, BIOS reset, and strict-mode BIOS smoke bring-up. The
+EE side's next natural growth (in roughly this order) is:
+
+1. ~~CPU-side LW/SW to EE RAM.~~ **Done** (`tb_ee_core_memops`). EE
+   memory map now routes CPU 32-bit reads and writes into the 128-bit
+   `ee_ram_stub` with lane-select on reads and byte-enable masking on
+   writes. CPU wins over DMAC on same-cycle RAM-read collisions and
+   over the SIF egress bridge on RAM-write collisions.
+2. ~~EE DMAC register access from the core.~~ **Done**
+   (`tb_ee_core_dmac`, `tb_ee_core_dmac_poll`). Chapter 3 added the
+   write-side: EE map decodes a CPU write at `phys[28:12] ==
+   17'h1_000A` (0x1000_A000-0x1000_AFFF, ch2 GIF) and routes it
+   through a new `ee_dmac_ch2_wr_*` port into `dmac_reg_stub`. The
+   EE core programs MADR/QWC/CHCR via SW; the DMAC fetches from EE
+   RAM through the map's `dmac_rd_*` port and completes with real
+   DMA_START/BEAT/DONE events. Chapter 4 added the read-side:
+   `dmac_reg_stub` grew a `reg_rd_*` surface (CHCR/MADR/QWC/TADR +
+   DONE_COUNT monotonic counter at 0x40), and the EE map forwards
+   CPU reads in the same DMAC window via a new `ee_dmac_ch2_rd_*`
+   port. The core polls CHCR.start until the DMAC clears it, then
+   reads DONE_COUNT and writes the witness to RAM — no more fixed
+   NOP padding.
+3. ~~EE INTC + exception entry.~~ **Done** (`tb_ee_core_dmac_intc`).
+   EE map now decodes the EE INTC register window at `phys[28:12] ==
+   17'h1_000F` (0x1000_F000/0x1000_F010 for STAT/MASK) and carries
+   both directions through new `ee_intc_{wr,rd}_*` ports. An
+   `intc_stub` instance on the EE side latches
+   `dmac_reg_stub.irq_completion_o` and drives `ee_core_stub.cpu_irq`
+   (which feeds `cause_ip[2]`). Bootstrap enables interrupts
+   (Status = IEc | IM[2]), programs INTC_MASK, kicks the DMAC, and
+   waits on DONE_COUNT; a RAM-resident ISR at `EXC_VECTOR=0x80` acks
+   INTC_STAT via W1C, MFC0 EPC, JR + RFE. Core takes exactly one
+   exception + one RFE, strictly after DMA_DONE.
+4. ~~EE-side strict BIOS smoke.~~ **Done** (`tb_ee_core_bios_smoke`).
+   EE mirror of the IOP smoke harness: `ee_core_stub` instantiated
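+   with `STRICT_UNSUPPORTED=1'b1`; synthetic CI bootstrap ends in a
+   sentinel instruction the core doesn't decode (originally `AND`,
+   SPECIAL func 0x24; now `BREAK`, SPECIAL func 0x0D — see the
+   bench-drift note at the end of item 5), so
+   `trap_o`/`trap_pc_o`/`trap_instr_o` fire and halt the core loudly.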
+   Swap in a real BIOS via `make tb_ee_core_bios_smoke
+   BIOS=/path/to/bios.hex` (plusarg-driven `$readmemh` into
+   `u_bios.mem`, same convention as the IOP target). Output line
+   includes an inline mnemonic decoder so the iteration loop (drop
+   in BIOS, read output, add the missing opcode) works without a
+   separate disassembler.
+5. **Widen the core opcode set, driven by real-BIOS smoke.** The
+   iteration loop is live: drop a BIOS dump in via
+   `make tb_ee_core_bios_smoke BIOS=...`, read `trap_instr` +
+   `mnemonic` from the output, implement the op, re-run. Progress
+   so far (each step landed a dedicated coverage TB and kept
+   full_checks green):
+   - **SLTI / SLTIU** (I-type compare, opcodes 0x0A / 0x0B). First
+     real-BIOS trip at 0xBFC0_0008. TB: `tb_ee_core_slti`.
+   - **ADDI** (opcode 0x08). Implemented as ADDIU (no overflow
+     trap — real BIOS doesn't emit ADDI where overflow could
+     actually happen). TB: `tb_ee_core_addi`.
+   - **ANDI** (opcode 0x0C, zero-extended). TB: `tb_ee_core_andi`.
+   - **AND / OR / XOR / NOR** (SPECIAL R-type logic family, func
+     0x24-0x27; destination = rd). Batched because they share the
+     R-type ALU plumbing. TB: `tb_ee_core_rtype_logic`.
+   - **SB** (opcode 0x28, byte store with lane broadcast +
+     one-hot byte-enable on the map write bus). TB:
+     `tb_ee_core_sb`. Unlocked a 1500-instruction stretch
+     (retired=180 → 1704).
+   - **LB** (opcode 0x20, sign-extended byte load via
+     `map_rd_data` lane extraction + 24-bit sign-extend in
+     `S_MEM_WAIT`). TB: `tb_ee_core_lb`.
+   - **JAL** (opcode 0x03, jump-and-link; writes `$31 = pc+8`).
+     TB: `tb_ee_core_jal`.
+   - **ADDU / SUBU** (SPECIAL R-type arith, func 0x21 / 0x23).
+     Batched, share R-type ALU. TB: `tb_ee_core_rtype_addu`.
+     Codex pre-approved the grouping.
+   - **SLT / SLTU** (SPECIAL R-type compare, func 0x2A / 0x2B).
+     Batched with the R-type ALU; register-form pair of
+     SLTI/SLTIU. TB: `tb_ee_core_slt`. Unlocked a 5700-
+     instruction stretch (retired=1717 → 7385).
+   - **LH / LHU** (opcodes 0x21 / 0x25, halfword load with sign-
+     and zero-extension respectively). Batched — same lane-
+     extraction plumbing, differ only in fill semantics. Halfword
+     addressing uses `ea[1]` (ea[0] must be zero for aligned
+     access). TBs: `tb_ee_core_lh`, `tb_ee_core_lhu` (each
+     covers both halfword lanes + the fill discipline for
+     negative high-lane values). Unlocked retired=7385 → 8207.
+   - **SLL / SRL / SRA** (SPECIAL R-type shifts, func 0x00 /
+     0x02 / 0x03). Batched per Codex pre-approval. Destination
+     = rd, operand = rt, shift amount = `shamt` (bits [10:6]).
+     SRA uses `$signed(rt_val) >>> shamt` for arithmetic right
+     shift (sign fill); SRL uses `rt_val >> shamt` (zero fill).
+     SLL $0,$0,0 is the canonical NOP encoding and flows through
+     this path harmlessly — the rd_idx=0 writeback guard blocks
+     any phantom write. TB: `tb_ee_core_shift` (critical probes:
+     SRL vs SRA on the same negative input to catch sign-vs-zero
+     fill bugs). Unlocked a **12,000-instruction stretch**
+     (retired=8207 → 20327).
+   - **SH** (opcode 0x29, halfword store). Store-side mate to
+     LH/LHU; same lane-broadcast + byte-enable idiom as SB but
+     at halfword granularity via `ea[1]`. 2-of-4 byte-enable
+     (`4'b0011` for low lane, `4'b1100` for high lane) preserves
+     the non-addressed halfword. TB: `tb_ee_core_sh` — two
+     chained probes with register values that have distinctive
+     upper halves (0xCAFE_FACE, 0x1234_5678). If the byte-enable
+     is wrong or the full register leaks into the map_wr_data
+     bus, the preservation check catches it (RAM word ends up
+     0x5678_FACE after both stores; wrong behavior would corrupt
+     the non-addressed halfword). Unlocked a **56,000-
+     instruction stretch** (retired=20327 → 76406) once the
+     RAM-size infra issue was also fixed in the same chapter
+     — see next bullet.
+   - **Real-BIOS RAM size (chapter 7.9 infra fix).** Before this
+     chapter, `tb_ee_core_bios_smoke` used only 4 KiB of EE RAM
+     — fine for the synthetic CI program (which never writes
+     beyond the first qword), but destructive once the real
+     BIOS copies a large chunk of itself into RAM and jumps
+     there. Addresses beyond 4 KiB silently aliased into the
+     same window, producing 156k "retires" that were actually
+     the core executing a scrambled mix of overwritten bytes,
+     with no trap ever firing because whatever happened to land
+     at the aliased offset decoded to something supported.
+     Bumped `EE_RAM_BYTES` in the bench to 4 MiB (real PS2 has
+     32 MiB; 4 MiB covers BIOS init comfortably without
+     ballooning sim memory). After the fix, real-BIOS smoke
+     runs honestly and trapped on JALR at 0xBFC5_29E8.
+   - **JALR** (SPECIAL func 0x09, register-indirect call). Target
+     is `rs_val` (same path as JR); link address pc+8 is written
+     to `rd_idx`. Unlike JAL's hardcoded `$31`, JALR's link
+     destination is explicit in the instruction, and `rd==0` is
+     a valid encoding that suppresses the link write. TB:
+     `tb_ee_core_jalr` — two probes: canonical `jalr $31, $rs`
+     (what the BIOS used) plus `jalr $20, $rs` with the return
+     via `jr $20` to prove the rd field is honored and not
+     accidentally hardcoded to $31. Unlocked retired=76406 →
+     84112 and the BIOS fully jumped into RAM-resident code
+     (next trap_pc is `0x0000_060C`, a RAM address, not BIOS).
+   - **ADD / SUB** (SPECIAL R-type, func 0x20 / 0x22). Batched
+     per Codex's guidance — same pragmatic policy as ADDI vs
+     ADDIU: this core does not model the Arithmetic Overflow
+     exception, so ADD behaves as ADDU and SUB behaves as SUBU.
+     Merged into the existing `rs_val + rt_val` / `rs_val - rt_val`
+     arms of `rtype_alu_wb`. TB: `tb_ee_core_add_sub` — four
+     probes including INT_MAX+1 wrap, which documents the
+     deferred-exception policy (the wrap is the *expected*
+     outcome, so the TB will fail loudly if overflow trapping
+     ever lands without the TB being updated).
+   - **COP0 Count (reg 9)** — first machine-state chapter after
+     the iter-14 transition. Free-running 32-bit counter that
+     increments every clock and resets to 0. Exposed read-only
+     through MFC0 $9. MTC0 $9 silently dropped (no reset-to-value
+     yet; revisit if BIOS depends on it). TB:
+     `tb_ee_core_cop0_count` — two probes covering consecutive-
+     MFC0 advance and a canonical `while (now < target)` poll
+     that must exit.
+   - **Enhanced bios_smoke PC sampler** with `peek_instr(addr)`
+     helper (hierarchical read through `u_bios.mem` / `u_ee_ram.mem`)
+     and a parallel `retired_history` array. Timeout now reports
+     the instruction and retired count at each sample, not just
+     pc. Timeout window bumped 5 ms → 20 ms for BIOS runway.
+   - **Sampler pointer snapshots + 80 ms timeout.** After the
+     instruction-aware sampler showed the loop was a linked-list
+     walk (not a hardware wait), Codex directed "extend timeout
+     first, then add pointer snapshots only if still stuck".
+     Timeout bumped 20 ms → 80 ms: retired grew linearly to
+     2.46 M, still 100% in the same loop (≈350k iterations — way
+     beyond any plausible BIOS list length). Added `u_core.regfile[5]`
+     and `[6]` hierarchical snapshots at each sample. Finding:
+     - `$5` (sentinel) = `0x00000974` — plausible low-RAM pointer
+     - `$6` (current) = `0xDEADBEEF` — **the EE map's unmapped-
+       read poison value**.
+     The cycle is self-perpetuating: `lw $2, 0($6)` with
+     `$6 = 0xDEADBEEF` reads address 0xDEADBEEF, which is
+     unmapped, returning 0xDEADBEEF; the `bne $2, $0` stays
+     taken forever. The real root cause is an **earlier** BIOS
+     read from an unmapped address that poisoned a data structure
+     — the traversal followed the poisoned pointer and locked in.
+   - *(next-move call is with Codex: add an unmapped-read tracer
+     to find the first bad address, implement whatever peripheral
+     the BIOS was reading, change the poison value to 0 so the
+     loop exits and exposes further BIOS progress, or something
+     else.)*
+   - **Bench-drift note (chapter 7.5):** the synthetic BIOS smoke
+     sentinel was originally AND; once AND was added to the
+     R-type ALU, the synthetic test silently stopped tripping
+     and started timing out. Codex caught it; sentinel is now
+     BREAK (SPECIAL func 0x0D). See project memory for the full
+     post-mortem. Lesson: avoid using real opcodes as
+     "unsupported sentinels" in test benches.
+
+## Scope boundary
+
+This directory owns EE CPU execution and its immediate coprocessors
+(COP0 minimum; eventually COP1 FPU and COP2 VU macro mode). It does
+**not** own:
+
+- memory map / address decode — that's `rtl/memory/ee_memory_map_stub.sv`.
+- interrupt controller — that's `rtl/intc/` (generic; the same
+  `intc_stub` module already serves the IOP side).
+- DMAC, VIF/VU, GIF/GS — separate directories.
@@ -0,0 +1,142 @@
+// retroDE_ps2 — ee_biu_mmio_stub
+//
+// Narrow latched-register-file stub for the EE Bus Interface Unit /
+// cache-control window at virtual `0xFFFE_0000 - 0xFFFE_0FFF`
+// (physical `0x1FFE_0000 - 0x1FFE_0FFF` after kseg1-stripping).
+// Architecturally this is the R5900's privileged BIU/control
+// register space — the same place the BIOS writes CACHE-control
+// and BIU-config values during boot.
+//
+// Chapter 9: chapter 8 closed the 0x1F80_xxxx hole. The first-
+// unmapped observer in tb_ee_core_bios_smoke then showed the next
+// unmapped event was a WRITE at 0xFFFE_0130 (pc=0xBFC0_21BC,
+// cycle 808). Multiple more writes to that same offset fire later
+// with values 0xCC4, 0xCC0, 0x1E988, 0xC04, 0x3202_000F —
+// classic cache/BIU config dance. Without a stub, these writes
+// land as UNMAPPED events, and the first read back from this
+// window would return 0xDEADBEEF and re-poison the pointer chain
+// chapter 8 just cleaned up.
+//
+// Codex's call for chapter 9: give this its own dedicated stub
+// with its own region tag, NOT a broad "everything else" fallback.
+// Keep architecturally distinct surfaces distinct. If the BIOS
+// later touches 0x1FA0_0000 (next unmapped in the observer), that
+// will be its own chapter, not folded in here.
+//
+// Semantics (same shape as ee_bootstrap_mmio_stub):
+//   - 4 KiB window = 1024 × 32-bit latched registers, zero-init.
+//   - Writes latch per-byte: for each `wr_be[i]`, byte[i] of the
+//     addressed register updates; untouched lanes preserve their
+//     prior value. Makes SB/SH through the window safe.
+//   - Reads return currently-latched value, one-cycle latency.
+//   - No side effects. BIOS read-modify-write sequences stay
+//     self-consistent.
+//
+// Size cost: 1024 × 32 bits = 4 KiB sim memory. Negligible.
+//
+// Trace: per-access event on SUBSYS_MEM with region tag
+// `REGION_EE_BIU = 10` (distinct from REGION_EE_MISC_MMIO=9 so
+// post-run analysis can separate the two windows).
+
+`timescale 1ns/1ps
+
+module ee_biu_mmio_stub
+    import trace_pkg::*;
+(
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Store side: byte-enabled write at a 12-bit byte offset inside
+    // the 4 KiB window.
+    input  logic          reg_wr_en,
+    input  logic [11:0]   reg_wr_addr,
+    input  logic [31:0]   reg_wr_data,
+    input  logic [3:0]    reg_wr_be,
+
+    // Load side: data and valid are registered, so they appear one
+    // clock after reg_rd_en.
+    input  logic          reg_rd_en,
+    input  logic [11:0]   reg_rd_addr,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Trace event port
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam int          WORDS         = 1024;    // 4 KiB window / 4 bytes per word
+    localparam logic [63:0] REGION_EE_BIU = 64'd10;  // region tag emitted in ev_arg3
+
+    // Backing store. Zeroed once by the initial block below; rst_n
+    // only gates writes, it does not clear the array.
+    logic [31:0] regs [0:WORDS-1];
+
+    initial begin
+        for (int w = 0; w < WORDS; w++) regs[w] = 32'd0;
+    end
+
+    // Word indices: the byte offsets with the two low bits dropped.
+    logic [9:0] wr_idx;
+    logic [9:0] rd_idx;
+    assign wr_idx = reg_wr_addr[11:2];
+    assign rd_idx = reg_rd_addr[11:2];
+
+    // Store path: each asserted byte-enable updates its own lane of
+    // the addressed word; lanes with a deasserted enable keep their
+    // previous contents. Ignored while rst_n is low.
+    always_ff @(posedge clk) begin
+        if (rst_n && reg_wr_en) begin
+            for (int lane = 0; lane < 4; lane++) begin
+                if (reg_wr_be[lane])
+                    regs[wr_idx][8*lane +: 8] <= reg_wr_data[8*lane +: 8];
+            end
+        end
+    end
+
+    // Load path: registered output. reg_rd_valid follows reg_rd_en by
+    // one clock; reg_rd_data only updates on a read request.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) reg_rd_data <= regs[rd_idx];
+        end
+    end
+
+    // Trace: at most one event per clock. The arms are evaluated in
+    // order, so a write takes precedence if both enables are high in
+    // the same cycle. On an idle cycle only ev_valid is cleared; the
+    // payload fields keep their last values.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else begin
+            case (1'b1)
+                reg_wr_en: begin
+                    // Write event: offset, data, byte enables, region tag.
+                    ev_valid  <= 1'b1;
+                    ev_subsys <= SUBSYS_MEM;
+                    ev_event  <= EV_WRITE;
+                    ev_arg0   <= {52'd0, reg_wr_addr};
+                    ev_arg1   <= {32'd0, reg_wr_data};
+                    ev_arg2   <= {60'd0, reg_wr_be};
+                    ev_arg3   <= REGION_EE_BIU;
+                    ev_flags  <= 32'h0000_0001;
+                end
+                reg_rd_en: begin
+                    // Read event: offset, currently-latched word, region tag.
+                    ev_valid  <= 1'b1;
+                    ev_subsys <= SUBSYS_MEM;
+                    ev_event  <= EV_READ;
+                    ev_arg0   <= {52'd0, reg_rd_addr};
+                    ev_arg1   <= {32'd0, regs[rd_idx]};
+                    ev_arg2   <= 64'd0;
+                    ev_arg3   <= REGION_EE_BIU;
+                    ev_flags  <= 32'd0;
+                end
+                default: ev_valid <= 1'b0;
+            endcase
+        end
+    end
+
+endmodule : ee_biu_mmio_stub
@@ -0,0 +1,269 @@
+// retroDE_ps2 — ee_bootstrap_mmio_stub
+//
+// Latched-register-file stub for the EE "bootstrap MMIO" window at
+// physical `0x1F80_0000 - 0x1F80_FFFF` (64 KiB). Covers the real
+// PS2 MCH (memory controller), SBUS gateway, and RDRAM init
+// registers the BIOS touches very early in boot. This is the
+// narrowest thing that closes the poisoned-dataflow hole found by
+// chapter 7.99: before this module existed, the EE map returned
+// `0xDEADBEEF` for every CPU read in this window, and the BIOS
+// laundered that poison into a data structure whose later
+// traversal wedged the core forever.
+//
+// Semantics (deliberately simple, not architecturally accurate):
+//   - Full window is a 16 Ki-word (16384 × 32-bit = 64 KiB)
+//     word-addressed register file; all registers are initialised
+//     to 0 at time zero (`rst_n` does not clear them).
+//   - Writes latch per-byte: for each `wr_be[i]` that is asserted,
+//     `regs[addr[15:2]][8*i +: 8] <= wr_data[8*i +: 8]`. Untouched
+//     byte lanes preserve their existing value. This makes SB/SH
+//     write-through-this-window safe — prior chapters added SB/SH
+//     for BIOS progress, and without be-aware latching a sub-word
+//     store here would clobber the other three (or two) bytes.
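+//   - Reads return the currently-latched value, one-cycle latency,
+//     matching the rest of the stub ecosystem.
+//   - No side effects and no per-register behavior (no ready-bit
+//     auto-set, no interrupt generation, no state machines), except
+//     for the named read overrides added later: 0x1814 (Ch202
+//     constant "ready" value), 0x10F0 (Ch258 constant PCR value),
+//     and 0x1070/0x1074 (Ch259 IOP INTC I_STAT/I_MASK).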
+//
+// That keeps BIOS read/modify/write sequences self-consistent:
+// if the BIOS reads reg X, ORs a bit, writes back, it sees the
+// merged value on the next read. It does NOT emulate real
+// hardware semantics (e.g. status bits that flip on their own,
+// interrupt latches, FIFO behavior). If the BIOS tripwire-depends
+// on any of that, it will reveal itself the same way the 0x14B4
+// linked-list wedge did — via a new diagnostic signal, handled
+// in a future chapter.
+//
+// Trace:
+//   Per-access event on SUBSYS_MEM with the region tag
+//   `REGION_EE_MISC_MMIO = 9`. arg0 is the 16-bit offset within
+//   the window (not the full 32-bit address — the map's own
+//   trace already carries the full address; the stub's finer
+//   trace carries the offset so downstream analysis can see
+//   which register was touched without having to mask). arg1 is
+//   the data (write data, or the value being returned on read).
+//   arg3 is the region constant. flags bit 0 = write.
+//
+// Size cost: 16384 × 32 bits ≈ 64 KiB of sim memory. Negligible.
+
+`timescale 1ns/1ps
+
+module ee_bootstrap_mmio_stub
+    import trace_pkg::*;
+#(
+    // Ch202 — narrow "ready" return for offset 0x1814. Pre-Ch201 the
+    // window returned the latched register value (which initialises to
+    // 0); the BIOS at PC=0xBFC4FB04..FB30 polls this address waiting
+    // for ($read & $mask) != 0 and our zero return left it spinning.
+    // Default = 32'hFFFFFFFF satisfies any non-zero mask the BIOS may
+    // hold in $a0 — wider than a real PS2 GPUSTAT (typical idle =
+    // 0x1C00_0000), but the BIOS has not been observed to USE the
+    // value beyond the bit-test so the wider satisfaction is safe.
+    // A future chapter can narrow this if a side-effect is observed.
+    parameter logic [31:0] MMIO_1814_RDY_VALUE = 32'hFFFF_FFFF,
+
+    // Ch258 — IOP DMAC PCR realism stub. The IOP DMAC Priority Control
+    // Register lives at phys 0x1F8010F0 (= EE kseg1 0xBF8010F0). Real
+    // PS1/IOP hardware resets this to 0x07654321 (priority 1 for ch0,
+    // 2 for ch1, ... 7 for ch6, with bit[31:24]=0x07 as the enable
+    // mask). Ch218 observer captured BIOS reading this address three
+    // times during the Ch215 longjmp treadmill (PC=0xbfc4d2cc /
+    // 0xbfc4d2dc / 0xbfc4d350), all returning 0 from our latched-zero
+    // stub. Whether the zero return is the cause of the treadmill or
+    // an incidental noise read is open — Ch258's job is to flip the
+    // PCR to its real reset value and re-observe.
+    //
+    // This is a REALISM STUB, not a fix. We are not modelling the
+    // IOP DMA channel priority semantics; we are just declining to
+    // return poison-zero for a named hardware register with a known
+    // reset value. If BIOS escapes the Ch215 treadmill after this
+    // change, great. If it does not, Ch258 closes with "PCR was not
+    // the gate" and we name the next observed blocker.
+    parameter logic [31:0] MMIO_10F0_PCR_VALUE = 32'h0765_4321
+)
+(
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Write port
+    input  logic          reg_wr_en,
+    input  logic [15:0]   reg_wr_addr,
+    input  logic [31:0]   reg_wr_data,
+    input  logic [3:0]    reg_wr_be,
+
+    // Read port — 1-cycle latency, matches rest of stub ecosystem
+    input  logic          reg_rd_en,
+    input  logic [15:0]   reg_rd_addr,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Ch259 / Ch260 — DIAGNOSTIC source-injection port for the named
+    // IOP INTC view at 0x1F801070/0x1F801074. DEFAULT IS ZERO in every
+    // existing instantiation (tb_ee_bootstrap_mmio.sv and
+    // tb_ee_core_bios_smoke.sv both tie this to 16'd0 unless the
+    // BIOS-long TB's +IOP_INTC_BOOT_SRC plusarg overrides it).
+    //
+    // When non-zero, each set bit is ORed into I_STAT every cycle so
+    // the assertion survives W1C clears (matches the "real device
+    // asserts the line until serviced" shape, not a one-shot pulse).
+    //
+    // This port exists ONLY as a controlled diagnostic knob. Ch259
+    // closed the BIOS-mmio-probe arc with the finding that single
+    // synthetic source bits do not break the Ch215 treadmill — the
+    // multi-state IOP/SBUS/kernel activity is needed instead. Any
+    // future use of this port should be similarly scoped (TB-driven,
+    // documented intent, default-zero on instantiation).
+    input  logic [15:0]   iop_intc_inject_src_i,
+
+    // Trace
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam int WORDS = 16384;  // 64 KiB window / 4 bytes per word
+    localparam logic [63:0] REGION_EE_MISC_MMIO = 64'd9;
+
+    // Word indices (byte offset >> 2) of the registers whose READ value
+    // does not come from the anonymous regfile.
+    localparam logic [13:0] OFFSET_1814_WIDX = 14'h0605;  // 0x1814 >> 2 (1541)
+    localparam logic [13:0] OFFSET_10F0_WIDX = 14'h043C;  // 0x10F0 >> 2 (1084)
+    localparam logic [13:0] OFFSET_1070_WIDX = 14'h041C;  // 0x1070 >> 2 (1052)
+    localparam logic [13:0] OFFSET_1074_WIDX = 14'h041D;  // 0x1074 >> 2 (1053)
+
+    // Anonymous register file. Cleared once at time zero by the
+    // `initial` block below; `rst_n` does not clear it.
+    logic [31:0] regs [0:WORDS-1];
+
+    initial begin
+        for (int i = 0; i < WORDS; i++) regs[i] = 32'd0;
+    end
+
+    logic [13:0] wr_idx;
+    logic [13:0] rd_idx;
+    assign wr_idx = reg_wr_addr[15:2];
+    assign rd_idx = reg_rd_addr[15:2];
+
+    // Per-byte write latch — honors reg_wr_be so SB/SH through this
+    // window preserves the untouched byte lanes instead of clobbering
+    // the whole 32-bit register. Applies to EVERY offset, including the
+    // four overridden ones (their regs[] copy is simply never read back).
+    always_ff @(posedge clk) begin
+        if (rst_n && reg_wr_en) begin
+            if (reg_wr_be[0]) regs[wr_idx][ 7: 0] <= reg_wr_data[ 7: 0];
+            if (reg_wr_be[1]) regs[wr_idx][15: 8] <= reg_wr_data[15: 8];
+            if (reg_wr_be[2]) regs[wr_idx][23:16] <= reg_wr_data[23:16];
+            if (reg_wr_be[3]) regs[wr_idx][31:24] <= reg_wr_data[31:24];
+        end
+    end
+
+    // Ch259 — named IOP INTC state. Independent of the anonymous
+    // regs[] (writes to 0x1070/0x1074 still update regs[] via the
+    // generic per-byte latch above, but reads bypass it for these
+    // offsets, matching the Ch202/Ch258 override pattern). The source
+    // comments state this mirrors `rtl/intc/intc_stub.sv`.
+    //
+    // NOTE(review): both the W1C and the MASK write are gated on a
+    // full-word byte enable (&reg_wr_be); a sub-word (SB/SH) store to
+    // 0x1070/0x1074 updates regs[] only and leaves the named INTC
+    // state untouched — confirm this is intended.
+    // NOTE(review): PS1/IOP references describe I_STAT acknowledge as
+    // "write 0 clears the bit" (AND-with-written-value), the opposite
+    // polarity of the W1C modelled here — confirm before relying on
+    // this view for real IOP-side code.
+    logic [15:0] iop_intc_stat_q;
+    logic [15:0] iop_intc_mask_q;
+
+    wire [15:0]  iop_intc_stat_w1c_mask =
+        (reg_wr_en && wr_idx == OFFSET_1070_WIDX && (&reg_wr_be))
+            ? reg_wr_data[15:0] : 16'd0;
+    wire         iop_intc_mask_wr_en =
+        reg_wr_en && wr_idx == OFFSET_1074_WIDX && (&reg_wr_be);
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            iop_intc_stat_q <= 16'd0;
+            iop_intc_mask_q <= 16'd0;
+        end else begin
+            // I_STAT: clear the bits written as 1, then OR in the sticky
+            // injection. Assertion wins a same-cycle W1C+source
+            // collision so a still-held interrupt is not swallowed.
+            iop_intc_stat_q <= (iop_intc_stat_q & ~iop_intc_stat_w1c_mask)
+                             | iop_intc_inject_src_i;
+            if (iop_intc_mask_wr_en)
+                iop_intc_mask_q <= reg_wr_data[15:0];
+        end
+    end
+
+    // Read views of the INTC pair, zero-extended to 32 bits. I_STAT also
+    // ORs in the live injection so a source is visible in the same cycle
+    // it is applied, before iop_intc_stat_q has registered it.
+    wire [31:0] iop_intc_stat_read = {16'd0, iop_intc_stat_q | iop_intc_inject_src_i};
+    wire [31:0] iop_intc_mask_read = {16'd0, iop_intc_mask_q};
+
+    // Single source of truth for "what does a read of word `idx`
+    // present". Used by BOTH the read-data register and the trace
+    // payload so the two can never disagree.
+    //   0x1814 (Ch202) — constant MMIO_1814_RDY_VALUE; writes still
+    //                    latch into regs[] but are never read back.
+    //   0x10F0 (Ch258) — constant MMIO_10F0_PCR_VALUE, same shape.
+    //   0x1070 (Ch259) — named IOP INTC I_STAT view.
+    //   0x1074 (Ch259) — named IOP INTC I_MASK view.
+    //   anything else  — the latched regfile word.
+    function automatic logic [31:0] rd_mux_value(input logic [13:0] idx);
+        case (idx)
+            OFFSET_1814_WIDX: return MMIO_1814_RDY_VALUE;
+            OFFSET_10F0_WIDX: return MMIO_10F0_PCR_VALUE;
+            OFFSET_1070_WIDX: return iop_intc_stat_read;
+            OFFSET_1074_WIDX: return iop_intc_mask_read;
+            default:          return regs[idx];
+        endcase
+    endfunction
+
+    // Read — 1-cycle latency. reg_rd_valid follows reg_rd_en by one
+    // clock; reg_rd_data holds its previous value when no read is issued.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) reg_rd_data <= rd_mux_value(rd_idx);
+        end
+    end
+
+    // Trace emission — at most one event per cycle, write wins on a
+    // same-cycle collision (mirrors the rd/wr_en mutual-exclusion at the
+    // map level; this is defensive for mechanical safety).
+    //   arg0 = 16-bit offset within the window, arg1 = write data or the
+    //   value being returned, arg3 = region tag, flags bit 0 = write.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (reg_wr_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= {48'd0, reg_wr_addr};
+            ev_arg1   <= {32'd0, reg_wr_data};
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= REGION_EE_MISC_MMIO;
+            ev_flags  <= 32'h0000_0001;
+        end else if (reg_rd_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_READ;
+            ev_arg0   <= {48'd0, reg_rd_addr};
+            ev_arg1   <= {32'd0, rd_mux_value(rd_idx)};
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= REGION_EE_MISC_MMIO;
+            ev_flags  <= 32'd0;
+        end else begin
+            // Idle cycle: drop valid only; payload registers hold.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : ee_bootstrap_mmio_stub
@@ -0,0 +1,128 @@
+// retroDE_ps2 — ee_fetch_stub
+//
+// Minimal sequential-fetch stand-in for the R5900. Wave 1 scope only: enough
+// to drive ee_memory_map_stub → bios_rom_stub for Milestone B.
+//
+// Contract refs:
+//   docs/stub_module_plan.md    (Wave 1, item 4)
+//   docs/contracts/ee.md
+//
+// Behavior:
+//   - On reset, PC = RESET_VECTOR (default 0xBFC00000, the MIPS BIOS
+//     reset vector in kseg1).
+//   - Each cycle while `enable` is high: issue a read at PC, advance
+//     PC += 4. No decode, no branches, no exceptions, no retirement
+//     fidelity (all out-of-scope per plan).
+//   - Responses return 1 cycle later via rd_valid/rd_data from the
+//     memory map. The issued address is latched so the trace line can
+//     pair address with data.
+//
+// Non-goals for this wave (stub plan, explicit):
+//   - full decode,
+//   - exceptions beyond deterministic fault handling,
+//   - FPU/MMI behavior,
+//   - instruction retirement fidelity.
+//
+// Trace payload schema (per stub plan):
+//   EE RESET  arg0=reset_vector
+//   EE IFETCH arg0=pc arg1=data arg2=resp_kind arg3=-
+//     resp_kind: 0=OK (only path in Wave 1)
+
+`timescale 1ns/1ps
+
+module ee_fetch_stub
+    import trace_pkg::*;
+#(
+    parameter logic [31:0] RESET_VECTOR = 32'hBFC00000
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+    input  logic          enable,
+
+    // Memory-facing fetch port
+    output logic          rd_en,
+    output logic [31:0]   rd_addr,
+    input  logic [31:0]   rd_data,
+    input  logic          rd_valid,
+
+    // Trace
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // ------------------------------------------------------------------
+    // Fetch address generation
+    //
+    //   pc     — address presented on rd_addr in the current cycle.
+    //   pc_d1  — copy of the previously issued pc, used to label the
+    //            response that the header describes as arriving one
+    //            cycle after the request.
+    //
+    // Both registers move only while `enable` is high, so a stalled
+    // fetcher keeps pc_d1 paired with the last request it issued.
+    // ------------------------------------------------------------------
+
+    logic [31:0] pc;
+    logic [31:0] pc_d1;
+
+    // The fetch port is a direct view of the PC: a read is requested on
+    // every enabled cycle.
+    assign rd_addr = pc;
+    assign rd_en   = enable;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            pc_d1 <= RESET_VECTOR;
+            pc    <= RESET_VECTOR;
+        end else if (enable) begin
+            pc    <= pc + 32'd4;   // strictly sequential, no branches
+            pc_d1 <= pc;           // remember what was just issued
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace
+    //   - One EV_RESET event on the first clock after reset releases,
+    //     carrying RESET_VECTOR in arg0.
+    //   - One EV_IFETCH event per rd_valid response: arg0 = pc_d1,
+    //     arg1 = rd_data, arg2 = resp_kind (always 0 = OK here).
+    //   - EV_RESET has priority: a response seen in the same cycle as
+    //     the pending reset event is not traced.
+    // ------------------------------------------------------------------
+
+    logic reset_emit_pending;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reset_emit_pending <= 1'b1;
+            ev_valid           <= 1'b0;
+            ev_subsys          <= SUBSYS_EE;
+            ev_event           <= EV_RESET;
+            ev_arg0            <= '0;
+            ev_arg1            <= '0;
+            ev_arg2            <= '0;
+            ev_arg3            <= '0;
+            ev_flags           <= '0;
+        end else begin
+            // Default for this cycle: no event. Payload registers keep
+            // their previous contents unless a branch below rewrites
+            // them (the later non-blocking assignment wins).
+            ev_valid <= 1'b0;
+
+            if (reset_emit_pending) begin
+                reset_emit_pending <= 1'b0;
+                ev_valid           <= 1'b1;
+                ev_subsys          <= SUBSYS_EE;
+                ev_event           <= EV_RESET;
+                ev_arg0            <= {32'd0, RESET_VECTOR};
+                ev_arg1            <= '0;
+                ev_arg2            <= '0;
+                ev_arg3            <= '0;
+                ev_flags           <= '0;
+            end else if (rd_valid) begin
+                ev_valid  <= 1'b1;
+                ev_subsys <= SUBSYS_EE;
+                ev_event  <= EV_IFETCH;
+                ev_arg0   <= {32'd0, pc_d1};
+                ev_arg1   <= {32'd0, rd_data};
+                ev_arg2   <= '0;    // resp_kind: 0 = OK
+                ev_arg3   <= '0;
+                ev_flags  <= '0;
+            end
+        end
+    end
+
+endmodule : ee_fetch_stub
@@ -0,0 +1,43 @@
+# rtl/gif_gs
+
+GIF path and Graphics Synthesizer logic. Matches `docs/contracts/gif_gs.md`.
+
+## Current contents
+
+- `gs_stub.sv` — GS shell with **two architecturally distinct write ports**
+  (Ch75 namespace split):
+  - `reg_wr_*` — privileged-block writes (16-bit offset within `0x12000000`).
+    Latches `BGCOLOR` (offset `0x00E0`) into `bg_{r,g,b}`; other offsets emit
+    `EV_MODE`.
+  - `gif_reg_*` — GIF A+D register-number writes (8-bit reg# + 64-bit data).
+    Decodes `PRIM=0x00`, `RGBAQ=0x01`, `XYZF2=0x04`, `XYZ2=0x05`,
+    `FRAME_1=0x4C`, `ZBUF_1=0x4E` into per-register 64-bit latches; unknown
+    reg numbers emit `EV_MODE`.
+  - No VRAM, no drawing yet — that is the next architectural step.
+
+- `gif_path_stub.sv` — Wave 2 minimal GIF packet logger; project-local
+  single-qword register-write format. Used by `tb_bgcolor_via_dma`.
+
+- `gif_packed_stub.sv` — real PS2 GIFtag parser (Ch72-Ch75). Handles PACKED
+  (FLG=0), REGLIST (FLG=1), IMAGE (FLG=2), DISABLE (FLG=3). The
+  `REAL_AD_REG_MAP` parameter selects the A+D dispatch port:
+  - `REAL_AD_REG_MAP=0` (default, back-compat) — drives `gs_stub.reg_wr_*`
+    using a project-local 16-bit offset carried in `in_data[79:64]`.
+  - `REAL_AD_REG_MAP=1` — drives `gs_stub.gif_reg_*` using the real PS2
+    8-bit reg# carried in `in_data[71:64]`. Source-of-truth: PCSX2
+    `GSRegs.h`.
+
+## BGCOLOR reset value
+
+At reset, `bg_{r,g,b}` default to `0x40` each (mid-grey) rather than black.
+Rationale: this makes "gs_stub reset but no BGCOLOR write yet" visually
+distinct from "video output disabled / black frame" in Milestone A. Override
+is a `BGCOLOR` write from the test harness.
+
+## Pitfall: namespace conflation
+
+Ch74 conflated GIF A+D reg numbers with GS privileged-block offsets and
+mapped e.g. `0x14`→PMODE@`0x0000`. That is fiction — those are separate
+namespaces. Ch75 split them. **`ZBUF_1` is `0x4E`, not `0x4F` (that's
+`ZBUF_2`).** When adding a new GIF-context register, source the reg# from
+PCSX2 `GSRegs.h`, never from the privileged-block map.
@@ -0,0 +1,275 @@
+// retroDE_ps2 — clut_loader_stub (Ch99 + Ch100 + Ch101)
+//
+// VRAM→CLUT load engine triggered by GIF TEX0.CLD. Watches the
+// 1-cycle `tex0_wr_pulse` from gs_stub and starts a 256-entry
+// load when the just-written TEX0 satisfies all three:
+//   - CSM == 1                (CSM2 linear), or CSM == 0 (CSM1 grid)
+//                             when CLUT_CSM1_ENABLE=1 (Ch350)
+//   - CPSM ∈ {PSMCT32, PSMCT16} (CSM1 path: PSMCT32 only)
+//   - CLD permits a load under the change-detect policy:
+//       0 = never, 1 = always,
+//       2 = CBP changed since last load,
+//       3 = CBP, CPSM, or CSA changed since last load,
+//       4 = always, but write only the 16-entry CSA window
+//           (Ch102) — destination indices CSA*16..CSA*16+15
+//           wrap mod 256; the rest of clut_stub is preserved.
+//       5..7 = reserved/edge cases at this scope (no-op).
+// Per-CPSM stride: PSMCT32 reads 4 bytes/entry from
+// VRAM[CBP*256 + i*4]; PSMCT16 reads 2 bytes/entry from
+// VRAM[CBP*256 + i*2] and unpacks RGB5A1 → PSMCT32 ABGR with
+// 5→8 bit-replicate. clut_stub always sees PSMCT32 entries.
+//
+// Scope (Ch99 + Ch100):
+//   - CSM2 (linear addressing) only — entry i lives at byte
+//     offset i*entry_stride from CBP*256, where entry_stride is
+//     4 (PSMCT32) or 2 (PSMCT16). By default the loader gates
+//     start on tex0_csm == 1'b1 (CSM2): a TEX0_1 write with
+//     CSM=0 (CSM1, 16×16 grid swizzle) is silently ignored
+//     rather than performing a wrong linear load. The Ch350
+//     CLUT_CSM1_ENABLE parameter opts in to a CSM1 grid-order
+//     load for CPSM=PSMCT32 only.
+//   - CPSM=PSMCT32 (=0) and CPSM=PSMCT16 (=2) accepted. PSMCT16
+//     entries are unpacked from RGB5A1 to PSMCT32 ABGR via 5→8
+//     bit-replicate ({c5, c5[4:2]}) so clut_stub always stores
+//     PSMCT32 regardless of source format and pcrtc's existing
+//     PSMT8+CLUT lookup path stays unchanged. Alpha is replicated
+//     across 8 bits ({8{a1}}). Other CPSM codes (PSMCT24, PSMT8H,
+//     etc.) are silently ignored.
+//   - CLD modes (Ch101 + Ch102): full conditional policy
+//     honored for CLD ∈ {0, 1, 2, 3, 4}.
+//       0 = no load.
+//       1 = always load — full 256 entries.
+//       2 = load only when CBP changed since last load.
+//       3 = load when CBP, CPSM, or CSA changed since last load.
+//       4 = partial CSA-window load (Ch102) — always fires, but
+//           writes only 16 entries at clut_stub[CSA*16 + i] for
+//           i ∈ 0..15 (CSA*16 wraps mod 256). The other 240
+//           entries are preserved; the VRAM source still starts
+//           at CBP*256 and uses the same per-CPSM byte stride.
+//     CLD ∈ {5, 6, 7} silently no-op at this scope (reserved /
+//     edge cases). The change-detect compares against `prev_*`
+//     regs latched on entry to S_LOAD; reset clears them to 0,
+//     so a first CLD=2 with CBP==0 is silently skipped (matches
+//     the "nothing changed" interpretation).
+//   - Reference (kept for posterity): real PS2 CLD encodes:
+//       1 = always
+//       2 = CBP changed
+//       3 = CBP, CPSM, or CSA changed
+//       4 = CSA changed (partial 16-entry load at CSA)
+//       5..7 = reserved/edge cases
+//     Modeling those needs a full-CLUT register snapshot for
+//     change detection — deferred.
+//   - CSA is consumed two ways. (a) For CLD=3 it's a
+//     change-detect input (any prev-vs-new CSA delta triggers
+//     a full reload). (b) For CLD=4 it picks the destination
+//     window: load_csa_base = {CSA, 4'd0} (8-bit, so CSA=16..31
+//     wrap to base 0..240). Full-CLUT loads (CLD ∈ {1,2,3})
+//     overwrite all 256 entries regardless of CSA.
+//   - One in-flight load at a time. A new TEX0_1 write while
+//     `load_busy=1` is silently ignored at this scope.
+//
+// Timing: full load = 256 clocks; partial (CLD=4) = 16 clocks.
+// `load_busy` is high throughout. TBs typically `wait (load_busy == 0)` to
+// gate scanout configuration on the load completing.
+
+`timescale 1ns/1ps
+
+module clut_loader_stub #(
+    // Ch350 — opt-in CSM1 (16×16 CT32 grid) load path. With the default
+    // of 0 a CSM=0 TEX0 commit never starts a load, so the CSM2-linear
+    // behaviour of Ch99..Ch102 is untouched. With 1, a CSM=0 commit
+    // whose CPSM is PSMCT32 loads palette entry i from grid position
+    // (x = i[3:0], y = i[7:4]) of a 16×16 PSMCT32 surface based at CBP,
+    // using the CT32 block + in-block byte layout computed below.
+    parameter bit CLUT_CSM1_ENABLE = 1'b0
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // TEX0_1 commit from gs_stub: a 1-cycle strobe plus the decoded
+    // CLUT-related sub-fields of the value just written.
+    input  logic        tex0_wr_pulse,
+    input  logic [13:0] tex0_cbp,    // CLUT base, in 256-byte units
+    input  logic [3:0]  tex0_cpsm,   // 0 = PSMCT32, 2 = PSMCT16 accepted
+    input  logic        tex0_csm,    // 1 = CSM2 linear, 0 = CSM1 grid
+    input  logic [4:0]  tex0_csa,    // CLD=3 change-detect, CLD=4 window
+    input  logic [2:0]  tex0_cld,    // load policy, see cld_match
+
+    // VRAM second read port — combinational byte-addressed read.
+    output logic [31:0] vram_read_addr,
+    input  logic [31:0] vram_read_data,
+
+    // CLUT staging-area write port.
+    output logic        clut_write_en,
+    output logic [7:0]  clut_write_idx,
+    output logic [31:0] clut_write_data,
+
+    // Status: high while a load is in flight.
+    output logic        load_busy
+);
+
+    // CPSM codes this loader accepts.
+    localparam logic [3:0] CPSM_PSMCT32 = 4'd0;
+    localparam logic [3:0] CPSM_PSMCT16 = 4'd2;
+
+    typedef enum logic [0:0] {
+        S_IDLE,
+        S_LOAD
+    } state_e;
+
+    // ------------------------------------------------------------------
+    // Load context, latched when a load starts and held until the next.
+    // ------------------------------------------------------------------
+    state_e      state;
+    logic [7:0]  load_idx;            // entry counter within the load
+    logic [13:0] load_cbp;            // CBP of the load in flight
+    logic        load_cpsm_is_ct16;   // Ch100: source entries are RGB5A1
+    logic        load_csm1;           // Ch350: CSM1 grid addressing
+    logic        load_partial;        // Ch102: CLD=4, 16-entry window
+    logic [7:0]  load_csa_base;       // Ch102: window base = CSA*16 mod 256
+
+    // Ch101 — TEX0 fields seen at the most recent load start; compared
+    // against the incoming commit by the CLD=2 / CLD=3 policies.
+    logic [13:0] prev_cbp;
+    logic [3:0]  prev_cpsm;
+    logic [4:0]  prev_csa;
+
+    // Last value of load_idx for the current load: 15 for a partial
+    // window, 255 for a full table.
+    logic [7:0]  load_terminal;
+    assign load_terminal = load_partial ? 8'h0F : 8'hFF;
+
+    assign load_busy = (state == S_LOAD);
+
+    // ------------------------------------------------------------------
+    // VRAM source address = CBP*256 + per-mode entry offset.
+    // ------------------------------------------------------------------
+    logic [31:0] cbp_bytes;
+    logic [31:0] addr_offset_ct32;
+    logic [31:0] addr_offset_ct16;
+    logic [31:0] addr_offset_csm1;
+    logic [31:0] addr_offset;
+
+    assign cbp_bytes        = {10'd0, load_cbp, 8'd0};     // CBP * 256
+    assign addr_offset_ct32 = {22'd0, load_idx, 2'b00};    // idx * 4
+    assign addr_offset_ct16 = {23'd0, load_idx, 1'b0};     // idx * 2
+
+    // Ch350 — CSM1 grid offset for entry load_idx, with ix = load_idx[3:0]
+    // and iy = load_idx[7:4]:
+    //   block = {iy[3], ix[3]}  -> block * 256  (bits 9:8)
+    //   iy[2:0] * 32                            (bits 7:5)
+    //   ix[2:0] * 4                             (bits 4:2)
+    // The three terms occupy disjoint bit fields, so their sum is the
+    // plain concatenation below.
+    assign addr_offset_csm1 = {22'd0,
+                               load_idx[7], load_idx[3],   // block
+                               load_idx[6:4],              // iy[2:0]
+                               load_idx[2:0],              // ix[2:0]
+                               2'b00};
+
+    // CSM1 takes precedence over the CPSM-selected linear stride.
+    assign addr_offset    = load_csm1         ? addr_offset_csm1
+                          : load_cpsm_is_ct16 ? addr_offset_ct16
+                                              : addr_offset_ct32;
+    assign vram_read_addr = cbp_bytes + addr_offset;
+
+    // ------------------------------------------------------------------
+    // Ch100 — PSMCT16 (RGB5A1) to PSMCT32 (ABGR) unpack. The entry is
+    // the low 16 bits of vram_read_data: R[4:0] G[9:5] B[14:10] A[15].
+    // Colour channels widen 5->8 by appending their top three bits; the
+    // single alpha bit is replicated across all eight.
+    // ------------------------------------------------------------------
+    function automatic logic [7:0] expand5to8(input logic [4:0] c5);
+        return {c5, c5[4:2]};
+    endfunction
+
+    logic [15:0] psm16_entry;
+    logic [4:0]  psm16_r5, psm16_g5, psm16_b5;
+    logic        psm16_a1;
+    logic [7:0]  psm16_r8, psm16_g8, psm16_b8, psm16_a8;
+    logic [31:0] write_data_ct16;
+
+    assign psm16_entry     = vram_read_data[15:0];
+    assign psm16_a1        = psm16_entry[15];
+    assign psm16_b5        = psm16_entry[14:10];
+    assign psm16_g5        = psm16_entry[9:5];
+    assign psm16_r5        = psm16_entry[4:0];
+    assign psm16_a8        = {8{psm16_a1}};
+    assign psm16_b8        = expand5to8(psm16_b5);
+    assign psm16_g8        = expand5to8(psm16_g5);
+    assign psm16_r8        = expand5to8(psm16_r5);
+    assign write_data_ct16 = {psm16_a8, psm16_b8, psm16_g8, psm16_r8};
+
+    // ------------------------------------------------------------------
+    // CLUT write port — combinational, active for every S_LOAD cycle.
+    // A partial load targets the CSA window (base + idx, 8-bit wrap); a
+    // full load writes entry load_idx directly.
+    // ------------------------------------------------------------------
+    assign clut_write_en   = (state == S_LOAD);
+    assign clut_write_idx  = load_partial ? (load_csa_base + load_idx)
+                                          : load_idx;
+    assign clut_write_data = load_cpsm_is_ct16 ? write_data_ct16
+                                               : vram_read_data;
+
+    // ------------------------------------------------------------------
+    // Start gate.
+    //   cld_match   — the CLD policy alone permits a load:
+    //                   1 = always (full), 4 = always (partial window),
+    //                   2 = CBP differs from the last load,
+    //                   3 = CBP, CPSM or CSA differs from the last load,
+    //                   0 and 5..7 = never.
+    //   start_csm2  — CSM2 linear source with an accepted CPSM.
+    //   start_csm1  — Ch350 CSM1 grid source, PSMCT32 only; constant 0
+    //                 when CLUT_CSM1_ENABLE is off.
+    // ------------------------------------------------------------------
+    logic cld_match;
+    always_comb begin
+        unique case (tex0_cld)
+            3'd1, 3'd4: cld_match = 1'b1;
+            3'd2:       cld_match = (tex0_cbp != prev_cbp);
+            3'd3:       cld_match = (tex0_cbp  != prev_cbp)
+                                 || (tex0_cpsm != prev_cpsm)
+                                 || (tex0_csa  != prev_csa);
+            default:    cld_match = 1'b0;
+        endcase
+    end
+
+    logic start_csm2;
+    logic start_csm1;
+    logic start_load;
+    assign start_csm2 = (tex0_csm == 1'b1)
+                     && ((tex0_cpsm == CPSM_PSMCT32) || (tex0_cpsm == CPSM_PSMCT16));
+    assign start_csm1 = CLUT_CSM1_ENABLE
+                     && (tex0_csm == 1'b0)
+                     && (tex0_cpsm == CPSM_PSMCT32);
+    assign start_load = tex0_wr_pulse && cld_match && (start_csm2 || start_csm1);
+
+    // ------------------------------------------------------------------
+    // Load FSM. S_IDLE waits for a qualifying commit; S_LOAD walks
+    // load_idx from 0 to load_terminal, one entry per clock. Commits
+    // that arrive during S_LOAD are not examined at all.
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state             <= S_IDLE;
+            load_idx          <= 8'd0;
+            load_cbp          <= 14'd0;
+            load_cpsm_is_ct16 <= 1'b0;
+            load_csm1         <= 1'b0;
+            load_partial      <= 1'b0;
+            load_csa_base     <= 8'd0;
+            prev_cbp          <= 14'd0;
+            prev_cpsm         <= 4'd0;
+            prev_csa          <= 5'd0;
+        end else begin
+            unique case (state)
+                S_IDLE: if (start_load) begin
+                    state             <= S_LOAD;
+                    load_idx          <= 8'd0;
+                    // Freeze the parameters of this load.
+                    load_cbp          <= tex0_cbp;
+                    load_cpsm_is_ct16 <= (tex0_cpsm == CPSM_PSMCT16);
+                    load_csm1         <= (tex0_csm == 1'b0);
+                    load_partial      <= (tex0_cld == 3'd4);
+                    load_csa_base     <= {tex0_csa, 4'd0};  // truncates to 8 bits
+                    // Snapshot for future CLD=2/3 change detection;
+                    // only updated when a load actually starts.
+                    prev_cbp          <= tex0_cbp;
+                    prev_cpsm         <= tex0_cpsm;
+                    prev_csa          <= tex0_csa;
+                end
+                S_LOAD: begin
+                    load_idx <= load_idx + 8'd1;
+                    // The cycle presenting the terminal index is the
+                    // last write; return to idle afterwards.
+                    if (load_idx == load_terminal) begin
+                        state <= S_IDLE;
+                    end
+                end
+            endcase
+        end
+    end
+
+endmodule : clut_loader_stub
@@ -0,0 +1,80 @@
+// retroDE_ps2 — clut_stub (Ch97)
+//
+// Minimal palette RAM for indexed-color scanout. PSMT8 scanout
+// (Ch96) currently surfaces the index as grayscale; with this
+// CLUT wired in, the index is looked up to produce real RGB.
+//
+// Scope (intentionally minimal for Ch97):
+//   - 256 entries × 32 bits (PSMCT32 ABGR per entry). PSMT4
+//     (16 entries) uses the same RAM with a smaller index range.
+//   - CSM2 (linear) addressing only. Index N reads entry N. CSA
+//     (entry offset) is honored OUTSIDE this module — pcrtc
+//     computes effective_idx = idx + (CSA << 4) and presents it
+//     as `read_idx`. CSM1 (16×16 grid swizzle inside a CPSM
+//     block) is deferred.
+//   - Combinational read port for pcrtc (tight scanout latency).
+//   - Single registered write port. Two writers exist at this
+//     scope, picked by the wiring at the TB level:
+//       (a) TB-direct programming for tests that want to lock
+//           pcrtc-side decode in isolation (Ch97 PSMT8+CLUT TB,
+//           Ch98 TEX0_1 CSA-flow TB).
+//       (b) `clut_loader_stub` (Ch99/Ch100) — a small FSM that
+//           copies 256 entries from VRAM[CBP*256] into this RAM
+//           when a TEX0_1 GIF write commits with CLD!=0,
+//           CSM=CSM2, and CPSM ∈ {PSMCT32, PSMCT16}. PSMCT16
+//           entries are unpacked from RGB5A1 to PSMCT32 ABGR
+//           inside the loader, so clut_stub always stores
+//           PSMCT32 regardless of source. clut_stub doesn't know
+//           which writer is in play; it just commits whatever
+//           the wired write_* port carries.
+//
+// Real PS2 CLUT is held in a 1 KiB internal staging area and loaded
+// from VRAM[CBP] when CLD bits in TEX0 fire. Ch99/Ch100 modelled this
+// for CPSM ∈ {PSMCT32, PSMCT16} with CSM2 and deferred CSM1, conditional
+// CLD modes (2..7), CLD=4 CSA-window loads and other CPSMs; since then
+// clut_loader_stub added CLD=4 (Ch102) and a gated CSM1 path (Ch350).
+
+`timescale 1ns/1ps
+
+module clut_stub #(
+    // Palette depth; 256 covers the full range of the 8-bit index ports.
+    parameter int unsigned ENTRIES = 256
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // Registered write port, driven by whichever writer is wired in.
+    input  logic        write_en,
+    input  logic [7:0]  write_idx,
+    input  logic [31:0] write_data,
+
+    // Combinational read port for scanout (gs_pcrtc_stub).
+    input  logic [7:0]  read_idx,
+    output logic [31:0] read_data,
+
+    // Ch296: second combinational read port, for the texture sampler
+    // (gs_stub texel fetch); it is independent of the scanout port.
+    input  logic [7:0]  tex_read_idx,
+    output logic [31:0] tex_read_data
+);
+
+    // Palette storage: one 32-bit word per entry.
+    logic [31:0] mem [0:ENTRIES-1];
+
+    // Both read ports are plain combinational lookups into the table.
+    assign tex_read_data = mem[tex_read_idx];
+    assign read_data     = mem[read_idx];
+
+    // Every entry starts out as zero.
+    initial begin
+        for (int slot = 0; slot < ENTRIES; slot++) mem[slot] = '0;
+    end
+
+    // Writes are dropped while rst_n is low; reset never clears the table.
+    always_ff @(posedge clk) begin
+        if (rst_n) begin
+            if (write_en) mem[write_idx] <= write_data;
+        end
+    end
+
+endmodule : clut_stub
@@ -0,0 +1,131 @@
+// retroDE_ps2 — ee_gs_priv_bridge_stub (Ch111)
+//
+// Bridges 32-bit EE-MMIO writes targeting the GS privileged-
+// register window at 0x1200_0000 into the 64-bit gs_stub.reg_wr_*
+// port. Real PS2 driver code reaches PMODE / DISPFB1 / DISPLAY1
+// (etc.) via 64-bit MIPS `sd` instructions; the EE microarch
+// breaks each `sd` into a pair of 32-bit `sw` operations to the
+// low+high halves of the 8-byte register slot. This bridge does
+// the inverse — it watches the 32-bit EE write stream, latches a
+// 64-bit shadow per 8-byte slot, and fires a gs_stub.reg_wr_*
+// pulse on EVERY half-write with the running 64-bit shadow value.
+//
+// Scope:
+//   - One shared 64-bit shadow + an offset[15:3] tag identifying
+//     the currently-tracked 8-byte slot. Sequential writes to the
+//     SAME slot accumulate (low first, then high → final shadow
+//     has both halves correct on the second fire). Switching
+//     slots resets the shadow to zero so partial-half writes to
+//     a fresh slot don't carry stale data from a different reg.
+//   - Each EE half-write fires a gs_stub.reg_wr_* pulse with the
+//     8-byte-aligned offset (`{ee_wr_addr[15:3], 3'b000}`) and
+//     the FULL 64-bit shadow. Single-half writes (e.g. PMODE
+//     where only the low byte matters) work because the high
+//     half stays zero and gs_stub's latch sees the right value.
+//   - 32-bit EE write width (matches ee_memory_map_stub's
+//     ee_wr_*-port surface). **Full-word writes only**:
+//     `ee_wr_be` MUST be 4'b1111 on every accepted write. Byte-
+//     lane merging into the 64-bit shadow is intentionally NOT
+//     modelled here — control-plane GS registers (PMODE/
+//     DISPFB1/DISPLAY1/etc.) are always written as full 32-bit
+//     halves of an `sd`, and constraining the contract keeps the
+//     shadow + commit logic small. A simulation-time `$error`
+//     fires if a non-full be is presented; a future chapter can
+//     widen the bridge to per-byte merge if/when a real driver
+//     pattern needs sub-word writes here.
+//
+// Wiring contract (TB-level for Ch111):
+//   ee_wr_en   ← TB EE-MMIO write strobe at 0x12000000+offset
+//   ee_wr_addr ← 16-bit offset within the GS priv window (= EE
+//                phys addr [15:0]; the upper EE-window decode
+//                lives in the test bench / memory map)
+//   ee_wr_data ← 32-bit EE data (one of two halves of a 64-bit
+//                GS register)
+//   ee_wr_be   ← 4-bit per-byte enable (typically 4'b1111)
+//
+// The bridge does NOT participate in EE reads. The gs_stub
+// privileged-register port is write-only at this scope, matching
+// the limited read coverage of the GS priv block in the rest of
+// the design.
+
+`timescale 1ns/1ps
+
+module ee_gs_priv_bridge_stub (
+    input  logic         clk,
+    input  logic         rst_n,            // synchronous, active low
+
+    // EE side: 32-bit MMIO writes, addressed by a 16-bit offset
+    // inside the 0x1200_0000 window.
+    input  logic         ee_wr_en,
+    input  logic [15:0]  ee_wr_addr,
+    input  logic [31:0]  ee_wr_data,
+    input  logic [3:0]   ee_wr_be,
+
+    // GS side: privileged-register write (16-bit offset, 64-bit data).
+    output logic         gs_reg_wr_en,
+    output logic [15:0]  gs_reg_wr_addr,
+    output logic [63:0]  gs_reg_wr_data
+);
+
+    // Single shared shadow register. `shadow_tag` holds offset[15:3]
+    // of the 8-byte slot the shadow currently describes and
+    // `shadow_valid` says whether it describes any slot at all.
+    logic [63:0] shadow;
+    logic [12:0] shadow_tag;
+    logic        shadow_valid;
+
+    // Decode of the incoming write: slot tag, half select, and the
+    // shadow value that results from merging this write.
+    logic [12:0] cur_tag;
+    logic        cur_is_high;
+    logic [63:0] new_shadow;
+
+    assign cur_is_high = ee_wr_addr[2];
+    assign cur_tag     = ee_wr_addr[15:3];
+
+    always_comb begin
+        logic [63:0] merge_base;
+        // Start from the stored shadow only when it belongs to the
+        // slot being written; any other slot (or an empty shadow)
+        // starts from zero, so the half not written here reads 0.
+        merge_base = (shadow_valid && shadow_tag == cur_tag) ? shadow
+                                                             : 64'd0;
+        // Drop the 32-bit write data into the addressed half.
+        case (cur_is_high)
+            1'b1:    new_shadow = {ee_wr_data, merge_base[31:0]};
+            default: new_shadow = {merge_base[63:32], ee_wr_data};
+        endcase
+    end
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            gs_reg_wr_en   <= 1'b0;
+            gs_reg_wr_addr <= '0;
+            gs_reg_wr_data <= '0;
+            shadow_valid   <= 1'b0;
+            shadow_tag     <= '0;
+            shadow         <= '0;
+        end else if (ee_wr_en) begin
+            // Full-word writes are the only supported form; byte-lane
+            // merging is not modelled. A violation is reported but the
+            // write is still committed as if all four bytes were enabled.
+            if (ee_wr_be !== 4'b1111) begin
+                $error("ee_gs_priv_bridge_stub: ee_wr_be=%b — only 4'b1111 supported (full-word writes); offset=0x%04x data=0x%08h",
+                       ee_wr_be, ee_wr_addr, ee_wr_data);
+            end
+            // Track the slot and remember the merged value.
+            shadow_valid   <= 1'b1;
+            shadow_tag     <= cur_tag;
+            shadow         <= new_shadow;
+            // Fire a one-cycle GS write carrying the whole shadow at
+            // the 8-byte-aligned offset.
+            gs_reg_wr_en   <= 1'b1;
+            gs_reg_wr_addr <= {cur_tag, 3'b000};
+            gs_reg_wr_data <= new_shadow;
+        end else begin
+            // No write this cycle: drop the strobe, hold everything else.
+            gs_reg_wr_en   <= 1'b0;
+        end
+    end
+
+endmodule : ee_gs_priv_bridge_stub
@@ -0,0 +1,663 @@
+// retroDE_ps2 — gif_image_xfer_stub (Ch110)
+//
+// Host→local image-transfer engine. On a TRXDIR write that arms a
+// host→local upload (XDIR == 0), the engine snapshots the
+// already-latched BITBLTBUF / TRXPOS / TRXREG fields and consumes
+// IMAGE-mode quadwords from gif_packed_stub, unpacking them into
+// per-pixel VRAM writes at the destination region defined by
+// (DBP, DBW, DPSM, DSAX, DSAY, RRW, RRH).
+//
+// Scope (after Ch139):
+//   - PSMCT32 (DPSM == 6'h00): 4 bytes/pixel, 4 pixels/qword,
+//     row_stride = DBW * 256, write_be = 4'b1111, mask=0xFFFFFFFF.
+//   - PSMCT16 (DPSM == 6'h02): 2 bytes/pixel, 8 pixels/qword,
+//     row_stride = DBW * 128, write_be = 4'b0011, mask=0xFFFFFFFF.
+//   - PSMT8 (DPSM == 6'h13): 1 byte/pixel (an 8-bit CLUT index),
+//     16 pixels/qword, row_stride = DBW * 64, write_be = 4'b0001,
+//     mask = 0xFFFFFFFF.
+//   - PSMT4 (DPSM == 6'h14): 0.5 bytes/pixel (a 4-bit CLUT index),
+//     32 pixels/qword (2 px/byte × 16 bytes), row_stride = DBW * 32,
+//     write_be = 4'b0001 with a per-emit nibble mask: 0x0000_000F
+//     for the LOW nibble of the byte (when (DSAX+x) is even) or
+//     0x0000_00F0 for the HIGH nibble (when (DSAX+x) is odd). The
+//     4-bit index sits at the matching nibble position in
+//     write_data[7:0]; vram_stub's per-bit merge commits exactly
+//     that nibble — the OTHER nibble of the same byte is preserved.
+//     Back-to-back emits to the same byte (e.g. x=0 + x=1 of the
+//     same row) chain through NBA semantics without bypass logic
+//     (same trick the raster channel uses since Ch106).
+//   - Other PSMs (PSMCT24/PSMZ-*): the engine still consumes
+//     IMAGE qwords (so gif_packed_stub doesn't desync) but emits
+//     zero VRAM writes. Lane cadence falls back to PSMCT32
+//     (4 lanes/qword).
+//   - Addressing: linear by DEFAULT — the destination address
+//     math is
+//       dest_base  = DBP * 256
+//       row_stride = DBW * 64 * bpp
+//       addr(x, y) = dest_base + (DSAY + y) * row_stride
+//                              + (DSAX + x) * bpp
+//     Four OPTIONAL per-PSM swizzle paths gated by parameters:
+//     `PSMCT32_SWIZZLE=1` (Ch121) routes PSMCT32 uploads through
+//     gs_swizzle_psmct32_stub; `PSMCT16_SWIZZLE=1` (Ch127) routes
+//     PSMCT16 uploads through gs_swizzle_psmct16_stub;
+//     `PSMT8_SWIZZLE=1` (Ch133) routes PSMT8 uploads through
+//     gs_swizzle_psmt8_stub (page=128×64 px, bw_pg=DBW>>1 — DBW
+//     must be even for PSMT8); `PSMT4_SWIZZLE=1` (Ch139) routes
+//     PSMT4 uploads through gs_swizzle_psmt4_stub (page=128×128
+//     px, bw_pg=DBW>>1 — DBW must be even for PSMT4 too; module
+//     also outputs nibble_hi selector since PSMT4 packs 2 pixels
+//     per byte). In all four cases the per-pixel byte address is
+//     `dest_base + swizzle(FBP=0, FBW=DBW, x=DSAX+cur_x,
+//     y=DSAY+cur_y)`. The PSMT4 path additionally uses the
+//     swizzle's `nibble_hi` output (instead of the linear
+//     formula's x_eff[0]) to pick which nibble of the byte gets
+//     the upload's 4-bit pixel — the existing Ch118 nibble RMW
+//     write-mask machinery (write_be=4'b0001, write_mask=
+//     0x0F or 0xF0) layers on top of the swizzled byte address.
+//     The four parameters are independent. All four parameter
+//     defaults are 0 → legacy linear behavior.
+//   - One pending qword buffer + a 5-bit lane counter (0..3 for
+//     PSMCT32, 0..7 for PSMCT16, 0..15 for PSMT8, 0..31 for
+//     PSMT4; the last-lane index is snapshotted at TRXDIR-arm
+//     time per `lane_last_q`).
+//     Backpressure to the upstream is exposed via `data_ready`.
+//     Wired into `gif_packed_stub.image_data_ready` (Ch110), so
+//     the GIF gates `in_ready` only in S_IMAGE state with FLG=2;
+//     the DMAC's ep_ready follows gif_in_ready directly. Outside
+//     S_IMAGE the gate is a no-op.
+//
+// Wiring contract (TB-level):
+//   trxdir_wr_pulse ← gs_stub.trxdir_wr_q
+//   trxdir          ← gs_stub.trxdir_q
+//   bitbltbuf       ← gs_stub.bitbltbuf_q
+//   trxpos          ← gs_stub.trxpos_q
+//   trxreg          ← gs_stub.trxreg_q
+//   data_valid      ← gif_packed_stub.image_data_valid
+//   data_qword      ← gif_packed_stub.image_data
+//   data_last       ← gif_packed_stub.image_data_last
+//   data_ready      → gif_packed_stub.image_data_ready (Ch110).
+//                     The GIF FSM uses it to gate in_ready only in
+//                     S_IMAGE+FLG=2; dmac.ep_ready follows
+//                     gif.in_ready directly (no TB-level AND).
+//   vram_we / waddr / wdata / wbe / wmask → muxed into vram_stub's
+//     write port (the TB selects between the engine, the raster
+//     channel, and any TB-direct path).
+//   busy → high while a transfer is active (between trxdir_wr arm
+//          and the last lane emit). TB uses this for the vram_stub
+//          write-port mux.
+//
+// What this stub does NOT do:
+//   - Source-direction (local→host or local→local) transfers.
+//   - PSMCT24 / PSMZ-* image transfers (not currently exercised
+//     in the demo flow).
+//   - Mid-transfer TRXDIR re-arm or interleaving with REGLIST.
+//   - HWREG-side legacy/non-PSM-aware swizzle (out of scope —
+//     PSMCT32 since Ch121, PSMCT16 since Ch127, PSMT8 since
+//     Ch133, PSMT4 since Ch139 all support the canonical PCSX2
+//     swizzle behind their respective parameter gates).
+//   - HWREG via privileged-MMIO (the real PS2 path that reads
+//     pixel data through the privileged HWREG register at
+//     0x12001000); IMAGE-mode GIF qwords are the only data
+//     source modelled here.
+
+`timescale 1ns/1ps
+
+module gif_image_xfer_stub
+    import trace_pkg::*;
+#(
+    // Ch121 — when set, PSMCT32 uploads compute the per-pixel VRAM
+    // byte address via the real PS2 GS page/block swizzle
+    // (gs_swizzle_psmct32_stub) instead of the legacy linear formula
+    // `dest_base + (DSAY+y)*row_stride + (DSAX+x)*4`. Other PSMs
+    // are not affected by this parameter — PSMCT16 has its own
+    // gate (PSMCT16_SWIZZLE, Ch127), PSMT8 has PSMT8_SWIZZLE
+    // (Ch133), PSMT4 has PSMT4_SWIZZLE (Ch139, see below).
+    // Default 0 keeps every existing PSMCT32 image-xfer TB on
+    // the original linear addressing — its expectations don't
+    // change.
+    parameter bit PSMCT32_SWIZZLE = 1'b0,
+
+    // Ch127 — when set, PSMCT16 uploads compute the per-pixel VRAM
+    // byte address via the canonical PS2 GS page/block/column
+    // swizzle (gs_swizzle_psmct16_stub) instead of the legacy
+    // linear formula `dest_base + (DSAY+y)*row_stride +
+    // (DSAX+x)*2`. PSMCT32 / PSMT8 / PSMT4 are governed by their
+    // own gates (PSMCT32_SWIZZLE / PSMT8_SWIZZLE / PSMT4_SWIZZLE).
+    // Default 0 keeps every existing PSMCT16 image-xfer
+    // TB on the legacy linear path. Mirrors the Ch126 PCRTC
+    // read-side wiring at the upload write side, completing
+    // the second integration point for the Ch125 PSMCT16
+    // primitive.
+    parameter bit PSMCT16_SWIZZLE = 1'b0,
+
+    // Ch133 — when set, PSMT8 uploads compute the per-pixel VRAM
+    // byte address via the canonical PS2 GS page/block/column
+    // swizzle (gs_swizzle_psmt8_stub) instead of the legacy
+    // linear formula `dest_base + (DSAY+y)*row_stride +
+    // (DSAX+x)*1`. PSMT8 pages are 128 px wide so the swizzle
+    // internally uses `bw_pg = DBW >> 1` — PCSX2 asserts DBW must
+    // be even for PSMT8 at GSLocalMemory.h:553. PSMCT32 / PSMCT16
+    // / PSMT4 are governed by their own gates.
+    // Default 0 keeps every existing PSMT8 image-xfer TB
+    // (Ch117 PSMT8, Ch107 PSMT4-via-CT16-CLUT palette path) on
+    // the legacy linear addressing. Mirrors the Ch132 PCRTC
+    // read-side wiring at the upload write side, completing
+    // the second integration point for the Ch131 PSMT8 primitive.
+    parameter bit PSMT8_SWIZZLE = 1'b0,
+
+    // Ch139 — when set, PSMT4 uploads compute the per-pixel VRAM
+    // byte address via the canonical PS2 GS page/block/column
+    // swizzle (gs_swizzle_psmt4_stub) instead of the legacy
+    // linear formula `dest_base + (DSAY+y)*row_stride +
+    // (DSAX+x)*0.5`. PSMT4 pages are 128 px wide AND 128 px tall;
+    // the swizzle internally uses `bw_pg = DBW >> 1` — PCSX2
+    // asserts DBW must be even for PSMT4 at GSLocalMemory.h:560.
+    // The PSMT4 swizzle module also outputs a `nibble_hi`
+    // selector that picks which nibble of the byte at the
+    // swizzled address holds this pixel — the linear formula's
+    // x_eff[0] selector is wrong under the swizzled layout
+    // because the canonical PCSX2 column table reorders nibbles
+    // within a block. The existing Ch118 nibble RMW machinery
+    // (write_be=4'b0001 + write_mask 0x0F or 0xF0) layers on top
+    // of the swizzled byte address: the mask is selected by the
+    // swizzle's nibble_hi when this gate is on, instead of by
+    // x_eff[0]. PSMCT32 / PSMCT16 / PSMT8 are governed by their
+    // own gates. Default 0 keeps every existing PSMT4 image-xfer
+    // TB (Ch118 PSMT4, Ch107 PSMT4-e2e palette path) on the
+    // legacy linear addressing. Mirrors the Ch138 PCRTC
+    // read-side wiring at the upload write side, completing the
+    // second integration point for the Ch137 PSMT4 primitive.
+    parameter bit PSMT4_SWIZZLE = 1'b0
+)(
+    input  logic         clk,
+    input  logic         rst_n,
+
+    // Arm input — pulses for one cycle on TRXDIR commit.
+    input  logic         trxdir_wr_pulse,
+    input  logic [63:0]  trxdir,
+    input  logic [63:0]  bitbltbuf,
+    input  logic [63:0]  trxpos,
+    input  logic [63:0]  trxreg,
+
+    // IMAGE qword stream from gif_packed_stub.
+    input  logic         data_valid,
+    input  logic [127:0] data_qword,
+    input  logic         data_last,
+    output logic         data_ready,
+
+    // VRAM write port. PSM-aware be + per-bit merge mask:
+    //   PSMCT32 (Ch110): be = 4'b1111, mask = 0xFFFFFFFF.
+    //   PSMCT16 (Ch116): be = 4'b0011, mask = 0xFFFFFFFF.
+    //   PSMT8   (Ch117): be = 4'b0001, mask = 0xFFFFFFFF.
+    //   PSMT4   (Ch118): be = 4'b0001, mask = 0x0000_000F (low
+    //                    nibble) or 0x0000_00F0 (high nibble),
+    //                    keyed by (DSAX+x)[0]. The 4-bit index
+    //                    sits at the matching nibble position in
+    //                    write_data[7:0]; vram_stub merges only
+    //                    the targeted nibble.
+    output logic         vram_we,
+    output logic [31:0]  vram_waddr,
+    output logic [31:0]  vram_wdata,
+    output logic [3:0]   vram_wbe,
+    output logic [31:0]  vram_wmask,
+
+    // Engine status.
+    output logic         busy
+);
+
+    // BITBLTBUF field decode (real PS2 layout, per PCSX2 GSRegs.h):
+    //   [13:0]   SBP
+    //   [21:16]  SBW
+    //   [29:24]  SPSM
+    //   [45:32]  DBP
+    //   [53:48]  DBW
+    //   [61:56]  DPSM
+    logic [13:0] dbp;
+    logic [5:0]  dbw;
+    logic [5:0]  dpsm;
+    assign dbp  = bitbltbuf[45:32];
+    assign dbw  = bitbltbuf[53:48];
+    assign dpsm = bitbltbuf[61:56];
+
+    // TRXPOS field decode:
+    //   [10:0]   SSAX
+    //   [26:16]  SSAY
+    //   [42:32]  DSAX
+    //   [58:48]  DSAY
+    //   [60:59]  DIR
+    logic [10:0] dsax;
+    logic [10:0] dsay;
+    assign dsax = trxpos[42:32];
+    assign dsay = trxpos[58:48];
+
+    // TRXREG field decode:
+    //   [11:0]   RRW
+    //   [43:32]  RRH
+    logic [11:0] rrw;
+    logic [11:0] rrh;
+    assign rrw = trxreg[11:0];
+    assign rrh = trxreg[43:32];
+
+    // TRXDIR field decode (XDIR is bits [1:0]).
+    logic [1:0] xdir;
+    assign xdir = trxdir[1:0];
+
+    // Snapshotted transfer parameters (latched at trxdir_wr arm).
+    logic [13:0] dbp_q;
+    logic [5:0]  dbw_q;
+    logic [5:0]  dpsm_q;
+    logic [10:0] dsax_q;
+    logic [10:0] dsay_q;
+    logic [11:0] rrw_q;
+    logic [11:0] rrh_q;
+    logic [31:0] dest_base_q;        // DBP * 256 (bytes)
+    logic [31:0] row_stride_q;       // DBW * 64 * bpp
+    logic        psmct32_q;          // DPSM == 0x00 → 4 bytes/pixel
+    logic        psmct16_q;          // DPSM == 0x02 → 2 bytes/pixel (Ch116)
+    logic        psmt8_q;             // DPSM == 0x13 → 1 byte/pixel  (Ch117)
+    logic        psmt4_q;             // DPSM == 0x14 → 0.5 byte/pixel (Ch118)
+    // Last-lane index for the current PSM (3 for PSMCT32 → 4
+    // lanes, 7 for PSMCT16 → 8 lanes, 15 for PSMT8 → 16 lanes,
+    // 31 for PSMT4 → 32 lanes).
+    // Other PSMs use the PSMCT32 cadence (3) for silent consume.
+    logic [4:0]  lane_last_q;
+
+    // Per-emit progression: which qword (0..NLOOP-1) and which
+    // lane within the qword (0..3 for PSMCT32, 0..7 for PSMCT16,
+    // 0..15 for PSMT8, 0..31 for PSMT4).
+    logic [127:0] qword_q;
+    logic [4:0]   lane_q;            // widened to 5 bits for PSMT4
+    logic         lane_valid_q;      // a buffered qword is being drained
+
+    // Pixel cursor (cur_x, cur_y) within the destination rect,
+    // measured from (DSAX, DSAY). Wrap at RRW.
+    logic [11:0] cur_x_q;
+    logic [11:0] cur_y_q;
+    logic [23:0] pix_total_q;        // RRW * RRH (cap 16M)
+    logic [23:0] pix_done_q;
+
+    // FSM.
+    typedef enum logic [1:0] {
+        S_IDLE  = 2'd0,
+        S_RUN   = 2'd1
+    } state_e;
+    state_e state;
+
+    assign busy = (state == S_RUN);
+
+    // The engine is "ready" for a new qword when no qword is
+    // currently being drained. In S_IDLE we admit qwords too —
+    // upstream image_data_valid won't pulse outside an active
+    // S_IMAGE state, so this is benign.
+    assign data_ready = !lane_valid_q;
+
+    // Combinational pixel address for the in-flight lane.
+    //   PSMCT32: addr = dest_base + (DSAY+cur_y) * row_stride
+    //                              + (DSAX+cur_x) * 4
+    //   PSMCT16: addr = dest_base + (DSAY+cur_y) * row_stride
+    //                              + (DSAX+cur_x) * 2
+    //   PSMT8  : addr = dest_base + (DSAY+cur_y) * row_stride
+    //                              + (DSAX+cur_x) * 1
+    //   PSMT4  : addr = dest_base + (DSAY+cur_y) * row_stride
+    //                              + ((DSAX+cur_x) >> 1)
+    //            nibble = (DSAX+cur_x)[0] high vs low
+    // (row_stride already encodes the bpp factor.)
+    logic [31:0] cur_addr_c;
+    logic [31:0] cur_data_c;
+    logic [3:0]  cur_be_c;
+    logic [31:0] cur_mask_c;
+    always_comb begin
+        logic [31:0] x_off;
+        logic [11:0] x_eff;
+        logic [3:0]  t4_nibble;
+        x_eff = dsax_q + cur_x_q;
+        if (psmt4_q)
+            x_off = {21'd0, x_eff[11:1]};                   // (x_eff >> 1)
+        else if (psmt8_q)
+            x_off = ({20'd0, dsax_q} + {20'd0, cur_x_q});
+        else if (psmct16_q)
+            x_off = ({20'd0, dsax_q} + {20'd0, cur_x_q}) * 32'd2;
+        else
+            x_off = ({20'd0, dsax_q} + {20'd0, cur_x_q}) * 32'd4;
+        cur_addr_c = dest_base_q
+                   + (32'(dsay_q) + 32'(cur_y_q)) * row_stride_q
+                   + x_off;
+        // PSMT4: extract the 4-bit nibble at lane_q from qword_q.
+        // qword[lane*4 +: 4] for lane in 0..31. iverilog 12 supports
+        // indexed part-select with variable base + constant width.
+        t4_nibble = qword_q[(5'(lane_q) * 4) +: 4];
+        if (psmt4_q) begin
+            // 32 PSMT4 pixels per qword (2 px/byte × 16 bytes).
+            // Place the 4-bit index at the matching nibble position
+            // in write_data[7:0] keyed by the nibble selector.
+            // Linear (PSMT4_SWIZZLE=0): x_eff[0] is the selector
+            // (low nibble = even pixel, high = odd pixel).
+            // Swizzled (Ch139, PSMT4_SWIZZLE=1): the swizzle module
+            // outputs `nibble_hi` directly — required because the
+            // canonical PCSX2 columnTable4 reorders nibbles within
+            // a block, so x_eff[0] is no longer correct. write_be
+            // is 4'b0001 (single-byte commit) and write_mask
+            // gates the targeted nibble; vram_stub merges only
+            // that nibble, preserving the OTHER nibble of the
+            // same byte.
+            logic psmt4_nibble_select;
+            psmt4_nibble_select = PSMT4_SWIZZLE ? swizzle4_nibble_hi
+                                                : x_eff[0];
+            if (psmt4_nibble_select) begin
+                cur_data_c = {24'd0, t4_nibble, 4'd0};      // high nibble
+                cur_mask_c = 32'h0000_00F0;
+            end else begin
+                cur_data_c = {24'd0, 4'd0, t4_nibble};      // low nibble
+                cur_mask_c = 32'h0000_000F;
+            end
+            cur_be_c = 4'b0001;
+        end else if (psmt8_q) begin
+            // 16 PSMT8 pixels per qword. Place the 8-bit index in
+            // the LOW byte of write_data; vram_stub's per-byte BE
+            // commits exactly 1 byte at the exact pixel address
+            // (write_addr = cur_addr_c) at any byte alignment.
+            cur_mask_c = 32'hFFFF_FFFF;
+            unique case (lane_q[3:0])
+                4'd0:  cur_data_c = {24'd0, qword_q[  7:  0]};
+                4'd1:  cur_data_c = {24'd0, qword_q[ 15:  8]};
+                4'd2:  cur_data_c = {24'd0, qword_q[ 23: 16]};
+                4'd3:  cur_data_c = {24'd0, qword_q[ 31: 24]};
+                4'd4:  cur_data_c = {24'd0, qword_q[ 39: 32]};
+                4'd5:  cur_data_c = {24'd0, qword_q[ 47: 40]};
+                4'd6:  cur_data_c = {24'd0, qword_q[ 55: 48]};
+                4'd7:  cur_data_c = {24'd0, qword_q[ 63: 56]};
+                4'd8:  cur_data_c = {24'd0, qword_q[ 71: 64]};
+                4'd9:  cur_data_c = {24'd0, qword_q[ 79: 72]};
+                4'd10: cur_data_c = {24'd0, qword_q[ 87: 80]};
+                4'd11: cur_data_c = {24'd0, qword_q[ 95: 88]};
+                4'd12: cur_data_c = {24'd0, qword_q[103: 96]};
+                4'd13: cur_data_c = {24'd0, qword_q[111:104]};
+                4'd14: cur_data_c = {24'd0, qword_q[119:112]};
+                default: cur_data_c = {24'd0, qword_q[127:120]};
+            endcase
+            cur_be_c = 4'b0001;
+        end else if (psmct16_q) begin
+            // 8 PSMCT16 pixels per qword. Place the 16-bit value
+            // in the LOW halfword of write_data; vram_stub's per-
+            // byte BE commits exactly 2 bytes at the 2-byte-
+            // aligned pixel address (write_addr = cur_addr_c).
+            cur_mask_c = 32'hFFFF_FFFF;
+            unique case (lane_q[2:0])
+                3'd0: cur_data_c = {16'd0, qword_q[ 15:  0]};
+                3'd1: cur_data_c = {16'd0, qword_q[ 31: 16]};
+                3'd2: cur_data_c = {16'd0, qword_q[ 47: 32]};
+                3'd3: cur_data_c = {16'd0, qword_q[ 63: 48]};
+                3'd4: cur_data_c = {16'd0, qword_q[ 79: 64]};
+                3'd5: cur_data_c = {16'd0, qword_q[ 95: 80]};
+                3'd6: cur_data_c = {16'd0, qword_q[111: 96]};
+                default: cur_data_c = {16'd0, qword_q[127:112]};
+            endcase
+            cur_be_c = 4'b0011;
+        end else begin
+            // PSMCT32: 4 pixels per qword, full 32-bit.
+            cur_mask_c = 32'hFFFF_FFFF;
+            unique case (lane_q[1:0])
+                2'd0:    cur_data_c = qword_q[ 31:  0];
+                2'd1:    cur_data_c = qword_q[ 63: 32];
+                2'd2:    cur_data_c = qword_q[ 95: 64];
+                default: cur_data_c = qword_q[127: 96];
+            endcase
+            cur_be_c = 4'b1111;
+        end
+    end
+
+    // Ch121 — optional PSMCT32 swizzled write address.
+    //
+    // When PSMCT32_SWIZZLE=1 AND the active PSM is PSMCT32, route
+    // the per-pixel byte address through gs_swizzle_psmct32_stub
+    // instead of the linear formula. The swizzle module gives
+    // a within-FB byte offset relative to FBP=0; we add dest_base_q
+    // (= DBP*256) to anchor the upload at the same DBP-relative
+    // base the linear path uses. dbw_q feeds the swizzle's FBW
+    // input directly (both are in 64-pixel units, matching the
+    // PSMCT32 page = 64 px wide convention). The per-pixel x and
+    // y inputs are the FULL effective coordinates (DSAX+cur_x,
+    // DSAY+cur_y), so the swizzle correctly handles non-zero
+    // DSAX/DSAY uploads as well.
+    //
+    // Other PSMs are governed by their own dispatch branches in
+    // the per-PSM mux below (PSMCT16 via PSMCT16_SWIZZLE Ch127,
+    // PSMT8 via PSMT8_SWIZZLE Ch133, PSMT4 via PSMT4_SWIZZLE
+    // Ch139). With PSMCT32_SWIZZLE=0 the PSMCT32 path falls
+    // through to cur_addr_c. The swizzle module is purely
+    // combinational; when its gate is off its output is unused
+    // and the synthesizer trims it.
+    logic [31:0] cur_addr_swizzled_c;
+    logic [11:0] swizzle_x_in;
+    logic [11:0] swizzle_y_in;
+    assign swizzle_x_in = dsax_q + cur_x_q;
+    assign swizzle_y_in = dsay_q + cur_y_q;
+    logic [31:0] swizzle_addr_off;
+    gs_swizzle_psmct32_stub u_swizzle (
+        .fbp (9'd0),
+        .fbw (dbw_q),
+        .x   (swizzle_x_in),
+        .y   (swizzle_y_in),
+        .addr(swizzle_addr_off)
+    );
+    assign cur_addr_swizzled_c = dest_base_q + swizzle_addr_off;
+
+    // Ch127 — optional PSMCT16 swizzled write address. Same shape
+    // as Ch121 above but uses gs_swizzle_psmct16_stub. The PSMCT16
+    // page (64×64) and block grid (4 cols × 8 rows of 16×8 blocks)
+    // and within-block columnTable16 are all baked into that
+    // module — we just feed it `dbw_q` as FBW and the full
+    // effective coords. dest_base_q (= DBP*256) is added on top
+    // so any DBP works; the swizzle module is given FBP=0 so its
+    // output is the within-FB byte offset only.
+    logic [31:0] cur_addr_swizzled16_c;
+    logic [31:0] swizzle16_addr_off;
+    gs_swizzle_psmct16_stub u_swizzle16 (
+        .fbp (9'd0),
+        .fbw (dbw_q),
+        .x   (swizzle_x_in),
+        .y   (swizzle_y_in),
+        .addr(swizzle16_addr_off)
+    );
+    assign cur_addr_swizzled16_c = dest_base_q + swizzle16_addr_off;
+
+    // Ch133 — optional PSMT8 swizzled write address. Same shape as
+    // Ch121 / Ch127 above but uses gs_swizzle_psmt8_stub. PSMT8
+    // pages are 128 px wide so the swizzle internally uses
+    // bw_pg = DBW>>1 (PCSX2 asserts DBW must be even for PSMT8).
+    // dest_base_q (= DBP*256) is added on top so any DBP works;
+    // the swizzle module is given FBP=0 so its output is the
+    // within-FB byte offset only.
+    logic [31:0] cur_addr_swizzled8_c;
+    logic [31:0] swizzle8_addr_off;
+    gs_swizzle_psmt8_stub u_swizzle8 (
+        .fbp (9'd0),
+        .fbw (dbw_q),
+        .x   (swizzle_x_in),
+        .y   (swizzle_y_in),
+        .addr(swizzle8_addr_off)
+    );
+    assign cur_addr_swizzled8_c = dest_base_q + swizzle8_addr_off;
+
+    // Ch139 — optional PSMT4 swizzled write address. Same wiring
+    // shape as Ch121/Ch127/Ch133 but uses gs_swizzle_psmt4_stub,
+    // which outputs both an absolute byte address AND a
+    // `nibble_hi` selector. PSMT4 pages are 128 px wide AND tall;
+    // the swizzle internally uses bw_pg=DBW>>1 (PCSX2 asserts
+    // DBW must be even for PSMT4). dest_base_q (= DBP*256) is
+    // added on top so any DBP works; the swizzle module is given
+    // FBP=0 so its addr output is the within-FB byte offset only.
+    // The nibble_hi output threads into the PSMT4 data lane mux
+    // below: when this gate is on AND psmt4_q, the existing Ch118
+    // nibble RMW machinery (write_be=4'b0001, write_mask 0x0F or
+    // 0xF0) keys on the swizzle's nibble_hi instead of x_eff[0].
+    logic [31:0] cur_addr_swizzled4_c;
+    logic [31:0] swizzle4_addr_off;
+    logic        swizzle4_nibble_hi;
+    gs_swizzle_psmt4_stub u_swizzle4 (
+        .fbp      (9'd0),
+        .fbw      (dbw_q),
+        .x        (swizzle_x_in),
+        .y        (swizzle_y_in),
+        .addr     (swizzle4_addr_off),
+        .nibble_hi(swizzle4_nibble_hi)
+    );
+    assign cur_addr_swizzled4_c = dest_base_q + swizzle4_addr_off;
+
+    // VRAM write outputs — pulse for one cycle per pixel emit.
+    // Only fire when DPSM is supported (PSMCT32, PSMCT16, PSMT8,
+    // or PSMT4). Other PSMs still consume qwords lane-by-lane to
+    // keep gif_packed_stub from desync, but no VRAM write happens.
+    logic emit_now;
+    assign emit_now = lane_valid_q &&
+                      (psmct32_q || psmct16_q || psmt8_q || psmt4_q);
+
+    // Per-PSM swizzle dispatch. The four parameters are
+    // independent; defaults of 0 keep every PSM on the legacy
+    // linear path.
+    assign vram_we    = emit_now;
+    assign vram_waddr = (PSMCT32_SWIZZLE && psmct32_q) ? cur_addr_swizzled_c   :
+                        (PSMCT16_SWIZZLE && psmct16_q) ? cur_addr_swizzled16_c :
+                        (PSMT8_SWIZZLE   && psmt8_q)   ? cur_addr_swizzled8_c  :
+                        (PSMT4_SWIZZLE   && psmt4_q)   ? cur_addr_swizzled4_c  :
+                                                         cur_addr_c;
+    assign vram_wdata = cur_data_c;
+    assign vram_wbe   = cur_be_c;
+    assign vram_wmask = cur_mask_c;
+
+    // Compute target pixel count = RRW * RRH (24-bit cap is fine
+    // for any palette/texture upload we model here).
+    logic [23:0] pix_total_calc;
+    assign pix_total_calc = {12'd0, rrw} * {12'd0, rrh};
+
+    // Step / wrap logic (on the cycle a pixel emits).
+    logic [11:0] next_x;
+    logic [11:0] next_y;
+    logic        wrap_row;
+    assign wrap_row = (cur_x_q + 12'd1 == rrw_q);
+    assign next_x   = wrap_row ? 12'd0           : (cur_x_q + 12'd1);
+    assign next_y   = wrap_row ? (cur_y_q + 12'd1) : cur_y_q;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state         <= S_IDLE;
+            dbp_q         <= 14'd0;
+            dbw_q         <= 6'd0;
+            dpsm_q        <= 6'd0;
+            dsax_q        <= 11'd0;
+            dsay_q        <= 11'd0;
+            rrw_q         <= 12'd0;
+            rrh_q         <= 12'd0;
+            dest_base_q   <= 32'd0;
+            row_stride_q  <= 32'd0;
+            psmct32_q     <= 1'b0;
+            psmct16_q     <= 1'b0;
+            psmt8_q       <= 1'b0;
+            psmt4_q       <= 1'b0;
+            lane_last_q   <= 5'd3;
+            qword_q       <= 128'd0;
+            lane_q        <= 5'd0;
+            lane_valid_q  <= 1'b0;
+            cur_x_q       <= 12'd0;
+            cur_y_q       <= 12'd0;
+            pix_total_q   <= 24'd0;
+            pix_done_q    <= 24'd0;
+        end else begin
+            unique case (state)
+                S_IDLE: begin
+                    if (trxdir_wr_pulse && (xdir == 2'd0)) begin
+                        logic is_ct32, is_ct16, is_t8, is_t4;
+                        is_ct32 = (dpsm == 6'h00);
+                        is_ct16 = (dpsm == 6'h02);
+                        is_t8   = (dpsm == 6'h13);
+                        is_t4   = (dpsm == 6'h14);
+                        // Snapshot all transfer params.
+                        dbp_q        <= dbp;
+                        dbw_q        <= dbw;
+                        dpsm_q       <= dpsm;
+                        dsax_q       <= dsax;
+                        dsay_q       <= dsay;
+                        rrw_q        <= rrw;
+                        rrh_q        <= rrh;
+                        dest_base_q  <= {18'd0, dbp} << 8;
+                        // row_stride = DBW * 64 * bpp:
+                        //   PSMCT32 → DBW * 256  (DBW << 8)
+                        //   PSMCT16 → DBW * 128  (DBW << 7)
+                        //   PSMT8   → DBW *  64  (DBW << 6)
+                        //   PSMT4   → DBW *  32  (DBW << 5)
+                        //   other   → fall back to PSMCT32-stride
+                        //             (no VRAM emit anyway)
+                        row_stride_q <= is_t4   ? ({18'd0, dbw} << 5)
+                                      : is_t8   ? ({18'd0, dbw} << 6)
+                                      : is_ct16 ? ({18'd0, dbw} << 7)
+                                                : ({18'd0, dbw} << 8);
+                        psmct32_q    <= is_ct32;
+                        psmct16_q    <= is_ct16;
+                        psmt8_q      <= is_t8;
+                        psmt4_q      <= is_t4;
+                        // Lanes/qword: 4 (PSMCT32) → last=3,
+                        // 8 (PSMCT16) → last=7, 16 (PSMT8) → last=15,
+                        // 32 (PSMT4) → last=31. Other PSMs use the
+                        // PSMCT32 cadence (silent consume).
+                        lane_last_q  <= is_t4   ? 5'd31
+                                      : is_t8   ? 5'd15
+                                      : is_ct16 ? 5'd7
+                                                : 5'd3;
+                        cur_x_q      <= 12'd0;
+                        cur_y_q      <= 12'd0;
+                        pix_total_q  <= pix_total_calc;
+                        pix_done_q   <= 24'd0;
+                        lane_valid_q <= 1'b0;
+                        state        <= S_RUN;
+                    end
+                end
+
+                S_RUN: begin
+                    if (!lane_valid_q && data_valid && data_ready) begin
+                        // Latch a fresh qword to drain.
+                        qword_q      <= data_qword;
+                        lane_q       <= 5'd0;
+                        lane_valid_q <= 1'b1;
+                    end else if (lane_valid_q) begin
+                        // Drain one lane per cycle. Step the cursor
+                        // when the emit fires (supported PSM or not —
+                        // for unsupported PSM, no VRAM write fires
+                        // but we still consume the lane to keep
+                        // gif_packed_stub from desync).
+                        if (lane_q == lane_last_q) begin
+                            lane_valid_q <= 1'b0;
+                            lane_q       <= 5'd0;
+                        end else begin
+                            lane_q <= lane_q + 5'd1;
+                        end
+
+                        // Pixel-cursor + done-count step.
+                        cur_x_q   <= next_x;
+                        cur_y_q   <= next_y;
+                        pix_done_q <= pix_done_q + 24'd1;
+
+                        // Did this lane emit complete the rect?
+                        if (pix_done_q + 24'd1 >= pix_total_q) begin
+                            // End of transfer. Drop any remaining
+                            // unused lanes — for PSMCT32 the rect
+                            // size should be a multiple of 4 px,
+                            // for PSMCT16 a multiple of 8 px, for
+                            // PSMT8 a multiple of 16 px, for PSMT4
+                            // a multiple of 32 px (else the extra
+                            // trailing lanes within the last qword
+                            // are silently swallowed).
+                            // Return to IDLE on the same cycle the
+                            // last lane emits.
+                            state        <= S_IDLE;
+                            lane_valid_q <= 1'b0;
+                            lane_q       <= 5'd0;
+                        end
+                    end
+                end
+
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+endmodule : gif_image_xfer_stub
@@ -0,0 +1,428 @@
+// retroDE_ps2 — gif_packed_stub (Ch72 + Ch73)
+//
+// Real-format GIF parser. Sits in the same upstream/downstream slot
+// as gif_path_stub but accepts a real PS2 GIFtag in front of the
+// data, instead of the project-local single-qword register-write
+// format that gif_path_stub uses for Wave 2.
+//
+// Scope:
+//   - PACKED (FLG=0): NLOOP×NREG PACKED entries, one entry per qword.
+//     A+D (REGS nibble 0xE) emits a GS register write; other nibbles
+//     are traced EV_MODE no-ops. (Ch72.)
+//   - REGLIST (FLG=1): NLOOP×NREG REGLIST entries, two entries per
+//     qword (low 64 = entry #0, high 64 = entry #1). REGLIST data
+//     values are register values keyed by REGS nibbles in order. No
+//     GS write is generated for REGLIST in this stub: real REGLIST
+//     treats each nibble as the register *number* (not A+D), and we
+//     don't yet have a reg# → GS routing for it. Each payload qword
+//     is consumed and traced as one EV_MODE event. (Ch73.)
+//   - FLG=2 (IMAGE) and FLG=3 (DISABLE): payload is NLOOP qwords,
+//     consumed silently with EV_MODE traces, no GS effect. (Ch73 —
+//     prevents the "next data qword is mistaken for a new GIFtag"
+//     desync flagged in the Ch73 audit.)
+//   - NLOOP up to 15 bits, NREG up to 16 registers. PS2 docs: a
+//     NREG field of 0 means 16; we use a 5-bit effective count to
+//     represent 16 correctly (Ch73 audit-medium fix — was clamped
+//     to 4'd15, which mis-counted PACKED 16-reg packets by one).
+//   - EOP carries no behavioral difference here; it is latched and
+//     preserved as trace metadata.
+//
+// PACKED A+D data qword layout — selectable via REAL_AD_REG_MAP:
+//
+//   REAL_AD_REG_MAP=0 (default, project-local Ch72/Ch73 back-compat):
+//     bits[ 63:  0] = 64-bit register data
+//     bits[ 79: 64] = 16-bit project-local GS privileged offset
+//                     (drives gs_stub.reg_wr_*)
+//     bits[127: 80] = reserved
+//
+//   REAL_AD_REG_MAP=1 (real PS2 layout, Ch75):
+//     bits[ 63:  0] = 64-bit register data
+//     bits[ 71: 64] = 8-bit GIF A+D register number per PCSX2 GSRegs.h
+//                     (drives gs_stub.gif_reg_*; gs_stub owns the
+//                      decode into PRIM/RGBAQ/XYZF2/XYZ2/FRAME_1/ZBUF_1)
+//     bits[127: 72] = reserved
+//
+// The two namespaces are architecturally distinct. Do NOT add a
+// reg# → privileged-offset LUT here — that conflation is the Ch74
+// mistake Ch75 corrected. New GIF-context registers belong inside
+// gs_stub, keyed by reg#.
+//
+// in_ready = image_data_ready during a FLG=2 payload, else !raster_fifo_full.
+//
+// Trace schema:
+//   On tag accept:    EV_GIFTAG  arg0={flg,path_id} arg1={eop,flg,nreg,nloop_lo}
+//                                arg2=regs_64  arg3=0  flags={in_last,1}
+//   On PACKED data (A+D):  EV_WRITE   arg0=path_id arg1=regnib
+//                                     arg2={offset16}  arg3=data64
+//                                     flags={in_last,0}
+//   On PACKED data (other): EV_MODE  same layout, no GS write.
+//   On REGLIST entry:       EV_MODE  arg0=path_id arg1=regnib
+//                                    arg2=0  arg3=entry64
+//                                    flags={in_last,0}
+//   On IMAGE/DISABLE qword: EV_MODE  arg0={flg,path_id}
+//                                    arg2=0  arg3=in_data[63:0]
+//                                    flags={in_last,0}
+
+`timescale 1ns/1ps
+
+module gif_packed_stub
+    import trace_pkg::*;
+#(
+    parameter logic [3:0] PATH_ID = 4'd2,
+    // Ch75 (was Ch74, corrected): switch the PACKED A+D address-source
+    // from the project-local 16-bit-offset layout (default,
+    // bits[79:64]=gs_offset) to the real PS2 A+D layout where
+    // bits[71:64] is the 8-bit GS A+D register *number*. Per PCSX2's
+    // GSRegs.h, the GIF A+D register namespace is distinct from the
+    // GS privileged-MMIO offset namespace — Ch74's LUT mistakenly
+    // mapped one to the other. The corrected design hands the 8-bit
+    // reg# to gs_stub via its new gif_reg_* port and lets gs_stub
+    // own the GIF-context register file decode. When this parameter
+    // is 0, the legacy gs_wr_* port (16-bit privileged-style offset)
+    // is driven for back-compat with Ch72/Ch73 PACKED-A+D TBs and
+    // tb_bgcolor_via_dma.
+    parameter bit         REAL_AD_REG_MAP = 1'b0
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Upstream from DMAC
+    input  logic          in_valid,
+    input  logic [127:0]  in_data,
+    input  logic          in_last,
+    output logic          in_ready,
+
+    // Downstream — legacy 16-bit-offset port (REAL_AD_REG_MAP=0).
+    // Drives gs_stub's privileged-style reg_wr_* port.
+    output logic          gs_wr_en,
+    output logic [15:0]   gs_wr_addr,
+    output logic [63:0]   gs_wr_data,
+
+    // Ch110 — IMAGE-mode (FLG=2) data passthrough. `image_data_valid`
+    // pulses for one cycle on every accepted FLG=2 IMAGE qword (i.e.,
+    // state == S_IMAGE with flg_q == 2 and a qword is consumed).
+    // `image_data` is the raw 128-bit qword payload; `image_data_last`
+    // mirrors the upstream `in_last`. Downstream `gif_image_xfer_stub`
+    // captures the qword and writes the unpacked pixels into vram_stub
+    // at the BITBLTBUF/TRXPOS/TRXREG-described destination. NOT wired
+    // by TBs that don't model image transfers — leaving these outputs
+    // unconnected is fine (named-port instantiation).
+    output logic          image_data_valid,
+    output logic [127:0]  image_data,
+    output logic          image_data_last,
+    // Ch110 — backpressure from the IMAGE consumer. When state is
+    // S_IMAGE, in_ready is gated by image_data_ready so the upstream
+    // DMA stalls while gif_image_xfer_stub is busy emitting the
+    // previous qword's pixel writes. Outside an active FLG=2 payload
+    // this input has no effect on in_ready. TBs that don't model
+    // image transfers tie this to 1'b1 (the always-ready default).
+    input  logic          image_data_ready,
+    // Ch172 — backpressure from the raster command FIFO inside
+    // gs_stub. When the GIF is processing PACKED/REGLIST qwords
+    // and the raster FIFO is full, deassert in_ready so the DMAC
+    // pauses BEFORE the next qword is consumed (which might
+    // trigger prim_complete → push_drop). The stall point is the
+    // qword-acceptance handshake — once a qword is `accept`-ed,
+    // the parser fully processes it; we never have a "consumed
+    // but not committed" state. Ignored only while a FLG=2 IMAGE
+    // payload is active (tag fetch and FLG=3 DISABLE stall too).
+    // TBs without the raster FIFO tie this to 1'b0 (has space).
+    input  logic          raster_fifo_full,
+
+    // Ch75 — real-PS2 GIF A+D register-number port (REAL_AD_REG_MAP=1).
+    // Drives gs_stub's GIF-context gif_reg_* port. Only one of
+    // {gs_wr_*, gif_reg_*} is active per accept depending on the
+    // parameter.
+    output logic          gif_reg_wr_en,
+    output logic [7:0]    gif_reg_num,
+    output logic [63:0]   gif_reg_data,
+
+    // Trace
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // Ch73: state widened to cover REGLIST and IMAGE/DISABLE payloads
+    // so unsupported FLG packets don't desync onto the next qword.
+    typedef enum logic [1:0] {
+        S_TAG     = 2'd0,
+        S_PACKED  = 2'd1,
+        S_REGLIST = 2'd2,
+        S_IMAGE   = 2'd3   // also used for FLG=3 DISABLE
+    } state_e;
+    state_e state;
+
+    // Ch110 — while an actual FLG=2 IMAGE payload is in flight,
+    // in_ready is gated by image_data_ready (all other states
+    // follow the Ch172 policy below). S_IMAGE is also reused for
+    // FLG=3 DISABLE qwords (per Ch73 desync fix); those are opaque
+    // consume-only and must NOT route to gif_image_xfer_stub or
+    // apply image backpressure. flg_q (the running tag's FLG) is
+    // declared here so the declaration precedes its first use;
+    // gating on flg_q==2'd2 keeps DISABLE payloads silent.
+    logic [1:0] flg_q;
+    logic image_active;
+    assign image_active = (state == S_IMAGE) && (flg_q == 2'd2);
+    // Ch172 — three-tier in_ready policy:
+    //   1) S_IMAGE with FLG=2 → wait for gif_image_xfer_stub to be
+    //      ready for the next qword.
+    //   2) Otherwise (PACKED / REGLIST / TAG-fetch) → stall when
+    //      the downstream raster command FIFO is full so the
+    //      next register write can't trigger an unrecoverable
+    //      prim_complete push_drop. Over-stalling on PACKED qwords
+    //      that aren't going to cause a push is intentionally
+    //      conservative — it has no functional impact and keeps
+    //      the gate simple.
+    //   3) Default (sim TBs without the raster path) → always
+    //      ready (raster_fifo_full tied to 1'b0 in those cases).
+    assign in_ready = image_active ? image_data_ready
+                                   : !raster_fifo_full;
+
+    logic accept;
+    assign accept = in_valid && in_ready;
+
+    // Ch110 — IMAGE-mode data passthrough (combinational).
+    // Gated on image_active so FLG=3 DISABLE qwords are NOT
+    // forwarded to gif_image_xfer_stub.
+    assign image_data       = in_data;
+    assign image_data_last  = in_last;
+    assign image_data_valid = accept && image_active;
+
+    // Latched tag context incl. flg_q above (valid in S_PACKED/S_REGLIST/S_IMAGE)
+    logic [14:0] nloop_q;
+    logic        eop_q;
+    // Ch73: nreg_eff widened to 5 bits. NREG field is 4 bits (0..15);
+    // PS2 docs say a value of 0 means 16. Encoding 16 as 5'b10000
+    // lets reg_idx == nreg_eff_q correctly terminate a 16-register
+    // packet. (Old 4-bit clamp made NREG=0 consume only 15 entries.)
+    logic [4:0]  nreg_eff_q;
+    logic [63:0] regs_q;
+    logic [4:0]  reg_idx;
+
+    // Ch73: REGLIST and IMAGE/DISABLE consume opaque qwords. We track
+    // how many qwords are still left in the payload, computed at
+    // S_TAG entry. PACKED keeps its per-entry reg_idx scheme.
+    //   REGLIST count = ceil(NLOOP * NREG / 2)   (2 entries / qword)
+    //   IMAGE   count = NLOOP                    (1 qword / loop)
+    logic [19:0] payload_qwords_left;
+
+    // Combinational tag-field decode for the qword on the wire in S_TAG.
+    logic [14:0] tag_nloop;
+    logic        tag_eop;
+    logic [1:0]  tag_flg;
+    logic [3:0]  tag_nreg_field;
+    logic [4:0]  tag_nreg_eff;
+    logic [63:0] tag_regs;
+
+    assign tag_nloop      = in_data[14:0];
+    assign tag_eop        = in_data[15];
+    assign tag_flg        = in_data[59:58];
+    assign tag_nreg_field = in_data[63:60];
+    assign tag_nreg_eff   = (tag_nreg_field == 4'd0) ? 5'd16
+                                                     : {1'b0, tag_nreg_field};
+    assign tag_regs       = in_data[127:64];
+
+    // Ch73 audit-low: replace indexed bit-select with shift/mask. The
+    // big case statement triggered iverilog's "constant selects in
+    // always_*" "sorry" warnings repeatedly. Use concat-pad to form
+    // the shift amount (reg_idx * 4) without going through `*`, which
+    // iverilog truncates to operand width and would alias high
+    // reg_idx values back to small shifts (e.g., reg_idx=8 wrapping
+    // to shift=0 — the bug found in Ch73 bring-up).
+    logic [6:0] cur_regnib_shift;
+    logic [3:0] cur_regnib;
+    assign cur_regnib_shift = {reg_idx, 2'b00};                          // reg_idx*4 in 7 bits
+    assign cur_regnib       = (regs_q >> cur_regnib_shift) & 64'hF;
+
+    // ------------------------------------------------------------------
+    // FSM
+    // ------------------------------------------------------------------
+    logic packed_last_in_loop;
+    logic packet_loop_last;
+    assign packed_last_in_loop = (reg_idx + 5'd1 == nreg_eff_q);
+    assign packet_loop_last    = (nloop_q == 15'd1);
+
+    // Ch73: pre-compute REGLIST payload-qword count = ceil(NLOOP *
+    // NREG / 2). Done at S_TAG accept so the FSM only needs an
+    // opaque countdown afterwards.
+    logic [19:0] reglist_total_entries;
+    logic [19:0] reglist_total_qwords;
+    assign reglist_total_entries = tag_nloop * tag_nreg_eff;
+    assign reglist_total_qwords  = (reglist_total_entries + 20'd1) >> 1;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state                <= S_TAG;
+            nloop_q              <= 15'd0;
+            eop_q                <= 1'b0;
+            flg_q                <= 2'd0;
+            nreg_eff_q           <= 5'd0;
+            regs_q               <= 64'd0;
+            reg_idx              <= 5'd0;
+            payload_qwords_left  <= 20'd0;
+        end else if (accept) begin
+            unique case (state)
+                S_TAG: begin
+                    nloop_q              <= tag_nloop;
+                    eop_q                <= tag_eop;
+                    flg_q                <= tag_flg;
+                    nreg_eff_q           <= tag_nreg_eff;
+                    regs_q               <= tag_regs;
+                    reg_idx              <= 5'd0;
+                    payload_qwords_left  <= 20'd0;
+                    if (tag_nloop == 15'd0) begin
+                        state <= S_TAG;                                  // empty tag
+                    end else begin
+                        unique case (tag_flg)
+                            2'd0: state <= S_PACKED;
+                            2'd1: begin
+                                state               <= S_REGLIST;
+                                payload_qwords_left <= reglist_total_qwords;
+                            end
+                            default: begin
+                                state               <= S_IMAGE;          // FLG=2/3
+                                payload_qwords_left <= {5'd0, tag_nloop};
+                            end
+                        endcase
+                    end
+                end
+
+                S_PACKED: begin
+                    if (packed_last_in_loop) begin
+                        reg_idx <= 5'd0;
+                        if (packet_loop_last) state <= S_TAG;
+                        else                  nloop_q <= nloop_q - 15'd1;
+                    end else begin
+                        reg_idx <= reg_idx + 5'd1;
+                    end
+                end
+
+                S_REGLIST, S_IMAGE: begin
+                    // Both branches consume opaque qwords. Trace fires
+                    // per accept (see trace block below); decode of
+                    // individual REGLIST entries is left to a future
+                    // chapter once gs_stub gains the matching reg
+                    // surface. The point of Ch73 here is just: don't
+                    // desync onto the next GIFtag.
+                    if (payload_qwords_left == 20'd1) state <= S_TAG;
+                    else payload_qwords_left <= payload_qwords_left - 20'd1;
+                end
+
+                default: state <= S_TAG;
+            endcase
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // GS write — fires only on PACKED A+D data accepts. REGLIST
+    // entries don't generate GS writes here (real REGLIST treats each
+    // nibble as a register *number*, not A+D, and we don't have GS
+    // routing for that path yet). IMAGE/DISABLE never generates GS
+    // writes.
+    //
+    // Ch75: split into two output ports based on REAL_AD_REG_MAP.
+    // Only one fires per accept; the other stays low. gs_stub's
+    // privileged-side `reg_wr_*` and GIF-A+D-side `gif_reg_*` ports
+    // are architecturally distinct.
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            gs_wr_en      <= 1'b0;
+            gs_wr_addr    <= 16'd0;
+            gs_wr_data    <= 64'd0;
+            gif_reg_wr_en <= 1'b0;
+            gif_reg_num   <= 8'd0;
+            gif_reg_data  <= 64'd0;
+        end else if (accept && state == S_PACKED && cur_regnib == 4'hE) begin
+            if (REAL_AD_REG_MAP) begin
+                gs_wr_en      <= 1'b0;
+                gif_reg_wr_en <= 1'b1;
+                gif_reg_num   <= in_data[71:64];
+                gif_reg_data  <= in_data[63:0];
+            end else begin
+                gs_wr_en      <= 1'b1;
+                gs_wr_addr    <= in_data[79:64];
+                gs_wr_data    <= in_data[63:0];
+                gif_reg_wr_en <= 1'b0;
+            end
+        end else begin
+            gs_wr_en      <= 1'b0;
+            gif_reg_wr_en <= 1'b0;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= EV_GIFTAG;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (accept && state == S_TAG) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= EV_GIFTAG;
+            // arg0[3:0]=path_id, arg0[5:4]=flg → callers can grep by FLG
+            ev_arg0   <= {58'd0, tag_flg, PATH_ID};
+            // Compact tag summary: {eop[15], flg[14:13], nreg[12:9], nloop[8:0]}
+            ev_arg1   <= {48'd0, tag_eop, tag_flg, tag_nreg_field,
+                          tag_nloop[8:0]};
+            ev_arg2   <= tag_regs;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= {30'd0, in_last, 1'b1};                         // bit0=is_tag
+        end else if (accept && state == S_PACKED) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= (cur_regnib == 4'hE) ? EV_WRITE : EV_MODE;
+            ev_arg0   <= {60'd0, PATH_ID};
+            ev_arg1   <= {60'd0, cur_regnib};
+            ev_arg2   <= {48'd0, in_data[79:64]};
+            ev_arg3   <= in_data[63:0];
+            ev_flags  <= {30'd0, in_last, 1'b0};                         // bit0=is_data
+        end else if (accept && state == S_REGLIST) begin
+            // REGLIST payloads are consumed as opaque qwords, so exactly
+            // one EV_MODE event is emitted per accepted qword and no GS
+            // write is generated. Each qword carries two 64-bit entries
+            // (low half = bits[63:0], high half = bits[127:64]); only
+            // the low-half entry is reported, in arg3. The high-half
+            // entry's trace is omitted in this minimal Ch73 path.
+            // reg_idx is not advanced in S_REGLIST (it stays at the 0
+            // loaded on tag accept), so cur_regnib — and therefore
+            // arg1 — is always REGS nibble 0 here rather than the
+            // nibble of the entry being reported. Treat arg1 as a tag
+            // hint only, not as a per-entry register number.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= EV_MODE;
+            ev_arg0   <= {60'd0, PATH_ID};
+            ev_arg1   <= {60'd0, cur_regnib};
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= in_data[63:0];                                  // low-half entry data
+            ev_flags  <= {30'd0, in_last, 1'b0};
+        end else if (accept && state == S_IMAGE) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= EV_MODE;
+            ev_arg0   <= {58'd0, flg_q, PATH_ID};
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= in_data[63:0];
+            ev_flags  <= {30'd0, in_last, 1'b0};
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : gif_packed_stub
@@ -0,0 +1,128 @@
+// retroDE_ps2 — gif_path_stub
+//
+// Narrow GIF ingress stub for Wave 2. Accepts qword payloads from
+// dmac_reg_stub, interprets them as register-write packets (project-local
+// format — see below), and forwards register writes into gs_stub.
+//
+// This is NOT real GIFtag decode. Real GIFtag/PACKED/REGLIST/IMAGE formats
+// arrive in Wave 3. The wire format here is a project-local shortcut sized
+// to the "programmable BGCOLOR via DMA/GIF" target in
+// docs/wave2_dma_gif_plan.md.
+//
+// Project-local Wave 2 packet format (per qword):
+//   bits [ 15:  0]   = target register offset within GS privileged block
+//                      (e.g., 0x00E0 for BGCOLOR)
+//   bits [ 79: 16]   = 64-bit register value (low 24 bits are RGB for BGCOLOR)
+//   bits [127: 80]   = reserved, must be zero
+//
+// Each qword produced by the DMAC is treated as one standalone register
+// write. This module is stateless with respect to packet framing —
+// multi-beat transfers (Wave 2.6 onward) work transparently because every
+// accepted qword is independently decoded. `in_last` is preserved as
+// trace-visible metadata in ev_flags[0] but does not gate decode. Real
+// GIFtag/PACKED/REGLIST/IMAGE format decode, along with tag-phase vs.
+// data-phase state, is deferred to Wave 3.
+//
+// PATH selection is hard-scoped to the DMAC channel-2 path (PATH id 2)
+// since no arbitration exists yet.
+//
+// Trace payload schema:
+//   GIF GIFTAG  arg0=path_id arg1=packet_type arg2=reg_offset arg3=payload_lo
+
+`timescale 1ns/1ps
+
+module gif_path_stub
+    import trace_pkg::*;
+#(
+    parameter logic [3:0] PATH_ID = 4'd2
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Qword stream from the DMAC (valid/ready handshake)
+    input  logic          in_valid,
+    input  logic [127:0]  in_data,
+    input  logic          in_last,
+    output logic          in_ready,
+
+    // Register-write pulse towards gs_stub
+    output logic          gs_wr_en,
+    output logic [15:0]   gs_wr_addr,
+    output logic [63:0]   gs_wr_data,
+
+    // Trace event port
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // Tag value reported in ev_arg1. Wave 2 has a single packet kind
+    // (the project-local register write); later waves add real GIFtags.
+    localparam logic [15:0] PKT_TYPE_REG_WRITE = 16'hA01A;
+
+    // Handshake and field slices of the incoming qword.
+    logic        beat_taken;   // qword transferred this cycle
+    logic [15:0] pkt_offset;   // in_data[15:0]
+    logic [63:0] pkt_value;    // in_data[79:16]
+
+    // The Wave 2 sink never stalls the DMAC, so in_ready is constant.
+    assign in_ready   = 1'b1;
+    assign beat_taken = in_valid && in_ready;
+    assign pkt_offset = in_data[15:0];
+    assign pkt_value  = in_data[79:16];
+
+    // ------------------------------------------------------------------
+    // GS register write: gs_wr_en is a registered copy of the handshake,
+    // so it pulses one cycle after each accepted qword. Address and data
+    // are captured on accept and hold their value between pulses.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            gs_wr_addr <= 16'd0;
+            gs_wr_data <= 64'd0;
+            gs_wr_en   <= 1'b0;
+        end else begin
+            if (beat_taken) begin
+                gs_wr_addr <= pkt_offset;
+                gs_wr_data <= pkt_value;
+            end
+            gs_wr_en <= beat_taken;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace: one EV_GIFTAG event per accepted qword. On idle cycles only
+    // ev_valid is cleared; the payload registers keep their last value.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= EV_GIFTAG;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (beat_taken) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_GIF;
+            ev_event  <= EV_GIFTAG;
+            ev_arg0   <= {{60{1'b0}}, PATH_ID};
+            ev_arg1   <= {{48{1'b0}}, PKT_TYPE_REG_WRITE};
+            ev_arg2   <= {{48{1'b0}}, pkt_offset};
+            ev_arg3   <= {32'd0, pkt_value[31:0]};          // low 32 bits only
+            ev_flags  <= {{31{1'b0}}, in_last};             // bit0 = in_last
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : gif_path_stub
@@ -0,0 +1,157 @@
+// retroDE_ps2 — gs_alpha_blend
+//
+// Brick 2a — GS ALPHA blending (transparency), the source-over case.
+//
+// Computes, per RGB channel:
+//   Cv = (((Cs - Cd) * As) >> 7) + Cd   (clamped to [0,255])
+// where
+//   Cs = source color  (the sprite's RGBAQ color channel)
+//   Cd = destination   (the framebuffer pixel READ back at the write addr)
+//   As = source alpha  (RGBAQ.A; PS2 0..128 scale where 0x80 == 1.0)
+//
+// This is the standard PS2 GS ALPHA register config A=0(Cs) B=1(Cd)
+// C=0(As) D=1(Cd) — i.e. the canonical alpha-over blend. With the
+// default ALPHA_MODES=0 this unit implements ONLY this config; selecting
+// any other (A,B,C,D) tuple is handled by the caller (gs_stub) which
+// falls back to an opaque write for unsupported configs, so this unit
+// is always asked for the source-over result. With ALPHA_MODES=1 the
+// generic A/B/C/D selector datapath (brick-2c) is elaborated instead.
+//
+// Purely combinational: a subtract, a multiply by As (<= 8 bits), an
+// arithmetic shift right by 7, an add, and a clamp. No divide. Fully
+// synthesizable — there is NO `// synthesis translate_off` on this
+// path. The >>7 is a wire shift; the *As is a single small multiply.
+//
+// The (Cs - Cd) term is signed (can be negative when the dest is
+// brighter than the source), so the multiply and the shift are done
+// in signed arithmetic and the final sum is clamped back into the
+// unsigned [0,255] byte range. As is treated as unsigned 0..128; the
+// caller passes RGBAQ.A[7:0] (real GS uses A[6:0]*2 internally for
+// the 0..128 mapping, but A[7:0] already encodes 0x80=1.0 for the
+// values our demo programs, and clamping As at 128 keeps a stray
+// A>0x80 from over-shooting).
+//
+// Alpha (the A channel of the output) follows real-GS behavior for a
+// framebuffer write: the SOURCE alpha is written through. brick-2a
+// keeps the existing emit-lane A byte (= source A) unchanged; only
+// R/G/B are blended. The 'a_out' port forwards the source A so the
+// caller can repack the 32-bit ABGR.
+
+`timescale 1ns/1ps
+
+module gs_alpha_blend #(
+    // Brick-2c switch between the two datapaths elaborated at the bottom
+    // of this module:
+    //   0 (default) — fixed source-over blend; the selector / ad / fix
+    //                 inputs do not affect any output.
+    //   1           — generic GS ALPHA datapath driven by the A/B/C/D
+    //                 selectors and FIX.
+    parameter bit ALPHA_MODES = 1'b0
+) (
+    // Source (sprite) colour, one byte per channel.
+    input  logic [7:0] cs_r,
+    input  logic [7:0] cs_g,
+    input  logic [7:0] cs_b,
+    // Source alpha on the 0..128 scale (0x80 = 1.0). Values above 0x80 are
+    // clamped to 0x80 before being used as a blend coefficient.
+    input  logic [7:0] as,
+
+    // Destination (framebuffer) colour, one byte per channel.
+    input  logic [7:0] cd_r,
+    input  logic [7:0] cd_g,
+    input  logic [7:0] cd_b,
+
+    // Generic-blend controls (only influence the outputs when ALPHA_MODES=1).
+    //   a_sel / b_sel / d_sel : 0 = Cs, 1 = Cd, any other value = 0
+    //   c_sel                 : 0 = As (clamped), 1 = Ad, any other value = FIX
+    //   ad                    : destination alpha, used when c_sel == 1
+    //   fix                   : fixed coefficient, used when c_sel is 2 or 3
+    input  logic [1:0] a_sel,
+    input  logic [1:0] b_sel,
+    input  logic [1:0] c_sel,
+    input  logic [1:0] d_sel,
+    input  logic [7:0] ad,
+    input  logic [7:0] fix,
+
+    // Blended colour out, plus the source alpha forwarded untouched.
+    output logic [7:0] cv_r,
+    output logic [7:0] cv_g,
+    output logic [7:0] cv_b,
+    output logic [7:0] a_out
+);
+
+    // Source alpha limited to 1.0 (0x80).
+    logic [7:0] as_eff;
+    assign as_eff = (as <= 8'd128) ? as : 8'd128;
+
+    // ------------------------------------------------------------------
+    // Source-over blend of a single colour channel:
+    //   clamp( (((cs - cd) * alpha) >>> 7) + cd )  into [0,255].
+    // All intermediate values are held in signed containers wide enough
+    // that nothing is truncated before the final clamp.
+    // ------------------------------------------------------------------
+    function automatic logic [7:0] blend_ch(input logic [7:0] cs,
+                                            input logic [7:0] cd,
+                                            input logic [7:0] alpha);
+        logic signed [9:0]  src_w;    // cs, zero-extended
+        logic signed [9:0]  dst_w;    // cd, zero-extended
+        logic signed [8:0]  gain;     // alpha, zero-extended (never negative)
+        logic signed [9:0]  delta;    // cs - cd, range -255..+255
+        logic signed [17:0] scaled;   // delta * gain
+        logic signed [17:0] mixed;    // (scaled >>> 7) + cd
+        src_w  = {2'b00, cs};
+        dst_w  = {2'b00, cd};
+        gain   = {1'b0, alpha};
+        delta  = src_w - dst_w;
+        scaled = delta * gain;
+        mixed  = (scaled >>> 7) + dst_w;      // >>> is arithmetic: all operands are signed
+        return (mixed < 18'sd0)   ? 8'd0   :
+               (mixed > 18'sd255) ? 8'd255 : mixed[7:0];
+    endfunction
+
+    // ------------------------------------------------------------------
+    // Generic (brick-2c) blend helpers.
+    // ------------------------------------------------------------------
+
+    // Operand mux shared by the A, B and D terms: 0 -> cs, 1 -> cd, else 0.
+    function automatic logic [7:0] pick_operand(input logic [1:0] sel,
+                                                input logic [7:0] cs,
+                                                input logic [7:0] cd);
+        return (sel == 2'd0) ? cs : (sel == 2'd1) ? cd : 8'd0;
+    endfunction
+
+    // One channel of  clamp( (((A - B) * coef) >>> 7) + D ).
+    // coef is an unsigned 8-bit coefficient where 0x80 represents 1.0.
+    function automatic logic [7:0] blend_generic(
+        input logic [7:0] cs, input logic [7:0] cd,
+        input logic [1:0] asel, input logic [1:0] bsel,
+        input logic [1:0] dsel, input logic [7:0] coef);
+        logic [7:0]         term_a;
+        logic [7:0]         term_b;
+        logic [7:0]         term_d;
+        logic signed [31:0] delta;    // A - B
+        logic signed [31:0] scaled;   // delta * coef
+        logic signed [31:0] mixed;    // (scaled >>> 7) + D
+        term_a = pick_operand(asel, cs, cd);
+        term_b = pick_operand(bsel, cs, cd);
+        term_d = pick_operand(dsel, cs, cd);
+        delta  = $signed({24'd0, term_a}) - $signed({24'd0, term_b});
+        scaled = delta * $signed({24'd0, coef});
+        mixed  = (scaled >>> 7) + $signed({24'd0, term_d});
+        return (mixed < 32'sd0)   ? 8'd0   :
+               (mixed > 32'sd255) ? 8'd255 : mixed[7:0];
+    endfunction
+
+    // C coefficient, common to the three colour channels.
+    logic [7:0] coef_c;
+    assign coef_c = (c_sel == 2'd0) ? as_eff
+                  : (c_sel == 2'd1) ? ad
+                  :                   fix;
+
+    // Exactly one of the two datapaths is elaborated.
+    generate
+        if (ALPHA_MODES) begin : g_generic
+            // Generic A/B/C/D blend.
+            assign cv_r = blend_generic(cs_r, cd_r, a_sel, b_sel, d_sel, coef_c);
+            assign cv_g = blend_generic(cs_g, cd_g, a_sel, b_sel, d_sel, coef_c);
+            assign cv_b = blend_generic(cs_b, cd_b, a_sel, b_sel, d_sel, coef_c);
+        end else begin : g_source_over
+            // Fixed source-over blend using the clamped source alpha.
+            assign cv_r = blend_ch(cs_r, cd_r, as_eff);
+            assign cv_g = blend_ch(cs_g, cd_g, as_eff);
+            assign cv_b = blend_ch(cs_b, cd_b, as_eff);
+        end
+    endgenerate
+
+    // The alpha byte is not blended: the raw (unclamped) source alpha is forwarded.
+    assign a_out = as;
+
+endmodule : gs_alpha_blend
@@ -0,0 +1,89 @@
+// retroDE_ps2 — gs_async_fifo (Ch318)
+//
+// Generic dual-clock (asynchronous) FIFO with gray-code pointers and 2-FF pointer
+// synchronizers — the standard CDC-safe ring buffer. Used by gs_lpddr_axi_master to
+// cross 256-bit framebuffer-row packets {addr,data,strb} from the GS clock domain to
+// the f2sdram (LPDDR AXI) clock domain. Both domains are treated as GENUINELY async
+// even when nominally the same frequency (GS = PLL design_clk; f2sdram = raw board
+// clock), per the Ch318 directive.
+//
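+// DEPTH must be a power of two and at least 4 (the full compare part-selects pointer
+// bits [AW-2:0], so AW = $clog2(DEPTH) must be >= 2). `wr`/`rd` are single-cycle
+// handshakes gated by !full / !empty. Standard caveats: do NOT assert wr when full or rd when empty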
+// (the wrapper gates both). One-deep gray pointers, single 2-FF synchronizer each
+// way — adequate for the modest packet rate (one 32-byte beat per 16 flushed pixels).
+
+module gs_async_fifo #(
+    parameter int WIDTH = 320,          // {addr[31:0], data[255:0], strb[31:0]}
+    parameter int DEPTH = 16            // power of two, >= 4 (wfull_nxt part-selects pointer bits [AW-2:0])
+) (
+    // write domain
+    input  logic              wclk,
+    input  logic              wrst_n,   // asynchronous, active-low reset of the write-side pointers/flags
+    input  logic              wr,       // write request; has no effect while wfull is high
+    input  logic [WIDTH-1:0]  wdata,
+    output logic              wfull,    // registered full flag (wclk domain)
+    // read domain
+    input  logic              rclk,
+    input  logic              rrst_n,   // asynchronous, active-low reset of the read-side pointers
+    input  logic              rd,       // read (pop) request; has no effect while rempty is high
+    output logic [WIDTH-1:0]  rdata,    // combinational view of the entry at the current read pointer
+    output logic              rempty    // combinational empty flag (rclk domain)
+);
+    // Address width. Pointers below are AW+1 bits wide (one wrap bit on top of the address).
+    localparam int AW = $clog2(DEPTH);
+
+    // Storage array: written in wclk, read combinationally via rbin. It has no reset.
+    logic [WIDTH-1:0] mem [0:DEPTH-1];
+
+    // ---- binary + gray pointers (one extra MSB for full/empty disambiguation) ----
+    logic [AW:0] wbin, wgray, wbin_nxt, wgray_nxt;
+    logic        wfull_nxt;            // Ch352 — combinational next-value for the now-REGISTERED wfull
+    logic [AW:0] rbin, rgray, rbin_nxt, rgray_nxt;
+
+    // synchronized opposite-domain gray pointers (2-FF)
+    logic [AW:0] rgray_s1, rgray_s2;    // read gray -> write domain
+    logic [AW:0] wgray_s1, wgray_s2;    // write gray -> read domain
+
+    // Binary -> gray conversion of an (AW+1)-bit pointer.
+    function automatic logic [AW:0] bin2gray(input logic [AW:0] b);
+        bin2gray = b ^ (b >> 1);
+    endfunction
+
+    // ---------------- write domain ----------------
+    // The write pointer advances by one only for an accepted write (wr while not full).
+    assign wbin_nxt  = wbin + (wr && !wfull);
+    assign wgray_nxt = bin2gray(wbin_nxt);
+    // full: next write gray == read gray with top two bits inverted. Ch352 — wfull is now a REGISTERED flag
+    // (Cummings canonical). The previous `assign wfull = (wgray_nxt == ...)` was combinational, and since
+    // wgray_nxt <- wbin_nxt <- wfull, it formed a wbin_nxt->wgray_nxt->wfull->wbin_nxt COMBINATIONAL LOOP that
+    // Quartus reports and that made Place churn. Registering it breaks the loop with no overflow-behavior change:
+    // wfull still asserts the cycle after the filling write (full is computed from wgray_nxt = the pointer AFTER
+    // the current write), so the (DEPTH+1)th write is still blocked. rempty is intentionally left unchanged.
+    // NOTE: the [AW-2:0] part-select below is only a valid range when AW >= 2, i.e. DEPTH >= 4.
+    assign wfull_nxt = (wgray_nxt == {~rgray_s2[AW:AW-1], rgray_s2[AW-2:0]});
+    always_ff @(posedge wclk or negedge wrst_n) begin
+        if (!wrst_n) begin
+            wbin <= '0; wgray <= '0; wfull <= 1'b0;
+            rgray_s1 <= '0; rgray_s2 <= '0;
+        end else begin
+            wbin  <= wbin_nxt;
+            wgray <= wgray_nxt;
+            wfull <= wfull_nxt;
+            rgray_s1 <= rgray;          // sync read gray into write domain
+            rgray_s2 <= rgray_s1;
+        end
+    end
+    // Memory write port: stores wdata at the current write address on an accepted write only.
+    always_ff @(posedge wclk) if (wr && !wfull) mem[wbin[AW-1:0]] <= wdata;
+
+    // ---------------- read domain ----------------
+    // The read pointer advances by one only for an accepted read (rd while not empty).
+    assign rbin_nxt  = rbin + (rd && !rempty);
+    assign rgray_nxt = bin2gray(rbin_nxt);
+    always_ff @(posedge rclk or negedge rrst_n) begin
+        if (!rrst_n) begin
+            rbin <= '0; rgray <= '0;
+            wgray_s1 <= '0; wgray_s2 <= '0;
+        end else begin
+            rbin  <= rbin_nxt;
+            rgray <= rgray_nxt;
+            wgray_s1 <= wgray;          // sync write gray into read domain
+            wgray_s2 <= wgray_s1;
+        end
+    end
+    // rdata always shows the entry at the current read address, so it is available in the
+    // same cycle rd is asserted. rempty compares the CURRENT read gray pointer (not the next
+    // one) with the synchronized write gray pointer, so it does not depend on rd.
+    assign rdata  = mem[rbin[AW-1:0]];
+    assign rempty = (rgray == wgray_s2);
+endmodule : gs_async_fifo
@@ -0,0 +1,88 @@
+// ============================================================================
+// gs_grad_divider.sv  (Ch352 — sequential signed divider for the triangle-setup gradient solve)
+//
+// Replaces the single combinational `grad_num_q[grad_step] / grad_det_q` in gs_stub. That combinational
+// divider is a ~6700-cell, ~100ns cone at the 25MHz design clock — the worst setup path, and (the real lesson)
+// it CANNOT be covered by any SDC timing exception: both a multicycle and a false_path made the Quartus fitter
+// grind on its cone indefinitely (Place stuck <1% for hours). A sequential divider has REGISTERED iterations and
+// no combinational cone, so every internal path is an ordinary single-cycle path that closes timing normally —
+// no exception needed, no grind.
+//
+// BIT-EXACT to SystemVerilog signed `/`:
+//   * truncation toward zero (divide magnitudes, then apply the XOR-of-signs);
+//   * den == 0 -> quotient 0 (matches the gs_stub `if (grad_det_q==0) grad_quo=0` guard).
+// Restoring division of the W-bit magnitudes (W iterations), one iteration per clock.
+//
+// Handshake: pulse `start` with num/den stable -> `busy` high for the solve -> `done` pulses for one cycle
+// with `quo` valid (and stays valid until the next start). The gs_stub gradient FSM waits on `done`.
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_grad_divider #(
+    parameter int W = 56                     // operand / quotient width in bits
+)(
+    input  logic                 clk,
+    input  logic                 rst_n,       // asynchronous reset, active low
+    input  logic                 start,       // request a divide; accepted only while !busy
+    input  logic signed [W-1:0]  num,         // dividend, captured on an accepted start
+    input  logic signed [W-1:0]  den,         // divisor,  captured on an accepted start
+    output logic signed [W-1:0]  quo,         // quotient truncated toward zero; 0 when den == 0
+    output logic                 busy,        // high while the iterations are running
+    output logic                 done         // one-cycle pulse in the cycle quo has been updated
+);
+    // Width of the iteration down-counter (it is loaded with W).
+    localparam int CW = $clog2(W+1);
+
+    // Two's-complement magnitude of v. The most-negative input maps to
+    // 2^(W-1), which is representable in the unsigned W-bit result.
+    function automatic logic [W-1:0] absval(input logic signed [W-1:0] v);
+        return v[W-1] ? (~v + 1'b1) : v;
+    endfunction
+
+    // ---- state ----
+    logic [W:0]    rem;        // running remainder (one guard bit for the compare/subtract)
+    logic [W-1:0]  qbuild;     // starts as |num|; dividend bits leave at the top, quotient bits enter at the bottom
+    logic [W-1:0]  den_mag;    // |den|
+    logic          qsign;      // sign of the result: sign(num) ^ sign(den)
+    logic [CW-1:0] iter;       // iterations still to run
+    logic          run;        // iterations in progress
+
+    // ---- one restoring-division step (combinational) ----
+    logic [W:0]    rem_sh;     // remainder shifted left with the next dividend bit appended
+    logic          sub_ok;     // |den| fits into rem_sh -> this quotient bit is 1
+    logic [W:0]    rem_nxt;    // remainder after the conditional subtract
+    logic [W-1:0]  q_shift;    // qbuild after shifting in this step's quotient bit
+    logic          launch;     // a start that is actually accepted
+    logic          last_step;  // the step being executed is the final one
+
+    assign rem_sh    = {rem[W-1:0], qbuild[W-1]};
+    assign sub_ok    = (rem_sh >= {1'b0, den_mag});
+    assign rem_nxt   = sub_ok ? (rem_sh - {1'b0, den_mag}) : rem_sh;
+    assign q_shift   = {qbuild[W-2:0], sub_ok};
+    assign launch    = start && !busy;
+    assign last_step = (iter == CW'(1));
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            rem     <= '0;
+            qbuild  <= '0;
+            den_mag <= '0;
+            qsign   <= 1'b0;
+            iter    <= '0;
+            run     <= 1'b0;
+            busy    <= 1'b0;
+            done    <= 1'b0;
+            quo     <= '0;
+        end else begin
+            // done is a pulse: cleared every cycle unless a branch below sets it.
+            done <= 1'b0;
+
+            if (launch && (den == '0)) begin
+                // Divide-by-zero shortcut: report 0 on the next cycle without iterating.
+                quo  <= '0;
+                done <= 1'b1;
+                busy <= 1'b0;
+                run  <= 1'b0;
+            end else if (launch) begin
+                // Capture magnitudes and the result sign, then run W iterations.
+                rem     <= '0;
+                qbuild  <= absval(num);
+                den_mag <= absval(den);
+                qsign   <= num[W-1] ^ den[W-1];
+                iter    <= CW'(W);
+                run     <= 1'b1;
+                busy    <= 1'b1;
+            end else if (run) begin
+                // One restoring step per clock.
+                rem    <= rem_nxt;
+                qbuild <= q_shift;
+                iter   <= iter - 1'b1;
+                if (last_step) begin
+                    // q_shift is the complete magnitude quotient: apply the sign and finish.
+                    run  <= 1'b0;
+                    busy <= 1'b0;
+                    done <= 1'b1;
+                    quo  <= qsign ? (~q_shift + 1'b1) : q_shift;
+                end
+            end
+        end
+    end
+endmodule
@@ -0,0 +1,248 @@
+// retroDE_ps2 — gs_lpddr_axi_master (Ch318)
+//
+// HARDWARE-facing wrapper that takes the PSMCT16 tile-FLUSH pixel stream (GS clock)
+// and writes it to real LPDDR over the qsys f2sdram AXI4 port (f2sdram clock). It
+// does NOT modify the proven gs_lpddr_fb_writer (the Ch317 sim model) — it is a
+// sibling hardware path with the same input stream.
+//
+// Pipeline (per the Ch318 directive):
+//   GS clock  : PACKER — accumulate 16 PSMCT16 pixels of a tile-row into one 256-bit
+//               (32-byte) beat {addr, data, strb}. A tile-row is exactly 16 px on a
+//               32-byte-aligned line, so a beat completes naturally on its 16th px
+//               (no dangling partial beat). On completion, push to the async FIFO.
+//   async FIFO: gray-code CDC, carries {addr[31:0], data[255:0], strb[31:0]} (320b).
+//   f2sdram   : AXI burst FSM — pop a beat and issue a single-beat INCR write
+//               (AWSIZE=5 = 32 B, AWLEN=0, AWBURST=INCR, full per-byte WSTRB, never
+//               crossing a 4 KiB boundary since each beat is one 32-byte line). AW
+//               then W then B, all with backpressure (await ready/valid).
+//
+// Address: awaddr = FB_BASE + packet_addr (packet_addr is the FB-relative byte addr
+// from raster_pixel_fb_addr_q). FB_BASE must point at a LINUX-SAFE reserved LPDDR
+// region before any board run — the qsys aperture proves fabric CAN address SDRAM,
+// not which physical range is safe to scribble on (Ch318 board gate).
+//
+// Counters (TB/status readable): beats, bursts and bresp_err are f2sdram-domain
+// registers; fifo overflow is counted by the packer in the GS clock domain.
+// done-ish = idle (FSM idle && fifo empty). enable=0 → fully inert.
+
+module gs_lpddr_axi_master #(
+    parameter int  FIFO_DEPTH = 16        // forwarded to gs_async_fifo as its DEPTH
+) (
+    // GS clock domain — flush pixel stream
+    input  logic        gs_clk,
+    input  logic        gs_rst_n,
+    // enable: qualifies pixel acceptance in the packer (together with the synced arm).
+    //   It is not referenced by the AXI FSM, which keeps draining whatever is already queued.
+    input  logic        enable,
+    // ---- RUNTIME controls (driven by the HPS bridge register, axi_clk domain) ----
+    // arm: HARD SAFETY GATE — no AXI write can issue unless high. Defaults LOW at the
+    //   bridge register, so the booted core is inert until the HPS explicitly arms it.
+    //   Synced into gs_clk for the packer; used directly in the axi_clk FSM.
+    input  logic        arm,
+    // canary: when high, write ONLY the offset-0 beat (the 32-byte top-of-frame line)
+    //   and discard all others — a deterministic, blast-radius-limited first test.
+    input  logic        canary,
+    // fb_base: LPDDR byte base address for the framebuffer (e.g. 0x8000_0000). awaddr
+    //   = fb_base + frame-relative offset. Runtime so a wrong base is re-targetable
+    //   without a rebuild.
+    input  logic [31:0] fb_base,
+    // Ch352 CDC (Codex) — {arm,canary,fb_base} arrive RAW from the HPS bridge (CLOCK2_50), NOT axi_clk as the
+    // legacy comment above implies. ctrl_commit is a TOGGLE the bridge flips on any control write; we sync it
+    // into axi_clk and latch the controls on its edge, so the multi-bit fb_base crosses COHERENTLY (the CDC
+    // lives here, at the receiving boundary, so no caller can supply raw controls into the AW path).
+    input  logic        ctrl_commit,
+    // px_emit qualifies px_addr / px_pix16 for the current gs_clk cycle.
+    input  logic        px_emit,
+    input  logic [31:0] px_addr,      // FB-relative byte address (raster_pixel_fb_addr_q)
+    input  logic [15:0] px_pix16,
+
+    // f2sdram (LPDDR AXI) clock domain
+    input  logic        axi_clk,
+    input  logic        axi_rst_n,
+    // AXI4 write-address
+    output logic [31:0] awaddr,
+    output logic [7:0]  awlen,
+    output logic [2:0]  awsize,
+    output logic [1:0]  awburst,
+    output logic [4:0]  awid,
+    output logic        awvalid,
+    input  logic        awready,
+    // AXI4 write-data
+    output logic [255:0] wdata,
+    output logic [31:0]  wstrb,
+    output logic         wlast,
+    output logic         wvalid,
+    input  logic         wready,
+    // AXI4 write-response
+    input  logic        bvalid,
+    output logic        bready,
+    input  logic [1:0]  bresp,
+
+    // status / counters (axi domain, EXCEPT fifo_overflow_count, which is a gs_clk-domain
+    // register updated by the packer and reset by gs_rst_n)
+    output logic [31:0] beats_written,
+    output logic [31:0] bursts_issued,
+    output logic [31:0] bresp_err_count,
+    output logic [31:0] fifo_overflow_count,
+    output logic        idle
+);
+    localparam int PW = 320;   // {addr[31:0], data[255:0], strb[31:0]}
+
+    // ============================ GS-clock PACKER ============================
+    // cur_addr/cur_data/cur_strb hold the beat currently being assembled; has_data marks them valid.
+    // fifo_wr/fifo_wdata are registered, so a completed beat is pushed the cycle after it is formed.
+    logic [31:0]  cur_addr;
+    logic [255:0] cur_data;
+    logic [31:0]  cur_strb;
+    logic         has_data;
+    logic         fifo_wr;
+    logic [PW-1:0] fifo_wdata;
+    logic          fifo_wfull;
+
+    // Ch352 — axi_clk control snapshot: sync the bridge commit toggle and latch {arm,canary,fb_base} on its
+    // edge. Init to the bridge's SAFE defaults (arm=0, canary=1, fb_base=0x8000_0000) so the booted core is
+    // inert until the HPS arms it, even before the first commit. All axi_clk uses + the gs_clk arm-sync read
+    // these coherent latched copies instead of the raw bridge buses.
+    logic [2:0]  commit_sync;
+    logic        arm_axi, canary_axi;
+    logic [31:0] fb_base_axi;
+    always_ff @(posedge axi_clk or negedge axi_rst_n) begin
+        if (!axi_rst_n) begin
+            commit_sync <= 3'd0; arm_axi <= 1'b0; canary_axi <= 1'b1; fb_base_axi <= 32'h8000_0000;
+        end else begin
+            // commit_sync[1:0] are the two synchronizer stages; [2] is the previous synced value,
+            // kept only so a change of the toggle can be detected.
+            commit_sync <= {commit_sync[1:0], ctrl_commit};
+            if (commit_sync[2] != commit_sync[1]) begin  // commit edge: bridge buses are stable, latch them
+                arm_axi     <= arm;
+                canary_axi  <= canary;
+                fb_base_axi <= fb_base;
+            end
+        end
+    end
+
+    // High for the one cycle the snapshot updates. Admission is blocked then so the FSM never consumes a beat
+    // straddling a config change (old base/arm on the pop cycle, new on the next).
+    wire commit_edge = (commit_sync[2] != commit_sync[1]);
+
+    // arm crosses from axi_clk into gs_clk — 2-FF synchronizer (from the COHERENT latched arm).
+    logic arm_s1, arm_gs;
+    always_ff @(posedge gs_clk or negedge gs_rst_n) begin
+        if (!gs_rst_n) begin arm_s1 <= 1'b0;    arm_gs <= 1'b0; end
+        else          begin arm_s1 <= arm_axi;  arm_gs <= arm_s1; end
+    end
+
+    // Packer: merges incoming 16-bit pixels into a 32-byte beat keyed by the 32-byte-aligned
+    // line address. A beat is pushed to the FIFO in exactly two situations:
+    //   1. a pixel for a DIFFERENT line arrives while a partial beat is held (the partial
+    //      beat is pushed with only its filled lanes strobed, and a new beat is started), or
+    //   2. the pixel being merged sets the last missing byte strobe (all 32 strobes high).
+    // Nothing else flushes a partial beat, so a trailing partial beat stays held until a
+    // later pixel triggers one of the two cases above.
+    always_ff @(posedge gs_clk or negedge gs_rst_n) begin
+        if (!gs_rst_n) begin
+            cur_addr <= '0; cur_data <= '0; cur_strb <= '0; has_data <= 1'b0;
+            fifo_wr  <= 1'b0; fifo_wdata <= '0; fifo_overflow_count <= '0;
+        end else begin
+            fifo_wr <= 1'b0;                         // default: push request is a one-cycle pulse
+            if (enable && arm_gs && px_emit) begin   // gate: no accumulation until armed
+                logic [31:0]  abeat;
+                logic [3:0]   lane;        // 0..15 (which 16-bit lane)
+                logic [255:0] nd;
+                logic [31:0]  ns;
+                abeat = {px_addr[31:5], 5'd0};       // 32-byte-aligned line address
+                lane  = px_addr[4:1];                // px_addr[0] is ignored
+                if (has_data && (abeat != cur_addr)) begin
+                    // line changed before the previous beat filled — flush it, restart
+                    fifo_wdata <= {cur_addr, cur_data, cur_strb};
+                    fifo_wr    <= 1'b1;
+                    cur_addr   <= abeat;
+                    cur_data   <= (256'(px_pix16) << ({28'd0, lane} * 16));
+                    cur_strb   <= (32'd3 << ({28'd0, lane} * 2));
+                    has_data   <= 1'b1;
+                end else begin
+                    // Same line (or nothing held yet): merge this pixel into the working beat.
+                    // A pixel landing on an already-filled lane overwrites that lane's data.
+                    nd = has_data ? cur_data : 256'd0;
+                    ns = has_data ? cur_strb : 32'd0;
+                    nd[ ({28'd0, lane} * 16) +: 16 ] = px_pix16;
+                    ns[ ({28'd0, lane} * 2)  +: 2  ] = 2'b11;
+                    if (&ns) begin
+                        // beat complete (all 16 lanes) — flush, beat consumed
+                        fifo_wdata <= {abeat, nd, ns};
+                        fifo_wr    <= 1'b1;
+                        has_data   <= 1'b0;
+                    end else begin
+                        cur_addr <= abeat;
+                        cur_data <= nd;
+                        cur_strb <= ns;
+                        has_data <= 1'b1;
+                    end
+                end
+            end
+            // overflow witness: a push attempt while the FIFO is full (must stay 0)
+            // (the FIFO's wr port is gated with !fifo_wfull below, so such a beat is dropped)
+            if (fifo_wr && fifo_wfull)
+                fifo_overflow_count <= fifo_overflow_count + 32'd1;
+        end
+    end
+
+    // ============================ async FIFO (CDC) ============================
+    logic [PW-1:0] fifo_rdata;
+    logic          fifo_rempty;
+    logic          fifo_rd;
+    // Ch323 — reset BOTH FIFO pointers from the STABLE axi_rst_n (assert async, deassert
+    // synced into gs_clk). gs_rst_n (= core reset) toggles on every CORE_CTRL re-render; if
+    // the write pointer reset followed it while the read pointer stayed, the gray pointers
+    // would desync → FIFO corruption (phantom beats, no commit). Same fix as gs_z_flush_writer.
+    reg [1:0] wrst_sync;
+    always_ff @(posedge gs_clk or negedge axi_rst_n) begin
+        if (!axi_rst_n) wrst_sync <= 2'b00;
+        else            wrst_sync <= {wrst_sync[0], 1'b1};
+    end
+    wire fifo_wrst_n = wrst_sync[1];
+    gs_async_fifo #(.WIDTH(PW), .DEPTH(FIFO_DEPTH)) u_fifo (
+        .wclk(gs_clk), .wrst_n(fifo_wrst_n), .wr(fifo_wr && !fifo_wfull), .wdata(fifo_wdata), .wfull(fifo_wfull),
+        .rclk(axi_clk), .rrst_n(axi_rst_n), .rd(fifo_rd), .rdata(fifo_rdata), .rempty(fifo_rempty)
+    );
+
+    // ============================ f2sdram-clock AXI FSM ============================
+    // One FIFO entry becomes one single-beat write: S_IDLE (pop) -> S_AW -> S_W -> S_B -> S_IDLE.
+    localparam logic [1:0] S_IDLE=2'd0, S_AW=2'd1, S_W=2'd2, S_B=2'd3;
+    logic [1:0]   state;
+    logic [31:0]  beat_addr;       // FB-relative address of the popped beat; latched but not read elsewhere in this module
+    logic [255:0] beat_data;
+    logic [31:0]  beat_strb;
+    logic [31:0]  awaddr_q;        // Ch352 — full AW address latched at admission, held stable AW->W->B
+
+    assign awsize  = 3'd5;          // 32 bytes/beat (256-bit)
+    assign awburst = 2'b01;         // INCR
+    assign awid    = 5'd0;
+    assign awlen   = 8'd0;          // single beat per line (tile-rows aren't contiguous)
+    assign awaddr  = awaddr_q;      // Ch352 — latched at admission; STABLE through AW->W->B (AXI requires it)
+    assign wdata   = beat_data;
+    assign wstrb   = beat_strb;
+    assign wlast   = 1'b1;          // 1-beat burst
+    // Ch352 — AXI transaction stability (Codex): arm_axi/commit gate ADMISSION ONLY (S_IDLE pop). Once a beat is
+    // admitted, awvalid/wvalid are driven by STATE alone and run to completion, so a later arm-deassert or a
+    // fb_base commit can never drop VALID mid-handshake or move awaddr while AWVALID && !AWREADY.
+    assign awvalid = (state == S_AW);
+    assign wvalid  = (state == S_W);
+    assign bready  = (state == S_B);
+    // fifo_rd uses the same condition as the S_IDLE admission test below, so every pop is latched.
+    assign fifo_rd = (state == S_IDLE) && !fifo_rempty && arm_axi && !commit_edge;
+    assign idle    = (state == S_IDLE) && fifo_rempty;
+
+    always_ff @(posedge axi_clk or negedge axi_rst_n) begin
+        if (!axi_rst_n) begin
+            state <= S_IDLE; beat_addr <= '0; beat_data <= '0; beat_strb <= '0; awaddr_q <= '0;
+            beats_written <= '0; bursts_issued <= '0; bresp_err_count <= '0;
+        end else begin
+            unique case (state)
+                // Admission: pop one beat and latch everything the transaction needs.
+                S_IDLE: if (!fifo_rempty && arm_axi && !commit_edge) begin
+                    beat_addr <= fifo_rdata[319:288];   // {addr, data, strb}
+                    beat_data <= fifo_rdata[287:32];
+                    beat_strb <= fifo_rdata[31:0];
+                    awaddr_q  <= fb_base_axi + fifo_rdata[319:288];  // latch FULL AW addr from the STABLE base
+                    // canary: write ONLY the offset-0 (top-of-frame) 32-byte line;
+                    // discard every other beat (fifo_rd still pops it this cycle).
+                    if (canary_axi && (fifo_rdata[319:288] != 32'd0))
+                        state <= S_IDLE;
+                    else
+                        state <= S_AW;
+                end
+                // Address phase: AWVALID is held until AWREADY; bursts_issued counts accepted addresses.
+                S_AW: if (awready) begin
+                    bursts_issued <= bursts_issued + 32'd1;
+                    state <= S_W;
+                end
+                // Data phase: WVALID is held until WREADY; beats_written counts accepted data beats.
+                S_W: if (wready) begin
+                    beats_written <= beats_written + 32'd1;
+                    state <= S_B;
+                end
+                // Response phase: any non-zero BRESP is counted as an error; the FSM returns to idle either way.
+                default: if (bvalid) begin   // S_B
+                    if (bresp != 2'b00) bresp_err_count <= bresp_err_count + 32'd1;
+                    state <= S_IDLE;
+                end
+            endcase
+        end
+    end
+endmodule : gs_lpddr_axi_master
@@ -0,0 +1,133 @@
+// retroDE_ps2 — gs_lpddr_fb_writer (Ch317)
+//
+// FIRST LPDDR-backed-framebuffer step: a write sink that takes the GS tile-FLUSH
+// pixel stream (PSMCT16, one pixel per emit) and commits it to an LPDDR-style
+// framebuffer, modelling the real EMIF AXI4 write path so the addressing / data /
+// stride / burst behaviour can be proven in sim before wiring the hard EMIF.
+//
+// SCOPE (Ch317, deliberately tight — see doc 0010 Ch317):
+//   * Tile color/Z stay ON-CHIP; texture stays local. ONLY the framebuffer FLUSH
+//     is redirected here.
+//   * Address gen is the simple linear `fb_base + (screen_y*pitch + screen_x)*bpp`
+//     — which the GS already produces on `raster_pixel_fb_addr_q` for PSMCT16
+//     ((fbp<<11) + (pixel_index<<1)), so we consume that byte address directly.
+//   * PSMCT16 (2 bytes/pixel) — lower bandwidth, already-proven format.
+//   * BURSTS: the flush emits a tile-row's 16 pixels at contiguous +2 byte
+//     addresses, then jumps by `pitch` to the next row. The burst engine COALESCES
+//     a contiguous +2 run into one burst, capped at MAX_BURST_BYTES (the doc 0008
+//     4 KiB-boundary AXI rule). Real per-tile-row burst = 16 beats = 32 bytes.
+//   * A staging FIFO decouples the 1-pixel/cycle emit from the burst engine and
+//     surfaces under/overflow — the realistic shape a hard EMIF AXI master needs.
+//   * Backing memory `fbmem` is byte-addressed and TB-readable for the
+//     write/readback PROOF (a later rung swaps it for the EMIF AXI master +
+//     LPDDR scanout). At enable=0 the whole module is inert (no writes, counters 0).
+//
+// COUNTERS (Codex acceptance — bandwidth/diag): bytes_written, burst_count,
+// busy_cycles (engine draining), fifo_overflow/underflow, fifo_occ_max. The TB
+// computes effective GB/s off bytes_written / (busy_cycles * clk_period).
+
+// gs_lpddr_fb_writer — sim model of the LPDDR framebuffer write sink.
+//
+// Pixels ({byte address, PSMCT16 value}) are pushed into a staging FIFO on
+// `px_emit`, popped one per cycle, and committed as two bytes into `fbmem`.
+// The drain side also tracks how the popped addresses would coalesce into
+// bursts: a pop EXTENDS the current burst only when it is exactly +2 bytes
+// after the previous pop, stays inside the same 4 KiB address page, and keeps
+// the burst within MAX_BURST_BYTES; otherwise it opens a new burst.
+//
+// All state (FIFO, burst tracker, counters) only advances while `enable` is 1;
+// with `enable` low everything holds its value. Reset is async, active-low.
+module gs_lpddr_fb_writer #(
+    parameter int FB_BYTES        = 8192,   // backing FB size (64x64 PSMCT16 = 8 KiB)
+    parameter int FIFO_DEPTH      = 32,     // pixel staging FIFO depth (power-of-2)
+    parameter int MAX_BURST_BYTES = 4096    // AXI4 4 KiB-boundary cap (doc 0008 lesson)
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+    input  logic        enable,             // LPDDR_FB_ENABLE; 0 → fully inert
+
+    // GS tile-flush pixel stream (PSMCT16, one pixel per emit)
+    input  logic        px_emit,
+    input  logic [31:0] px_addr,            // linear FB byte address (raster_pixel_fb_addr_q)
+    input  logic [15:0] px_pix16,           // raster_pixel_color_q[15:0]
+
+    // diagnostics / proof (read hierarchically by the TB; no functional consumers)
+    output logic [31:0] bytes_written,      // +2 per popped pixel
+    output logic [31:0] burst_count,        // +1 each time a pop opens a new burst
+    output logic [31:0] busy_cycles,        // +1 per cycle in which a pixel is popped
+    output logic [31:0] fifo_overflow_count,   // px_emit seen while the FIFO was full (pixel dropped)
+    output logic [31:0] fifo_underflow_count,  // never incremented by this module (see note below)
+    output logic [15:0] fifo_occ            // occupancy HIGH-WATER MARK (max `count` observed)
+);
+    localparam int ADDR_W = $clog2(FB_BYTES);
+    localparam int PTR_W  = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1;
+    // AXI bursts must not cross a 4 KiB address boundary: addr[31:12] is the page number.
+    localparam int AXI_PAGE_LSB = 12;
+
+    // ---- byte-addressed backing framebuffer (the LPDDR model) ----
+    logic [7:0] fbmem [0:FB_BYTES-1];
+
+    // ---- staging FIFO of {addr, pix16} ----
+    logic [31:0] fifo_addr [0:FIFO_DEPTH-1];
+    logic [15:0] fifo_pix  [0:FIFO_DEPTH-1];
+    logic [PTR_W-1:0] wptr, rptr;
+    logic [PTR_W:0]   count;                 // 0..FIFO_DEPTH (PTR_W+1 bits)
+    // count==FIFO_DEPTH sets the top bit (FIFO_DEPTH is a power of 2 == 1<<PTR_W),
+    // so count[PTR_W] alone is the full flag. (Do NOT compare against a PTR_W-wide
+    // literal — PTR_W'(FIFO_DEPTH) truncates FIFO_DEPTH to 0 and reads empty as full.)
+    wire  fifo_full  = count[PTR_W];
+    wire  fifo_empty = (count == '0);
+
+    // ---- burst engine state (coalesce contiguous +2 runs) ----
+    logic        in_burst;                   // currently extending a burst
+    logic [31:0] last_addr;                  // last byte address written
+    logic [31:0] burst_bytes;                // bytes in the current burst so far
+
+    logic              do_push, do_pop;
+    logic [31:0]       a;          // popped byte address
+    logic [15:0]       p;          // popped pixel
+    logic              contig;
+    always_comb begin
+        do_push = px_emit && !fifo_full;
+        do_pop  = !fifo_empty;                 // drain one entry/cycle when available
+        a       = fifo_addr[rptr];
+        p       = fifo_pix [rptr];
+        // Extend the open burst only if this pixel is the next +2 address, lies in
+        // the SAME 4 KiB page as the previous one (a length cap alone does not stop
+        // an unaligned run from straddling a page), and the burst stays within the cap.
+        contig  = in_burst && (a == last_addr + 32'd2)
+                            && (a[31:AXI_PAGE_LSB] == last_addr[31:AXI_PAGE_LSB])
+                            && (burst_bytes + 32'd2 <= 32'(MAX_BURST_BYTES));
+    end
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            wptr <= '0; rptr <= '0; count <= '0;
+            in_burst <= 1'b0; last_addr <= '0; burst_bytes <= '0;
+            bytes_written <= '0; burst_count <= '0; busy_cycles <= '0;
+            fifo_overflow_count <= '0; fifo_underflow_count <= '0; fifo_occ <= '0;
+        end else if (enable) begin
+            // ---- push side: one flushed pixel per emit ----
+            if (px_emit && fifo_full)
+                fifo_overflow_count <= fifo_overflow_count + 32'd1;  // dropped — proof must show 0
+            if (do_push) begin
+                fifo_addr[wptr] <= px_addr;
+                fifo_pix [wptr] <= px_pix16;
+                wptr <= wptr + PTR_W'(1);
+            end
+
+            // ---- drain side: pop one entry/cycle, commit fbmem, coalesce bursts ----
+            if (do_pop) begin
+                // commit the 2 PSMCT16 bytes at the linear address (out-of-range bytes are skipped)
+                if (a < 32'(FB_BYTES))          fbmem[a[ADDR_W-1:0]]        <= p[7:0];
+                // Upper byte lives at a+1. Compare `a` against FB_BYTES-1 instead of
+                // computing a+1: at a == 32'hFFFF_FFFF the 32-bit sum wraps to 0, passes a
+                // `(a+1) < FB_BYTES` test, and the ADDR_W-wide index wraps onto fbmem[0].
+                if (a < 32'(FB_BYTES) - 32'd1)  fbmem[a[ADDR_W-1:0] + 1'b1] <= p[15:8];
+                bytes_written <= bytes_written + 32'd2;
+                busy_cycles   <= busy_cycles + 32'd1;
+                rptr <= rptr + PTR_W'(1);
+                if (contig)
+                    burst_bytes <= burst_bytes + 32'd2;     // extend current burst
+                else begin
+                    burst_count <= burst_count + 32'd1;     // start a NEW burst
+                    burst_bytes <= 32'd2;
+                end
+                in_burst  <= 1'b1;
+                last_addr <= a;
+            end else if (in_burst) begin
+                in_burst <= 1'b0;                            // FIFO drained → close burst
+            end
+
+            // single count update (push and pop net correctly)
+            if (do_push && !do_pop)      count <= count + 1'b1;
+            else if (!do_push && do_pop) count <= count - 1'b1;
+            // (both or neither → unchanged)
+
+            // High-water mark: compares the pre-update `count`, so a new peak is
+            // captured on the cycle after the push that produced it.
+            if (16'(count) > fifo_occ) fifo_occ <= 16'(count);
+            // fifo_underflow_count: the engine never pops empty (do_pop gated on
+            // !fifo_empty), so it stays 0 here — surfaced for the future EMIF rung
+            // where an external AXI master could request beyond the staged data.
+        end
+    end
+endmodule : gs_lpddr_fb_writer
@@ -0,0 +1,155 @@
+// ============================================================================
+// gs_lpddr_rd_arb.sv  (Ch320 Brick 2; Ch322 extended 2:1 -> 3:1)
+//
+// 3:1 AXI4 READ-channel arbiter for the FPGA-private LPDDR4B EMIF user port.
+// Lets the Ch320 scanout reader (port 0, priority), the Ch319 HPS read-probe
+// (port 1), and the Ch322 texture-cache fill (port 2, lowest priority) share the
+// single EMIF read channel. The write channel is arbitrated separately
+// (gs_lpddr_wr_arb). Adapted from ao486 axi_fb_arbiter (read half): grant held
+// for a whole transaction, watchdog force-release, idle-drain rready so a late
+// response can't wedge the bus. All single-clock (emif_clk).
+//
+// Port 2 (texture fill) is a ONE-SHOT prefill before raster; scanout (port 0)
+// keeps priority. Leave s2_* unconnected (arvalid=0) on builds without a texture
+// cache — the arbiter is then bit-for-bit the old 2:1 behavior.
+//
+// Single-beat transactions (ARLEN=0), so a response completes on rvalid&rlast.
+// ============================================================================
+`timescale 1ns/1ps
+
+// gs_lpddr_rd_arb — fixed-priority, one-transaction-at-a-time AXI4 read arbiter.
+//
+// Four slave-side read ports (s0..s3) share one master-side read channel (m_*).
+// While idle (grant == 0) the highest-priority port with arvalid set is granted
+// (s0 > s3 > s1 > s2). The grant is then held until the read response completes
+// (m_rvalid && m_rlast with the selected port's rready), or — ONLY while the AR
+// handshake has not happened yet — until the pre-AR watchdog expires.
+// Reset is async, active-low.
+module gs_lpddr_rd_arb (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // ---- Port 0: scanout reader (priority) ----
+    input  logic [29:0] s0_araddr,
+    input  logic [1:0]  s0_arburst,
+    input  logic [6:0]  s0_arid,
+    input  logic [7:0]  s0_arlen,
+    input  logic [2:0]  s0_arsize,
+    input  logic        s0_arvalid,
+    output logic        s0_arready,
+    output logic [255:0] s0_rdata,
+    output logic [1:0]  s0_rresp,
+    output logic        s0_rlast,
+    output logic        s0_rvalid,
+    input  logic        s0_rready,
+
+    // ---- Port 1: HPS read-probe ----
+    input  logic [29:0] s1_araddr,
+    input  logic [1:0]  s1_arburst,
+    input  logic [6:0]  s1_arid,
+    input  logic [7:0]  s1_arlen,
+    input  logic [2:0]  s1_arsize,
+    input  logic        s1_arvalid,
+    output logic        s1_arready,
+    output logic [255:0] s1_rdata,
+    output logic [1:0]  s1_rresp,
+    output logic        s1_rlast,
+    output logic        s1_rvalid,
+    input  logic        s1_rready,
+
+    // ---- Port 2: texture-cache fill (lowest priority; Ch322) ----
+    input  logic [29:0] s2_araddr,
+    input  logic [1:0]  s2_arburst,
+    input  logic [6:0]  s2_arid,
+    input  logic [7:0]  s2_arlen,
+    input  logic [2:0]  s2_arsize,
+    input  logic        s2_arvalid,
+    output logic        s2_arready,
+    output logic [255:0] s2_rdata,
+    output logic [1:0]  s2_rresp,
+    output logic        s2_rlast,
+    output logic        s2_rvalid,
+    input  logic        s2_rready,
+
+    // ---- Port 3: tile-reload fill (Ch323; priority ABOVE probe/texfill, below scanout) ----
+    input  logic [29:0] s3_araddr,
+    input  logic [1:0]  s3_arburst,
+    input  logic [6:0]  s3_arid,
+    input  logic [7:0]  s3_arlen,
+    input  logic [2:0]  s3_arsize,
+    input  logic        s3_arvalid,
+    output logic        s3_arready,
+    output logic [255:0] s3_rdata,
+    output logic [1:0]  s3_rresp,
+    output logic        s3_rlast,
+    output logic        s3_rvalid,
+    input  logic        s3_rready,
+
+    // ---- Master out: EMIF read channel ----
+    output logic [29:0] m_araddr,
+    output logic [1:0]  m_arburst,
+    output logic [6:0]  m_arid,
+    output logic [7:0]  m_arlen,
+    output logic [2:0]  m_arsize,
+    output logic        m_arvalid,
+    input  logic        m_arready,
+    input  logic [255:0] m_rdata,
+    input  logic [1:0]  m_rresp,
+    input  logic        m_rlast,
+    input  logic        m_rvalid,
+    output logic        m_rready
+);
+    // grant: 0=idle, 1=s0 scanout, 2=s1 probe, 3=s2 texfill, 4=s3 tile-reload.
+    // EXPLICIT priority (Ch323, Codex): scanout > tile_reload > probe > texture_fill — i.e.
+    // s0 > s3 > s1 > s2. Render-display (scanout) highest; the render-prep tile reload above
+    // the debug read-probe so a debug read can never starve a render's Z/color reload.
+    reg [2:0]  grant;
+    // Ch326 — NON-ABORTING ARBITER (Codex). The OLD design force-released the grant on a
+    // watchdog (was 2^10 ~3.3us) at ANY point in the transaction; when it fired AFTER the AR had
+    // handshaked, the idle state's m_rready=1 drained the now-orphaned response and the requester
+    // hung forever (blank HDMI + stuck HPS probe under the always-on-scanout traffic). Once
+    // m_arvalid && m_arready, the read is COMMITTED and its response BELONGS to that requester —
+    // there is no AXI-legal way to abandon it. So: the watchdog gates ONLY the pre-AR wait (no
+    // transaction committed yet — safe to drop); after AR acceptance the grant is held until
+    // m_rvalid && m_rlast && selected_rready, regardless of how long the read takes.
+    reg        ar_done;        // AR handshake captured for the active grant -> never abort past here
+    reg [21:0] watchdog;       // pre-AR only (waiting for m_arready); ~6.7 ms @ 310 MHz dead-bus backstop
+    wire       wd_expired = watchdog[21];
+    // AR handshake completing THIS cycle. ar_done only reflects it on the next cycle, so
+    // the abort decision below must look at ar_fire as well.
+    wire       ar_fire    = m_arvalid && m_arready;
+    wire       sel_rready = (grant==3'd1)?s0_rready:(grant==3'd2)?s1_rready:
+                            (grant==3'd3)?s2_rready:(grant==3'd4)?s3_rready:1'b1;
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            grant <= 3'd0; ar_done <= 1'b0; watchdog <= '0;
+        end else if (grant == 3'd0) begin
+            // Idle: clear per-transaction state and pick the next requester by fixed priority.
+            ar_done <= 1'b0; watchdog <= '0;
+            if      (s0_arvalid) grant <= 3'd1;   // scanout (highest)
+            else if (s3_arvalid) grant <= 3'd4;   // tile reload (render-prep)
+            else if (s1_arvalid) grant <= 3'd2;   // read probe (debug)
+            else if (s2_arvalid) grant <= 3'd3;   // texture fill (lowest)
+        end else begin
+            if (ar_fire) ar_done <= 1'b1;                           // AR accepted -> COMMITTED
+            if (m_rvalid && m_rlast && sel_rready) begin
+                grant <= 3'd0; ar_done <= 1'b0; watchdog <= '0;     // response delivered -> release
+            end else if (!ar_done && !ar_fire) begin                // still waiting for AR (nothing owed)
+                // Excluding ar_fire closes the race where the watchdog expires on the very
+                // cycle the AR is accepted: releasing then would orphan a committed read.
+                if (wd_expired) begin grant <= 3'd0; ar_done <= 1'b0; watchdog <= '0; end
+                else watchdog <= watchdog + 22'd1;
+            end
+            // ar_done (or ar_fire) && response not yet complete: HOLD the grant, never abort.
+        end
+    end
+
+    // AR mux: forward the granted port's address channel. The attribute muxes fall through
+    // to s0 when idle, which is harmless because m_arvalid is forced low in that state.
+    assign m_araddr  = (grant==3'd4)?s3_araddr :(grant==3'd3)?s2_araddr :(grant==3'd2)?s1_araddr :s0_araddr;
+    assign m_arburst = (grant==3'd4)?s3_arburst:(grant==3'd3)?s2_arburst:(grant==3'd2)?s1_arburst:s0_arburst;
+    assign m_arid    = (grant==3'd4)?s3_arid   :(grant==3'd3)?s2_arid   :(grant==3'd2)?s1_arid   :s0_arid;
+    assign m_arlen   = (grant==3'd4)?s3_arlen  :(grant==3'd3)?s2_arlen  :(grant==3'd2)?s1_arlen  :s0_arlen;
+    assign m_arsize  = (grant==3'd4)?s3_arsize :(grant==3'd3)?s2_arsize :(grant==3'd2)?s1_arsize :s0_arsize;
+    assign m_arvalid = (grant==3'd1)?s0_arvalid:(grant==3'd2)?s1_arvalid:(grant==3'd3)?s2_arvalid:(grant==3'd4)?s3_arvalid:1'b0;
+    assign s0_arready = (grant==3'd1)?m_arready:1'b0;
+    assign s1_arready = (grant==3'd2)?m_arready:1'b0;
+    assign s2_arready = (grant==3'd3)?m_arready:1'b0;
+    assign s3_arready = (grant==3'd4)?m_arready:1'b0;
+
+    // R demux: data/resp/last are broadcast; only the granted port sees rvalid.
+    // (idle: rready=1 drains any stale/late response)
+    assign s0_rdata=m_rdata; assign s1_rdata=m_rdata; assign s2_rdata=m_rdata; assign s3_rdata=m_rdata;
+    assign s0_rresp=m_rresp; assign s1_rresp=m_rresp; assign s2_rresp=m_rresp; assign s3_rresp=m_rresp;
+    assign s0_rlast=m_rlast; assign s1_rlast=m_rlast; assign s2_rlast=m_rlast; assign s3_rlast=m_rlast;
+    assign s0_rvalid = (grant==3'd1)?m_rvalid:1'b0;
+    assign s1_rvalid = (grant==3'd2)?m_rvalid:1'b0;
+    assign s2_rvalid = (grant==3'd3)?m_rvalid:1'b0;
+    assign s3_rvalid = (grant==3'd4)?m_rvalid:1'b0;
+    assign m_rready  = (grant==3'd1)?s0_rready:(grant==3'd2)?s1_rready:(grant==3'd3)?s2_rready:(grant==3'd4)?s3_rready:1'b1;
+endmodule
@@ -0,0 +1,117 @@
+// ============================================================================
+// gs_lpddr_rd_probe.sv  (Ch319 Brick 3)
+//
+// HPS-triggered single-word AXI4 READ probe for FPGA-private LPDDR4B.
+//
+// Lets the HPS read framebuffer bytes back THROUGH THE HPS BRIDGE (never
+// /dev/mem) for checksum + screen-dump. Drives the EMIF user port's READ
+// channel (AR/R) only — the write channel (AW/W/B) is the GS tile-flush
+// writer (gs_lpddr_axi_master); read and write channels are independent, so
+// the two masters share the one EMIF port with no arbitration.
+//
+// Runs on axi_clk (= emif_clk, ~310 MHz). The control input `rd_pulse` is a
+// TOGGLE in the bridge (design_clk) domain; it is 2-FF synced + edge-detected
+// here. The outputs `rd_done` (toggle) + `rd_data` are produced in axi_clk;
+// the bridge syncs `rd_done` and latches `rd_data` on its edge (same return-
+// path CDC contract as ao486 ao486_hps_bridge ↔ lpddr4b_loader).
+//
+// One AXI read = one 32-byte (256-bit) beat; the requested 32-bit word is the
+// lane selected by addr[4:2] (8 lanes per beat). araddr is 32-byte aligned.
+// ============================================================================
+
+// gs_lpddr_rd_probe — one-shot AXI4 single-beat reader.
+//
+// Any change of `rd_pulse` (after a 3-stage shift register on axi_clk) starts one
+// read: the 32-byte-aligned form of `rd_addr` is issued on AR, the single R beat is
+// awaited, the 32-bit lane picked by rd_addr[4:2] is stored in `rd_data`, and
+// `rd_done` is toggled. `rd_busy` is high from request acceptance until completion.
+// Reset is synchronous (sampled on axi_clk), active-low. rresp and rlast are not
+// examined.
+module gs_lpddr_rd_probe #(
+    parameter ADDR_W = 30
+)(
+    input  logic              axi_clk,    // emif_clk
+    input  logic              axi_rst_n,  // emif_reset_n (EMIF cal-ready)
+
+    // ---- control / status (rd_pulse is a design_clk-domain toggle) ----
+    input  logic              rd_pulse,   // toggles when the HPS requests a read
+    input  logic [31:0]       rd_addr,    // byte address (stable when rd_pulse toggles)
+    output logic              rd_done,     // toggles (axi_clk) on completion
+    output logic [31:0]       rd_data,     // 32-bit word (stable after rd_done edge)
+    output logic              rd_busy,
+
+    // ---- AXI4 READ channel to the EMIF user port (axi_clk, 256-bit data) ----
+    output logic [ADDR_W-1:0] araddr,
+    output logic [1:0]        arburst,
+    output logic [6:0]        arid,
+    output logic [7:0]        arlen,
+    output logic [2:0]        arsize,
+    output logic              arvalid,
+    input  logic              arready,
+    input  logic [255:0]      rdata,
+    input  logic [1:0]        rresp,
+    input  logic              rlast,
+    input  logic              rvalid,
+    output logic              rready
+);
+    // ---- fixed AR attributes: one full-width (32-byte) INCR beat, ID 1 ----
+    assign arid    = 7'd1;    // distinct from the writer (awid = 0)
+    assign arlen   = 8'd0;    // single beat
+    assign arsize  = 3'b101;  // 32 bytes (full 256-bit width)
+    assign arburst = 2'b01;   // INCR
+
+    // ---- request capture: shift rd_pulse through three flops, flag any level change ----
+    reg [2:0] pulse_sync;
+    wire pulse_edge = (pulse_sync[2] != pulse_sync[1]);
+
+    always_ff @(posedge axi_clk) begin
+        if (!axi_rst_n) pulse_sync <= 3'd0;
+        else            pulse_sync <= {pulse_sync[1:0], rd_pulse};
+    end
+
+    // ---- lane select: pick the 32-bit word of the 256-bit beat named by rd_lane ----
+    reg [2:0] rd_lane;        // latched rd_addr[4:2] of the request in flight
+    logic [31:0] lane_word;
+    always_comb begin
+        case (rd_lane)
+            3'd0:    lane_word = rdata[31:0];
+            3'd1:    lane_word = rdata[63:32];
+            3'd2:    lane_word = rdata[95:64];
+            3'd3:    lane_word = rdata[127:96];
+            3'd4:    lane_word = rdata[159:128];
+            3'd5:    lane_word = rdata[191:160];
+            3'd6:    lane_word = rdata[223:192];
+            default: lane_word = rdata[255:224];
+        endcase
+    end
+
+    // ---- transaction FSM: IDLE -> AR (address handshake) -> R (data beat) -> IDLE ----
+    typedef enum logic [1:0] { S_IDLE, S_AR, S_R } st_t;
+    st_t st;
+
+    always_ff @(posedge axi_clk) begin
+        if (!axi_rst_n) begin
+            st      <= S_IDLE;
+            rd_lane <= 3'd0;
+            araddr  <= '0;
+            arvalid <= 1'b0;
+            rready  <= 1'b0;
+            rd_busy <= 1'b0;
+            rd_done <= 1'b0;
+            rd_data <= 32'd0;
+        end else begin
+            case (st)
+                // Wait for a request; latch lane + aligned address and raise ARVALID.
+                S_IDLE: if (pulse_edge) begin
+                    st      <= S_AR;
+                    rd_busy <= 1'b1;
+                    arvalid <= 1'b1;
+                    araddr  <= {rd_addr[ADDR_W-1:5], 5'd0};  // 32-byte aligned
+                    rd_lane <= rd_addr[4:2];
+                end
+
+                // Hold ARVALID until accepted, then open the R channel.
+                S_AR: if (arready) begin
+                    st      <= S_R;
+                    arvalid <= 1'b0;
+                    rready  <= 1'b1;
+                end
+
+                // Take the beat, publish the selected word, and signal completion.
+                S_R: if (rvalid) begin
+                    st      <= S_IDLE;
+                    rready  <= 1'b0;
+                    rd_data <= lane_word;
+                    rd_busy <= 1'b0;
+                    rd_done <= ~rd_done;
+                end
+
+                default: st <= S_IDLE;
+            endcase
+        end
+    end
+endmodule
@@ -0,0 +1,173 @@
+// ============================================================================
+// gs_lpddr_scanout.sv  (Ch320 Brick 1)
+//
+// LPDDR4B-backed scanout for the SMALL framebuffer demo (explicitly scoped to
+// a tiny frame — default 64x64 PSMCT16 = 8 KiB). Instead of ao486-style line
+// buffering/streaming, it copies the WHOLE framebuffer from LPDDR4B into a
+// small on-chip cache once per frame, then serves scanout pixels from the
+// cache. When the framebuffer grows past "tiny demo", revisit ao486's
+// vga_fb_ddr line-buffer approach.
+//
+// Pixel mapping is automatic: the GS writer mirrors BRAM VRAM byte-for-byte
+// into LPDDR4B, so cache[addr] == BRAM_VRAM[addr]. We index the cache by the
+// PCRTC's own `vram_read_addr`, so the decoded pixel is identical to the BRAM
+// scanout pixel for the same raster position — the video-source mux is seamless
+// regardless of swizzle/MAG.
+//
+// Two clock domains:
+//   axi_clk   (emif_clk, ~310 MHz) — fill the cache from LPDDR4B over AXI4
+//   video_clk (design_clk)         — index the cache by vram_read_addr -> r/g/b
+//
+// Fill happens on frame_start (vsync) and completes during vblank (256 single
+// beats ~ 1 us); scanout reads the stable cache during the active region. The
+// read channel is shared with the Ch319 read-probe via an external arbiter.
+// ============================================================================
+`timescale 1ns/1ps
+
+// gs_lpddr_scanout — whole-frame cache scanout.
+//
+// Fill side (axi_clk): on any change of `frame_start` (after a 3-stage shift
+// register) while `enable` is 1 and the fill FSM is idle, issue CACHE_BEATS
+// single-beat reads starting at FB_BASE (+32 bytes per beat) and store each
+// 256-bit beat into `cache`. `cache_valid` is cleared when a fill starts and set
+// after the last beat. Reset is synchronous to axi_clk, active-low.
+//
+// Scanout side (video_clk): `vram_read_addr - VRAM_BASE` selects a cache word and a
+// pixel inside it; the decoded r/g/b appear one video_clk later. Output is forced
+// to black when the cache is not valid or the address is outside the cached span.
+module gs_lpddr_scanout #(
+    parameter [29:0] FB_BASE     = 30'd0,   // LPDDR byte base to FETCH the framebuffer from
+    parameter int    CACHE_BEATS = 256,     // 256 * 32 B = 8 KiB = 64x64 PSMCT16
+    // Ch324 — the LPDDR fetch base (FB_BASE) and the PCRTC vram_read_addr base can DIFFER:
+    // the spill framebuffer lives at COLOR_SPILL_BASE in LPDDR but the PCRTC addresses it
+    // BRAM-relative (0-based). VRAM_BASE is the vram_read_addr origin (defaults to FB_BASE
+    // for the Ch320/321 mirror case where they coincide).
+    parameter [29:0] VRAM_BASE   = FB_BASE,
+    // Ch324 — pixel format: 0 = PSMCT16 (RGBA5551, 16 px/beat), 1 = PSMCT32 (ABGR, 8 px/beat).
+    parameter bit    PSMCT32     = 1'b0
+)(
+    // ---- AXI read clock domain (emif_clk) ----
+    input  logic         axi_clk,
+    input  logic         axi_rst_n,
+    input  logic         enable,        // 1 = refill the cache on frame_start
+    input  logic         frame_start,   // video-domain pulse/level (vsync); synced internally
+
+    // ---- video clock domain (design_clk) ----
+    input  logic         video_clk,
+    input  logic [31:0]  vram_read_addr,
+    output logic [7:0]   r,
+    output logic [7:0]   g,
+    output logic [7:0]   b,
+
+    // ---- status (axi_clk domain; the bridge syncs these) ----
+    output logic         cache_valid,    // a full frame has been loaded
+    output logic [31:0]  rd_beats,       // beats read (cumulative)
+    output logic [31:0]  rd_errs,        // non-OKAY read responses (cumulative)
+
+    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
+    output logic [29:0]  araddr,
+    output logic [1:0]   arburst,
+    output logic [6:0]   arid,
+    output logic [7:0]   arlen,
+    output logic [2:0]   arsize,
+    output logic         arvalid,
+    input  logic         arready,
+    input  logic [255:0] rdata,
+    input  logic [1:0]   rresp,
+    input  logic         rlast,
+    input  logic         rvalid,
+    output logic         rready
+);
+    localparam int BEAT_BITS = $clog2(CACHE_BEATS);   // 8 for 256
+    localparam int FB_SPAN    = CACHE_BEATS * 32;      // 8192 bytes
+
+    assign arburst = 2'b01;   // INCR
+    assign arid    = 7'd2;    // distinct from writer (0) and read-probe (1)
+    assign arlen   = 8'd0;    // single beat
+    assign arsize  = 3'b101;  // 32 bytes
+
+    // Frame cache: one 256-bit word per 32-byte beat.
+    logic [255:0] cache [0:CACHE_BEATS-1];
+
+    // ---------------- fill side (axi_clk) ----------------
+    // NOTE(review): fs_edge fires on EITHER edge of the synced frame_start. A wide
+    // level-style vsync whose falling edge arrives after the fill has finished will
+    // trigger a second fill — confirm that is intended for the driving signal.
+    logic [2:0] fs_sync;
+    wire fs_edge = (fs_sync[2] != fs_sync[1]);
+
+    typedef enum logic [1:0] { F_IDLE, F_AR, F_R } fstate_t;
+    fstate_t fst;
+    logic [BEAT_BITS:0] beat;     // 0..CACHE_BEATS (extra bit for the terminal compare)
+
+    always_ff @(posedge axi_clk) begin
+        if (!axi_rst_n) begin
+            fs_sync <= 3'd0; fst <= F_IDLE; beat <= '0;
+            araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
+            cache_valid <= 1'b0; rd_beats <= 32'd0; rd_errs <= 32'd0;
+        end else begin
+            fs_sync <= {fs_sync[1:0], frame_start};
+            case (fst)
+                // Start a fill: invalidate the cache and request beat 0.
+                F_IDLE: begin
+                    if (fs_edge && enable) begin
+                        beat        <= '0;
+                        cache_valid <= 1'b0;
+                        araddr      <= FB_BASE;
+                        arvalid     <= 1'b1;
+                        fst         <= F_AR;
+                    end
+                end
+                // Hold ARVALID until the address is accepted, then accept the data beat.
+                F_AR: begin
+                    if (arready) begin
+                        arvalid <= 1'b0;
+                        rready  <= 1'b1;
+                        fst     <= F_R;
+                    end
+                end
+                // Store the beat; either finish the frame or request the next beat.
+                F_R: begin
+                    if (rvalid) begin
+                        cache[beat[BEAT_BITS-1:0]] <= rdata;
+                        rready   <= 1'b0;
+                        rd_beats <= rd_beats + 32'd1;
+                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
+                        if (beat == CACHE_BEATS-1) begin
+                            cache_valid <= 1'b1;
+                            fst         <= F_IDLE;
+                        end else begin
+                            beat    <= beat + 1'b1;
+                            // next beat address = FB_BASE + (beat+1)*32, zero-extended to 30 bits
+                            araddr  <= FB_BASE + (({{(30-BEAT_BITS-1){1'b0}}, (beat + 1'b1)}) << 5);
+                            arvalid <= 1'b1;
+                            fst     <= F_AR;
+                        end
+                    end
+                end
+                default: fst <= F_IDLE;
+            endcase
+        end
+    end
+
+    // ---------------- scanout side (video_clk) ----------------
+    // CDC: cache_valid is an axi_clk register. Bring it into video_clk through a
+    // 2-flop synchronizer before it takes part in any video-domain logic; feeding it
+    // straight into the in_range AND would leave a single flop (behind combinational
+    // logic) as the only barrier against metastability on the r/g/b gate.
+    // Cost: in_range follows cache_valid two video_clk cycles later.
+    logic [1:0] cv_sync = 2'b00;
+    always_ff @(posedge video_clk) cv_sync <= {cv_sync[0], cache_valid};
+    wire cache_valid_v = cv_sync[1];
+
+    // Byte offset of the requested pixel within the framebuffer (vram_read_addr is BRAM-relative
+    // = VRAM_BASE-origin; the cache holds the SAME bytes fetched from FB_BASE in LPDDR).
+    wire [31:0] off       = vram_read_addr - {2'b00, VRAM_BASE};
+    wire [BEAT_BITS-1:0] beat_ix = off[BEAT_BITS+4 -: BEAT_BITS];  // off>>5, low BEAT_BITS
+    wire [3:0]  hw_sel    = off[4:1];                              // PSMCT16: 16 halfwords / beat
+    wire [2:0]  w_sel     = off[4:2];                              // PSMCT32: 8 words / beat
+    wire in_range = cache_valid_v
+                 && (vram_read_addr >= {2'b00, VRAM_BASE})
+                 && (off < FB_SPAN);
+
+    // Registered (sync-read) cache lookup — 1-cycle latency to match the PCRTC's
+    // VRAM_SYNC_READ pixel timing so the muxed output aligns with PCRTC de/sync.
+    // Split the array-index and the part-select across the register boundary
+    // (chained index+part-select in one expr trips iverilog-12).
+    logic [255:0] word_q;
+    logic [3:0]   hw_q;
+    logic [2:0]   w_q;
+    logic         in_range_q;
+    always_ff @(posedge video_clk) begin
+        word_q     <= cache[beat_ix];
+        hw_q       <= hw_sel;
+        w_q        <= w_sel;
+        in_range_q <= in_range;
+    end
+    wire [15:0] px16_q = word_q[hw_q*16 +: 16];
+    wire [31:0] px32_q = word_q[w_q*32  +: 32];
+
+    // PSMCT16 (RGBA5551): R[4:0] G[9:5] B[14:10], 5->8 by bit-replication ({c5,c5[4:2]}).
+    wire [4:0] r5 = px16_q[4:0];
+    wire [4:0] g5 = px16_q[9:5];
+    wire [4:0] b5 = px16_q[14:10];
+    wire [7:0] r16 = {r5, r5[4:2]}, g16 = {g5, g5[4:2]}, b16 = {b5, b5[4:2]};
+    // PSMCT32 (ABGR8888): R[7:0] G[15:8] B[23:16] (A discarded) — identical decode to gs_pcrtc.
+    wire [7:0] r32 = px32_q[7:0], g32 = px32_q[15:8], b32 = px32_q[23:16];
+    // Black outside the cached window or while no valid frame is loaded.
+    assign r = !in_range_q ? 8'd0 : (PSMCT32 ? r32 : r16);
+    assign g = !in_range_q ? 8'd0 : (PSMCT32 ? g32 : g16);
+    assign b = !in_range_q ? 8'd0 : (PSMCT32 ? b32 : b16);
+endmodule
@@ -0,0 +1,213 @@
+// ============================================================================
+// gs_lpddr_scanout_lb.sv  (Ch321 Brick 2)
+//
+// LINE-BUFFER LPDDR4B scanout — the architectural successor to the whole-frame
+// cache (gs_lpddr_scanout). Instead of mirroring the entire framebuffer in
+// on-chip RAM (which defeats the point of putting the FB in LPDDR), this holds
+// just TWO scanlines: it displays row L from one buffer while prefetching row
+// L+1 into the other. On-chip cost is O(width), not O(width*height).
+//
+// NARROW SCOPE (Ch321): the 128x128 PSMCT16 demo. The frame is LINEAR (the GS
+// writer mirrors the rasterizer's linear flush addresses), display window at
+// origin, 1:1 (MAG off) — so the reader serves pixel (col=pixel_x, line=pixel_y)
+// directly when inside the window. No general MAG/window handling beyond that.
+//
+// Two clock domains:
+//   axi_clk   (emif_clk) — AXI4 burst-read one row (ROW_BEATS beats) into a buffer
+//   video_clk (design)   — pixel_x/pixel_y index the active line buffer -> r/g/b
+//
+// Prefetch scheme: there is NO per-line request toggle. The video side only
+// exposes the current in-window display row (reset on vsync); the axi side
+// free-runs, fetching rows sequentially into buffer (row & 1) while staying at
+// most one row ahead of the display. `underflow` flags any pixel read before
+// its row finished loading.
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_lpddr_scanout_lb #(
+    parameter [29:0] FB_BASE     = 30'd0,
+    parameter int    STRIDE_BYTES = 256,   // PSMCT16 128px*2B=256; PSMCT32 128px*4B=512
+    parameter int    ROW_BEATS    = 8,     // STRIDE_BYTES / 32  (PSMCT16 128px=8; PSMCT32 128px=16)
+    parameter int    N_ROWS       = 128,
+    // Ch327a — PSMCT32 (ABGR8888, 8 px/256-bit beat) vs the original PSMCT16 (RGBA5551,
+    // 16 px/beat). The Ch326 LPDDR-only spill framebuffer is PSMCT32 @ COLOR_SPILL_BASE, so the
+    // line-buffer must decode it — NOT a config flip of the Ch321 PSMCT16/FB-at-0 path.
+    parameter bit    PSMCT32      = 1'b0
+)(
+    // ---- AXI read clock domain (emif_clk) ----
+    input  logic         axi_clk,
+    input  logic         axi_rst_n,
+    input  logic         enable,        // 1 = active (prefetch + serve)
+
+    // ---- video clock domain (design_clk) ----
+    input  logic         video_clk,
+    input  logic         frame_start,   // vsync pulse/level (synced internally; ANY edge restarts)
+    input  logic [11:0]  pixel_x,       // raster column (display)
+    input  logic [11:0]  pixel_y,       // raster line   (display)
+    input  logic         in_window,     // PCRTC displayed-frame window gate
+    output logic [7:0]   r,
+    output logic [7:0]   g,
+    output logic [7:0]   b,
+
+    // ---- status ----
+    output logic         line_valid,    // (axi_clk) at least one row has been loaded
+    output logic         underflow,     // (video_clk register) a pixel was read before its row was ready (sticky)
+    output logic [31:0]  rd_errs,       // (axi_clk) non-OKAY read responses (cumulative)
+
+    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
+    output logic [29:0]  araddr,
+    output logic [1:0]   arburst,
+    output logic [6:0]   arid,
+    output logic [7:0]   arlen,
+    output logic [2:0]   arsize,
+    output logic         arvalid,
+    input  logic         arready,
+    input  logic [255:0] rdata,
+    input  logic [1:0]   rresp,
+    input  logic         rlast,
+    input  logic         rvalid,
+    output logic         rready
+);
+    // Beat-index width (3 for 8 beats). Clamped to >= 1 so ROW_BEATS == 1 still
+    // elaborates: $clog2(1) is 0, which would otherwise create zero-width
+    // part-selects (col_beat, beat[RB_BITS-1:0]).
+    localparam int RB_BITS = (ROW_BEATS > 1) ? $clog2(ROW_BEATS) : 1;
+    // Row-counter width: one bit more than needed for 0..N_ROWS-1 so the value
+    // N_ROWS itself ("all rows fetched") is representable.
+    localparam int ROW_W   = $clog2(N_ROWS) + 1;
+
+    assign arburst = 2'b01;   // INCR
+    assign arid    = 7'd3;    // distinct: writer=0, probe=1, frame-cache=2, line-buf=3
+    assign arlen   = 8'd0;    // SINGLE-BEAT per read — the only AXI read pattern proven on this
+                              // EMIF (writer/probe/frame-cache all use arlen=0). A multi-beat
+                              // burst (arlen=ROW_BEATS-1) was untested and garbled on hardware.
+    assign arsize  = 3'b101;  // 32 bytes
+
+    // Two line buffers, ROW_BEATS x 256-bit each (one display row).
+    // Written on axi_clk, read on video_clk.
+    logic [255:0] lb0 [0:ROW_BEATS-1];
+    logic [255:0] lb1 [0:ROW_BEATS-1];
+
+    // ================= video side (video_clk) =================
+    // No miss-prone request toggle. The video side just exposes the current
+    // in-window display row; the axi side free-runs, fetching rows sequentially
+    // and staying one row ahead (see below). disp_row_v resets on vsync.
+    logic [ROW_W-1:0] disp_row_v;
+    logic [2:0]  fs_sync_v;
+    wire         fs_edge_v = (fs_sync_v[2] != fs_sync_v[1]);   // fires on either edge of frame_start
+    // The buffer holding display line L is L&1 (row L is fetched into L&1). Select
+    // it DIRECTLY from pixel_y[0] (tracks the current pixel) — a separately-registered
+    // "disp_buf" lags by one cycle and corrupts col 0 of each line.
+    wire disp_buf = pixel_y[0];
+
+    always_ff @(posedge video_clk) begin
+        if (!enable) begin
+            disp_row_v <= '0; fs_sync_v <= 3'd0;
+        end else begin
+            fs_sync_v <= {fs_sync_v[1:0], frame_start};
+            if (fs_edge_v)                              disp_row_v <= '0;
+            else if (in_window && (pixel_y < N_ROWS))   disp_row_v <= (ROW_W)'(pixel_y);
+        end
+    end
+
+    // Registered (sync-read) pixel: pick buffer + beat + within-beat lane from pixel_x.
+    // PSMCT32: 8 px/256-bit beat -> beat = pixel_x>>3, lane = pixel_x[2:0] (32-bit).
+    // PSMCT16: 16 px/beat       -> beat = pixel_x>>4, lane = pixel_x[3:0] (16-bit).
+    localparam int PXSH    = PSMCT32 ? 3 : 4;                 // px-per-beat shift
+    localparam int PX_PER_ROW = PSMCT32 ? (STRIDE_BYTES/4) : (STRIDE_BYTES/2);
+    wire [RB_BITS-1:0] col_beat = pixel_x[RB_BITS+PXSH-1 -: RB_BITS];
+    wire [3:0]         col_lane = PSMCT32 ? {1'b0, pixel_x[2:0]} : pixel_x[3:0];
+    logic [255:0] word_q; logic [3:0] lane_q; logic in_q;
+    always_ff @(posedge video_clk) begin
+        word_q <= disp_buf ? lb1[col_beat] : lb0[col_beat];
+        lane_q <= col_lane;
+        // Outside the window / row / frame the output is forced to black below.
+        in_q   <= in_window && (pixel_x < PX_PER_ROW) && (pixel_y < N_ROWS);
+    end
+    // PSMCT32 ABGR8888 (r=[7:0],g=[15:8],b=[23:16]) — matches gs_lpddr_scanout (frame-cache).
+    wire [31:0] px32 = word_q[lane_q[2:0]*32 +: 32];   // 3-bit lane: always in-range (0..224)
+    wire [7:0]  r32 = px32[7:0], g32 = px32[15:8], b32 = px32[23:16];
+    // PSMCT16 RGBA5551 5-bit lanes expanded to 8-bit by bit replication.
+    wire [15:0] px16 = word_q[lane_q*16 +: 16];
+    wire [4:0] r5 = px16[4:0], g5 = px16[9:5], b5 = px16[14:10];
+    assign r = !in_q ? 8'd0 : (PSMCT32 ? r32 : {r5, r5[4:2]});
+    assign g = !in_q ? 8'd0 : (PSMCT32 ? g32 : {g5, g5[4:2]});
+    assign b = !in_q ? 8'd0 : (PSMCT32 ? b32 : {b5, b5[4:2]});
+
+    // ================= axi side (axi_clk) — row fill FSM =================
+    // Free-running prefetcher: fetch rows sequentially, staying <= disp_row+1 ahead.
+    // disp_row crosses video->axi as a plain binary bus. An incoherent sample can at
+    // worst launch ONE fetch early (the FSM leaves L_IDLE immediately), and that
+    // fetch is the row the settled value permits anyway, so this crossing is left
+    // un-encoded. frame_start is edge-detected here to reset next_fetch every frame.
+    logic [2:0] fs_sync_e;
+    wire        fs_edge_e = (fs_sync_e[2] != fs_sync_e[1]);
+    logic [ROW_W-1:0] disp_row_s0, disp_row_e;
+    logic [ROW_W-1:0] next_fetch;     // next row to load (0..N_ROWS)
+    // Gray-coded, registered copy of next_fetch. This is what crosses to video_clk:
+    // a +1 step changes exactly one bit, so the video side can only ever sample the
+    // old or the new count — never an unrelated value.
+    logic [ROW_W-1:0] nf_gray_e;
+    typedef enum logic [1:0] { L_IDLE, L_AR, L_R } lstate_t;
+    lstate_t lst;
+    logic [ROW_W-1:0] cur_row;        // row currently being filled (kept for debug visibility)
+    logic        cur_buf;             // destination buffer of the row being filled (row & 1)
+    logic [RB_BITS:0] beat;           // beat index within the row being filled
+    logic        fs_pending;   // a vsync restart is pending; applied in L_IDLE (never mid-read)
+
+    always_ff @(posedge axi_clk) begin
+        if (!axi_rst_n) begin
+            fs_sync_e <= 3'd0; disp_row_s0 <= '0; disp_row_e <= '0; next_fetch <= '0;
+            nf_gray_e <= '0;
+            lst <= L_IDLE; araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
+            cur_row <= '0; cur_buf <= 1'b0; beat <= '0;
+            line_valid <= 1'b0; rd_errs <= 32'd0; fs_pending <= 1'b0;
+        end else begin
+            fs_sync_e   <= {fs_sync_e[1:0], frame_start};
+            disp_row_s0 <= disp_row_v;     // 2-FF sync of the display row
+            disp_row_e  <= disp_row_s0;
+            nf_gray_e   <= next_fetch ^ (next_fetch >> 1);   // binary -> Gray, registered before the CDC
+            // vsync: mark a prefetch restart. DEFER it to L_IDLE so an in-flight AXI
+            // read is never aborted mid-handshake (which would deadlock the slave).
+            if (fs_edge_e) fs_pending <= 1'b1;
+            case (lst)
+                L_IDLE: begin
+                    if (fs_pending) begin
+                        next_fetch <= '0;       // restart prefetch sequence from row 0
+                        fs_pending <= 1'b0;
+                    end else if (enable && (next_fetch < N_ROWS) && (next_fetch <= disp_row_e + 1'b1)) begin
+                        cur_row <= next_fetch;
+                        cur_buf <= next_fetch[0];
+                        araddr  <= FB_BASE + (next_fetch * STRIDE_BYTES);
+                        beat    <= '0;
+                        arvalid <= 1'b1;
+                        lst     <= L_AR;
+                    end
+                end
+                L_AR: begin
+                    if (arready) begin
+                        arvalid <= 1'b0;
+                        rready  <= 1'b1;
+                        lst     <= L_R;
+                    end
+                end
+                L_R: begin
+                    if (rvalid) begin
+                        if (cur_buf) lb1[beat[RB_BITS-1:0]] <= rdata;
+                        else         lb0[beat[RB_BITS-1:0]] <= rdata;
+                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
+                        rready <= 1'b0;
+                        if (beat == ROW_BEATS-1) begin
+                            line_valid <= 1'b1;
+                            next_fetch <= next_fetch + 1'b1;  // advance prefetch (rows 0..next_fetch-1 loaded)
+                            lst        <= L_IDLE;
+                        end else begin
+                            // next single-beat read of this row (arlen=0 each).
+                            beat    <= beat + 1'b1;
+                            araddr  <= araddr + 30'd32;
+                            arvalid <= 1'b1;
+                            lst     <= L_AR;
+                        end
+                    end
+                end
+                default: lst <= L_IDLE;
+            endcase
+        end
+    end
+
+    // underflow (sticky, video domain): an in-window pixel for line pixel_y is read
+    // before that row was prefetched. The axi side loads rows 0..next_fetch-1, so row
+    // pixel_y is ready iff pixel_y < next_fetch. Resets on vsync.
+    //
+    // next_fetch crosses axi->video GRAY-CODED. As a raw binary bus, a skewed sample
+    // of e.g. 1->2 (01->10) could read 00 and latch a false, sticky underflow. The
+    // restart-to-0 step is not a single-bit change, but the count then settles at 0,
+    // which already flags every in-window row.
+    logic [ROW_W-1:0] nf_s0, nf_s1;   // 2-FF synchroniser (Gray domain)
+    logic [ROW_W-1:0] nf_v;           // decoded binary next_fetch as seen by video_clk
+    always_comb begin
+        // Gray -> binary: bit k is the XOR of Gray bits [top:k].
+        nf_v = nf_s1;
+        for (int i = 1; i < ROW_W; i++) nf_v = nf_v ^ (nf_s1 >> i);
+    end
+    logic underflow_v;
+    always_ff @(posedge video_clk) begin
+        nf_s0 <= nf_gray_e; nf_s1 <= nf_s0;
+        if (!enable || fs_edge_v) underflow_v <= 1'b0;
+        else if (in_window && (pixel_y < N_ROWS) && ((ROW_W)'(pixel_y) >= nf_v))
+            underflow_v <= 1'b1;
+    end
+    assign underflow = underflow_v;
+endmodule
@@ -0,0 +1,179 @@
+// ============================================================================
+// gs_lpddr_wr_arb.sv  (Ch322 Brick 3; Ch323 extended 2:1 -> 3:1; since grown to 4:1)
+//
+// 4:1 AXI4 WRITE-channel arbiter for the FPGA-private LPDDR4B EMIF user port.
+// The write twin of gs_lpddr_rd_arb. Lets the GS framebuffer writer
+// (gs_lpddr_axi_master, port 0, PRIORITY), the Ch323 tile Z-flush writer
+// (gs_z_flush_writer, port 2), the color-spill writer (port 1) and the HPS
+// write-probe (gs_lpddr_wr_probe, port 3) share the single EMIF write channel.
+//
+// EXPLICIT priority (Ch323, Codex): FB-writer > Z-writer > color-spill >
+// wr-probe — i.e. s0 > s2 > s1 > s3. The active render's color (FB) and Z spill
+// outrank the debug write-probe so a debug write can never starve a render
+// flush. Hold an unused port's awvalid at 0 (e.g. s2_* on builds without a Z
+// writer) — that port then never wins a grant and the remaining ports
+// arbitrate exactly as before.
+//
+// Per-transaction grant held AW->W->B (single-beat writes, AWLEN=0, so B
+// completes one transaction). The watchdog force-release covers ONLY the wait
+// before the first AW/W handshake; once a transaction is committed the grant
+// is held until B. All single-clock (emif_clk).
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_lpddr_wr_arb (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // ---- Port 0: GS framebuffer writer (highest priority) ----
+    input  logic [29:0] s0_awaddr,
+    input  logic [1:0]  s0_awburst,
+    input  logic [6:0]  s0_awid,
+    input  logic [7:0]  s0_awlen,
+    input  logic [2:0]  s0_awsize,
+    input  logic        s0_awvalid,
+    output logic        s0_awready,
+    input  logic [255:0] s0_wdata,
+    input  logic [31:0] s0_wstrb,
+    input  logic        s0_wlast,
+    input  logic        s0_wvalid,
+    output logic        s0_wready,
+    output logic [1:0]  s0_bresp,
+    output logic        s0_bvalid,
+    input  logic        s0_bready,
+
+    // ---- Port 1: color-spill writer per the grant table below (third priority) ----
+    // NOTE(review): older revisions attached the HPS write-probe here — confirm
+    // against the instantiation which master actually drives s1_*.
+    input  logic [29:0] s1_awaddr,
+    input  logic [1:0]  s1_awburst,
+    input  logic [6:0]  s1_awid,
+    input  logic [7:0]  s1_awlen,
+    input  logic [2:0]  s1_awsize,
+    input  logic        s1_awvalid,
+    output logic        s1_awready,
+    input  logic [255:0] s1_wdata,
+    input  logic [31:0] s1_wstrb,
+    input  logic        s1_wlast,
+    input  logic        s1_wvalid,
+    output logic        s1_wready,
+    output logic [1:0]  s1_bresp,
+    output logic        s1_bvalid,
+    input  logic        s1_bready,
+
+    // ---- Port 2: tile Z-flush writer (Ch323; second priority, just below the FB writer) ----
+    input  logic [29:0] s2_awaddr,
+    input  logic [1:0]  s2_awburst,
+    input  logic [6:0]  s2_awid,
+    input  logic [7:0]  s2_awlen,
+    input  logic [2:0]  s2_awsize,
+    input  logic        s2_awvalid,
+    output logic        s2_awready,
+    input  logic [255:0] s2_wdata,
+    input  logic [31:0] s2_wstrb,
+    input  logic        s2_wlast,
+    input  logic        s2_wvalid,
+    output logic        s2_wready,
+    output logic [1:0]  s2_bresp,
+    output logic        s2_bvalid,
+    input  logic        s2_bready,
+
+    // ---- Port 3: HPS write-probe (Ch323 diag; LOWEST priority — debug staging) ----
+    input  logic [29:0] s3_awaddr,
+    input  logic [1:0]  s3_awburst,
+    input  logic [6:0]  s3_awid,
+    input  logic [7:0]  s3_awlen,
+    input  logic [2:0]  s3_awsize,
+    input  logic        s3_awvalid,
+    output logic        s3_awready,
+    input  logic [255:0] s3_wdata,
+    input  logic [31:0] s3_wstrb,
+    input  logic        s3_wlast,
+    input  logic        s3_wvalid,
+    output logic        s3_wready,
+    output logic [1:0]  s3_bresp,
+    output logic        s3_bvalid,
+    input  logic        s3_bready,
+
+    // ---- Master out: EMIF write channel ----
+    output logic [29:0] m_awaddr,
+    output logic [1:0]  m_awburst,
+    output logic [6:0]  m_awid,
+    output logic [7:0]  m_awlen,
+    output logic [2:0]  m_awsize,
+    output logic        m_awvalid,
+    input  logic        m_awready,
+    output logic [255:0] m_wdata,
+    output logic [31:0] m_wstrb,
+    output logic        m_wlast,
+    output logic        m_wvalid,
+    input  logic        m_wready,
+    input  logic [1:0]  m_bresp,
+    input  logic        m_bvalid,
+    output logic        m_bready
+);
+    // grant: 0=idle, 1=s0 FB writer, 2=s1 color spill, 3=s2 Z spill, 4=s3 HPS write-probe.
+    // EXPLICIT priority: FB-writer > Z-spill > color-spill > wr-probe — i.e. s0 > s2 > s1 > s3.
+    // A grant is taken on the winning port's awvalid and held for one whole
+    // transaction (AW -> W -> B).
+    reg [2:0]  grant;
+    // Ch326 — NON-ABORTING ARBITER (Codex), same protocol fix as gs_lpddr_rd_arb. Once
+    // m_awvalid && m_awready, the write is COMMITTED (the slave will return B); abandoning it on
+    // a watchdog would orphan the B / leave the slave mid-write. So the watchdog gates ONLY the
+    // pre-AW wait; after AW acceptance the grant is held until m_bvalid && selected_bready.
+    // "committed" = EITHER the AW or a W beat has handshaked. The current writers send AW-then-W
+    // so AW sets it first, but tracking either makes this a GENERAL AXI write arbiter that never
+    // abandons a transaction regardless of AW/W ordering (Codex audit note).
+    reg        aw_done;        // a write beat/addr accepted for the active grant -> never abort past here
+    reg [21:0] watchdog;       // pre-commit only; ~6.7 ms @ 310 MHz dead-bus backstop
+    wire       wd_expired = watchdog[21];
+    wire       sel_bready = (grant==3'd1)?s0_bready:(grant==3'd2)?s1_bready:
+                            (grant==3'd3)?s2_bready:(grant==3'd4)?s3_bready:1'b1;
+    // An AW or W handshake is completing in THIS cycle. aw_done only reflects it one
+    // clock later, so the abort decision must look at this as well: without it, a
+    // handshake landing in the very cycle the watchdog expires would be accepted by
+    // the slave and then abandoned (grant dropped, B orphaned).
+    wire       commit_now = (m_awvalid && m_awready) || (m_wvalid && m_wready);
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            grant <= 3'd0; aw_done <= 1'b0; watchdog <= '0;
+        end else if (grant == 3'd0) begin
+            // Idle: pick the highest-priority requester (fixed priority, no fairness).
+            aw_done <= 1'b0; watchdog <= '0;
+            if      (s0_awvalid) grant <= 3'd1;   // FB writer (highest)
+            else if (s2_awvalid) grant <= 3'd3;   // Z spill (render-flush)
+            else if (s1_awvalid) grant <= 3'd2;   // color spill (render-flush)
+            else if (s3_awvalid) grant <= 3'd4;   // HPS write-probe (debug, lowest)
+        end else begin
+            if (commit_now) aw_done <= 1'b1;                        // AW or W accepted -> COMMITTED
+            if (m_bvalid && sel_bready) begin
+                grant <= 3'd0; aw_done <= 1'b0; watchdog <= '0;     // B delivered -> release
+            end else if (!aw_done && !commit_now) begin             // still waiting, nothing owed
+                if (wd_expired) begin grant <= 3'd0; aw_done <= 1'b0; watchdog <= '0; end
+                else watchdog <= watchdog + 22'd1;
+            end
+            // committed (aw_done or commit_now) && B not yet seen: HOLD the grant, never abort.
+        end
+    end
+
+    // AW mux (payload defaults to s0 when idle; m_awvalid is 0 then, so it is ignored)
+    assign m_awaddr  = (grant==3'd4)?s3_awaddr :(grant==3'd3)?s2_awaddr :(grant==3'd2)?s1_awaddr :s0_awaddr;
+    assign m_awburst = (grant==3'd4)?s3_awburst:(grant==3'd3)?s2_awburst:(grant==3'd2)?s1_awburst:s0_awburst;
+    assign m_awid    = (grant==3'd4)?s3_awid   :(grant==3'd3)?s2_awid   :(grant==3'd2)?s1_awid   :s0_awid;
+    assign m_awlen   = (grant==3'd4)?s3_awlen  :(grant==3'd3)?s2_awlen  :(grant==3'd2)?s1_awlen  :s0_awlen;
+    assign m_awsize  = (grant==3'd4)?s3_awsize :(grant==3'd3)?s2_awsize :(grant==3'd2)?s1_awsize :s0_awsize;
+    assign m_awvalid = (grant==3'd1)?s0_awvalid:(grant==3'd2)?s1_awvalid:(grant==3'd3)?s2_awvalid:(grant==3'd4)?s3_awvalid:1'b0;
+    assign s0_awready = (grant==3'd1)?m_awready:1'b0;
+    assign s1_awready = (grant==3'd2)?m_awready:1'b0;
+    assign s2_awready = (grant==3'd3)?m_awready:1'b0;
+    assign s3_awready = (grant==3'd4)?m_awready:1'b0;
+
+    // W mux
+    assign m_wdata = (grant==3'd4)?s3_wdata:(grant==3'd3)?s2_wdata:(grant==3'd2)?s1_wdata:s0_wdata;
+    assign m_wstrb = (grant==3'd4)?s3_wstrb:(grant==3'd3)?s2_wstrb:(grant==3'd2)?s1_wstrb:s0_wstrb;
+    assign m_wlast = (grant==3'd4)?s3_wlast:(grant==3'd3)?s2_wlast:(grant==3'd2)?s1_wlast:s0_wlast;
+    assign m_wvalid = (grant==3'd1)?s0_wvalid:(grant==3'd2)?s1_wvalid:(grant==3'd3)?s2_wvalid:(grant==3'd4)?s3_wvalid:1'b0;
+    assign s0_wready = (grant==3'd1)?m_wready:1'b0;
+    assign s1_wready = (grant==3'd2)?m_wready:1'b0;
+    assign s2_wready = (grant==3'd3)?m_wready:1'b0;
+    assign s3_wready = (grant==3'd4)?m_wready:1'b0;
+
+    // B demux (idle: bready=1 drains any stale/late response)
+    assign s0_bresp = m_bresp; assign s1_bresp = m_bresp; assign s2_bresp = m_bresp; assign s3_bresp = m_bresp;
+    assign s0_bvalid = (grant==3'd1)?m_bvalid:1'b0;
+    assign s1_bvalid = (grant==3'd2)?m_bvalid:1'b0;
+    assign s2_bvalid = (grant==3'd3)?m_bvalid:1'b0;
+    assign s3_bvalid = (grant==3'd4)?m_bvalid:1'b0;
+    assign m_bready  = (grant==3'd1)?s0_bready:(grant==3'd2)?s1_bready:(grant==3'd3)?s2_bready:(grant==3'd4)?s3_bready:1'b1;
+endmodule
@@ -0,0 +1,136 @@
+// ============================================================================
+// gs_lpddr_wr_probe.sv  (Ch322 Brick 3)
+//
+// HPS-bridge-driven LPDDR4B WRITE probe — the missing PS2-side LPDDR loader,
+// cloned from retroDE_ao486/rtl/ao486/lpddr4b_loader.sv (write half). The PS2
+// core already has the READ half (gs_lpddr_rd_probe); this is its symmetric
+// twin so the HPS can STAGE arbitrary words into FPGA-private LPDDR4B from Linux
+// (e.g. a known texture for Ch322), then read them back / hash via the existing
+// read-probe before the texture cache fills from them.
+//
+// This is HPS -> bridge registers -> FPGA EMIF write. NOT HPS direct memory
+// access, and NOT the retired f2sdram path. The EMIF write channel is shared
+// with the GS framebuffer writer through gs_lpddr_wr_arb (FB writer = priority,
+// this probe writes only when the writer is idle).
+//
+// Runs on emif_clk. The bridge pulse/addr/data come from CLOCK2_50 and are
+// toggle-synchronized internally (same CDC as lpddr4b_loader / gs_lpddr_rd_probe).
+//
+// Each wr_pulse triggers ONE single-beat 32-bit write: the 32-bit lane within
+// the 256-bit EMIF word is selected by addr[4:2] with the matching WSTRB nibble.
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_lpddr_wr_probe (
+    input  logic         emif_clk,
+    input  logic         emif_rst_n,
+
+    // ---- control (from HPS bridge, CLOCK2_50 domain) ----
+    input  logic         wr_pulse,      // toggles when the HPS writes a data word
+    input  logic [29:0]  wr_addr,       // EMIF byte address (stable when pulse flips)
+    input  logic [31:0]  wr_data,       // data word (stable when pulse flips)
+    input  logic         full_beat,     // Ch323 diag: write ALL 8 lanes (wstrb=0xFFFFFFFF) — tests
+                                        //   full-width commit through THIS arbiter/profile path
+
+    // ---- status (emif_clk domain; bridge syncs) ----
+    output logic         busy,
+    output logic         done_toggle,   // toggles on each completed write
+    output logic [31:0]  bresp_errs,    // count of non-OKAY write responses
+
+    // ---- AXI4 write channel to the EMIF user port (emif_clk, 256-bit) ----
+    output logic [29:0]  awaddr,
+    output logic [1:0]   awburst,
+    output logic [6:0]   awid,
+    output logic [7:0]   awlen,
+    output logic [2:0]   awsize,
+    output logic         awvalid,
+    input  logic         awready,
+    output logic [255:0] wdata,
+    output logic [31:0]  wstrb,
+    output logic         wlast,
+    output logic         wvalid,
+    input  logic         wready,
+    input  logic [1:0]   bresp,
+    input  logic         bvalid,
+    output logic         bready
+);
+    // Fixed AXI attributes: every probe write is one full-width beat.
+    assign awburst = 2'b01;   // INCR
+    assign awid    = 7'd5;    // distinct id: fb-writer/probe ids elsewhere; 5 = wr-probe
+    assign awlen   = 8'd0;    // single beat
+    assign awsize  = 3'b101;  // 32 bytes (full 256-bit bus)
+    assign bready  = 1'b1;    // always ready to take the response
+
+    // Toggle synchroniser CLOCK2_50 -> emif_clk (same as lpddr4b_loader): a request
+    // is seen for one cycle when the two oldest stages disagree.
+    reg  [2:0]  pulse_sr;
+    wire        pulse_seen = pulse_sr[2] ^ pulse_sr[1];
+
+    // Address/data captured when the request is accepted.
+    reg  [29:0] addr_q;
+    reg  [31:0] data_q;
+
+    // Narrow-write placement: addr[4:2] picks which of the eight 32-bit lanes
+    // carries the word; only that lane's four byte strobes are set.
+    wire [2:0]   lane      = addr_q[4:2];
+    wire [255:0] lane_data = {224'd0, data_q} << {lane, 5'd0};     // lane * 32 bits
+    wire [31:0]  lane_strb = 32'h0000_000F    << {lane, 2'd0};     // lane * 4 strobes
+
+    // One write = WAIT (for a request) -> ADDR (AW) -> DATA (W) -> RESP (B).
+    typedef enum logic [1:0] { ST_WAIT, ST_ADDR, ST_DATA, ST_RESP } phase_t;
+    phase_t phase;
+
+    always_ff @(posedge emif_clk or negedge emif_rst_n) begin
+        if (!emif_rst_n) begin
+            pulse_sr    <= 3'd0;
+            addr_q      <= 30'd0;
+            data_q      <= 32'd0;
+            phase       <= ST_WAIT;
+            awaddr      <= 30'd0;
+            awvalid     <= 1'b0;
+            wdata       <= 256'd0;
+            wstrb       <= 32'd0;
+            wlast       <= 1'b0;
+            wvalid      <= 1'b0;
+            busy        <= 1'b0;
+            done_toggle <= 1'b0;
+            bresp_errs  <= 32'd0;
+        end else begin
+            pulse_sr <= {pulse_sr[1:0], wr_pulse};
+
+            case (phase)
+                // Idle: busy is low unless a request arrives this cycle. A request
+                // that arrives in any other phase is not captured.
+                ST_WAIT: begin
+                    busy <= pulse_seen;
+                    if (pulse_seen) begin
+                        addr_q  <= wr_addr;
+                        data_q  <= wr_data;
+                        awaddr  <= {wr_addr[29:5], 5'd0};   // 32-byte aligned beat
+                        awvalid <= 1'b1;
+                        phase   <= ST_ADDR;
+                    end
+                end
+
+                // Address accepted: build the data beat. full_beat is sampled here
+                // (not latched with the request), so it must be stable across a write.
+                ST_ADDR: begin
+                    if (awready) begin
+                        awvalid <= 1'b0;
+                        // diag mode replicates the word across all 8 lanes with full WSTRB;
+                        // otherwise only the addressed lane is driven, the rest is zero.
+                        wdata   <= full_beat ? {8{data_q}}   : lane_data;
+                        wstrb   <= full_beat ? 32'hFFFF_FFFF : lane_strb;
+                        wlast   <= 1'b1;
+                        wvalid  <= 1'b1;
+                        phase   <= ST_DATA;
+                    end
+                end
+
+                // Data beat accepted: drop W and wait for the response.
+                ST_DATA: begin
+                    if (wready) begin
+                        wvalid <= 1'b0;
+                        wlast  <= 1'b0;
+                        phase  <= ST_RESP;
+                    end
+                end
+
+                // Response: count anything other than OKAY, then report completion.
+                ST_RESP: begin
+                    if (bvalid) begin
+                        if (bresp != 2'b00) bresp_errs <= bresp_errs + 32'd1;
+                        busy        <= 1'b0;
+                        done_toggle <= ~done_toggle;
+                        phase       <= ST_WAIT;
+                    end
+                end
+
+                default: phase <= ST_WAIT;
+            endcase
+        end
+    end
+endmodule
@@ -0,0 +1,847 @@
+// retroDE_ps2 — gs_pcrtc_stub (Ch90)
+//
+// Minimal PCRTC (Programmable CRT Controller) scanout engine.
+// Real PS2 PCRTC reads VRAM via a DISPFB (display framebuffer)
+// configuration register and feeds the analog video DAC. This
+// stub is the SCANOUT side of the GS pipeline — its dual is
+// gs_stub, which is the WRITE side. Together they close the loop
+// from `raster_pixel_emit` (Ch88) → vram_stub (Ch89) → visible
+// pixels (Ch90).
+//
+// Architectural note. `platform_video_stub` is a flood-fill video
+// adapter that always paints BGCOLOR within its active area —
+// it predates VRAM persistence and stays as-is for back-compat.
+// `gs_pcrtc_stub` is the SCANOUT-AWARE alternative, used by TBs
+// that want to verify the round trip "gs_stub writes a pixel →
+// vram_stub stores it → pcrtc reads it back as video." We did
+// not extend platform_video_stub (which would have rippled
+// through 6 existing TBs); pcrtc is a parallel module that owns
+// its own raster timing AND vram read addressing, so a TB picks
+// the one that fits.
+//
+// Scope:
+//   - Single DISPFB context: pcrtc consumes `pmode_q` and
+//     `dispfb1_q` directly from gs_stub's privileged CPU MMIO
+//     latches (Ch91). The Ch90 sideband ports
+//     (scanout_enable / dispfb_fbp / dispfb_fbw) are gone — TBs
+//     drive scanout configuration the way a real driver would,
+//     by writing PMODE and DISPFB1 through the gs_stub.reg_wr_*
+//     port. This means `wait (raster_done); write PMODE.EN1=1`
+//     is the canonical sequence, not a sideband poke.
+//   - Addressing: linear by DEFAULT — fb_addr math mirrors
+//     gs_stub's pixel fb_addr math byte-exactly so a pixel
+//     written at (x,y) reads back at (x,y) without swizzle
+//     reconciliation. Four OPTIONAL per-PSM swizzle paths gated
+//     by parameters: `PSMCT32_SWIZZLE=1` (Ch120) routes PSMCT32
+//     reads through gs_swizzle_psmct32_stub; `PSMCT16_SWIZZLE=1`
+//     (Ch126) routes PSMCT16 reads through gs_swizzle_psmct16_stub;
+//     `PSMT8_SWIZZLE=1` (Ch132) routes PSMT8 reads through
+//     gs_swizzle_psmt8_stub (page=128×64 px, bw_pg=FBW>>1 — FBW
+//     must be even for PSMT8); `PSMT4_SWIZZLE=1` (Ch138) routes
+//     PSMT4 reads through gs_swizzle_psmt4_stub (page=128×128 px,
+//     bw_pg=FBW>>1 — FBW must be even for PSMT4; module also
+//     outputs nibble_hi selector since PSMT4 packs 2 pixels/byte).
+//     The four parameters are independent. All four defaults are
+//     0 → existing TBs see legacy linear behavior.
+//   - PSMCT32 (PSM=0), PSMCT16 (PSM=2), PSMT8 (PSM=0x13), and
+//     PSMT4 (PSM=0x14) are honored at this scope. Any other
+//     PSM forces scanout off rather than mis-decoding the byte
+//     layout. PSMCT16 reads 2 bytes/pixel and unpacks RGB5A1 →
+//     RGB888 via bit-replicate. PSMT8 reads 1 byte/pixel and
+//     PSMT4 reads 4 bits/pixel (2 pixels/byte, low nibble =
+//     even pixel). For PSMT8 / PSMT4, with `clut_enable=1` the
+//     index is looked up in clut_stub for real RGB; with
+//     `clut_enable=0`, the index/nibble surfaces as grayscale.
+//     gs_stub's raster channel emits PSMCT32 + PSMCT16 (Ch95) +
+//     PSMT8 (Ch105) + PSMT4 (Ch106). CLUT contents come from a
+//     TB-direct write OR from a VRAM→CLUT load triggered by
+//     TEX0_1.CLD via clut_loader_stub (Ch99..Ch102).
+//   - Single CRTC: one display, one DISPFB context. Real PS2 has
+//     two (DISPFB1/DISPLAY1 and DISPFB2/DISPLAY2) for interlace/
+//     merge. The PMODE.EN2 + DISPFB2/DISPLAY2 path is deferred.
+//   - DISPLAY1 DX/DY/DW/DH ARE honored (Ch92): they define the
+//     display window inside the active area. Outside the window,
+//     pcrtc emits 0 for r/g/b even with scanout_enable=1.
+//     MAGH/MAGV ARE honored (Ch93): each VRAM column shows for
+//     (MAGH+1) consecutive VCK pulses before advancing, and each
+//     VRAM line shows for (MAGV+1) raster lines. Practically,
+//     a 4-pixel-wide VRAM sprite with MAGH=1 (2×) appears 8
+//     pixels wide on screen. The H/V totals still come from
+//     module parameters at instantiation. Real PS2 driver-
+//     equivalent bring-up is now "configure DISPFB1 → configure
+//     DISPLAY1 → render → set PMODE.EN1=1." Note: DISPLAY1=0
+//     (post-reset default) means a 1×1 window at (0,0); a TB
+//     MUST configure DISPLAY1 for anything visible to scan out.
+//   - When scanout_enable
+//     (= PMODE.EN1 & (PSMCT32 || PSMCT16 || PSMT8 || PSMT4))
+//     is 0, r/g/b output is forced to 0 across the active area.
+//     There's no BGCOLOR fallback in this module — that lives in
+//     platform_video_stub.
+//
+// Trace payload: one EV_MODE pulse per completed frame, mirroring
+// platform_video_stub's schema (arg0=frame_count, arg1=H*V).
+//   PLAT MODE   arg0=frame_number arg1=pixels_per_frame arg2=- arg3=-
+
+`timescale 1ns/1ps
+
+module gs_pcrtc_stub
+    import trace_pkg::*;
+#(
+    // Horizontal timing (in pixel clocks). Defaults match
+    // platform_video_stub's tiny-TB convention.
+    parameter int H_ACTIVE = 16,
+    parameter int H_FRONT  = 2,
+    parameter int H_SYNC   = 4,
+    parameter int H_BACK   = 2,
+
+    // Vertical timing (in lines)
+    parameter int V_ACTIVE = 8,
+    parameter int V_FRONT  = 1,
+    parameter int V_SYNC   = 1,
+    parameter int V_BACK   = 1,
+
+    parameter bit HSYNC_ACTIVE_LOW = 1'b1,
+    parameter bit VSYNC_ACTIVE_LOW = 1'b1,
+
+    // Ch120 — when set, PSMCT32 scanout reads VRAM via the real PS2
+    // GS page/block swizzle (gs_swizzle_psmct32_stub) instead of the
+    // legacy linear `(FBW*64*y + x)*4` formula. PSMCT16 / PSMT8 / PSMT4
+    // are governed by their own gates (PSMCT16_SWIZZLE Ch126,
+    // PSMT8_SWIZZLE Ch132, PSMT4_SWIZZLE Ch138 — see below).
+    // Default 0 keeps every existing PSMCT32 scanout TB on the
+    // original linear addressing.
+    parameter bit PSMCT32_SWIZZLE = 1'b0,
+
+    // Ch126 — when set, PSMCT16 scanout reads VRAM via the real PS2
+    // GS page/block/column swizzle (gs_swizzle_psmct16_stub) instead
+    // of the legacy linear `(FBW*64*y + x)*2` formula. PSMCT32 / PSMT8
+    // / PSMT4 are governed by their own gates (PSMCT32_SWIZZLE /
+    // PSMT8_SWIZZLE / PSMT4_SWIZZLE). Default 0 keeps every existing
+    // PSMCT16 scanout TB (Ch94 PSM-aware, Ch95 raster, Ch103 PSMT4-
+    // via-CT16-CLUT, etc.) on the original linear addressing.
+    parameter bit PSMCT16_SWIZZLE = 1'b0,
+
+    // Ch132 — when set, PSMT8 scanout reads VRAM via the real PS2 GS
+    // page/block/column swizzle (gs_swizzle_psmt8_stub) instead of
+    // the legacy linear `FBW*64*y + x` formula. PSMT8 pages are 128
+    // px wide (vs 64 px for CT32/CT16) so the swizzle internally uses
+    // bw_pg = FBW>>1 — PCSX2 asserts FBW must be even for PSMT8.
+    // Default 0 keeps every existing PSMT8 scanout TB (Ch96, Ch97,
+    // Ch103 PSMT4-via-CT16-CLUT, Ch107 PSMT4-e2e palette path, etc.)
+    // on the original linear addressing. PSMCT32 / PSMCT16 / PSMT4
+    // are governed by their own gates or stay linear.
+    parameter bit PSMT8_SWIZZLE = 1'b0,
+
+    // Ch138 — when set, PSMT4 scanout reads VRAM via the real PS2 GS
+    // page/block/column swizzle (gs_swizzle_psmt4_stub) instead of
+    // the legacy linear `byte_offset = pixel_index >> 1` formula.
+    // PSMT4 pixels are 4 bits each (2 pixels per byte); the swizzle
+    // module outputs both an absolute byte address AND a `nibble_hi`
+    // selector that picks the high or low nibble of the byte at
+    // that address. PSMT4 pages are 128 px wide (same as PSMT8) so
+    // the swizzle internally uses bw_pg = FBW>>1 — PCSX2 asserts
+    // FBW must be even for PSMT4. The grayscale + CLUT lookup paths
+    // BOTH use the same swizzle output: the byte at `addr` is read
+    // from VRAM, and `nibble_hi` (instead of pixel_index[0]) picks
+    // which nibble. Default 0 keeps every existing PSMT4 scanout TB
+    // (Ch103 PSMT4+CLUT, Ch104 PSMT4 round-trip, Ch107 PSMT4 e2e,
+    // etc.) on the original linear addressing. PSMCT32 / PSMCT16 /
+    // PSMT8 are governed by their own gates.
+    parameter bit PSMT4_SWIZZLE = 1'b0,
+
+    // Ch158 — when set, the data-decode + sync-output pipeline is
+    // delayed by 1 cycle so it aligns with a sync-read VRAM (e.g.
+    // `vram_bram_stub`, Ch154) whose `read_data` is registered.
+    // The address-driving stage (`vram_read_addr`) keeps using the
+    // current `(hcnt, vcnt)` so the read is issued one pixel
+    // "ahead"; the registered `vram_read_data` returns a cycle
+    // later, and the decode comb consumes the matching delayed
+    // counter view via the `*_dec` signals.
+    //
+    // Default 0 preserves the legacy combinational-read behavior
+    // every existing PCRTC TB (Ch90+ scanout TBs) is written
+    // against — those TBs drive `vram_read_data` via legacy
+    // `vram_stub` (comb read) and consume r/g/b on the same
+    // cycle as the addr drive. Set to 1 in the BRAM wrapper /
+    // board top once `vram_bram_stub` is the storage.
+    parameter bit VRAM_SYNC_READ = 1'b0,
+
+    // Ch163 — bypass the magnification dividers
+    // `vram_x_unshift = hwin_rel / hmag_factor` and the matching y
+    // form when the demo locks `MAGH = MAGV = 0`. Quartus infers a
+    // 32-bit hardware divider from the `/` operators above (the
+    // Ch162 STA worst path after STRIP_HW_DIVIDER closed the EE-
+    // core divider). For demos that never write MAGH/MAGV non-zero
+    // — which includes the PSMCT32 raster demo and every other
+    // hardware-target wrapper today — the divisors are constant 1
+    // and the math collapses to a passthrough.
+    //
+    // Default 0 keeps the existing divider math live so every
+    // Ch93-era scanout MAG TB stays green (the TBs that drive
+    // MAGH != 0 / MAGV != 0 such as `tb_gs_scanout_magh_magv`
+    // continue to use the default).
+    //
+    // When 1, `vram_x_unshift = hwin_rel` / `vram_y_unshift =
+    // vwin_rel` — equivalent to the MAGH=MAGV=0 case but without
+    // the divider. The hardware-demo path forwards this parameter
+    // through `top_psmct32_raster_demo_bram` and the DE25-Nano
+    // board top sets it to 1'b1.
+    parameter bit STRIP_PCRTC_MAG_DIV = 1'b0
+) (
+    input  logic         clk,
+    input  logic         rst_n,
+
+    // Ch91/Ch92/Ch93/Ch94/Ch96/Ch103 — PMODE + DISPFB1 + DISPLAY1
+    // latches from gs_stub's privileged CPU MMIO port.
+    // EN1 (PMODE bit 0) gates scanout. DISPFB1 carries the
+    // framebuffer base / width / PSM the PCRTC reads from
+    // (PSMCT32, PSMCT16, PSMT8, and PSMT4 honored at this scope;
+    // any other PSM forces scanout off). DISPLAY1 carries the
+    // display window: DX/DY = origin within the active area;
+    // DW/DH = width/height MINUS one (real PS2 semantics).
+    // MAGH/MAGV (Ch93) scale the window-relative coordinate so
+    // each VRAM column/line repeats for (MAGH+1)/(MAGV+1)
+    // displayed pulses/lines; pcrtc still takes H/V TOTALS from
+    // module parameters at instantiation, not from registers.
+    input  logic [63:0]  pmode_q,
+    input  logic [63:0]  dispfb1_q,
+    input  logic [63:0]  display1_q,
+
+    // VRAM read port: comb read (vram_stub), or 1-cycle registered read when VRAM_SYNC_READ=1.
+    output logic [31:0]  vram_read_addr,
+    input  logic [31:0]  vram_read_data,
+
+    // Ch97 — CLUT (palette) read port for indexed-color scanout.
+    // When `clut_enable` is high AND the active PSM is PSMT8,
+    // pcrtc presents `clut_read_idx = vram_read_data[7:0] +
+    // (clut_csa << 4)` and decodes the returned PSMCT32 RGB
+    // entry instead of the grayscale fallback. CSM is implicitly
+    // CSM2 (linear). CSA shifts the lookup window in 16-entry
+    // increments and wraps mod 256. When `clut_enable` is low,
+    // the CLUT is bypassed and PSMT8 still scans out as
+    // grayscale (Ch96 default).
+    input  logic         clut_enable,
+    input  logic [4:0]   clut_csa,
+    output logic [7:0]   clut_read_idx,
+    input  logic [31:0]  clut_read_data,
+
+    // Video out
+    output logic         hsync,
+    output logic         vsync,
+    output logic         de,
+    output logic [7:0]   r,
+    output logic [7:0]   g,
+    output logic [7:0]   b,
+    // Ch320 — high exactly when this scanout pixel is inside the displayed frame
+    // (scanout enabled AND within the DX/DY/DW/DH display window). Aligned to r/g/b.
+    // An LPDDR4B scanout reader gates its pixels by this so it shows ONE frame, not
+    // a tiled fill of the whole active line.
+    output logic         pix_window_o,
+
+    // Trace
+    output logic         ev_valid,
+    output subsys_e      ev_subsys,
+    output event_e       ev_event,
+    output logic [63:0]  ev_arg0,
+    output logic [63:0]  ev_arg1,
+    output logic [63:0]  ev_arg2,
+    output logic [63:0]  ev_arg3,
+    output logic [31:0]  ev_flags
+);
+
+    localparam int H_TOTAL = H_ACTIVE + H_FRONT + H_SYNC + H_BACK;
+    localparam int V_TOTAL = V_ACTIVE + V_FRONT + V_SYNC + V_BACK;
+
+    localparam int H_SYNC_START = H_ACTIVE + H_FRONT;
+    localparam int H_SYNC_END   = H_SYNC_START + H_SYNC;
+    localparam int V_SYNC_START = V_ACTIVE + V_FRONT;
+    localparam int V_SYNC_END   = V_SYNC_START + V_SYNC;
+
+    localparam int HCNT_W = $clog2(H_TOTAL);
+    localparam int VCNT_W = $clog2(V_TOTAL);
+
+    logic [HCNT_W-1:0] hcnt;
+    logic [VCNT_W-1:0] vcnt;
+
+    logic end_of_line;
+    logic end_of_frame;
+    assign end_of_line  = (hcnt == HCNT_W'(H_TOTAL - 1));
+    assign end_of_frame = end_of_line && (vcnt == VCNT_W'(V_TOTAL - 1));
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            hcnt <= '0;
+            vcnt <= '0;
+        end else if (end_of_line) begin
+            hcnt <= '0;
+            vcnt <= end_of_frame ? '0 : (vcnt + VCNT_W'(1));
+        end else begin
+            hcnt <= hcnt + HCNT_W'(1);
+        end
+    end
+
+    logic active_h;
+    logic active_v;
+    logic in_hsync;
+    logic in_vsync;
+
+    assign active_h = (hcnt < HCNT_W'(H_ACTIVE));
+    assign active_v = (vcnt < VCNT_W'(V_ACTIVE));
+    assign in_hsync = (hcnt >= HCNT_W'(H_SYNC_START)) && (hcnt < HCNT_W'(H_SYNC_END));
+    assign in_vsync = (vcnt >= VCNT_W'(V_SYNC_START)) && (vcnt < VCNT_W'(V_SYNC_END));
+
+    // ------------------------------------------------------------------
+    // Ch158 — decode-stage pipeline. When `VRAM_SYNC_READ=1`, every
+    // hcnt/vcnt-derived signal that the data-decode stage consumes
+    // is delayed by 1 cycle so it lines up with `vram_bram_stub`'s
+    // 1-cycle-late `vram_read_data`. The address-side
+    // (`vram_read_addr`) keeps using the current `hcnt`/`vcnt` so the
+    // read is issued one pixel "ahead".
+    //
+    // The registers below always exist (zero-cost in sim, optimized
+    // away when unreached in synthesis); the `*_dec` muxes select
+    // between the registered view (sync) and the live signal
+    // (legacy comb-read passthrough).
+    // ------------------------------------------------------------------
+    logic in_hsync_q, in_vsync_q;
+    logic active_h_q, active_v_q;
+    logic in_display_window_q, scanout_enable_q;
+    logic dispfb_psm_ct32_q, dispfb_psm_ct16_q, dispfb_psm_t8_q, dispfb_psm_t4_q;
+    logic psm4_nibble_select_q;
+    logic end_of_frame_q;
+
+    logic in_hsync_dec, in_vsync_dec;
+    logic active_h_dec, active_v_dec;
+    logic in_display_window_dec, scanout_enable_dec;
+    logic dispfb_psm_ct32_dec, dispfb_psm_ct16_dec, dispfb_psm_t8_dec, dispfb_psm_t4_dec;
+    logic psm4_nibble_select_dec;
+    logic end_of_frame_dec;
+
+    // psm4_nibble_select / dispfb_psm_* / scanout_enable /
+    // in_display_window are forward-referenced — they are declared
+    // and assigned later in the file (after the address/decode
+    // logic that produces them). SystemVerilog allows module-level
+    // forward references inside always_ff/always_comb blocks; the
+    // registers below capture them at every posedge.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            in_hsync_q           <= 1'b0;
+            in_vsync_q           <= 1'b0;
+            active_h_q           <= 1'b0;
+            active_v_q           <= 1'b0;
+            in_display_window_q  <= 1'b0;
+            scanout_enable_q     <= 1'b0;
+            dispfb_psm_ct32_q    <= 1'b0;
+            dispfb_psm_ct16_q    <= 1'b0;
+            dispfb_psm_t8_q      <= 1'b0;
+            dispfb_psm_t4_q      <= 1'b0;
+            psm4_nibble_select_q <= 1'b0;
+            end_of_frame_q       <= 1'b0;
+        end else begin
+            in_hsync_q           <= in_hsync;
+            in_vsync_q           <= in_vsync;
+            active_h_q           <= active_h;
+            active_v_q           <= active_v;
+            in_display_window_q  <= in_display_window;
+            scanout_enable_q     <= scanout_enable;
+            dispfb_psm_ct32_q    <= dispfb_psm_ct32;
+            dispfb_psm_ct16_q    <= dispfb_psm_ct16;
+            dispfb_psm_t8_q      <= dispfb_psm_t8;
+            dispfb_psm_t4_q      <= dispfb_psm_t4;
+            psm4_nibble_select_q <= psm4_nibble_select;
+            end_of_frame_q       <= end_of_frame;
+        end
+    end
+
+    assign in_hsync_dec           = VRAM_SYNC_READ ? in_hsync_q           : in_hsync;
+    assign in_vsync_dec           = VRAM_SYNC_READ ? in_vsync_q           : in_vsync;
+    assign active_h_dec           = VRAM_SYNC_READ ? active_h_q           : active_h;
+    assign active_v_dec           = VRAM_SYNC_READ ? active_v_q           : active_v;
+    assign in_display_window_dec  = VRAM_SYNC_READ ? in_display_window_q  : in_display_window;
+    assign scanout_enable_dec     = VRAM_SYNC_READ ? scanout_enable_q     : scanout_enable;
+    // Ch320 — same gate the r/g/b output uses (`de && scanout_enable_dec &&
+    // in_display_window_dec`), minus de (the HDMI path applies de). Lets an external
+    // LPDDR4B scanout reader blank outside the displayed frame, matching BRAM scanout.
+    assign pix_window_o = scanout_enable_dec && in_display_window_dec;
+    assign dispfb_psm_ct32_dec    = VRAM_SYNC_READ ? dispfb_psm_ct32_q    : dispfb_psm_ct32;
+    assign dispfb_psm_ct16_dec    = VRAM_SYNC_READ ? dispfb_psm_ct16_q    : dispfb_psm_ct16;
+    assign dispfb_psm_t8_dec      = VRAM_SYNC_READ ? dispfb_psm_t8_q      : dispfb_psm_t8;
+    assign dispfb_psm_t4_dec      = VRAM_SYNC_READ ? dispfb_psm_t4_q      : dispfb_psm_t4;
+    assign psm4_nibble_select_dec = VRAM_SYNC_READ ? psm4_nibble_select_q : psm4_nibble_select;
+    assign end_of_frame_dec       = VRAM_SYNC_READ ? end_of_frame_q       : end_of_frame;
+
+    assign hsync = HSYNC_ACTIVE_LOW ? ~in_hsync_dec : in_hsync_dec;
+    assign vsync = VSYNC_ACTIVE_LOW ? ~in_vsync_dec : in_vsync_dec;
+    assign de    = active_h_dec && active_v_dec;
+
+    // ------------------------------------------------------------------
+    // VRAM addressing. Mirror gs_stub's fb_addr math byte-exactly
+    // so written-then-scanned pixels round-trip without
+    // reconciliation:
+    //   fbp_bytes      = dispfb_fbp << 11               (FBP * 2048)
+    //   pixels_per_row = dispfb_fbw << 6                (FBW * 64)
+    //   effective_x    = (hcnt - DX) / (MAGH+1) + DBX   (Ch92/Ch93)
+    //   effective_y    = (vcnt - DY) / (MAGV+1) + DBY
+    //   pixel_index    = effective_y * pixels_per_row + effective_x
+    //   byte_offset    = pixel_index << dispfb_bpp_shift  (PSMT4: pixel_index >> 1)
+    //   fb_addr        = fbp_bytes + byte_offset
+    // dispfb_bpp_shift is PSM-aware (Ch94/Ch96): 2 for PSMCT32,
+    // 1 for PSMCT16, 0 for PSMT8; PSMT4 halves pixel_index instead.
+    // Other PSMs force scanout off rather than mis-decoding bytes.
+    // ------------------------------------------------------------------
+
+    // Decode DISPFB1 sub-fields per real PS2 GS register layout
+    // (PCSX2 GSRegs.h — DISPFB structure):
+    //   FBP : [8:0]    base address in 2048-byte units
+    //   FBW : [14:9]   width in 64-pixel units
+    //   PSM : [19:15]  pixel storage mode (PSMCT32/PSMCT16/PSMT8/PSMT4 honored)
+    //   DBX : [42:32]  display-buffer X origin (Ch91-audit fix)
+    //   DBY : [53:43]  display-buffer Y origin (Ch91-audit fix)
+    //
+    // DBX/DBY shift the scanout's VRAM origin: the pixel that
+    // appears at the display-window origin (hcnt=DX, vcnt=DY) is
+    // VRAM (DBX, DBY), not (0, 0). Useful for double-buffered
+    // framebuffers and offset display windows.
+    logic [8:0]  dispfb_fbp;
+    logic [5:0]  dispfb_fbw;
+    logic [4:0]  dispfb_psm;
+    logic [10:0] dispfb_dbx;
+    logic [10:0] dispfb_dby;
+    logic        dispfb_psm_ok;
+    logic        pmode_en1;
+    logic        scanout_enable;
+
+    assign dispfb_fbp    = dispfb1_q[8:0];
+    assign dispfb_fbw    = dispfb1_q[14:9];
+    assign dispfb_psm    = dispfb1_q[19:15];
+    assign dispfb_dbx    = dispfb1_q[42:32];
+    assign dispfb_dby    = dispfb1_q[53:43];
+
+    // Ch94/Ch96/Ch97/Ch103 — scanout PSM awareness. Four formats:
+    //   PSMCT32 (5'h00) — 4 bytes/pixel, byte order {A,B,G,R}.
+    //   PSMCT16 (5'h02) — 2 bytes/pixel, RGB5A1 packed:
+    //                     R[4:0] G[9:5] B[14:10] A[15].
+    //   PSMT8   (5'h13) — 1 byte/pixel, 8-bit index.
+    //   PSMT4   (5'h14) — 4 bits/pixel = 2 pixels/byte. Byte
+    //                     offset = pixel_index >> 1; nibble
+    //                     selector = pixel_index[0] (low =
+    //                     even, high = odd). The 4-bit nibble
+    //                     zero-extends to an 8-bit CLUT index;
+    //                     CSA picks the 16-entry palette window.
+    // For PSMT8/PSMT4, with `clut_enable=1` pcrtc looks up
+    // CLUT[idx + (CSA << 4)] in the external clut_stub for real
+    // RGB. With `clut_enable=0`, the index/nibble surfaces as
+    // grayscale (8-bit replication for PSMT8, 4→8 bit-replicate
+    // for PSMT4) so the storage lane stays visually verifiable
+    // without programming a palette.
+    // 5→8 expansion (PSMCT16) uses bit-replicate ({r5, r5[4:2]}),
+    // matching PCSX2. Other PSMs still disable scanout rather
+    // than mis-decode bytes; PSMCT24/PSMCT16S/PSMZ32/etc. force
+    // scanout off here.
+    logic        dispfb_psm_ct32;
+    logic        dispfb_psm_ct16;
+    logic        dispfb_psm_t8;
+    logic        dispfb_psm_t4;
+    logic [1:0]  dispfb_bpp_shift;
+
+    assign dispfb_psm_ct32  = (dispfb_psm == 5'h00);
+    assign dispfb_psm_ct16  = (dispfb_psm == 5'h02);
+    assign dispfb_psm_t8    = (dispfb_psm == 5'h13);
+    assign dispfb_psm_t4    = (dispfb_psm == 5'h14);
+    assign dispfb_psm_ok    = dispfb_psm_ct32 | dispfb_psm_ct16
+                            | dispfb_psm_t8   | dispfb_psm_t4;
+    assign dispfb_bpp_shift = dispfb_psm_ct32 ? 2'd2 :   // 4 bytes/pixel
+                              dispfb_psm_ct16 ? 2'd1 :   // 2 bytes/pixel
+                              dispfb_psm_t8   ? 2'd0 :   // 1 byte/pixel
+                                                2'd2;    // PSMT4 uses byte_offset right-shift, not bpp_shift
+    assign pmode_en1     = pmode_q[0];
+    assign scanout_enable = pmode_en1 & dispfb_psm_ok;
+
+    // Ch92/Ch93 — DISPLAY1 sub-fields per real PS2 GS register
+    // layout (PCSX2 GSRegs.h — DISPLAY structure):
+    //   DX   : [11:0]   display window X start (in VCK pulses)
+    //   DY   : [22:12]  display window Y start (in raster lines)
+    //   MAGH : [26:23]  horizontal magnification - 1 (Ch93)
+    //   MAGV : [28:27]  vertical magnification - 1   (Ch93)
+    //   DW   : [43:32]  display width - 1            (in VCK pulses)
+    //   DH   : [54:44]  display height - 1           (in raster lines)
+    //
+    // The display window is the sub-rect (DX..DX+DW, DY..DY+DH)
+    // inside the active area. Outside the window, r/g/b is 0
+    // even when scanout_enable is 1. Inside, the VRAM index is
+    // measured RELATIVE to the window origin, scaled DOWN by the
+    // magnification factors (MAGH+1 / MAGV+1), then shifted by
+    // DBX/DBY. This means the pixel at displayed (DX, DY)
+    // corresponds to VRAM (DBX, DBY); successive displayed
+    // pixels along H map to the SAME VRAM column for (MAGH+1)
+    // VCK pulses before advancing.
+    logic [11:0] display_dx;
+    logic [10:0] display_dy;
+    logic [3:0]  display_magh;
+    logic [1:0]  display_magv;
+    logic [11:0] display_dw;
+    logic [10:0] display_dh;
+
+    assign display_dx   = display1_q[11:0];
+    assign display_dy   = display1_q[22:12];
+    assign display_magh = display1_q[26:23];
+    assign display_magv = display1_q[28:27];
+    assign display_dw   = display1_q[43:32];
+    assign display_dh   = display1_q[54:44];
+
+    // Window inside-test: (hcnt - DX) in [0, DW] AND (vcnt - DY)
+    // in [0, DH]. We do the lower-bound check by comparing >=
+    // and the upper-bound by computing the relative coord.
+    logic [11:0] hwin_rel;
+    logic [11:0] vwin_rel;
+    logic        in_display_window;
+    assign hwin_rel = {{(12-HCNT_W){1'b0}}, hcnt} - {{0{1'b0}}, display_dx};
+    assign vwin_rel = {{(12-VCNT_W){1'b0}}, vcnt[VCNT_W-1:0]} - {1'b0, display_dy};
+    assign in_display_window = ({{(12-HCNT_W){1'b0}}, hcnt} >= {{0{1'b0}}, display_dx})
+                            && (hwin_rel <= display_dw)
+                            && ({{(12-VCNT_W){1'b0}}, vcnt[VCNT_W-1:0]} >= {1'b0, display_dy})
+                            && (vwin_rel <= {1'b0, display_dh});
+
+    logic [31:0] fbp_bytes;
+    logic [31:0] pixels_per_row;
+    logic [31:0] hmag_factor;     // MAGH + 1, range 1..16
+    logic [31:0] vmag_factor;     // MAGV + 1, range 1..4
+    logic [31:0] vram_x_unshift;
+    logic [31:0] vram_y_unshift;
+    logic [31:0] effective_x;
+    logic [31:0] effective_y;
+    logic [31:0] pixel_index;
+    logic [31:0] byte_offset;
+
+    // VRAM index is measured from inside the display window and
+    // SCALED DOWN by the magnification factors:
+    //   effective_x = ((hcnt - DX) / (MAGH+1)) + DBX
+    //   effective_y = ((vcnt - DY) / (MAGV+1)) + DBY
+    // MAGH=MAGV=0 → factors=1×, math collapses to the pre-Ch93
+    // form (and the pre-Ch92 form when DISPLAY1 covers the full
+    // active area). MAGH=N>0 means each VRAM column shows for
+    // (N+1) consecutive VCK pulses before the next column. SystemVerilog
+    // `/` truncates toward zero on unsigned 32-bit operands —
+    // matches PS2 PCRTC behavior since (hcnt-DX) is always
+    // non-negative inside the window (the window check guards
+    // hcnt >= DX before VRAM is read).
+    assign fbp_bytes      = {23'd0, dispfb_fbp} << 11;
+    assign pixels_per_row = {26'd0, dispfb_fbw} << 6;
+    assign hmag_factor    = {28'd0, display_magh} + 32'd1;
+    assign vmag_factor    = {30'd0, display_magv} + 32'd1;
+    // Ch163 — when STRIP_PCRTC_MAG_DIV is 1, bypass the divisions
+    // and use the window-relative coords directly. Quartus then has
+    // nothing to infer for the magnification divider (the Ch162-onwards
+    // STA worst path on `u_demo|u_pcrtc|div_1_rtl_0|...`). The
+    // hardware-demo path locks MAGH=MAGV=0 so the divisors are
+    // constant 1 and this is behavior-neutral. The default 0 keeps
+    // the live divider math for the existing Ch93 magnification
+    // scanout TBs (`tb_gs_scanout_magh_magv` etc.).
+    assign vram_x_unshift = STRIP_PCRTC_MAG_DIV
+                            ? {20'd0, hwin_rel}
+                            : ({20'd0, hwin_rel} / hmag_factor);
+    assign vram_y_unshift = STRIP_PCRTC_MAG_DIV
+                            ? {20'd0, vwin_rel}
+                            : ({20'd0, vwin_rel} / vmag_factor);
+    assign effective_x    = vram_x_unshift + {21'd0, dispfb_dbx};
+    assign effective_y    = vram_y_unshift + {21'd0, dispfb_dby};
+    assign pixel_index    = (effective_y * pixels_per_row) + effective_x;
+    // PSMT4 packs 2 pixels per byte → byte_offset = pixel_index/2;
+    // all other supported PSMs are integer-bytes-per-pixel and
+    // use the standard left-shift by bpp_shift.
+    assign byte_offset    = dispfb_psm_t4 ? (pixel_index >> 1)
+                                          : (pixel_index << dispfb_bpp_shift);
+    logic [31:0] vram_linear_addr;
+    assign vram_linear_addr = fbp_bytes + byte_offset;
+
+    // Ch120 — optional PSMCT32 swizzled scanout. The swizzle module
+    // is purely combinational and reuses dispfb_fbp / dispfb_fbw +
+    // the per-cycle effective_x / effective_y (already magnification-
+    // aware via Ch93). When PSMCT32_SWIZZLE=1 AND the active PSM is
+    // PSMCT32, mux its output into vram_read_addr. CT16/T8/T4 follow
+    // their own gates; PSMCT32_SWIZZLE=0 keeps the linear address.
+    logic [31:0] vram_swizzled_addr;
+    gs_swizzle_psmct32_stub u_swizzle (
+        .fbp (dispfb_fbp),
+        .fbw (dispfb_fbw),
+        .x   (effective_x[11:0]),
+        .y   (effective_y[11:0]),
+        .addr(vram_swizzled_addr)
+    );
+
+    // Ch126 — optional PSMCT16 swizzled scanout. Same wiring shape
+    // as Ch120 but uses gs_swizzle_psmct16_stub. The PSMCT16 module
+    // bakes its own page-shape (64×64 vs CT32's 64×32), block grid
+    // (4 cols × 8 rows vs CT32's 8×4), and within-block column-table
+    // permutation in. Default PSMCT16_SWIZZLE=0 preserves linear
+    // PSMCT16 scanout for the legacy TBs (Ch94/Ch95/Ch103/etc.).
+    logic [31:0] vram_swizzled16_addr;
+    gs_swizzle_psmct16_stub u_swizzle16 (
+        .fbp (dispfb_fbp),
+        .fbw (dispfb_fbw),
+        .x   (effective_x[11:0]),
+        .y   (effective_y[11:0]),
+        .addr(vram_swizzled16_addr)
+    );
+
+    // Ch132 — optional PSMT8 swizzled scanout. Same wiring shape as
+    // Ch120/Ch126. PSMT8 pages are 128 px wide so the swizzle
+    // internally divides FBW by 2 (PCSX2 asserts FBW must be even
+    // for PSMT8). Default PSMT8_SWIZZLE=0 preserves linear PSMT8
+    // scanout for the legacy TBs (Ch96, Ch97, Ch103, Ch107, etc.).
+    logic [31:0] vram_swizzled8_addr;
+    gs_swizzle_psmt8_stub u_swizzle8 (
+        .fbp (dispfb_fbp),
+        .fbw (dispfb_fbw),
+        .x   (effective_x[11:0]),
+        .y   (effective_y[11:0]),
+        .addr(vram_swizzled8_addr)
+    );
+
+    // Ch138 — optional PSMT4 swizzled scanout. Same wiring shape as
+    // Ch120/Ch126/Ch132 but uses gs_swizzle_psmt4_stub. PSMT4 is
+    // 4 bits/pixel, so the module outputs both an absolute byte
+    // address AND a `nibble_hi` selector. Default PSMT4_SWIZZLE=0
+    // preserves linear PSMT4 scanout for the legacy TBs (Ch103,
+    // Ch104, Ch107, etc.) — the linear path uses pixel_index[0] as
+    // the nibble selector; the swizzled path uses the swizzle
+    // module's nibble_hi output instead.
+    logic [31:0] vram_swizzled4_addr;
+    logic        swizzle4_nibble_hi;
+    gs_swizzle_psmt4_stub u_swizzle4 (
+        .fbp      (dispfb_fbp),
+        .fbw      (dispfb_fbw),
+        .x        (effective_x[11:0]),
+        .y        (effective_y[11:0]),
+        .addr     (vram_swizzled4_addr),
+        .nibble_hi(swizzle4_nibble_hi)
+    );
+
+    assign vram_read_addr = (PSMCT32_SWIZZLE && dispfb_psm_ct32) ? vram_swizzled_addr   :
+                            (PSMCT16_SWIZZLE && dispfb_psm_ct16) ? vram_swizzled16_addr :
+                            (PSMT8_SWIZZLE   && dispfb_psm_t8)   ? vram_swizzled8_addr  :
+                            (PSMT4_SWIZZLE   && dispfb_psm_t4)   ? vram_swizzled4_addr  :
+                                                                   vram_linear_addr;
+
+    // PSMCT32 layout in vram_stub: little-endian write of
+    // raster_pixel_color_q[31:0] = {A, B, G, R}. Read back as:
+    //   data[7:0]   = R
+    //   data[15:8]  = G
+    //   data[23:16] = B
+    //   data[31:24] = A    (alpha, not exposed at the video DAC)
+
+    // Ch94/Ch96/Ch97 — PSM-aware color decode.
+    //   PSMCT32: lower 24 bits = {B, G, R}; alpha at [31:24]
+    //            dropped.
+    //   PSMCT16: RGB5A1 in lower 16 bits, 5→8 bit-replicate.
+    //   PSMT8  : index in vram_read_data[7:0]. With clut_enable
+    //            (Ch97), CLUT[idx + (CSA << 4)] is looked up for
+    //            real RGB; without it, the index is emitted as
+    //            grayscale (Ch96 fallback). The vram_stub read
+    //            returns 4 bytes starting at the byte address,
+    //            so [7:0] is the byte at the addressed PSMT8
+    //            pixel regardless of 4-byte alignment.
+    logic [15:0] psm16_pixel;
+    logic [4:0]  psm16_r5, psm16_g5, psm16_b5;
+    logic [7:0]  psm16_r8, psm16_g8, psm16_b8;
+    logic [7:0]  psm8_idx;
+    logic [3:0]  psm4_nibble;
+    logic [7:0]  psm4_idx;
+    logic [7:0]  psm4_gray;
+
+    // Ch158 (audit Medium fix) — sub-word PSM lane selection.
+    //
+    // `vram_stub` returns the 4 bytes STARTING at `byte_addr`, so
+    // for the legacy comb-read shape the sub-word value is always
+    // at the LOW lane of `vram_read_data` (CT16 → [15:0], T8 → [7:0],
+    // T4 byte → [7:0]). `vram_bram_stub` is word-addressable
+    // (returns mem[byte_addr >> 2]), so the sub-word value lives
+    // at lane `byte_addr[1:0]` within the returned 32-bit word —
+    // CT16 halfword at byte_addr[1]==1 sits at [31:16] and is
+    // missed by a fixed-low-lane extract.
+    //
+    // The address-LSB register below is a 1-cycle-delayed copy of
+    // `vram_read_addr[1:0]` matching the `_dec` decode-stage view
+    // of the registered `vram_read_data`. The `data_lane` mux is
+    // forced to 0 in legacy mode (so vram_stub's byte-addressable
+    // semantics keep working) and uses the registered LSBs in
+    // sync mode (so vram_bram_stub's word-addressable layout
+    // resolves to the right byte/halfword).
+    logic [1:0] vram_addr_lane_q;
+    logic [1:0] vram_addr_lane_dec;
+    logic [1:0] data_lane;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) vram_addr_lane_q <= 2'd0;
+        else        vram_addr_lane_q <= vram_read_addr[1:0];
+    end
+    assign vram_addr_lane_dec = VRAM_SYNC_READ ? vram_addr_lane_q
+                                               : vram_read_addr[1:0];
+    assign data_lane          = VRAM_SYNC_READ ? vram_addr_lane_dec
+                                               : 2'd0;
+
+    // CT16 halfword: [1] picks low (==0) or high (==1) halfword of
+    // the 32-bit word. byte_addr[0]==1 is misuse for CT16 (the
+    // address-stage formula always yields even byte addresses).
+    assign psm16_pixel = data_lane[1] ? vram_read_data[31:16]
+                                      : vram_read_data[15:0];
+
+    // PSMT8/T4 byte: [1:0] picks 1 of 4 byte lanes. Used directly
+    // as `psm8_idx` and as the source byte for the PSMT4 nibble
+    // extract below.
+    logic [7:0] vram_byte_lane;
+    always_comb begin
+        case (data_lane)
+            2'b00: vram_byte_lane = vram_read_data[ 7: 0];
+            2'b01: vram_byte_lane = vram_read_data[15: 8];
+            2'b10: vram_byte_lane = vram_read_data[23:16];
+            2'b11: vram_byte_lane = vram_read_data[31:24];
+        endcase
+    end
+
+    assign psm16_r5    = psm16_pixel[4:0];
+    assign psm16_g5    = psm16_pixel[9:5];
+    assign psm16_b5    = psm16_pixel[14:10];
+    assign psm16_r8    = {psm16_r5, psm16_r5[4:2]};
+    assign psm16_g8    = {psm16_g5, psm16_g5[4:2]};
+    assign psm16_b8    = {psm16_b5, psm16_b5[4:2]};
+    assign psm8_idx    = vram_byte_lane;
+
+    // Ch103 — PSMT4 nibble extraction. The byte at byte_offset
+    // holds two pixels: low nibble = even pixel, high nibble =
+    // odd pixel. pixel_index[0] picks which one this scanout
+    // cycle is reading. The 4-bit nibble zero-extends to an
+    // 8-bit CLUT index; the grayscale fallback replicates the
+    // nibble across both halves of an 8-bit channel value
+    // (4'hF → 8'hFF, 4'h5 → 8'h55, etc.).
+    //
+    // Ch138 — when PSMT4_SWIZZLE=1 AND the active PSM is PSMT4,
+    // the nibble selector comes from the swizzle module's
+    // `nibble_hi` output (which is `columnTable4[yb][xb] & 1` —
+    // the canonical PCSX2 selector under the swizzled layout).
+    // pixel_index[0] is the linear formula's selector; the
+    // swizzled formula needs the swizzle's own bit because the
+    // swizzle reorders pixels within a block.
+    logic psm4_nibble_select;
+    assign psm4_nibble_select = (PSMT4_SWIZZLE && dispfb_psm_t4)
+                                ? swizzle4_nibble_hi
+                                : pixel_index[0];
+    // Ch158 — pair the nibble selector with vram_read_data: in
+    // legacy comb-read mode they are both same-cycle; in sync-read
+    // mode the selector is registered (psm4_nibble_select_dec) so
+    // it lines up with the registered VRAM data. The `_dec` mux
+    // selects between the two views via `VRAM_SYNC_READ`. The
+    // BYTE that holds the nibble is picked from `vram_byte_lane`
+    // (the byte_addr[1:0]-keyed lane in sync mode, the low lane
+    // in legacy mode — see the audit-Medium fix above).
+    assign psm4_nibble = psm4_nibble_select_dec ? vram_byte_lane[7:4]
+                                                : vram_byte_lane[3:0];
+    assign psm4_idx    = {4'd0, psm4_nibble};
+    assign psm4_gray   = {psm4_nibble, psm4_nibble};
+
+    // Ch97/Ch103 — CLUT effective index. `clut_csa` shifts the
+    // lookup window in 16-entry units. The 8-bit add wraps mod
+    // 256, matching the size of the staging area. The base index
+    // is the PSMT8 byte index for PSMT8, the zero-extended PSMT4
+    // nibble for PSMT4, otherwise unused (pcrtc just doesn't
+    // consume the CLUT output).
+    // Ch158 — clut_idx_base + clut_read_idx are derived from
+    // vram_read_data (already aligned with the data-decode stage)
+    // and from `dispfb_psm_t4_dec` (the registered/passthrough
+    // PSM flag), so the CLUT lookup happens on the same cycle as
+    // the pixel-emit decode comb.
+    logic [7:0] clut_idx_base;
+    assign clut_idx_base = dispfb_psm_t4_dec ? psm4_idx : psm8_idx;
+    assign clut_read_idx = clut_idx_base + {clut_csa, 4'd0};
+
+    always_comb begin
+        if (de && scanout_enable_dec && in_display_window_dec) begin
+            if (dispfb_psm_ct16_dec) begin
+                r = psm16_r8;
+                g = psm16_g8;
+                b = psm16_b8;
+            end else if (dispfb_psm_t8_dec) begin
+                if (clut_enable) begin
+                    // CLUT lookup. Each entry is PSMCT32. Byte
+                    // order matches PSMCT32 framebuffer reads:
+                    //   [7:0]=R, [15:8]=G, [23:16]=B, [31:24]=A
+                    r = clut_read_data[7:0];
+                    g = clut_read_data[15:8];
+                    b = clut_read_data[23:16];
+                end else begin
+                    // Ch96 fallback: surface index as grayscale.
+                    r = psm8_idx;
+                    g = psm8_idx;
+                    b = psm8_idx;
+                end
+            end else if (dispfb_psm_t4_dec) begin
+                if (clut_enable) begin
+                    // Ch103 — PSMT4 + CLUT. The 4-bit nibble has
+                    // already been mux'd into clut_read_idx via
+                    // clut_idx_base + (CSA<<4); the returned
+                    // entry is PSMCT32 ABGR.
+                    r = clut_read_data[7:0];
+                    g = clut_read_data[15:8];
+                    b = clut_read_data[23:16];
+                end else begin
+                    // Grayscale fallback — replicate the nibble
+                    // across the 8-bit DAC value so 4'hF → 8'hFF.
+                    r = psm4_gray;
+                    g = psm4_gray;
+                    b = psm4_gray;
+                end
+            end else begin
+                // PSMCT32 — the only remaining format that
+                // dispfb_psm_ok admits at this scope.
+                r = vram_read_data[7:0];
+                g = vram_read_data[15:8];
+                b = vram_read_data[23:16];
+            end
+        end else begin
+            r = 8'd0;
+            g = 8'd0;
+            b = 8'd0;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace: one EV_MODE per completed frame.
+    // ------------------------------------------------------------------
+
+    logic [31:0] frame_count;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            frame_count <= 32'd0;
+
+            ev_valid    <= 1'b0;
+            ev_subsys   <= SUBSYS_PLAT;
+            ev_event    <= EV_MODE;
+            ev_arg0     <= 64'd0;
+            ev_arg1     <= 64'd0;
+            ev_arg2     <= 64'd0;
+            ev_arg3     <= 64'd0;
+            ev_flags    <= 32'd0;
+        end else if (end_of_frame_dec) begin
+            // Ch158: when VRAM_SYNC_READ=1, end_of_frame_dec lags
+            // the counter-side end_of_frame by 1 cycle so it fires
+            // when the LAST visible pixel actually emits (which is
+            // 1 cycle after the address-stage hits the last cell).
+            // Legacy comb-read passthrough makes end_of_frame_dec
+            // == end_of_frame, so existing TBs are unaffected.
+            frame_count <= frame_count + 32'd1;
+
+            ev_valid    <= 1'b1;
+            ev_subsys   <= SUBSYS_PLAT;
+            ev_event    <= EV_MODE;
+            ev_arg0     <= {32'd0, frame_count};
+            ev_arg1     <= {32'd0, 32'(H_ACTIVE * V_ACTIVE)};
+            ev_arg2     <= 64'd0;
+            ev_arg3     <= 64'd0;
+            ev_flags    <= 32'd0;
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : gs_pcrtc_stub
@@ -0,0 +1,109 @@
+// retroDE_ps2 — gs_persp_uv (Ch301)
+//
+// Per-pixel PERSPECTIVE-CORRECT texture-coordinate divide. Given the three
+// affinely-interpolated perspective attributes at a pixel —
+//
+//     uq = (u/w) * 2**FRAC      (u-over-w, fixed-point)
+//     vq = (v/w) * 2**FRAC      (v-over-w, fixed-point)
+//     q  = (1/w) * 2**FRAC      (one-over-w, fixed-point)
+//
+// — this recovers the integer texel coordinates:
+//
+//     w_recip = 1/q  (= w, via the pipelined gs_reciprocal_stub LUT, NO divider)
+//     u_texel = (uq * w_recip) >> SCALE   (= (u/w) * w = u)
+//     v_texel = (vq * w_recip) >> SCALE   (= (v/w) * w = v)
+//
+// gs_reciprocal_stub returns recip = floor(2**SCALE / q). With q = (1/w)<<FRAC
+// that is recip = w << (SCALE-FRAC). Then uq*recip = (u/w<<FRAC)*(w<<(SCALE-FRAC))
+// = u << SCALE, so (uq*recip) >> SCALE = u.  (The FRAC scaling cancels.)
+//
+// Pipeline (NO divider, ~1 result/cycle):
+//   recip:  RLAT cycles (gs_reciprocal_stub, 3).
+//   uq/vq:  delayed RLAT cycles to align with recip.
+//   mul:    1 cycle (uq*recip, vq*recip) + shift + clamp.
+//   total latency = RLAT + 1.
+//
+// Output texel coords are clamped to [0, TEXEL_MAX] (saturating), matching the
+// integer-coord clamp the affine path already applies.
+
+`timescale 1ns/1ps
+
+module gs_persp_uv #(
+    parameter int ATTR_W    = 24,   // bit width of uq / vq
+    parameter int Q_W       = 24,   // bit width of q
+    parameter int FRAC      = 12,   // attribute fraction bits (documentation only; unused in this body)
+    parameter int SCALE     = 24,   // reciprocal scale; also the right shift applied to each product
+    parameter int RECIP_W   = 25,   // bit width of the reciprocal returned by u_recip
+    parameter int TEXEL_W   = 11,   // bit width of the u / v outputs
+    parameter int TEXEL_MAX = 2047, // saturation ceiling for u / v
+    // Ch351 — reciprocal LUT mantissa width. Default 8 (256-entry) is byte-identical to Ch301/342/348.
+    // Far-W perspective draws (small Q at high PERSP_FRAC) want more: 11 (2048-entry) ~ 0.05% rel error.
+    parameter int RECIP_IDX_BITS = 8
+) (
+    input  logic               clk,
+    input  logic               rst_n,
+    input  logic               in_valid,
+    input  logic [ATTR_W-1:0]  uq,
+    input  logic [ATTR_W-1:0]  vq,
+    input  logic [Q_W-1:0]     q,
+    output logic               out_valid,
+    output logic [TEXEL_W-1:0] u,
+    output logic [TEXEL_W-1:0] v
+);
+    localparam int RLAT   = 3;                  // gs_reciprocal_stub latency == depth of the delay lines
+    localparam int PROD_W = ATTR_W + RECIP_W;   // width of an attribute x reciprocal product
+    // ---- 1/q lookup: pipelined gs_reciprocal_stub, no divider in this module ----
+    logic               recip_valid;
+    logic [RECIP_W-1:0] w_recip;
+    gs_reciprocal_stub #(
+        .Q_W  (Q_W),   .IDX_BITS(RECIP_IDX_BITS),
+        .SCALE(SCALE), .OUT_W   (RECIP_W)
+    ) u_recip (
+        .clk      (clk),         .rst_n(rst_n),
+        .in_valid (in_valid),    .q    (q),
+        .out_valid(recip_valid), .recip(w_recip)
+    );
+
+    // ---- uq / vq delay lines: RLAT registers each, so they line up with w_recip ----
+    logic [ATTR_W-1:0] uq_pipe [0:RLAT-1];
+    logic [ATTR_W-1:0] vq_pipe [0:RLAT-1];
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            for (int stage = 0; stage < RLAT; stage++) begin
+                uq_pipe[stage] <= '0;
+                vq_pipe[stage] <= '0;
+            end
+        end else begin
+            for (int stage = RLAT-1; stage > 0; stage--) begin
+                uq_pipe[stage] <= uq_pipe[stage-1];
+                vq_pipe[stage] <= vq_pipe[stage-1];
+            end
+            uq_pipe[0] <= uq;
+            vq_pipe[0] <= vq;
+        end
+    end
+
+    logic [PROD_W-1:0] u_prod, v_prod;          // combinational products of the aligned operands
+    assign u_prod = uq_pipe[RLAT-1] * w_recip;
+    assign v_prod = vq_pipe[RLAT-1] * w_recip;
+    // Drops the SCALE low bits of a product, then saturates the result at TEXEL_MAX.
+    function automatic logic [TEXEL_W-1:0] clamp_texel(input logic [PROD_W-1:0] prod);
+        logic [PROD_W-1:0] texel_wide;
+        texel_wide = prod >> SCALE;
+        if (texel_wide > PROD_W'(TEXEL_MAX)) clamp_texel = TEXEL_W'(TEXEL_MAX);
+        else                                 clamp_texel = texel_wide[TEXEL_W-1:0];
+    endfunction
+    // ---- output register: the single stage that follows w_recip ----
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            out_valid <= 1'b0;
+            u         <= '0;
+            v         <= '0;
+        end else begin
+            out_valid <= recip_valid;
+            u         <= clamp_texel(u_prod);
+            v         <= clamp_texel(v_prod);
+        end
+    end
+
+endmodule : gs_persp_uv
@@ -0,0 +1,287 @@
+// gs_prim_list_feeder — Ch330 Brick 1
+//
+// Runtime primitive-list feeder (minimal). Reads a NORMALIZED combined-TAZ triangle
+// list from a small staging RAM and EXPANDS each record into the exact gif_reg_*
+// write sequence gs_stub already consumes — reusing the entire proven ingestion
+// (vertex window, bbox, attr packing, FIFO/grid, tile renderer). This is NOT GIF A+D
+// decode: the format is per-PRIMITIVE records and the feeder knows the fixed
+// "combined-TAZ triangle" shape (shared state once, then PRIM + 3 vertices each).
+//
+// In the Ch330 command-list profile the feeder is the EXCLUSIVE owner of gif_reg_*
+// (no arbitration with the GIF unpacker / DMAC). It respects the SAME capacity
+// boundary as the baked path: before the prim-completing vertex (vtx2's XYZ2) it
+// PAUSES while fifo_full, so a full FIFO stalls the feeder instead of dropping prims.
+//
+// Staging layout (64-bit words, word-addressed):
+//   [0]              : { ..., sprite[33], persp[32], rect_count[31:16], tri_count[15:0] } — list header
+//   [1]              : FRAME_1 data           — shared state, emitted once at start
+//   [2]              : ALPHA_1 data
+//   [3]              : TEST_1  data
+//   [4]              : ZBUF_1  data
+//   [5]              : TEX0_1  data
+//   [6]              : PRIM    data            — re-issued per triangle
+//   [7 + 9*i + 0..2] : tri i vtx0 RGBAQ/UV/XYZ2
+//   [7 + 9*i + 3..5] : tri i vtx1 RGBAQ/UV/XYZ2
+//   [7 + 9*i + 6..8] : tri i vtx2 RGBAQ/UV/XYZ2
+//
+// One `start` pulse plays the whole list; `done` pulses when it finishes.
+// Boring on purpose: 2 cycles per emitted register (present addr, then drive).
+
+`timescale 1ns/1ps
+
+module gs_prim_list_feeder #(      // staging-RAM primitive list -> gif_reg_* write sequencer
+    parameter int STG_ADDR_W = 12  // width of the staging-RAM word address (stg_rd_addr)
+) (
+    input  logic                   clk,
+    input  logic                   rst_n,             // active-low, asynchronous (negedge in the FSM sensitivity list)
+
+    input  logic                   start,             // honoured only in S_IDLE; starts playback of the staged list
+    output logic                   busy,              // high while the FSM is in any state other than S_IDLE / S_DONE
+    output logic                   done,              // registered one-cycle pulse, set from S_DONE
+    // Ch330 Brick 3 — observability counters (latched per list, cleared at start).
+    output logic [15:0]            records_emitted,   // primitives whose final XYZ2 was emitted
+    output logic [31:0]            fifo_wait_cycles,   // cycles paused at a completing kick under fifo_full
+
+    output logic [STG_ADDR_W-1:0]  stg_rd_addr,       // staging-RAM word address
+    input  logic [63:0]            stg_rd_data,       // read data; consumed one wait state (S_*_RD) after the address
+
+    input  logic                   fifo_full,         // back-pressure: holds off only a primitive-completing XYZ2
+
+    output logic                   gif_reg_wr_en,     // write strobe; defaults low every cycle, high per emitted register
+    output logic [7:0]             gif_reg_num,
+    output logic [63:0]            gif_reg_data
+);
+    // Register numbers driven on gif_reg_num.
+    localparam logic [7:0] REG_PRIM    = 8'h00;
+    localparam logic [7:0] REG_RGBAQ   = 8'h01;
+    localparam logic [7:0] REG_ST      = 8'h02;   // Ch342 — perspective ST (S/T) for FST=0 tris
+    localparam logic [7:0] REG_UV      = 8'h03;
+    localparam logic [7:0] REG_XYZ2    = 8'h05;
+    localparam logic [7:0] REG_TEX0_1  = 8'h06;
+    localparam logic [7:0] REG_ALPHA_1 = 8'h42;
+    localparam logic [7:0] REG_TEST_1  = 8'h47;
+    localparam logic [7:0] REG_FRAME_1 = 8'h4C;
+    localparam logic [7:0] REG_ZBUF_1  = 8'h4E;
+
+    localparam int OFF_COUNT = 0;
+    localparam int OFF_FRAME = 1;   // FRAME,ALPHA,TEST,ZBUF,TEX0,PRIM = words 1..6
+    localparam int OFF_TRIS  = 7;
+    localparam int WORDS_PER_TRI = 9;
+
+    // Header registers (loaded once).
+    logic [15:0] tri_count;
+    logic [63:0] hdr_q [0:5];        // [0]=FRAME [1]=ALPHA [2]=TEST [3]=ZBUF [4]=TEX0 [5]=PRIM
+    // setup-emit index -> GIF reg num (iverilog-12: no unpacked localparam array).
+    function automatic logic [7:0] hdr_reg_num(input logic [2:0] i);
+        unique case (i)
+            3'd0:    hdr_reg_num = REG_FRAME_1;
+            3'd1:    hdr_reg_num = REG_ALPHA_1;
+            3'd2:    hdr_reg_num = REG_TEST_1;
+            3'd3:    hdr_reg_num = REG_ZBUF_1;
+            default: hdr_reg_num = REG_TEX0_1;
+        endcase
+    endfunction
+
+    typedef enum logic [3:0] {
+        S_IDLE,
+        S_HDR_RD, S_HDR_LD,          // read words 0..6 into tri_count/rect_count + hdr_q
+        S_SETUP,                     // emit FRAME/ALPHA/TEST/ZBUF/TEX0 from hdr_q
+        S_PRIM,                      // emit PRIM (hdr_q[5]) for the current tri
+        S_VTX_RD, S_VTX_EMIT,        // walk the 9 vertex words of the current tri
+        S_AFTER_TRIS,                // Ch334 — tris done; start rects if any, else done
+        S_RECT_RD, S_RECT_LD,        // Ch334 — read a rect's 3 words (color, corner0, corner1)
+        S_RECT_EMIT,                 // Ch334 — emit the 20-step 2-triangle expansion of one rect
+        S_DONE
+    } state_t;
+
+    localparam int WORDS_PER_RECT = 3;          // Ch334 — color + corner0(XYZ2) + corner1(XYZ2)
+
+    state_t      state;
+    logic [3:0]  hdr_i;              // 0..6 header-word read index
+    logic [2:0]  setup_i;            // 0..4 setup-emit index
+    logic [15:0] tri_idx;            // 0..tri_count-1
+    logic [3:0]  vtx_word;           // 0..8 within a tri
+
+    // Ch334 — native rectangle records (one record -> two colored triangles, expanded HERE).
+    logic [15:0] rect_count;         // count[31:16]
+    logic [15:0] rect_idx;           // 0..rect_count-1
+    logic [1:0]  rect_word;          // 0..2 read index within a rect record
+    logic [63:0] rect_color;         // RGBAQ for both triangles
+    logic [63:0] rect_c0, rect_c1;   // the two opposite corners (XYZ2-packed)
+    logic [4:0]  rect_emit;          // 0..19 emit step
+
+    // Ch342 — PERSPECTIVE format flag (word0[32]). 0 = legacy RGBAQ/UV/XYZ2 per vertex (byte-exact).
+    // 1 = RGBAQ/ST/XYZ2: the middle vertex word is emitted as REG_ST (host packs S_fp[23:0]/T_fp[55:32],
+    // 24-bit FRAC=12) and RGBAQ carries Q_fp[55:32]; PRIM (hdr_q[5]) must be FST=0. Same 9 words/tri,
+    // same 27-tri cap. Rects are not allowed in this format (rect_count forced 0 at header load).
+    logic perspective_mode;
+    // Ch345a — SPRITE format flag (word0[33]). 1 = each primitive is a SPRITE record: 2 vertices x
+    // (RGBAQ, UV, XYZ2) = 6 words, vs a TRI's 3 vertices = 9 words. PRIM (hdr_q[5]) carries SPRITE+TME+ABE
+    // and gs_stub kicks on the 2nd XYZ2 per the PRIM type. Affine UV only: perspective_mode is forced 0 at
+    // header load whenever this flag is set; rects forced off. Same shared-state setup. Narrow grammar:
+    // PSMCT32 dest+tex, UV affine, ABE source-over, TCC texel alpha — the Ch344-proven subset.
+    logic sprite_mode;
+    wire [4:0] words_per_prim = sprite_mode ? 5'd6 : 5'd9;   // staging words per primitive
+    wire [3:0] last_vtx_word  = sprite_mode ? 4'd5 : 4'd8;   // final XYZ2 of the primitive (the kick)
+    logic [7:0] vtx_reg_num;
+    always_comb unique case (vtx_word % 3)
+        2'd0:    vtx_reg_num = REG_RGBAQ;
+        2'd1:    vtx_reg_num = perspective_mode ? REG_ST : REG_UV;
+        default: vtx_reg_num = REG_XYZ2;
+    endcase
+    wire vtx_completing = (vtx_word == last_vtx_word);   // final XYZ2 = the FIFO push / kick
+
+    // Ch334 — corner fields (XYZ2 layout: x=[15:4], y=[31:20], z=[63:32]) + a packer.
+    wire [11:0] rx0 = rect_c0[15:4];  wire [11:0] ry0 = rect_c0[31:20];
+    wire [11:0] rx1 = rect_c1[15:4];  wire [11:0] ry1 = rect_c1[31:20];
+    wire [31:0] rz  = rect_c0[63:32];
+    function automatic logic [63:0] mk_xyz2(input logic [11:0] x, input logic [11:0] y, input logic [31:0] z);
+        mk_xyz2 = {z, y, 4'd0, x, 4'd0};
+    endfunction
+    // 20-step expansion: [PRIM, (RGBAQ,UV,XYZ2)x3] x2. Two tris cover the quad (x0,y0)-(x1,y1).
+    logic [7:0]  rect_reg;
+    logic [63:0] rect_dat;
+    always_comb begin
+        unique case (rect_emit)
+            5'd0, 5'd10:                                  rect_reg = REG_PRIM;
+            5'd1,5'd4,5'd7,5'd11,5'd14,5'd17:             rect_reg = REG_RGBAQ;
+            5'd2,5'd5,5'd8,5'd12,5'd15,5'd18:             rect_reg = REG_UV;
+            default:                                      rect_reg = REG_XYZ2;  // 3,6,9,13,16,19
+        endcase
+        unique case (rect_emit)
+            5'd0, 5'd10:                                  rect_dat = hdr_q[5];        // PRIM
+            5'd1,5'd4,5'd7,5'd11,5'd14,5'd17:             rect_dat = rect_color;      // RGBAQ
+            5'd2,5'd5,5'd8,5'd12,5'd15,5'd18:             rect_dat = 64'd0;           // UV (uniform texture)
+            5'd3:    rect_dat = mk_xyz2(rx0, ry0, rz);    // tri1 v0
+            5'd6:    rect_dat = mk_xyz2(rx1, ry0, rz);    // tri1 v1
+            5'd9:    rect_dat = mk_xyz2(rx0, ry1, rz);    // tri1 v2 (completes tri1)
+            5'd13:   rect_dat = mk_xyz2(rx1, ry0, rz);    // tri2 v0
+            5'd16:   rect_dat = mk_xyz2(rx0, ry1, rz);    // tri2 v1
+            default: rect_dat = mk_xyz2(rx1, ry1, rz);    // tri2 v2 (idx 19, completes the rect)
+        endcase
+    end
+    wire rect_completing = (rect_emit == 5'd9) || (rect_emit == 5'd19);  // the two FIFO pushes
+
+    assign busy = (state != S_IDLE) && (state != S_DONE);
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            state <= S_IDLE; tri_count <= 0; hdr_i <= 0; setup_i <= 0; perspective_mode <= 1'b0; sprite_mode <= 1'b0;
+            tri_idx <= 0; vtx_word <= 0; stg_rd_addr <= '0;
+            gif_reg_wr_en <= 1'b0; gif_reg_num <= 8'd0; gif_reg_data <= 64'd0; done <= 1'b0;
+            records_emitted <= 16'd0; fifo_wait_cycles <= 32'd0;
+            rect_count <= 0; rect_idx <= 0; rect_word <= 0; rect_emit <= 0;
+            rect_color <= 64'd0; rect_c0 <= 64'd0; rect_c1 <= 64'd0;
+            for (int k=0;k<6;k++) hdr_q[k] <= 64'd0;
+        end else begin
+            gif_reg_wr_en <= 1'b0;                     // strobes default low; states below override
+            done          <= 1'b0;
+
+            unique case (state)
+                S_IDLE: if (start) begin
+                    hdr_i <= 4'd0; stg_rd_addr <= STG_ADDR_W'(OFF_COUNT); state <= S_HDR_RD;
+                    records_emitted <= 16'd0; fifo_wait_cycles <= 32'd0;   // clear per list
+                end
+
+                // ---- header load: words 0..6 (count + 6 shared-state words) ----
+                S_HDR_RD: state <= S_HDR_LD;            // addr presented; data next cycle
+                S_HDR_LD: begin
+                    if (hdr_i == 4'd0) begin
+                        tri_count  <= stg_rd_data[15:0];
+                        // Ch342 — word0[32] = perspective format flag. The sprite format (word0[33]) is
+                        // affine-UV only, so sprite wins: perspective is forced off when both bits are set.
+                        perspective_mode <= stg_rd_data[32] && !stg_rd_data[33];
+                        sprite_mode      <= stg_rd_data[33];   // Ch345a
+                        // rects disallowed with the perspective OR sprite format (rect path can never run).
+                        rect_count <= (stg_rd_data[32] || stg_rd_data[33]) ? 16'd0 : stg_rd_data[31:16];
+                    end
+                    else               hdr_q[hdr_i-4'd1] <= stg_rd_data;
+                    if (hdr_i == 4'd6) begin            // all of count + hdr_q[0..5] loaded
+                        setup_i <= 3'd0; state <= S_SETUP;
+                    end else begin
+                        hdr_i <= hdr_i + 4'd1;
+                        stg_rd_addr <= STG_ADDR_W'(OFF_COUNT) + STG_ADDR_W'(hdr_i + 4'd1);
+                        state <= S_HDR_RD;
+                    end
+                end
+
+                // ---- emit shared state once (from hdr_q, no staging read) ----
+                S_SETUP: begin
+                    gif_reg_wr_en <= 1'b1;
+                    gif_reg_num   <= hdr_reg_num(setup_i);
+                    gif_reg_data  <= hdr_q[setup_i];
+                    if (setup_i == 3'd4) begin
+                        tri_idx <= 16'd0;
+                        state   <= (tri_count == 16'd0) ? S_AFTER_TRIS : S_PRIM;
+                    end else begin
+                        setup_i <= setup_i + 3'd1;
+                    end
+                end
+
+                // ---- per primitive: PRIM, then its 9 (tri) or 6 (sprite) vertex words ----
+                S_PRIM: begin
+                    gif_reg_wr_en <= 1'b1; gif_reg_num <= REG_PRIM; gif_reg_data <= hdr_q[5];
+                    vtx_word      <= 4'd0;
+                    stg_rd_addr   <= STG_ADDR_W'(OFF_TRIS) + STG_ADDR_W'(tri_idx * words_per_prim);
+                    state         <= S_VTX_RD;
+                end
+                S_VTX_RD: state <= S_VTX_EMIT;          // vert-word addr presented; data next cycle
+                S_VTX_EMIT: begin
+                    if (vtx_completing && fifo_full) begin
+                        // pause: hold addr/data, do not emit, until the FIFO drains a slot
+                        fifo_wait_cycles <= fifo_wait_cycles + 32'd1;
+                        state <= S_VTX_EMIT;
+                    end else begin
+                        gif_reg_wr_en <= 1'b1; gif_reg_num <= vtx_reg_num; gif_reg_data <= stg_rd_data;
+                        if (vtx_word == last_vtx_word) begin
+                            records_emitted <= records_emitted + 16'd1;   // a primitive's final XYZ2 emitted
+                            if (tri_idx + 16'd1 == tri_count) state <= S_AFTER_TRIS;
+                            else begin tri_idx <= tri_idx + 16'd1; state <= S_PRIM; end
+                        end else begin
+                            vtx_word    <= vtx_word + 4'd1;
+                            stg_rd_addr <= stg_rd_addr + STG_ADDR_W'(1);
+                            state       <= S_VTX_RD;
+                        end
+                    end
+                end
+
+                // ---- Ch334 — native rectangles: each record = 3 words, expands to 2 tris ----
+                S_AFTER_TRIS: begin
+                    if (rect_count != 16'd0) begin
+                        rect_idx    <= 16'd0; rect_word <= 2'd0;
+                        stg_rd_addr <= STG_ADDR_W'(OFF_TRIS) + STG_ADDR_W'(tri_count * words_per_prim);
+                        state       <= S_RECT_RD;
+                    end else state <= S_DONE;
+                end
+                S_RECT_RD: state <= S_RECT_LD;          // rect-word addr presented; data next cycle
+                S_RECT_LD: begin
+                    unique case (rect_word)
+                        2'd0:    rect_color <= stg_rd_data;
+                        2'd1:    rect_c0    <= stg_rd_data;
+                        default: rect_c1    <= stg_rd_data;
+                    endcase
+                    stg_rd_addr <= stg_rd_addr + STG_ADDR_W'(1);   // advance through every rect word
+                    if (rect_word == 2'd2) begin rect_word <= 2'd0; rect_emit <= 5'd0; state <= S_RECT_EMIT; end
+                    else                   begin rect_word <= rect_word + 2'd1; state <= S_RECT_RD; end
+                end
+                S_RECT_EMIT: begin
+                    if (rect_completing && fifo_full) begin
+                        fifo_wait_cycles <= fifo_wait_cycles + 32'd1;   // pause at a completing XYZ2
+                        state <= S_RECT_EMIT;
+                    end else begin
+                        gif_reg_wr_en <= 1'b1; gif_reg_num <= rect_reg; gif_reg_data <= rect_dat;
+                        if (rect_completing) records_emitted <= records_emitted + 16'd1;  // one tri done
+                        if (rect_emit == 5'd19) begin                  // whole rect emitted (2 tris)
+                            if (rect_idx + 16'd1 == rect_count) state <= S_DONE;
+                            else begin rect_idx <= rect_idx + 16'd1; state <= S_RECT_RD; end  // addr already at next base
+                        end else rect_emit <= rect_emit + 5'd1;
+                    end
+                end
+
+                S_DONE: begin done <= 1'b1; state <= S_IDLE; end
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+endmodule : gs_prim_list_feeder
@@ -0,0 +1,127 @@
+// retroDE_ps2 — gs_reciprocal_stub (Ch301)
+//
+// Pipelined fixed-point reciprocal unit for PERSPECTIVE-CORRECT texture
+// interpolation. Computes  recip = floor(2**SCALE / q)  for an unsigned input
+// q, with NO divider in the datapath — a serialized per-pixel divide would
+// stall the ~1-pixel/cycle rasterizer (the architect's explicit constraint).
+//
+// Method — range-reduced table lookup (classic LUT reciprocal):
+//   1. e   = position of q's most-significant set bit (0..Q_W-1).
+//   2. M   = q normalized to an IDX_BITS-wide mantissa with its MSB at the top
+//            (M in [2**(IDX_BITS-1) .. 2**IDX_BITS - 1]), i.e. q ~= M * 2**(e-(IDX_BITS-1)).
+//   3. recip = LUT[M] >> e, where LUT[M] = floor(2**(SCALE+IDX_BITS-1) / M).
+//      Proof: LUT[M] >> e ~= 2**(SCALE+IDX_BITS-1)/(M * 2**e)
+//                          = 2**SCALE / (M * 2**(e-(IDX_BITS-1)))
+//                          = 2**SCALE / q.   ✓ (uniform for all e)
+//
+// Accuracy is ~1 part in 2**IDX_BITS (relative). For the first perspective
+// rung (texel coords <= 63) an 8-bit mantissa gives sub-texel error; bump
+// IDX_BITS for tighter precision later if real traces demand it.
+//
+// Pipeline: 3 register stages (LAT=3), one result per cycle.
+//   S0: live (unregistered) inputs q + in_valid; msb-detect and normalize
+//       are combinational on these.
+//   S1: register e = msb(q), M = normalize(q), valid.
+//   S2: lut_out = LUT[M]; carry e.
+//   S3: recip = lut_out >> e; out_valid.
+//
+// q==0 saturates to all-ones (1/0 -> +inf), which is harmless for the demo
+// (q = 1/w with w finite positive is always > 0).
+//
+// LUT init is a computed `initial` for-loop (Quartus infers ROM from it). If a
+// future synth flow rejects it, switch to $readmemh of a generated .mem.
+
+`timescale 1ns/1ps
+
+module gs_reciprocal_stub #(
+    parameter int Q_W      = 24,   // input width (q in [1, 2**Q_W))
+    parameter int IDX_BITS = 8,    // mantissa / LUT-index width (256 entries)
+    parameter int SCALE    = 24,   // output = floor(2**SCALE / q)
+    parameter int OUT_W    = 25    // output width (recip <= 2**SCALE for q>=1)
+) (
+    input  logic              clk,
+    input  logic              rst_n,      // active-low asynchronous reset
+    input  logic              in_valid,   // qualifies q; delayed 3 cycles to out_valid
+    input  logic [Q_W-1:0]    q,
+    output logic              out_valid,
+    output logic [OUT_W-1:0]  recip       // ~2**SCALE / q; all-ones when q == 0
+);
+
+    localparam int LUT_N    = (1 << IDX_BITS);
+    localparam int TOP_BIT  = IDX_BITS - 1;          // mantissa MSB position
+    localparam int E_W      = $clog2(Q_W) + 1;       // width of the carried MSB index
+    // LUT entries: floor(2**(SCALE+TOP_BIT) / M). Only M in [2**TOP_BIT .. LUT_N-1]
+    // are ever addressed (M always has its MSB set after normalization).
+    localparam int LUT_W    = SCALE + 1;             // wide enough for M=2**TOP_BIT
+    // Saturation value for the output. Built with an explicit replication:
+    // the previous `OUT_W'('1)` size-casts an unsized fill literal, which a
+    // tool may evaluate as a self-determined 1-bit operand (value 1) instead
+    // of all-ones, turning the clamp below into "saturate whenever > 1".
+    localparam logic [OUT_W-1:0] RECIP_MAX = {OUT_W{1'b1}};
+
+    logic [LUT_W-1:0] lut [0:LUT_N-1];
+
+    initial begin
+        // 2**(SCALE+TOP_BIT) as a 64-bit constant numerator.
+        longint unsigned num;
+        num = (64'd1 << (SCALE + TOP_BIT));
+        for (int m = 0; m < LUT_N; m++) begin
+            if (m == 0) lut[m] = '0;                 // never addressed; avoids divide-by-zero
+            else        lut[m] = LUT_W'(num / m);
+        end
+    end
+
+    // Index of the most-significant set bit of v (0 when v == 0).
+    function automatic int unsigned msb_index(input logic [Q_W-1:0] v);
+        msb_index = 0;
+        for (int i = 0; i < Q_W; i++)
+            if (v[i]) msb_index = i;                 // last hit wins -> highest set bit
+    endfunction
+
+    // ---------------- S0 (combinational): msb-detect + normalize -----------
+    // Works on the LIVE input q; the results are captured by the S1 registers,
+    // so the unit is exactly 3 register stages (S1/S2/S3) -> LAT=3.
+    int unsigned     q_msb;
+    logic [Q_W-1:0]  q_norm;
+    always_comb begin
+        q_msb = msb_index(q);
+        // shift so the mantissa MSB sits at bit TOP_BIT
+        if (q_msb >= TOP_BIT) q_norm = q >> (q_msb - TOP_BIT);
+        else                  q_norm = q << (TOP_BIT - q_msb);
+    end
+
+    // ---------------- S1: e + mantissa ------------------------------------
+    logic                 s1_valid;
+    logic [E_W-1:0]       s1_e;
+    logic [IDX_BITS-1:0]  s1_m;
+    logic                 s1_zero;
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            s1_valid <= 1'b0; s1_e <= '0; s1_m <= '0; s1_zero <= 1'b0;
+        end else begin
+            s1_valid <= in_valid;
+            s1_zero  <= (q == '0);
+            s1_e     <= E_W'(q_msb);
+            s1_m     <= q_norm[IDX_BITS-1:0];
+        end
+    end
+
+    // ---------------- S2: LUT read ------------------------
+    logic                 s2_valid;
+    logic [E_W-1:0]       s2_e;
+    logic [LUT_W-1:0]     s2_lut;
+    logic                 s2_zero;
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            s2_valid <= 1'b0; s2_e <= '0; s2_lut <= '0; s2_zero <= 1'b0;
+        end else begin
+            s2_valid <= s1_valid;
+            s2_e     <= s1_e;
+            s2_lut   <= lut[s1_m];
+            s2_zero  <= s1_zero;
+        end
+    end
+
+    // ---------------- S3: shift back ----------------------
+    logic [LUT_W-1:0] s3_shifted;
+    assign s3_shifted = s2_lut >> s2_e;
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            out_valid <= 1'b0; recip <= '0;
+        end else begin
+            out_valid <= s2_valid;
+            // 1/0 saturates; so does any quotient that does not fit OUT_W
+            // (operands are zero-extended to the wider of LUT_W / OUT_W).
+            if (s2_zero || (s3_shifted > RECIP_MAX)) recip <= RECIP_MAX;
+            else                                     recip <= OUT_W'(s3_shifted);
+        end
+    end
+
+endmodule : gs_reciprocal_stub
@@ -0,0 +1,231 @@
+// retroDE_ps2 — gs_swizzle_psmct16_stub (Ch125)
+//
+// Pure-combinational PSMCT16 page/block/column swizzle: maps a
+// pixel coordinate (x, y) within a framebuffer at (FBP, FBW) to
+// its physical VRAM byte address using the real PS2 GS PSMCT16
+// layout. Mirrors Ch119's `gs_swizzle_psmct32_stub` shape but
+// with PSMCT16's 4-cols × 8-rows page block grid (represented
+// as `blockTable16[8][4]` indexed `[block_y][block_x]`) and the
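+// within-block column table that the Ch119 PSMCT32 stub omits (that
+// stub addresses each block row-major; CT16 cannot).
+// NOTE(review): PCSX2 also defines a non-identity columnTable32, so
+// "row-major" describes the CT32 stub, not necessarily real VRAM.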
+//
+// THIS MODULE IS NOT YET WIRED INTO gs_pcrtc_stub /
+// gif_image_xfer_stub / gs_stub. Future chapters will wire it
+// behind a `PSMCT16_SWIZZLE`-style parameter gate, mirroring the
+// PSMCT32 progression (Ch120 read-side → Ch121 image-xfer write
+// side → Ch122 raster write side → Ch123/Ch124 e2e demos).
+// Default-off keeps the legacy linear PSMCT16 TBs (Ch94, Ch95,
+// Ch103, Ch116) on the linear path.
+//
+// SOURCE-TABLE PROVENANCE (per Codex's Ch125 guidance):
+//   blockTable16   — pcsx2/GS/GSTables.cpp lines 29–39, master
+//                    HEAD commit 3d71e310 (file-touch commit
+//                    d983b2b0, 2026-01-12). 8 rows × 4 cols,
+//                    indexed [block_y][block_x].
+//   columnTable16  — pcsx2/GS/GSTables.cpp lines 91–109, same
+//                    commit. 8 rows × 16 cols, indexed [yb][xb],
+//                    values are halfword-within-block (0..127).
+//   Cross-check    — older GSdx (Debian pcsx2 1.5.0~gfc1d9aef0)
+//                    PixelAddressOrg16(x, y, bp, bw) =
+//                    (BlockNumber16(x, y, bp, bw) << 7) +
+//                    columnTable16[y & 7][x & 15], with
+//                    BlockNumber16 = bp + ((y>>1) & ~0x1f)*bw +
+//                    ((x>>1) & ~0x1f) +
+//                    blockTable16[(y>>3)&7][(x>>4)&3].
+//                    The `<< 7` confirms columnTable16 is in
+//                    halfword units (block = 128 halfwords).
+//                    Multiply final value by 2 for byte address.
+//                    PCSX2's `bp` is in 256-byte block-pointer
+//                    units; in our FBP (2048-byte) units,
+//                    bp = FBP * 8, so bp*256 = FBP*2048.
+//
+// NOTE on PCSX2 license: the PCSX2 project is GPL-3.0+. This
+// stub re-expresses the same PSMCT16 swizzle math in
+// SystemVerilog as a hardware contract — the values in the
+// blockTable16 / columnTable16 case statements come from PCSX2
+// source and represent the PS2 hardware layout itself (not
+// PCSX2-original creative content). The retroDE_ps2 project
+// authors should consider whether this provenance affects
+// licensing for downstream consumers; from an engineering
+// correctness standpoint, locking against the canonical source
+// is the only way to be byte-accurate to real PS2 VRAM.
+//
+// Real PS2 PSMCT16 layout:
+//   - VRAM is 4 MiB total, organized in 8 KiB pages.
+//   - Each page is 64×64 PSMCT16 pixels (= 64*64*2 = 8192 bytes).
+//     2× as many pixels per page as PSMCT32 (which has 64×32 px)
+//     because each PSMCT16 pixel is 2 bytes vs CT32's 4.
+//   - Each page is divided into a 4×8 grid of blocks (4 cols of
+//     blocks across, 8 rows down). Each block is 16×8 PSMCT16
+//     pixels (= 16*8*2 = 256 bytes). 4×8 = 32 blocks/page.
+//   - Block ordering within a page follows blockTable16, which
+//     differs from blockTable32 because the grid shape is
+//     different (8×4 vs 4×8).
+//   - Within a block, halfword placement follows columnTable16:
+//     a 16×8 → 128-entry permutation that organizes the 4
+//     internal 16×2-pixel sub-columns and interleaves the two
+//     pixel rows per sub-column.
+//
+// Address formula (FBP in 2048-byte units; FBW in 64-pixel
+// units; addr in bytes):
+//   page_x          = x / 64
+//   page_y          = y / 64
+//   page_index      = page_y * FBW + page_x
+//   page_base       = FBP*2048 + page_index*8192
+//
+//   block_x_in_page = (x % 64) / 16             // 0..3
+//   block_y_in_page = (y % 64) / 8              // 0..7
+//   block_idx       = blockTable16[block_y_in_page][block_x_in_page]
+//   block_base      = page_base + block_idx*256
+//
+//   xb              = x % 16
+//   yb              = y % 8
+//   hw_idx          = columnTable16[yb][xb]     // 0..127
+//   addr            = block_base + hw_idx*2
+
+`timescale 1ns/1ps
+
+module gs_swizzle_psmct16_stub
+(
+    input  logic [8:0]  fbp,    // FBP — frame base, in 2048-byte units
+    input  logic [5:0]  fbw,    // FBW — frame width, in 64-pixel units
+    input  logic [11:0] x,
+    input  logic [11:0] y,
+    output logic [31:0] addr
+);
+
+    // log2 of the byte size of each address granule.
+    localparam int FBP_LOG2   = 11;   // FBP unit = 2048 bytes
+    localparam int PAGE_LOG2  = 13;   // page     = 8192 bytes (64×64 px)
+    localparam int BLOCK_LOG2 = 8;    // block    =  256 bytes (16×8 px)
+
+    // Coordinate fields used below:
+    //   x[11:6] / y[11:6] : page column / page row
+    //   x[5:4]  / y[5:3]  : block column (0..3) / block row (0..7) in the page
+    //   x[3:0]  / y[2:0]  : pixel column (0..15) / pixel row (0..7) in the block
+
+    // --------------------------------------------------------------
+    // blockTable16, indexed [y[5:3]][x[5:4]]:
+    //   {  0,  2,  8, 10 }  {  1,  3,  9, 11 }  {  4,  6, 12, 14 }  {  5,  7, 13, 15 }
+    //   { 16, 18, 24, 26 }  { 17, 19, 25, 27 }  { 20, 22, 28, 30 }  { 21, 23, 29, 31 }
+    // Every entry is the bit interleave {by[2], bx[1], by[1], bx[0], by[0]},
+    // so the table is expressed as wiring instead of a 32-entry case.
+    // --------------------------------------------------------------
+    logic [4:0]  blk_slot;
+
+    // --------------------------------------------------------------
+    // columnTable16, indexed [y[2:0]][x[3:0]] → halfword-within-block:
+    //   yb=0:   0  2  8 10 16 18 24 26  1  3  9 11 17 19 25 27
+    //   yb=1:   4  6 12 14 20 22 28 30  5  7 13 15 21 23 29 31
+    //   yb=2:  32 34 40 42 48 50 56 58 33 35 41 43 49 51 57 59
+    //   yb=3:  36 38 44 46 52 54 60 62 37 39 45 47 53 55 61 63
+    //   yb=4:  64 66 72 74 80 82 88 90 65 67 73 75 81 83 89 91
+    //   yb=5:  68 70 76 78 84 86 92 94 69 71 77 79 85 87 93 95
+    //   yb=6:  96 98 104 106 112 114 120 122  97 99 105 107 113 115 121 123
+    //   yb=7: 100 102 108 110 116 118 124 126 101 103 109 111 117 119 125 127
+    // Every entry is {yb[2], yb[1], xb[2], xb[1], yb[0], xb[0], xb[3]}.
+    // --------------------------------------------------------------
+    logic [6:0]  hw_slot;
+
+    logic [31:0] page_no;       // page_row * FBW + page_col
+
+    always_comb begin
+        blk_slot = {y[5], x[5], y[4], x[4], y[3]};
+        hw_slot  = {y[2], y[1], x[2], x[1], y[0], x[0], x[3]};
+        page_no  = (32'(y[11:6]) * 32'(fbw)) + 32'(x[11:6]);
+
+        // FBP*2048 + page*8192 + block*256 + halfword*2
+        addr     = (32'(fbp)      << FBP_LOG2)
+                 + (page_no       << PAGE_LOG2)
+                 + (32'(blk_slot) << BLOCK_LOG2)
+                 + (32'(hw_slot)  << 1);
+    end
+
+endmodule : gs_swizzle_psmct16_stub
@@ -0,0 +1,148 @@
+// retroDE_ps2 — gs_swizzle_psmct32_stub (Ch119)
+//
+// Pure-combinational PSMCT32 page/block swizzle: maps a pixel
+// coordinate (x, y) within a framebuffer at (FBP, FBW) to its
+// physical VRAM byte address using the real PS2 GS layout.
+//
+// THIS MODULE DOES NOT YET REPLACE THE LINEAR ADDRESSING in
+// gs_stub / gs_pcrtc_stub / gif_image_xfer_stub. It is the
+// math primitive that future chapters will wire into the
+// existing address paths to swap "linear FBW*64*y + x*4"
+// for the real GS swizzled addressing. Ch119 establishes the
+// math, locks it against the canonical PCSX2 PSMCT32 block
+// table with a focused TB, and leaves integration to follow-on
+// chapters so the existing 109 TBs stay on the linear path.
+//
+// Real PS2 PSMCT32 layout (per PCSX2 GS source):
+//   - VRAM is 4 MiB total, organized in 8 KiB pages.
+//   - Each page is 64×32 PSMCT32 pixels (= 64×32×4 = 8192 bytes).
+//   - Each page is divided into a 4×8 grid of blocks (4 rows of
+//     blocks, 8 cols of blocks per row), each block 8×8 pixels
+//     (= 256 bytes). 4×8 = 32 blocks/page.
+//   - Block ordering within a page is NOT row-major; it follows
+//     the PSMCT32 swizzle table below (a Z-order-like permutation
+//     of (block_x, block_y) that PCSX2's GSLocalMemoryFunctions.cpp
+//     defines verbatim).
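+//   - Within a block, this stub addresses pixels row-major: pixel
+//     (xb, yb) maps to byte_offset_in_block = yb*32 + xb*4.
+//     NOTE(review): PCSX2's columnTable32 is not the identity (its
+//     first row is 0 1 4 5 8 9 12 13), so real PSMCT32 VRAM appears to
+//     have an intra-block reorder that this row-major mapping omits —
+//     confirm against GSTables.cpp.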
+//
+// Address formula (linear in pages, swizzled in blocks within a
+// page, linear within a block):
+//   pages_per_fbrow = FBW                      // FBW is in 64-px units; PSMCT32 page is 64 px wide
+//   page_x          = x / 64
+//   page_y          = y / 32
+//   page_index      = page_y * pages_per_fbrow + page_x
+//   page_base       = FBP*2048 + page_index*8192
+//
+//   block_x_in_page = (x % 64) / 8             // 0..7
+//   block_y_in_page = (y % 32) / 8             // 0..3
+//   block_idx       = SWIZZLE[block_y_in_page][block_x_in_page]
+//   block_base      = page_base + block_idx*256
+//
+//   xb              = x % 8
+//   yb              = y % 8
+//   byte_in_block   = yb*32 + xb*4
+//
+//   addr            = block_base + byte_in_block
+//
+// FBP is a 9-bit field at 2048-byte granularity, so FBP*2048
+// can land at any 2048-byte boundary in VRAM — including mid-
+// page boundaries (FBP[1:0] != 0). The math here treats FBP*2048
+// as the literal byte base, with the swizzled page/block/pixel
+// offset added on top, which matches real-PS2 behavior. Page-
+// aligned FBP (FBP[1:0]==0) is the common case in our demo, but
+// the address formula is bit-correct for any 2048-byte-aligned
+// FBP, and the focused TB exercises non-page-aligned FBP=1,2,3
+// to lock that.
+
+`timescale 1ns/1ps
+
+module gs_swizzle_psmct32_stub
+#(
+    // 0 (default): row-major addressing inside each 8×8 block
+    //              (yb*32 + xb*4) — the mapping this stub has always
+    //              produced, so existing instances are unaffected.
+    // 1          : apply the PSMCT32 column order inside the block.
+    //              NOTE(review): the order is taken from PCSX2's
+    //              columnTable32 (GSTables.cpp); confirm against that
+    //              table before enabling in an address path.
+    parameter bit COLUMN_SWIZZLE = 1'b0
+)
+(
+    // Framebuffer config (matches FRAME_1 register fields).
+    input  logic [8:0]  fbp,    // FBP   — frame base, in 2048-byte units
+    input  logic [5:0]  fbw,    // FBW   — frame width, in 64-pixel units
+    // Pixel coordinate within the framebuffer.
+    input  logic [11:0] x,
+    input  logic [11:0] y,
+    // Resulting VRAM byte address.
+    output logic [31:0] addr
+);
+
+    // --------------------------------------------------------------
+    // Block swizzle table for PSMCT32 (PCSX2 GSLocalMemoryFunctions.cpp,
+    // psmt32 block order). Indexed [block_y_in_page][block_x_in_page]
+    // with block_x ∈ 0..7 and block_y ∈ 0..3; value is the linear
+    // block index within the page (0..31).
+    // --------------------------------------------------------------
+    function automatic logic [4:0] swizzle_psmct32(
+        input logic [1:0] by,    // block_y_in_page (0..3)
+        input logic [2:0] bx);   // block_x_in_page (0..7)
+        case ({by, bx})
+            5'h00: return 5'd0;    // (0,0)
+            5'h01: return 5'd1;    // (0,1)
+            5'h02: return 5'd4;    // (0,2)
+            5'h03: return 5'd5;    // (0,3)
+            5'h04: return 5'd16;   // (0,4)
+            5'h05: return 5'd17;   // (0,5)
+            5'h06: return 5'd20;   // (0,6)
+            5'h07: return 5'd21;   // (0,7)
+            5'h08: return 5'd2;    // (1,0)
+            5'h09: return 5'd3;    // (1,1)
+            5'h0A: return 5'd6;    // (1,2)
+            5'h0B: return 5'd7;    // (1,3)
+            5'h0C: return 5'd18;   // (1,4)
+            5'h0D: return 5'd19;   // (1,5)
+            5'h0E: return 5'd22;   // (1,6)
+            5'h0F: return 5'd23;   // (1,7)
+            5'h10: return 5'd8;    // (2,0)
+            5'h11: return 5'd9;    // (2,1)
+            5'h12: return 5'd12;   // (2,2)
+            5'h13: return 5'd13;   // (2,3)
+            5'h14: return 5'd24;   // (2,4)
+            5'h15: return 5'd25;   // (2,5)
+            5'h16: return 5'd28;   // (2,6)
+            5'h17: return 5'd29;   // (2,7)
+            5'h18: return 5'd10;   // (3,0)
+            5'h19: return 5'd11;   // (3,1)
+            5'h1A: return 5'd14;   // (3,2)
+            5'h1B: return 5'd15;   // (3,3)
+            5'h1C: return 5'd26;   // (3,4)
+            5'h1D: return 5'd27;   // (3,5)
+            5'h1E: return 5'd30;   // (3,6)
+            default: return 5'd31; // (3,7)
+        endcase
+    endfunction
+
+    // --------------------------------------------------------------
+    // Word-within-block order used when COLUMN_SWIZZLE = 1, indexed
+    // [yb (0..7)][xb (0..7)] → 32-bit word 0..63:
+    //   yb=0:  0  1  4  5  8  9 12 13      yb=4: 32 33 36 37 40 41 44 45
+    //   yb=1:  2  3  6  7 10 11 14 15      yb=5: 34 35 38 39 42 43 46 47
+    //   yb=2: 16 17 20 21 24 25 28 29      yb=6: 48 49 52 53 56 57 60 61
+    //   yb=3: 18 19 22 23 26 27 30 31      yb=7: 50 51 54 55 58 59 62 63
+    // Every entry equals {yb[2], yb[1], xb[2], xb[1], yb[0], xb[0]}.
+    // NOTE(review): table recalled from PCSX2 columnTable32 — verify.
+    // --------------------------------------------------------------
+    function automatic logic [5:0] column_word_psmct32(
+        input logic [2:0] yb,    // pixel row in block (0..7)
+        input logic [2:0] xb);   // pixel col in block (0..7)
+        return {yb[2], yb[1], xb[2], xb[1], yb[0], xb[0]};
+    endfunction
+
+    // Decompose pixel coord into page / block / pixel-in-block.
+    logic [11:0] page_x;
+    logic [11:0] page_y;
+    logic [1:0]  by;
+    logic [2:0]  bx;
+    logic [2:0]  xb;
+    logic [2:0]  yb;
+
+    assign page_x = x >> 6;     // x / 64
+    assign page_y = y >> 5;     // y / 32
+    assign by     = y[4:3];     // (y % 32) / 8
+    assign bx     = x[5:3];     // (x % 64) / 8
+    assign xb     = x[2:0];     // x % 8
+    assign yb     = y[2:0];     // y % 8
+
+    logic [4:0] block_idx;
+    assign block_idx = swizzle_psmct32(by, bx);
+
+    // Address composition.
+    logic [31:0] page_base;
+    logic [31:0] block_base;
+    logic [31:0] byte_in_block;
+    logic [31:0] page_index;
+    assign page_index    = ({20'd0, page_y} * {26'd0, fbw}) + {20'd0, page_x};
+    assign page_base     = ({23'd0, fbp} << 11) + (page_index << 13);   // FBP*2048 + page_index*8192
+    assign block_base    = page_base + ({27'd0, block_idx} << 8);       // + block_idx*256
+    // Elaboration-time choice: column-ordered word*4, or legacy yb*32 + xb*4.
+    assign byte_in_block = COLUMN_SWIZZLE
+                         ? {24'd0, column_word_psmct32(yb, xb), 2'b00}
+                         : ({29'd0, yb} << 5) + ({29'd0, xb} << 2);
+
+    assign addr = block_base + byte_in_block;
+
+endmodule : gs_swizzle_psmct32_stub
@@ -0,0 +1,339 @@
+// retroDE_ps2 — gs_swizzle_psmt4_stub (Ch137)
+//
+// Pure-combinational PSMT4 page/block/column swizzle: maps a
+// pixel coordinate (x, y) within a framebuffer at (FBP, FBW) to
+// its physical VRAM byte address AND high/low nibble select
+// using the real PS2 GS PSMT4 layout. Mirrors Ch119's PSMCT32 +
+// Ch125's PSMCT16 + Ch131's PSMT8 stubs but with PSMT4's wider
+// 32-px-wide block, 32×16 within-block nibble layout, and the
+// half-byte addressing distinction (each PSMT4 pixel is 4 bits;
+// two PSMT4 pixels share a byte). Output `nibble_hi` selects
+// which nibble of the byte at `addr` the pixel occupies.
+//
+// THIS MODULE IS NOT YET WIRED INTO gs_pcrtc_stub /
+// gif_image_xfer_stub / gs_stub. Future chapters will wire it
+// behind a `PSMT4_SWIZZLE`-style parameter gate, mirroring the
+// PSMCT32 (Ch120/121/122), PSMCT16 (Ch126/127/128), and PSMT8
+// (Ch132/133/134) progressions. Default-off keeps the legacy
+// linear PSMT4 TBs (Ch103, Ch106, Ch118, Ch107 e2e palette path)
+// on the linear path. The existing per-bit write_mask 0x0F/0xF0
+// nibble RMW from Ch106/Ch118 will still apply on top of the
+// swizzled byte address — the swizzle doesn't touch the nibble
+// merge logic.
+//
+// SOURCE-TABLE PROVENANCE (per Codex's Ch125/Ch131/Ch137 guidance):
+//   blockTable4    — pcsx2/GS/GSTables.cpp lines 61–69, master
+//                    HEAD commit 3000e113e2b3a76357c08dfa80d3c747f40e2706
+//                    (file blob SHA 3581209b8217378f473f9de22a9dbc8c45ca49b6).
+//                    8 rows × 4 cols, indexed [block_y][block_x].
+//   columnTable4   — pcsx2/GS/GSTables.cpp lines 147–213, same
+//                    commit. 16 rows × 32 cols, indexed [yb][xb],
+//                    values are nibble-within-block (0..511).
+//   Cross-check    — GSLocalMemory.h:558 BlockNumber4 + the
+//                    pxOffset template at GSTables.cpp:247–258
+//                    (blockSize=512, pageSize=16384, pageWidth=128,
+//                    note `blockSize` here is in NIBBLES; byte-
+//                    grain pageSize = 8192 = 16384 nibbles / 2).
+//                    PSMT4 has pageShiftX=7, pageShiftY=7,
+//                    blockShiftX=5, blockShiftY=4,
+//                    m_bwPg = bw >> (pageShiftX - 6) = bw >> 1
+//                    (so FBW must be even for PSMT4 — PCSX2 asserts
+//                    `(bw & 1) == 0` at GSLocalMemory.h:560).
+//                    PCSX2's `bp` is in 256-byte block-pointer
+//                    units; in our FBP (2048-byte) units,
+//                    bp = FBP * 8, so bp*256 = FBP*2048.
+//
+// NOTE on PCSX2 license: the PCSX2 project is GPL-3.0+. This
+// stub re-expresses the same PSMT4 swizzle math in SystemVerilog
+// as a hardware contract — the values in the blockTable4 /
+// columnTable4 case statements come from PCSX2 source and
+// represent the PS2 hardware layout itself (not PCSX2-original
+// creative content). The retroDE_ps2 project authors should
+// consider whether this provenance affects licensing for
+// downstream consumers; from an engineering correctness
+// standpoint, locking against the canonical source is the only
+// way to be byte-accurate to real PS2 VRAM.
+//
+// Real PS2 PSMT4 layout:
+//   - VRAM is 4 MiB total, organized in 8 KiB pages.
+//   - Each page is 128×128 PSMT4 pixels (= 128*128/2 = 8192
+//     bytes — 2 PSMT4 pixels per byte). 2× as many pixels per
+//     page as PSMT8 (128×64), in the same 8192-byte page.
+//   - Each page is divided into a 4×8 grid of blocks (4 cols of
+//     blocks across, 8 rows down). Each block is 32×16 PSMT4
+//     pixels (= 32*16/2 = 256 bytes = 512 nibbles). 4×8 = 32
+//     blocks/page (same number of blocks as the other PSMs).
+//   - Block ordering within a page follows blockTable4 (which
+//     happens to be identical to PSMCT16's blockTable16 — both
+//     PSMs share the same block grid orientation).
+//   - Within a block, NIBBLE placement follows columnTable4: a
+//     32×16 → 512-entry permutation that organizes the internal
+//     8-wide × 4-tall sub-columns and 4-tall row-groups of the
+//     block.
+//
+// Address formula (FBP in 2048-byte units; FBW in 64-pixel
+// units; addr in BYTES; FBW must be even):
+//   page_x          = x / 128
+//   page_y          = y / 128
+//   bw_pg           = FBW / 2                    // pages per row
+//   page_index      = page_y * bw_pg + page_x
+//   page_base       = FBP*2048 + page_index*8192
+//
+//   block_x_in_page = (x % 128) / 32             // 0..3
+//   block_y_in_page = (y % 128) / 16             // 0..7
+//   block_idx       = blockTable4[block_y_in_page][block_x_in_page]
+//   block_base      = page_base + block_idx*256
+//
+//   xb              = x % 32
+//   yb              = y % 16
+//   nibble_idx      = columnTable4[yb][xb]       // 0..511
+//   byte_in_block   = nibble_idx >> 1            // 0..255
+//   addr            = block_base + byte_in_block
+//   nibble_hi       = nibble_idx[0]              // 0=low nibble, 1=high
+
+`timescale 1ns/1ps
+
+module gs_swizzle_psmt4_stub
+(
+    input  logic [8:0]  fbp,    // FBP — frame base, in 2048-byte units
+    input  logic [5:0]  fbw,    // FBW — frame width, in 64-pixel units (must be even)
+    input  logic [11:0] x,
+    input  logic [11:0] y,
+    output logic [31:0] addr,
+    output logic        nibble_hi
+);
+
+    // --------------------------------------------------------------
+    // blockTable4 (verbatim from pcsx2/GS/GSTables.cpp lines 61–69).
+    // Indexed [block_y_in_page (0..7)][block_x_in_page (0..3)].
+    //   by=0: {  0,  2,  8, 10 }
+    //   by=1: {  1,  3,  9, 11 }
+    //   by=2: {  4,  6, 12, 14 }
+    //   by=3: {  5,  7, 13, 15 }
+    //   by=4: { 16, 18, 24, 26 }
+    //   by=5: { 17, 19, 25, 27 }
+    //   by=6: { 20, 22, 28, 30 }
+    //   by=7: { 21, 23, 29, 31 }
+    // --------------------------------------------------------------
+    function automatic logic [4:0] swizzle_psmt4(
+        input logic [2:0] by,
+        input logic [1:0] bx);
+        case ({by, bx})
+            5'd0:  return 5'd0;   5'd1:  return 5'd2;   5'd2:  return 5'd8;   5'd3:  return 5'd10;
+            5'd4:  return 5'd1;   5'd5:  return 5'd3;   5'd6:  return 5'd9;   5'd7:  return 5'd11;
+            5'd8:  return 5'd4;   5'd9:  return 5'd6;   5'd10: return 5'd12;  5'd11: return 5'd14;
+            5'd12: return 5'd5;   5'd13: return 5'd7;   5'd14: return 5'd13;  5'd15: return 5'd15;
+            5'd16: return 5'd16;  5'd17: return 5'd18;  5'd18: return 5'd24;  5'd19: return 5'd26;
+            5'd20: return 5'd17;  5'd21: return 5'd19;  5'd22: return 5'd25;  5'd23: return 5'd27;
+            5'd24: return 5'd20;  5'd25: return 5'd22;  5'd26: return 5'd28;  5'd27: return 5'd30;
+            5'd28: return 5'd21;  5'd29: return 5'd23;  5'd30: return 5'd29;  default: return 5'd31;
+        endcase
+    endfunction
+
+    // --------------------------------------------------------------
+    // columnTable4 (verbatim from pcsx2/GS/GSTables.cpp lines 147–213).
+    // Indexed [yb (0..15)][xb (0..31)] → nibble-within-block 0..511.
+    // 512 entries as one case on {yb, xb} (4+5 = 9 bits); key 511 is `default`.
+    // Each "yb=N:" comment pair quotes a table row: xb = 0..15, then xb = 16..31.
+    // --------------------------------------------------------------
+    function automatic logic [8:0] col_idx_psmt4(
+        input logic [3:0] yb,   // pixel row within the block
+        input logic [4:0] xb);  // pixel column within the block
+        case ({yb, xb})         // key = yb*32 + xb
+            // yb=0:   0   8  32  40  64  72  96 104   2  10  34  42  66  74  98 106
+            //         4  12  36  44  68  76 100 108   6  14  38  46  70  78 102 110
+            9'd0:   return 9'd0;   9'd1:   return 9'd8;   9'd2:   return 9'd32;  9'd3:   return 9'd40;
+            9'd4:   return 9'd64;  9'd5:   return 9'd72;  9'd6:   return 9'd96;  9'd7:   return 9'd104;
+            9'd8:   return 9'd2;   9'd9:   return 9'd10;  9'd10:  return 9'd34;  9'd11:  return 9'd42;
+            9'd12:  return 9'd66;  9'd13:  return 9'd74;  9'd14:  return 9'd98;  9'd15:  return 9'd106;
+            9'd16:  return 9'd4;   9'd17:  return 9'd12;  9'd18:  return 9'd36;  9'd19:  return 9'd44;
+            9'd20:  return 9'd68;  9'd21:  return 9'd76;  9'd22:  return 9'd100; 9'd23:  return 9'd108;
+            9'd24:  return 9'd6;   9'd25:  return 9'd14;  9'd26:  return 9'd38;  9'd27:  return 9'd46;
+            9'd28:  return 9'd70;  9'd29:  return 9'd78;  9'd30:  return 9'd102; 9'd31:  return 9'd110;
+            // yb=1:  16  24  48  56  80  88 112 120  18  26  50  58  82  90 114 122
+            //        20  28  52  60  84  92 116 124  22  30  54  62  86  94 118 126
+            9'd32:  return 9'd16;  9'd33:  return 9'd24;  9'd34:  return 9'd48;  9'd35:  return 9'd56;
+            9'd36:  return 9'd80;  9'd37:  return 9'd88;  9'd38:  return 9'd112; 9'd39:  return 9'd120;
+            9'd40:  return 9'd18;  9'd41:  return 9'd26;  9'd42:  return 9'd50;  9'd43:  return 9'd58;
+            9'd44:  return 9'd82;  9'd45:  return 9'd90;  9'd46:  return 9'd114; 9'd47:  return 9'd122;
+            9'd48:  return 9'd20;  9'd49:  return 9'd28;  9'd50:  return 9'd52;  9'd51:  return 9'd60;
+            9'd52:  return 9'd84;  9'd53:  return 9'd92;  9'd54:  return 9'd116; 9'd55:  return 9'd124;
+            9'd56:  return 9'd22;  9'd57:  return 9'd30;  9'd58:  return 9'd54;  9'd59:  return 9'd62;
+            9'd60:  return 9'd86;  9'd61:  return 9'd94;  9'd62:  return 9'd118; 9'd63:  return 9'd126;
+            // yb=2:  65  73  97 105   1   9  33  41  67  75  99 107   3  11  35  43
+            //        69  77 101 109   5  13  37  45  71  79 103 111   7  15  39  47
+            9'd64:  return 9'd65;  9'd65:  return 9'd73;  9'd66:  return 9'd97;  9'd67:  return 9'd105;
+            9'd68:  return 9'd1;   9'd69:  return 9'd9;   9'd70:  return 9'd33;  9'd71:  return 9'd41;
+            9'd72:  return 9'd67;  9'd73:  return 9'd75;  9'd74:  return 9'd99;  9'd75:  return 9'd107;
+            9'd76:  return 9'd3;   9'd77:  return 9'd11;  9'd78:  return 9'd35;  9'd79:  return 9'd43;
+            9'd80:  return 9'd69;  9'd81:  return 9'd77;  9'd82:  return 9'd101; 9'd83:  return 9'd109;
+            9'd84:  return 9'd5;   9'd85:  return 9'd13;  9'd86:  return 9'd37;  9'd87:  return 9'd45;
+            9'd88:  return 9'd71;  9'd89:  return 9'd79;  9'd90:  return 9'd103; 9'd91:  return 9'd111;
+            9'd92:  return 9'd7;   9'd93:  return 9'd15;  9'd94:  return 9'd39;  9'd95:  return 9'd47;
+            // yb=3:  81  89 113 121  17  25  49  57  83  91 115 123  19  27  51  59
+            //        85  93 117 125  21  29  53  61  87  95 119 127  23  31  55  63
+            9'd96:  return 9'd81;  9'd97:  return 9'd89;  9'd98:  return 9'd113; 9'd99:  return 9'd121;
+            9'd100: return 9'd17;  9'd101: return 9'd25;  9'd102: return 9'd49;  9'd103: return 9'd57;
+            9'd104: return 9'd83;  9'd105: return 9'd91;  9'd106: return 9'd115; 9'd107: return 9'd123;
+            9'd108: return 9'd19;  9'd109: return 9'd27;  9'd110: return 9'd51;  9'd111: return 9'd59;
+            9'd112: return 9'd85;  9'd113: return 9'd93;  9'd114: return 9'd117; 9'd115: return 9'd125;
+            9'd116: return 9'd21;  9'd117: return 9'd29;  9'd118: return 9'd53;  9'd119: return 9'd61;
+            9'd120: return 9'd87;  9'd121: return 9'd95;  9'd122: return 9'd119; 9'd123: return 9'd127;
+            9'd124: return 9'd23;  9'd125: return 9'd31;  9'd126: return 9'd55;  9'd127: return 9'd63;
+            // yb=4: 192 200 224 232 128 136 160 168 194 202 226 234 130 138 162 170
+            //       196 204 228 236 132 140 164 172 198 206 230 238 134 142 166 174
+            9'd128: return 9'd192; 9'd129: return 9'd200; 9'd130: return 9'd224; 9'd131: return 9'd232;
+            9'd132: return 9'd128; 9'd133: return 9'd136; 9'd134: return 9'd160; 9'd135: return 9'd168;
+            9'd136: return 9'd194; 9'd137: return 9'd202; 9'd138: return 9'd226; 9'd139: return 9'd234;
+            9'd140: return 9'd130; 9'd141: return 9'd138; 9'd142: return 9'd162; 9'd143: return 9'd170;
+            9'd144: return 9'd196; 9'd145: return 9'd204; 9'd146: return 9'd228; 9'd147: return 9'd236;
+            9'd148: return 9'd132; 9'd149: return 9'd140; 9'd150: return 9'd164; 9'd151: return 9'd172;
+            9'd152: return 9'd198; 9'd153: return 9'd206; 9'd154: return 9'd230; 9'd155: return 9'd238;
+            9'd156: return 9'd134; 9'd157: return 9'd142; 9'd158: return 9'd166; 9'd159: return 9'd174;
+            // yb=5: 208 216 240 248 144 152 176 184 210 218 242 250 146 154 178 186
+            //       212 220 244 252 148 156 180 188 214 222 246 254 150 158 182 190
+            9'd160: return 9'd208; 9'd161: return 9'd216; 9'd162: return 9'd240; 9'd163: return 9'd248;
+            9'd164: return 9'd144; 9'd165: return 9'd152; 9'd166: return 9'd176; 9'd167: return 9'd184;
+            9'd168: return 9'd210; 9'd169: return 9'd218; 9'd170: return 9'd242; 9'd171: return 9'd250;
+            9'd172: return 9'd146; 9'd173: return 9'd154; 9'd174: return 9'd178; 9'd175: return 9'd186;
+            9'd176: return 9'd212; 9'd177: return 9'd220; 9'd178: return 9'd244; 9'd179: return 9'd252;
+            9'd180: return 9'd148; 9'd181: return 9'd156; 9'd182: return 9'd180; 9'd183: return 9'd188;
+            9'd184: return 9'd214; 9'd185: return 9'd222; 9'd186: return 9'd246; 9'd187: return 9'd254;
+            9'd188: return 9'd150; 9'd189: return 9'd158; 9'd190: return 9'd182; 9'd191: return 9'd190;
+            // yb=6: 129 137 161 169 193 201 225 233 131 139 163 171 195 203 227 235
+            //       133 141 165 173 197 205 229 237 135 143 167 175 199 207 231 239
+            9'd192: return 9'd129; 9'd193: return 9'd137; 9'd194: return 9'd161; 9'd195: return 9'd169;
+            9'd196: return 9'd193; 9'd197: return 9'd201; 9'd198: return 9'd225; 9'd199: return 9'd233;
+            9'd200: return 9'd131; 9'd201: return 9'd139; 9'd202: return 9'd163; 9'd203: return 9'd171;
+            9'd204: return 9'd195; 9'd205: return 9'd203; 9'd206: return 9'd227; 9'd207: return 9'd235;
+            9'd208: return 9'd133; 9'd209: return 9'd141; 9'd210: return 9'd165; 9'd211: return 9'd173;
+            9'd212: return 9'd197; 9'd213: return 9'd205; 9'd214: return 9'd229; 9'd215: return 9'd237;
+            9'd216: return 9'd135; 9'd217: return 9'd143; 9'd218: return 9'd167; 9'd219: return 9'd175;
+            9'd220: return 9'd199; 9'd221: return 9'd207; 9'd222: return 9'd231; 9'd223: return 9'd239;
+            // yb=7: 145 153 177 185 209 217 241 249 147 155 179 187 211 219 243 251
+            //       149 157 181 189 213 221 245 253 151 159 183 191 215 223 247 255
+            9'd224: return 9'd145; 9'd225: return 9'd153; 9'd226: return 9'd177; 9'd227: return 9'd185;
+            9'd228: return 9'd209; 9'd229: return 9'd217; 9'd230: return 9'd241; 9'd231: return 9'd249;
+            9'd232: return 9'd147; 9'd233: return 9'd155; 9'd234: return 9'd179; 9'd235: return 9'd187;
+            9'd236: return 9'd211; 9'd237: return 9'd219; 9'd238: return 9'd243; 9'd239: return 9'd251;
+            9'd240: return 9'd149; 9'd241: return 9'd157; 9'd242: return 9'd181; 9'd243: return 9'd189;
+            9'd244: return 9'd213; 9'd245: return 9'd221; 9'd246: return 9'd245; 9'd247: return 9'd253;
+            9'd248: return 9'd151; 9'd249: return 9'd159; 9'd250: return 9'd183; 9'd251: return 9'd191;
+            9'd252: return 9'd215; 9'd253: return 9'd223; 9'd254: return 9'd247; 9'd255: return 9'd255;
+            // yb=8: 256 264 288 296 320 328 352 360 258 266 290 298 322 330 354 362
+            //       260 268 292 300 324 332 356 364 262 270 294 302 326 334 358 366
+            9'd256: return 9'd256; 9'd257: return 9'd264; 9'd258: return 9'd288; 9'd259: return 9'd296;
+            9'd260: return 9'd320; 9'd261: return 9'd328; 9'd262: return 9'd352; 9'd263: return 9'd360;
+            9'd264: return 9'd258; 9'd265: return 9'd266; 9'd266: return 9'd290; 9'd267: return 9'd298;
+            9'd268: return 9'd322; 9'd269: return 9'd330; 9'd270: return 9'd354; 9'd271: return 9'd362;
+            9'd272: return 9'd260; 9'd273: return 9'd268; 9'd274: return 9'd292; 9'd275: return 9'd300;
+            9'd276: return 9'd324; 9'd277: return 9'd332; 9'd278: return 9'd356; 9'd279: return 9'd364;
+            9'd280: return 9'd262; 9'd281: return 9'd270; 9'd282: return 9'd294; 9'd283: return 9'd302;
+            9'd284: return 9'd326; 9'd285: return 9'd334; 9'd286: return 9'd358; 9'd287: return 9'd366;
+            // yb=9: 272 280 304 312 336 344 368 376 274 282 306 314 338 346 370 378
+            //       276 284 308 316 340 348 372 380 278 286 310 318 342 350 374 382
+            9'd288: return 9'd272; 9'd289: return 9'd280; 9'd290: return 9'd304; 9'd291: return 9'd312;
+            9'd292: return 9'd336; 9'd293: return 9'd344; 9'd294: return 9'd368; 9'd295: return 9'd376;
+            9'd296: return 9'd274; 9'd297: return 9'd282; 9'd298: return 9'd306; 9'd299: return 9'd314;
+            9'd300: return 9'd338; 9'd301: return 9'd346; 9'd302: return 9'd370; 9'd303: return 9'd378;
+            9'd304: return 9'd276; 9'd305: return 9'd284; 9'd306: return 9'd308; 9'd307: return 9'd316;
+            9'd308: return 9'd340; 9'd309: return 9'd348; 9'd310: return 9'd372; 9'd311: return 9'd380;
+            9'd312: return 9'd278; 9'd313: return 9'd286; 9'd314: return 9'd310; 9'd315: return 9'd318;
+            9'd316: return 9'd342; 9'd317: return 9'd350; 9'd318: return 9'd374; 9'd319: return 9'd382;
+            // yb=10: 321 329 353 361 257 265 289 297 323 331 355 363 259 267 291 299
+            //        325 333 357 365 261 269 293 301 327 335 359 367 263 271 295 303
+            9'd320: return 9'd321; 9'd321: return 9'd329; 9'd322: return 9'd353; 9'd323: return 9'd361;
+            9'd324: return 9'd257; 9'd325: return 9'd265; 9'd326: return 9'd289; 9'd327: return 9'd297;
+            9'd328: return 9'd323; 9'd329: return 9'd331; 9'd330: return 9'd355; 9'd331: return 9'd363;
+            9'd332: return 9'd259; 9'd333: return 9'd267; 9'd334: return 9'd291; 9'd335: return 9'd299;
+            9'd336: return 9'd325; 9'd337: return 9'd333; 9'd338: return 9'd357; 9'd339: return 9'd365;
+            9'd340: return 9'd261; 9'd341: return 9'd269; 9'd342: return 9'd293; 9'd343: return 9'd301;
+            9'd344: return 9'd327; 9'd345: return 9'd335; 9'd346: return 9'd359; 9'd347: return 9'd367;
+            9'd348: return 9'd263; 9'd349: return 9'd271; 9'd350: return 9'd295; 9'd351: return 9'd303;
+            // yb=11: 337 345 369 377 273 281 305 313 339 347 371 379 275 283 307 315
+            //        341 349 373 381 277 285 309 317 343 351 375 383 279 287 311 319
+            9'd352: return 9'd337; 9'd353: return 9'd345; 9'd354: return 9'd369; 9'd355: return 9'd377;
+            9'd356: return 9'd273; 9'd357: return 9'd281; 9'd358: return 9'd305; 9'd359: return 9'd313;
+            9'd360: return 9'd339; 9'd361: return 9'd347; 9'd362: return 9'd371; 9'd363: return 9'd379;
+            9'd364: return 9'd275; 9'd365: return 9'd283; 9'd366: return 9'd307; 9'd367: return 9'd315;
+            9'd368: return 9'd341; 9'd369: return 9'd349; 9'd370: return 9'd373; 9'd371: return 9'd381;
+            9'd372: return 9'd277; 9'd373: return 9'd285; 9'd374: return 9'd309; 9'd375: return 9'd317;
+            9'd376: return 9'd343; 9'd377: return 9'd351; 9'd378: return 9'd375; 9'd379: return 9'd383;
+            9'd380: return 9'd279; 9'd381: return 9'd287; 9'd382: return 9'd311; 9'd383: return 9'd319;
+            // yb=12: 448 456 480 488 384 392 416 424 450 458 482 490 386 394 418 426
+            //        452 460 484 492 388 396 420 428 454 462 486 494 390 398 422 430
+            9'd384: return 9'd448; 9'd385: return 9'd456; 9'd386: return 9'd480; 9'd387: return 9'd488;
+            9'd388: return 9'd384; 9'd389: return 9'd392; 9'd390: return 9'd416; 9'd391: return 9'd424;
+            9'd392: return 9'd450; 9'd393: return 9'd458; 9'd394: return 9'd482; 9'd395: return 9'd490;
+            9'd396: return 9'd386; 9'd397: return 9'd394; 9'd398: return 9'd418; 9'd399: return 9'd426;
+            9'd400: return 9'd452; 9'd401: return 9'd460; 9'd402: return 9'd484; 9'd403: return 9'd492;
+            9'd404: return 9'd388; 9'd405: return 9'd396; 9'd406: return 9'd420; 9'd407: return 9'd428;
+            9'd408: return 9'd454; 9'd409: return 9'd462; 9'd410: return 9'd486; 9'd411: return 9'd494;
+            9'd412: return 9'd390; 9'd413: return 9'd398; 9'd414: return 9'd422; 9'd415: return 9'd430;
+            // yb=13: 464 472 496 504 400 408 432 440 466 474 498 506 402 410 434 442
+            //        468 476 500 508 404 412 436 444 470 478 502 510 406 414 438 446
+            9'd416: return 9'd464; 9'd417: return 9'd472; 9'd418: return 9'd496; 9'd419: return 9'd504;
+            9'd420: return 9'd400; 9'd421: return 9'd408; 9'd422: return 9'd432; 9'd423: return 9'd440;
+            9'd424: return 9'd466; 9'd425: return 9'd474; 9'd426: return 9'd498; 9'd427: return 9'd506;
+            9'd428: return 9'd402; 9'd429: return 9'd410; 9'd430: return 9'd434; 9'd431: return 9'd442;
+            9'd432: return 9'd468; 9'd433: return 9'd476; 9'd434: return 9'd500; 9'd435: return 9'd508;
+            9'd436: return 9'd404; 9'd437: return 9'd412; 9'd438: return 9'd436; 9'd439: return 9'd444;
+            9'd440: return 9'd470; 9'd441: return 9'd478; 9'd442: return 9'd502; 9'd443: return 9'd510;
+            9'd444: return 9'd406; 9'd445: return 9'd414; 9'd446: return 9'd438; 9'd447: return 9'd446;
+            // yb=14: 385 393 417 425 449 457 481 489 387 395 419 427 451 459 483 491
+            //        389 397 421 429 453 461 485 493 391 399 423 431 455 463 487 495
+            9'd448: return 9'd385; 9'd449: return 9'd393; 9'd450: return 9'd417; 9'd451: return 9'd425;
+            9'd452: return 9'd449; 9'd453: return 9'd457; 9'd454: return 9'd481; 9'd455: return 9'd489;
+            9'd456: return 9'd387; 9'd457: return 9'd395; 9'd458: return 9'd419; 9'd459: return 9'd427;
+            9'd460: return 9'd451; 9'd461: return 9'd459; 9'd462: return 9'd483; 9'd463: return 9'd491;
+            9'd464: return 9'd389; 9'd465: return 9'd397; 9'd466: return 9'd421; 9'd467: return 9'd429;
+            9'd468: return 9'd453; 9'd469: return 9'd461; 9'd470: return 9'd485; 9'd471: return 9'd493;
+            9'd472: return 9'd391; 9'd473: return 9'd399; 9'd474: return 9'd423; 9'd475: return 9'd431;
+            9'd476: return 9'd455; 9'd477: return 9'd463; 9'd478: return 9'd487; 9'd479: return 9'd495;
+            // yb=15: 401 409 433 441 465 473 497 505 403 411 435 443 467 475 499 507
+            //        405 413 437 445 469 477 501 509 407 415 439 447 471 479 503 511
+            9'd480: return 9'd401; 9'd481: return 9'd409; 9'd482: return 9'd433; 9'd483: return 9'd441;
+            9'd484: return 9'd465; 9'd485: return 9'd473; 9'd486: return 9'd497; 9'd487: return 9'd505;
+            9'd488: return 9'd403; 9'd489: return 9'd411; 9'd490: return 9'd435; 9'd491: return 9'd443;
+            9'd492: return 9'd467; 9'd493: return 9'd475; 9'd494: return 9'd499; 9'd495: return 9'd507;
+            9'd496: return 9'd405; 9'd497: return 9'd413; 9'd498: return 9'd437; 9'd499: return 9'd445;
+            9'd500: return 9'd469; 9'd501: return 9'd477; 9'd502: return 9'd501; 9'd503: return 9'd509;
+            9'd504: return 9'd407; 9'd505: return 9'd415; 9'd506: return 9'd439; 9'd507: return 9'd447;
+            9'd508: return 9'd471; 9'd509: return 9'd479; 9'd510: return 9'd503; default: return 9'd511;  // default = key 511 (yb=15, xb=31)
+        endcase
+    endfunction
+
+    // Decompose pixel coord into page / block / pixel-in-block.
+    logic [11:0] page_x;        // page column index
+    logic [11:0] page_y;        // page row index
+    logic [2:0]  by;            // block row within the page (0..7)
+    logic [1:0]  bx;            // block column within the page (0..3)
+    logic [4:0]  xb;            // pixel column within the block (0..31)
+    logic [3:0]  yb;            // pixel row within the block (0..15)
+    logic [5:0]  bw_pg;         // pages per framebuffer row
+
+    assign page_x = x >> 7;     // x / 128
+    assign page_y = y >> 7;     // y / 128
+    assign by     = y[6:4];     // (y % 128) / 16
+    assign bx     = x[6:5];     // (x % 128) / 32
+    assign xb     = x[4:0];     // x % 32
+    assign yb     = y[3:0];     // y % 16
+    assign bw_pg  = fbw >> 1;   // FBW / 2 (FBW must be even; an odd FBW is floored here)
+
+    logic [4:0] block_idx;      // blockTable4 result: block slot within the page
+    assign block_idx = swizzle_psmt4(by, bx);
+
+    logic [8:0] nibble_idx;     // columnTable4 result: nibble slot within the block
+    assign nibble_idx = col_idx_psmt4(yb, xb);
+
+    logic [31:0] page_base;     // byte address of the containing page
+    logic [31:0] block_base;    // byte address of the containing block
+    logic [31:0] page_index;    // linear page number counted from the frame base
+    assign page_index    = ({20'd0, page_y} * {26'd0, bw_pg}) + {20'd0, page_x};
+    assign page_base     = ({23'd0, fbp} << 11) + (page_index << 13);   // FBP*2048 + page_index*8192
+    assign block_base    = page_base + ({27'd0, block_idx} << 8);       // + block_idx*256
+    // Pad is 24 bits so the 8-bit byte offset forms a full 32-bit operand (was 23 → 31 bits).
+    assign addr      = block_base + {24'd0, nibble_idx[8:1]};            // byte_in_block = nibble_idx >> 1
+    assign nibble_hi = nibble_idx[0];                                     // 0=low, 1=high
+
+endmodule : gs_swizzle_psmt4_stub
@@ -0,0 +1,259 @@
+// retroDE_ps2 — gs_swizzle_psmt8_stub (Ch131)
+//
+// Pure-combinational PSMT8 page/block/column swizzle: maps a
+// pixel coordinate (x, y) within a framebuffer at (FBP, FBW) to
+// its physical VRAM byte address using the real PS2 GS PSMT8
+// layout. Mirrors Ch119's `gs_swizzle_psmct32_stub` and Ch125's
+// `gs_swizzle_psmct16_stub` shape, but with PSMT8's wider page
+// (128 px vs 64 px), 8-cols × 4-rows page block grid, and the
+// 16×16 within-block column table.
+//
+// THIS MODULE IS NOT YET WIRED INTO gs_pcrtc_stub /
+// gif_image_xfer_stub / gs_stub. Future chapters will wire it
+// behind a `PSMT8_SWIZZLE`-style parameter gate, mirroring the
+// PSMCT32 (Ch120/121/122) and PSMCT16 (Ch126/127/128) progressions.
+// Default-off keeps the legacy linear PSMT8 TBs (Ch96, Ch97,
+// Ch103, Ch105, Ch107, Ch117) on the linear path.
+//
+// SOURCE-TABLE PROVENANCE (per Codex's Ch125/Ch131 guidance):
+//   blockTable8    — pcsx2/GS/GSTables.cpp lines 53–59, master
+//                    HEAD commit 3000e113e2b3a76357c08dfa80d3c747f40e2706
+//                    (file blob SHA 3581209b8217378f473f9de22a9dbc8c45ca49b6).
+//                    4 rows × 8 cols, indexed [block_y][block_x].
+//   columnTable8   — pcsx2/GS/GSTables.cpp lines 111–145, same
+//                    commit. 16 rows × 16 cols, indexed [yb][xb],
+//                    values are byte-within-block (0..255).
+//   Cross-check    — GSLocalMemory.h line 551 BlockNumber8 +
+//                    pxOffset template at GSTables.cpp lines 247–258
+//                    (blockSize=256, pageSize=8192, pageWidth=128).
+//                    PSMT8 has pageShiftX=7, pageShiftY=6,
+//                    blockShiftX=4, blockShiftY=4,
+//                    m_bwPg = bw >> (pageShiftX - 6) = bw >> 1
+//                    (so FBW must be even for PSMT8 — PCSX2 asserts
+//                    `(bw & 1) == 0` at GSLocalMemory.h:553).
+//                    PCSX2's `bp` is in 256-byte block-pointer
+//                    units; in our FBP (2048-byte) units,
+//                    bp = FBP * 8, so bp*256 = FBP*2048.
+//
+// NOTE on PCSX2 license: the PCSX2 project is GPL-3.0+. This
+// stub re-expresses the same PSMT8 swizzle math in SystemVerilog
+// as a hardware contract — the values in the blockTable8 /
+// columnTable8 case statements come from PCSX2 source and
+// represent the PS2 hardware layout itself (not PCSX2-original
+// creative content). The retroDE_ps2 project authors should
+// consider whether this provenance affects licensing for
+// downstream consumers; from an engineering correctness
+// standpoint, locking against the canonical source is the only
+// way to be byte-accurate to real PS2 VRAM.
+//
+// Real PS2 PSMT8 layout:
+//   - VRAM is 4 MiB total, organized in 8 KiB pages.
+//   - Each page is 128×64 PSMT8 pixels (= 128*64*1 = 8192 bytes).
+//     2× as many pixels per page as PSMCT16 (which has 64×64 px)
+//     and 4× as many as PSMCT32 (64×32 px) because each PSMT8
+//     pixel is only 1 byte vs CT16's 2 vs CT32's 4.
+//   - Each page is divided into an 8×4 grid of blocks (8 cols of
+//     blocks across, 4 rows down). Each block is 16×16 PSMT8
+//     pixels (= 16*16*1 = 256 bytes). 8×4 = 32 blocks/page.
+//   - Block ordering within a page follows blockTable8.
+//   - Within a block, byte placement follows columnTable8: a
+//     16×16 → 256-entry permutation built from 4 stacked 64-byte
+//     groups (yb 0–3, 4–7, 8–11, 12–15). Inside a group, the first
+//     two rows take even byte offsets and the last two take odd ones.
+//
+// Address formula (FBP in 2048-byte units; FBW in 64-pixel
+// units; addr in bytes; FBW must be even):
+//   page_x          = x / 128
+//   page_y          = y / 64
+//   bw_pg           = FBW / 2                    // pages per row
+//   page_index      = page_y * bw_pg + page_x
+//   page_base       = FBP*2048 + page_index*8192
+//
+//   block_x_in_page = (x % 128) / 16             // 0..7
+//   block_y_in_page = (y % 64)  / 16             // 0..3
+//   block_idx       = blockTable8[block_y_in_page][block_x_in_page]
+//   block_base      = page_base + block_idx*256
+//
+//   xb              = x % 16
+//   yb              = y % 16
+//   byte_idx        = columnTable8[yb][xb]       // 0..255
+//   addr            = block_base + byte_idx
+
+`timescale 1ns/1ps
+
+module gs_swizzle_psmt8_stub   // combinational: (FBP, FBW, x, y) → PSMT8 VRAM byte address
+(
+    input  logic [8:0]  fbp,    // FBP — frame base, in 2048-byte units
+    input  logic [5:0]  fbw,    // FBW — frame width, in 64-pixel units (must be even)
+    input  logic [11:0] x,      // pixel column within the framebuffer
+    input  logic [11:0] y,      // pixel row within the framebuffer
+    output logic [31:0] addr    // swizzled VRAM byte address of pixel (x, y)
+);
+
+    // --------------------------------------------------------------
+    // blockTable8 (verbatim from pcsx2/GS/GSTables.cpp lines 53–59).
+    // Indexed [block_y_in_page (0..3)][block_x_in_page (0..7)].
+    //   by=0: { 0, 1, 4, 5,16,17,20,21}
+    //   by=1: { 2, 3, 6, 7,18,19,22,23}
+    //   by=2: { 8, 9,12,13,24,25,28,29}
+    //   by=3: {10,11,14,15,26,27,30,31}
+    // --------------------------------------------------------------
+    function automatic logic [4:0] swizzle_psmt8(
+        input logic [1:0] by,   // block row within the page
+        input logic [2:0] bx);  // block column within the page
+        case ({by, bx})         // key = by*8 + bx
+            // by=0
+            5'd0:  return 5'd0;   5'd1:  return 5'd1;   5'd2:  return 5'd4;   5'd3:  return 5'd5;
+            5'd4:  return 5'd16;  5'd5:  return 5'd17;  5'd6:  return 5'd20;  5'd7:  return 5'd21;
+            // by=1
+            5'd8:  return 5'd2;   5'd9:  return 5'd3;   5'd10: return 5'd6;   5'd11: return 5'd7;
+            5'd12: return 5'd18;  5'd13: return 5'd19;  5'd14: return 5'd22;  5'd15: return 5'd23;
+            // by=2
+            5'd16: return 5'd8;   5'd17: return 5'd9;   5'd18: return 5'd12;  5'd19: return 5'd13;
+            5'd20: return 5'd24;  5'd21: return 5'd25;  5'd22: return 5'd28;  5'd23: return 5'd29;
+            // by=3 (key 31, i.e. by=3/bx=7, is carried by `default`)
+            5'd24: return 5'd10;  5'd25: return 5'd11;  5'd26: return 5'd14;  5'd27: return 5'd15;
+            5'd28: return 5'd26;  5'd29: return 5'd27;  5'd30: return 5'd30;  default: return 5'd31;
+        endcase
+    endfunction
+
+    // --------------------------------------------------------------
+    // columnTable8 (verbatim from pcsx2/GS/GSTables.cpp lines 111–145).
+    // Indexed [yb (0..15)][xb (0..15)] → byte-within-block 0..255.
+    //   yb=0:    0   4  16  20  32  36  48  52   2   6  18  22  34  38  50  54
+    //   yb=1:    8  12  24  28  40  44  56  60  10  14  26  30  42  46  58  62
+    //   yb=2:   33  37  49  53   1   5  17  21  35  39  51  55   3   7  19  23
+    //   yb=3:   41  45  57  61   9  13  25  29  43  47  59  63  11  15  27  31
+    //   yb=4:   96 100 112 116  64  68  80  84  98 102 114 118  66  70  82  86
+    //   yb=5:  104 108 120 124  72  76  88  92 106 110 122 126  74  78  90  94
+    //   yb=6:   65  69  81  85  97 101 113 117  67  71  83  87  99 103 115 119
+    //   yb=7:   73  77  89  93 105 109 121 125  75  79  91  95 107 111 123 127
+    //   yb=8:  128 132 144 148 160 164 176 180 130 134 146 150 162 166 178 182
+    //   yb=9:  136 140 152 156 168 172 184 188 138 142 154 158 170 174 186 190
+    //   yb=10: 161 165 177 181 129 133 145 149 163 167 179 183 131 135 147 151
+    //   yb=11: 169 173 185 189 137 141 153 157 171 175 187 191 139 143 155 159
+    //   yb=12: 224 228 240 244 192 196 208 212 226 230 242 246 194 198 210 214
+    //   yb=13: 232 236 248 252 200 204 216 220 234 238 250 254 202 206 218 222
+    //   yb=14: 193 197 209 213 225 229 241 245 195 199 211 215 227 231 243 247
+    //   yb=15: 201 205 217 221 233 237 249 253 203 207 219 223 235 239 251 255
+    // --------------------------------------------------------------
+    function automatic logic [7:0] col_idx_psmt8(
+        input logic [3:0] yb,   // pixel row within the block
+        input logic [3:0] xb);  // pixel column within the block
+        case ({yb, xb})         // key = yb*16 + xb
+            // yb=0
+            8'd0:   return 8'd0;   8'd1:   return 8'd4;   8'd2:   return 8'd16;  8'd3:   return 8'd20;
+            8'd4:   return 8'd32;  8'd5:   return 8'd36;  8'd6:   return 8'd48;  8'd7:   return 8'd52;
+            8'd8:   return 8'd2;   8'd9:   return 8'd6;   8'd10:  return 8'd18;  8'd11:  return 8'd22;
+            8'd12:  return 8'd34;  8'd13:  return 8'd38;  8'd14:  return 8'd50;  8'd15:  return 8'd54;
+            // yb=1
+            8'd16:  return 8'd8;   8'd17:  return 8'd12;  8'd18:  return 8'd24;  8'd19:  return 8'd28;
+            8'd20:  return 8'd40;  8'd21:  return 8'd44;  8'd22:  return 8'd56;  8'd23:  return 8'd60;
+            8'd24:  return 8'd10;  8'd25:  return 8'd14;  8'd26:  return 8'd26;  8'd27:  return 8'd30;
+            8'd28:  return 8'd42;  8'd29:  return 8'd46;  8'd30:  return 8'd58;  8'd31:  return 8'd62;
+            // yb=2
+            8'd32:  return 8'd33;  8'd33:  return 8'd37;  8'd34:  return 8'd49;  8'd35:  return 8'd53;
+            8'd36:  return 8'd1;   8'd37:  return 8'd5;   8'd38:  return 8'd17;  8'd39:  return 8'd21;
+            8'd40:  return 8'd35;  8'd41:  return 8'd39;  8'd42:  return 8'd51;  8'd43:  return 8'd55;
+            8'd44:  return 8'd3;   8'd45:  return 8'd7;   8'd46:  return 8'd19;  8'd47:  return 8'd23;
+            // yb=3
+            8'd48:  return 8'd41;  8'd49:  return 8'd45;  8'd50:  return 8'd57;  8'd51:  return 8'd61;
+            8'd52:  return 8'd9;   8'd53:  return 8'd13;  8'd54:  return 8'd25;  8'd55:  return 8'd29;
+            8'd56:  return 8'd43;  8'd57:  return 8'd47;  8'd58:  return 8'd59;  8'd59:  return 8'd63;
+            8'd60:  return 8'd11;  8'd61:  return 8'd15;  8'd62:  return 8'd27;  8'd63:  return 8'd31;
+            // yb=4
+            8'd64:  return 8'd96;  8'd65:  return 8'd100; 8'd66:  return 8'd112; 8'd67:  return 8'd116;
+            8'd68:  return 8'd64;  8'd69:  return 8'd68;  8'd70:  return 8'd80;  8'd71:  return 8'd84;
+            8'd72:  return 8'd98;  8'd73:  return 8'd102; 8'd74:  return 8'd114; 8'd75:  return 8'd118;
+            8'd76:  return 8'd66;  8'd77:  return 8'd70;  8'd78:  return 8'd82;  8'd79:  return 8'd86;
+            // yb=5
+            8'd80:  return 8'd104; 8'd81:  return 8'd108; 8'd82:  return 8'd120; 8'd83:  return 8'd124;
+            8'd84:  return 8'd72;  8'd85:  return 8'd76;  8'd86:  return 8'd88;  8'd87:  return 8'd92;
+            8'd88:  return 8'd106; 8'd89:  return 8'd110; 8'd90:  return 8'd122; 8'd91:  return 8'd126;
+            8'd92:  return 8'd74;  8'd93:  return 8'd78;  8'd94:  return 8'd90;  8'd95:  return 8'd94;
+            // yb=6
+            8'd96:  return 8'd65;  8'd97:  return 8'd69;  8'd98:  return 8'd81;  8'd99:  return 8'd85;
+            8'd100: return 8'd97;  8'd101: return 8'd101; 8'd102: return 8'd113; 8'd103: return 8'd117;
+            8'd104: return 8'd67;  8'd105: return 8'd71;  8'd106: return 8'd83;  8'd107: return 8'd87;
+            8'd108: return 8'd99;  8'd109: return 8'd103; 8'd110: return 8'd115; 8'd111: return 8'd119;
+            // yb=7
+            8'd112: return 8'd73;  8'd113: return 8'd77;  8'd114: return 8'd89;  8'd115: return 8'd93;
+            8'd116: return 8'd105; 8'd117: return 8'd109; 8'd118: return 8'd121; 8'd119: return 8'd125;
+            8'd120: return 8'd75;  8'd121: return 8'd79;  8'd122: return 8'd91;  8'd123: return 8'd95;
+            8'd124: return 8'd107; 8'd125: return 8'd111; 8'd126: return 8'd123; 8'd127: return 8'd127;
+            // yb=8
+            8'd128: return 8'd128; 8'd129: return 8'd132; 8'd130: return 8'd144; 8'd131: return 8'd148;
+            8'd132: return 8'd160; 8'd133: return 8'd164; 8'd134: return 8'd176; 8'd135: return 8'd180;
+            8'd136: return 8'd130; 8'd137: return 8'd134; 8'd138: return 8'd146; 8'd139: return 8'd150;
+            8'd140: return 8'd162; 8'd141: return 8'd166; 8'd142: return 8'd178; 8'd143: return 8'd182;
+            // yb=9
+            8'd144: return 8'd136; 8'd145: return 8'd140; 8'd146: return 8'd152; 8'd147: return 8'd156;
+            8'd148: return 8'd168; 8'd149: return 8'd172; 8'd150: return 8'd184; 8'd151: return 8'd188;
+            8'd152: return 8'd138; 8'd153: return 8'd142; 8'd154: return 8'd154; 8'd155: return 8'd158;
+            8'd156: return 8'd170; 8'd157: return 8'd174; 8'd158: return 8'd186; 8'd159: return 8'd190;
+            // yb=10
+            8'd160: return 8'd161; 8'd161: return 8'd165; 8'd162: return 8'd177; 8'd163: return 8'd181;
+            8'd164: return 8'd129; 8'd165: return 8'd133; 8'd166: return 8'd145; 8'd167: return 8'd149;
+            8'd168: return 8'd163; 8'd169: return 8'd167; 8'd170: return 8'd179; 8'd171: return 8'd183;
+            8'd172: return 8'd131; 8'd173: return 8'd135; 8'd174: return 8'd147; 8'd175: return 8'd151;
+            // yb=11
+            8'd176: return 8'd169; 8'd177: return 8'd173; 8'd178: return 8'd185; 8'd179: return 8'd189;
+            8'd180: return 8'd137; 8'd181: return 8'd141; 8'd182: return 8'd153; 8'd183: return 8'd157;
+            8'd184: return 8'd171; 8'd185: return 8'd175; 8'd186: return 8'd187; 8'd187: return 8'd191;
+            8'd188: return 8'd139; 8'd189: return 8'd143; 8'd190: return 8'd155; 8'd191: return 8'd159;
+            // yb=12
+            8'd192: return 8'd224; 8'd193: return 8'd228; 8'd194: return 8'd240; 8'd195: return 8'd244;
+            8'd196: return 8'd192; 8'd197: return 8'd196; 8'd198: return 8'd208; 8'd199: return 8'd212;
+            8'd200: return 8'd226; 8'd201: return 8'd230; 8'd202: return 8'd242; 8'd203: return 8'd246;
+            8'd204: return 8'd194; 8'd205: return 8'd198; 8'd206: return 8'd210; 8'd207: return 8'd214;
+            // yb=13
+            8'd208: return 8'd232; 8'd209: return 8'd236; 8'd210: return 8'd248; 8'd211: return 8'd252;
+            8'd212: return 8'd200; 8'd213: return 8'd204; 8'd214: return 8'd216; 8'd215: return 8'd220;
+            8'd216: return 8'd234; 8'd217: return 8'd238; 8'd218: return 8'd250; 8'd219: return 8'd254;
+            8'd220: return 8'd202; 8'd221: return 8'd206; 8'd222: return 8'd218; 8'd223: return 8'd222;
+            // yb=14
+            8'd224: return 8'd193; 8'd225: return 8'd197; 8'd226: return 8'd209; 8'd227: return 8'd213;
+            8'd228: return 8'd225; 8'd229: return 8'd229; 8'd230: return 8'd241; 8'd231: return 8'd245;
+            8'd232: return 8'd195; 8'd233: return 8'd199; 8'd234: return 8'd211; 8'd235: return 8'd215;
+            8'd236: return 8'd227; 8'd237: return 8'd231; 8'd238: return 8'd243; 8'd239: return 8'd247;
+            // yb=15 (key 255, i.e. yb=15/xb=15, is carried by `default`)
+            8'd240: return 8'd201; 8'd241: return 8'd205; 8'd242: return 8'd217; 8'd243: return 8'd221;
+            8'd244: return 8'd233; 8'd245: return 8'd237; 8'd246: return 8'd249; 8'd247: return 8'd253;
+            8'd248: return 8'd203; 8'd249: return 8'd207; 8'd250: return 8'd219; 8'd251: return 8'd223;
+            8'd252: return 8'd235; 8'd253: return 8'd239; 8'd254: return 8'd251; default: return 8'd255;
+        endcase
+    endfunction
+
+    // Decompose pixel coord into page / block / pixel-in-block.
+    logic [11:0] page_x;        // page column index
+    logic [11:0] page_y;        // page row index
+    logic [1:0]  by;            // block row within the page (0..3)
+    logic [2:0]  bx;            // block column within the page (0..7)
+    logic [3:0]  xb;            // pixel column within the block (0..15)
+    logic [3:0]  yb;            // pixel row within the block (0..15)
+    logic [5:0]  bw_pg;         // pages per framebuffer row
+
+    assign page_x = x >> 7;     // x / 128
+    assign page_y = y >> 6;     // y / 64
+    assign by     = y[5:4];     // (y % 64)  / 16
+    assign bx     = x[6:4];     // (x % 128) / 16
+    assign xb     = x[3:0];     // x % 16
+    assign yb     = y[3:0];     // y % 16
+    assign bw_pg  = fbw >> 1;   // FBW / 2 (FBW must be even; an odd FBW is floored here)
+
+    logic [4:0] block_idx;      // blockTable8 result: block slot within the page
+    assign block_idx = swizzle_psmt8(by, bx);
+
+    logic [7:0] byte_idx;       // columnTable8 result: byte offset within the block
+    assign byte_idx = col_idx_psmt8(yb, xb);
+
+    logic [31:0] page_base;     // byte address of the containing page
+    logic [31:0] block_base;    // byte address of the containing block
+    logic [31:0] page_index;    // linear page number counted from the frame base
+    assign page_index    = ({20'd0, page_y} * {26'd0, bw_pg}) + {20'd0, page_x};
+    assign page_base     = ({23'd0, fbp} << 11) + (page_index << 13);   // FBP*2048 + page_index*8192
+    assign block_base    = page_base + ({27'd0, block_idx} << 8);       // + block_idx*256
+
+    assign addr = block_base + {24'd0, byte_idx};                       // + byte_idx (1 byte/pixel)
+
+endmodule : gs_swizzle_psmt8_stub
@@ -0,0 +1,74 @@
+// retroDE_ps2 — gs_texel_addr
+//
+// Texture-sampling address generator (brick 1, step 1 of GS texturing).
+//
+// Given a texture coordinate (u,v) and the TEX0 texture descriptor, computes
+// the LINEAR VRAM byte address of that texel — the read-side mirror of the
+// rasterizer's existing framebuffer-address math (gs_stub.sv ~line 530:
+//   fb_addr = base + (Y*FBW*64 + X) * bytes_per_pixel ).
+//
+// Linear (non-swizzled) only, on purpose: the swizzle paths in gs_stub are
+// param-gated OFF by default, so linear is the baseline. Swizzled texel
+// addressing will reuse the existing gs_swizzle_* modules later.
+//
+// `base_byte_addr` is the texture base in VRAM, ALREADY scaled to bytes by
+// the caller from TEX0.TBP0. Keeping the base as a byte input (rather than
+// scaling TBP0 here) isolates the one thing that must be reconciled with the
+// texture-UPLOAD path (gif_image_xfer_stub / BITBLTBUF) — so we read texels
+// from exactly where BITBLT wrote them. That reconciliation is tracked as the
+// next integration step; this module's (u,v)->offset math is unambiguous and
+// unit-tested below.
+
+`timescale 1ns/1ps
+
+module gs_texel_addr #(
+    parameter int ADDR_W = 32
+) (
+    input  logic [31:0]        base_byte_addr, // texture base in VRAM (bytes)
+    input  logic [10:0]        u,              // texel column (0..2047)
+    input  logic [10:0]        v,              // texel row    (0..2047)
+    input  logic [13:0]        tbw,            // TEX0.TBW — texels-per-row / 64
+    input  logic [5:0]         psm,            // pixel storage mode
+    output logic [ADDR_W-1:0]  texel_byte_addr,
+    output logic               nibble_hi       // PSMT4: high nibble of the byte?
+);
+
+    // Storage-mode codes with their own byte scaling. Any other code is
+    // addressed like PSMCT32 (4 bytes per texel).
+    localparam logic [5:0] PSMCT32 = 6'h00;
+    localparam logic [5:0] PSMCT16 = 6'h02;
+    localparam logic [5:0] PSMT8   = 6'h13;
+    localparam logic [5:0] PSMT4   = 6'h14;
+
+    // Row pitch in texels: TBW counts 64-texel units, so append six zero bits.
+    logic [19:0] row_texels;
+    assign row_texels = {tbw, 6'd0};
+
+    // Row-major texel index (v * pitch + u). The 32-bit destination sizes the
+    // multiply, so the product is not clipped at the operand widths.
+    logic [31:0] texel_offset;
+    assign texel_offset = (v * row_texels) + {21'd0, u};
+
+    // Scale the texel index to a byte offset for the selected format and add
+    // the texture base.
+    always_comb begin
+        // Only the 4-bit format shares a byte between two texels.
+        nibble_hi = 1'b0;
+
+        unique case (psm)
+            PSMT4: begin
+                // Two texels per byte: halve the index, odd texels use the high nibble.
+                texel_byte_addr = base_byte_addr + (texel_offset >> 1);
+                nibble_hi       = texel_offset[0];
+            end
+            PSMT8:   texel_byte_addr = base_byte_addr + texel_offset;          // 1 B/texel
+            PSMCT16: texel_byte_addr = base_byte_addr + (texel_offset << 1);   // 2 B/texel
+            PSMCT32: texel_byte_addr = base_byte_addr + (texel_offset << 2);   // 4 B/texel
+            default: texel_byte_addr = base_byte_addr + (texel_offset << 2);   // unlisted psm: treated as 4 B/texel
+        endcase
+    end
+
+endmodule : gs_texel_addr
@@ -0,0 +1,196 @@
+// ============================================================================
+// gs_texture_cache.sv  (Ch322 — PREFILLED texture cache, correctness-first)
+//
+// Proves texture bytes can live in FPGA-private LPDDR4B and be consumed by the
+// GS sampler through an on-chip RAM at the EXISTING 1-cycle texel latency.
+//
+// This is a PREFILLED cache, NOT a demand cache. The whole known texture is
+// filled from LPDDR into an on-chip RAM ONCE, before rendering (mirroring the
+// Ch321 line-buffer trick: warm a bounded buffer, then serve at native latency).
+// Every sampler read is therefore a HIT at 1-cycle latency — the nearest-path
+// sampler's fixed-latency contract (gs_texture_unit valid_pipe, RD_LATENCY=1) is
+// preserved with ZERO pipeline/stall surgery. Demand miss/stall is explicitly
+// DEFERRED to a later chapter (it would be a raster-walker pipeline redesign).
+//
+// Two clock domains (same split as gs_lpddr_scanout_lb):
+//   axi_clk    (emif_clk) — fill FSM: single-beat 256-bit reads (arlen=0, the
+//                           only read pattern proven on this EMIF) from LPDDR.
+//   sample_clk (design)   — the sampler's texel read port; 1-cycle registered,
+//                           byte-identical timing to the vram_bram_stub read2.
+//
+// The fill is one-shot before raster, so the on-chip RAM is static when the
+// sampler reads it (write side idle) — a plain dual-clock simple-dual-port RAM,
+// no read/write CDC hazard. `fill_done` is 2-FF synced into sample_clk so the
+// read mux only goes live after the texture is fully resident.
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_texture_cache #(
+    parameter [29:0] LPDDR_TEX_BASE = 30'd0,    // LPDDR4B byte address of the first texture byte
+    parameter [31:0] TEX_VRAM_BASE  = 32'd2048, // VRAM byte address that maps to cache word 0 (TBP0*256)
+    parameter int    TEX_BYTES      = 256,      // texture size in bytes (8x8 PSMCT32 = 256)
+    parameter int    N_BEATS        = 8         // 32-byte AXI beats per fill; keep equal to TEX_BYTES / 32
+)(
+    // ---- AXI read clock domain (emif_clk) — fill side ----
+    input  logic         axi_clk,
+    input  logic         axi_rst_n,
+    input  logic         fill_start,    // TOGGLE from the bridge domain; every edge arms a (re)fill
+    output logic         fill_done,     // high once the whole texture is resident; dropped on the next arm
+    output logic [31:0]  fill_beats,    // beats drained into tex_mem since the current fill was armed
+    output logic [31:0]  fill_bytes,    // bytes written since the current fill was armed
+    output logic [31:0]  rd_errs,       // non-OKAY read responses since the current fill was armed
+    output logic [31:0]  fill_crc,      // Ch352 — sum32 of every 32-bit word written into tex_mem during the
+                                        //   current fill. The host compares it with the file's sum32 to prove
+                                        //   the cache contents (an LPDDR readback only proves LPDDR).
+
+    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
+    output logic [29:0]  araddr,
+    output logic [1:0]   arburst,
+    output logic [6:0]   arid,
+    output logic [7:0]   arlen,
+    output logic [2:0]   arsize,
+    output logic         arvalid,
+    input  logic         arready,
+    input  logic [255:0] rdata,
+    input  logic [1:0]   rresp,
+    input  logic         rlast,
+    input  logic         rvalid,
+    output logic         rready,
+
+    // ---- sampler clock domain (design_clk) — texel read port ----
+    input  logic         sample_clk,
+    input  logic         tex_rd_en,     // sampler requests a texel word this cycle
+    input  logic [31:0]  tex_rd_addr,   // VRAM byte address (TEX_VRAM_BASE + offset)
+    output logic [31:0]  tex_rd_data,   // registered texel word, one sample_clk after tex_rd_en
+    output logic         tex_ready      // fill_done re-timed into sample_clk (read-mux gate)
+);
+    // Derived geometry. The index widths are clamped to at least one bit so
+    // degenerate one-word / one-beat configurations still elaborate.
+    localparam int TEX_WORDS = TEX_BYTES/4;
+    localparam int WIDX_BITS = (TEX_WORDS <= 1) ? 1 : $clog2(TEX_WORDS);
+    localparam int BIDX_BITS = (N_BEATS <= 1) ? 1 : $clog2(N_BEATS);
+
+    // Constant AXI read attributes: one 32-byte INCR beat per request.
+    assign arsize  = 3'b101;  // 32 bytes per beat
+    assign arlen   = 8'd0;    // SINGLE-BEAT (the only AXI read pattern proven on this EMIF)
+    assign arburst = 2'b01;   // INCR
+    assign arid    = 7'd4;    // distinct id: writer=0, probe=1, frame-cache=2, line-buf=3, tex-fill=4
+
+    // Texture storage: written by the fill FSM on axi_clk, read by the sampler
+    // on sample_clk. The write side is idle once fill_done is set, so reads see
+    // static contents.
+    //
+    // One ordinary 32-bit simple-dual-port RAM. History of cache geometry vs Quartus Place:
+    //   flat 8-write array      -> exploded to flops (707k ALUT)
+    //   8x 8192x32 banks        -> width fragmentation, 344/358, 9h Place thrash
+    //   1x 8192x256 RAM         -> count OK (320/358) but one ~104-M20K macro too WIDE -> Place 40min+, no QDB
+    //   2x 8192x128 halves      -> still rigid, Place 50min+ no progress
+    //   4x 8192x64 banks        -> 328/358; Place still stalled with ample RAM after read2 was removed.
+    //   1x 65536x32 (HERE)      -> latch each AXI beat, drain 8 lanes over 8 axi_clk cycles.
+    // Draining serially keeps a single 32-bit write port while the sampler
+    // retains its one-cycle registered 32-bit read.
+    (* ramstyle = "M20K" *) logic [31:0] tex_mem [0:TEX_WORDS-1];
+
+    // ================= fill side (axi_clk) =================
+    typedef enum logic [2:0] { F_IDLE, F_AR, F_R, F_DRAIN, F_DONE } fstate_t;
+    fstate_t fst;
+    logic [$clog2(N_BEATS):0] beat;             // beat currently being fetched/drained
+    logic [255:0]             fill_data_q;      // captured AXI beat awaiting drain
+    logic [2:0]               fill_lane;        // which 32-bit lane of fill_data_q is written next
+    logic [WIDX_BITS-1:0]     fill_word_base;   // tex_mem index of lane 0 of the captured beat
+    wire  [WIDX_BITS-1:0]     fill_word_idx  = fill_word_base + WIDX_BITS'(fill_lane);
+    wire  [31:0]              fill_lane_word = fill_data_q[fill_lane*32 +: 32];
+
+    // fill_start is a toggle: shift it through three flops and treat any
+    // difference between the two oldest stages as an arm request.
+    logic [2:0] fs_sync;
+    wire        fs_edge = fs_sync[2] ^ fs_sync[1];
+
+    always_ff @(posedge axi_clk) begin
+        if (!axi_rst_n) begin
+            fst            <= F_IDLE;
+            fs_sync        <= 3'd0;
+            araddr         <= '0;
+            arvalid        <= 1'b0;
+            rready         <= 1'b0;
+            beat           <= '0;
+            fill_lane      <= 3'd0;
+            fill_word_base <= '0;
+            fill_data_q    <= '0;
+            fill_done      <= 1'b0;
+            fill_beats     <= 32'd0;
+            fill_bytes     <= 32'd0;
+            rd_errs        <= 32'd0;
+            fill_crc       <= 32'd0;
+        end else begin
+            fs_sync <= {fs_sync[1:0], fill_start};
+
+            case (fst)
+                // Waiting for an arm. Idle (never filled) and done (texture
+                // resident) react identically: clear the per-fill status,
+                // drop fill_done and request beat 0.
+                F_IDLE, F_DONE: begin
+                    if (fs_edge) begin
+                        fill_done  <= 1'b0;
+                        fill_beats <= 32'd0;
+                        fill_bytes <= 32'd0;
+                        rd_errs    <= 32'd0;
+                        fill_crc   <= 32'd0;     // Ch352 — restart the integrity sum for this fill
+                        beat       <= '0;
+                        araddr     <= LPDDR_TEX_BASE;
+                        arvalid    <= 1'b1;
+                        fst        <= F_AR;
+                    end
+                end
+
+                // Address phase: hold arvalid until the slave takes the request.
+                F_AR: if (arready) begin
+                    arvalid <= 1'b0;
+                    rready  <= 1'b1;
+                    fst     <= F_R;
+                end
+
+                // Data phase: latch the 256-bit beat and note any error
+                // response; the words are written during F_DRAIN.
+                F_R: if (rvalid) begin
+                    fill_data_q    <= rdata;
+                    fill_lane      <= 3'd0;
+                    fill_word_base <= {beat[BIDX_BITS-1:0], 3'b000};
+                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
+                    rready         <= 1'b0;
+                    fst            <= F_DRAIN;
+                end
+
+                // One 32-bit RAM write per cycle, lanes 0..7 of the latched beat.
+                F_DRAIN: begin
+                    tex_mem[fill_word_idx] <= fill_lane_word;
+                    fill_crc               <= fill_crc + fill_lane_word;   // sum32 over the words written
+                    if (fill_lane == 3'd7) begin
+                        // Beat fully drained: account for it, then finish or fetch the next one.
+                        fill_beats <= fill_beats + 32'd1;
+                        fill_bytes <= fill_bytes + 32'd32;
+                        if (beat == N_BEATS-1) begin
+                            fill_done <= 1'b1;
+                            fst       <= F_DONE;
+                        end else begin
+                            beat    <= beat + 1'b1;
+                            araddr  <= araddr + 30'd32;   // next single-beat read
+                            arvalid <= 1'b1;
+                            fst     <= F_AR;
+                        end
+                    end else begin
+                        fill_lane <= fill_lane + 3'd1;
+                    end
+                end
+
+                default: fst <= F_IDLE;
+            endcase
+        end
+    end
+
+    // ================= sampler side (sample_clk) =================
+    // Registered read: present tex_rd_addr with tex_rd_en, data appears on the
+    // next sample_clk edge. The byte address is rebased to TEX_VRAM_BASE and
+    // converted to a word index; only the low WIDX_BITS are used, so addresses
+    // outside the texture alias back into it.
+    wire [31:0]          word_off = (tex_rd_addr - TEX_VRAM_BASE) >> 2;
+    wire [WIDX_BITS-1:0] rd_word  = word_off[WIDX_BITS-1:0];
+    always_ff @(posedge sample_clk) begin
+        if (tex_rd_en) tex_rd_data <= tex_mem[rd_word];
+    end
+
+    // fill_done crosses into sample_clk through two flops; tex_ready is the
+    // second stage.
+    logic [1:0] done_sync;
+    always_ff @(posedge sample_clk) done_sync <= {done_sync[0], fill_done};
+    assign tex_ready = done_sync[1];
+endmodule
@@ -0,0 +1,757 @@
+// retroDE_ps2 — gs_texture_unit
+//
+// Per-pixel texture sampler (brick 1, the texturing datapath core).
+//
+// Takes a per-pixel texture coordinate (u,v) + the TEX0 descriptor, fetches
+// the texel from VRAM through a read port, and outputs the sampled color,
+// pipelined to absorb the VRAM read latency.
+//
+//   (u,v,valid) --[gs_texel_addr]--> byte addr --> VRAM read port
+//                                                     |  (RD_LATENCY cyc)
+//                              sampled color <--[decode]-- tex_rd_data
+//
+// v1 scope (kept deliberately minimal so it's fully verifiable now):
+//   - PSMCT32 only (32-bit ABGR texels, direct — no CLUT).
+//   - DECAL texture function (texel replaces fragment color).
+//
+// Ch296 — PSMT8 indexed texturing (this chapter):
+//   - When psm==PSMT8 (0x13) the fetched 32-bit word holds FOUR packed
+//     8-bit indices. The byte for this texel is selected by the texel
+//     byte address' low 2 bits (gs_texel_addr emits a 1-byte/texel
+//     address for PSMT8). That index drives a CLUT lookup port; the
+//     returned PSMCT32 entry is the texel color (DECAL).
+//   - The lookup is COMBINATIONAL (clut_stub's read port is comb), so it
+//     lands in the SAME cycle as the direct PSMCT32 path — the existing
+//     single S1->S2 register in gs_stub aligns it with emit unchanged.
+//   - PSMCT32 (psm==0x00) behavior is byte-identical to before.
+// Next versions add: PSMCT16 unpack, PSMT4 (nibble) + CLUT, swizzle, and
+// MODULATE/HIGHLIGHT tex functions.
+//
+// The VRAM read port here is generic (byte address out, 32-bit word in,
+// fixed RD_LATENCY). Integration wires it to vram_stub's spare read port;
+// vram_stub's exact address convention is reconciled at integration time.
+
+`timescale 1ns/1ps
+
+module gs_texture_unit #(
+    // Ch298 — SWIZZLED PSMT4 texture sampling. When PSMT4_SWIZZLE=1 AND the
+    // texture psm==PSMT4, the texel byte address + nibble_hi are computed by
+    // gs_swizzle_psmt4_stub (the SAME proven module already on the framebuffer
+    // WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT4 block layout,
+    // instead of the linear gs_texel_addr. LINEAR is the default (0) so every
+    // existing linear PSMT4/PSMT8/PSMCT32 demo + TB is byte-identical. The
+    // swizzled address feeds the SAME word-aligned read, byte-lane extract,
+    // nibble select, and CLUT lookup — only the address GENERATION differs.
+    // Because the swizzled address (its low 2 bits + nibble_hi) is also
+    // address-derived, it flows through the SAME SEL_DELAY pipe as the linear
+    // selectors, so registered-read (TEX_RD_REGISTERED=1) alignment is reused
+    // verbatim. PSMT8/PSMCT32 always take the linear address (this rung is
+    // PSMT4-only).
+    parameter bit PSMT4_SWIZZLE = 1'b0,
+    // Ch299 — SWIZZLED PSMT8 texture sampling. The sibling of PSMT4_SWIZZLE,
+    // MINUS the nibble (PSMT8 is 1 byte/texel). When PSMT8_SWIZZLE=1 AND the
+    // texture psm==PSMT8, the texel byte address is computed by
+    // gs_swizzle_psmt8_stub (the SAME proven module already on the framebuffer
+    // WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT8 block layout,
+    // instead of the linear gs_texel_addr. LINEAR is the default (0) so every
+    // existing linear PSMT8/PSMT4/PSMCT32 demo + TB is byte-identical. The
+    // swizzled address feeds the SAME word-aligned read, byte-lane extract, and
+    // CLUT lookup — only the address GENERATION differs. Because the swizzled
+    // address' low 2 bits (byte-lane selector) are also address-derived, they
+    // flow through the SAME SEL_DELAY pipe as the linear selectors, so
+    // registered-read (TEX_RD_REGISTERED=1) alignment is reused verbatim. NO
+    // nibble pipe is needed — PSMT8 has no nibble. PSMT4/PSMCT32 always take
+    // their own address (this rung is PSMT8-only).
+    parameter bit PSMT8_SWIZZLE = 1'b0,
+    // Ch300 — SWIZZLED PSMCT32 (direct-color) texture sampling. The closure
+    // rung of the swizzle layout family. When PSMCT32_SWIZZLE=1 AND the texture
+    // psm==PSMCT32, the texel byte address is computed by gs_swizzle_psmct32_stub
+    // (the SAME proven module already on the framebuffer WRITE / SCANOUT / UPLOAD
+    // paths — Ch119/Ch122) using the real PS2 PSMCT32 page/block layout, instead
+    // of the linear gs_texel_addr. Unlike PSMT4/PSMT8 this needs NO CLUT and NO
+    // byte-lane select: PSMCT32 is 4 bytes/texel, so the swizzled address is
+    // already word-aligned and the fetched 32-bit word IS the color directly
+    // (tex_color = tex_rd_data). LINEAR is the default (0) so every existing
+    // linear PSMCT32 demo + TB (textured / tritex) is byte-identical. This is
+    // the SAME single-param-per-format gate as PSMCT32_SWIZZLE on the FB side,
+    // so a PSMCT32 texture and a PSMCT32 framebuffer swizzle together.
+    parameter bit PSMCT32_SWIZZLE = 1'b0,
+    // Ch294 — GS texture WRAP MODES (REPEAT + CLAMP). When TEX_WRAP_ENABLE=1
+    // the per-pixel (u,v) are resolved against the texture's power-of-two
+    // dimensions (width=2^TW, height=2^TH from TEX0) using the CLAMP_1 wrap
+    // mode (WMS for u/S, WMT for v/T): 0=REPEAT (u & (width-1)), 1=CLAMP
+    // (u>=width -> width-1). REGION_* (2/3) are NOT modelled and pass through.
+    // The wrap is applied to u/v BEFORE address generation, so it covers the
+    // linear path AND every swizzle path. With TEX_WRAP_ENABLE=0 (default)
+    // u_eff===u and v_eff===v as a compile-time constant, so the wrap logic is
+    // pruned and every existing consumer is BYTE-IDENTICAL.
+    parameter bit TEX_WRAP_ENABLE = 1'b0,
+    // Ch308 — BILINEAR (4-tap) texture filtering, PSMCT32-only this rung.
+    // When BILINEAR_ENABLE=1 AND psm==PSMCT32 the sampler runs a 4-beat read
+    // FSM: it fetches the 4 texels surrounding the fractional coord
+    //   (u,v) (u+1,v) (u,v+1) (u+1,v+1)
+    // — each independently wrapped/clamped through the SAME u_eff/v_eff
+    // machinery (so edge taps repeat/clamp instead of reading outside the
+    // texture) — then blends them per channel (R,G,B,A) by the 4-bit
+    // fractional u_frac/v_frac (0..15, /16) using a >>4 fixed-point lerp.
+    // For !BILINEAR_ENABLE (default) OR psm!=PSMCT32 the EXACT current
+    // single-read NEAREST path is used and u_frac/v_frac are ignored, so the
+    // synthesized logic and every existing consumer is BYTE-IDENTICAL (the
+    // bilinear FSM, the per-beat coord select, and the blend datapath are all
+    // pruned as compile-time-dead when BILINEAR_ENABLE=0). Bilinear is
+    // PSMCT32-only by default; with PALETTE_BILINEAR=1 (Ch314) it also covers
+    // PSMT8/PSMT4 via per-tap CLUT-before-interp. At PALETTE_BILINEAR=0 the
+    // indexed textures still take the nearest path even with BILINEAR_ENABLE=1.
+    //
+    // ALPHA: the alpha channel is INTERPOLATED with the same 4-tap lerp as
+    // R/G/B (not pass-through-nearest). For an opaque texture (all taps a=255)
+    // this returns 255 exactly; for a texel-center sample (u_frac=v_frac=0) it
+    // returns the (u,v) tap's alpha exactly.
+    parameter bit BILINEAR_ENABLE = 1'b0,
+    // Ch314 — BILINEAR for PALETTIZED (indexed) textures. When
+    // PALETTE_BILINEAR=1 (and BILINEAR_ENABLE=1) the 4-tap path also runs for
+    // PSMT8 (0x13) and PSMT4 (0x14). The CRITICAL rule is CLUT-BEFORE-INTERP:
+    // each of the 4 taps fetches an INDEX, that index is CLUT'd to an RGBA
+    // color (the existing combinational clut_rd_idx/clut_rd_data port), and the
+    // 4 COLORS are then interpolated — NOT the indices. This falls out of
+    // capturing `near_color` per tap (clut_rd_data for indexed, tex_rd_data for
+    // PSMCT32) instead of the raw word. Swizzled addressing + wrap/clamp run in
+    // the SAME per-tap addr-gen that already feeds the nearest path, so they
+    // happen BEFORE the index/CLUT lookup. Default 0 → indexed textures stay
+    // nearest even with BILINEAR_ENABLE=1, so every existing build is
+    // byte-identical (the combined path only ever fed PSMCT32 textures anyway).
+    parameter bit PALETTE_BILINEAR = 1'b0,
+    parameter int RD_LATENCY = 1,      // VRAM read latency in clk cycles
+    // Ch296 — PSMT8 byte-lane realignment. The byte selected from the
+    // fetched word must use the LOW 2 bits of the address that was ISSUED
+    // for the returned data. When the texel ADDRESS advances every cycle
+    // while a read is in flight (gs_stub TEX_RD_REGISTERED=1: address
+    // taken from the S0 walker, registered read returns 1 cycle later),
+    // the current `addr` no longer matches the in-flight word, so the
+    // selector must be delayed by SEL_DELAY cycles to re-pair them. When
+    // the address is HELD stable across the read (combinational read port,
+    // address from the stable S1 latch), SEL_DELAY=0 and the current addr
+    // is correct. Driven from gs_stub as TEX_RD_REGISTERED?TEX_RD_LATENCY:0.
+    parameter int SEL_DELAY  = 0
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // per-pixel texture coordinate in
+    input  logic        in_valid,
+    input  logic [10:0] u,
+    input  logic [10:0] v,
+
+    // Ch308 — fractional texture coords for BILINEAR (4-bit, 0..15 => /16).
+    // Unused at default (BILINEAR_ENABLE=0) and whenever the nearest path is
+    // taken: non-PSMCT32 psm, except PSMT8/PSMT4 when PALETTE_BILINEAR=1 (Ch314).
+    input  logic [3:0]  u_frac,
+    input  logic [3:0]  v_frac,
+
+    // Ch310 — RUNTIME filter select (per-primitive TEX1_1.MMAG). When
+    // BILINEAR_ENABLE=1 the 4-tap path runs ONLY when (is_ct32 && filter_lin);
+    // with filter_lin=0 (TEX1.MMAG=0 NEAREST) the sampler falls back to the
+    // exact nearest single-read path (busy stays 0). Unused at
+    // BILINEAR_ENABLE=0 (g_nearest), so the default build is byte-identical.
+    input  logic        filter_lin,
+
+    // Ch294 — wrap-mode controls (CLAMP_1 WMS/WMT + TEX0 TW/TH). Unused at
+    // default (TEX_WRAP_ENABLE=0) since u_eff/v_eff collapse to u/v.
+    input  logic [1:0]  wms,
+    input  logic [1:0]  wmt,
+    input  logic [3:0]  tw,
+    input  logic [3:0]  th,
+
+    // TEX0 descriptor
+    input  logic [31:0] tbp0_base_bytes, // texture base in VRAM (bytes)
+    input  logic [13:0] tbw,             // TEX0.TBW (texels/row / 64)
+    input  logic [5:0]  psm,             // pixel storage mode
+
+    // VRAM texel read port
+    output logic        tex_rd_en,
+    output logic [31:0] tex_rd_addr,     // byte address
+    input  logic [31:0] tex_rd_data,     // 32-bit word, valid RD_LATENCY later
+
+    // Ch296 — CLUT lookup port (PSMT8 indexed texturing). The extracted
+    // 8-bit index drives `clut_rd_idx`; the parent wires this to
+    // clut_stub's second (combinational) read port and returns the
+    // PSMCT32 entry on `clut_rd_data`. Unused for PSMCT32 textures.
+    output logic [7:0]  clut_rd_idx,
+    input  logic [31:0] clut_rd_data,    // PSMCT32 entry for clut_rd_idx
+
+    // sampled color out (aligned with out_valid)
+    output logic        out_valid,
+    output logic [31:0] tex_color,       // ABGR8888
+
+    // Ch308 — BILINEAR busy: high while the 4-beat read sequence is in flight
+    // (the caller must not issue a new in_valid until it drops / out_valid
+    // pulses). Always 0 on the nearest path (BILINEAR_ENABLE=0, or a psm the
+    // 4-tap path does not cover: non-PSMCT32 unless PALETTE_BILINEAR=1), so a
+    // caller that ignores it sees byte-identical behavior.
+    output logic        busy
+);
+
+    localparam logic [5:0] PSM_PSMCT32 = 6'h00;
+    localparam logic [5:0] PSM_PSMT8   = 6'h13;
+    localparam logic [5:0] PSM_PSMT4   = 6'h14;
+
+    // --- Ch294: wrap-mode resolution (u/v -> u_eff/v_eff) ---
+    // Applied BEFORE any address generation so it covers the linear path AND
+    // every swizzle path. width=2^TW, height=2^TH (both powers of two), so
+    // REPEAT is a mask and CLAMP is a >width-1 saturate. u/v are unsigned so
+    // there is no negative/underflow case to handle. REGION_* (2/3) pass
+    // through unchanged (not modelled this rung). At TEX_WRAP_ENABLE=0 this is
+    // a constant pass-through (u_eff===u, v_eff===v) -> byte-identical.
+    // Ch308 — the coord that FEEDS the wrap. On the nearest path (bilinear off
+    // or non-PSMCT32) this is the port u/v UNCHANGED, so the wrap output
+    // (u_eff/v_eff) and everything downstream is byte-identical. On the
+    // bilinear path it is the current beat's neighbor coord (u+du[k],v+dv[k]),
+    // so each of the 4 taps is independently wrapped/clamped. `bili_active` is
+    // a compile-time constant 0 when BILINEAR_ENABLE=0, so u_in===u / v_in===v
+    // collapses away at the default build.
+    // Declared here because the mux below reads them; their drivers are not in
+    // this block.
+    logic        bili_active;   // bilinear path is running for this psm
+    logic [10:0] beat_u;        // current bilinear beat's neighbor column
+    logic [10:0] beat_v;        // current bilinear beat's neighbor row
+    // Coordinate handed to the wrap stage.
+    logic [10:0] u_in;
+    logic [10:0] v_in;
+    always_comb begin
+        // Nearest path: the port coordinate passes straight through.
+        u_in = u;
+        v_in = v;
+        // Bilinear path: substitute the per-beat neighbor coordinate.
+        if (BILINEAR_ENABLE && bili_active) begin
+            u_in = beat_u;
+            v_in = beat_v;
+        end
+    end
+
+    logic [10:0] u_eff, v_eff;         // wrapped coordinate fed to every address generator
+    logic [10:0] u_wmask, v_wmask;     // width-1 / height-1
+    logic [10:0] u_wlimit, v_wlimit;   // clamp ceiling (same value as the mask)
+    always_comb begin
+        // 2^TW - 1 and 2^TH - 1 in 11-bit arithmetic; a TW/TH of 11 or more
+        // shifts the 1 out and the subtract yields an all-ones mask.
+        u_wmask  = (11'd1 << tw) - 11'd1;
+        v_wmask  = (11'd1 << th) - 11'd1;
+        u_wlimit = u_wmask;
+        v_wlimit = v_wmask;
+
+        // Pass-through unless wrapping is compiled in.
+        u_eff = u_in;
+        v_eff = v_in;
+
+        if (TEX_WRAP_ENABLE) begin
+            // Horizontal mode (WMS)
+            unique case (wms)
+                2'd0:    u_eff = u_in & u_wmask;                       // REPEAT: keep the low TW bits
+                2'd1:    u_eff = (u_in > u_wlimit) ? u_wlimit : u_in;  // CLAMP: saturate at width-1
+                default: u_eff = u_in;                                 // REGION_* (2/3) not modelled
+            endcase
+            // Vertical mode (WMT)
+            unique case (wmt)
+                2'd0:    v_eff = v_in & v_wmask;                       // REPEAT
+                2'd1:    v_eff = (v_in > v_wlimit) ? v_wlimit : v_in;  // CLAMP
+                default: v_eff = v_in;                                 // REGION_* (2/3) not modelled
+            endcase
+        end
+    end
+
+    // --- linear address (combinational) ---
+    // Linear generator outputs: byte address of the wrapped texel and, for
+    // PSMT4, which nibble of that byte it occupies.
+    logic        lin_nibble_hi;
+    logic [31:0] lin_addr;
+    gs_texel_addr #(.ADDR_W(32)) u_addr (
+        // texture descriptor
+        .psm            (psm),
+        .tbw            (tbw),
+        .base_byte_addr (tbp0_base_bytes),
+        // wrapped texel coordinate
+        .u              (u_eff),
+        .v              (v_eff),
+        // results
+        .nibble_hi      (lin_nibble_hi),
+        .texel_byte_addr(lin_addr)
+    );
+
+    // --- swizzled PSMT4 address (combinational) ---
+    // EXACTLY mirrors the texture-UPLOAD path (gif_image_xfer_stub Ch139):
+    // the swizzle module is fed FBP=0 so it emits only the WITHIN-TEXTURE
+    // byte OFFSET, and the texture base (tbp0_base_bytes) is ADDED on top.
+    // This makes the sampled address bit-identical to the uploaded one for
+    // ANY 256-byte-aligned base (using the swizzle module's `fbp` input here
+    // would discard the low 11 bits of a non-2048-aligned base). FBW=TBW (in
+    // 64-texel units); PSMT4 swizzle needs FBW even (bw_pg = FBW>>1). The
+    // texture's (u,v) ARE the swizzle (x,y). Output is byte-offset + nibble_hi
+    // — the SAME shape gs_texel_addr emits for linear PSMT4, so downstream
+    // (word-align, byte-lane, nibble select, CLUT) is untouched.
+    logic        swz_nibble_hi;   // PSMT4 nibble select from the swizzler
+    logic [31:0] swz_off;         // byte offset inside the texture (swizzler run with fbp = 0)
+    logic [31:0] swz_addr;        // swz_off rebased onto tbp0_base_bytes
+    generate
+        if (!PSMT4_SWIZZLE) begin : g_no_swizzle4
+            // Swizzled PSMT4 compiled out: tie the outputs to constants.
+            assign swz_nibble_hi = 1'b0;
+            assign swz_addr      = 32'd0;
+            assign swz_off       = 32'd0;
+        end else begin : g_swizzle4
+            gs_swizzle_psmt4_stub u_swizzle4 (
+                .x         ({1'b0, u_eff}),   // wrapped texel column, zero-extended by one bit
+                .y         ({1'b0, v_eff}),   // wrapped texel row, zero-extended by one bit
+                .fbw       (tbw[5:0]),
+                .fbp       (9'd0),            // base is added below, not inside the swizzler
+                .addr      (swz_off),
+                .nibble_hi (swz_nibble_hi)
+            );
+            assign swz_addr = swz_off + tbp0_base_bytes;
+        end
+    endgenerate
+
+    // --- swizzled PSMT8 address (combinational) ---
+    // Ch299 — EXACTLY mirrors the PSMT4-swizzle sampler arm above (and the
+    // PSMT8 UPLOAD path in gif_image_xfer_stub Ch133), MINUS the nibble.
+    // gs_swizzle_psmt8_stub is fed FBP=0 so it emits only the WITHIN-TEXTURE
+    // byte OFFSET; the texture base (tbp0_base_bytes) is ADDED on top. This
+    // makes the sampled address bit-identical to the uploaded one for ANY
+    // 256-byte-aligned base. FBW=TBW (in 64-texel units); the PSMT8 swizzle
+    // needs FBW even (bw_pg = FBW>>1). The texture's (u,v) ARE the swizzle
+    // (x,y). Output is a byte address — the SAME shape gs_texel_addr emits for
+    // linear PSMT8 — so downstream (word-align, byte-lane, CLUT) is untouched.
+    // No nibble_hi: PSMT8 is one full byte per texel.
+    logic [31:0] swz8_off;    // byte offset inside the texture (swizzler run with fbp = 0)
+    logic [31:0] swz8_addr;   // swz8_off rebased onto tbp0_base_bytes
+    generate
+        if (!PSMT8_SWIZZLE) begin : g_no_swizzle8
+            // Swizzled PSMT8 compiled out: tie the outputs to constants.
+            assign swz8_addr = 32'd0;
+            assign swz8_off  = 32'd0;
+        end else begin : g_swizzle8
+            gs_swizzle_psmt8_stub u_swizzle8 (
+                .x    ({1'b0, u_eff}),   // wrapped texel column, zero-extended by one bit
+                .y    ({1'b0, v_eff}),   // wrapped texel row, zero-extended by one bit
+                .fbw  (tbw[5:0]),
+                .fbp  (9'd0),            // base is added below, not inside the swizzler
+                .addr (swz8_off)
+            );
+            assign swz8_addr = swz8_off + tbp0_base_bytes;
+        end
+    endgenerate
+
+    // --- swizzled PSMCT32 address (combinational) ---
+    // Ch300 — direct-color sibling of the PSMT4/PSMT8 swizzle arms above, using
+    // the SAME proven gs_swizzle_psmct32_stub already on the FB WRITE / SCANOUT
+    // / UPLOAD paths. Fed FBP=0 so it emits only the WITHIN-TEXTURE byte OFFSET;
+    // the texture base (tbp0_base_bytes) is ADDED on top, making the sampled
+    // address bit-identical to the uploaded one for ANY 2048-byte-aligned base.
+    // FBW=TBW (in 64-pixel units — PSMCT32 page is 64 px wide, so TBW units
+    // match the stub's fbw directly, NO >>1). The texture's (u,v) ARE the
+    // swizzle (x,y). Output is a 4-byte-aligned byte address — gs_texel_addr's
+    // PSMCT32 shape — so downstream is untouched. NO nibble, NO byte-lane, NO
+    // CLUT: the fetched word is the color (tex_color = tex_rd_data).
+    logic [31:0] swz32_off;   // within-texture byte offset produced by the swizzle stub
+    logic [31:0] swz32_addr;  // tbp0_base_bytes + swz32_off (tied to 0 when the arm is disabled)
+    generate
+        if (PSMCT32_SWIZZLE) begin : g_swizzle32
+            gs_swizzle_psmct32_stub u_swizzle32 (
+                .fbp  (9'd0),               // base tied to 0 so only an offset comes back
+                .fbw  (tbw[5:0]),           // low 6 bits of TBW drive the stub's FBW
+                .x    ({1'b0, u_eff}),      // u_eff zero-extended by one bit
+                .y    ({1'b0, v_eff}),      // v_eff zero-extended by one bit
+                .addr (swz32_off)
+            );
+            assign swz32_addr = tbp0_base_bytes + swz32_off;
+        end else begin : g_no_swizzle32     // arm disabled: nets tied off; use_swizzle32 is constant-false
+            assign swz32_off  = 32'd0;
+            assign swz32_addr = 32'd0;
+        end
+    endgenerate
+
+    // --- linear-vs-swizzled select ---
+    // Swizzle applies to a PSMT4 / PSMT8 / PSMCT32 texture only when the
+    // matching *_SWIZZLE param is set; every other psm always takes the linear
+    // address, and the three swizzle gates are mutually exclusive by psm.
+    // With all three params 0 the selects are constant-false, so the
+    // synthesized logic — and every linear TB/demo — is byte-identical.
+    logic        use_swizzle4;
+    logic        use_swizzle8;
+    logic        use_swizzle32;
+    logic [31:0] addr;
+    logic        nibble_hi;
+    assign use_swizzle4  = (PSMT4_SWIZZLE   != 1'b0) && (psm == PSM_PSMT4);
+    assign use_swizzle8  = (PSMT8_SWIZZLE   != 1'b0) && (psm == PSM_PSMT8);
+    assign use_swizzle32 = (PSMCT32_SWIZZLE != 1'b0) && (psm == PSM_PSMCT32);
+    // Only the PSMT4 swizzle arm supplies its own nibble select; the PSMT8 and
+    // PSMCT32 swizzle arms and the linear fallback all use lin_nibble_hi.
+    assign addr         = use_swizzle4  ? swz_addr      :
+                          use_swizzle8  ? swz8_addr     :
+                          use_swizzle32 ? swz32_addr    : lin_addr;
+    assign nibble_hi    = use_swizzle4  ? swz_nibble_hi : lin_nibble_hi;
+
+    // Nearest-path read strobe and address. near_rd_addr feeds tex_rd_addr on
+    // every path; near_rd_en feeds tex_rd_en except while the bilinear FSM owns
+    // the strobe (see the output mux at the end of this module).
+    //
+    // `addr` is a BYTE address, but the VRAM read port is 32-bit word-oriented
+    // (vram_bram_stub's read2 only returns data for word-aligned addresses).
+    // PSMT8 byte addresses are not aligned, so the low two bits are forced to
+    // zero here and re-applied as the byte-lane selector (sel_lo) in the index
+    // extract below. PSMCT32 texel addresses are already word-aligned, so
+    // clearing those bits is a no-op and that path stays byte-identical.
+    logic [31:0] near_rd_addr;
+    logic        near_rd_en;
+    assign near_rd_addr = {addr[31:2], 2'b00};
+    assign near_rd_en   = in_valid;
+
+    // --- PSMT8 index extract ---
+    // gs_texel_addr returns a 1-byte/texel address for PSMT8, so the
+    // fetched 32-bit word (read at addr & ~3 by the word-addressed VRAM
+    // port) packs 4 indices; the issued address' low 2 bits select which
+    // byte is THIS texel.
+    //
+    // The byte selector uses the addr[1:0] from the issue cycle of the
+    // returned word. SEL_DELAY (see the param comment) is 0 when the
+    // address is held stable across the read (current addr is correct) and
+    // >0 when the address advances while the read is in flight (delay the
+    // selector to re-pair it with the in-flight word). `sel_lo` carries it.
+    //
+    // PSMT4 (Ch297) adds a NIBBLE selector on top of the byte selector.
+    // gs_texel_addr emits a byte address (texel_offset>>1) plus `nibble_hi`
+    // (= texel_offset[0]: even texel -> LOW nibble, odd -> HIGH nibble). The
+    // selected byte (via sel_lo, exactly as PSMT8) holds TWO 4-bit indices;
+    // nibble_hi picks which. Because nibble_hi is derived from the texel
+    // ADDRESS — which advances every cycle while a read is in flight under
+    // TEX_RD_REGISTERED=1 — it must be SEL_DELAY-aligned by the SAME pipe
+    // depth as sel_lo so it re-pairs with the returned word. (Same class as
+    // the PSMT8 byte-lane realignment; get it wrong and odd/even texels smear.)
+    logic [1:0] sel_lo;           // byte-lane selector paired with the returned word
+    logic       nib_sel;          // nibble selector, delayed exactly like sel_lo
+    // SEL_DELAY picks between the live selectors and a registered copy.
+    generate
+        if (SEL_DELAY == 0) begin : g_sel_comb
+            // Address is held across the read: the live selectors are correct.
+            assign sel_lo  = addr[1:0];
+            assign nib_sel = nibble_hi;
+        end else begin : g_sel_reg
+            // Address advances while reads are in flight: carry both selectors
+            // through ONE SEL_DELAY-deep shift register so they cannot skew.
+            // Entry layout: [2] = nibble_hi, [1:0] = addr[1:0]; stage 0 is newest.
+            logic [2:0] lane_pipe [0:SEL_DELAY-1];
+            always_ff @(posedge clk or negedge rst_n) begin
+                if (!rst_n) begin
+                    for (int s = 0; s < SEL_DELAY; s++)
+                        lane_pipe[s] <= 3'd0;
+                end else begin
+                    lane_pipe[0] <= {nibble_hi, addr[1:0]};
+                    for (int s = 1; s < SEL_DELAY; s++)
+                        lane_pipe[s] <= lane_pipe[s-1];
+                end
+            end
+            // The oldest stage is the one re-paired with the in-flight word.
+            assign sel_lo  = lane_pipe[SEL_DELAY-1][1:0];
+            assign nib_sel = lane_pipe[SEL_DELAY-1][2];
+        end
+    endgenerate
+
+    // Byte-lane mux (shared by PSMT8 and PSMT4): sel_lo chooses which byte of
+    // the fetched word is this texel; lane 0 is bits [7:0], lane 3 is [31:24].
+    logic [7:0] sel_byte;
+    always_comb begin
+        if      (sel_lo == 2'b00) sel_byte = tex_rd_data[ 7: 0];
+        else if (sel_lo == 2'b01) sel_byte = tex_rd_data[15: 8];
+        else if (sel_lo == 2'b10) sel_byte = tex_rd_data[23:16];
+        else                      sel_byte = tex_rd_data[31:24];
+    end
+
+    // PSMT4 nibble mux: the selected byte packs two 4-bit indices and nib_sel
+    // picks the high (1) or low (0) one. The nibble is zero-extended to 8 bits
+    // so it shares the clut_rd_idx port (CLUT entries 0..15). iverilog-12 has
+    // no bit-select on a parenthesized expr, hence the named copy of the byte.
+    logic [7:0] sel_byte_for_nib;
+    logic [3:0] psmt4_nibble;
+    assign sel_byte_for_nib = sel_byte;
+    assign psmt4_nibble     = nib_sel ? sel_byte_for_nib[7:4]
+                                      : sel_byte_for_nib[3:0];
+
+    // Index out: PSMT4 -> zero-extended nibble; every other psm -> full byte.
+    assign clut_rd_idx = (psm == PSM_PSMT4) ? {4'd0, psmt4_nibble} : sel_byte;
+
+    // --- valid pipeline matching the read latency ---
+    // in_valid travels with the address; the matching tex_rd_data returns
+    // RD_LATENCY cycles later, so in_valid is shifted through RD_LATENCY flops
+    // (bit 0 newest, bit RD_LATENCY-1 aligned with the returned word).
+    // Assumes RD_LATENCY >= 1.
+    logic [RD_LATENCY-1:0] valid_pipe;
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            valid_pipe <= '0;
+        end else begin
+            valid_pipe[0] <= in_valid;
+            // Runs zero times when RD_LATENCY == 1, so no special case.
+            for (int stage = 1; stage < RD_LATENCY; stage++)
+                valid_pipe[stage] <= valid_pipe[stage-1];
+        end
+    end
+
+    logic        near_out_valid;
+    assign near_out_valid = valid_pipe[RD_LATENCY-1];
+
+    // --- decode (DECAL) ---
+    // Indexed formats resolve through the CLUT; anything else is direct color:
+    //   PSMT4 / PSMT8 : texel color = clut_rd_data (CLUT[clut_rd_idx])
+    //   PSMCT32       : the fetched word IS the color (byte-identical to v1)
+    // Any psm other than PSMT4/PSMT8 takes the direct tex_rd_data arm.
+    logic [31:0] near_color;
+    assign near_color = (psm == PSM_PSMT4 || psm == PSM_PSMT8)
+                      ? clut_rd_data : tex_rd_data;
+
+    // ========================================================================
+    // Ch308 — BILINEAR (4-tap) PSMCT32 FILTER
+    // ========================================================================
+    // When BILINEAR_ENABLE=1 and psm==PSMCT32 we sample the 4 texels around the
+    // fractional coord and blend them. The whole block is wrapped in a generate
+    // that is empty when BILINEAR_ENABLE=0, so the default build is pruned to
+    // exactly the nearest path and is BYTE-IDENTICAL.
+    //
+    // CYCLE SCHEDULE (RD_LATENCY-aware; example RD_LATENCY=L):
+    //   T0           : caller asserts in_valid (with u,v,u_frac,v_frac). FSM in
+    //                  IDLE latches u/v/frac, sets beat index k=0, drives
+    //                  bili_active=1, busy=1, moves to ISSUE.
+    //   T0+ (ISSUE)  : present neighbor[k] coord (beat_u/beat_v -> wrap ->
+    //                  gs_texel_addr -> tex_rd_addr) and pulse tex_rd_en for 1
+    //                  cycle; start an L-cycle wait; -> WAIT.
+    //   ISSUE+1..+L  : WAIT counts L cycles; on the L-th cycle tex_rd_data holds
+    //                  beat[k]'s 32-bit ABGR word -> capture into tap[k].
+    //                  If k<3: k++ and -> ISSUE (next neighbor). If k==3: -> DONE.
+    //   DONE         : combinationally lerp the 4 captured taps by u_frac/v_frac
+    //                  per channel; assert out_valid for 1 cycle with tex_color;
+    //                  drop busy; -> IDLE.
+    //   => total ~ 4*(1+L)+1 cycles per filtered sample. Throughput is NOT a
+    //      goal here (a later texture-cache pass collapses the 4 reads).
+    //
+    // Neighbor table (k -> du,dv): 0->(0,0) 1->(1,0) 2->(0,1) 3->(1,1).
+    // Each neighbor coord is fed through the SAME u_eff/v_eff wrap (via
+    // u_in/v_in above) so edge taps repeat/clamp and never read outside the
+    // texture (proven in the TB clamp/repeat cases).
+    //
+    // lerp(a,b,f) = a + (($signed({1'b0,b}) - $signed({1'b0,a})) * $signed({1'b0,f})) >>> 4
+    //   with f the 4-bit frac (0..15 => /16). a,b are 8-bit channels. The
+    //   bracketed product is computed in a SIGNED temp (no bit-select on a
+    //   parenthesized expr — iverilog-12 rule), then arithmetic-shifted >>>4,
+    //   then defensively clamped to 0..255.
+    generate
+    if (BILINEAR_ENABLE) begin : g_bilinear
+        localparam logic [1:0] BS_IDLE  = 2'd0;
+        localparam logic [1:0] BS_ISSUE = 2'd1;
+        localparam logic [1:0] BS_WAIT  = 2'd2;
+        localparam logic [1:0] BS_DONE  = 2'd3;
+
+        logic [1:0]  state;
+        logic [1:0]  beat;               // which neighbor 0..3
+        logic [31:0] wait_cnt;           // counts RD_LATENCY
+        logic [31:0] tap [0:3];          // captured ABGR per neighbor
+        logic [10:0] lat_u, lat_v;       // latched coord for this sample
+        logic [3:0]  lat_uf, lat_vf;     // latched fracs
+
+        // psm eligibility for the 4-tap path: PSMCT32 always, and (Ch314)
+        // PSMT8/PSMT4 when PALETTE_BILINEAR=1; any other psm falls back to the
+        // nearest path even with BILINEAR_ENABLE=1.
+        logic        is_ct32;
+        logic        is_indexed;
+        logic        bili_psm_ok;
+        assign is_ct32     = (psm == PSM_PSMCT32);
+        assign is_indexed  = (psm == PSM_PSMT8) || (psm == PSM_PSMT4);
+        assign bili_psm_ok = is_ct32 || (PALETTE_BILINEAR && is_indexed);
+
+        // Ch310 — RUNTIME filter gate. The 4-tap path runs ONLY when the psm is
+        // eligible (bili_psm_ok above) AND the primitive selected LINEAR
+        // magnification (filter_lin=1, i.e. TEX1.MMAG=1). With filter_lin=0
+        // (NEAREST) we fall back to the single-read nearest path even with
+        // BILINEAR_ENABLE=1, so an MMAG=0 primitive stays nearest. `do_lin` is the
+        // single predicate that selects the bilinear datapath everywhere below.
+        //
+        // NOTE on the `!== 1'b0` test: it makes filter_lin DEFAULT-ON when the
+        // port is left UNCONNECTED (sim Z). The standalone tb_gs_texture_bilinear
+        // exercises the 4-tap path directly without driving filter_lin, so an
+        // unconnected input must keep bilinear running (Z !== 0 → true). A
+        // driven 0 (gs_stub MMAG=0) gives nearest; a driven 1 gives bilinear.
+        // In synthesis filter_lin is always driven by gs_stub, so this reduces
+        // to a plain `bili_psm_ok && filter_lin`.
+        logic        do_lin;
+        assign do_lin = bili_psm_ok && (filter_lin !== 1'b0);
+
+        // bili_active (read by the wrap mux above): equal to do_lin, so it is
+        // high for as long as the filtered path is selected — in IDLE too, not
+        // only ISSUE/WAIT/DONE — and the wrap then consumes beat_u/beat_v. When
+        // do_lin=0 it is low so the wrap uses the port u/v (nearest),
+        // byte-identical to the non-bilinear coord path.
+        assign bili_active = do_lin;
+
+        // neighbor delta for the current beat: 0->(0,0) 1->(1,0) 2->(0,1) 3->(1,1)
+        logic [10:0] du, dv;
+        always_comb begin
+            unique case (beat)
+                2'd0: begin du = 11'd0; dv = 11'd0; end
+                2'd1: begin du = 11'd1; dv = 11'd0; end
+                2'd2: begin du = 11'd0; dv = 11'd1; end
+                default: begin du = 11'd1; dv = 11'd1; end
+            endcase
+        end
+        // beat coord feeds the wrap (u_in/v_in): live ports in IDLE, latched
+        // coord from ISSUE on. The IDLE value is never fetched on the filtered
+        // path (bi_rd_en is ISSUE-only), so only the latched branch issues reads.
+        always_comb begin
+            if (state == BS_IDLE) begin
+                beat_u = u   + du;       // beat is 0 after reset, 3 after a sample (DONE leaves it)
+                beat_v = v   + dv;
+            end else begin
+                beat_u = lat_u + du;
+                beat_v = lat_v + dv;
+            end
+        end
+
+        // The bilinear read address reuses the SAME addr-gen (gs_texel_addr via
+        // the u_eff/v_eff wrap fed by beat_u/beat_v). near_rd_addr already is
+        // addr with its low 2 bits cleared for the currently-selected coord, so
+        // it is word-aligned for every psm. We pulse rd_en only on ISSUE.
+        logic        bi_rd_en;
+        assign bi_rd_en = (state == BS_ISSUE);
+
+        always_ff @(posedge clk or negedge rst_n) begin
+            if (!rst_n) begin
+                state    <= BS_IDLE;
+                beat     <= 2'd0;
+                wait_cnt <= 32'd0;
+                lat_u    <= 11'd0;  lat_v  <= 11'd0;
+                lat_uf   <= 4'd0;   lat_vf <= 4'd0;
+                for (int i = 0; i < 4; i++) tap[i] <= 32'd0;
+            end else begin
+                unique case (state)
+                    BS_IDLE: begin
+                        if (in_valid && do_lin) begin   // in_valid is sampled in IDLE only
+                            lat_u  <= u;   lat_v  <= v;
+                            lat_uf <= u_frac; lat_vf <= v_frac;
+                            beat   <= 2'd0;
+                            state  <= BS_ISSUE;
+                        end
+                    end
+                    BS_ISSUE: begin
+                        // address presented this cycle (combinationally via
+                        // beat -> beat_u/beat_v -> wrap -> addr). Begin the
+                        // RD_LATENCY wait.
+                        wait_cnt <= 32'd1;
+                        state    <= BS_WAIT;
+                    end
+                    BS_WAIT: begin
+                        if (wait_cnt >= RD_LATENCY[31:0]) begin
+                            // tex_rd_data now holds beat's word. Capture the
+                            // resolved COLOR (`near_color`): for PSMCT32 that is
+                            // the raw word (byte-identical to the original);
+                            // for PSMT8/PSMT4 (Ch314) it is clut_rd_data — the
+                            // index extracted from this beat's word (sel_byte /
+                            // psmt4_nibble, stable across the held beat) then CLUT'd.
+                            // Capturing the CLUT'd color per tap is what makes the
+                            // downstream lerp interpolate COLORS, not indices.
+                            tap[beat] <= near_color;
+                            if (beat == 2'd3) begin
+                                state <= BS_DONE;
+                            end else begin
+                                beat  <= beat + 2'd1;
+                                state <= BS_ISSUE;
+                            end
+                        end else begin
+                            wait_cnt <= wait_cnt + 32'd1;
+                        end
+                    end
+                    default: begin // BS_DONE: one cycle, then IDLE; beat is left at 3
+                        state <= BS_IDLE;
+                    end
+                endcase
+            end
+        end
+
+        // --- 4-tap blend (combinational, on the captured taps) ---
+        // Tap word layout: [31:24]=A [23:16]=B [15:8]=G [7:0]=R (ABGR8888).
+        // tap0=(u,v) tap1=(u+1,v) tap2=(u,v+1) tap3=(u+1,v+1).
+        function automatic logic [7:0] lerp8(input logic [7:0] a,
+                                             input logic [7:0] b,
+                                             input logic [3:0] f);
+            logic signed [21:0] a_ext;    // a, zero-extended into the signed work width
+            logic signed [21:0] delta;    // b - a, range -255..255
+            logic signed [21:0] scaled;   // delta*f, then >>> 4 (rounds toward -inf)
+            logic signed [21:0] mixed;    // a + scaled
+            begin
+                a_ext  = $signed({14'd0, a});
+                delta  = $signed({14'd0, b}) - a_ext;
+                scaled = delta * $signed({18'd0, f});
+                scaled = scaled >>> 4;
+                mixed  = a_ext + scaled;
+                lerp8  = mixed[7:0];                    // in-range result
+                if (mixed > 22'sd255) lerp8 = 8'd255;   // defensive clamp, high side
+                if (mixed < 0)        lerp8 = 8'd0;     // defensive clamp, low side
+            end
+        endfunction
+
+        // per-channel taps: the four byte lanes of each captured tap word
+        logic [7:0] t0_r, t0_g, t0_b, t0_a;
+        logic [7:0] t1_r, t1_g, t1_b, t1_a;
+        logic [7:0] t2_r, t2_g, t2_b, t2_a;
+        logic [7:0] t3_r, t3_g, t3_b, t3_a;
+        assign t0_r = tap[0][ 7: 0]; assign t0_g = tap[0][15: 8];
+        assign t0_b = tap[0][23:16]; assign t0_a = tap[0][31:24];
+        assign t1_r = tap[1][ 7: 0]; assign t1_g = tap[1][15: 8];
+        assign t1_b = tap[1][23:16]; assign t1_a = tap[1][31:24];
+        assign t2_r = tap[2][ 7: 0]; assign t2_g = tap[2][15: 8];
+        assign t2_b = tap[2][23:16]; assign t2_a = tap[2][31:24];
+        assign t3_r = tap[3][ 7: 0]; assign t3_g = tap[3][15: 8];
+        assign t3_b = tap[3][23:16]; assign t3_a = tap[3][31:24];
+
+        // top = lerp(tap0,tap1,uf); bot = lerp(tap2,tap3,uf); out = lerp(top,bot,vf)
+        logic [7:0] top_r, top_g, top_b, top_a;
+        logic [7:0] bot_r, bot_g, bot_b, bot_a;
+        logic [7:0] cv_r,  cv_g,  cv_b,  cv_a;
+        always_comb begin
+            top_r = lerp8(t0_r, t1_r, lat_uf);   // row v: horizontal blend by the latched u frac
+            top_g = lerp8(t0_g, t1_g, lat_uf);
+            top_b = lerp8(t0_b, t1_b, lat_uf);
+            top_a = lerp8(t0_a, t1_a, lat_uf);
+            bot_r = lerp8(t2_r, t3_r, lat_uf);   // row v+1: same horizontal blend
+            bot_g = lerp8(t2_g, t3_g, lat_uf);
+            bot_b = lerp8(t2_b, t3_b, lat_uf);
+            bot_a = lerp8(t2_a, t3_a, lat_uf);
+            cv_r  = lerp8(top_r, bot_r, lat_vf); // vertical blend of the two rows by the latched v frac
+            cv_g  = lerp8(top_g, bot_g, lat_vf);
+            cv_b  = lerp8(top_b, bot_b, lat_vf);
+            cv_a  = lerp8(top_a, bot_a, lat_vf);
+        end
+
+        // Ch310 — HOLD register for the filtered color. The combined-renderer
+        // FSM (gs_stub CB_TWAIT) may latch the result a cycle or two AFTER the
+        // out_valid pulse (it steps at half-rate on z_advance beats), so the
+        // blended ABGR must stay STABLE from out_valid until the next sample.
+        // tex_color is the LIVE combinational blend during DONE (so an
+        // out_valid-keyed caller — tb_gs_texture_bilinear — reads the fresh
+        // value the SAME cycle out_valid pulses, byte-identical to before) and
+        // the LATCHED copy afterward (so a caller that reads one+ cycles later,
+        // like CB_TWAIT→CB_T, still sees it). The register captures the blend
+        // on the clk edge that LEAVES DONE; combining "live during DONE, held
+        // after" gives a value stable from out_valid until the next sample
+        // overwrites it at its DONE.
+        logic [31:0] tex_color_blend;
+        assign tex_color_blend = {cv_a, cv_b, cv_g, cv_r};
+        logic [31:0] tex_color_hold;
+        always_ff @(posedge clk or negedge rst_n) begin
+            if (!rst_n)
+                tex_color_hold <= 32'd0;
+            else if (state == BS_DONE)
+                tex_color_hold <= tex_color_blend;   // capture the just-blended value
+        end
+        // live during the DONE pulse, held (last captured) otherwise
+        logic [31:0] tex_color_lin;
+        assign tex_color_lin = (state == BS_DONE) ? tex_color_blend : tex_color_hold;
+
+        // --- output mux: bilinear FSM owns the outputs for a FILTERED sample
+        // (do_lin). When do_lin=0 — psm not eligible OR MMAG=0 NEAREST — we
+        // transparently fall back to the nearest single-read path so unfiltered
+        // PSMT8/PSMT4/swizzle and nearest PSMCT32 still work with
+        // BILINEAR_ENABLE=1, and busy stays 0 there.
+        // tex_color: live blend during DONE, the held copy afterwards (see above).
+        assign tex_rd_en   = do_lin ? bi_rd_en      : near_rd_en;
+        // tex_rd_addr is the SAME addr-gen output for both paths (the wrap
+        // selects beat_u/beat_v vs port u/v); the FSM just gates rd_en.
+        assign tex_rd_addr = near_rd_addr;
+        assign out_valid   = do_lin ? (state == BS_DONE) : near_out_valid;
+        assign tex_color   = do_lin ? tex_color_lin : near_color;
+        assign busy        = do_lin && (state != BS_IDLE);
+    end else begin : g_nearest
+        // BYTE-IDENTICAL nearest path: outputs are exactly the original assigns.
+        assign bili_active = 1'b0;       // constant -> wrap uses port u/v
+        assign beat_u      = 11'd0;      // unused (pruned)
+        assign beat_v      = 11'd0;
+        assign tex_rd_en   = near_rd_en;
+        assign tex_rd_addr = near_rd_addr;
+        assign out_valid   = near_out_valid;
+        assign tex_color   = near_color;
+        assign busy        = 1'b0;
+    end
+    endgenerate
+
+endmodule : gs_texture_unit
@@ -0,0 +1,48 @@
+// retroDE_ps2 — gs_tile_ram (Ch303)
+//
+// Generic on-chip TILE-LOCAL RAM for the tiled GS renderer: a single-write /
+// single-read scratchpad sized to one render tile (e.g. 16x16 = 256 entries).
+// Instantiated TWICE by the tile renderer — once as the color tile, once as the
+// Z tile — so a combined textured+alpha+depth pixel resolves its color/Z
+// read-modify-write entirely ON CHIP (per docs/decisions/0008 §6), with only the
+// texture fetch and the per-tile flush crossing to VRAM/LPDDR.
+//
+// Contract (matches vram_bram_stub.read2 so the raster FSM retarget is minimal):
+//   - 1 write port (we/waddr/wdata), committed this cycle.
+//   - 1 read port (raddr), data REGISTERED → valid ONE cycle later (rdata).
+//   - Same-address read+write in the same cycle is NOT used by the tile renderer
+//     (the FSM reads Z at beat 0 and writes Z at beat 4 of a 5-beat pixel; color
+//     read at beat 2, write at beat 3 — never the same cycle), so no R/W-collision
+//     forwarding is needed; this stays a plain 1W1R inferred BRAM.
+//
+// The memory is NOT reset (BRAM-friendly); the renderer's CLEAR phase initializes
+// every entry (color=clear color, Z=clear/far Z) before the first primitive.
+
+`timescale 1ns/1ps
+
+module gs_tile_ram #(
+    parameter int ADDR_W = 8,    // address bits; depth = 2**ADDR_W (8 -> 256 = one 16x16 tile)
+    parameter int DATA_W = 32    // bits per entry
+) (
+    input  logic              clk,
+    input  logic              rst_n,   // synchronous, active-low
+    // write port: taken on the clk edge when we=1; ignored while rst_n=0
+    input  logic              we,
+    input  logic [ADDR_W-1:0] waddr,
+    input  logic [DATA_W-1:0] wdata,
+    // read port: rdata is registered, valid one clk edge after raddr
+    input  logic [ADDR_W-1:0] raddr,
+    output logic [DATA_W-1:0] rdata
+);
+    localparam int DEPTH = 1 << ADDR_W;
+    logic [DATA_W-1:0] mem [0:DEPTH-1];    // storage itself is never reset
+    always_ff @(posedge clk) begin : p_tile_ram
+        if (!rst_n) begin
+            rdata <= '0;                   // reset clears only the read register
+        end else begin
+            rdata <= mem[raddr];           // registered read; sees the pre-write entry
+            if (we) mem[waddr] <= wdata;
+        end
+    end
+
+endmodule : gs_tile_ram
@@ -0,0 +1,212 @@
+// ============================================================================
+// gs_tile_reload.sv  (Ch323 Brick 2 — tile color+Z reload staging engine)
+//
+// The reload counterpart to the GS tile-flush writers, and a DIRECT structural
+// clone of the silicon-proven gs_texture_cache (Ch322): an emif_clk fill FSM that
+// reads a tile's worth of color+Z from FPGA-private LPDDR4B into on-chip staging
+// RAMs, plus a design_clk serve port that returns one (color,Z) per tile index at
+// the existing 1-cycle latency. gs_stub's TP_RELOAD phase sweeps the serve port and
+// writes the tile color/Z RAMs before rendering. Same CDC shape as gs_texture_cache
+// (one-shot warm fill, fill_done 2-FF synced into the serve clock) — NOT a new CDC.
+//
+// SEPARATE LPDDR bases (Codex): COLOR_BASE (the color framebuffer) and Z_BASE (the
+// Z-backing region) are distinct. A 16x16 tile lives at FB stride STRIDE_BYTES per
+// row (FBW*64*4 = 256 for FBW=1), so the fill reads ROW_BEATS 256-bit beats per row
+// from each base — sparse/strided, exactly like the Ch322 texture (which read a
+// 64-texel-stride region). Single-beat reads (arlen=0, the only proven EMIF pattern).
+//
+// Counters (Codex): color_beats, z_beats, rd_errs — all distinct.
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_tile_reload #(
+    parameter [29:0] COLOR_BASE   = 30'd0,        // LPDDR byte base of the color framebuffer
+    parameter [29:0] Z_BASE       = 30'h0010_0000,// LPDDR byte base of the Z-backing (DISTINCT)
+    parameter int    TILE_W       = 16,
+    parameter int    TILE_H       = 16,
+    parameter int    STRIDE_BYTES = 256,          // FB row stride (FBW*64 px * 4 B = 256 for FBW=1)
+    parameter int    ROW_BEATS    = 2,            // 16 words/row * 4 B / 32 B = 2 single-beat reads
+    parameter int    COLOR_W      = 32
+)(
+    // ---- AXI read clock domain (emif_clk) — fill side ----
+    input  logic         axi_clk,
+    input  logic         axi_rst_n,
+    input  logic         reload_start,  // STROBE (gs/serve domain, CDC-synced): each RISING edge (re)fills
+    // Ch324 — RUNTIME per-tile byte offset into the raster LPDDR framebuffer. Latched at the fill
+    // arm (fs_edge) so it is stable for the whole fill. = ((tile_oy*(FBW*64)) + tile_ox)*4, the SAME
+    // formula the flush side uses, so reload gathers exactly the tile the spill wrote. 0 = origin
+    // tile (byte-identical to the Ch323 single-tile path). Quasi-static: gs_stub holds the current
+    // tile constant across TP_RELOAD, so sampling it at the synced strobe needs no extra CDC.
+    input  logic [29:0]  reload_base,
+    output logic         reload_done,   // tile fully resident (until the next fill arm)
+    output logic [31:0]  color_beats,   // color beats read (cumulative)
+    output logic [31:0]  z_beats,       // Z beats read (cumulative)
+    output logic [31:0]  rd_errs,       // non-OKAY read responses (cumulative)
+
+    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
+    output logic [29:0]  araddr,
+    output logic [1:0]   arburst,
+    output logic [6:0]   arid,
+    output logic [7:0]   arlen,
+    output logic [2:0]   arsize,
+    output logic         arvalid,
+    input  logic         arready,
+    input  logic [255:0] rdata,
+    input  logic [1:0]   rresp,
+    input  logic         rlast,
+    input  logic         rvalid,
+    output logic         rready,
+
+    // ---- serve clock domain (design_clk) — gs_stub TP_RELOAD reads this ----
+    input  logic         serve_clk,
+    input  logic [7:0]   raddr,         // tile index (row*16 + col), 0..255
+    output logic [COLOR_W-1:0] color_o, // 1-cycle REGISTERED color for raddr
+    output logic [31:0]  z_o,           // 1-cycle REGISTERED Z for raddr
+    output logic         reload_ready   // reload_done synced into serve_clk (TP_RELOAD ready gate)
+);
+    localparam int N_ENTRIES = TILE_W*TILE_H;     // 256
+    localparam int N_ROWS    = TILE_H;            // 16
+    localparam int WORDS_ROW = TILE_W;            // 16 words/row
+
+    assign arburst = 2'b01;   // INCR
+    assign arid    = 7'd6;    // distinct: writer=0/rd-probe=1/fcache=2/linebuf=3/texfill=4/wr-probe=5/tile-reload=6
+    assign arlen   = 8'd0;    // single-beat (only proven EMIF read pattern)
+    assign arsize  = 3'b101;  // 32 bytes
+
+    // On-chip staging RAMs: written by the fill FSM (axi_clk), read by gs_stub (serve_clk).
+    // One-shot warm fill => static during reads => no read/write CDC hazard (gs_texture_cache pattern).
+    logic [COLOR_W-1:0] color_ram [0:N_ENTRIES-1];
+    logic [31:0]        z_ram     [0:N_ENTRIES-1];
+
+    // ================= fill side (axi_clk) =================
+    // For each of N_ROWS rows, read ROW_BEATS color beats then ROW_BEATS Z beats. Each 256-bit
+    // beat = 8 words; WORDS_ROW=16 spans ROW_BEATS=2 beats. Store the row's 16 words into the
+    // staging RAM at indices row*16 + (0..15).
+    typedef enum logic [2:0] { R_IDLE, R_C_AR, R_C_R, R_C_W, R_Z_AR, R_Z_R, R_Z_W, R_DONE } rstate_t;
+    rstate_t rst_q;
+    logic [$clog2(N_ROWS):0]   row;
+    logic [$clog2(ROW_BEATS):0] beat;
+    logic [2:0]                 lane;   // serialized unpack lane 0..7 — ONE RAM write/cycle (M20K, not an 8-wide reg file)
+    logic [255:0]               beat_q; // latched 256-bit beat, drained one 32-bit lane per cycle
+    logic [29:0]                base_q; // Ch324 — per-tile byte offset latched at fill arm (stable across the fill)
+    logic [2:0] fs_sync;
+    // reload_start is a STROBE (gs_stub pulses it once per tile reload): trigger on the
+    // RISING edge only — one pulse => exactly one fill. (Was an any-edge toggle, which made
+    // a pulse trigger TWO fills; harmless but wasteful and confusing.)
+    wire        fs_edge = fs_sync[1] & ~fs_sync[2];
+
+    function automatic [29:0] row_base(input [29:0] base, input int r);  // byte address of row r above base
+        row_base = base + r*STRIDE_BYTES;   // result is truncated to the 30-bit return width
+    endfunction
+
+    // SINGLE write port per RAM (one index, one data, per clock) so Quartus infers M20K
+    // instead of the 8-wide register file the old parallel beat-unpack forced (~8.7K ALMs).
+    // wa = row*WORDS_ROW + beat*8 + lane. Uses the CURRENT row/beat/lane; the lane==7 branch
+    // updates row/beat non-blockingly, so this cycle's write still targets the right entry.
+    wire [$clog2(N_ENTRIES)-1:0] wa = row[$clog2(N_ROWS)-1:0]*WORDS_ROW
+                                    + beat[$clog2(ROW_BEATS)-1:0]*8 + lane;
+
+    // Fill sequencer (axi_clk). One fill walks rows 0..N_ROWS-1; for each row it fetches
+    // ROW_BEATS color beats and then ROW_BEATS Z beats. Every beat is handled strictly in
+    // sequence: AR handshake (R_x_AR) -> one R beat latched into beat_q (R_x_R) -> eight
+    // write cycles, one 32-bit lane per clock (R_x_W). arvalid is not raised again until the
+    // previous beat has been fully drained, so at most one read is outstanding.
+    //
+    // Reset is synchronous. beat_q and the two RAMs are not reset.
+    // A reload_start edge is only looked at in R_IDLE / R_DONE; an edge that arrives while a
+    // fill is running is ignored by this FSM.
+    always_ff @(posedge axi_clk) begin
+        if (!axi_rst_n) begin
+            rst_q <= R_IDLE; araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
+            row <= '0; beat <= '0; lane <= '0; reload_done <= 1'b0; base_q <= 30'd0;
+            color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0; fs_sync <= 3'd0;
+        end else begin
+            // Shift reload_start into the 3-bit chain that fs_edge is derived from.
+            fs_sync <= {fs_sync[1:0], reload_start};
+            case (rst_q)
+                R_IDLE, R_DONE: begin
+                    if (fs_edge) begin
+                        // New fill: drop done, zero the per-fill counters and indices.
+                        reload_done <= 1'b0; color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0;
+                        row <= '0; beat <= '0; lane <= '0;
+                        // NOTE(review): reload_base is sampled here without its own synchronizer —
+                        // presumably the caller holds it stable around reload_start; confirm.
+                        base_q  <= reload_base;              // latch this tile's offset for the whole fill
+                        araddr  <= COLOR_BASE + reload_base; // row 0 color of THIS tile
+                        arvalid <= 1'b1;
+                        rst_q   <= R_C_AR;
+                    end
+                end
+                R_C_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_C_R; end
+                R_C_R:  if (rvalid) begin            // latch the beat; drain it serially in R_C_W
+                    beat_q <= rdata;
+                    // A non-OKAY response is only counted; the beat is still written to the RAM.
+                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
+                    rready <= 1'b0; color_beats <= color_beats + 32'd1;
+                    lane   <= '0;
+                    rst_q  <= R_C_W;
+                end
+                R_C_W: begin                         // 8 cycles: one 32-bit lane -> color_ram per clock
+                    color_ram[wa] <= beat_q[lane*32 +: 32];
+                    if (lane == 3'd7) begin
+                        if (beat == ROW_BEATS-1) begin   // color row done -> Z row
+                            beat    <= '0;
+                            araddr  <= row_base(Z_BASE + base_q, row);
+                            arvalid <= 1'b1;
+                            rst_q   <= R_Z_AR;
+                        end else begin
+                            // Next beat of the same color row: 32 bytes further on.
+                            beat    <= beat + 1'b1;
+                            araddr  <= araddr + 30'd32;
+                            arvalid <= 1'b1;
+                            rst_q   <= R_C_AR;
+                        end
+                    end else lane <= lane + 1'b1;
+                end
+                R_Z_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_Z_R; end
+                R_Z_R:  if (rvalid) begin
+                    beat_q <= rdata;
+                    // Same policy as the color path: count the error, keep the data.
+                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
+                    rready <= 1'b0; z_beats <= z_beats + 32'd1;
+                    lane   <= '0;
+                    rst_q  <= R_Z_W;
+                end
+                R_Z_W: begin                         // 8 cycles: one 32-bit lane -> z_ram per clock
+                    z_ram[wa] <= beat_q[lane*32 +: 32];
+                    if (lane == 3'd7) begin
+                        if (beat == ROW_BEATS-1) begin   // Z row done -> next row (or finish).
+                            // reload_done stays LOW until THIS final-row final-Z-lane write.
+                            if (row == N_ROWS-1) begin
+                                reload_done <= 1'b1;
+                                rst_q       <= R_DONE;
+                            end else begin
+                                // Advance to the color data of the next row.
+                                row     <= row + 1'b1;
+                                beat    <= '0;
+                                araddr  <= row_base(COLOR_BASE + base_q, row + 1);
+                                arvalid <= 1'b1;
+                                rst_q   <= R_C_AR;
+                            end
+                        end else begin
+                            // Next beat of the same Z row: 32 bytes further on.
+                            beat    <= beat + 1'b1;
+                            araddr  <= araddr + 30'd32;
+                            arvalid <= 1'b1;
+                            rst_q   <= R_Z_AR;
+                        end
+                    end else lane <= lane + 1'b1;
+                end
+                default: rst_q <= R_IDLE;
+            endcase
+        end
+    end
+
+    // ================= serve side (serve_clk) =================
+    // 1-cycle REGISTERED read, identical timing to the tile RAM / vram read2.
+    // Both RAMs are read unconditionally at raddr on every serve_clk edge (no read enable,
+    // no reset on the output registers). They are written on axi_clk by the fill FSM, so
+    // each RAM has its write port and its read port on different clocks.
+    always_ff @(posedge serve_clk) begin
+        color_o <= color_ram[raddr];
+        z_o     <= z_ram[raddr];
+    end
+    // reload_ready handshake (Ch323 fix): a fresh reload_start MUST drop ready immediately,
+    // and ready re-raises only when THIS fill completes. Without this, a back-to-back reload
+    // (two tile batches) sees ready still high from the PREVIOUS fill and gs_stub sweeps the
+    // stale (pre-fill) z_ram before the new fill populates it — the reloaded Z is lost (the
+    // board's "region A wrong color" bug; reproduced in tb_gs_tile_spill_lpddr). reload_start
+    // is in the serve_clk (design) domain; reload_done is edge-detected after CDC.
+    //
+    // CDC: reload_done is generated on axi_clk, so it goes through a 3-flop chain and the
+    // rising edge is detected on stages [1]/[2] only — the same shape as fs_sync on the
+    // axi_clk side. Stage [0] samples the asynchronous signal and may go metastable, so it
+    // must not feed combinational logic. (The earlier 2-flop version used done_sync[0] in
+    // done_rise: ready_q and done_sync[1] could then capture different resolved values and
+    // the edge could be missed, leaving ready low.) Cost: ready re-raises one serve_clk
+    // later than before.
+    logic [2:0] done_sync = 3'b000;
+    logic       ready_q   = 1'b0;
+    wire        done_rise = done_sync[1] & ~done_sync[2];
+    always_ff @(posedge serve_clk) begin
+        done_sync <= {done_sync[1:0], reload_done};
+        if      (reload_start) ready_q <= 1'b0;   // new fill armed -> not ready
+        else if (done_rise)    ready_q <= 1'b1;   // this fill completed
+    end
+    // COMBINATIONALLY mask ready low while reload_start is asserted: gs_stub pulses
+    // reload_start and checks ready in the SAME cycle, so the registered clear above lands
+    // one cycle too late — without the mask gs_stub sees the PREVIOUS fill's stale ready=1
+    // and sweeps before this fill populates z_ram (the region-A-wrong-color bug).
+    assign reload_ready = ready_q & ~reload_start;
+endmodule
@@ -0,0 +1,236 @@
+// ============================================================================
+// gs_z_flush_writer.sv  (Ch323 Brick 2 — tile color/Z-flush LPDDR writer; PACKED)
+//
+// Writes a gs_stub tile-flush stream (one 32-bit word per tile pixel on design_clk —
+// either the TP_ZFLUSH Z stream or the TP_FLUSH color stream) to an FPGA-private
+// LPDDR4B scratch region (emif_clk). Used twice in the de25 top: once for Z, once for
+// the 32-bit color spill (the module is generic — it writes the 32-bit `data` at
+// BASE + `addr`).
+//
+// PACKED (Ch323 board fix): the FIRST cut did ONE single-32-bit-lane AXI write PER
+// pixel through a strictly-sequential AW->W->B FSM. The tile sweep emits one pixel per
+// design_clk (256 back-to-back), but each isolated write pays the full LPDDR round-trip
+// latency, so the drain fell far behind the emit rate and the 16-deep async FIFO
+// OVERFLOWED within ~16 px — dropping most of the spilled tile (grey-with-specks on
+// HDMI, spill_ovf=1). The proven framebuffer writer (gs_lpddr_axi_master) avoids this by
+// PACKING pixels into 256-bit beats; this writer now does the same with 32-bit lanes:
+//
+//   design_clk : PACKER — accumulate 8 consecutive 32-bit pixels of a tile-row into one
+//                256-bit (32-byte) beat {block_off, data, strb}, keyed by the 32-byte
+//                block address (addr[29:5]). A 16-px tile-row is exactly two 32-byte-
+//                aligned blocks, so each beat completes naturally on its 8th px (no
+//                dangling partial); a block-address change flushes the in-flight beat.
+//                One FIFO push per 8 px => 8x fewer AXI writes => the sequential drain
+//                keeps up with the same small FIFO.
+//   async FIFO : gray-code CDC, carries {block_off[29:0], data[255:0], strb[31:0]}.
+//   emif_clk   : AXI FSM — pop a beat, issue a single-beat INCR write (AWSIZE=5 = 32 B,
+//                AWLEN=0, full WSTRB on the populated lanes) at BASE + block_off.
+//
+// The packed beats land at exactly the offsets gs_tile_reload reads back (row r at
+// BASE + r*STRIDE, two 32-byte beats), so the reload side is unchanged.
+//
+// SEPARATE base (Codex): BASE is distinct from the color FB and the other scratch
+// region. A synthesis-off CANARY asserts no beat lands inside the canary-guard regions.
+//
+// Counters (Codex, distinct per instance): z_write_beats (256-bit beats written),
+// z_wr_errs (non-OKAY responses), fifo_overflow (sticky).
+//
+// NOTE (parity with gs_lpddr_axi_master): assumes the flush stream produces FULL 8-lane
+// beats (true for a tile width that is a multiple of 8 — the 16-wide spill tile). A
+// trailing partial beat at end-of-stream is NOT flushed.
+// ============================================================================
+`timescale 1ns/1ps
+
+module gs_z_flush_writer #(
+    parameter [29:0] Z_BASE     = 30'h0010_0000,  // LPDDR byte base of this scratch region (DISTINCT)
+    parameter [29:0] FB_BASE    = 30'd0,          // color framebuffer base   (canary guard)
+    parameter int    FB_BYTES   = 32'h0001_0000,  // color framebuffer size   (canary guard)
+    parameter [29:0] TEX_BASE   = 30'h0020_0000,  // other scratch base       (canary guard)
+    parameter int    TEX_BYTES  = 32'h0000_8000,  // other scratch size       (canary guard)
+    parameter int    FIFO_DEPTH = 16
+)(
+    // ---- GS / design clock domain: the flush emit stream ----
+    input  logic         gs_clk,
+    input  logic         gs_rst_n,
+    input  logic         enable,        // 1 = accept emits (default off => inert)
+    input  logic         z_flush_emit,  // one pulse per tile pixel
+    input  logic [31:0]  z_flush_addr,  // scratch-RELATIVE byte offset (pixel_index*4)
+    input  logic [31:0]  z_flush_data,  // 32-bit word for this pixel (Z or color)
+
+    // ---- status (emif_clk domain unless noted) ----
+    output logic [31:0]  z_write_beats, // 256-bit beats written (cumulative)
+    output logic [31:0]  z_wr_errs,     // non-OKAY write responses (cumulative)
+    output logic         fifo_overflow, // sticky (gs domain): an emit dropped (FIFO full)
+    // Pipeline-split counters (Codex): emit/push (GS, reset by gs_rst_n=per-render core reset) and
+    // pop/beats (EMIF, reset by trace_clear) localize any spill divergence: healthy = 512/64/64/64;
+    // push>64 = packer partial beats; pop/beats>push = FIFO/reset broken; beats!=pop = AXI-FSM bug.
+    // trace_clear is sampled directly in the axi_clk process (no synchronizer inside this module).
+    // NOTE(review): confirm the driver is synchronous to axi_clk.
+    input  logic         trace_clear,     // resets the EMIF-domain counters (beats/pop) per render
+    output logic [31:0]  dbg_beat_count,  // beats committed (B handshakes) since the last trace_clear
+    output logic [31:0]  dbg_emit_count,  // GS:   enable&&z_flush_emit accepted (per render)
+    output logic [31:0]  dbg_push_count,  // GS:   beats pushed into the FIFO (per render)
+    output logic [31:0]  dbg_pop_count,   // EMIF: beats popped from the FIFO (since trace_clear)
+    output logic [31:0]  dbg_aw_count,    // EMIF: AW handshakes (since trace_clear)
+    output logic [31:0]  dbg_w_count,     // EMIF: W  handshakes (since trace_clear)
+
+    // ---- AXI4 write channel to the EMIF user port (emif_clk, 256-bit) ----
+    input  logic         axi_clk,
+    input  logic         axi_rst_n,
+    output logic [29:0]  awaddr,
+    output logic [1:0]   awburst,
+    output logic [6:0]   awid,
+    output logic [7:0]   awlen,
+    output logic [2:0]   awsize,
+    output logic         awvalid,
+    input  logic         awready,
+    output logic [255:0] wdata,
+    output logic [31:0]  wstrb,
+    output logic         wlast,
+    output logic         wvalid,
+    input  logic         wready,
+    input  logic [1:0]   bresp,
+    input  logic         bvalid,
+    output logic         bready
+);
+    // Constant AXI attributes: every transaction is one 32-byte INCR beat, and the write
+    // response is always accepted immediately.
+    assign awburst = 2'b01;   // INCR
+    assign awid    = 7'd6;    // distinct from FB writer (0) and probes; reload also uses 6 (arb priority disambiguates)
+    assign awlen   = 8'd0;    // single beat
+    assign awsize  = 3'b101;  // 32 bytes (256-bit)
+    assign bready  = 1'b1;
+
+    // FIFO payload width: 30 + 256 + 32 bits, packed MSB-first in the order listed.
+    localparam int PW = 318;  // {block_off[29:0], data[255:0], strb[31:0]}
+
+    // ============================ design_clk PACKER ============================
+    // Accumulate 8 consecutive 32-bit pixels into one 256-bit beat keyed by the 32-byte
+    // block address; push a COMPLETE beat to the FIFO (one push per 8 px, not per px).
+    //
+    // State: cur_off/cur_data/cur_strb hold the beat being assembled, has_data says whether
+    // they are valid. fifo_wr is a REGISTERED one-cycle pulse, so a push decided in cycle N
+    // reaches the FIFO (together with fifo_wdata) in cycle N+1. Packer state only changes
+    // on `enable && z_flush_emit`; a partially assembled beat is simply held otherwise.
+    logic [29:0]  cur_off;
+    logic [255:0] cur_data;
+    logic [31:0]  cur_strb;
+    logic         has_data;
+    logic         fifo_wr;
+    logic [PW-1:0] fifo_wdata;
+    wire          fifo_full, fifo_empty;
+    wire [PW-1:0] fifo_rdata;
+    logic         fifo_rd;
+
+    always_ff @(posedge gs_clk or negedge gs_rst_n) begin
+        if (!gs_rst_n) begin
+            cur_off <= '0; cur_data <= '0; cur_strb <= '0; has_data <= 1'b0;
+            fifo_wr <= 1'b0; fifo_wdata <= '0; fifo_overflow <= 1'b0;
+            dbg_emit_count <= 32'd0; dbg_push_count <= 32'd0;
+        end else begin
+            // Default: no push this cycle; overridden below when a beat is flushed.
+            fifo_wr <= 1'b0;
+            if (enable && z_flush_emit) dbg_emit_count <= dbg_emit_count + 32'd1;
+            // Counts pushes the FIFO actually accepts (same qualifier as its wr input).
+            if (fifo_wr && !fifo_full)  dbg_push_count <= dbg_push_count + 32'd1;
+            if (enable && z_flush_emit) begin
+                // block_off/lane/nd/ns are blocking-assigned temporaries for this cycle only.
+                logic [29:0]  block_off;
+                logic [2:0]   lane;          // 0..7 (which 32-bit lane)
+                logic [255:0] nd;
+                logic [31:0]  ns;
+                block_off = {z_flush_addr[29:5], 5'd0};
+                lane      = z_flush_addr[4:2];
+                if (has_data && (block_off != cur_off)) begin
+                    // block changed before the previous beat filled — flush it, restart.
+                    // The flushed beat carries a partial strobe; the new beat starts with
+                    // only this pixel's lane populated.
+                    fifo_wdata <= {cur_off, cur_data, cur_strb};
+                    fifo_wr    <= 1'b1;
+                    cur_off    <= block_off;
+                    cur_data   <= (256'd0 | (256'(z_flush_data) << ({29'd0, lane} * 32)));
+                    cur_strb   <= (32'hF << ({29'd0, lane} * 4));
+                    has_data   <= 1'b1;
+                end else begin
+                    // Same block (or nothing in flight): merge this pixel into its lane.
+                    nd = has_data ? cur_data : 256'd0;
+                    ns = has_data ? cur_strb : 32'd0;
+                    nd[ ({29'd0, lane} * 32) +: 32 ] = z_flush_data;
+                    ns[ ({29'd0, lane} * 4)  +: 4  ] = 4'hF;
+                    if (&ns) begin
+                        // beat complete (all 8 lanes) — flush, beat consumed.
+                        fifo_wdata <= {block_off, nd, ns};
+                        fifo_wr    <= 1'b1;
+                        has_data   <= 1'b0;
+                    end else begin
+                        cur_off  <= block_off;
+                        cur_data <= nd;
+                        cur_strb <= ns;
+                        has_data <= 1'b1;
+                    end
+                end
+            end
+            // overflow witness: a push attempt while the FIFO is full (must stay 0).
+            // The rejected beat is not retried, so its pixels are lost.
+            if (fifo_wr && fifo_full) fifo_overflow <= 1'b1;
+        end
+    end
+
+    // CRITICAL (Ch323 board bug): the async FIFO's two pointers MUST reset together. The
+    // packer side uses gs_rst_n (= core reset, which a CORE_CTRL pulse toggles EVERY render);
+    // the read side uses axi_rst_n (= EMIF cal, power-on only). If wrst_n followed gs_rst_n,
+    // each render's core-reset pulse would reset ONLY the write pointer → gray-code pointer
+    // desync → FIFO corruption (garbage data, spurious overflow, writes that never commit).
+    // Sim missed it (single reset, both sides together). So reset BOTH FIFO sides from the
+    // STABLE axi_rst_n: assert async on axi_rst_n, deassert synchronized into gs_clk.
+    // fifo_wrst_n therefore goes high on the second gs_clk edge after axi_rst_n is released.
+    reg [1:0] wrst_sync;
+    always_ff @(posedge gs_clk or negedge axi_rst_n) begin
+        if (!axi_rst_n) wrst_sync <= 2'b00;
+        else            wrst_sync <= {wrst_sync[0], 1'b1};
+    end
+    wire fifo_wrst_n = wrst_sync[1];
+    // The write strobe is qualified with !fifo_full here, so a full FIFO is never written;
+    // the packer records the dropped beat through fifo_overflow.
+    gs_async_fifo #(.WIDTH(PW), .DEPTH(FIFO_DEPTH)) u_fifo (
+        .wclk(gs_clk), .wrst_n(fifo_wrst_n), .wr(fifo_wr && !fifo_full), .wdata(fifo_wdata), .wfull(fifo_full),
+        .rclk(axi_clk), .rrst_n(axi_rst_n), .rd(fifo_rd), .rdata(fifo_rdata), .rempty(fifo_empty)
+    );
+
+    // ============================ emif_clk AXI FSM ============================
+    // Drains the FIFO one beat at a time with a strictly sequential single-beat write:
+    //   W_IDLE : FIFO non-empty -> latch the head beat, schedule the pop, raise AWVALID
+    //   W_AW   : AW accepted    -> present WDATA/WSTRB/WLAST, raise WVALID
+    //   W_W    : W accepted     -> wait for the response
+    //   W_B    : BVALID         -> count the beat (and any non-OKAY BRESP), back to W_IDLE
+    // fifo_rdata is consumed in the same cycle the pop is scheduled, i.e. before fifo_rd
+    // reaches the FIFO. NOTE(review): this assumes gs_async_fifo presents the head word on
+    // rdata whenever rempty is low (show-ahead) — confirm against gs_async_fifo.
+    wire [29:0]  beat_block = fifo_rdata[PW-1 -: 30];      // block_off[29:0]
+    wire [255:0] beat_data  = fifo_rdata[287:32];
+    wire [31:0]  beat_strb  = fifo_rdata[31:0];
+    wire [29:0]  full_addr  = Z_BASE + beat_block;
+    typedef enum logic [1:0] { W_IDLE, W_AW, W_W, W_B } wstate_t;
+    wstate_t wst;
+    logic [29:0]  lat_addr;   // written in W_IDLE but not read in this module (awaddr is driven directly)
+    logic [255:0] lat_data;
+    logic [31:0]  lat_strb;
+
+    // Per-cycle events feeding the trace_clear-able debug counters.
+    wire ev_pop = fifo_rd;                    // pop strobe reaching the FIFO this cycle
+    wire ev_aw  = awvalid && awready;         // AW handshake
+    wire ev_w   = wvalid  && wready;          // W handshake
+    wire ev_b   = (wst == W_B) && bvalid;     // B response accepted by the FSM
+
+    always_ff @(posedge axi_clk or negedge axi_rst_n) begin
+        if (!axi_rst_n) begin
+            wst <= W_IDLE; awaddr <= '0; awvalid <= 1'b0; wdata <= '0; wstrb <= '0;
+            wlast <= 1'b0; wvalid <= 1'b0; fifo_rd <= 1'b0;
+            z_write_beats <= 32'd0; z_wr_errs <= 32'd0;
+            dbg_beat_count <= 32'd0;
+            dbg_pop_count <= 32'd0; dbg_aw_count <= 32'd0; dbg_w_count <= 32'd0;
+            lat_addr <= '0; lat_data <= '0; lat_strb <= '0;
+        end else begin
+            fifo_rd <= 1'b0;   // default: pop strobe is a one-cycle pulse
+            // Debug counters: trace_clear restarts the count from zero, and an event in the
+            // SAME cycle is still counted (result 1). Previously the increment overrode the
+            // clear, so a coincident event left the counter at stale+1 instead of restarting.
+            dbg_pop_count  <= (trace_clear ? 32'd0 : dbg_pop_count)  + {31'd0, ev_pop};
+            dbg_aw_count   <= (trace_clear ? 32'd0 : dbg_aw_count)   + {31'd0, ev_aw};
+            dbg_w_count    <= (trace_clear ? 32'd0 : dbg_w_count)    + {31'd0, ev_w};
+            dbg_beat_count <= (trace_clear ? 32'd0 : dbg_beat_count) + {31'd0, ev_b};
+            case (wst)
+                W_IDLE: if (!fifo_empty) begin
+                    lat_addr <= full_addr; lat_data <= beat_data; lat_strb <= beat_strb;
+                    fifo_rd  <= 1'b1;                       // pop this beat
+                    awaddr   <= {full_addr[29:5], 5'd0};    // 32-byte aligned
+                    awvalid  <= 1'b1;
+                    wst      <= W_AW;
+                    // synthesis translate_off
+                    if (((full_addr >= FB_BASE)  && (full_addr < FB_BASE  + FB_BYTES[29:0])) ||
+                        ((full_addr >= TEX_BASE) && (full_addr < TEX_BASE + TEX_BYTES[29:0])))
+                        $error("gs_z_flush_writer CANARY: beat addr 0x%07x overlaps a canary-guard region", full_addr);
+                    // synthesis translate_on
+                end
+                W_AW: if (awready) begin
+                    awvalid <= 1'b0; wdata <= lat_data;
+                    wstrb <= lat_strb; wlast <= 1'b1; wvalid <= 1'b1; wst <= W_W;
+                end
+                W_W: if (wready) begin wvalid <= 1'b0; wlast <= 1'b0; wst <= W_B; end
+                W_B: if (bvalid) begin
+                    // Cumulative counters (not affected by trace_clear).
+                    if (bresp != 2'b00) z_wr_errs <= z_wr_errs + 32'd1;
+                    z_write_beats  <= z_write_beats + 32'd1;
+                    wst <= W_IDLE;
+                end
+                default: wst <= W_IDLE;
+            endcase
+        end
+    end
+endmodule
@@ -0,0 +1,263 @@
+// retroDE_ps2 — vram_bram_stub (Ch154)
+//
+// Hardware-friendly sibling of `vram_stub`. Maps cleanly onto Agilex 5
+// M20K block-RAM:
+//   - 2048 × 32-bit word storage (instead of 8192 × 8-bit byte
+//     storage). Internal width matches Agilex M20K native widths;
+//     external addressing stays byte-addressable to keep the same
+//     mental model as `vram_stub`.
+//   - SYNCHRONOUS reads (registered 32-bit output). One-cycle read
+//     latency — the read_valid pulse fires the cycle the data is on
+//     read_data.
+//   - BYTE write enable only (4-bit `write_be`). The Ch106 PSMT4
+//     per-bit `write_mask` RMW is NOT supported; PSMT4 callers must
+//     do the nibble splice on the writer side BEFORE issuing the
+//     write here. Ch155+ task to rework gs_stub.raster_pixel_emit
+//     and gif_image_xfer_stub for that.
+//   - Two synchronous read ports. Quartus implements two
+//     independent read addresses by REPLICATING the M20K storage
+//     across two RAM blocks rather than using a single native
+//     dual-read port — exp_c shows 8 RAM Blocks for 8 KB vs
+//     exp_a's 4 RAM Blocks for the same 8 KB single-port shape.
+//     Two replicated RAM blocks is still vastly cheaper than the
+//     65,536 flip-flops the legacy `vram_stub` shape produced;
+//     the cost just isn't free.
+//
+// Empirical motivation (Ch153 forensics):
+//   The legacy `vram_stub` shape (byte-addressable + combinational
+//   dual reads + per-bit-mask RMW) failed to fit on Agilex 5 — the
+//   8 KB array consumed 65,536 dedicated registers and 261,578
+//   combinational nodes, dominating Ch152's 331 % ALM overrun.
+//   `exp_a_bram_friendly` proved that a 2048 × 32-bit sync-read
+//   byte-WE shape maps to 4 RAM Blocks + 0 registers + 46 ALMs.
+//
+// External port shape vs `vram_stub`:
+//   IDENTICAL: clk, rst_n, write_en, write_addr[31:0],
+//              write_data[31:0], write_be[3:0], read_addr[31:0],
+//              read_data[31:0], read2_addr[31:0], read2_data[31:0].
+//   NEW       : read_valid + read2_valid (1-cycle pulse with the data).
+//   DROPPED   : write_mask[31:0] (Ch106 per-bit RMW; callers must
+//              splice nibbles on the writer side).
+//
+// Address contract:
+//   - Writes: write_addr is byte-aligned; the low 2 bits MUST be 0
+//             (4-byte writes only). Each `write_be[i]` independently
+//             commits byte `i` of the addressed word. Per-byte non-
+//             wrapping admission: an enabled byte beyond `BYTES`
+//             drops the WHOLE write (matches vram_stub Ch95 audit).
+//   - Reads:  read_addr is byte-aligned; the low 2 bits MUST be 0.
+//             `read_data` is the 32-bit word at `read_addr / 4`.
+//             Byte / halfword extraction is the caller's job
+//             (matches Ch141 / Ch142 nibble-readback pattern).
+//
+// Sim behaviour: time-0 mem is power-on-zero matching real M20K (the
+// `// synthesis translate_off` initial block matches vram_stub's
+// post-Ch152 pattern).
+
+`timescale 1ns/1ps
+
+module vram_bram_stub #(
+    // Storage size in bytes. Must be >= 4, a multiple of 4, and give a power-of-two word
+    // count (BYTES / 4); these are checked by the simulation-only initial block below.
+    parameter int unsigned BYTES = 8192,
+
+    // Ch251.4 — hardware-demo M20K rescue. When ENABLE_READ2 = 0, the
+    // second sync-read port is FEATURE-STRIPPED: `read2_data` ties to
+    // 0, `read2_valid` ties to 0, and Quartus no longer infers a
+    // separate read port on `mem`. This collapses the storage from
+    // two replicated 1W+1R simple-dual-port M20K banks (~410 M20Ks at
+    // 512 KiB) to ONE 1W+1R bank (~205 M20Ks) — the savings that get
+    // the 512 KiB framebuffer to fit on Agilex 5 (358 M20K budget).
+    //
+    // Contract caveat: read2 is the PSMT4 RMW old-byte read path. Any
+    // build that exercises PSMT4 rasterization MUST keep this `1`. The
+    // PSMCT32-only hardware demo (top_psmct32_raster_demo_bram) sets
+    // it to `0`; all simulation TBs leave it at the default `1`.
+    //
+    // This is a SCOPED build profile, not a general fix — see
+    // docs/decisions/0006-vram-roadmap.md for the longer-term
+    // arbitrated / line-buffered VRAM plan.
+    parameter bit         ENABLE_READ2 = 1'b1
+) (
+    input  logic        clk,
+    // Active-low. While low, writes are suppressed and read_valid / read2_valid are
+    // held at 0; the storage array and the read_data / read2_data registers are not reset.
+    input  logic        rst_n,
+
+    // Write port (byte-WE; 4-byte-aligned write_addr).
+    input  logic        write_en,
+    input  logic [31:0] write_addr,
+    input  logic [31:0] write_data,
+    input  logic [3:0]  write_be,
+
+    // Read port 0 (sync read; 4-byte-aligned read_addr).
+    input  logic [31:0] read_addr,
+    output logic [31:0] read_data,
+    output logic        read_valid,
+
+    // Read port 1 (sync read; 4-byte-aligned).
+    input  logic [31:0] read2_addr,
+    output logic [31:0] read2_data,
+    output logic        read2_valid
+);
+
+    // WORDS × 32-bit storage (2048 words at the default BYTES = 8192). Index is the
+    // WORD index (write_addr / 4).
+    //
+    // Parameter contract: `BYTES` MUST be a power-of-two multiple of 4.
+    // The WORD_AW-bit slice `*_addr[WORD_AW+1:2]` truncates the byte
+    // address to a word index; for non-power-of-two `WORDS`, an out-
+    // of-range byte address can map to a slice value that exceeds
+    // `WORDS-1` and indexes beyond `mem[]`. `read_valid` already
+    // marks such reads invalid downstream, but the BRAM read template
+    // still indexes the array unconditionally to satisfy Quartus's
+    // M20K inference (Ch154 audit), so the index itself must remain
+    // in bounds. The Ch155 audit-low fix: clamp the read indices
+    // with `& (WORDS-1)` so a power-of-two depth is required AND any
+    // bit beyond the legal slice is masked away. Power-of-two also
+    // matches every Agilex M20K depth target (256/512/1024/2048/...).
+    localparam int unsigned WORDS     = BYTES / 4;
+    localparam int unsigned WORD_AW   = $clog2(WORDS);
+
+    logic [31:0] mem [0:WORDS-1];
+
+    // Everything between the translate_off / translate_on pragmas is simulation-only:
+    // the parameter checks, the replication tripwire, and the zero-fill of `mem`.
+    // synthesis translate_off
+    initial begin
+        if (BYTES < 4 || (BYTES & 32'd3) != 0)
+            $error("vram_bram_stub: BYTES (%0d) must be >= 4 and a multiple of 4", BYTES);
+        // Power-of-two check on WORDS: (WORDS != 0) && ((WORDS & (WORDS-1)) == 0).
+        if (WORDS == 0 || (WORDS & (WORDS - 1)) != 0)
+            $error("vram_bram_stub: BYTES (%0d) must yield a power-of-two WORDS depth (got %0d)",
+                   BYTES, WORDS);
+
+        // Ch252 — VRAM replication tripwire (simulation/elaboration only).
+        //
+        // At BYTES >= 256 KiB, each 1W+1R simple-dual-port replica costs
+        // ~100 M20Ks. With ENABLE_READ2 = 1, Quartus replicates the
+        // storage to give the second read its own port, doubling that
+        // cost (>= 200 M20Ks per pair). Above this threshold a Quartus
+        // fitter overrun on Agilex 5 (358 M20K budget) becomes likely.
+        //
+        // This `$fatal` runs in simulation and elaboration-aware lint
+        // tools — it is the loud canary. The REAL protection is the
+        // board-top profile: hardware builds explicitly set
+        // ENABLE_READ2 = 0 when VRAM_BYTES is large (see
+        // de25_nano_psmct32_raster_demo_top). Re-enabling read2 on a
+        // large hardware VRAM requires landing one of the architectural
+        // follow-ups in docs/decisions/0006-vram-roadmap.md first.
+        if (ENABLE_READ2 && (BYTES >= 32'd262144)) begin
+            $display("vram_bram_stub: ENABLE_READ2=1 with BYTES=%0d (>= 256 KiB) trips the replication tripwire.", BYTES);
+            $display("  The 2nd read port forces Quartus to replicate the storage, ~doubling M20K cost.");
+            $display("  Either set ENABLE_READ2=0 (PSMCT32-only hardware profile) or land the");
+            $display("  arbitrated/line-buffered VRAM follow-up before re-enabling read2 at this size.");
+            $display("  See docs/decisions/0006-vram-roadmap.md.");
+            $fatal(1, "vram_bram_stub: replication-tripwire fatal exit");
+        end
+
+        // Time-0 contents are all zero in simulation.
+        for (int i = 0; i < int'(WORDS); i++) mem[i] = 32'd0;
+    end
+    // synthesis translate_on
+
+    // ----------------------------------------------------------------
+    // Write port — per-byte WE, per-byte non-wrapping admission.
+    // ----------------------------------------------------------------
+    // The address is widened to 33 bits so `addr + k` cannot wrap around 2^32 before the
+    // comparison against BYTES. A write is admitted only if it is word-aligned and every
+    // ENABLED byte lies inside the array; otherwise the whole write is dropped.
+    logic [32:0] addr33;
+    logic        admit_b0, admit_b1, admit_b2, admit_b3;
+    logic        write_admit;
+    assign addr33   = {1'b0, write_addr};
+    assign admit_b0 = (addr33 + 33'd0) < 33'(BYTES);
+    assign admit_b1 = (addr33 + 33'd1) < 33'(BYTES);
+    assign admit_b2 = (addr33 + 33'd2) < 33'(BYTES);
+    assign admit_b3 = (addr33 + 33'd3) < 33'(BYTES);
+    assign write_admit = write_en
+                       && (write_addr[1:0] == 2'b00)   // word-aligned
+                       && (!write_be[0] || admit_b0)
+                       && (!write_be[1] || admit_b1)
+                       && (!write_be[2] || admit_b2)
+                       && (!write_be[3] || admit_b3);
+
+    // Unlike the read indices, the write index is not masked with (WORDS-1); the slice is
+    // only used when write_admit has already accepted the address.
+    logic [WORD_AW-1:0] write_word_idx;
+    assign write_word_idx = write_addr[WORD_AW+1:2];
+
+    // BRAM-native byte-WE template — each `if (write_be[i])` slice
+    // updates a separate 8-bit lane of the 32-bit word. This is the
+    // canonical Quartus inference shape (proven in Ch153 exp_a).
+    // Writes are suppressed while rst_n is low; the array itself is never reset.
+    always_ff @(posedge clk) begin
+        if (rst_n && write_admit) begin
+            if (write_be[0]) mem[write_word_idx][ 7: 0] <= write_data[ 7: 0];
+            if (write_be[1]) mem[write_word_idx][15: 8] <= write_data[15: 8];
+            if (write_be[2]) mem[write_word_idx][23:16] <= write_data[23:16];
+            if (write_be[3]) mem[write_word_idx][31:24] <= write_data[31:24];
+        end
+    end
+
+    // ----------------------------------------------------------------
+    // Read ports — sync, registered output, 1-cycle latency.
+    //
+    // The read path is the CANONICAL Quartus M20K inference template:
+    // a single unconditional `read_data <= mem[idx]` registered
+    // assignment, with NO reset on the data register and NO read-side
+    // gating. Quartus rejected an earlier draft that gated reads on
+    // `read_addr[1:0]==2'b00 && in-bounds` with
+    //   "Info (276007): RAM logic ... uninferred due to asynchronous
+    //   read logic"
+    // and synthesized the storage as flip-flops. Bounds + alignment
+    // checks land on the separate `read_valid` pipeline below where
+    // they don't poison the data path.
+    // ----------------------------------------------------------------
+    // Word-index extraction. For a power-of-two `WORDS` depth (the
+    // parameter contract enforced above), the slice
+    // `read_addr[WORD_AW+1:2]` is naturally bounded to `[0, WORDS-1]`
+    // — the high bits beyond WORD_AW+1 represent address ranges
+    // already rejected by the `read_valid` gate below. The mask
+    // `& WORD_AW'(WORDS - 1)` is redundant for power-of-two WORDS
+    // (it just keeps the same bits) but documents the contract: a
+    // future relaxation that allows non-power-of-two depths would
+    // need to either remove that mask OR force the mem-read index
+    // through a real range-clamp rather than relying on the natural
+    // truncation.
+    logic [WORD_AW-1:0] read_word_idx;
+    assign read_word_idx  = read_addr [WORD_AW+1:2] & WORD_AW'(WORDS - 1);
+
+    // read_data is refreshed on every clock, including for addresses that read_valid will
+    // flag as invalid; consumers must qualify it with read_valid.
+    always_ff @(posedge clk) begin
+        read_data <= mem[read_word_idx];
+    end
+
+    // Out-of-range / misaligned detection on a parallel pipeline so
+    // it doesn't gate the BRAM read path. read_valid pulses 1 cycle
+    // late, aligned with read_data.
+    // The `+ 33'd3` term requires the LAST byte of the addressed word to be inside the
+    // array; the 33-bit width keeps the addition from wrapping.
+    logic read_in_range_pre;
+    assign read_in_range_pre  = (read_addr [1:0] == 2'b00) &&
+                                ({1'b0, read_addr } + 33'd3 < 33'(BYTES));
+    always_ff @(posedge clk) begin
+        if (!rst_n) read_valid <= 1'b0;
+        else        read_valid <= read_in_range_pre;
+    end
+
+    // ----------------------------------------------------------------
+    // Read port 1 — feature-strippable via ENABLE_READ2 (Ch251.4).
+    // When ENABLE_READ2=1: full sync read + range gate, matching the
+    // pre-Ch251.4 behaviour. When ENABLE_READ2=0: NO reference to
+    // `mem` from this branch, so Quartus does not infer a second M20K
+    // read port and the VRAM storage stops replicating.
+    // ----------------------------------------------------------------
+    generate
+    if (ENABLE_READ2) begin : g_read2_en
+        // Same structure as read port 0: masked word index, unconditional registered
+        // read, and a separate alignment/bounds pipeline for read2_valid.
+        logic [WORD_AW-1:0] read2_word_idx;
+        assign read2_word_idx = read2_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);
+
+        always_ff @(posedge clk) begin
+            read2_data <= mem[read2_word_idx];
+        end
+
+        logic read2_in_range_pre;
+        assign read2_in_range_pre = (read2_addr[1:0] == 2'b00) &&
+                                    ({1'b0, read2_addr} + 33'd3 < 33'(BYTES));
+        always_ff @(posedge clk) begin
+            if (!rst_n) read2_valid <= 1'b0;
+            else        read2_valid <= read2_in_range_pre;
+        end
+    end else begin : g_read2_dis
+        // Stripped profile: both outputs are registers loaded with constant 0 on every
+        // clock edge (no reset term), so they read 0 from the first clk edge onward.
+        always_ff @(posedge clk) begin
+            read2_data  <= 32'd0;
+            read2_valid <= 1'b0;
+        end
+    end
+    endgenerate
+
+endmodule : vram_bram_stub
@@ -0,0 +1,200 @@
+// retroDE_ps2 — vram_normalize_pkg (Ch155)
+//
+// Writer-side normalization for `vram_bram_stub`. The new BRAM-friendly
+// VRAM (Ch154) requires word-aligned writes (`write_addr[1:0] == 0`)
+// with payload pre-shifted into the selected byte lane(s) and
+// `write_be` set per byte. Today's writer-side RTL emits at sub-word
+// boundaries for PSMCT16 (halfword), PSMT8 (byte), and PSMT4 (nibble);
+// this package's `normalize_write` function bridges the contract.
+//
+// Codex Ch155 framing: "Add a small helper module or function for
+// VRAM write normalization: input: natural byte address, PSM,
+// pixel/index payload, old byte for PSMT4 if needed; output:
+// word-aligned write_addr, shifted write_data, write_be."
+//
+// Scope (Ch155):
+//   - Function is defined + standalone-verified for all 4 PSMs.
+//   - NOT yet applied inside `gs_stub.raster_pixel_emit` or
+//     `gif_image_xfer_stub`. The PSMT4 case needs a read-then-write
+//     pipeline upstream (to source `old_byte`); that's a Ch156+
+//     RTL plumbing chapter. CT32/CT16/T8 cases are pure-comb and
+//     can be plumbed in as soon as the wiring lands.
+//
+// Pure-comb function — no RTL pipelining inside the function itself.
+// Callers that need a read-then-write pipeline (PSMT4) own that
+// pipelining and pass the read result as `old_byte`.
+
+`timescale 1ns/1ps
+
+package vram_normalize_pkg;
+
+    // GS pixel storage mode (PSM) codes that normalize_write handles.
+    localparam logic [5:0] PSM_PSMCT32 = 6'h00;
+    localparam logic [5:0] PSM_PSMCT16 = 6'h02;
+    localparam logic [5:0] PSM_PSMT8   = 6'h13;
+    localparam logic [5:0] PSM_PSMT4   = 6'h14;
+
+    // Result of normalize_write: one word-aligned, byte-enabled write.
+    typedef struct packed {
+        logic [31:0] write_addr;   // byte_addr with bits [1:0] cleared
+        logic [31:0] write_data;   // payload positioned in its byte lane(s)
+        logic [3:0]  write_be;     // one enable per byte lane; 0 = dropped
+    } norm_out_t;
+
+    // ----------------------------------------------------------------
+    // dropped_write — a write with no byte lane enabled and zero data.
+    // normalize_write returns this for misaligned CT32/CT16 requests
+    // and for PSM codes it does not handle.
+    // ----------------------------------------------------------------
+    function automatic norm_out_t dropped_write(
+        input logic [31:0] byte_addr
+    );
+        norm_out_t w;
+        w.write_addr = byte_addr & ~32'd3;
+        w.write_data = 32'd0;
+        w.write_be   = 4'b0000;
+        return w;
+    endfunction
+
+    // ----------------------------------------------------------------
+    // byte_lane_write — single-byte write. byte_addr[1:0] picks which
+    // of the four byte lanes of the 32-bit word carries `lane_byte`;
+    // only that lane's enable is set. Shared by the PSMT8 and PSMT4
+    // paths of normalize_write.
+    // ----------------------------------------------------------------
+    function automatic norm_out_t byte_lane_write(
+        input logic [31:0] byte_addr,
+        input logic [7:0]  lane_byte
+    );
+        norm_out_t w;
+        w.write_addr = byte_addr & ~32'd3;
+        unique case (byte_addr[1:0])
+            2'b00: begin
+                w.write_data = {24'd0, lane_byte};
+                w.write_be   = 4'b0001;
+            end
+            2'b01: begin
+                w.write_data = {16'd0, lane_byte, 8'd0};
+                w.write_be   = 4'b0010;
+            end
+            2'b10: begin
+                w.write_data = {8'd0, lane_byte, 16'd0};
+                w.write_be   = 4'b0100;
+            end
+            2'b11: begin
+                w.write_data = {lane_byte, 24'd0};
+                w.write_be   = 4'b1000;
+            end
+        endcase
+        return w;
+    endfunction
+
+    // ----------------------------------------------------------------
+    // normalize_write — purely combinational conversion of a natural
+    // (possibly sub-word) write into a word-aligned write with byte
+    // enables.
+    //
+    // Arguments:
+    //   byte_addr  natural byte address of the pixel/index.
+    //                CT32: bits [1:0] must be 00, otherwise the write
+    //                      is dropped.
+    //                CT16: bit [0] must be 0, otherwise the write is
+    //                      dropped; bit [1] picks low/high halfword.
+    //                T8/T4: any byte address; bits [1:0] pick the lane.
+    //   psm        one of the PSM_* codes above.
+    //   payload    value in the LSBs: [31:0] for CT32, [15:0] for
+    //              CT16, [7:0] for T8, [3:0] for T4.
+    //   nibble_hi  T4 only: 1 places payload[3:0] in the HIGH nibble
+    //              of the target byte, 0 in the LOW nibble.
+    //   old_byte   T4 only: byte the caller read from byte_addr; its
+    //              other nibble is carried through unchanged. The
+    //              caller owns whatever read-before-write pipelining
+    //              is needed to obtain it.
+    //
+    // Returns write_addr = byte_addr with bits [1:0] cleared, plus the
+    // lane-positioned write_data and write_be. Any PSM code other than
+    // the four above yields a dropped write (write_be = 0, data = 0).
+    // ----------------------------------------------------------------
+    function automatic norm_out_t normalize_write(
+        input logic [31:0] byte_addr,
+        input logic [5:0]  psm,
+        input logic [31:0] payload,
+        input logic        nibble_hi,
+        input logic [7:0]  old_byte
+    );
+        norm_out_t  r;
+        logic [7:0] merged;   // T4 only: old_byte with one nibble replaced
+
+        // Every outcome, including a dropped write, reports the
+        // word-aligned base address.
+        r.write_addr = byte_addr & ~32'd3;
+
+        unique case (psm)
+            // Full-word write; a misaligned address never reaches the
+            // BRAM as an enabled write.
+            PSM_PSMCT32: begin
+                if (byte_addr[1:0] != 2'b00) begin
+                    r = dropped_write(byte_addr);
+                end else begin
+                    r.write_data = payload;
+                    r.write_be   = 4'b1111;
+                end
+            end
+
+            // Halfword write into the low or high half of the word.
+            PSM_PSMCT16: begin
+                if (byte_addr[0] != 1'b0) begin
+                    r = dropped_write(byte_addr);
+                end else begin
+                    if (byte_addr[1] == 1'b0) begin
+                        r.write_data = {16'd0, payload[15:0]};
+                        r.write_be   = 4'b0011;
+                    end else begin
+                        r.write_data = {payload[15:0], 16'd0};
+                        r.write_be   = 4'b1100;
+                    end
+                end
+            end
+
+            // One index byte in the lane chosen by byte_addr[1:0].
+            PSM_PSMT8: begin
+                r = byte_lane_write(byte_addr, payload[7:0]);
+            end
+
+            // Nibble splice: build the full replacement byte from
+            // old_byte and the new nibble, then write it as a byte.
+            PSM_PSMT4: begin
+                if (nibble_hi)
+                    merged = {payload[3:0], old_byte[3:0]};
+                else
+                    merged = {old_byte[7:4], payload[3:0]};
+                r = byte_lane_write(byte_addr, merged);
+            end
+
+            // Any other PSM code: silent no-op.
+            default: begin
+                r = dropped_write(byte_addr);
+            end
+        endcase
+
+        return r;
+    endfunction
+
+endpackage : vram_normalize_pkg
@@ -0,0 +1,185 @@
+// retroDE_ps2 — vram_stub (Ch89)
+//
+// Linear byte-addressable VRAM backing store for gs_stub's
+// `raster_pixel_emit` channel. This is the FIRST persistence
+// layer the rasterizer has had — pre-Ch89, pixels only pulsed as
+// trace-visible events and updated `raster_pixel_color_q` /
+// `raster_pixel_fb_addr_q` snapshot regs, then evaporated. Now
+// they actually land somewhere a TB (or a future scanout path)
+// can read back.
+//
+// Scope (as first landed in Ch89; the Ch95/Ch106 notes below extend it):
+//   - Linear byte-addressable: NO page/block VRAM swizzle. Real
+//     PS2 VRAM is 4 MiB, organized into pages × blocks × columns
+//     per PSM. The fb_addr math in gs_stub matches the linear-
+//     framebuffer layout that PCSX2's gs_state pages out for
+//     "linear" PSM channels; that's what this stub speaks.
+//   - PSMCT32 only: writes 4 bytes per emitted pixel. PSMCT16
+//     (2 bytes) and PSMT8 (1 byte) are deferred until a future
+//     chapter exposes per-pixel PSM at the raster channel.
+//   - Combinational debug read port: byte-addressable, returns
+//     the 4 bytes starting at read_addr packed little-endian.
+//     For TBs to verify pixel storage; not on any hardware path.
+//
+// Wiring contract:
+//   - write_en  ← gs_stub.raster_pixel_emit
+//   - write_addr ← gs_stub.raster_pixel_fb_addr_q
+//   - write_data ← gs_stub.raster_pixel_color_q[31:0]   (lower 32 bits)
+//   - write_be  ← gs_stub.raster_pixel_be_q             (Ch95)
+//
+// The full 64-bit raster_pixel_color_q carries Q (texture-coord
+// IEEE float) in the upper 32 bits — those bits are NOT part of
+// the framebuffer pixel and are deliberately discarded here.
+//
+// `write_be[3:0]` (Ch95): per-byte write enable. byte i (the
+// byte at `write_addr + i`) is committed only when
+// `write_en && write_be[i]`. PSMCT32 writes use 4'b1111;
+// PSMCT16 writes use 4'b0011 (the 2 bytes at write_addr — gs_stub
+// passes the actual byte address of the pixel, which is
+// 2-byte-aligned but not necessarily 4-byte-aligned). TBs that
+// bypass gs_stub (e.g. `tb_vram_stub`, `tb_gs_scanout_psm16`)
+// tie write_be to 4'b1111.
+//
+// `write_mask[31:0]` (Ch106): per-BIT merge mask used to support
+// sub-byte writes (PSMT4 — 4-bit nibble per pixel). The committed
+// byte i (still gated by write_be[i]) is:
+//   mem[addr+i] <= (mem[addr+i] & ~mask_i) | (data_i & mask_i)
+// where mask_i = write_mask[i*8 +: 8] and data_i =
+// write_data[i*8 +: 8]. PSMCT32/16 + PSMT8 writes tie write_mask
+// to 32'hFFFFFFFF (full byte writes — equivalent to the pre-Ch106
+// behavior). PSMT4 emits use 0x0F (low nibble) or 0xF0 (high
+// nibble) on the enabled byte. The merge happens inside the same
+// always_ff that commits the byte, so back-to-back nibble writes
+// to the SAME byte chain cleanly through NBA semantics: the
+// second write samples mem[addr] AFTER the prior NBA committed.
+//
+// Bounds check (Ch95 audit-medium fix): the write is admitted
+// only if EVERY enabled byte's address is in [0, BYTES). This
+// uses non-wrapping 33-bit arithmetic so a write near the 32-bit
+// address space limit (e.g. write_addr near 0xFFFF_FFFC with
+// be=4'b1111) is rejected cleanly. Halfword writes at the last
+// valid 2-byte slot (write_addr=BYTES-2 with be=4'b0011) are
+// accepted; write_addr=BYTES-1 with be=4'b0011 is rejected
+// because byte 1 of that slot is OOB.
+
+`timescale 1ns/1ps
+
+module vram_stub
+#(
+    parameter int unsigned BYTES = 65536
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // Write port. At most one 32-bit slot per clock, taken when
+    // write_en is high. write_addr is a byte offset; write_be[i]
+    // enables the byte at write_addr + i, so a PSMCT16 pixel can be
+    // committed without touching the neighbouring halfword.
+    // write_mask is a per-bit merge mask applied to each enabled byte.
+    input  logic        write_en,
+    input  logic [31:0] write_addr,
+    input  logic [31:0] write_data,
+    input  logic [3:0]  write_be,
+    input  logic [31:0] write_mask,
+
+    // Read port 0: combinational, byte-addressed, returns the four
+    // bytes starting at read_addr packed little-endian. Used by
+    // gs_pcrtc_stub for scanout and by TBs for verification.
+    input  logic [31:0] read_addr,
+    output logic [31:0] read_data,
+
+    // Read port 1 (Ch99): same semantics as port 0, for clients that
+    // read VRAM while pcrtc scanout is using port 0 (canonical
+    // example: `clut_loader_stub`). TBs that don't need it tie
+    // `read2_addr` to 0 and may leave `read2_data` unconnected.
+    input  logic [31:0] read2_addr,
+    output logic [31:0] read2_data
+);
+
+    // Backing store, one entry per byte.
+    logic [7:0] mem [0:BYTES-1];
+
+    // Highest base address at which a 4-byte access still fits in
+    // `mem`. Only the read ports use it (reads are always 4 bytes);
+    // writes are admitted per enabled byte further down.
+    localparam logic [31:0] MAX_BASE = (BYTES >= 4)
+                                     ? (32'(BYTES) - 32'd4)
+                                     : 32'd0;
+
+    // Simulation-only: parameter sanity check plus zero-fill so
+    // time-0 contents are deterministic. The pragma pair hides the
+    // block from Quartus, where the fill loop would exceed the
+    // synthesizable-loop iteration limit for larger BYTES.
+    // synthesis translate_off
+    initial begin
+        if (BYTES < 4)
+            $error("vram_stub: BYTES (%0d) must be >= 4", BYTES);
+        for (int i = 0; i < BYTES; i++) mem[i] = 8'd0;
+    end
+    // synthesis translate_on
+
+    // Read port 0: zero unless the whole 4-byte window is in range.
+    always_comb begin
+        read_data = 32'd0;
+        if (read_addr <= MAX_BASE) begin
+            read_data[ 7: 0] = mem[read_addr];
+            read_data[15: 8] = mem[read_addr + 32'd1];
+            read_data[23:16] = mem[read_addr + 32'd2];
+            read_data[31:24] = mem[read_addr + 32'd3];
+        end
+    end
+
+    // Read port 1: identical behaviour on its own address.
+    always_comb begin
+        read2_data = 32'd0;
+        if (read2_addr <= MAX_BASE) begin
+            read2_data[ 7: 0] = mem[read2_addr];
+            read2_data[15: 8] = mem[read2_addr + 32'd1];
+            read2_data[23:16] = mem[read2_addr + 32'd2];
+            read2_data[31:24] = mem[read2_addr + 32'd3];
+        end
+    end
+
+    // Write admission. The address is widened to 33 bits so that
+    // `write_addr + i` cannot wrap past 0xFFFF_FFFF and slip under
+    // the bound. admit_bN says whether byte N's address is below
+    // BYTES. The write as a whole is admitted only when no ENABLED
+    // byte is out of range — there are no partial writes at the
+    // boundary.
+    logic [32:0] addr33;
+    logic        admit_b0, admit_b1, admit_b2, admit_b3;
+    logic        write_admit;
+    assign addr33   = {1'b0, write_addr};
+    assign admit_b0 = (addr33 + 33'd0) < 33'(BYTES);
+    assign admit_b1 = (addr33 + 33'd1) < 33'(BYTES);
+    assign admit_b2 = (addr33 + 33'd2) < 33'(BYTES);
+    assign admit_b3 = (addr33 + 33'd3) < 33'(BYTES);
+    assign write_admit = write_en
+                       && !(   (write_be[0] && !admit_b0)
+                            || (write_be[1] && !admit_b1)
+                            || (write_be[2] && !admit_b2)
+                            || (write_be[3] && !admit_b3));
+
+    // Commit. For every enabled byte lane the stored byte keeps its
+    // bits where write_mask is 0 and takes write_data's bits where
+    // write_mask is 1. Nothing is written while rst_n is low.
+    always_ff @(posedge clk) begin
+        if (rst_n && write_admit) begin
+            for (int lane = 0; lane < 4; lane++) begin
+                if (write_be[lane])
+                    mem[write_addr + 32'(lane)] <=
+                          (mem[write_addr + 32'(lane)] & ~write_mask[8*lane +: 8])
+                        | (write_data[8*lane +: 8]     &  write_mask[8*lane +: 8]);
+            end
+        end
+    end
+
+endmodule : vram_stub
@@ -0,0 +1,49 @@
+# rtl/intc
+
+Interrupt controller scaffolding. Matches `docs/contracts/intc.md`.
+
+## Current contents
+
+- `intc_stub.sv` — generic PS2-style INTC register shell.
+  Register-visible INTC_STAT / INTC_MASK (offsets parameterized) plus a
+  16-source injection port `irq_src[15:0]`. The aggregate output
+  `cpu_irq` is polarity-neutral: the same module is instantiated both
+  as the EE INTC and as the IOP INTC (with appropriate offsets and a
+  different set of wired sources in each case).
+
+## Register semantics
+
+- `INTC_STAT` (offset is a parameter; default 0x00): W1C on writes;
+  sticky until cleared. `irq_src` sets bits on each cycle they're
+  observed; same-cycle inject-over-W1C collisions keep the pending bit
+  — interrupts are never silently swallowed.
+- `INTC_MASK` (offset is a parameter; default 0x10): plain overwrite.
+  Real PS2 uses XOR/toggle semantics on mask writes; the stub stores the
+  written value as-is for simplicity. Escalate if a BIOS trace demands it.
+
+## Instantiation conventions
+
+- **EE INTC**: default offsets (STAT=0x00, MASK=0x10). Instantiated
+  stand-alone in most benches; the EE memory map does not route INTC
+  addresses yet (deferred).
+- **IOP INTC**: parameterized to STAT=0x70, MASK=0x74 to match real
+  PS2 IOP INTC placement. Reached through `iop_memory_map_stub` at
+  physical address 0x1F80_1070+ (region id = 5).
+
+## Wired sources (current)
+
+- EE INTC bit 0 = EE DMAC completion (`dmac_reg_stub.irq_completion_o`).
+- IOP INTC bit 0 = IOP DMAC ch9 completion
+  (`iop_dmac_reg_stub.irq_completion_o`).
+
+Both are one-cycle pulses driven from the respective DMAC's `S_DONE`
+state. The INTC latches them into its own pending bit; software (the
+TB, for now) reads STAT through the architectural register port and
+acks with a W1C write.
+
+## Scope boundary
+
+Module is side-neutral by design. Source-routing from other real
+subsystems (timers, GIF/GS, IPU, SPU2, bridge `last_seen_o`) is the
+next natural expansion. Re-arm / re-assertion ordering is already
+proven in the integration benches.
@@ -0,0 +1,189 @@
+// retroDE_ps2 — intc_stub
+//
+// Generic PS2-style interrupt controller shell. Register-visible
+// status/mask behaviour plus a 16-source injection port; the same
+// module is reusable as either the EE-side or IOP-side INTC by picking
+// the appropriate address offsets and instantiating with different
+// sources. The aggregate output `cpu_irq` is side-neutral.
+//
+// Contract refs:
+//   docs/stub_module_plan.md    (Wave 1, item 7)
+//   docs/contracts/intc.md
+//
+// Register layout (Wave 1):
+//   offset 0x000: INTC_STAT    read: current pending, write: W1C
+//   offset 0x010: INTC_MASK    read: current mask,    write: plain overwrite
+//
+//   Real PS2 INTC_MASK uses write-to-toggle (XOR) semantics. Wave 1 uses
+//   plain write semantics for stub simplicity; toggle semantics are a
+//   Wave 2+ concern if BIOS traces demand them.
+//
+// Injection:
+//   irq_src[i] high on any cycle latches bit i in INTC_STAT. Sticky until
+//   cleared by a W1C write. Sixteen sources are exposed (matches real PS2
+//   INTC source count); testbenches drive whichever they need.
+//
+// Trace payload schema (per stub plan):
+//   INTC IRQ  arg0=source_bitmap arg1=masked arg2=pending arg3=ack
+//     one event per cycle max. Priority if multiple triggers coincide:
+//       ack (STAT W1C) > new assertion > mask write.
+//     ack arg3=1 when the event is a W1C ack, 0 otherwise.
+//     flags bit 0 = register write (vs. source-driven assertion)
+
+`timescale 1ns/1ps
+
+module intc_stub
+    import trace_pkg::*;
+#(
+    parameter logic [7:0] INTC_STAT_OFFSET = 8'h00,
+    parameter logic [7:0] INTC_MASK_OFFSET = 8'h10
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Register port (reads answer one cycle after reg_rd_en)
+    input  logic          reg_wr_en,
+    input  logic          reg_rd_en,
+    input  logic [7:0]    reg_addr,
+    input  logic [31:0]   reg_wr_data,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Interrupt sources; a high bit latches the matching STAT bit
+    input  logic [15:0]   irq_src,
+
+    // OR of all pending-and-unmasked bits. Generic name because the
+    // module is instantiated on both the EE and the IOP side.
+    output logic          cpu_irq,
+
+    // Trace event port
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    logic [15:0] intc_stat;   // pending bits (sticky)
+    logic [15:0] intc_mask;   // enable bits
+
+    // ------------------------------------------------------------------
+    // Register read path: address decode is combinational, the result
+    // is registered, so data and reg_rd_valid appear one clock after
+    // reg_rd_en. reg_rd_data holds its value when no read is issued.
+    // ------------------------------------------------------------------
+
+    logic [31:0] rd_mux;
+
+    always_comb begin
+        case (reg_addr)
+            INTC_STAT_OFFSET: rd_mux = {16'd0, intc_stat};
+            INTC_MASK_OFFSET: rd_mux = {16'd0, intc_mask};
+            default:          rd_mux = 32'd0;
+        endcase
+    end
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) reg_rd_data <= rd_mux;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Pending / mask state
+    //   - A write to INTC_STAT clears every pending bit whose write
+    //     data bit is 1 (W1C).
+    //   - A write to INTC_MASK overwrites the mask with the write data.
+    //   - irq_src bits are OR-ed into the pending register every cycle.
+    //   - The OR is applied after the W1C clear, so a source asserted
+    //     in the same cycle as its own ack stays pending.
+    // ------------------------------------------------------------------
+
+    logic        stat_wr;
+    logic        mask_wr;
+    logic [15:0] stat_w1c_mask;
+    logic [15:0] stat_inject;
+    logic [15:0] stat_next;
+
+    assign stat_wr       = reg_wr_en && (reg_addr == INTC_STAT_OFFSET);
+    assign mask_wr       = reg_wr_en && (reg_addr == INTC_MASK_OFFSET);
+    assign stat_w1c_mask = stat_wr ? reg_wr_data[15:0] : 16'd0;
+    assign stat_inject   = irq_src;
+    assign stat_next     = (intc_stat & ~stat_w1c_mask) | stat_inject;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            intc_stat <= 16'd0;
+            intc_mask <= 16'd0;
+        end else begin
+            intc_stat <= stat_next;
+            if (mask_wr) intc_mask <= reg_wr_data[15:0];
+        end
+    end
+
+    assign cpu_irq = |(intc_stat & intc_mask);
+
+    // ------------------------------------------------------------------
+    // Trace
+    //   At most one event per cycle, chosen by priority:
+    //     ack (W1C hit on a pending bit) > new assertion > mask write.
+    //   The payload is selected combinationally from the pre-edge
+    //   state and registered on the clock edge.
+    // ------------------------------------------------------------------
+
+    logic [15:0] new_assertions;
+    logic [15:0] bits_acked;
+    logic        had_ack;
+    logic        had_assertion;
+    logic        had_mask_wr;
+
+    // Sources rising on bits that were not already pending.
+    assign new_assertions = stat_inject & ~intc_stat;
+    // Pending bits that the current W1C write targets.
+    assign bits_acked     = stat_w1c_mask & intc_stat;
+    assign had_ack        = |bits_acked;
+    assign had_assertion  = |new_assertions;
+    assign had_mask_wr    = mask_wr;
+
+    logic        ev_fire;       // an event is emitted this cycle
+    logic [15:0] ev_bits;       // arg0: acked bits / newly asserted bits
+    logic [15:0] ev_masked;     // arg1: pending & mask
+    logic [15:0] ev_pending;    // arg2: pending
+    logic        ev_is_ack;     // arg3
+    logic        ev_is_regwr;   // flags bit 0: caused by a register write
+
+    always_comb begin
+        ev_fire     = 1'b1;
+        ev_bits     = 16'd0;
+        ev_masked   = 16'd0;
+        ev_pending  = 16'd0;
+        ev_is_ack   = 1'b0;
+        ev_is_regwr = 1'b0;
+
+        if (had_ack) begin
+            // arg1/arg2 report the post-update pending state, which
+            // keeps any bit re-injected in the same cycle. arg0 is
+            // the set of pending bits the write targeted.
+            ev_bits     = bits_acked;
+            ev_masked   = stat_next & intc_mask;
+            ev_pending  = stat_next;
+            ev_is_ack   = 1'b1;
+            ev_is_regwr = 1'b1;
+        end else if (had_assertion) begin
+            ev_bits     = new_assertions;
+            ev_masked   = (intc_stat | stat_inject) & intc_mask;
+            ev_pending  = (intc_stat | stat_inject);
+        end else if (had_mask_wr) begin
+            // Masked view uses the mask value being written.
+            ev_masked   = intc_stat & reg_wr_data[15:0];
+            ev_pending  = intc_stat;
+            ev_is_regwr = 1'b1;
+        end else begin
+            ev_fire     = 1'b0;
+        end
+    end
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_INTC;
+            ev_event  <= EV_IRQ;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else begin
+            ev_valid <= ev_fire;
+            // Payload registers hold their last value on idle cycles.
+            if (ev_fire) begin
+                ev_subsys <= SUBSYS_INTC;
+                ev_event  <= EV_IRQ;
+                ev_arg0   <= {48'd0, ev_bits};
+                ev_arg1   <= {48'd0, ev_masked};
+                ev_arg2   <= {48'd0, ev_pending};
+                ev_arg3   <= {63'd0, ev_is_ack};
+                ev_flags  <= {31'd0, ev_is_regwr};
+            end
+        end
+    end
+
+endmodule : intc_stub
@@ -0,0 +1,151 @@
+# rtl/iop
+
+IOP subsystem. Matches `docs/contracts/iop.md`.
+
+## Current contents
+
+- `iop_ram_stub.sv` — 32-bit IOP RAM primitive. Default 16 KiB,
+  parameterizable. Read + write ports with byte-enable granularity,
+  one-cycle read latency, caller-provided `master_id` for trace
+  attribution. Emits trace events under `SUBSYS_IOP`.
+- `iop_memory_map_stub.sv` — IOP-side address decode. CPU-side port uses
+  kseg0/kseg1 stripping (`phys = iop_addr[28:0]`). Second write-master
+  port for DMA bridges (`bridge_wr_*`), physical addressing. Regions now
+  decoded:
+    - IOP RAM (phys 0x00000000-0x001FFFFF) → `iop_ram_stub`
+    - SIF registers (phys 0x1D000000 block) → SIF register shell
+      (`sif_mailbox_stub` IOP side) via `sif_rd_*` / `sif_wr_*` ports
+    - IOP DMAC channel 9 (phys 0x1F801520-0x1F80152F) → IOP DMAC
+      register shell via `iop_dmac_rd_*` / `iop_dmac_wr_*` ports
+    - IOP INTC (phys 0x1F801070-0x1F80107F) → `intc_stub` (IOP-side
+      instance) via `iop_intc_rd_*` / `iop_intc_wr_*` ports
+    - Shared BIOS ROM (phys 0x1FC00000-0x1FFFFFFF, 4 MiB) →
+      `bios_rom_stub` via `bios_rd_*` port. kseg1 aliasing makes
+      `0xBFC0_0000` reset fetches land here transparently. BIOS is
+      read-only; writes to this window trace as UNMAPPED.
+    - everything else → UNMAPPED with deterministic 0xDEADBEEF
+  Future regions (other DMAC channels, IOP timers, SPU2) reserved in
+  comments. Arbitration between CPU and bridge writes on RAM path:
+  CPU wins on same-cycle collision. SIF, DMAC, INTC, and BIOS are
+  separate ports and don't contend with RAM.
+- `iop_fetch_stub.sv` — minimal sequential 32-bit fetcher. Mirrors
+  `ee_fetch_stub` in shape: PC-incrementing, no decode, no branches, no
+  exceptions. Default `RESET_VECTOR` is in IOP RAM (0x00000000), NOT in
+  BIOS space — explicitly non-BIOS boot. Emits `IOP RESET` once and
+  `IOP IFETCH` per response. First execution-visible IOP traffic in the
+  project; fetches route through `iop_memory_map_stub`.
+- `iop_core_stub.sv` — **real instruction-decoding IOP core with
+  minimal COP0, asynchronous interrupt exception entry, and the
+  architectural MIPS reset vector.** Tiny MIPS R3000 subset,
+  multi-cycle FSM, speaks the same map / DMAC / INTC protocol as every
+  previous engine. Default `PC_RESET = 0xBFC0_0000` (kseg1 into the
+  shared BIOS window; override with a parameter for RAM-only tests).
+  Supported opcodes:
+  LUI, ORI, ADDIU, LW, SW, BEQ, BNE, J, JR (SPECIAL func 0x08),
+  NOP (any other SPECIAL func / unknown opcode), SYSCALL
+  (SPECIAL func 0x0C, halts), **MFC0 / MTC0 / RFE** (COP0 opcode 0x10).
+  32-entry register file with `$0` hardwired.
+  **COP0 subset:** Status (IE/KU triple stack + IM), Cause (ExcCode +
+  IP reflecting cpu_irq), EPC. Exception entry is sampled at clean
+  instruction-retire boundaries: if a delay slot is outstanding, the
+  exception defers until the delay slot resolves. On entry: push IE/KU
+  stack, ExcCode=0, save EPC=next_pc, PC←EXC_VECTOR (parameter).
+  **Branch delay slot** honoured from day one; taken-branch and
+  delay-slot retires are both flagged in the trace.
+  **Strict mode:** `STRICT_UNSUPPORTED` parameter (default 0). When
+  set, unsupported opcodes halt the core and latch the offending
+  pc/instr word into `trap_o` / `trap_pc_o` / `trap_instr_o` instead
+  of silently retiring as NOPs. The canonical NOP (`instr == 32'h0`,
+  SLL $0,$0,0) is always treated as a real NOP. Retire trace flag
+  bit 7 marks strict-trap retires. Used by the BIOS smoke TB; other
+  benches leave it off for backwards compatibility.
+  Deferred: BD bit in Cause, nested interrupts, syscall/break
+  exception dispatch, R-type ALU/shifts/HI-LO.
+- `iop_exec_stub.sv` — **RAM-backed IOP execution primitive (bridge
+  module).**
+  Not a MIPS core, not an ISA decoder. A tiny FSM sequencer that fetches
+  its micro-ops from IOP RAM through the real `iop_memory_map_stub`
+  CPU-side port — the same way a future instruction-fetching CPU will.
+  The control program is no longer RTL-resident; it lives as data in
+  RAM that someone (a TB, eventually a BIOS loader) preloads before
+  pulsing `go_i`.
+  **Five opcodes**: `HALT`, `WRITE(addr, data)`, `READ(addr)`,
+  `WAIT_IRQ`, `BNE(target_pc, expected)` — branch if the last READ's
+  result does not equal `expected`, enabling real loops.
+  Op layout in RAM: 16 bytes per op (`pc<<4` addressing). Word 0 is
+  the opcode (low 4 bits), word 1 is addr or branch target, word 2 is
+  data or expected value, word 3 is reserved. `SCRIPT_BASE` is a
+  parameter (default 0x0000_0400).
+  Takes `cpu_irq` from the IOP INTC; `WAIT_IRQ` genuinely blocks until
+  a real interrupt asserts. One trace event per op completion with
+  flag bits marking WAIT_IRQ exit (bit 1), HALT entry (bit 2), and
+  BNE taken (bit 3). When a real MIPS decode primitive eventually
+  arrives, it replaces this module while keeping the same map / DMA /
+  INTC hookup verbatim.
+- `iop_dmac_reg_stub.sv` — IOP DMAC for one SIF-facing channel
+  (CHANNEL=9, PATH_ID=9, MASTER_ID=4). Register surface (low-byte
+  offsets): MADR @ 0x00, BCR @ 0x04, CHCR @ 0x08, DONE_COUNT @ 0x0C
+  (read-only monotonic counter); start bit is CHCR[0].
+  Real data path: on start, DMAC latches MADR/BCR, then steps through
+  IDLE → FETCH_WAIT → ACTIVE_SEND → DONE per beat, sourcing 32-bit words
+  from IOP RAM through the map's `dma_rd_*` port (src_addr stepping by
+  4 per beat). Endpoint is a word-granularity ready/valid/last stream
+  with `ep_ready` back-pressure — no false completion under stall.
+  Emits DMA_CFG on register writes, DMA_START on arm, DMA_BEAT per
+  accepted beat (with src_addr + remaining count), DMA_DONE on the
+  final beat. `done_count_o` is a monotonic visible counter.
+  `irq_completion_o` is a one-cycle pulse on S_DONE — wired into the
+  IOP INTC as source bit 0 so software can observe channel completion.
+  Only reachable through the real IOP map at 0x1F80_1520.
+
+## Explicit non-goals (current step)
+
+- Full MIPS R3000 ISA coverage (the core is still a narrow subset;
+  strict-mode halts on the first unsupported opcode so the BIOS tells
+  us what to grow next)
+- Full 2 MiB RAM sizing (stub defaults stay small for sim speed; the map
+  window is 2 MiB and truncates at the connection to the smaller stub)
+- IOP I/O beyond the currently decoded regions (DMAC ch9 / INTC / BIOS);
+  SPU2, timers, and other peripherals are not wired yet
+- IOP DMAC channels other than ch9 (SIF0 IOP→EE)
+- Real Sony BIOS execution (the smoke TB's synthetic bootstrap is the
+  current committed content; swapping in a user-supplied dump is a
+  drop-in exercise that will reveal the next missing opcode)
+
+## Scope boundary
+
+This directory owns IOP CPU execution, IOP-local RAM/I/O decode, IOP
+interrupt intake, IOP DMAC channels, and BIOS-side IOP boot sequencing
+behavior (per `docs/contracts/iop.md`).
+
+The IOP side now runs a MIPS R3000 subset from an architecturally
+correct BIOS reset vector, with precise interrupt exception entry and
+a RAM-resident ISR. The project has crossed five architectural seams:
+  1. TB-orchestrated → fabric-orchestrated (scripted exec stub)
+  2. RTL-resident → RAM-resident control (exec stub reads ops from RAM)
+  3. Micro-op bridge → real ISA decode (iop_core_stub)
+  4. Polled completion → asynchronous exception-driven control flow
+     (COP0 + cpu_irq)
+  5. TB-preloaded RAM as reset source → BIOS ROM at 0xBFC0_0000
+     (shared BIOS wired through the IOP map; hand-assembled bootstraps
+     prove the seam before any real Sony BIOS is attempted)
+
+Each seam preserved every prior module — only where code comes from
+evolved.
+
+## Planned next increments
+
+These are possibilities, not commitments — order will be decided per the
+next architectural question:
+
+- **BIOS-driven core growth:** point `tb_iop_core_bios_smoke` at a
+  user-supplied BIOS dump (swap the TB's synthetic preload for
+  `$readmemh` into `u_bios.mem`), observe the first unsupported
+  opcode, add it to `iop_core_stub`, repeat. Expected near-term
+  additions: ANDI, ADDU/SUBU, SLL/SRL/SRA, JAL, SLT(U). Do not add
+  speculatively; let the BIOS trace drive the order.
+- Core exception growth as the BIOS path demands it: BD bit in
+  Cause, nested interrupts, syscall/break exception dispatch.
+- Other IOP DMAC channels (CDVD / SPU2 / DEV9 / SIF1-2 / SIO2).
+- IOP map expansion: remaining IOP I/O (0x1F800000), SPU2
+  (0x1F900000).
@@ -0,0 +1,711 @@
+// retroDE_ps2 — iop_core_stub
+//
+// Minimal MIPS R3000 subset for the IOP side, now with real interrupt
+// exception entry. The engine sits where `iop_exec_stub` sat, drives
+// `iop_memory_map_stub`'s CPU-side port for ifetch and data accesses,
+// and finally *uses* `cpu_irq` from the IOP INTC instead of ignoring it.
+//
+// Wave 1 (decode): LUI/ORI/ADDIU/LW/SW/BEQ/BNE/J/NOP/SYSCALL, honest
+// branch delay slots. Programs polled INTC_STAT through the real map.
+//
+// Wave 2 (this module revision): minimal COP0 + asynchronous interrupt
+// exception entry. cpu_irq becomes a real vectoring event when
+// enabled through Status. Mainline no longer needs to touch INTC_STAT;
+// an ISR at the exception vector handles acknowledgement.
+//
+// Intentionally still NOT a full R3000:
+//   - No TLB / cache / HI/LO / R-type ALU / shifts / mul / div.
+//   - No syscall / break exception *handling* beyond SYSCALL-as-halt.
+//   - No BD bit in Cause for branch-delay exceptions (we simply
+//     refuse to take exceptions between a taken branch and its delay
+//     slot — see "delay-slot rule" below).
+//   - No kernel/user mode enforcement: KU state exists on the stack
+//     for forward compatibility but nothing in the core consults it.
+//
+// Supported opcodes (MIPS encoding):
+//   SPECIAL (opcode = 0x00):
+//     func 0x08 (JR)         — pc <= rs_val; has delay slot.
+//     func 0x0C (SYSCALL)    — halt_o asserts; FSM stops fetching.
+//     any other func         — treated as NOP (incl. SLL $0,$0,0).
+//   0x02 J                   — jump; has delay slot.
+//   0x04 BEQ / 0x05 BNE      — conditional branch; has delay slot.
+//   0x09 ADDIU               — no overflow trap.
+//   0x0D ORI / 0x0F LUI      — logical immediate / upper load.
+//   0x10 COP0:
+//     rs 0x00 (MFC0)         — rt <= COP0[rd]
+//     rs 0x04 (MTC0)         — COP0[rd] <= rt
+//     rs 0x10, func 0x10 (RFE)
+//                            — shift IE/KU stack right (pop)
+//   0x23 LW / 0x2B SW        — word memory access.
+//   Anything else            — treated as NOP.
+//
+// COP0 register surface (subset):
+//   12  Status      [0]=IEc [1]=KUc [2]=IEp [3]=KUp [4]=IEo [5]=KUo
+//                   [15:8]=IM   (bit 10 = IM2 gates the HW interrupt
+//                   wired to cpu_irq)
+//   13  Cause       [6:2]=ExcCode, [15:8]=IP. IP[2] reflects cpu_irq.
+//                   MTC0 to Cause latches ExcCode and the software
+//                   interrupt pending bits IP[1:0]; IP[7:2] are not
+//                   writable. Not load-bearing in the first TB.
+//   14  EPC         saved PC on exception entry.
+//
+// Exception entry semantics:
+//   Sampled at *instruction-retire boundaries*, never mid-fetch or
+//   mid-memory. An exception is taken iff all of the following hold
+//   at the retire boundary:
+//     - Status.IEc == 1                 (master interrupts enabled)
+//     - Cause.IP[i] & Status.IM[i]      (any unmasked pending source)
+//     - new_branch_pending == 0         (delay slot already resolved)
+//   On entry:
+//     EPC  <= next_pc                   (the pc that would have been
+//                                        fetched next; branch_target
+//                                        if a delay slot just resolved,
+//                                        pc+4 otherwise)
+//     Cause.ExcCode <= 5'h00            (Int exception)
+//     Status stack pushes left:
+//         IEo <= IEp; IEp <= IEc; IEc <= 0
+//         KUo <= KUp; KUp <= KUc; KUc <= 0
+//     pc <= EXC_VECTOR                  (fixed, parameter)
+//     branch_pending <= 0               (any pending control flow is
+//                                        canceled; EPC captured it)
+//
+// RFE semantics (pop stack, one level):
+//   IEc <= IEp; IEp <= IEo
+//   KUc <= KUp; KUp <= KUo
+//   (IEo, KUo left intact — matches impl-defined R3000 behaviour
+//   for non-nested use)
+//
+// Trace (SUBSYS_IOP, EV_IFETCH one-per-retire as before):
+//   flags bit 0 = SW (write)            (unchanged)
+//   flags bit 1 = LW (read)             (unchanged)
+//   flags bit 2 = branch / jump taken   (unchanged)
+//   flags bit 3 = SYSCALL (halt)        (unchanged)
+//   flags bit 4 = this instruction was in a delay slot
+//   flags bit 5 = exception taken at the end of this instruction
+//                 (EPC saved = next_pc, PC redirected to EXC_VECTOR)
+//   flags bit 6 = RFE retired (IE stack popped)
+//   flags bit 7 = strict trap (unsupported instruction halted the core)
+//
+// Strict mode (STRICT_UNSUPPORTED parameter):
+//   Default is 0 (lenient) to preserve every prior bench's regression
+//   behaviour — any instruction the core doesn't actively decode retires
+//   as a NOP. When STRICT_UNSUPPORTED=1, the core instead halts on the
+//   first unsupported opcode it encounters, latches the offending PC +
+//   instruction word into trap_pc_o / trap_instr_o, asserts trap_o, and
+//   emits a retire trace with flag bit 7 set. Intended for real-BIOS
+//   smoke bring-up — "the first missing opcode is the one the core
+//   needs to grow next." The canonical NOP (32'h0000_0000 =
+//   SLL $0,$0,0) is always treated as a NOP regardless of strict mode.
+
+`timescale 1ns/1ps
+
+module iop_core_stub
+    import trace_pkg::*;
+#(
+    // Architectural MIPS R3000 reset vector (kseg1 into the shared BIOS
+    // window). kseg1 strip in iop_memory_map_stub maps this to physical
+    // 0x1FC0_0000, which the map now routes to bios_rom_stub.
+    // Tests that don't have a BIOS image must override PC_RESET.
+    parameter logic [31:0] PC_RESET    = 32'hBFC0_0000,
+    parameter logic [31:0] EXC_VECTOR  = 32'h0000_0080,
+    // See header comment "Strict mode". Default 0 preserves existing
+    // regression behaviour; BIOS-oriented benches should set to 1.
+    parameter bit          STRICT_UNSUPPORTED = 1'b0
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    input  logic          go_i,
+
+    output logic          map_rd_en,
+    output logic [31:0]   map_rd_addr,
+    input  logic [31:0]   map_rd_data,
+    input  logic          map_rd_valid,
+
+    output logic          map_wr_en,
+    output logic [31:0]   map_wr_addr,
+    output logic [31:0]   map_wr_data,
+    output logic [3:0]    map_wr_be,
+
+    input  logic          cpu_irq,
+
+    output logic          halt_o,
+    output logic [31:0]   pc_o,
+
+    // Strict-mode trap reporting. `trap_o` rises the cycle the core
+    // halts on an unsupported instruction; `trap_pc_o` / `trap_instr_o`
+    // latch the offending fetch. All three stay stable after the halt.
+    output logic          trap_o,
+    output logic [31:0]   trap_pc_o,
+    output logic [31:0]   trap_instr_o,
+
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // ------------------------------------------------------------------
+    // Opcode / func / COP0 rs constants
+    // ------------------------------------------------------------------
+
+    localparam logic [5:0] OP_SPECIAL = 6'h00;
+    localparam logic [5:0] OP_J       = 6'h02;
+    localparam logic [5:0] OP_BEQ     = 6'h04;
+    localparam logic [5:0] OP_BNE     = 6'h05;
+    localparam logic [5:0] OP_ADDIU   = 6'h09;
+    localparam logic [5:0] OP_ORI     = 6'h0D;
+    localparam logic [5:0] OP_LUI     = 6'h0F;
+    localparam logic [5:0] OP_COP0    = 6'h10;
+    localparam logic [5:0] OP_LW      = 6'h23;
+    localparam logic [5:0] OP_SW      = 6'h2B;
+
+    localparam logic [5:0] FUNC_JR       = 6'h08;
+    localparam logic [5:0] FUNC_SYSCALL  = 6'h0C;
+    localparam logic [5:0] FUNC_RFE      = 6'h10;
+
+    localparam logic [4:0] COP0_RS_MF = 5'h00;
+    localparam logic [4:0] COP0_RS_MT = 5'h04;
+    localparam logic [4:0] COP0_RS_CO = 5'h10;
+
+    localparam logic [4:0] COP0_REG_STATUS = 5'd12;
+    localparam logic [4:0] COP0_REG_CAUSE  = 5'd13;
+    localparam logic [4:0] COP0_REG_EPC    = 5'd14;
+
+    // ------------------------------------------------------------------
+    // FSM state
+    // ------------------------------------------------------------------
+
+    typedef enum logic [3:0] {
+        S_IDLE         = 4'd0,
+        S_IFETCH_REQ   = 4'd1,
+        S_IFETCH_WAIT  = 4'd2,
+        S_EXECUTE      = 4'd3,
+        S_MEM_REQ      = 4'd4,
+        S_MEM_WAIT     = 4'd5,
+        S_MEM_WRITE    = 4'd6,
+        S_HALT         = 4'd7
+    } state_e;
+
+    state_e state;
+
+    // Architectural state
+    logic [31:0] pc;
+    logic [31:0] instr;
+    logic [31:0] regfile [0:31];
+
+    // Branch delay-slot tracking
+    logic        branch_pending;
+    logic [31:0] branch_target;
+    logic        instr_in_delay_slot;
+
+    // COP0 — Status (IE/KU triple stack + IM)
+    logic        status_iec, status_iep, status_ieo;
+    logic        status_kuc, status_kup, status_kuo;
+    logic [7:0]  status_im;
+
+    // COP0 — Cause / EPC
+    logic [4:0]  cause_exc_code;
+    logic [7:0]  cause_ip_sw;          // software-writable pending bits (IP[1:0])
+    logic [31:0] epc;
+
+    // Cause.IP as consumed by MFC0 and by the exception gate. Bits [1:0]
+    // are the software-latched pending bits, bit [2] follows cpu_irq
+    // with no register in between, and bits [7:3] are tied low because
+    // no further hardware source is wired in the current scope.
+    logic [7:0]  cause_ip;
+    always_comb begin
+        cause_ip = {5'b0_0000, cpu_irq, cause_ip_sw[1:0]};
+    end
+
+    // Architectural 32-bit views of Status and Cause, as returned by
+    // MFC0. Every bit that has no backing register reads as zero.
+    logic [31:0] status_word;
+    logic [31:0] cause_word;
+    always_comb begin
+        // Status: [15:8] = IM, [5:0] = KUo IEo KUp IEp KUc IEc.
+        status_word = {16'd0,
+                       status_im,
+                       2'b00,
+                       status_kuo, status_ieo,
+                       status_kup, status_iep,
+                       status_kuc, status_iec};
+
+        // Cause: [15:8] = IP, [6:2] = ExcCode.
+        cause_word  = {16'd0,
+                       cause_ip,
+                       1'b0,
+                       cause_exc_code,
+                       2'b00};
+    end
+
+    // ------------------------------------------------------------------
+    // Decode — combinational extraction from `instr`
+    // ------------------------------------------------------------------
+
+    // Field extraction from the latched instruction word, plus the
+    // derived immediates, control-flow targets, register operand reads
+    // and the load/store effective address. Everything below is purely
+    // combinational on `instr`, `pc` and `regfile`.
+    logic [5:0]  opcode;
+    logic [4:0]  rs_idx;
+    logic [4:0]  rt_idx;
+    logic [4:0]  rd_idx;
+    logic [5:0]  func;
+    logic [15:0] imm16;
+    logic [25:0] imm26;
+    logic [31:0] imm_sx;
+    logic [31:0] imm_zx;
+    logic [31:0] branch_offset;
+    logic [31:0] branch_tgt;
+    logic [31:0] j_tgt;
+    logic [31:0] rs_val;
+    logic [31:0] rt_val;
+    logic [31:0] ea;
+    logic [31:0] delay_slot_pc;
+
+    assign opcode        = instr[31:26];
+    assign rs_idx        = instr[25:21];
+    assign rt_idx        = instr[20:16];
+    assign rd_idx        = instr[15:11];
+    assign imm16         = instr[15:0];
+    assign imm26         = instr[25:0];
+    assign func          = instr[5:0];
+    assign imm_sx        = {{16{imm16[15]}}, imm16};
+    assign imm_zx        = {16'd0, imm16};
+    assign branch_offset = {{14{imm16[15]}}, imm16, 2'b00};
+
+    // Address of the instruction following the one being decoded, i.e.
+    // the branch delay slot. Both PC-relative branches and region-
+    // relative jumps are defined against this address.
+    assign delay_slot_pc = pc + 32'd4;
+    assign branch_tgt    = delay_slot_pc + branch_offset;
+
+    // J takes its upper four address bits from the delay slot's address,
+    // not from the jump's own address. The two only differ when the J
+    // sits in the last word of a 256 MB region, where using pc[31:28]
+    // would land the jump in the wrong region.
+    assign j_tgt         = {delay_slot_pc[31:28], imm26, 2'b00};
+
+    // Register $0 always reads as zero regardless of regfile contents.
+    assign rs_val        = (rs_idx == 5'd0) ? 32'd0 : regfile[rs_idx];
+    assign rt_val        = (rt_idx == 5'd0) ? 32'd0 : regfile[rt_idx];
+    assign ea            = rs_val + imm_sx;
+
+    // Instruction classification. One flag per actively decoded
+    // instruction, followed by the derived "not decoded" and strict-mode
+    // trap flags. Evaluated in a single ordered combinational block.
+    logic is_special, is_syscall, is_jr;
+    logic is_cop0, is_mfc0, is_mtc0, is_rfe;
+    logic is_nop_class;
+    logic is_lui, is_ori, is_addiu, is_lw, is_sw, is_beq, is_bne, is_j;
+    logic is_branch, is_jump;
+    logic branch_taken;
+    logic is_taken_branch_or_jump;
+    logic is_decoded;
+    logic is_nop_instr;
+    logic is_unsupported;
+    logic strict_trap;
+
+    always_comb begin
+        // SPECIAL group: only JR and SYSCALL are decoded.
+        is_special = (opcode == OP_SPECIAL);
+        is_jr      = is_special && (func == FUNC_JR);
+        is_syscall = is_special && (func == FUNC_SYSCALL);
+
+        // COP0 group: MFC0 / MTC0 selected by rs, RFE by rs + func.
+        is_cop0    = (opcode == OP_COP0);
+        is_mfc0    = is_cop0 && (rs_idx == COP0_RS_MF);
+        is_mtc0    = is_cop0 && (rs_idx == COP0_RS_MT);
+        is_rfe     = is_cop0 && (rs_idx == COP0_RS_CO) && (func == FUNC_RFE);
+
+        // Primary-opcode instructions.
+        is_j       = (opcode == OP_J);
+        is_beq     = (opcode == OP_BEQ);
+        is_bne     = (opcode == OP_BNE);
+        is_addiu   = (opcode == OP_ADDIU);
+        is_ori     = (opcode == OP_ORI);
+        is_lui     = (opcode == OP_LUI);
+        is_lw      = (opcode == OP_LW);
+        is_sw      = (opcode == OP_SW);
+
+        // Control-flow summary. A branch is taken when the operand
+        // comparison agrees with the branch sense (equal for BEQ,
+        // not-equal for BNE); jumps are always taken.
+        is_branch    = is_beq || is_bne;
+        is_jump      = is_j || is_jr;
+        branch_taken = is_branch && (is_beq == (rs_val == rt_val));
+        is_taken_branch_or_jump = branch_taken || is_jump;
+
+        // "NOP class" = everything outside the decoded set. In lenient
+        // mode these retire as a NOP; in strict mode the core halts on
+        // them (see `is_unsupported` / `strict_trap`).
+        is_decoded   = is_syscall || is_jr
+                       || is_mfc0 || is_mtc0 || is_rfe
+                       || is_lui  || is_ori  || is_addiu
+                       || is_lw   || is_sw
+                       || is_beq  || is_bne  || is_j;
+        is_nop_class = !is_decoded;
+
+        // The all-zero word (SLL $0,$0,0) is the canonical NOP. It is
+        // exempt from the strict trap so the bios_rom_stub default NOP
+        // sled doesn't look like a field of traps.
+        is_nop_instr   = (instr == 32'd0);
+        is_unsupported = is_nop_class && !is_nop_instr;
+        strict_trap    = STRICT_UNSUPPORTED && is_unsupported;
+    end
+
+    // Result written back to rt by the three immediate ALU instructions.
+    // Any other opcode yields zero; the FSM only consumes this value
+    // when one of LUI / ORI / ADDIU is retiring.
+    logic [31:0] alu_wb;
+    always_comb begin
+        case (opcode)
+            OP_LUI:   alu_wb = {imm16, 16'd0};      // imm into upper half
+            OP_ORI:   alu_wb = rs_val | imm_zx;     // zero-extended imm
+            OP_ADDIU: alu_wb = rs_val + imm_sx;     // sign-extended imm
+            default:  alu_wb = 32'd0;
+        endcase
+    end
+
+    // Value returned by MFC0. rd_idx selects the COP0 register; any
+    // register number outside Status / Cause / EPC reads as zero.
+    logic [31:0] cop0_read_val;
+    always_comb begin
+        if (rd_idx == COP0_REG_STATUS)
+            cop0_read_val = status_word;
+        else if (rd_idx == COP0_REG_CAUSE)
+            cop0_read_val = cause_word;
+        else if (rd_idx == COP0_REG_EPC)
+            cop0_read_val = epc;
+        else
+            cop0_read_val = 32'd0;
+    end
+
+    // Destination of the control transfer being decoded. Starts from the
+    // PC-relative branch target and is overridden for J (region-relative
+    // immediate) and then for JR (register), so JR has the highest
+    // priority and the conditional-branch target is the fallback.
+    logic [31:0] taken_target;
+    always_comb begin
+        taken_target = branch_tgt;
+        if (is_j)  taken_target = j_tgt;
+        if (is_jr) taken_target = rs_val;
+    end
+
+    // ------------------------------------------------------------------
+    // Trace book-keeping (captured at retire)
+    // ------------------------------------------------------------------
+
+    logic [31:0] retired_pc;
+    logic [31:0] retired_instr;
+    logic [31:0] retired_arg2;
+    logic [31:0] retired_arg3;
+    logic        retired_flag_write;
+    logic        retired_flag_read;
+    logic        retired_flag_branch;
+    logic        retired_flag_halt;
+    logic        retired_flag_in_delay;
+    logic        retired_flag_except;
+    logic        retired_flag_rfe;
+    logic        retired_flag_trap;
+    logic        retire_pulse;
+
+    // ------------------------------------------------------------------
+    // Map-port drive (combinational on state)
+    // ------------------------------------------------------------------
+
+    always_comb begin
+        // Idle values: no request on either channel, all buses zero.
+        map_rd_en   = 1'b0;
+        map_rd_addr = 32'd0;
+        map_wr_en   = 1'b0;
+        map_wr_addr = 32'd0;
+        map_wr_data = 32'd0;
+        map_wr_be   = 4'd0;
+
+        if ((state == S_IFETCH_REQ) || (state == S_MEM_REQ)) begin
+            // Read request: instruction fetch at pc, or LW data read at
+            // the effective address.
+            map_rd_en   = 1'b1;
+            map_rd_addr = (state == S_IFETCH_REQ) ? pc : ea;
+        end else if (state == S_MEM_WRITE) begin
+            // SW: full-word store of rt to the effective address.
+            map_wr_en   = 1'b1;
+            map_wr_addr = ea;
+            map_wr_data = rt_val;
+            map_wr_be   = 4'b1111;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Retire helper — applies pc advance, branch queuing, and
+    // exception entry at a clean instruction boundary.
+    //
+    // Inputs (implicit from decoded state):
+    //   - is_taken_branch_or_jump, taken_target
+    //   - branch_pending (current, pre-advance)
+    //   - branch_target  (the pending target if any)
+    //   - Status / Cause state for exception gating
+    //
+    // Outputs (all registered on this clock edge):
+    //   - pc, branch_pending, branch_target
+    //   - epc, status/cause on exception
+    //   - retired_flag_except set when exception fires
+    // ------------------------------------------------------------------
+
+    // retire_advance — end-of-instruction control-flow commit.
+    //
+    // Called from the main FSM on the clock edge where an instruction
+    // retires normally (not on SYSCALL halt or strict trap). Decides
+    // between sequential / delayed-branch advance and interrupt
+    // exception entry, and schedules the matching non-blocking updates:
+    //   no exception: pc <= next_pc; a taken branch or jump is queued in
+    //                 branch_pending / branch_target so the instruction
+    //                 fetched next runs as its delay slot.
+    //   exception:    EPC <= next_pc, ExcCode <= 0 (Int), the IE/KU stack
+    //                 is pushed, pc <= EXC_VECTOR and any pending branch
+    //                 is dropped (EPC already captured its target).
+    // retired_flag_except records which of the two happened.
+    //
+    // Exception entry is suppressed at two kinds of retire boundary:
+    //   - the retiring instruction is itself a taken branch or jump, so
+    //     its delay slot has not executed yet (delay-slot rule);
+    //   - the retiring instruction writes the Status IE/KU stack (MTC0
+    //     to Status, or RFE). The caller has already scheduled that
+    //     write on this same edge; pushing the stack here would be the
+    //     later non-blocking assignment and would silently replace the
+    //     instruction's result with values derived from the *old* stack
+    //     while EPC already points past the instruction. In particular
+    //     a `mtc0` that clears IEc with an interrupt pending would be
+    //     lost, and the handler's RFE would return with interrupts
+    //     enabled. Deferring lets the new Status commit; a still-pending
+    //     interrupt is re-evaluated against it at the next boundary.
+    task automatic retire_advance;
+        logic [31:0] next_pc;
+        logic        new_branch_pending;
+        logic [31:0] new_branch_target;
+        logic        irq_pending_masked;
+        logic        status_write_now;
+        logic        exception_now;
+
+        // Address of the instruction that follows the retiring one: the
+        // queued branch target if this was a delay slot, else pc + 4.
+        next_pc             = branch_pending ? branch_target : pc + 32'd4;
+        new_branch_pending  = is_taken_branch_or_jump;
+        new_branch_target   = taken_target;
+
+        irq_pending_masked  = |(cause_ip & status_im);
+        status_write_now    = is_rfe
+                              || (is_mtc0 && (rd_idx == COP0_REG_STATUS));
+
+        exception_now       = !new_branch_pending
+                              && !status_write_now
+                              && status_iec
+                              && irq_pending_masked;
+
+        if (exception_now) begin
+            epc             <= next_pc;
+            cause_exc_code  <= 5'h00;     // Int exception code
+            // Push the IE/KU stack one level and enter with interrupts
+            // disabled.
+            status_ieo      <= status_iep;
+            status_iep      <= status_iec;
+            status_iec      <= 1'b0;
+            status_kuo      <= status_kup;
+            status_kup      <= status_kuc;
+            status_kuc      <= 1'b0;
+            pc              <= EXC_VECTOR;
+            branch_pending  <= 1'b0;
+            retired_flag_except <= 1'b1;
+        end else begin
+            pc              <= next_pc;
+            branch_pending  <= new_branch_pending;
+            // branch_target is only rewritten when a new transfer is
+            // queued; otherwise the old value is left alone.
+            if (new_branch_pending) branch_target <= new_branch_target;
+            retired_flag_except <= 1'b0;
+        end
+    endtask
+
+    // ------------------------------------------------------------------
+    // Main FSM
+    // ------------------------------------------------------------------
+
+    // Core sequencer. Synchronous, active-low reset. Each instruction
+    // walks IFETCH_REQ -> IFETCH_WAIT -> EXECUTE, with LW taking the
+    // extra MEM_REQ -> MEM_WAIT detour and SW the single MEM_WRITE
+    // state. The retire bookkeeping registers (retired_*) are loaded on
+    // the same edge that raises retire_pulse for one cycle.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state               <= S_IDLE;
+            pc                  <= PC_RESET;
+            instr               <= 32'd0;
+            branch_pending      <= 1'b0;
+            branch_target       <= 32'd0;
+            instr_in_delay_slot <= 1'b0;
+
+            // COP0 reset state: interrupts disabled, mask cleared.
+            status_iec          <= 1'b0;
+            status_iep          <= 1'b0;
+            status_ieo          <= 1'b0;
+            status_kuc          <= 1'b0;
+            status_kup          <= 1'b0;
+            status_kuo          <= 1'b0;
+            status_im           <= 8'd0;
+            cause_exc_code      <= 5'd0;
+            cause_ip_sw         <= 8'd0;
+            epc                 <= 32'd0;
+
+            retire_pulse          <= 1'b0;
+            retired_pc            <= 32'd0;
+            retired_instr         <= 32'd0;
+            retired_arg2          <= 32'd0;
+            retired_arg3          <= 32'd0;
+            retired_flag_write    <= 1'b0;
+            retired_flag_read     <= 1'b0;
+            retired_flag_branch   <= 1'b0;
+            retired_flag_halt     <= 1'b0;
+            retired_flag_in_delay <= 1'b0;
+            retired_flag_except   <= 1'b0;
+            retired_flag_rfe      <= 1'b0;
+            retired_flag_trap     <= 1'b0;
+
+            trap_o                <= 1'b0;
+            trap_pc_o             <= 32'd0;
+            trap_instr_o          <= 32'd0;
+
+            for (int i = 0; i < 32; i++) regfile[i] <= 32'd0;
+        end else begin
+            // Default: no retire this cycle. States that retire an
+            // instruction override this below.
+            retire_pulse <= 1'b0;
+
+            case (state)
+                // Parked after reset until go_i is seen high.
+                S_IDLE: begin
+                    if (go_i) state <= S_IFETCH_REQ;
+                end
+
+                // The fetch request itself is driven combinationally
+                // from `state`; this state lasts exactly one cycle.
+                S_IFETCH_REQ: state <= S_IFETCH_WAIT;
+
+                // Hold until the map returns the word, then latch it
+                // together with whether a branch was pending at fetch
+                // time (i.e. this instruction is a delay slot).
+                S_IFETCH_WAIT: begin
+                    if (map_rd_valid) begin
+                        instr               <= map_rd_data;
+                        instr_in_delay_slot <= branch_pending;
+                        state               <= S_EXECUTE;
+                    end
+                end
+
+                S_EXECUTE: begin
+                    // Defaults for retire bookkeeping
+                    retired_pc            <= pc;
+                    retired_instr         <= instr;
+                    retired_arg2          <= 32'd0;
+                    retired_arg3          <= 32'd0;
+                    retired_flag_write    <= 1'b0;
+                    retired_flag_read     <= 1'b0;
+                    retired_flag_branch   <= is_taken_branch_or_jump;
+                    retired_flag_halt     <= 1'b0;
+                    retired_flag_in_delay <= instr_in_delay_slot;
+                    retired_flag_except   <= 1'b0;
+                    retired_flag_rfe      <= 1'b0;
+                    retired_flag_trap     <= 1'b0;
+
+                    if (is_syscall) begin
+                        // SYSCALL halts the core unconditionally; no
+                        // exception vectoring in this scope.
+                        retired_flag_halt <= 1'b1;
+                        retire_pulse      <= 1'b1;
+                        state             <= S_HALT;
+                    end else if (strict_trap) begin
+                        // Unsupported instruction under strict mode.
+                        // Halt and latch the offending fetch; no pc
+                        // advance, no regfile write, no COP0 side
+                        // effect. Trap output stays asserted for the
+                        // TB to inspect after halt_o rises.
+                        retired_flag_trap <= 1'b1;
+                        retire_pulse      <= 1'b1;
+                        trap_o            <= 1'b1;
+                        trap_pc_o         <= pc;
+                        trap_instr_o      <= instr;
+                        state             <= S_HALT;
+                    end else if (is_lw) begin
+                        // Loads retire later, in S_MEM_WAIT.
+                        state <= S_MEM_REQ;
+                    end else if (is_sw) begin
+                        // Stores retire in S_MEM_WRITE.
+                        state <= S_MEM_WRITE;
+                    end else begin
+                        // ALU / branch / COP0 / NOP: retire in this
+                        // cycle. Handle per-op writebacks and COP0
+                        // side effects, then advance pc.
+
+                        // Writes targeting $0 are dropped.
+                        if ((is_lui || is_ori || is_addiu) && (rt_idx != 5'd0))
+                            regfile[rt_idx] <= alu_wb;
+
+                        if (is_mfc0 && (rt_idx != 5'd0))
+                            regfile[rt_idx] <= cop0_read_val;
+
+                        if (is_mtc0) begin
+                            unique case (rd_idx)
+                                COP0_REG_STATUS: begin
+                                    status_iec   <= rt_val[0];
+                                    status_kuc   <= rt_val[1];
+                                    status_iep   <= rt_val[2];
+                                    status_kup   <= rt_val[3];
+                                    status_ieo   <= rt_val[4];
+                                    status_kuo   <= rt_val[5];
+                                    status_im    <= rt_val[15:8];
+                                end
+                                COP0_REG_CAUSE:  begin
+                                    // Only the software IP[1:0] bits
+                                    // are writable; ExcCode is normally
+                                    // written by the core on exception
+                                    // entry, but allow SW override too
+                                    // since the minimal scope doesn't
+                                    // dispatch on ExcCode.
+                                    cause_exc_code <= rt_val[6:2];
+                                    cause_ip_sw[1:0] <= rt_val[9:8];
+                                end
+                                COP0_REG_EPC:    epc <= rt_val;
+                                default: ;
+                            endcase
+                        end
+
+                        // RFE pops the IE/KU stack one level; the "old"
+                        // pair (IEo/KUo) is left as it was.
+                        if (is_rfe) begin
+                            status_iec       <= status_iep;
+                            status_iep       <= status_ieo;
+                            status_kuc       <= status_kup;
+                            status_kup       <= status_kuo;
+                            retired_flag_rfe <= 1'b1;
+                        end
+
+                        // Trace payload for ALU / branch / COP0 / NOP
+                        if (is_mfc0) begin
+                            retired_arg2 <= {27'd0, rd_idx};
+                            retired_arg3 <= cop0_read_val;
+                        end else if (is_mtc0) begin
+                            retired_arg2 <= {27'd0, rd_idx};
+                            retired_arg3 <= rt_val;
+                        end else if (is_taken_branch_or_jump) begin
+                            retired_arg2 <= taken_target;
+                            retired_arg3 <= 32'd0;
+                        end else if (is_lui || is_ori || is_addiu) begin
+                            retired_arg3 <= alu_wb;
+                        end
+
+                        retire_pulse <= 1'b1;
+                        // NOTE: retire_advance() runs after the COP0
+                        // writes above. Where both schedule a
+                        // non-blocking assignment to the same register
+                        // on this edge, the later one (from
+                        // retire_advance) is the one that takes effect.
+                        retire_advance();
+                        state <= S_IFETCH_REQ;
+                    end
+                end
+
+                // LW read request is driven combinationally from
+                // `state`; one cycle, then wait for the data.
+                S_MEM_REQ: state <= S_MEM_WAIT;
+
+                // LW completion: write the loaded word to rt (unless
+                // rt is $0) and retire the instruction.
+                S_MEM_WAIT: begin
+                    if (map_rd_valid) begin
+                        if (rt_idx != 5'd0) regfile[rt_idx] <= map_rd_data;
+
+                        retired_pc            <= pc;
+                        retired_instr         <= instr;
+                        retired_arg2          <= ea;
+                        retired_arg3          <= map_rd_data;
+                        retired_flag_write    <= 1'b0;
+                        retired_flag_read     <= 1'b1;
+                        retired_flag_branch   <= 1'b0;
+                        retired_flag_halt     <= 1'b0;
+                        retired_flag_in_delay <= instr_in_delay_slot;
+                        retired_flag_rfe      <= 1'b0;
+                        retire_pulse          <= 1'b1;
+
+                        retire_advance();
+                        state <= S_IFETCH_REQ;
+                    end
+                end
+
+                // SW: the write strobe is driven combinationally from
+                // `state` during this single cycle; the store retires
+                // on the same edge without waiting for a response.
+                S_MEM_WRITE: begin
+                    retired_pc            <= pc;
+                    retired_instr         <= instr;
+                    retired_arg2          <= ea;
+                    retired_arg3          <= rt_val;
+                    retired_flag_write    <= 1'b1;
+                    retired_flag_read     <= 1'b0;
+                    retired_flag_branch   <= 1'b0;
+                    retired_flag_halt     <= 1'b0;
+                    retired_flag_in_delay <= instr_in_delay_slot;
+                    retired_flag_rfe      <= 1'b0;
+                    retire_pulse          <= 1'b1;
+
+                    retire_advance();
+                    state <= S_IFETCH_REQ;
+                end
+
+                // Terminal state: entered on SYSCALL or strict trap and
+                // left only through reset.
+                S_HALT: state <= S_HALT;
+
+                // Unused state encodings fall back to idle.
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+    assign halt_o = (state == S_HALT);
+    assign pc_o   = pc;
+
+    // ------------------------------------------------------------------
+    // Trace emission — one event per retire
+    // ------------------------------------------------------------------
+
+    // Registers the retire bookkeeping onto the event port. ev_valid is
+    // high for exactly the cycle after retire_pulse; the payload fields
+    // keep their last value between events.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_IFETCH;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else begin
+            // No event unless an instruction retired on the previous
+            // edge.
+            ev_valid <= 1'b0;
+
+            if (retire_pulse) begin
+                ev_valid  <= 1'b1;
+                ev_subsys <= SUBSYS_IOP;
+                ev_event  <= EV_IFETCH;
+
+                // 32-bit payloads are zero-extended into the 64-bit
+                // argument slots.
+                ev_arg0   <= 64'(retired_pc);
+                ev_arg1   <= 64'(retired_instr);
+                ev_arg2   <= 64'(retired_arg2);
+                ev_arg3   <= 64'(retired_arg3);
+
+                // Flag byte, bit 7 down to bit 0; upper 24 bits are zero.
+                ev_flags  <= 32'({retired_flag_trap,        // 7
+                                  retired_flag_rfe,         // 6
+                                  retired_flag_except,      // 5
+                                  retired_flag_in_delay,    // 4
+                                  retired_flag_halt,        // 3
+                                  retired_flag_branch,      // 2
+                                  retired_flag_read,        // 1
+                                  retired_flag_write});     // 0
+            end
+        end
+    end
+
+endmodule : iop_core_stub
@@ -0,0 +1,319 @@
+// retroDE_ps2 — iop_dmac_reg_stub
+//
+// IOP DMAC channel 9 (SIF0 IOP→EE) with a real, bounded data path.
+// Upgraded from the earlier register+lifecycle shell: MADR is a real
+// source pointer into IOP RAM, BCR is a real word count, and the
+// state machine pulls 32-bit beats out of IOP RAM through the IOP map
+// and emits them on a word-granularity endpoint with ready/valid/last
+// handshake. Mirrors the EE DMAC shape (dmac_reg_stub) at 32-bit width.
+//
+// Contract refs:
+//   docs/contracts/iop.md         (IOP DMAC ownership)
+//
+// Register surface (per-channel, low-byte offset):
+//   0x00  MADR        — real source address in IOP physical space
+//   0x04  BCR         — transfer length in 32-bit beats
+//   0x08  CHCR        — channel control; bit[0] is the start bit
+//   0x0C  DONE_COUNT  — monotonic completion counter (read-only; writes
+//                       are accepted but ignored). Software reads this
+//                       to distinguish "nth completion" without needing
+//                       to count interrupts externally.
+//   Other offsets: writes accepted but ignored; reads return 0.
+//
+// Memory master interface (to iop_memory_map_stub's dma_rd_* port):
+//   mem_rd_en / mem_rd_addr issue the request (one cycle)
+//   mem_rd_valid / mem_rd_data return the word one cycle later
+//   mem_master_id drives the map trace attribution (convention: 4)
+//
+// Endpoint (to sif_dma_ee_ram_bridge_stub or similar 32-bit sink):
+//   ep_valid / ep_data[31:0] / ep_last
+//   ep_ready is the backpressure signal — when low, the state machine
+//   holds in ACTIVE_SEND with the current beat. No false completion.
+//
+// State machine:
+//   IDLE         → FETCH_WAIT   on CHCR start
+//   FETCH_WAIT   → ACTIVE_SEND  on mem_rd_valid (word latched)
+//   ACTIVE_SEND  → FETCH_WAIT   on endpoint accept with more beats left
+//                → DONE          on endpoint accept for the final beat
+//   DONE         → IDLE          next cycle (clears CHCR.start)
+//
+// Source stepping: src_addr = madr_latched + (beat_index * 4).
+//
+// Trace payload schema (SUBSYS_DMAC):
+//   DMA_CFG    arg0=channel arg1=chcr arg2=madr arg3=bcr flags=reg_offset
+//   DMA_START  arg0=channel arg1=bcr  arg2=madr arg3=path_id
+//   DMA_BEAT   arg0=channel arg1=beat_index arg2=src_addr arg3=remaining
+//   DMA_DONE   arg0=channel arg1=beats arg2=completion_code arg3=path_id
+//     completion_code 0 = OK.
+
+`timescale 1ns/1ps
+
+module iop_dmac_reg_stub
+    import trace_pkg::*;
+#(
+    parameter logic [3:0] CHANNEL   = 4'd9,        // SIF0 (IOP → EE)
+    parameter logic [3:0] PATH_ID   = 4'd9,
+    parameter logic [7:0] MASTER_ID = 8'd4         // for dma_rd trace attribution
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // IOP-side register access (from the memory map's iop_dmac_* port).
+    // Writes take effect on the clock edge; reads return one cycle later
+    // with reg_rd_valid.
+    input  logic          reg_wr_en,
+    input  logic          reg_rd_en,
+    input  logic [3:0]    reg_offset,
+    input  logic [31:0]   reg_wr_data,
+    output logic [31:0]   reg_rd_data,
+    output logic          reg_rd_valid,
+
+    // Memory read master (to iop_memory_map_stub dma_rd_* port).
+    // mem_rd_en is a one-cycle pulse per beat; the state machine then
+    // waits for mem_rd_valid before presenting the word downstream.
+    output logic          mem_rd_en,
+    output logic [31:0]   mem_rd_addr,
+    output logic [7:0]    mem_master_id,
+    input  logic [31:0]   mem_rd_data,
+    input  logic          mem_rd_valid,
+
+    // Endpoint (word-granularity stream to SIF egress bridge)
+    output logic          ep_valid,
+    output logic [31:0]   ep_data,
+    output logic          ep_last,
+    input  logic          ep_ready,
+
+    // Completion pulse — one cycle high when the channel reaches S_DONE.
+    // Intended as an IOP INTC source; latching is the interrupt
+    // controller's responsibility.
+    output logic          irq_completion_o,
+
+    // Status
+    output logic          busy_o,
+    output logic [31:0]   done_count_o,
+
+    // Trace
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam logic [3:0] MADR_OFFSET       = 4'h0;
+    localparam logic [3:0] BCR_OFFSET        = 4'h4;
+    localparam logic [3:0] CHCR_OFFSET       = 4'h8;
+    localparam logic [3:0] DONE_COUNT_OFFSET = 4'hC;
+
+    typedef enum logic [1:0] {
+        S_IDLE        = 2'd0,
+        S_FETCH_WAIT  = 2'd1,
+        S_ACTIVE_SEND = 2'd2,
+        S_DONE        = 2'd3
+    } state_e;
+
+    // Software-visible registers.
+    logic [31:0] madr;
+    logic [31:0] bcr;
+    logic [31:0] chcr;
+
+    // Transfer-private copies, captured at start so that register writes
+    // during a transfer cannot disturb the beat in flight.
+    state_e      state;
+    logic [31:0] madr_latched;
+    logic [31:0] bcr_latched;
+    logic [31:0] beat_index;
+    logic [31:0] beat_payload;
+
+    // A start is a CHCR write that raises bit[0] while it is currently 0.
+    // Re-writing 1 over an already-set start bit does not retrigger.
+    logic start_pulse;
+    assign start_pulse = reg_wr_en && (reg_offset == CHCR_OFFSET)
+                         && reg_wr_data[0] && !chcr[0];
+
+    // ------------------------------------------------------------------
+    // Register file
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            madr <= 32'd0;
+            bcr  <= 32'd0;
+            chcr <= 32'd0;
+        end else begin
+            if (reg_wr_en) begin
+                case (reg_offset)
+                    MADR_OFFSET: madr <= reg_wr_data;
+                    BCR_OFFSET:  bcr  <= reg_wr_data;
+                    CHCR_OFFSET: chcr <= reg_wr_data;
+                    default: ;      // DONE_COUNT and unused offsets: write ignored
+                endcase
+            end
+            // Placed after the write so the hardware clear of CHCR.start
+            // wins over a same-cycle software write (last assignment wins).
+            if (state == S_DONE) chcr[0] <= 1'b0;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Register read (1-cycle latency, matches rest of stub ecosystem)
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reg_rd_data  <= 32'd0;
+            reg_rd_valid <= 1'b0;
+        end else begin
+            reg_rd_valid <= reg_rd_en;
+            if (reg_rd_en) begin
+                case (reg_offset)
+                    MADR_OFFSET:       reg_rd_data <= madr;
+                    BCR_OFFSET:        reg_rd_data <= bcr;
+                    CHCR_OFFSET:       reg_rd_data <= chcr;
+                    DONE_COUNT_OFFSET: reg_rd_data <= done_count_o;
+                    default:           reg_rd_data <= 32'd0;
+                endcase
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Transfer state machine
+    // ------------------------------------------------------------------
+
+    logic [31:0] src_addr;
+    assign src_addr = madr_latched + (beat_index << 2);        // 4 bytes/beat
+
+    logic beat_accepted;
+    assign beat_accepted = ep_valid && ep_ready;
+
+    // Pulse mem_rd_en for one cycle whenever we first enter FETCH_WAIT.
+    logic prev_state_fw;
+    always_ff @(posedge clk) begin
+        if (!rst_n) prev_state_fw <= 1'b0;
+        else        prev_state_fw <= (state == S_FETCH_WAIT);
+    end
+    logic entering_fw;
+    assign entering_fw = (state == S_FETCH_WAIT) && !prev_state_fw;
+
+    assign mem_rd_en     = entering_fw;
+    assign mem_rd_addr   = src_addr;
+    assign mem_master_id = MASTER_ID;
+
+    // Drive endpoint only in ACTIVE_SEND with the latched payload.
+    assign ep_valid = (state == S_ACTIVE_SEND);
+    assign ep_data  = beat_payload;
+    assign ep_last  = (state == S_ACTIVE_SEND) &&
+                          (beat_index + 32'd1 == bcr_latched);
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state        <= S_IDLE;
+            madr_latched <= 32'd0;
+            bcr_latched  <= 32'd0;
+            beat_index   <= 32'd0;
+            beat_payload <= 32'd0;
+        end else begin
+            unique case (state)
+                S_IDLE: begin
+                    if (start_pulse) begin
+                        madr_latched <= madr;
+                        bcr_latched  <= bcr;
+                        beat_index   <= 32'd0;
+                        // Zero-length guard: with BCR == 0 the final-beat
+                        // test (beat_index + 1 == bcr_latched) could only
+                        // match after the 32-bit index wraps, i.e. a
+                        // runaway ~2^32-beat transfer. Complete at once
+                        // instead: no fetch, no endpoint beat, but the
+                        // normal DONE side effects (CHCR.start clear, irq
+                        // pulse, DONE_COUNT bump, DMA_DONE trace).
+                        if (bcr == 32'd0) state <= S_DONE;
+                        else              state <= S_FETCH_WAIT;
+                    end
+                end
+
+                S_FETCH_WAIT: begin
+                    if (mem_rd_valid) begin
+                        beat_payload <= mem_rd_data;
+                        state        <= S_ACTIVE_SEND;
+                    end
+                end
+
+                S_ACTIVE_SEND: begin
+                    // Hold here (ep_valid high, payload stable) until the
+                    // sink raises ep_ready.
+                    if (beat_accepted) begin
+                        if (beat_index + 32'd1 == bcr_latched) begin
+                            state <= S_DONE;
+                        end else begin
+                            beat_index <= beat_index + 32'd1;
+                            state      <= S_FETCH_WAIT;
+                        end
+                    end
+                end
+
+                S_DONE: begin
+                    state <= S_IDLE;
+                end
+
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+    assign busy_o           = (state != S_IDLE);
+    assign irq_completion_o = (state == S_DONE);
+
+    // ------------------------------------------------------------------
+    // Trace emission — one event per cycle. Priority:
+    //   DONE > BEAT > START > CFG (register write)
+    // A zero-length transfer never visits FETCH_WAIT, so it produces
+    // CFG (the CHCR write) followed by DONE with beats=0 and no START.
+    // ------------------------------------------------------------------
+
+    logic prev_in_transfer;
+    always_ff @(posedge clk) begin
+        if (!rst_n) prev_in_transfer <= 1'b0;
+        else        prev_in_transfer <= (state != S_IDLE);
+    end
+
+    // First FETCH_WAIT cycle of a transfer (previous cycle was IDLE).
+    logic enter_start;
+    assign enter_start = (state == S_FETCH_WAIT) && !prev_in_transfer;
+
+    // S_DONE lasts exactly one cycle, so the level is already a pulse.
+    logic enter_done;
+    assign enter_done = (state == S_DONE);
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid      <= 1'b0;
+            ev_subsys     <= SUBSYS_DMAC;
+            ev_event      <= EV_DMA_CFG;
+            ev_arg0       <= 64'd0;
+            ev_arg1       <= 64'd0;
+            ev_arg2       <= 64'd0;
+            ev_arg3       <= 64'd0;
+            ev_flags      <= 32'd0;
+            done_count_o  <= 32'd0;
+        end else if (enter_done) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_DONE;
+            ev_arg0   <= {60'd0, CHANNEL};
+            // Beats completed. On the normal path S_DONE is only reached
+            // with beat_index + 1 == bcr_latched, so this is the same
+            // value as before; on the zero-length path it reports 0.
+            ev_arg1   <= {32'd0, bcr_latched};
+            ev_arg2   <= 64'd0;                         // completion OK
+            ev_arg3   <= {60'd0, PATH_ID};
+            ev_flags  <= 32'd0;
+            done_count_o <= done_count_o + 32'd1;
+        end else if (beat_accepted) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_BEAT;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, beat_index};
+            ev_arg2   <= {32'd0, src_addr};
+            ev_arg3   <= {32'd0, bcr_latched - beat_index - 32'd1};
+            ev_flags  <= 32'd0;
+        end else if (enter_start) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_START;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, bcr_latched};
+            ev_arg2   <= {32'd0, madr_latched};
+            ev_arg3   <= {60'd0, PATH_ID};
+            ev_flags  <= 32'd0;
+        end else if (reg_wr_en) begin
+            // Report the post-write view: the register being written shows
+            // the incoming data, the other two show their current values.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_DMAC;
+            ev_event  <= EV_DMA_CFG;
+            ev_arg0   <= {60'd0, CHANNEL};
+            ev_arg1   <= {32'd0, (reg_offset == CHCR_OFFSET) ? reg_wr_data : chcr};
+            ev_arg2   <= {32'd0, (reg_offset == MADR_OFFSET) ? reg_wr_data : madr};
+            ev_arg3   <= {32'd0, (reg_offset == BCR_OFFSET)  ? reg_wr_data : bcr};
+            ev_flags  <= {28'd0, reg_offset};
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : iop_dmac_reg_stub
@@ -0,0 +1,320 @@
+// retroDE_ps2 — iop_exec_stub
+//
+// First RAM-backed IOP execution primitive. Micro-op fetch now comes
+// from IOP RAM through the real `iop_memory_map_stub` CPU-side port —
+// the same way a future MIPS-class CPU would fetch instructions. The
+// control program is no longer RTL-resident; it lives as data in RAM
+// that someone (a TB, eventually a BIOS / loader path) preloads before
+// pulsing `go_i`.
+//
+// NOT a MIPS core, NOT an ISA decoder. A tiny FSM sequencer over a
+// five-opcode micro-op ISA, designed as the bridge between "testbench
+// choreographs everything" and a real instruction-fetching CPU. When
+// the real CPU arrives, it replaces this module but keeps the same
+// map / DMA / INTC hookup verbatim.
+//
+// Contract refs:
+//   docs/contracts/iop.md         (IOP-local programming model)
+//
+// Opcodes (encoded in word 0 low nibble):
+//   OP_HALT      0x0  — terminal; halt_o rises, no further accesses.
+//   OP_WRITE     0x1  — pulse map CPU write with (addr, data). pc++
+//   OP_READ      0x2  — pulse map CPU read;  latch into last_read_data.
+//                       pc++
+//   OP_WAIT_IRQ  0x3  — block until cpu_irq==1. pc++
+//   OP_BNE       0x4  — if last_read_data != expected, pc <= target;
+//                       else pc++.
+//                       target is in word1[7:0]; expected is in word2.
+//
+// Micro-op layout in RAM (16 bytes per op, little-endian word order):
+//   +0   word 0:  {28'd0, opcode[3:0]}
+//   +4   word 1:  addr (for WRITE/READ) or target_pc in low 8 bits (for BNE)
+//   +8   word 2:  data (for WRITE) or expected value (for BNE); unused for
+//                 READ/WAIT_IRQ/HALT
+//   +12  word 3:  reserved for future opcodes
+//
+// Fetch sequence: three map reads per op (words 0/1/2). Word 3 is
+// skipped to save a cycle. Each read has one-cycle latency via the
+// map — so a full fetch is ~6 cycles, after which dispatch takes one
+// more cycle. Negligible in the current scope; swap the engine for a
+// real CPU later and the instruction width stops mattering.
+//
+// Trace payload (SUBSYS_IOP, EV_IFETCH, emitted on each op completion):
+//   arg0 = pc value of the op that just completed
+//   arg1 = opcode
+//   arg2 = addr (0 for WAIT_IRQ/HALT)
+//   arg3 = data written, data read back, expected (for BNE), or 0
+//   flags bit 0 = 1 → write-flavour op
+//   flags bit 1 = 1 → WAIT_IRQ just exited (IRQ observed)
+//   flags bit 2 = 1 → HALT entered
+//   flags bit 3 = 1 → BNE branch taken (pc changed to target, not +1)
+
+`timescale 1ns/1ps
+
+module iop_exec_stub
+    import trace_pkg::*;
+#(
+    parameter logic [31:0] SCRIPT_BASE = 32'h0000_0400
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // Start request; sampled only while the sequencer is in S_IDLE.
+    input  logic          go_i,
+
+    // Drive the IOP memory map's CPU-side port. Both ifetch reads and
+    // the script's own WRITE/READ ops flow through here.
+    output logic          map_rd_en,
+    output logic [31:0]   map_rd_addr,
+    input  logic [31:0]   map_rd_data,
+    input  logic          map_rd_valid,
+
+    output logic          map_wr_en,
+    output logic [31:0]   map_wr_addr,
+    output logic [31:0]   map_wr_data,
+    output logic [3:0]    map_wr_be,
+
+    // Level-sensitive interrupt line polled by OP_WAIT_IRQ.
+    input  logic          cpu_irq,
+
+    output logic          halt_o,
+    output logic [7:0]    pc_o,
+
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam logic [3:0] OP_HALT     = 4'h0;
+    localparam logic [3:0] OP_WRITE    = 4'h1;
+    localparam logic [3:0] OP_READ     = 4'h2;
+    localparam logic [3:0] OP_WAIT_IRQ = 4'h3;
+    localparam logic [3:0] OP_BNE      = 4'h4;
+
+    typedef enum logic [3:0] {
+        S_IDLE      = 4'd0,
+        S_IF0_REQ   = 4'd1,
+        S_IF0_WAIT  = 4'd2,
+        S_IF1_REQ   = 4'd3,
+        S_IF1_WAIT  = 4'd4,
+        S_IF2_REQ   = 4'd5,
+        S_IF2_WAIT  = 4'd6,
+        S_DECODE    = 4'd7,
+        S_WRITE     = 4'd8,
+        S_READ_REQ  = 4'd9,
+        S_READ_WAIT = 4'd10,
+        S_WAIT_IRQ  = 4'd11,
+        S_BNE       = 4'd12,
+        S_HALT      = 4'd13
+    } state_e;
+
+    state_e      state;
+    logic [7:0]  pc;
+    logic [7:0]  op_pc;             // pc of the op being executed (trace only)
+    logic [3:0]  cur_opcode;        // word 0 [3:0]
+    logic [31:0] cur_addr;          // word 1
+    logic [31:0] cur_data;          // word 2
+    logic [31:0] last_read_data;    // result of the most recent OP_READ
+
+    // Op-completion event triggers (one-cycle pulses). They are registered,
+    // so the trace block sees them one cycle AFTER the completing state —
+    // by which time `pc` has already moved on. Anything the trace reports
+    // about the finished op must therefore come from state that is stable
+    // across that edge (op_pc, cur_*, last_read_data), never from `pc` or
+    // the live map read bus.
+    logic ev_op_done;
+    logic ev_wait_irq_exit;
+    logic ev_enter_halt;
+    logic ev_bne_taken;
+
+    // Address for the next ifetch word: SCRIPT_BASE + pc*16 + word_offset
+    logic [31:0] ifetch_base;
+    assign ifetch_base = SCRIPT_BASE + {20'd0, pc, 4'd0};   // pc << 4
+
+    // ------------------------------------------------------------------
+    // Map-port drive (combinational on state)
+    // ------------------------------------------------------------------
+
+    always_comb begin
+        map_wr_en   = 1'b0;
+        map_wr_addr = 32'd0;
+        map_wr_data = 32'd0;
+        map_wr_be   = 4'd0;
+        map_rd_en   = 1'b0;
+        map_rd_addr = 32'd0;
+
+        case (state)
+            S_IF0_REQ: begin
+                map_rd_en   = 1'b1;
+                map_rd_addr = ifetch_base + 32'd0;
+            end
+            S_IF1_REQ: begin
+                map_rd_en   = 1'b1;
+                map_rd_addr = ifetch_base + 32'd4;
+            end
+            S_IF2_REQ: begin
+                map_rd_en   = 1'b1;
+                map_rd_addr = ifetch_base + 32'd8;
+            end
+            S_WRITE: begin
+                map_wr_en   = 1'b1;
+                map_wr_addr = cur_addr;
+                map_wr_data = cur_data;
+                map_wr_be   = 4'b1111;
+            end
+            S_READ_REQ: begin
+                map_rd_en   = 1'b1;
+                map_rd_addr = cur_addr;
+            end
+            default: ;
+        endcase
+    end
+
+    // ------------------------------------------------------------------
+    // State machine
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state            <= S_IDLE;
+            pc               <= 8'd0;
+            op_pc            <= 8'd0;
+            cur_opcode       <= 4'd0;
+            cur_addr         <= 32'd0;
+            cur_data         <= 32'd0;
+            last_read_data   <= 32'd0;
+            ev_op_done       <= 1'b0;
+            ev_wait_irq_exit <= 1'b0;
+            ev_enter_halt    <= 1'b0;
+            ev_bne_taken     <= 1'b0;
+        end else begin
+            // Pulses default low; the completing state raises them for
+            // exactly one cycle.
+            ev_op_done       <= 1'b0;
+            ev_wait_irq_exit <= 1'b0;
+            ev_enter_halt    <= 1'b0;
+            ev_bne_taken     <= 1'b0;
+
+            case (state)
+                S_IDLE: begin
+                    if (go_i) begin
+                        pc    <= 8'd0;
+                        state <= S_IF0_REQ;
+                    end
+                end
+
+                S_IF0_REQ:  state <= S_IF0_WAIT;
+                S_IF0_WAIT: if (map_rd_valid) begin
+                    cur_opcode <= map_rd_data[3:0];
+                    state      <= S_IF1_REQ;
+                end
+
+                S_IF1_REQ:  state <= S_IF1_WAIT;
+                S_IF1_WAIT: if (map_rd_valid) begin
+                    cur_addr <= map_rd_data;
+                    state    <= S_IF2_REQ;
+                end
+
+                S_IF2_REQ:  state <= S_IF2_WAIT;
+                S_IF2_WAIT: if (map_rd_valid) begin
+                    cur_data <= map_rd_data;
+                    state    <= S_DECODE;
+                end
+
+                S_DECODE: begin
+                    // Every op passes through here before it can complete
+                    // and pc is untouched until completion, so this is the
+                    // one place the op's own pc needs to be captured.
+                    op_pc <= pc;
+                    case (cur_opcode)
+                        OP_HALT: begin
+                            state         <= S_HALT;
+                            ev_enter_halt <= 1'b1;
+                        end
+                        OP_WRITE:    state <= S_WRITE;
+                        OP_READ:     state <= S_READ_REQ;
+                        OP_WAIT_IRQ: state <= S_WAIT_IRQ;
+                        OP_BNE:      state <= S_BNE;
+                        default: begin
+                            // Unknown opcode → safe stop. Still emit the
+                            // HALT trace event so the stop is visible; its
+                            // arg1 carries the offending opcode.
+                            state         <= S_HALT;
+                            ev_enter_halt <= 1'b1;
+                        end
+                    endcase
+                end
+
+                S_WRITE: begin
+                    pc         <= pc + 8'd1;
+                    state      <= S_IF0_REQ;
+                    ev_op_done <= 1'b1;
+                end
+
+                S_READ_REQ: state <= S_READ_WAIT;
+                S_READ_WAIT: if (map_rd_valid) begin
+                    last_read_data <= map_rd_data;
+                    pc             <= pc + 8'd1;
+                    state          <= S_IF0_REQ;
+                    ev_op_done     <= 1'b1;
+                end
+
+                S_WAIT_IRQ: begin
+                    if (cpu_irq) begin
+                        pc               <= pc + 8'd1;
+                        state            <= S_IF0_REQ;
+                        ev_op_done       <= 1'b1;
+                        ev_wait_irq_exit <= 1'b1;
+                    end
+                end
+
+                S_BNE: begin
+                    // target_pc = cur_addr[7:0], expected = cur_data
+                    if (last_read_data != cur_data) begin
+                        pc           <= cur_addr[7:0];
+                        ev_bne_taken <= 1'b1;
+                    end else begin
+                        pc           <= pc + 8'd1;
+                    end
+                    state      <= S_IF0_REQ;
+                    ev_op_done <= 1'b1;
+                end
+
+                S_HALT: state <= S_HALT;
+
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+    assign halt_o = (state == S_HALT);
+    assign pc_o   = pc;
+
+    // ------------------------------------------------------------------
+    // Trace emission. One event per op completion + one on HALT entry.
+    //
+    // arg0 is op_pc, not pc: when ev_op_done is observed here, pc already
+    // holds the NEXT op's index (pc+1, or the BNE target).
+    // arg3 for READ is last_read_data, not map_rd_data: the read response
+    // was valid one cycle earlier and the map is not required to hold it.
+    // For WAIT_IRQ, arg2/arg3 carry script words 1/2 verbatim.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_IFETCH;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (ev_enter_halt) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_IFETCH;
+            ev_arg0   <= {56'd0, op_pc};
+            ev_arg1   <= {60'd0, cur_opcode};
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'h0000_0004;             // halt marker
+        end else if (ev_op_done) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_IFETCH;
+            ev_arg0   <= {56'd0, op_pc};
+            ev_arg1   <= {60'd0, cur_opcode};
+            ev_arg2   <= {32'd0, cur_addr};
+            ev_arg3   <= (cur_opcode == OP_READ)
+                             ? {32'd0, last_read_data}
+                             : {32'd0, cur_data};
+            ev_flags  <= {28'd0,
+                          ev_bne_taken,
+                          1'b0,                          // (was halt; halt has its own path above)
+                          ev_wait_irq_exit,
+                          (cur_opcode == OP_WRITE)};
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : iop_exec_stub
@@ -0,0 +1,128 @@
+// retroDE_ps2 — iop_fetch_stub
+//
+// Minimal IOP-side sequential fetcher. Mirrors ee_fetch_stub in shape and
+// discipline — just the smallest honest primitive that produces visible
+// IOP-side execution-flow traffic. Not a CPU. Explicitly NOT a BIOS boot
+// stub: the default reset vector lives in IOP RAM, not in BIOS space.
+//
+// Contract refs:
+//   docs/contracts/iop.md         (IOP CPU execution, required debug
+//                                  visibility: PC stream)
+//
+// Behavior:
+//   - On reset, PC = RESET_VECTOR (default 0x00000000, the low end of
+//     IOP RAM).
+//   - Each cycle while `enable` is high: issue a 32-bit read at PC,
+//     advance PC += 4. No decode, no branches, no exceptions, no FPU.
+//   - Responses return 1 cycle later via rd_valid/rd_data from the
+//     map. The issued address is latched (pc_d1) so trace lines pair
+//     address with data.
+//
+// Non-goals:
+//   - full decode
+//   - branch / exception / interrupt handling
+//   - real IOP R3000 pipeline timing
+//   - BIOS fetch (use a BIOS-pointing RESET_VECTOR param override if
+//     needed, but that's intentionally not the default)
+//
+// Trace payload schema (matches ee_fetch_stub structure under SUBSYS_IOP):
+//   IOP RESET  arg0=reset_vector
+//   IOP IFETCH arg0=pc arg1=data arg2=resp_kind arg3=-
+//     resp_kind: 0=OK (only path in this scope)
+
+`timescale 1ns/1ps
+
+module iop_fetch_stub
+    import trace_pkg::*;
+#(
+    parameter logic [31:0] RESET_VECTOR = 32'h0000_0000
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+    input  logic          enable,
+
+    // Map-facing fetch port
+    output logic          rd_en,
+    output logic [31:0]   rd_addr,
+    input  logic [31:0]   rd_data,
+    input  logic          rd_valid,
+
+    // Trace
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // Bytes consumed per fetch: one 32-bit word.
+    localparam logic [31:0] FETCH_STRIDE = 32'd4;
+
+    // ------------------------------------------------------------------
+    // Fetch address generation.
+    //   pc     — address presented on rd_addr in the current cycle
+    //   pc_d1  — address of the request issued on the previous enabled
+    //            cycle, i.e. the one whose response is expected now
+    // Both registers move together and only while `enable` is high, so
+    // pc_d1 keeps naming the in-flight request across enable gaps.
+    // ------------------------------------------------------------------
+
+    logic [31:0] pc;
+    logic [31:0] pc_d1;
+
+    // A request goes out on every enabled cycle, straight from pc.
+    assign rd_addr = pc;
+    assign rd_en   = enable;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            pc_d1 <= RESET_VECTOR;
+            pc    <= RESET_VECTOR;
+        end else begin
+            if (enable) begin
+                pc_d1 <= pc;
+                pc    <= pc + FETCH_STRIDE;
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace
+    //   - One EV_RESET event on the first clock after reset releases.
+    //   - One EV_IFETCH event per rd_valid response, registered (so it
+    //     appears on the ev_* port the cycle after the response).
+    // The pending reset event outranks a fetch response.
+    // ------------------------------------------------------------------
+
+    logic reset_emit_pending;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            reset_emit_pending <= 1'b1;
+            ev_valid           <= 1'b0;
+            ev_subsys          <= SUBSYS_IOP;
+            ev_event           <= EV_RESET;
+            ev_arg0            <= 64'd0;
+            ev_arg1            <= 64'd0;
+            ev_arg2            <= 64'd0;
+            ev_arg3            <= 64'd0;
+            ev_flags           <= 32'd0;
+        end else if (reset_emit_pending || rd_valid) begin
+            // Fields common to both event kinds.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+
+            if (reset_emit_pending) begin
+                ev_event           <= EV_RESET;
+                ev_arg0            <= {32'd0, RESET_VECTOR};
+                ev_arg1            <= 64'd0;
+                reset_emit_pending <= 1'b0;
+            end else begin
+                // Pair the returning word with the address it was
+                // requested from.
+                ev_event <= EV_IFETCH;
+                ev_arg0  <= {32'd0, pc_d1};
+                ev_arg1  <= {32'd0, rd_data};
+            end
+        end else begin
+            // Nothing to report: drop valid, leave the payload as-is.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : iop_fetch_stub
@@ -0,0 +1,652 @@
+// retroDE_ps2 — iop_memory_map_stub
+//
+// IOP-side memory map. Gives IOP-visible addresses architectural meaning.
+// The Wave 3 first pass routed only the IOP-RAM window; the map now also
+// routes the SIF register shell, IOP DMAC channel 9, the IOP INTC, the
+// shared BIOS ROM (read-only), and a retroDE-local pad I/O window at
+// 0x1F80_8500. Every other address decodes as UNMAPPED with deterministic
+// fault data. General IOP I/O, SPU2, and CDVD remain intentionally
+// deferred — slots noted in comments so the map can grow without
+// re-shaping its interface.
+//
+// Contract refs:
+//   docs/contracts/iop.md         (IOP-local address decode)
+//   docs/contracts/memory.md      (IOP RAM lives at phys 0x00000000-
+//                                  0x001FFFFF, 2 MiB)
+//
+// Address semantics:
+//   IOP CPU side is MIPS R3000-class. kseg0/kseg1 aliases
+//   (0x80000000/0xA0000000 mirrors of 0x00000000) are modelled via
+//   `phys = iop_addr[28:0]`, consistent with how ee_memory_map_stub
+//   strips kseg for EE fetches. Physical window decode then works on
+//   the low 29 bits.
+//
+//   Bridge-side write port (Wave 3 addition): the SIF-to-IOP-RAM bridge
+//   writes directly at physical offsets — no kseg strip. The map decodes
+//   its address bits directly against the same region rules.
+//
+//   DMA read-master port (Wave 3 reverse-direction addition): the IOP
+//   DMAC (ch9 SIF0 egress, and any other future channel) fetches source
+//   bytes through this port. Physical addressing; RAM-only decode in
+//   current scope. Caller provides its own master_id (convention: 4 =
+//   IOP_DMAC).
+//
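+// Arbitration:
+//   Two potential write masters on the RAM path: the IOP CPU port and
+//   the bridge port. Two potential read masters on the RAM path: the
+//   IOP CPU port and the DMA read master. Same-cycle write collisions
+//   are not expected in the current TBs (CPU programming / readback
+//   phases are separate from DMA transfer phases); a same-cycle read
+//   collision was hit by the Ch261 SIF-landing TB. Policy on collision:
+//     - writes: CPU wins; the bridge write of that cycle is not
+//       forwarded to RAM.
+//     - reads (Ch261): DMA wins the RAM port; the CPU read address is
+//       parked in a one-entry pending slot and serviced on the next
+//       cycle the DMA does not use, so `iop_rd_valid` for that read is
+//       delayed accordingly.
+//   Documented here rather than hidden in priority ordering; RAM port is
+//   mux'd accordingly.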
+//
+// Region decode (current):
+//   - IOP RAM window:           phys[28:21] == 8'b00000000
+//                               (0x0000_0000 - 0x001F_FFFF, 2 MiB)
+//                               → route to iop_ram_stub (offset phys[20:0])
+//   - SIF registers (IOP side): phys[28:24] == 5'b11101
+//                               (0x1D00_0000 block) → route to the SIF
+//                               register shell with offset phys[7:0]. The
+//                               mailbox stub's register surface covers
+//                               offsets 0x00/0x10/0x20/0x30.
+//   - IOP DMAC channel 9:       phys[28:4] == 25'h01F8_0152
+//                               (0x1F80_1520 - 0x1F80_152F, 16 bytes)
+//                               → route to iop_dmac_reg_stub with 4-bit
+//                               offset phys[3:0]. Channel 9 is SIF0
+//                               (IOP→EE) in the real PS2 DMAC map; other
+//                               channels are intentionally not decoded.
+//   - IOP INTC:                 phys[28:4] == 25'h01F8_0107
+//                               (0x1F80_1070 - 0x1F80_107F, 16 bytes)
+//                               → route to intc_stub with 8-bit offset
+//                               phys[7:0]. Matches the real PS2 IOP INTC
+//                               placement (I_STAT / I_MASK).
+//   - Shared BIOS ROM:          phys[28:22] == 7'b1111111
+//                               (0x1FC0_0000 - 0x1FFF_FFFF, 4 MiB)
+//                               → route to bios_rom_stub with 22-bit
+//                               offset phys[21:0]. kseg1 aliasing maps
+//                               0xBFC0_0000 fetches to this window via
+//                               the standard [28:0] strip. The IOP core
+//                               reset vector normally points here.
+//                               Writes to BIOS decode as UNMAPPED
+//                               (read-only ROM).
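+//   - Pad I/O (Ch234, retroDE-local): phys[28:8] == 21'h1F_8085
+//                               (0x1F80_8500 - 0x1F80_85FF, 256 bytes)
+//                               → route to the internal sio2_input_stub
+//                               with 4-bit word offset phys[5:2].
+//                               Deliberately outside the real SIO2 range
+//                               (0x1F80_8200 - 0x1F80_82FF).
+//   - everything else: UNMAPPED, reads return 32'hDEADBEEF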
+//
+// Future regions (reserved in comments, not wired):
+//   - Other IOP DMAC channels:   0x1F80_1080-0x1F80_156F (partial block)
+//   - IOP timers / SIO:          elsewhere in 0x1F80_0000 block
+//   - SPU2:                      0x1F90_0000 block
+//
+// Trace semantics (matches ee_memory_map_stub's request-routing pattern):
+//   Map-layer events describe routing (what was asked for, where it was
+//   sent). Arg1 is 0 when the request is routed to a backing store that
+//   will emit its own delivery event; 0xDEADBEEF on unmapped reads; the
+//   actual write data on unmapped writes (so the TB can see what software
+//   tried to write).
+//
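+// Latency assumption (mirrors ee_memory_map_stub note):
+//   Assumes fixed one-cycle backing-store latency. `ram_rd_valid` is not
+//   consulted — the map asserts its own `iop_rd_valid` one cycle after
+//   the read is actually issued downstream. That is the cycle after the
+//   request itself, except for a CPU RAM read deferred by a same-cycle
+//   DMA read (Ch261): its valid follows the cycle in which the deferred
+//   read finally wins the RAM port. All Wave 3 backing stubs honour the
+//   one-cycle latency. If a later backing store introduces wait states,
+//   the map must grow proper response handshaking.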
+//
+// Trace payload schema:
+//   IOP READ      arg0=addr arg1=0            arg2=master_id arg3=region_id
+//   IOP WRITE     arg0=addr arg1=wr_data      arg2=master_id arg3=region_id
+//   IOP UNMAPPED  arg0=addr arg1=0xDEADBEEF   arg2=master_id arg3=0xFF
+//                 (unmapped writes carry wr_data in arg1 instead)
+//     region_id: 2 = IOP_RAM, 3 = SIF_REGS, 4 = IOP_DMAC, 5 = IOP_INTC,
+//                6 = IOP_BIOS, 7 = PAD_IO
+//     master_id: 2 = IOP_CPU, 3 = SIF bridge (writes), 4 = IOP_DMAC (reads)
+//     flags bit 0: 1 = write, 0 = read
+
+`timescale 1ns/1ps
+
+module iop_memory_map_stub
+    import trace_pkg::*;
+(
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // ==================================================================
+    // IOP CPU port — 32-bit data, virtual addresses. Only address bits
+    // [28:0] take part in region decode (kseg strip). Read responses
+    // come back on iop_rd_data / iop_rd_valid. Byte enables are only
+    // forwarded to the RAM write port.
+    // ==================================================================
+    input  logic          iop_rd_en,
+    input  logic [31:0]   iop_rd_addr,
+    output logic [31:0]   iop_rd_data,
+    output logic          iop_rd_valid,
+
+    input  logic          iop_wr_en,
+    input  logic [31:0]   iop_wr_addr,
+    input  logic [31:0]   iop_wr_data,
+    input  logic [3:0]    iop_wr_be,
+
+    // Attribution tag for CPU-port traffic (trace arg2, and
+    // ram_master_id on CPU RAM writes). Convention: 0 = TB direct,
+    // 2 = IOP CPU.
+    input  logic [7:0]    master_id,
+
+    // ==================================================================
+    // Bridge write port. Physical addressing (bits [28:0] are decoded
+    // as-is); only the IOP RAM window is routed. The caller supplies its
+    // own attribution tag (convention: 3 = SIF bridge).
+    // ==================================================================
+    input  logic          bridge_wr_en,
+    input  logic [31:0]   bridge_wr_addr,
+    input  logic [31:0]   bridge_wr_data,
+    input  logic [3:0]    bridge_wr_be,
+    input  logic [7:0]    bridge_master_id,
+
+    // ==================================================================
+    // DMA read-master port. Physical addressing, RAM-only decode. A
+    // response (dma_rd_valid) is produced the cycle after every
+    // dma_rd_en; non-RAM addresses return the fault word. The caller
+    // supplies its own attribution tag (convention: 4 = IOP_DMAC).
+    // ==================================================================
+    input  logic          dma_rd_en,
+    input  logic [31:0]   dma_rd_addr,
+    input  logic [7:0]    dma_master_id,
+    output logic [31:0]   dma_rd_data,
+    output logic          dma_rd_valid,
+
+    // ==================================================================
+    // Downstream: IOP RAM. Addresses are 21-bit offsets inside the
+    // 2 MiB window. ram_rd_valid is part of the interface but is not
+    // consulted by this module (fixed one-cycle latency is assumed).
+    // ==================================================================
+    output logic          ram_rd_en,
+    output logic [20:0]   ram_rd_addr,
+    input  logic [31:0]   ram_rd_data,
+    input  logic          ram_rd_valid,
+
+    output logic          ram_wr_en,
+    output logic [20:0]   ram_wr_addr,
+    output logic [31:0]   ram_wr_data,
+    output logic [3:0]    ram_wr_be,
+    output logic [7:0]    ram_master_id,
+
+    // ==================================================================
+    // Downstream: SIF register shell (IOP-side). The low byte of the
+    // physical address is the register offset; writes carry the CPU
+    // write data. sif_rd_valid is not consulted here.
+    // ==================================================================
+    output logic          sif_rd_en,
+    output logic [7:0]    sif_rd_addr,
+    input  logic [31:0]   sif_rd_data,
+    input  logic          sif_rd_valid,
+
+    output logic          sif_wr_en,
+    output logic [7:0]    sif_wr_addr,
+    output logic [31:0]   sif_wr_data,
+
+    // ==================================================================
+    // Downstream: IOP DMAC channel-9 register shell. 4-bit offset
+    // inside the 16-byte block; writes carry the CPU write data.
+    // iop_dmac_rd_valid is not consulted here.
+    // ==================================================================
+    output logic          iop_dmac_rd_en,
+    output logic [3:0]    iop_dmac_rd_addr,
+    input  logic [31:0]   iop_dmac_rd_data,
+    input  logic          iop_dmac_rd_valid,
+
+    output logic          iop_dmac_wr_en,
+    output logic [3:0]    iop_dmac_wr_addr,
+    output logic [31:0]   iop_dmac_wr_data,
+
+    // ==================================================================
+    // Downstream: IOP INTC register shell. The low byte of the physical
+    // address is passed through; writes carry the CPU write data.
+    // iop_intc_rd_valid is not consulted here.
+    // ==================================================================
+    output logic          iop_intc_rd_en,
+    output logic [7:0]    iop_intc_rd_addr,
+    input  logic [31:0]   iop_intc_rd_data,
+    input  logic          iop_intc_rd_valid,
+
+    output logic          iop_intc_wr_en,
+    output logic [7:0]    iop_intc_wr_addr,
+    output logic [31:0]   iop_intc_wr_data,
+
+    // ==================================================================
+    // Downstream: shared BIOS ROM. 22-bit byte offset inside the 4 MiB
+    // window. Read-only: there is no write port, and CPU writes into
+    // the window are traced as UNMAPPED. bios_rd_valid is not consulted.
+    // ==================================================================
+    output logic          bios_rd_en,
+    output logic [21:0]   bios_rd_addr,
+    input  logic [31:0]   bios_rd_data,
+    input  logic          bios_rd_valid,
+
+    // ==================================================================
+    // Ch234 — pad bitmaps (INPUT_P1/P2), handed unchanged to the
+    // internal `sio2_input_stub` instance. Any clock-domain handling is
+    // inside that stub (not visible in this module). TBs that do not
+    // exercise the pad path can tie both ports to `32'd0`.
+    // ==================================================================
+    input  logic [31:0]   input_p1,
+    input  logic [31:0]   input_p2,
+
+    // ==================================================================
+    // Trace event port (registered; at most one event per cycle).
+    // ==================================================================
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // Region identifiers reported in trace arg3.
+    localparam logic [63:0] REGION_IOP_RAM  = 64'd2;
+    localparam logic [63:0] REGION_SIF_REGS = 64'd3;
+    localparam logic [63:0] REGION_IOP_DMAC = 64'd4;
+    localparam logic [63:0] REGION_IOP_INTC = 64'd5;
+    localparam logic [63:0] REGION_IOP_BIOS = 64'd6;
+    localparam logic [63:0] REGION_PAD_IO   = 64'd7;  // Ch234
+    localparam logic [63:0] REGION_UNMAPPED = 64'hFF;
+
+    // Physical base addresses of the register windows decoded below.
+    localparam logic [28:0] DMAC_CH9_BASE = 29'h1F80_1520;  // 16 bytes
+    localparam logic [28:0] IOP_INTC_BASE = 29'h1F80_1070;  // 16 bytes
+    // Ch234 — retroDE-local 256-byte pad window. Kept clear of the real
+    // SIO2 range (0x1F80_8200..0x1F80_82FF) so that range stays free.
+    localparam logic [28:0] PAD_IO_BASE   = 29'h1F80_8500;
+
+    // Word returned for reads that hit no region.
+    localparam logic [31:0] FAULT_WORD       = 32'hDEADBEEF;
+    // Trace flags: bit 0 set = write, clear = read.
+    localparam logic [31:0] TRACE_FLAGS_READ  = 32'd0;
+    localparam logic [31:0] TRACE_FLAGS_WRITE = 32'h0000_0001;
+
+    // ------------------------------------------------------------------
+    // CPU-port decode. The kseg strip keeps address bits [28:0]; each
+    // window is then matched on its upper address bits.
+    // ------------------------------------------------------------------
+    logic [28:0] rd_phys_addr;
+    logic [28:0] wr_phys_addr;
+    logic [20:0] rd_ram_offset;
+    logic [20:0] cpu_wr_ram_offset;
+    logic        rd_is_ram,     rd_is_sif,     rd_is_dmac;
+    logic        rd_is_intc,    rd_is_bios,    rd_is_pad;
+    logic        cpu_wr_is_ram, cpu_wr_is_sif, cpu_wr_is_dmac;
+    logic        cpu_wr_is_intc, cpu_wr_is_bios, cpu_wr_is_pad;
+
+    assign rd_phys_addr = iop_rd_addr[28:0];
+    assign wr_phys_addr = iop_wr_addr[28:0];
+
+    always_comb begin : cpu_rd_decode
+        rd_ram_offset = rd_phys_addr[20:0];
+        rd_is_ram     = (rd_phys_addr[28:21] == 8'd0);                 // 2 MiB @ 0x0000_0000
+        rd_is_sif     = (rd_phys_addr[28:24] == 5'b11101);             // 0x1D00_0000 block
+        rd_is_bios    = (rd_phys_addr[28:22] == 7'b1111111);           // 4 MiB @ 0x1FC0_0000
+        rd_is_dmac    = (rd_phys_addr[28:4]  == DMAC_CH9_BASE[28:4]);
+        rd_is_intc    = (rd_phys_addr[28:4]  == IOP_INTC_BASE[28:4]);
+        rd_is_pad     = (rd_phys_addr[28:8]  == PAD_IO_BASE[28:8]);    // 256-byte window
+    end
+
+    always_comb begin : cpu_wr_decode
+        cpu_wr_ram_offset = wr_phys_addr[20:0];
+        cpu_wr_is_ram     = (wr_phys_addr[28:21] == 8'd0);
+        cpu_wr_is_sif     = (wr_phys_addr[28:24] == 5'b11101);
+        cpu_wr_is_bios    = (wr_phys_addr[28:22] == 7'b1111111);
+        cpu_wr_is_dmac    = (wr_phys_addr[28:4]  == DMAC_CH9_BASE[28:4]);
+        cpu_wr_is_intc    = (wr_phys_addr[28:4]  == IOP_INTC_BASE[28:4]);
+        cpu_wr_is_pad     = (wr_phys_addr[28:8]  == PAD_IO_BASE[28:8]);
+    end
+
+    // ------------------------------------------------------------------
+    // Bridge-write and DMA-read decode: physical addresses, matched on
+    // bits [28:21] directly; only the RAM window is recognised.
+    // ------------------------------------------------------------------
+    logic        bridge_wr_is_ram;
+    logic [20:0] bridge_wr_ram_offset;
+    logic        dma_rd_is_ram;
+    logic [20:0] dma_rd_ram_offset;
+
+    always_comb begin : side_port_decode
+        bridge_wr_is_ram     = (bridge_wr_addr[28:21] == 8'd0);
+        bridge_wr_ram_offset = bridge_wr_addr[20:0];
+        dma_rd_is_ram        = (dma_rd_addr[28:21] == 8'd0);
+        dma_rd_ram_offset    = dma_rd_addr[20:0];
+    end
+
+    // ------------------------------------------------------------------
+    // RAM read arbitration (Ch261).
+    //
+    // The RAM read port is shared by the CPU and the DMA read master.
+    // When both hit RAM in the same cycle the DMA read is issued and the
+    // CPU offset is parked in a one-entry slot; the parked read is
+    // issued on the first later cycle in which the DMA is not reading
+    // RAM. Giving the port to the CPU instead would make the DMA
+    // response sample `ram_rd_data` fetched from the CPU's address.
+    //
+    // One entry is enough only while every CPU-side client keeps a
+    // single read outstanding. The sim-only check below reports a second
+    // collision that arrives while the slot is still occupied.
+    // ------------------------------------------------------------------
+    logic cpu_rd_hit;
+    logic dma_rd_hit;
+    logic cpu_dma_collision;
+    assign cpu_rd_hit        = iop_rd_en && rd_is_ram;
+    assign dma_rd_hit        = dma_rd_en && dma_rd_is_ram;
+    assign cpu_dma_collision = cpu_rd_hit && dma_rd_hit;
+
+    // Parked CPU RAM read.
+    logic        cpu_pend_valid;
+    logic [20:0] cpu_pend_addr;
+
+    // Who drives the RAM read port this cycle (at most one is high):
+    //   serve_dma     — DMA read hitting RAM; always granted
+    //   serve_cpu_def — parked CPU read, granted when the DMA is idle
+    //   serve_cpu_now — live CPU read, granted when nothing above applies
+    logic serve_dma;
+    logic serve_cpu_def;
+    logic serve_cpu_now;
+    assign serve_dma     = dma_rd_hit;
+    assign serve_cpu_def = cpu_pend_valid && !dma_rd_hit;
+    assign serve_cpu_now = cpu_rd_hit && !(dma_rd_hit || cpu_pend_valid);
+
+    assign ram_rd_en   = |{serve_dma, serve_cpu_def, serve_cpu_now};
+    assign ram_rd_addr = serve_dma ? dma_rd_ram_offset
+                                   : (serve_cpu_def ? cpu_pend_addr : rd_ram_offset);
+
+    // Slot bookkeeping: an occupied slot empties when the parked read is
+    // issued; an empty slot fills on a collision.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            cpu_pend_valid <= 1'b0;
+            cpu_pend_addr  <= 21'd0;
+        end else if (cpu_pend_valid) begin
+            if (serve_cpu_def) begin
+                cpu_pend_valid <= 1'b0;
+            end
+        end else if (cpu_dma_collision) begin
+            cpu_pend_valid <= 1'b1;
+            cpu_pend_addr  <= rd_ram_offset;
+        end
+    end
+
+`ifndef SYNTHESIS
+    // Sim-only overflow check: a collision while the slot is occupied
+    // cannot be parked, so that CPU read would get no response.
+    always_ff @(posedge clk) begin
+        if (rst_n) begin
+            if (cpu_dma_collision && cpu_pend_valid) begin
+                $error("[iop_memory_map_stub] Ch261 deferred-CPU-slot overflow: cpu_dma_collision while cpu_pend_valid (live addr=0x%05h pending addr=0x%05h)",
+                       rd_ram_offset, cpu_pend_addr);
+            end
+        end
+    end
+`endif
+
+    // ------------------------------------------------------------------
+    // Register-shell and BIOS routing (CPU port only). These paths are
+    // independent of the RAM port, so they are never arbitrated.
+    // ------------------------------------------------------------------
+
+    // Read strobes.
+    assign sif_rd_en      = iop_rd_en && rd_is_sif;
+    assign iop_dmac_rd_en = iop_rd_en && rd_is_dmac;
+    assign iop_intc_rd_en = iop_rd_en && rd_is_intc;
+    assign bios_rd_en     = iop_rd_en && rd_is_bios;
+
+    // Write strobes (no BIOS write path — the window is read-only).
+    assign sif_wr_en      = iop_wr_en && cpu_wr_is_sif;
+    assign iop_dmac_wr_en = iop_wr_en && cpu_wr_is_dmac;
+    assign iop_intc_wr_en = iop_wr_en && cpu_wr_is_intc;
+
+    // Offsets presented downstream.
+    assign sif_rd_addr      = rd_phys_addr[7:0];
+    assign sif_wr_addr      = wr_phys_addr[7:0];
+    assign iop_dmac_rd_addr = rd_phys_addr[3:0];
+    assign iop_dmac_wr_addr = wr_phys_addr[3:0];
+    // NOTE(review): the INTC window is the 16 bytes at 0x1F80_1070, so
+    // the low byte presented here is always 0x70..0x7F. Confirm that the
+    // INTC register shell decodes those offsets.
+    assign iop_intc_rd_addr = rd_phys_addr[7:0];
+    assign iop_intc_wr_addr = wr_phys_addr[7:0];
+    assign bios_rd_addr     = rd_phys_addr[21:0];
+
+    // Every register shell sees the CPU write data unmodified.
+    assign sif_wr_data      = iop_wr_data;
+    assign iop_dmac_wr_data = iop_wr_data;
+    assign iop_intc_wr_data = iop_wr_data;
+
+    // ------------------------------------------------------------------
+    // Ch234 — pad I/O window. The map instantiates the one
+    // `sio2_input_stub` and feeds it the INPUT_P1/P2 bitmaps. The stub is
+    // addressed with a 4-bit word index taken from phys[5:2]; address
+    // bits [7:6] inside the 256-byte window are not forwarded.
+    // pad_rd_valid is not consulted here.
+    // ------------------------------------------------------------------
+    wire        pad_rd_en   = iop_rd_en && rd_is_pad;
+    wire [3:0]  pad_rd_addr = rd_phys_addr[5:2];
+    wire [31:0] pad_rd_data;
+    wire        pad_rd_valid;
+    wire        pad_wr_en   = iop_wr_en && cpu_wr_is_pad;
+    wire [3:0]  pad_wr_addr = wr_phys_addr[5:2];
+    wire [31:0] pad_wr_data = iop_wr_data;
+
+    sio2_input_stub u_sio2_input (
+        .clk      (clk),
+        .rst_n    (rst_n),
+        .input_p1 (input_p1),
+        .input_p2 (input_p2),
+        .rd_en    (pad_rd_en),
+        .rd_addr  (pad_rd_addr),
+        .rd_data  (pad_rd_data),
+        .rd_valid (pad_rd_valid),
+        .wr_en    (pad_wr_en),
+        .wr_addr  (pad_wr_addr),
+        .wr_data  (pad_wr_data)
+    );
+
+    // ------------------------------------------------------------------
+    // RAM write arbitration. A CPU RAM write takes the port whenever it
+    // is present; a bridge write arriving in the same cycle is not
+    // forwarded. The whole write bundle (offset, data, byte enables,
+    // attribution tag) follows the winner.
+    // ------------------------------------------------------------------
+    logic cpu_wr_hit;
+    logic bridge_wr_hit;
+    assign cpu_wr_hit    = iop_wr_en    && cpu_wr_is_ram;
+    assign bridge_wr_hit = bridge_wr_en && bridge_wr_is_ram;
+
+    assign ram_wr_en = cpu_wr_hit || bridge_wr_hit;
+    assign {ram_wr_addr, ram_wr_data, ram_wr_be, ram_master_id} =
+        cpu_wr_hit ? {cpu_wr_ram_offset,    iop_wr_data,    iop_wr_be,    master_id}
+                   : {bridge_wr_ram_offset, bridge_wr_data, bridge_wr_be, bridge_master_id};
+
+    // ------------------------------------------------------------------
+    // CPU read response pipeline.
+    //
+    // rd_pending (= iop_rd_valid) is set for exactly the cycle after a
+    // CPU read has been issued downstream, and the rd_was_* flags record
+    // which source to return:
+    //   - parked RAM read issued this cycle  -> respond from RAM
+    //   - live read that is not a RAM/DMA collision (any region, or
+    //     unmapped)                          -> respond from that region
+    //   - collision cycle, or no request     -> no response; the flags
+    //                                           keep their old value
+    // The parked-read case is tested first, so a live request arriving
+    // in that same cycle produces no response of its own.
+    // ------------------------------------------------------------------
+    logic rd_pending;
+    logic rd_was_ram;
+    logic rd_was_sif;
+    logic rd_was_dmac;
+    logic rd_was_intc;
+    logic rd_was_bios;
+    logic rd_was_pad;  // Ch234
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            rd_pending <= 1'b0;
+            {rd_was_ram,  rd_was_sif,  rd_was_dmac,
+             rd_was_intc, rd_was_bios, rd_was_pad} <= 6'b000000;
+        end else if (serve_cpu_def) begin
+            // Parked RAM read goes out now; its data is valid next cycle.
+            rd_pending <= 1'b1;
+            {rd_was_ram,  rd_was_sif,  rd_was_dmac,
+             rd_was_intc, rd_was_bios, rd_was_pad} <= 6'b100000;
+        end else if (iop_rd_en && (!rd_is_ram || !cpu_dma_collision)) begin
+            // Live read issued this cycle (not displaced by the DMA).
+            rd_pending <= 1'b1;
+            {rd_was_ram,  rd_was_sif,  rd_was_dmac,
+             rd_was_intc, rd_was_bios, rd_was_pad} <=
+                {rd_is_ram,  rd_is_sif,  rd_is_dmac,
+                 rd_is_intc, rd_is_bios, rd_is_pad};
+        end else begin
+            // Idle, or the live read was just parked behind the DMA.
+            rd_pending <= 1'b0;
+        end
+    end
+
+    // Response data: RAM first, then the register shells / BIOS / pad,
+    // and the fault word when the read hit no region.
+    logic [31:0] non_ram_rd_data;
+    assign non_ram_rd_data = rd_was_sif  ? sif_rd_data
+                           : rd_was_dmac ? iop_dmac_rd_data
+                           : rd_was_intc ? iop_intc_rd_data
+                           : rd_was_bios ? bios_rd_data
+                           : rd_was_pad  ? pad_rd_data
+                                         : FAULT_WORD;
+
+    assign iop_rd_valid = rd_pending;
+    assign iop_rd_data  = rd_was_ram ? ram_rd_data : non_ram_rd_data;
+
+    // ------------------------------------------------------------------
+    // DMA read response pipeline — independent of the CPU pipeline.
+    // Every dma_rd_en yields dma_rd_valid one cycle later. A RAM-hitting
+    // DMA read always owns the RAM read port in its request cycle (see
+    // the arbitration above), so `ram_rd_data` belongs to the DMA in the
+    // response cycle.
+    // ------------------------------------------------------------------
+    logic dma_rd_pending;
+    logic dma_rd_was_ram;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            dma_rd_pending <= 1'b0;
+            dma_rd_was_ram <= 1'b0;
+        end else begin
+            dma_rd_pending <= dma_rd_en;
+            // Region flag is only refreshed by a new request.
+            if (dma_rd_en) begin
+                dma_rd_was_ram <= dma_rd_is_ram;
+            end
+        end
+    end
+
+    assign dma_rd_valid = dma_rd_pending;
+    assign dma_rd_data  = dma_rd_was_ram ? ram_rd_data : FAULT_WORD;
+
+    // ------------------------------------------------------------------
+    // Trace: region classification for the CPU port.
+    //
+    // First matching window wins, in the order RAM, SIF, DMAC, INTC,
+    // BIOS, pad. BIOS is absent from the write side on purpose: a CPU
+    // write into the BIOS window is reported as UNMAPPED.
+    // ------------------------------------------------------------------
+    logic        cpu_rd_routed;
+    logic        cpu_wr_routed;
+    logic [63:0] cpu_rd_region;
+    logic [63:0] cpu_wr_region;
+
+    always_comb begin : cpu_region_classify
+        cpu_rd_routed = 1'b1;
+        if      (rd_is_ram)  cpu_rd_region = REGION_IOP_RAM;
+        else if (rd_is_sif)  cpu_rd_region = REGION_SIF_REGS;
+        else if (rd_is_dmac) cpu_rd_region = REGION_IOP_DMAC;
+        else if (rd_is_intc) cpu_rd_region = REGION_IOP_INTC;
+        else if (rd_is_bios) cpu_rd_region = REGION_IOP_BIOS;
+        else if (rd_is_pad)  cpu_rd_region = REGION_PAD_IO;
+        else begin
+            cpu_rd_routed = 1'b0;
+            cpu_rd_region = REGION_UNMAPPED;
+        end
+
+        cpu_wr_routed = 1'b1;
+        if      (cpu_wr_is_ram)  cpu_wr_region = REGION_IOP_RAM;
+        else if (cpu_wr_is_sif)  cpu_wr_region = REGION_SIF_REGS;
+        else if (cpu_wr_is_dmac) cpu_wr_region = REGION_IOP_DMAC;
+        else if (cpu_wr_is_intc) cpu_wr_region = REGION_IOP_INTC;
+        else if (cpu_wr_is_pad)  cpu_wr_region = REGION_PAD_IO;
+        else begin
+            cpu_wr_routed = 1'b0;
+            cpu_wr_region = REGION_UNMAPPED;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace emission — one registered event per cycle, describing the
+    // request as it was presented (not when it is answered). When more
+    // than one port is active only the highest-priority one is traced:
+    //   CPU read  >  CPU write  >  DMA read  >  bridge write
+    // arg1 is 0 for routed reads, the fault word for unmapped reads, and
+    // the write data for every write (routed or not).
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_READ;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (iop_rd_en) begin
+            // CPU read.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= cpu_rd_routed ? EV_READ : EV_UNMAPPED;
+            ev_arg0   <= {32'd0, iop_rd_addr};
+            ev_arg1   <= cpu_rd_routed ? 64'd0 : {32'd0, FAULT_WORD};
+            ev_arg2   <= {56'd0, master_id};
+            ev_arg3   <= cpu_rd_region;
+            ev_flags  <= TRACE_FLAGS_READ;
+        end else if (iop_wr_en) begin
+            // CPU write.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= cpu_wr_routed ? EV_WRITE : EV_UNMAPPED;
+            ev_arg0   <= {32'd0, iop_wr_addr};
+            ev_arg1   <= {32'd0, iop_wr_data};
+            ev_arg2   <= {56'd0, master_id};
+            ev_arg3   <= cpu_wr_region;
+            ev_flags  <= TRACE_FLAGS_WRITE;
+        end else if (dma_rd_en) begin
+            // DMA read (RAM or unmapped).
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= dma_rd_is_ram ? EV_READ : EV_UNMAPPED;
+            ev_arg0   <= {32'd0, dma_rd_addr};
+            ev_arg1   <= dma_rd_is_ram ? 64'd0 : {32'd0, FAULT_WORD};
+            ev_arg2   <= {56'd0, dma_master_id};
+            ev_arg3   <= dma_rd_is_ram ? REGION_IOP_RAM : REGION_UNMAPPED;
+            ev_flags  <= TRACE_FLAGS_READ;
+        end else if (bridge_wr_en) begin
+            // Bridge write (RAM or unmapped).
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= bridge_wr_is_ram ? EV_WRITE : EV_UNMAPPED;
+            ev_arg0   <= {32'd0, bridge_wr_addr};
+            ev_arg1   <= {32'd0, bridge_wr_data};
+            ev_arg2   <= {56'd0, bridge_master_id};
+            ev_arg3   <= bridge_wr_is_ram ? REGION_IOP_RAM : REGION_UNMAPPED;
+            ev_flags  <= TRACE_FLAGS_WRITE;
+        end else begin
+            // Nothing to report; payload registers keep their last value.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : iop_memory_map_stub
@@ -0,0 +1,153 @@
+// retroDE_ps2 — iop_ram_stub
+//
+// First narrow IOP-side primitive. 32-bit IOP-RAM stub, architecturally
+// honest to the IOP's R3000-class 32-bit bus. NOT an IOP CPU — this is
+// pure memory. No fetch, no execution, no BIOS bring-up. Future IOP-side
+// work (fetch stub, IOP memory map, DMAC routing) can build on top of it.
+//
+// Contract refs:
+//   docs/contracts/iop.md           (IOP-local RAM/I/O decode)
+//   docs/contracts/memory.md        (2 MiB IOP RAM in the PS2 memory map)
+//
+// Scope:
+//   - read/write 32-bit data
+//   - byte-enable granularity on writes
+//   - one-cycle read latency (matches existing stub ecosystem)
+//   - caller-provided master_id for trace attribution
+//   - trace events tagged as SUBSYS_IOP so IOP-side memory traffic is
+//     distinct from EE MEM events even when both are active
+//
+// Explicit non-goals (Wave 3 IOP first step):
+//   - IOP CPU execution
+//   - full 2 MiB sizing (default is 16 KiB — plenty for stub tests)
+//   - integration into any IOP memory map yet
+//   - connection to SIF receive path (intentional: kept independent so
+//     future bridging is explicit, not accidental)
+//
+// Trace payload schema:
+//   IOP READ   arg0=addr arg1=data arg2=master_id arg3=region_id
+//   IOP WRITE  arg0=addr arg1=data arg2=master_id arg3=region_id
+//     master_id : caller-provided (e.g. 0 = TB direct, future: 2 = IOP CPU,
+//                 3 = SIF bridge, etc.)
+//     region_id : 2 = IOP_RAM (constant for this module)
+//     flags[0]  : 1 = write, 0 = read
+
+`timescale 1ns/1ps
+
+module iop_ram_stub
+    import trace_pkg::*;
+#(
+    // Backing-store size in bytes. Address/index widths are derived with
+    // $clog2, so a non-power-of-two value leaves word indices above
+    // WORD_COUNT-1 addressable; keep this a power of two.
+    parameter int    SIZE_BYTES = 16 * 1024,   // 16 KiB default
+    // Optional hex image handed to $readmemh; "" selects zero-fill.
+    parameter string IMAGE_FILE = ""
+) (
+    input  logic                          clk,
+    input  logic                          rst_n,     // synchronous, active-low
+
+    // Read port. Byte address; bits [1:0] are ignored (whole-word read).
+    input  logic                          rd_en,
+    input  logic [$clog2(SIZE_BYTES)-1:0] rd_addr,
+    output logic [31:0]                   rd_data,   // updated the clock after rd_en
+    output logic                          rd_valid,  // rd_en delayed by one clock
+
+    // Write port. Byte address; bits [1:0] are ignored. wr_be[b] gates
+    // byte lane b, i.e. wr_data[8*b +: 8].
+    input  logic                          wr_en,
+    input  logic [$clog2(SIZE_BYTES)-1:0] wr_addr,
+    input  logic [31:0]                   wr_data,
+    input  logic [3:0]                    wr_be,
+
+    // Caller-provided master id for trace attribution
+    input  logic [7:0]                    master_id,
+
+    // Trace. ev_valid is a one-clock strobe; the payload fields keep
+    // their previous value on cycles with no request.
+    output logic                          ev_valid,
+    output subsys_e                       ev_subsys,
+    output event_e                        ev_event,
+    output logic [63:0]                   ev_arg0,
+    output logic [63:0]                   ev_arg1,
+    output logic [63:0]                   ev_arg2,
+    output logic [63:0]                   ev_arg3,
+    output logic [31:0]                   ev_flags
+);
+
+    localparam int ADDR_WIDTH       = $clog2(SIZE_BYTES);
+    localparam int WORD_COUNT       = SIZE_BYTES / 4;
+    localparam int WORD_INDEX_WIDTH = ADDR_WIDTH - 2;
+    localparam logic [63:0] REGION_IOP_RAM = 64'd2;
+
+    logic [31:0] mem [0:WORD_COUNT-1];
+
+    // Preload: either a user-supplied hex image or all zeroes.
+    initial begin
+        if (IMAGE_FILE != "") begin
+            $display("[iop_ram_stub] loading image: %0s", IMAGE_FILE);
+            $readmemh(IMAGE_FILE, mem);
+        end else begin
+            for (int i = 0; i < WORD_COUNT; i++) mem[i] = 32'd0;
+            $display("[iop_ram_stub] zero-initialised (%0d words / %0d bytes)",
+                     WORD_COUNT, SIZE_BYTES);
+        end
+    end
+
+    // Word indices: drop the two byte-offset bits of each address.
+    logic [WORD_INDEX_WIDTH-1:0] rd_word_idx;
+    logic [WORD_INDEX_WIDTH-1:0] wr_word_idx;
+    assign rd_word_idx = rd_addr[ADDR_WIDTH-1:2];
+    assign wr_word_idx = wr_addr[ADDR_WIDTH-1:2];
+
+    // ------------------------------------------------------------------
+    // Read + write (one-cycle latency)
+    //
+    // Deliberately a plain `always`, not `always_ff`: `mem` is also
+    // written by the `initial` preload above, and IEEE 1800 does not
+    // allow a variable assigned in an `always_ff` to be written by any
+    // other process. Clocked behaviour is unchanged.
+    //
+    // Writes are suppressed while rst_n is low. All updates are
+    // non-blocking, so a read that hits the word being written in the
+    // same cycle returns the pre-write contents.
+    // ------------------------------------------------------------------
+
+    always @(posedge clk) begin
+        if (!rst_n) begin
+            rd_data  <= 32'd0;
+            rd_valid <= 1'b0;
+        end else begin
+            rd_valid <= rd_en;
+            if (rd_en) rd_data <= mem[rd_word_idx];
+
+            if (wr_en) begin
+                for (int b = 0; b < 4; b++) begin
+                    if (wr_be[b]) mem[wr_word_idx][b*8 +: 8] <= wr_data[b*8 +: 8];
+                end
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace emission — one event per clock. Read wins on a same-cycle
+    // collision (single-port RAM wouldn't see that anyway in Wave 3), so
+    // a write that coincides with a read is stored but not traced.
+    //
+    // READ  : arg1 = the word sampled at the request edge (the same value
+    //         loaded into rd_data), flags = 0.
+    // WRITE : arg1 = wr_data as presented (wr_be is not applied or
+    //         reported), flags[0] = 1.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_READ;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (rd_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_READ;
+            ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, rd_addr};
+            ev_arg1   <= {32'd0, mem[rd_word_idx]};
+            ev_arg2   <= {56'd0, master_id};
+            ev_arg3   <= REGION_IOP_RAM;
+            ev_flags  <= 32'd0;
+        end else if (wr_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_IOP;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, wr_addr};
+            ev_arg1   <= {32'd0, wr_data};
+            ev_arg2   <= {56'd0, master_id};
+            ev_arg3   <= REGION_IOP_RAM;
+            ev_flags  <= 32'h0000_0001;
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : iop_ram_stub
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Copyright (c) 2025-2026 retroDE contributors
+// ============================================================================
+// sio2_input_stub — Ch234 retroDE-local IOP-readable pad input stub
+// ============================================================================
+// **Not real SIO2.** A deliberately minimal MMIO surface that translates
+// the Ch222 HPS-written `INPUT_P1`/`INPUT_P2` controller bitmaps into a
+// Sony-format 16-bit digital pad word, exposed as IOP-readable
+// registers in the retroDE-local I/O window
+// `0x1F80_8500..0x1F80_85FF`. Real SIO2 emulation (`0x1F80_8200..0x1F80_82FF`,
+// FIFO, command/response, IOP DMAC channel 11) is intentionally deferred
+// — see `docs/contracts/sio2_pad.md` for the reconnaissance + scoping.
+//
+// **Register surface** (low address bits shown; PAD_IO_BASE = 0x1F80_8500):
+//
+//   0x500  PAD_P1_STATE  (RO)  [15:0]  = Sony 16-bit pad word for P1
+//                              [31:16] = 0
+//   0x504  PAD_P2_STATE  (RO)  Same shape, sourced from `input_p2`.
+//   0x508  PAD_STATUS    (RO)  [0]     = pad path present/valid = 1
+//                              [31:1]  = 0
+//   other  reserved            reads return 32'd0; writes accepted-and-ignored.
+//
+// **Sony pad word format (Sony "digital mode" / type 0x41 response,
+// bytes 3 and 4 of the libpad/padman struct):**
+//
+//   pad_byte3 (D-pad / start / select / sticks; active-low, 0 = pressed):
+//     bit 7  LEFT     bit 6  DOWN    bit 5  RIGHT   bit 4  UP
+//     bit 3  START    bit 2  R3      bit 1  L3      bit 0  SELECT
+//
+//   pad_byte4 (face / shoulder buttons; active-low):
+//     bit 7  □ square   bit 6  × cross   bit 5  ○ circle   bit 4  △ triangle
+//     bit 3  R1         bit 2  L1        bit 1  R2         bit 0  L2
+//
+//   PAD_P1_STATE[7:0]  = pad_byte3
+//   PAD_P1_STATE[15:8] = pad_byte4
+//
+// **INPUT_P1 → Sony mapping** (per `docs/contracts/sio2_pad.md`,
+// SNES-style 32-bit retroDE bitmap folded onto Sony names by spatial
+// face-button layout — matches the convention coco2 / a2600 already use):
+//
+//   INPUT_P1[ 0] JOY_RIGHT  → Sony RIGHT       (byte3.5)
+//   INPUT_P1[ 1] JOY_LEFT   → Sony LEFT        (byte3.7)
+//   INPUT_P1[ 2] JOY_DOWN   → Sony DOWN        (byte3.6)
+//   INPUT_P1[ 3] JOY_UP     → Sony UP          (byte3.4)
+//   INPUT_P1[ 4] JOY_START  → Sony START       (byte3.3)
+//   INPUT_P1[ 5] JOY_SELECT → Sony SELECT      (byte3.0)
+//   INPUT_P1[ 6] JOY_Y      → Sony △ triangle  (byte4.4)
+//   INPUT_P1[ 7] JOY_B      → Sony × cross     (byte4.6)
+//   INPUT_P1[ 8] JOY_X      → Sony □ square    (byte4.7)
+//   INPUT_P1[ 9] JOY_A      → Sony ○ circle    (byte4.5)
+//   INPUT_P1[10] JOY_L      → Sony L1          (byte4.2)
+//   INPUT_P1[11] JOY_R      → Sony R1          (byte4.3)
+//   INPUT_P1[12] JOY_L2     → Sony L2          (byte4.0)
+//   INPUT_P1[13] JOY_R2     → Sony R2          (byte4.1)
+//   INPUT_P1[14] JOY_L3     → Sony L3          (byte3.1)
+//   INPUT_P1[15] JOY_R3     → Sony R3          (byte3.2)
+//   INPUT_P1[16] JOY_OSD    → not forwarded (retrodesd consumes it)
+//
+// retroDE bitmap is **active-high** (1 = pressed); Sony word is
+// **active-low** (0 = pressed). The two `pad_byteN` assigns invert
+// per-bit and reorder.
+//
+// **CDC contract.** `input_p1`/`input_p2` are bridge-clock-domain
+// signals (CLOCK2_50). This module runs on the IOP/design clock.
+// The 2-FF synchronizer chain inside is the standard retroDE
+// single-bit sync; tearing between bits during a partial-write
+// settling window is theoretically possible but practically
+// vanishingly rare (retrodesd writes the whole 32-bit latch at
+// one bridge edge ≤ 1 kHz; the IOP-side read is a small window
+// against millions of bridge cycles). A future chapter can promote
+// this to "snapshot CDC" (latch + 2-sample coherency) if tearing
+// ever becomes observable.
+//
+// In the focused TB and single-clock sim setups, the 2-FF sync is
+// a no-op functionally and adds 2 cycles of read latency from
+// input change to readable register update.
+// ============================================================================
+
+`timescale 1ns/1ps
+
+module sio2_input_stub (
+    input  logic        clk,             // IOP / design clock
+    input  logic        rst_n,           // asynchronous, active-low
+
+    // Controller bitmaps from the bridge clock domain (active-high,
+    // 1 = pressed). Synchronised internally before use.
+    input  logic [31:0] input_p1,
+    input  logic [31:0] input_p2,
+
+    // IOP map read port. `rd_addr` is a 4-bit word offset within the
+    // PAD I/O region (0x500 → 0x0, 0x504 → 0x1, 0x508 → 0x2, ...).
+    input  logic        rd_en,
+    input  logic [3:0]  rd_addr,
+    output logic [31:0] rd_data,
+    output logic        rd_valid,
+
+    // IOP map write port. Nothing in this module is writable; the
+    // inputs exist only so the block fits the common peripheral shape.
+    input  logic        wr_en,
+    input  logic [3:0]  wr_addr,
+    input  logic [31:0] wr_data,
+
+    // Ch250 — translated Sony pad words for fabric consumers that do
+    // not go through the IOP read path (the synth top drives status
+    // LEDs from `p1_sony_word_o`). Active-low, and always equal to the
+    // low 16 bits returned by reads of word offsets 0 and 1.
+    output logic [15:0] p1_sony_word_o,
+    output logic [15:0] p2_sony_word_o
+);
+
+    // -----------------------------------------------------------------
+    // Two-stage synchroniser for every P1/P2 bit. Stage 0 samples the
+    // raw input, stage 1 is what the rest of the module consumes. Both
+    // stages clear to 0 in reset, i.e. "no button pressed".
+    // -----------------------------------------------------------------
+    logic [31:0] p1_sync_0, p1_sync_1;
+    logic [31:0] p2_sync_0, p2_sync_1;
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            {p1_sync_1, p1_sync_0} <= 64'd0;
+            {p2_sync_1, p2_sync_0} <= 64'd0;
+        end else begin
+            {p1_sync_1, p1_sync_0} <= {p1_sync_0, input_p1};
+            {p2_sync_1, p2_sync_0} <= {p2_sync_0, input_p2};
+        end
+    end
+
+    // -----------------------------------------------------------------
+    // retroDE bitmap → Sony 16-bit pad word.
+    //
+    // `pressed` is assembled in Sony bit order but still active-high;
+    // the final inversion produces the active-low word. Bits [7:0]
+    // are pad_byte3, bits [15:8] are pad_byte4. retroDE bits 16 and up
+    // (including JOY_OSD) do not contribute.
+    // -----------------------------------------------------------------
+    function automatic logic [15:0] sony_word(input logic [31:0] joy);
+        logic [15:0] pressed;
+        // pad_byte3
+        pressed[0]  = joy[5];    // SELECT
+        pressed[1]  = joy[14];   // L3
+        pressed[2]  = joy[15];   // R3
+        pressed[3]  = joy[4];    // START
+        pressed[4]  = joy[3];    // UP
+        pressed[5]  = joy[0];    // RIGHT
+        pressed[6]  = joy[2];    // DOWN
+        pressed[7]  = joy[1];    // LEFT
+        // pad_byte4
+        pressed[8]  = joy[12];   // L2
+        pressed[9]  = joy[13];   // R2
+        pressed[10] = joy[10];   // L1
+        pressed[11] = joy[11];   // R1
+        pressed[12] = joy[6];    // triangle
+        pressed[13] = joy[9];    // circle
+        pressed[14] = joy[7];    // cross
+        pressed[15] = joy[8];    // square
+        return ~pressed;
+    endfunction
+
+    logic [15:0] p1_word;
+    logic [15:0] p2_word;
+    assign p1_word = sony_word(p1_sync_1);
+    assign p2_word = sony_word(p2_sync_1);
+
+    // Ch250 — parallel taps of the words served to the IOP.
+    assign p1_sony_word_o = p1_word;
+    assign p2_sony_word_o = p2_word;
+
+    // -----------------------------------------------------------------
+    // Register word offsets, compared against the full 4-bit `rd_addr`.
+    //   4'h0  PAD_P1_STATE   (0x500)
+    //   4'h1  PAD_P2_STATE   (0x504)
+    //   4'h2  PAD_STATUS     (0x508)
+    // Every other offset reads back as zero.
+    // -----------------------------------------------------------------
+    localparam logic [3:0] OFF_P1_STATE = 4'h0;
+    localparam logic [3:0] OFF_P2_STATE = 4'h1;
+    localparam logic [3:0] OFF_STATUS   = 4'h2;
+
+    // -----------------------------------------------------------------
+    // Read path: combinational select, then a register stage so that
+    // rd_valid / rd_data appear the clock after rd_en. rd_data keeps
+    // its last value on cycles without a read.
+    // -----------------------------------------------------------------
+    logic [31:0] rd_data_c;
+    always_comb begin
+        rd_data_c = 32'd0;                       // reserved offsets
+        if (rd_addr == OFF_P1_STATE)
+            rd_data_c = {16'd0, p1_word};
+        else if (rd_addr == OFF_P2_STATE)
+            rd_data_c = {16'd0, p2_word};
+        else if (rd_addr == OFF_STATUS)
+            rd_data_c = {31'd0, 1'b1};           // bit 0: pad path present
+    end
+
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            rd_valid <= 1'b0;
+            rd_data  <= 32'd0;
+        end else begin
+            if (rd_en)
+                rd_data <= rd_data_c;
+            rd_valid <= rd_en;
+        end
+    end
+
+    // -----------------------------------------------------------------
+    // The write port is intentionally a sink. Folding its inputs into a
+    // constant-zero reduction keeps lint from reporting them as unused.
+    // -----------------------------------------------------------------
+    // verilator lint_off UNUSED
+    wire _wr_unused = &{1'b0, wr_en, wr_addr, wr_data, 1'b0};
+    // verilator lint_on UNUSED
+
+endmodule : sio2_input_stub
@@ -0,0 +1,33 @@
+# rtl/memory
+
+Memory visibility, storage, and arbitration. Matches `docs/contracts/memory.md`.
+
+Per the BIOS-ownership split (memory owns storage, IOP owns behavior), this
+directory contains the storage/mapping layer. BIOS boot sequencing
+(IOPBOOT / IOPBTCONF parsing) belongs under `rtl/iop/`.
+
+## Wave 1 contents
+
+- `bios_rom_stub.sv` — 4 MiB BIOS ROM adapter. Loads a user-supplied hex
+  image via `$readmemh` when `IMAGE_FILE` is set, otherwise falls back to a
+  synthetic NOP sled. One-cycle read latency.
+- `ee_memory_map_stub.sv` — EE-side address decode. Wave 2.7 revision adds
+  a DMAC read-master port (128-bit data, physical addressing) with its own
+  RAM-window decode at 0x00000000-0x01FFFFFF routing to `ee_ram_stub`. EE
+  fetch path still uses kseg-aliased decode and is BIOS-only.
+
+## Wave 2.5 addition
+
+- `ee_ram_stub.sv` — small addressable EE-RAM block (default 16 KiB,
+  128-bit data path). First real memory source for DMAC-backed transfers.
+  Read port: `rd_en/rd_addr/rd_data/rd_valid`.
+  Write port: `wr_en/wr_addr/wr_data/wr_be`. Caller-provided `master_id` is
+  tagged into MEM READ / MEM WRITE trace events. Not the final 32 MiB EE-RAM
+  model — see `docs/wave25_memory_backed_dma_plan.md` for scope boundaries.
+
+## BIOS policy note
+
+Per `docs/decisions/0002-bios-policy.md`, no BIOS image is distributed from
+this repository. Synthetic fixture is the default so the project can run
+stubs without any Sony firmware. Real BIOS usage requires a user-supplied
+dump placed at the path passed to `IMAGE_FILE`.
@@ -0,0 +1,123 @@
+// retroDE_ps2 — bios_rom_stub
+//
+// Simulation stub for the 4 MiB BIOS ROM window. Gives Milestone B a
+// deterministic instruction source before the rest of the memory system
+// exists.
+//
+// Contract refs:
+//   docs/stub_module_plan.md        (Wave 1, item 2)
+//   docs/contracts/memory.md        (memory owns BIOS storage/visibility)
+//   docs/contracts/iop.md           (IOP owns BIOS behavior — NOT here)
+//   docs/decisions/0002-bios-policy.md (real BIOS + narrow stubs; this stub
+//                                       is the storage adapter, not firmware
+//                                       behavior, and needs no stub-policy
+//                                       tracking)
+//
+// Backing store:
+//   - If IMAGE_FILE is a non-empty string, `$readmemh` loads it from an
+//     `initial` block (simulation time 0). Caller must supply a hex image
+//     produced from a user-supplied BIOS dump. No BIOS image is shipped with
+//     this repository (see third_party/LICENSING.md).
+//   - If IMAGE_FILE is empty (default), a synthetic fixture is generated:
+//         mem[word_i] = 32'h00000000 (MIPS NOP: sll $0, $0, 0)
+//     Rationale: straight-line valid MIPS so the fixture is a legitimate
+//     execution target for any future emulator comparison. This aligns
+//     with sim/golden/trace_compare_spec.md ("first comparison target").
+//     Earlier versions used 32'hBFC00000 | word_index for trace-distinct
+//     inspection, but the spec explicitly rules out fixtures whose words
+//     are not a sensible execution target.
+//
+// Interface:
+//   - Byte-addressed within the 4 MiB window. The lower 2 bits of rd_addr
+//     are ignored (word-aligned fetch). Upstream address decode is owned
+//     by ee_memory_map_stub; this block does not validate the window itself.
+//   - One-cycle read latency: rd_en pulses on cycle N, rd_data/rd_valid
+//     present on cycle N+1.
+//   - Each completed read emits a MEM.READ trace event.
+//
+// Trace payload schema (per stub plan):
+//   MEM READ  arg0=addr  arg1=data  arg2=master  arg3=region
+//     master: 0=EE_IFETCH (only source wired in Wave 1)
+//     region: 0=BIOS
+
+`timescale 1ns/1ps
+
+module bios_rom_stub
+    import trace_pkg::*;
+#(
+    parameter int    SIZE_BYTES = 4 * 1024 * 1024,
+    parameter string IMAGE_FILE = ""
+) (
+    input  logic                          clk,
+    input  logic                          rst_n,     // synchronous, active-low
+
+    // Fetch port. Byte address within the window; bits [1:0] are dropped.
+    input  logic                          rd_en,
+    input  logic [$clog2(SIZE_BYTES)-1:0] rd_addr,
+    output logic [31:0]                   rd_data,
+    output logic                          rd_valid,
+
+    // Trace port: one MEM READ event per accepted read.
+    output logic                          ev_valid,
+    output subsys_e                       ev_subsys,
+    output event_e                        ev_event,
+    output logic [63:0]                   ev_arg0,
+    output logic [63:0]                   ev_arg1,
+    output logic [63:0]                   ev_arg2,
+    output logic [63:0]                   ev_arg3,
+    output logic [31:0]                   ev_flags
+);
+
+    localparam int ADDR_WIDTH       = $clog2(SIZE_BYTES);
+    localparam int WORD_COUNT       = SIZE_BYTES / 4;
+    localparam int WORD_INDEX_WIDTH = ADDR_WIDTH - 2;
+
+    // Constant trace attribution for this block.
+    localparam logic [63:0] MASTER_EE_IFETCH = 64'd0;
+    localparam logic [63:0] REGION_BIOS      = 64'd0;
+
+    logic [31:0] mem [0:WORD_COUNT-1];
+
+    // Backing store: synthetic NOP sled unless an image path is given.
+    initial begin
+        if (IMAGE_FILE == "") begin
+            for (int w = 0; w < WORD_COUNT; w++) begin
+                mem[w] = 32'h00000000;      // MIPS NOP
+            end
+            $display("[bios_rom_stub] synthetic NOP sled loaded (%0d words)", WORD_COUNT);
+        end else begin
+            $display("[bios_rom_stub] loading image: %0s", IMAGE_FILE);
+            $readmemh(IMAGE_FILE, mem);
+        end
+    end
+
+    // Word selected by the current request.
+    logic [WORD_INDEX_WIDTH-1:0] fetch_idx;
+    assign fetch_idx = rd_addr[ADDR_WIDTH-1:2];
+
+    // ------------------------------------------------------------------
+    // Data path: rd_valid follows rd_en by one clock; rd_data is only
+    // updated by a read and otherwise keeps its previous value.
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            rd_valid <= 1'b0;
+            rd_data  <= 32'd0;
+        end else begin
+            rd_valid <= rd_en;
+            if (rd_en) rd_data <= mem[fetch_idx];
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace path: fires on the same edge as rd_valid and carries the
+    // same word that was loaded into rd_data. Payload fields keep their
+    // last value between events; only ev_valid drops.
+    // ------------------------------------------------------------------
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_READ;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else begin
+            ev_valid <= 1'b0;               // default: no event this cycle
+            if (rd_en) begin
+                ev_valid  <= 1'b1;
+                ev_subsys <= SUBSYS_MEM;
+                ev_event  <= EV_READ;
+                ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, rd_addr};
+                ev_arg1   <= {32'd0, mem[fetch_idx]};
+                ev_arg2   <= MASTER_EE_IFETCH;
+                ev_arg3   <= REGION_BIOS;
+                ev_flags  <= 32'd0;
+            end
+        end
+    end
+
+endmodule : bios_rom_stub
@@ -0,0 +1,165 @@
+// retroDE_ps2 — ee_ram_stub
+//
+// Tiny addressable EE-RAM block for Wave 2.5. Provides the first real memory
+// source for DMAC-backed transfers. Not the final 32 MiB main-RAM model —
+// see docs/wave25_memory_backed_dma_plan.md for explicit scope.
+//
+// Contract refs:
+//   docs/wave25_memory_backed_dma_plan.md   (ee_ram_stub scope)
+//   docs/contracts/memory.md                (memory subsystem ownership)
+//
+// Interface:
+//   - 128-bit wide data path, qword-aligned addressing (low 4 bits ignored).
+//   - One-cycle read latency: rd_en on cycle N → rd_data / rd_valid on N+1.
+//   - Write port has per-byte enables (wr_be[15:0]).
+//   - Optional `$readmemh` preload via IMAGE_FILE parameter.
+//
+// Trace:
+//   Emits MEM READ / MEM WRITE events one cycle after the request, matching
+//   the existing MEM schema. master_id is a caller-provided input (8 bits);
+//   the integration TB tags reads as 1 (DMAC) while TB-initiated writes are
+//   tagged 0. Any downstream master can drive its own id without RAM-side
+//   changes.
+//
+// Trace payload:
+//   MEM READ   arg0=addr arg1=data_lo arg2=master_id arg3=region_id
+//   MEM WRITE  arg0=addr arg1=data_lo arg2=master_id arg3=region_id
+//     master_id : caller-provided (e.g. 0 = TB direct, 1 = DMAC)
+//     region_id : 1 = EE_RAM (constant for this module)
+//   flags bit 0: 1 = write, 0 = read
+
+`timescale 1ns/1ps
+
+module ee_ram_stub
+    import trace_pkg::*;
+#(
+    // Backing-store size in bytes. Address/index widths are derived with
+    // $clog2, so a non-power-of-two value leaves qword indices above
+    // QWORD_COUNT-1 addressable; keep this a power of two.
+    parameter int    SIZE_BYTES = 16 * 1024,    // 16 KiB default
+    // Optional hex image handed to $readmemh; "" selects zero-fill.
+    parameter string IMAGE_FILE = ""
+) (
+    input  logic                          clk,
+    input  logic                          rst_n,     // synchronous, active-low
+
+    // Read port (qword-aligned: rd_addr[3:0] is ignored)
+    input  logic                          rd_en,
+    input  logic [$clog2(SIZE_BYTES)-1:0] rd_addr,
+    output logic [127:0]                  rd_data,   // updated the clock after rd_en
+    output logic                          rd_valid,  // rd_en delayed by one clock
+
+    // Write port (qword-aligned: wr_addr[3:0] is ignored; wr_be[b] gates
+    // byte lane b, i.e. wr_data[8*b +: 8])
+    input  logic                          wr_en,
+    input  logic [$clog2(SIZE_BYTES)-1:0] wr_addr,
+    input  logic [127:0]                  wr_data,
+    input  logic [15:0]                   wr_be,
+
+    // Optional caller-provided master id for trace attribution. Default tie
+    // to 8'd0 (TB direct) if the caller doesn't drive; DMAC drives 8'd1.
+    input  logic [7:0]                    master_id,
+
+    // Trace. ev_valid is a one-clock strobe; the payload fields keep
+    // their previous value on cycles with no request.
+    output logic                          ev_valid,
+    output subsys_e                       ev_subsys,
+    output event_e                        ev_event,
+    output logic [63:0]                   ev_arg0,
+    output logic [63:0]                   ev_arg1,
+    output logic [63:0]                   ev_arg2,
+    output logic [63:0]                   ev_arg3,
+    output logic [31:0]                   ev_flags
+);
+
+    localparam int ADDR_WIDTH       = $clog2(SIZE_BYTES);
+    localparam int QWORD_COUNT      = SIZE_BYTES / 16;
+    localparam int QW_INDEX_WIDTH   = $clog2(QWORD_COUNT);
+    localparam logic [63:0] REGION_EE_RAM = 64'd1;
+
+    logic [127:0] mem [0:QWORD_COUNT-1];
+
+    // Preload: either a user-supplied hex image or all zeroes.
+    initial begin
+        if (IMAGE_FILE != "") begin
+            $display("[ee_ram_stub] loading image: %0s", IMAGE_FILE);
+            $readmemh(IMAGE_FILE, mem);
+        end else begin
+            for (int i = 0; i < QWORD_COUNT; i++) mem[i] = 128'd0;
+            $display("[ee_ram_stub] zero-initialised (%0d qwords / %0d bytes)",
+                     QWORD_COUNT, SIZE_BYTES);
+        end
+    end
+
+    // Qword indices: drop the four byte-offset bits of each address.
+    logic [QW_INDEX_WIDTH-1:0] rd_qw_idx;
+    logic [QW_INDEX_WIDTH-1:0] wr_qw_idx;
+    assign rd_qw_idx = rd_addr[ADDR_WIDTH-1:4];
+    assign wr_qw_idx = wr_addr[ADDR_WIDTH-1:4];
+
+    // ------------------------------------------------------------------
+    // Read + write (one-cycle latency). Reads and writes to the same
+    // address in the same cycle are not expected in Wave 2.5; if they
+    // occur, the read sees pre-write data (standard register-file
+    // semantics). Writes are suppressed while rst_n is low.
+    //
+    // Deliberately a plain `always`, not `always_ff`: `mem` is also
+    // written by the `initial` preload above, and IEEE 1800 does not
+    // allow a variable assigned in an `always_ff` to be written by any
+    // other process. Clocked behaviour is unchanged.
+    // ------------------------------------------------------------------
+
+    always @(posedge clk) begin
+        if (!rst_n) begin
+            rd_data  <= 128'd0;
+            rd_valid <= 1'b0;
+        end else begin
+            rd_valid <= rd_en;
+            if (rd_en) rd_data <= mem[rd_qw_idx];
+
+            if (wr_en) begin
+                for (int b = 0; b < 16; b++) begin
+                    if (wr_be[b]) mem[wr_qw_idx][b*8 +: 8] <= wr_data[b*8 +: 8];
+                end
+            end
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace emission: one event per cycle, read wins over write on the
+    // unlikely same-cycle collision (single-port RAM would not see that
+    // anyway), so a write that coincides with a read is stored but not
+    // traced. Registered, so a read event's ev_valid rises on the same
+    // edge as rd_valid.
+    //
+    // READ  : arg1 = low 64 bits of the qword sampled at the request
+    //         edge (the same qword loaded into rd_data), flags = 0.
+    // WRITE : arg1 = low 64 bits of wr_data as presented (wr_be is not
+    //         applied or reported), flags[0] = 1.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_READ;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (rd_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_READ;
+            ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, rd_addr};
+            ev_arg1   <= mem[rd_qw_idx][63:0];
+            ev_arg2   <= {56'd0, master_id};
+            ev_arg3   <= REGION_EE_RAM;
+            ev_flags  <= 32'd0;
+        end else if (wr_en) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_MEM;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, wr_addr};
+            ev_arg1   <= wr_data[63:0];
+            ev_arg2   <= {56'd0, master_id};
+            ev_arg3   <= REGION_EE_RAM;
+            ev_flags  <= 32'h0000_0001;     // bit 0 = write
+        end else begin
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : ee_ram_stub
@@ -0,0 +1,234 @@
+// ============================================================================
+// I2C_Controller.v — Fixed-frame I2C master (3-byte write transactions)
+// ============================================================================
+//
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026 retroDE contributors
+//
+// Clean-room implementation, not derived from any GPL upstream. Released
+// under the MIT license to allow reuse outside the retroDE project. The
+// retroDE project as a whole is distributed under GPLv3 — see ../LICENSE
+// for the combined-work terms. See ../LICENSES/MIT.txt for the full MIT
+// text.
+//
+// ----------------------------------------------------------------------------
+// Purpose
+//   Simple I2C master that sends a fixed 24-bit frame per transaction:
+//       I2C_DATA = { slave_addr[6:0], rw, reg_addr[7:0], data[7:0] }
+//   Slave ACK is sampled after each of the three bytes. A STOP condition is
+//   generated at the end of the frame.
+//
+// Timing contract (compatible with legacy I2C_HDMI_Config parent)
+//   CLK       Fabric clock; state is registered on its rising edge.
+//   CLK_EN    One-cycle pulse at the desired I2C bit rate. State only advances
+//             when CLK_EN is asserted.
+//   CLK_PHASE Square wave at the SCL rate. During actively clocked bit cells,
+//             I2C_SCLK is driven as ~CLK_PHASE. Dedicated START/STOP hold
+//             phases force SCL high or low for a full cell.
+//   I2C_SDAT  Open-drain: driven low via 1'b0 or released to 1'bz. An
+//             external or FPGA internal pull-up is required on this line.
+//   I2C_SCLK  Actively driven (not open-drain). This matches the known-good
+//             DE25-Nano HDMI path and avoids relying on board-side pull-ups.
+//
+// Interface
+//   Port list is preserved verbatim from the legacy module so this file is
+//   a drop-in replacement. W_R is retained as a no-op input for source-
+//   compatibility; direction is encoded in I2C_DATA[16] by convention.
+//   SD_COUNTER and SDO are exposed for debug/observation only.
+//
+// Implementation note
+//   The transaction is modeled as explicit phases rather than implicit state
+//   updates. This keeps the START, bit, ACK, and STOP cells easy to inspect
+//   while preserving the known-good bus waveform used by retroDE_splash.
+// ============================================================================
+
+`timescale 1ns/1ps
+
+module I2C_Controller (
+    input  wire        CLK,           // all state registers update on the rising edge
+    input  wire        CLK_EN,        // state advances only on CLK edges where this is high
+    input  wire        CLK_PHASE,     // inverted onto I2C_SCLK during bit and ACK cells
+    output wire        I2C_SCLK,
+    inout  wire        I2C_SDAT,
+    input  wire [23:0] I2C_DATA,      // frame; latched when GO is seen in PH_IDLE, sent bit 23 first
+    input  wire        GO,            // start request; also held off PH_DONE -> PH_IDLE while high
+    output reg         END,           // set in PH_DONE, cleared in PH_IDLE (see PH_DONE note below)
+    input  wire        W_R,           // retained for interface compat; unused
+    output wire        ACK,           // OR of the three sampled ACK slots: high = at least one slot read SDA high
+    input  wire        RESET,         // asynchronous, active low (state is reset while RESET == 0)
+    output wire [5:0]  SD_COUNTER,    // debug: current transaction phase
+    output wire        SDO            // debug: current SDA release state
+);
+
+    // One phase per bus cell. The numeric order is the transmit order:
+    // START, byte 0 (frame bits 23..16), ACK0, byte 1 (15..8), ACK1,
+    // byte 2 (7..0), ACK2, STOP, DONE.
+    localparam [5:0] PH_IDLE       = 6'd0,
+                     PH_START_HOLD = 6'd1,
+                     PH_START_LOW  = 6'd2,
+                     PH_B0_7       = 6'd3,
+                     PH_B0_6       = 6'd4,
+                     PH_B0_5       = 6'd5,
+                     PH_B0_4       = 6'd6,
+                     PH_B0_3       = 6'd7,
+                     PH_B0_2       = 6'd8,
+                     PH_B0_1       = 6'd9,
+                     PH_B0_0       = 6'd10,
+                     PH_ACK0       = 6'd11,
+                     PH_B1_7       = 6'd12,
+                     PH_B1_6       = 6'd13,
+                     PH_B1_5       = 6'd14,
+                     PH_B1_4       = 6'd15,
+                     PH_B1_3       = 6'd16,
+                     PH_B1_2       = 6'd17,
+                     PH_B1_1       = 6'd18,
+                     PH_B1_0       = 6'd19,
+                     PH_ACK1       = 6'd20,
+                     PH_B2_7       = 6'd21,
+                     PH_B2_6       = 6'd22,
+                     PH_B2_5       = 6'd23,
+                     PH_B2_4       = 6'd24,
+                     PH_B2_3       = 6'd25,
+                     PH_B2_2       = 6'd26,
+                     PH_B2_1       = 6'd27,
+                     PH_B2_0       = 6'd28,
+                     PH_ACK2       = 6'd29,
+                     PH_STOP_LOW   = 6'd30,
+                     PH_STOP_HIGH  = 6'd31,
+                     PH_DONE       = 6'd32;
+
+    // Registered state.
+    reg  [5:0]  phase;         // current bus cell
+    reg  [23:0] frame_data;    // copy of I2C_DATA taken at transaction start
+    reg  [2:0]  ack_bits;      // SDA level sampled in ACK0/ACK1/ACK2 (bit n = ACKn)
+    reg         sda_release;   // 1 = SDA released (high-Z), 0 = SDA driven low
+
+    // Combinational next-state values, committed on an enabled CLK edge.
+    reg  [5:0]  phase_next;
+    reg  [23:0] frame_data_next;
+    reg  [2:0]  ack_bits_next;
+    reg         sda_release_next;
+    reg         end_next;
+
+    // SCL is a pure function of the current phase: parked high in idle,
+    // START-hold, STOP-high and DONE; forced low in START-low and STOP-low;
+    // otherwise the inverse of CLK_PHASE.
+    assign I2C_SCLK =
+        (phase == PH_IDLE || phase == PH_START_HOLD || phase == PH_STOP_HIGH || phase == PH_DONE) ? 1'b1 :
+        (phase == PH_START_LOW || phase == PH_STOP_LOW) ? 1'b0 :
+                                                          ~CLK_PHASE;
+
+    // SDA is only ever driven low or released; a data '1' is sent by releasing.
+    assign I2C_SDAT   = sda_release ? 1'bz : 1'b0;
+    assign ACK        = |ack_bits;
+    assign SDO        = sda_release;
+    assign SD_COUNTER = phase;
+
+    // Next-state logic. sda_release_next is the SDA value for the *next*
+    // phase, so each bit phase selects the frame bit that follows it.
+    always @(*) begin
+        // Default: hold everything.
+        phase_next       = phase;
+        frame_data_next  = frame_data;
+        ack_bits_next    = ack_bits;
+        sda_release_next = sda_release;
+        end_next         = END;
+
+        case (phase)
+            // Idle: bus released, END cleared. On GO, capture the frame,
+            // clear the ACK record and pull SDA low; SCL stays high in
+            // PH_START_HOLD, so this forms the START condition.
+            PH_IDLE: begin
+                end_next         = 1'b0;
+                sda_release_next = 1'b1;
+                if (GO) begin
+                    phase_next       = PH_START_HOLD;
+                    frame_data_next  = I2C_DATA;
+                    ack_bits_next    = 3'd0;
+                    sda_release_next = 1'b0;
+                end
+            end
+
+            // SDA low with SCL high for one cell.
+            PH_START_HOLD: begin
+                phase_next       = PH_START_LOW;
+                sda_release_next = 1'b0;
+            end
+
+            // SCL forced low; set up the first data bit for PH_B0_7.
+            PH_START_LOW: begin
+                phase_next       = PH_B0_7;
+                sda_release_next = frame_data[23];
+            end
+
+            // Byte 0: frame_data[23:16]. The last bit phase releases SDA
+            // so the slave can drive the ACK slot.
+            PH_B0_7: begin phase_next = PH_B0_6; sda_release_next = frame_data[22]; end
+            PH_B0_6: begin phase_next = PH_B0_5; sda_release_next = frame_data[21]; end
+            PH_B0_5: begin phase_next = PH_B0_4; sda_release_next = frame_data[20]; end
+            PH_B0_4: begin phase_next = PH_B0_3; sda_release_next = frame_data[19]; end
+            PH_B0_3: begin phase_next = PH_B0_2; sda_release_next = frame_data[18]; end
+            PH_B0_2: begin phase_next = PH_B0_1; sda_release_next = frame_data[17]; end
+            PH_B0_1: begin phase_next = PH_B0_0; sda_release_next = frame_data[16]; end
+            PH_B0_0: begin phase_next = PH_ACK0; sda_release_next = 1'b1;           end
+
+            // ACK slot 0: the SDA pin level is captured on the enabled CLK
+            // edge that ends this phase.
+            PH_ACK0: begin
+                phase_next       = PH_B1_7;
+                ack_bits_next[0] = I2C_SDAT;
+                sda_release_next = frame_data[15];
+            end
+
+            // Byte 1: frame_data[15:8].
+            PH_B1_7: begin phase_next = PH_B1_6; sda_release_next = frame_data[14]; end
+            PH_B1_6: begin phase_next = PH_B1_5; sda_release_next = frame_data[13]; end
+            PH_B1_5: begin phase_next = PH_B1_4; sda_release_next = frame_data[12]; end
+            PH_B1_4: begin phase_next = PH_B1_3; sda_release_next = frame_data[11]; end
+            PH_B1_3: begin phase_next = PH_B1_2; sda_release_next = frame_data[10]; end
+            PH_B1_2: begin phase_next = PH_B1_1; sda_release_next = frame_data[9];  end
+            PH_B1_1: begin phase_next = PH_B1_0; sda_release_next = frame_data[8];  end
+            PH_B1_0: begin phase_next = PH_ACK1; sda_release_next = 1'b1;           end
+
+            // ACK slot 1.
+            PH_ACK1: begin
+                phase_next       = PH_B2_7;
+                ack_bits_next[1] = I2C_SDAT;
+                sda_release_next = frame_data[7];
+            end
+
+            // Byte 2: frame_data[7:0].
+            PH_B2_7: begin phase_next = PH_B2_6; sda_release_next = frame_data[6]; end
+            PH_B2_6: begin phase_next = PH_B2_5; sda_release_next = frame_data[5]; end
+            PH_B2_5: begin phase_next = PH_B2_4; sda_release_next = frame_data[4]; end
+            PH_B2_4: begin phase_next = PH_B2_3; sda_release_next = frame_data[3]; end
+            PH_B2_3: begin phase_next = PH_B2_2; sda_release_next = frame_data[2]; end
+            PH_B2_2: begin phase_next = PH_B2_1; sda_release_next = frame_data[1]; end
+            PH_B2_1: begin phase_next = PH_B2_0; sda_release_next = frame_data[0]; end
+            PH_B2_0: begin phase_next = PH_ACK2; sda_release_next = 1'b1;          end
+
+            // ACK slot 2, then pull SDA low ready for the STOP sequence.
+            PH_ACK2: begin
+                phase_next       = PH_STOP_LOW;
+                ack_bits_next[2] = I2C_SDAT;
+                sda_release_next = 1'b0;
+            end
+
+            // STOP part 1: SCL forced low, SDA held low.
+            PH_STOP_LOW: begin
+                phase_next       = PH_STOP_HIGH;
+                sda_release_next = 1'b0;
+            end
+
+            // STOP part 2: SCL high with SDA still low; SDA is released on
+            // entry to PH_DONE while SCL stays high.
+            PH_STOP_HIGH: begin
+                phase_next       = PH_DONE;
+                sda_release_next = 1'b1;
+            end
+
+            // Done: raise END and wait here until GO is low. Note that
+            // end_next is still 1 on the edge that returns to PH_IDLE, so END
+            // remains high for the first idle cycle and is only cleared by
+            // PH_IDLE on the following enabled edge. A GO sampled during that
+            // first idle cycle starts a new frame while END still reads 1.
+            PH_DONE: begin
+                end_next         = 1'b1;
+                sda_release_next = 1'b1;
+                if (!GO)
+                    phase_next = PH_IDLE;
+            end
+
+            // Unused encodings (33..63) recover to idle with the bus released.
+            default: begin
+                phase_next       = PH_IDLE;
+                sda_release_next = 1'b1;
+                end_next         = 1'b0;
+            end
+        endcase
+    end
+
+    // State registers: asynchronous active-low reset, otherwise updated only
+    // on CLK edges where CLK_EN is high.
+    always @(posedge CLK or negedge RESET) begin
+        if (!RESET) begin
+            phase       <= PH_IDLE;
+            frame_data  <= 24'd0;
+            ack_bits    <= 3'd0;
+            sda_release <= 1'b1;
+            END         <= 1'b0;
+        end
+        else if (CLK_EN) begin
+            phase       <= phase_next;
+            frame_data  <= frame_data_next;
+            ack_bits    <= ack_bits_next;
+            sda_release <= sda_release_next;
+            END         <= end_next;
+        end
+    end
+
+endmodule
@@ -0,0 +1,236 @@
+// ============================================================================
+// I2C_HDMI_Config.v — ADV7513 HDMI transmitter configuration via I2C
+// ============================================================================
+//
+// Derived from Terasic DE-series reference design (I2C_HDMI_Config.v).
+// Original copyright belongs to Terasic Technologies Inc.; this file is
+// distributed under the terms of the Terasic Reference Design license that
+// ships with the DE25-Nano System CD (free use on Terasic hardware,
+// copyright notice retained).
+//
+// retroDE modifications (2025-2026):
+//   - LUT_SIZE expanded to 38 entries
+//   - Audio configuration for I2S input @ 48 kHz, MCLK 12.288 MHz
+//   - HPD override (0xD6 = 0xC0) for monitors that misreport hot-plug
+//   - AVI InfoFrame configured for full-range RGB 444 output
+//   - Comments documenting each ADV7513 register write
+//
+// ============================================================================
+
+`timescale 1ns/1ps
+
+module I2C_HDMI_Config (	//	Host Side
+					iCLK,
+					iRST_N,
+					//	I2C Side
+					I2C_SCLK,
+					I2C_SDAT,
+					HDMI_TX_INT,
+					READY,
+					//	Ch166: sticky NACK watchdog
+					ERROR
+					 );
+//	Purpose: walks a fixed register table (LUT) and writes each entry to the
+//	HDMI transmitter at I2C address byte 0x72 through I2C_Controller. An entry
+//	is retried until all three bytes are acknowledged. READY goes high once
+//	the whole table has been written; a low HDMI_TX_INT while READY restarts
+//	the table from entry 0.
+//
+//	Host Side
+input				iCLK;
+input				iRST_N;
+//	I2C Side: SCL is actively driven by the master; SDA is open-drain
+//	(master drives low / releases to 1'bz; slave drives ACK).
+output			I2C_SCLK;
+inout				I2C_SDAT;
+input				HDMI_TX_INT;
+output READY ;
+//	Ch166: ERROR latches HIGH if the same LUT entry NACKs
+//	NACK_LIMIT consecutive times (chip absent, address wrong,
+//	bus shorted). Sticky until iRST_N. Cleared on reset.
+output ERROR;
+
+//	Internal Registers/Wires
+reg	[15:0]	mI2C_CLK_DIV;
+reg	[23:0]	mI2C_DATA;
+reg				mI2C_CTRL_CLK;
+reg				mI2C_GO;
+wire				mI2C_END;
+wire				mI2C_ACK;
+reg	[15:0]	LUT_DATA;
+reg	[5:0]		LUT_INDEX;
+reg	[3:0]		mSetup_ST;
+reg READY ;
+
+//	Clock Setting
+parameter	CLK_Freq	=	50000000;	//	50	MHz
+parameter	I2C_Freq	=	20000;		//	20	KHz
+//	LUT Data Number
+parameter	LUT_SIZE	=	38;
+//	Ch166 - NACK watchdog threshold (consecutive retries on the
+//	same LUT entry before ERROR latches). With the divider below,
+//	one controller clock is 2*(CLK_Freq/I2C_Freq + 1) iCLK cycles
+//	(about 100 us at the defaults), and a 3-byte frame plus the
+//	GO/END handshake takes roughly 35 controller clocks (~3.5 ms).
+//	16 retries is therefore on the order of 60 ms before the bus is
+//	declared dead - generous enough for real-world bus settling but
+//	well short of a stuck-LED user complaint.
+parameter	NACK_LIMIT	=	16;
+
+/////////////////////	I2C Control Clock	////////////////////////
+//	mI2C_CTRL_CLK toggles each time the divider has counted
+//	CLK_Freq/I2C_Freq + 1 iCLK cycles, so its frequency (and the SCL
+//	rate, since SCL = ~CLK_PHASE in bit cells) is about I2C_Freq/2.
+always@(posedge iCLK or negedge iRST_N)
+begin
+	if(!iRST_N)
+	begin
+		mI2C_CTRL_CLK	<=	0;
+		mI2C_CLK_DIV		<=	0;
+	end
+	else
+	begin
+		if( mI2C_CLK_DIV	< (CLK_Freq/I2C_Freq) )
+			mI2C_CLK_DIV	<=	mI2C_CLK_DIV+1;
+		else
+		begin
+			mI2C_CLK_DIV	<=	0;
+			mI2C_CTRL_CLK	<=	~mI2C_CTRL_CLK;
+		end
+	end
+end
+
+////////////////////////////////////////////////////////////////////
+I2C_Controller 	u0	(	.CLK(mI2C_CTRL_CLK),			//	Controller work clock
+						.CLK_EN(1'b1),					//	Advance every controller clock
+						.CLK_PHASE(mI2C_CTRL_CLK),		//	Phase for SCL generation
+						.I2C_SCLK(I2C_SCLK),				//	I2C CLOCK
+ 	 	 	 	 	 	.I2C_SDAT(I2C_SDAT),				//	I2C DATA
+						.I2C_DATA(mI2C_DATA),			//	DATA:[SLAVE_ADDR,SUB_ADDR,DATA]
+						.GO(mI2C_GO),						//	GO transfer
+						.END(mI2C_END),					//	END transfer
+						.W_R(1'b0),						//	Ch165 audit Low — tie retained-compat port off (always WRITE)
+						.ACK(mI2C_ACK),					//	ACK (high = at least one byte was NACKed)
+						.RESET(iRST_N)	);
+////////////////////////////////////////////////////////////////////
+//////////////////////	Config Control	////////////////////////////
+//	mSetup_ST:
+//	  0  launch the current LUT entry (raise GO)
+//	  1  wait for END; ACKed -> 2, NACKed -> 0 (retry same entry)
+//	  2  advance LUT_INDEX
+always@(posedge mI2C_CTRL_CLK or negedge iRST_N)
+begin
+	if(!iRST_N)
+	begin
+	   READY       <= 0;
+		LUT_INDEX	<=	0;
+		mSetup_ST	<=	0;
+		mI2C_GO		<=	0;
+	end
+	else
+	begin
+		if(LUT_INDEX<LUT_SIZE)
+		begin
+			READY<=0;
+			case(mSetup_ST)
+			0:	begin
+					//	Only launch once END from the previous frame has dropped.
+					//	I2C_Controller keeps END (and the old ACK bits) visible for
+					//	one controller clock after it returns to idle. On a NACK
+					//	retry, state 1 would otherwise see that stale END and
+					//	report a second, phantom NACK for the same frame, which
+					//	made the watchdog below trip after NACK_LIMIT/2 failures.
+					if(!mI2C_END)
+					begin
+						mI2C_DATA	<=	{8'h72,LUT_DATA};
+						mI2C_GO		<=	1;
+						mSetup_ST	<=	1;
+					end
+				end
+			1:	begin
+					if(mI2C_END)
+					begin
+						if(!mI2C_ACK)
+						mSetup_ST	<=	2;
+						else
+						mSetup_ST	<=	0;
+						mI2C_GO		<=	0;
+					end
+				end
+			2:	begin
+					LUT_INDEX	<=	LUT_INDEX+1;
+					mSetup_ST	<=	0;
+				end
+			//	Unused encodings: drop GO and restart the current entry rather
+			//	than hanging until reset.
+			default:	begin
+					mI2C_GO		<=	0;
+					mSetup_ST	<=	0;
+				end
+			endcase
+		end
+		else
+		begin
+		  READY<=1;
+		  if(!HDMI_TX_INT)
+		  begin
+		    LUT_INDEX <= 0;
+		  end
+		  else
+		    LUT_INDEX <= LUT_INDEX;
+		end
+	end
+end
+////////////////////////////////////////////////////////////////////
+//////////////////  Ch166 NACK watchdog (sticky)  //////////////////
+//
+// Counts consecutive NACK retries on the *current* LUT entry.
+// In the config FSM above, state 1 sees mI2C_END at the end of
+// each I2C transaction; if mI2C_ACK is HIGH (slave didn't drive
+// the ACK bit LOW), the FSM bounces back to state 0 and retries
+// the same LUT_DATA. State 2 means the byte ACKed and LUT_INDEX
+// is about to advance, so we clear the retry count there. Once
+// the count hits NACK_LIMIT, ERROR latches HIGH (sticky until
+// iRST_N) so the top level can surface a stuck bus on an LED.
+// The count is only cleared by an ACKed entry or by reset.
+reg [7:0]	nack_retries;
+reg			error_latched;
+always @(posedge mI2C_CTRL_CLK or negedge iRST_N)
+begin
+	if (!iRST_N)
+	begin
+		nack_retries  <= 0;
+		error_latched <= 1'b0;
+	end
+	else
+	begin
+		if (mSetup_ST == 1 && mI2C_END && mI2C_ACK)
+		begin
+			nack_retries <= nack_retries + 1;
+			if (nack_retries == NACK_LIMIT - 1)
+				error_latched <= 1'b1;
+		end
+		else if (mSetup_ST == 2)
+		begin
+			nack_retries <= 0;
+		end
+	end
+end
+assign ERROR = error_latched;
+////////////////////////////////////////////////////////////////////
+/////////////////////	Config Data LUT	  //////////////////////////
+//	Purely combinational lookup: {register address, value} for LUT_INDEX.
+//	Blocking assignments are used because this is combinational logic.
+always@(*)
+begin
+	case(LUT_INDEX)
+	//	Video Config Data
+	00	:	LUT_DATA	=	16'h9803;  //Must be set to 0x03 for proper operation
+	01	:	LUT_DATA	=	16'hD6C0;  //HPD override: force HPD always-high (bits[7:6]=11)
+	02	:	LUT_DATA	=	16'h0100;  //Set 'N' value at 6144
+	03	:	LUT_DATA	=	16'h0218;  //Set 'N' value at 6144
+	04	:	LUT_DATA	=	16'h0300;  //Set 'N' value at 6144
+	05	:	LUT_DATA	=	16'h0a01;  //MCLK ratio = 256x fs (12.288 MHz / 48 kHz)
+	06	:	LUT_DATA	=	16'h0b2e;  //MCLK Active
+	07	:	LUT_DATA	=	16'h0cbc;  //Serial Audio standard i2s, R0x0C[1:0] = '00
+	08	:	LUT_DATA	=	16'h1402;  //Audio Word Length 16 bit, stereo (2 channels)
+	09	:	LUT_DATA	=	16'h1520;  //Input 444 (RGB or YCrCb) with Separate Syncs, 48kHz fs
+	10	:	LUT_DATA	=	16'h1630;  //Output format 444, 24-bit input
+	11	:	LUT_DATA	=	16'h1846;  //Disable CSC
+	12	:	LUT_DATA	=	16'h4080;  //General control packet enable
+	13	:	LUT_DATA	=	16'h4110;  //Power down control
+	14	:	LUT_DATA	=	16'h49A8;  //Set dither mode - 12-to-10 bit
+	15	:	LUT_DATA	=	16'h5510;  //AVI InfoFrame byte 1: Y=RGB, A0=active fmt valid
+	16	:	LUT_DATA	=	16'h5608;  //AVI InfoFrame byte 2: active format aspect
+	17	:	LUT_DATA	=	16'h5708;  //AVI InfoFrame byte 3: Q=10 (full range RGB 0-255)
+	18	:	LUT_DATA	=	16'h94C0;  //INT enable 1: HPD + monitor sense only
+	19	:	LUT_DATA	=	16'h9500;  //INT enable 2: all disabled
+	20	:	LUT_DATA	=	16'h96C0;  //Clear HPD + monitor sense status (matches 0x94 enable mask)
+	21	:	LUT_DATA	=	16'h7301;  //Info frame Ch count = 2 (stereo)
+	22	:	LUT_DATA	=	16'h7600;  //Speaker allocation: FL+FR (stereo)
+	23	:	LUT_DATA	=	16'h9803;  //Must be set to 0x03 for proper operation
+	24	:	LUT_DATA	=	16'h9902;  //Must be set to Default Value
+	25	:	LUT_DATA	=	16'h9ae0;  //Must be set to 0b1110000
+	26	:	LUT_DATA	=	16'h9c30;  //PLL filter R1 value
+	27	:	LUT_DATA	=	16'h9d61;  //Set clock divide
+	28	:	LUT_DATA	=	16'ha2a4;  //Must be set to 0xA4 for proper operation
+	29	:	LUT_DATA	=	16'ha3a4;  //Must be set to 0xA4 for proper operation
+	30	:	LUT_DATA	=	16'ha504;  //Must be set to Default Value
+	31	:	LUT_DATA	=	16'hab40;  //Must be set to Default Value
+	32	:	LUT_DATA	=	16'haf16;  //Select HDMI mode
+	33	:	LUT_DATA	=	16'hba60;  //No clock delay
+	34	:	LUT_DATA	=	16'hd1ff;  //Must be set to Default Value
+	35	:	LUT_DATA	=	16'hde10;  //Must be set to Default for proper operation
+	36	:	LUT_DATA	=	16'he460;  //Must be set to Default Value
+	37	:	LUT_DATA	=	16'hfa7d;  //Nbr of times to look for good phase
+	default:		LUT_DATA	=	16'h9803;
+	endcase
+end
+////////////////////////////////////////////////////////////////////
+endmodule
@@ -0,0 +1,29 @@
+# rtl/platform
+
+retroDE-specific platform integration. Matches `docs/contracts/platform.md`.
+
+## Wave 1 contents
+
+- `platform_video_stub.sv` — free-running raster generator. Default VGA
+  640x480 timing (overridable per-testbench to tiny values for fast sim).
+  Takes `bg_{r,g,b}` from `gs_stub` and flood-fills the active region.
+  Emits one `EV_MODE` per completed frame so testbenches can count frames
+  without sampling raw video.
+
+## Scope boundary
+
+This directory owns:
+
+- clock/reset sequencing entry points,
+- retroDE-facing video and audio adaptation,
+- HPS bridge plumbing (future),
+- top-level wrappers not belonging inside PS2 subsystems.
+
+It does **not** own GS/PCRTC semantics (that's `rtl/gif_gs/`), SPU2 audio
+synthesis (`rtl/spu2/`), or any PS2 register behavior.
+
+## Replacement path
+
+`platform_video_stub` stays as the platform adaptation layer. What changes
+is the upstream pixel source: Wave 1 → flat BGCOLOR from `gs_stub`,
+later waves → fuller GS/PCRTC output including framebuffer scan-out.
@@ -0,0 +1 @@
+/home/ubuntu/FPGA_Projects/retroDE_splash/rtl/platform/cp437_8x8.mem
@@ -0,0 +1,163 @@
+// retroDE_ps2 — platform_video_stub
+//
+// Smallest retroDE-facing video adapter needed for Milestone A. Accepts a
+// flat pixel source (bg_{r,g,b}) from gs_stub and generates a free-running
+// VGA-style raster with configurable timing. Wave 1 produces a flood-fill
+// frame at the current BGCOLOR — enough to prove the platform video path
+// end-to-end without waiting for real GS/PCRTC behavior.
+//
+// Contract refs:
+//   docs/stub_module_plan.md    (Wave 1, item 5)
+//   docs/contracts/platform.md
+//
+// Default timing is VGA 640x480 @ 25.175 MHz pixel clock. Testbenches
+// typically override to tiny values (e.g. 16x8 with minimal porches) to
+// keep simulation turnaround short.
+//
+// Replacement path: this module remains as the platform adaptation layer
+// while the upstream pixel source evolves from gs_stub to fuller GS/PCRTC
+// output.
+//
+// Trace payload schema:
+//   PLAT MODE   arg0=frame_number arg1=pixels_per_frame arg2=- arg3=-
+//     emitted once per frame, registered from end_of_frame (the last pixel
+//     clock of the last line — not a vsync edge), so testbenches can count
+//     frames without sampling raw video signals.
+
+`timescale 1ns/1ps
+
+module platform_video_stub
+    import trace_pkg::*;
+#(
+    // Horizontal timing (in pixel clocks)
+    parameter int H_ACTIVE = 640,
+    parameter int H_FRONT  = 16,
+    parameter int H_SYNC   = 96,
+    parameter int H_BACK   = 48,
+    // Vertical timing (in line counts)
+    parameter int V_ACTIVE = 480,
+    parameter int V_FRONT  = 10,
+    parameter int V_SYNC   = 2,
+    parameter int V_BACK   = 33,
+    // Sync polarity. VGA 640x480 is active-low on both.
+    parameter bit HSYNC_ACTIVE_LOW = 1'b1,
+    parameter bit VSYNC_ACTIVE_LOW = 1'b1
+) (
+    input  logic          clk,        // pixel clock
+    input  logic          rst_n,      // synchronous, active low
+
+    // Pixel source from gs_stub
+    input  logic [7:0]    bg_r,
+    input  logic [7:0]    bg_g,
+    input  logic [7:0]    bg_b,
+
+    // Platform-facing video (all combinational from the raster counters)
+    output logic          hsync,
+    output logic          vsync,
+    output logic          de,
+    output logic [7:0]    r,
+    output logic [7:0]    g,
+    output logic [7:0]    b,
+
+    // Trace (registered; ev_valid is a one-clock pulse per frame)
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam int H_TOTAL = H_ACTIVE + H_FRONT + H_SYNC + H_BACK;
+    localparam int V_TOTAL = V_ACTIVE + V_FRONT + V_SYNC + V_BACK;
+
+    localparam int H_SYNC_START = H_ACTIVE + H_FRONT;
+    localparam int H_SYNC_END   = H_SYNC_START + H_SYNC;
+    localparam int V_SYNC_START = V_ACTIVE + V_FRONT;
+    localparam int V_SYNC_END   = V_SYNC_START + V_SYNC;
+
+    // The counters are sized to represent the value TOTAL itself, not just
+    // TOTAL-1. The comparison thresholds below (H_ACTIVE, H_SYNC_END, ...)
+    // can equal TOTAL when the back porch (or all blanking) is zero; with a
+    // $clog2(TOTAL)-bit width and a power-of-two TOTAL the cast would wrap
+    // that threshold to 0 and the sync pulse / data-enable would never
+    // assert. For the default VGA timing the widths are unchanged (10 bits).
+    localparam int HCNT_W = $clog2(H_TOTAL + 1);
+    localparam int VCNT_W = $clog2(V_TOTAL + 1);
+
+    logic [HCNT_W-1:0] hcnt;    // pixel position within the line, 0 .. H_TOTAL-1
+    logic [VCNT_W-1:0] vcnt;    // line position within the frame, 0 .. V_TOTAL-1
+
+    // ------------------------------------------------------------------
+    // Raster counters
+    // ------------------------------------------------------------------
+
+    logic end_of_line;
+    logic end_of_frame;
+
+    assign end_of_line  = (hcnt == HCNT_W'(H_TOTAL - 1));
+    assign end_of_frame = end_of_line && (vcnt == VCNT_W'(V_TOTAL - 1));
+
+    // hcnt wraps at the end of every line; vcnt advances on that wrap and
+    // itself wraps at the end of the frame.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            hcnt <= '0;
+            vcnt <= '0;
+        end else if (end_of_line) begin
+            hcnt <= '0;
+            vcnt <= end_of_frame ? '0 : (vcnt + VCNT_W'(1));
+        end else begin
+            hcnt <= hcnt + HCNT_W'(1);
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Sync + data-enable + pixel colour
+    // ------------------------------------------------------------------
+
+    logic active_h;
+    logic active_v;
+    logic in_hsync;
+    logic in_vsync;
+
+    // Each line/frame is laid out as: active, front porch, sync, back porch.
+    assign active_h = (hcnt < HCNT_W'(H_ACTIVE));
+    assign active_v = (vcnt < VCNT_W'(V_ACTIVE));
+    assign in_hsync = (hcnt >= HCNT_W'(H_SYNC_START)) && (hcnt < HCNT_W'(H_SYNC_END));
+    assign in_vsync = (vcnt >= VCNT_W'(V_SYNC_START)) && (vcnt < VCNT_W'(V_SYNC_END));
+
+    assign hsync = HSYNC_ACTIVE_LOW ? ~in_hsync : in_hsync;
+    assign vsync = VSYNC_ACTIVE_LOW ? ~in_vsync : in_vsync;
+    assign de    = active_h && active_v;
+    // Flood fill: the background colour inside the active region, black in blanking.
+    assign r     = de ? bg_r : 8'd0;
+    assign g     = de ? bg_g : 8'd0;
+    assign b     = de ? bg_b : 8'd0;
+
+    // ------------------------------------------------------------------
+    // Trace: one EV_MODE pulse per completed frame.
+    //   arg0 = index of the frame that just completed (first frame is 0)
+    //   arg1 = H_ACTIVE * V_ACTIVE
+    // The event is registered from end_of_frame, so ev_valid is high on the
+    // first clock of the following frame.
+    // ------------------------------------------------------------------
+
+    logic [31:0] frame_count;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            frame_count <= 32'd0;
+
+            ev_valid    <= 1'b0;
+            ev_subsys   <= SUBSYS_PLAT;
+            ev_event    <= EV_MODE;
+            ev_arg0     <= 64'd0;
+            ev_arg1     <= 64'd0;
+            ev_arg2     <= 64'd0;
+            ev_arg3     <= 64'd0;
+            ev_flags    <= 32'd0;
+        end else if (end_of_frame) begin
+            frame_count <= frame_count + 32'd1;
+
+            ev_valid    <= 1'b1;
+            ev_subsys   <= SUBSYS_PLAT;
+            ev_event    <= EV_MODE;
+            ev_arg0     <= {32'd0, frame_count};
+            ev_arg1     <= {{(64-32){1'b0}}, 32'(H_ACTIVE * V_ACTIVE)};
+            ev_arg2     <= 64'd0;
+            ev_arg3     <= 64'd0;
+            ev_flags    <= 32'd0;
+        end else begin
+            // Payload registers hold their last value; only the valid flag drops.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : platform_video_stub
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Copyright (c) 2025-2026 retroDE contributors
+// ============================================================================
+// ps2_hps_bridge_null — minimal AXI4 slave for the PS2 core's Ch170 shell
+// ============================================================================
+//
+// Purpose: present an AXI4 slave endpoint to the HPS hps2fpga bridge that
+// (a) does proper AXI handshake so HPS transactions can't stall the bus,
+// and (b) exposes a minimal "core identity" register window at 0x000-0x00F
+// so retrodesd / probing utilities can read back who loaded.
+//
+// This is the Ch170 placeholder — when a real ps2_hps_bridge.sv lands (with
+// HPS-driven core_reset, status mirrors, ROM staging, etc.), it should keep
+// the same AXI4 port signature so the top-wrapper instantiation doesn't
+// need to change.
+//
+// AXI4 subset (matches splash_hps_bridge.sv):
+//   - 128-bit data bus with byte-lane selection via {awaddr[3:2] / araddr[3:2]}
+//   - Single-beat only (awlen=0, arlen=0)
+//   - 4-bit ID echo
+//   - 38-bit address
+//
+// Identity register map (ABI v1.0 — read-only):
+//   0x000  CORE_ID        = 32'h70533200  ("pS2\0" — placeholder, refine later)
+//   0x004  ABI_VERSION    = 32'h00000100  (v1.0)
+//   0x008  CORE_STATUS    = 32'h00000001  (bit 0 = loaded)
+//   0x00C  CORE_CAPS      = 32'h00000000  (no caps advertised)
+//
+// Everything else: reads return 0, writes ACK'd and discarded.
+// ============================================================================
+
+`timescale 1ns/1ps
+
+module ps2_hps_bridge_null (
+    input  logic         clk,        // qsys clk_100_clk domain
+    input  logic         reset_n,
+    input  logic         h2f_reset,  // HPS-driven fabric reset (active high) — unused; reserved
+
+    // AXI4 slave — write address channel
+    input  logic [3:0]   s_axi_awid,
+    input  logic [37:0]  s_axi_awaddr,
+    input  logic [7:0]   s_axi_awlen,
+    input  logic [2:0]   s_axi_awsize,
+    input  logic [1:0]   s_axi_awburst,
+    input  logic         s_axi_awlock,
+    input  logic [3:0]   s_axi_awcache,
+    input  logic [2:0]   s_axi_awprot,
+    input  logic         s_axi_awvalid,
+    output logic         s_axi_awready,
+
+    // AXI4 slave — write data channel
+    input  logic [127:0] s_axi_wdata,
+    input  logic [15:0]  s_axi_wstrb,
+    input  logic         s_axi_wlast,
+    input  logic         s_axi_wvalid,
+    output logic         s_axi_wready,
+
+    // AXI4 slave — write response channel
+    output logic [3:0]   s_axi_bid,
+    output logic [1:0]   s_axi_bresp,
+    output logic         s_axi_bvalid,
+    input  logic         s_axi_bready,
+
+    // AXI4 slave — read address channel
+    input  logic [3:0]   s_axi_arid,
+    input  logic [37:0]  s_axi_araddr,
+    input  logic [7:0]   s_axi_arlen,
+    input  logic [2:0]   s_axi_arsize,
+    input  logic [1:0]   s_axi_arburst,
+    input  logic         s_axi_arlock,
+    input  logic [3:0]   s_axi_arcache,
+    input  logic [2:0]   s_axi_arprot,
+    input  logic         s_axi_arvalid,
+    output logic         s_axi_arready,
+
+    // AXI4 slave — read data channel
+    output logic [3:0]   s_axi_rid,
+    output logic [127:0] s_axi_rdata,
+    output logic [1:0]   s_axi_rresp,
+    output logic         s_axi_rlast,
+    output logic         s_axi_rvalid,
+    input  logic         s_axi_rready
+);
+
+    // ----------------------------------------------------------------
+    // Read-only identity words (Ch170 ABI v1.0), one per 32-bit slot
+    // in the first 16 bytes of the window.
+    // ----------------------------------------------------------------
+    localparam logic [31:0] CORE_ID     = 32'h70533200;
+    localparam logic [31:0] ABI_VERSION = 32'h00000100;
+    localparam logic [31:0] CORE_STATUS = 32'h00000001;
+    localparam logic [31:0] CORE_CAPS   = 32'h00000000;
+
+    // Returns the identity word addressed by `addr`: addr[3:2] selects the
+    // slot when addr[37:4] is zero; every other address reads as zero.
+    function automatic logic [31:0] identity_lookup(input logic [37:0] addr);
+        logic [31:0] word;
+        if (addr[37:4] != '0) begin
+            word = 32'd0;
+        end else begin
+            case (addr[3:2])
+                2'b00:   word = CORE_ID;
+                2'b01:   word = ABI_VERSION;
+                2'b10:   word = CORE_STATUS;
+                default: word = CORE_CAPS;
+            endcase
+        end
+        return word;
+    endfunction
+
+    // ----------------------------------------------------------------
+    // Write channel. One transaction at a time, strictly in the order
+    //   W_IDLE (take AW) -> W_DATA (take one W beat) -> W_RESP (hold B).
+    // The write payload is never stored; the response is always OKAY
+    // and carries the ID captured from the address beat.
+    // ----------------------------------------------------------------
+    typedef enum logic [1:0] { W_IDLE, W_DATA, W_RESP } w_state_t;
+    w_state_t w_state;
+    logic [3:0] aw_id_q;
+
+    // Ready is a pure decode of the state, so inside a given state the
+    // matching ready is known to be high and only valid needs testing.
+    assign s_axi_awready = (w_state == W_IDLE);
+    assign s_axi_wready  = (w_state == W_DATA);
+    assign s_axi_bresp   = 2'b00;  // OKAY
+    assign s_axi_bid     = aw_id_q;
+
+    always_ff @(posedge clk or negedge reset_n) begin
+        if (!reset_n) begin
+            s_axi_bvalid <= 1'b0;
+            aw_id_q      <= '0;
+            w_state      <= W_IDLE;
+        end else if (w_state == W_IDLE) begin
+            s_axi_bvalid <= 1'b0;
+            if (s_axi_awvalid) begin          // AW handshake this cycle
+                aw_id_q <= s_axi_awid;
+                w_state <= W_DATA;
+            end
+        end else if (w_state == W_DATA) begin
+            if (s_axi_wvalid) begin           // W handshake this cycle
+                s_axi_bvalid <= 1'b1;
+                w_state      <= W_RESP;
+            end
+        end else if (w_state == W_RESP) begin
+            if (s_axi_bready) begin           // B handshake this cycle
+                s_axi_bvalid <= 1'b0;
+                w_state      <= W_IDLE;
+            end
+        end else begin
+            // Unused encoding: recover to idle.
+            w_state <= W_IDLE;
+        end
+    end
+
+    // ----------------------------------------------------------------
+    // Read channel. R_IDLE takes one AR beat and registers the reply;
+    // R_RESP holds rvalid until the master accepts it.
+    // ----------------------------------------------------------------
+    typedef enum logic [0:0] { R_IDLE, R_RESP } r_state_t;
+    r_state_t r_state;
+    logic [3:0]   ar_id_q;
+    logic [37:0]  ar_addr_q;
+    logic [127:0] rdata_q;
+
+    // Reply for the address currently on the AR channel: the 32-bit
+    // identity word is placed in the 32-bit lane chosen by araddr[3:2]
+    // (lane n occupies bits [32n+31:32n]); the other lanes are zero.
+    logic [31:0]  ar_id_word;
+    logic [127:0] ar_lane_word;
+
+    assign ar_id_word   = identity_lookup(s_axi_araddr);
+    assign ar_lane_word = {96'd0, ar_id_word} << {s_axi_araddr[3:2], 5'd0};
+
+    assign s_axi_arready = (r_state == R_IDLE);
+    assign s_axi_rresp   = 2'b00;  // OKAY
+    assign s_axi_rlast   = 1'b1;   // single-beat
+    assign s_axi_rid     = ar_id_q;
+    assign s_axi_rdata   = rdata_q;
+
+    always_ff @(posedge clk or negedge reset_n) begin
+        if (!reset_n) begin
+            s_axi_rvalid <= 1'b0;
+            rdata_q      <= '0;
+            ar_addr_q    <= '0;
+            ar_id_q      <= '0;
+            r_state      <= R_IDLE;
+        end else if (r_state == R_IDLE) begin
+            s_axi_rvalid <= 1'b0;
+            if (s_axi_arvalid) begin          // AR handshake this cycle
+                ar_id_q      <= s_axi_arid;
+                ar_addr_q    <= s_axi_araddr;
+                rdata_q      <= ar_lane_word;
+                s_axi_rvalid <= 1'b1;
+                r_state      <= R_RESP;
+            end
+        end else if (r_state == R_RESP) begin
+            if (s_axi_rready) begin           // R handshake this cycle
+                s_axi_rvalid <= 1'b0;
+                r_state      <= R_IDLE;
+            end
+        end else begin
+            r_state <= R_IDLE;
+        end
+    end
+
+    // ----------------------------------------------------------------
+    // Inputs this null bridge never consumes, gathered into one sink
+    // net so lint tools see them as referenced.
+    // ----------------------------------------------------------------
+    // verilator lint_off UNUSED
+    wire _unused_ok = &{ 1'b0,
+        s_axi_awlen, s_axi_awsize, s_axi_awburst,
+        s_axi_awlock, s_axi_awcache, s_axi_awprot,
+        s_axi_wdata, s_axi_wstrb, s_axi_wlast,
+        s_axi_arlen, s_axi_arsize, s_axi_arburst,
+        s_axi_arlock, s_axi_arcache, s_axi_arprot,
+        h2f_reset,
+        1'b0 };
+    // verilator lint_on UNUSED
+
+endmodule : ps2_hps_bridge_null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+// Copyright (c) 2025-2026 retroDE contributors
+// ============================================================================
+// tile_ram_cdc — Ch229 bridge-clock → design-clock tile-RAM shadow
+// ============================================================================
+// Implements the design-domain side of the Ch229 tile-RAM CDC. Owns a
+// 1024 × 32-bit shadow memory in the design clock domain. Bridge-side
+// writes arrive as a toggle-based "event" signal plus latched index +
+// data; a 2-FF synchronizer + XOR edge detector turns each toggle edge
+// into a 1-cycle write pulse against the shadow RAM. The read port is
+// purely combinational. Its consumer is the Ch245 platform-OSD
+// char-BRAM read adapter in the top, which selects high/low 16-bit
+// cells from each 32-bit shadow word and feeds them to the platform
+// `osd_overlay`; pre-Ch245 the consumer was the now-retired PS2-local
+// `osd_overlay_stub`. There is no back-pressure — the bridge is assumed
+// to space tile writes far enough apart for the sync chain to keep up.
+//
+// **CDC contract (read carefully before refactoring):**
+//   - The bridge updates `bclk_wr_toggle`, `bclk_wr_index`, `bclk_wr_data`
+//     at the same `bclk` edge (one bridge clock cycle).
+//   - The receiver sees the toggle through a 2-FF synchronizer; the
+//     edge-detection wire `wr_pulse` fires on the dclk cycle where the
+//     synchronized toggle has FULLY settled. That guarantees ≥ 2 dclk
+//     periods of stability on `bclk_wr_index/data` before they're
+//     sampled into the shadow memory.
+//   - Multiple bridge writes faster than ~3 dclk periods apart will
+//     race and may drop or merge events. For the Ch229 use case
+//     (retrodesd OSD updates at ≤ 1 kHz, design_clk at 25–50 MHz),
+//     this is many orders of magnitude of slack. **Do not** wire a
+//     fast-cycling source (e.g. a counter) into the bridge's tile
+//     write path without first replacing this CDC with an async FIFO.
+//
+// **Reset behavior:**
+//   - While `breset_n` is low (bridge in reset): bridge clears `bclk_wr_toggle` to 0
+//     (matching the receiver's post-reset state). When both domains
+//     reset together (the normal case on FPGA configure), no spurious
+//     edge fires after release.
+//   - While `dreset_n` is low (design in reset): synchronizer chain clears to 0;
+//     shadow memory contents are NOT cleared (matches Ch227 retention
+//     semantics — sim `initial` block zeroes for determinism, hardware
+//     power-up is undefined). The Ch229 contract is "tile RAM survives
+//     warm reset"; rebooting both sides is a power-cycle scenario and
+//     the bridge will re-broadcast any written tiles via the next set
+//     of AXI writes from HPS.
+// ============================================================================
+
+`timescale 1ns/1ps
+
+module tile_ram_cdc (
+    // ---- Bridge clock domain (write port) ----
+    // `bclk` and `breset_n` drive no logic inside this module; they are
+    // consumed only by the lint tie-off at the bottom. The toggle, index
+    // and data inputs are all sampled directly on `dclk`.
+    input  logic        bclk,
+    input  logic        breset_n,
+    input  logic        bclk_wr_toggle,
+    input  logic [9:0]  bclk_wr_index,
+    input  logic [31:0] bclk_wr_data,
+
+    // ---- Design clock domain (read port) ----
+    // `dclk_rd_data` follows `dclk_rd_index` combinationally (no read
+    // register in this module).
+    input  logic        dclk,
+    input  logic        dreset_n,
+    input  logic [9:0]  dclk_rd_index,
+    output logic [31:0] dclk_rd_data,
+
+    // ---- Ch230 design-domain diagnostic counter ----
+    // Saturating count of "tile writes too close" events — successive
+    // wr_pulse events that fail the MIN_DCLK_GAP check (see the watchdog
+    // section below for the exact comparison).
+    // Exposed as an output so the top can route it to a reverse-CDC +
+    // bridge-readable diagnostic register in a future chapter (Ch231+).
+    // For Ch230 the top leaves it unconnected; the counter still exists
+    // in the design domain as a synthesis artifact ready for hookup.
+    output logic [15:0] tile_wr_too_close_count
+);
+
+    // Shadow RAM lives in the design clock domain. Matched-size with
+    // the bridge-side `ps2_hps_bridge.tile_mem` (1024 × 32-bit). The
+    // `ramstyle = "M20K"` attribute (added in the Ch232 hardware
+    // bring-up hotfix) asks Quartus to use a single M20K block
+    // instead of distributing the storage across LABs.
+    // NOTE(review): the read port below is asynchronous; M20K blocks
+    // normally require a registered read address, so confirm in the fit
+    // report that the attribute is actually honoured for this template.
+    // The `initial` block zeroes the array for simulation determinism.
+    (* ramstyle = "M20K" *) logic [31:0] shadow_mem [0:1023];
+    initial begin
+        for (int i = 0; i < 1024; i++)
+            shadow_mem[i] = 32'd0;
+    end
+
+    // Toggle synchronizer + edge detector. toggle_sync[0] and [1] are
+    // the two synchronizer flops that resample the bridge toggle into
+    // dclk; toggle_sync[2] is one further dclk delay of [1] and exists
+    // only so the edge detector can compare two already-resampled bits
+    // ([2] ^ [1]). wr_pulse is therefore high for exactly one dclk
+    // cycle per toggle flip, and bclk_wr_index/data get a full dclk
+    // cycle of extra stability before they are sampled.
+    // The chain is cleared asynchronously while dreset_n is low.
+    logic [2:0] toggle_sync;
+    always_ff @(posedge dclk or negedge dreset_n) begin
+        if (!dreset_n)
+            toggle_sync <= 3'b000;
+        else
+            toggle_sync <= {toggle_sync[1:0], bclk_wr_toggle};
+    end
+    wire wr_pulse = toggle_sync[2] ^ toggle_sync[1];
+
+    // Shadow write port. At the dclk edge where wr_pulse fires,
+    // sample bclk_wr_index + bclk_wr_data. Both have been stable for
+    // ≥ 2 dclk cycles by construction of the CDC contract above.
+    // There is deliberately no reset term here: the memory contents
+    // are left untouched by dreset_n.
+    always_ff @(posedge dclk) begin
+        if (wr_pulse)
+            shadow_mem[bclk_wr_index] <= bclk_wr_data;
+    end
+
+    // Read port: combinational lookup. The consumer pulls index from
+    // its pixel position and uses the data to decide overlay vs
+    // transparent for each pixel.
+    assign dclk_rd_data = shadow_mem[dclk_rd_index];
+
+    // ---- Ch229 / Ch230 tile-write rate watchdog ----
+    // The CDC contract requires writes to be spaced far enough apart
+    // that each toggle edge passes through the sync chain cleanly.
+    // Two consecutive bridge writes that both flip toggle within one
+    // dclk of each other can be merged into a single transition at
+    // sync[0] — the first write's bclk_wr_index/bclk_wr_data are
+    // overwritten before the receiver samples them, and the write is
+    // silently lost.
+    //
+    // The actual minimum gap is ≥ 3 dclk between successive
+    // wr_pulse events at the receiver:
+    //   - 1 dclk for the synchronizer to fully settle (so the
+    //     second edge is visible as a distinct transition)
+    //   - 1 dclk for the receiver to fire wr_pulse for write 1
+    //   - 1 dclk of margin for jitter / setup time
+    //
+    // Production rate enforcer is software-side (retrodesd OSD
+    // updates at ≤ 1 kHz, i.e. ≥ 1 ms apart, which is far longer than
+    // 3 dclk @ 25 MHz = 120 ns); the bridge does not back-pressure AXI
+    // on this constraint. Ch229 added a sim-only `$display` warning;
+    // Ch230 promotes the gap-tracker to a real **saturating counter**
+    // (16-bit) exposed as `tile_wr_too_close_count` so a future chapter
+    // can route it through a reverse CDC into a bridge-readable
+    // register (HDMI_DIAG upper bits or a new diagnostic offset). The
+    // `$display` aid remains under `ifndef SYNTHESIS` for pre-silicon
+    // log visibility.
+    //
+    // Counter semantics: `dclk_since_last_pulse` reads 0 on the cycle
+    // right after a pulse and increments on each following cycle, so it
+    // holds (spacing - 1) when the next pulse arrives. The
+    // `< MIN_DCLK_GAP` compare therefore flags pulses spaced
+    // MIN_DCLK_GAP dclk cycles or fewer apart. Reset parks the counter
+    // at all-ones, which is also its saturation value, so the first
+    // pulse after reset is never flagged.
+    localparam int unsigned MIN_DCLK_GAP = 3;
+    logic [31:0]  dclk_since_last_pulse;
+    wire          too_close = wr_pulse && (dclk_since_last_pulse < MIN_DCLK_GAP);
+
+    always_ff @(posedge dclk or negedge dreset_n) begin
+        if (!dreset_n) begin
+            dclk_since_last_pulse   <= 32'hFFFF_FFFF;
+            tile_wr_too_close_count <= 16'd0;
+        end else begin
+            // Gap tracker: restart on every pulse, otherwise count up
+            // and hold at all-ones.
+            if (wr_pulse)
+                dclk_since_last_pulse <= 32'd0;
+            else if (dclk_since_last_pulse != 32'hFFFF_FFFF)
+                dclk_since_last_pulse <= dclk_since_last_pulse + 32'd1;
+
+            // Violation counter: saturates at 16'hFFFF instead of
+            // wrapping.
+            if (too_close && (tile_wr_too_close_count != 16'hFFFF))
+                tile_wr_too_close_count <= tile_wr_too_close_count + 16'd1;
+        end
+    end
+
+    // Simulation-only log line for each flagged pulse; the printed
+    // cycle count is the raw gap-tracker value at the time of the pulse.
+`ifndef SYNTHESIS
+    always_ff @(posedge dclk) begin
+        if (dreset_n && too_close) begin
+            $display(
+                "[tile_ram_cdc] WARN time=%0t: tile writes too close - %0d dclk cycles between toggle edges (CDC needs >= %0d for safe sample).",
+                $time, dclk_since_last_pulse, MIN_DCLK_GAP);
+        end
+    end
+`endif
+
+    // ---- Lint: bclk + breset_n are intentionally referenced ONLY
+    //      via the bclk_wr_toggle path. Tie a placeholder reference
+    //      to silence "unused" warnings on tools that don't trace
+    //      through the upstream toggle source. The leading 1'b0 makes
+    //      the reduction-AND a constant 0.
+    // verilator lint_off UNUSED
+    wire _unused_ok = &{1'b0, bclk, breset_n, 1'b0};
+    // verilator lint_on UNUSED
+
+endmodule : tile_ram_cdc
@@ -0,0 +1,91 @@
+# rtl/sif
+
+EE↔IOP subsystem interface. Matches `docs/contracts/sif.md`.
+
+## Current contents
+
+- `sif_mailbox_stub.sv` — minimal four-register mailbox/flag shell
+  (MSCOM / SMCOM / MSFLG / SMFLG). Independent EE-side and IOP-side register
+  ports. Directional set/clear semantics deferred; this phase only proves
+  that both sides observe consistent storage and that side-of-origin is
+  trace-visible. Per-register write arbitration: EE wins on same-register
+  collision, independent writes to different registers coexist.
+- `sif_dma_stub.sv` — receive-side DMA endpoint. Accepts qwords from a
+  DMAC channel's `ep_*` port into a small internal buffer (default DEPTH=8).
+  Capacity-safe: `in_ready` drops when `rx_count >= DEPTH`, `full_o`
+  exposed for testbench observation. TB-controlled `stall_in` input for
+  explicit stall testing. Read port for payload verification. No consume
+  path yet — once full, stays full. NOT an IOP — purely a bounded receive
+  buffer with trace emission per accepted beat.
+- `sif_mailbox_peer_stub.sv` — tiny active peer used in integration tests
+  to play "the IOP side" of a specific mailbox protocol. Re-armable
+  command-echo state machine (poll MSFLG → read MSCOM → write SMCOM →
+  write SMFLG → wait for TB to clear MSFLG → repeat). Refuses to re-fire
+  while the doorbell bit stays high, so lifecycle is explicit. Exposes
+  `ack_count_o` for testbench synchronisation.
+  Explicitly NOT an IOP core: no code execution, no BIOS bring-up, no
+  implicit flag clearing (re-arm is the TB's responsibility). Kept under
+  `rtl/sif/` precisely so it does not get misread as IOP maturity progress.
+- `sif_dma_iop_ram_bridge_stub.sv` — width-adapting bridge from a 128-bit
+  SIF DMA endpoint to 32-bit IOP-side writes. Splits each incoming qword
+  into four 32-bit writes at consecutive physical addresses from
+  `DEST_BASE_ADDR`. Little-endian unpacking. Drives the IOP memory map's
+  bridge-write port (`bridge_wr_*`). In-ready drops while the bridge is
+  flushing a qword — natural backpressure to the DMAC.
+- `sif_dma_ack_peer_stub.sv` — protocol combiner for the first combined
+  control+data SIF milestone. Observes a mailbox doorbell (MSFLG pending
+  bit) AND `sif_dma_stub.last_seen` (payload completion); only emits the
+  ack sequence (SMCOM=cmd + SMFLG=ACK) once both are true. Composes two
+  existing SIF primitives; does not fatten the plain mailbox peer with
+  DMA awareness.
+  Explicitly NOT an IOP.
+- `sif_dma_ee_ram_bridge_stub.sv` — width-adapting bridge from a 32-bit
+  SIF DMA endpoint (IOP→EE egress) to 128-bit EE-side writes. Mirror of
+  `sif_dma_iop_ram_bridge_stub` in the other direction: accumulates four
+  consecutive 32-bit beats into a qword (little-endian), then issues
+  one write through the EE memory map's bridge write port. Drops
+  `in_ready` during the one-cycle emit for natural back-pressure.
+  Handles partial-quad on `in_last` via byte-enable masking. Exposes
+  `last_seen_o` — a level-held latch that rises when the final beat of
+  a transfer is accepted, so EE-side protocol combiners can gate on
+  "payload fully landed."
+- `sif_dma_ee_ack_peer_stub.sv` — protocol combiner for the first
+  IOP-driven combined control+data SIF milestone. Polarity mirror of
+  `sif_dma_ack_peer_stub`: observes the mailbox's EE side for an IOP
+  doorbell (SMFLG pending bit), gates on
+  `sif_dma_ee_ram_bridge_stub.last_seen_o`, and only then reads SMCOM
+  and echoes MSCOM + MSFLG=ACK back IOP-ward. One-shot. Explicitly NOT
+  an EE core — purely a composition of two existing SIF primitives.
+
+## Current status
+
+The SIF seam is feature-complete for staged bring-up in both directions.
+Storage, active peer, lifecycle/re-arm, negative-path, EE→IOP DMA, three
+classes of backpressure (start / mid-transfer / full-stop), EE-driven
+combined control+data gating, a reverse-direction (IOP→EE) data path
+with its own stall semantics, AND the matching IOP-driven combined
+control+data handshake are all proven end-to-end. Further SIF-only work
+would be symmetry-chasing rather than unlocking new architectural
+questions.
+
+## Deferred follow-ons (not gaps)
+
+These are known extension points, intentionally not pursued yet:
+
+- **Re-armable combined control+data handshakes.** Both directions are
+  currently one-shot; re-arm mostly composes pieces already proven
+  separately. Nice-to-have.
+- **Directional write-ownership + flag set/clear semantics.** Currently
+  both sides of the mailbox can write any register with plain replace
+  semantics; real PS2 has directional set/W1C rules.
+- **Real EE↔IOP coordination.** Arrives once an IOP-side execution
+  primitive exists that can observe SIF as "IOP behaviour," not as a
+  peer stub.
+
+## Scope boundary
+
+This directory owns the SIF register shell and DMA-visible coordination.
+It does **not** own:
+- IOP CPU execution (`rtl/iop/`, not yet created)
+- EE-side addressing / kseg stripping for SIF registers (memory-map work)
+- Interrupt routing to INTC on SIF transitions (Wave 3)
@@ -0,0 +1,217 @@
+// retroDE_ps2 — boot_install_agent_stub (Ch55 / Ch56)
+//
+// Minimal external producer that streams a coordinated low-RAM handler
+// image into EE RAM through the SIF EE-RAM bridge. Emits 32-bit beats
+// on a ready/valid handshake compatible with sif_dma_ee_ram_bridge_stub.
+//
+// NOT an IOP, NOT a full boot firmware. This is the thinnest possible
+// stand-in for "whatever on real PS2 populates EE useg [0x80..0x1FF]
+// with exception-entry + safe-return stubs before the EE starts
+// faulting" (IOP→EE SIF DMA, BootROM/CDVD handoff, etc.). The point
+// is to validate the transport path and the coordinated-install
+// thesis, not to model the producer's identity.
+//
+// Payload source (Ch56):
+//   USE_IMAGE_FILE=0 (default) — built-in Ch54 image, hardcoded below
+//   USE_IMAGE_FILE=1           — $readmemh(IMAGE_FILE, payload) once
+//                                at sim start, expects TOTAL_WORDS
+//                                hex words
+// Transport (timing, handshake, trace) is identical across both modes.
+//
+// Built-in image (USE_IMAGE_FILE=0):
+//   word[0..3]   → AdES handler at useg 0x80..0x8C:
+//                    MFC0  $26, $14          (32'h401A7000)
+//                    ADDIU $26, $26, 4       (32'h275A0004)
+//                    JR    $26               (32'h03400008)
+//                    RFE                     (32'h42000010)
+//   word[4..95]  → 46× (JR $31; NOP) safe-return pairs covering
+//                  useg 0x90..0x1FC.
+//
+// Downstream contract (matches sif_dma_ee_ram_bridge_stub upstream):
+//   out_valid / out_data[31:0] / out_last / out_ready
+//   out_last asserted on the final word. One-beat-per-cycle while
+//   out_ready is high.
+//
+// Trace:
+//   SUBSYS_SIF / EV_DMA_START once on go.
+//   SUBSYS_SIF / EV_DMA_BEAT per accepted beat.
+//     arg0 = word index, arg1 = word data, arg2 = MASTER_ID,
+//     arg3 = TOTAL_WORDS, flags bit0 = out_last.
+//   SUBSYS_SIF / EV_DMA_DONE once on completion.
+
+`timescale 1ns/1ps
+
+module boot_install_agent_stub
+    import trace_pkg::*;
+#(
+    parameter int          TOTAL_WORDS     = 96,
+    parameter logic [7:0]  MASTER_ID       = 8'd6, // install agent
+    parameter bit          USE_IMAGE_FILE  = 1'b0, // 0: built-in ROM, 1: $readmemh
+    parameter string       IMAGE_FILE      = ""
+) (
+    input  logic          clk,
+    input  logic          rst_n,
+
+    input  logic          go_i,
+
+    output logic          out_valid,
+    output logic [31:0]   out_data,
+    output logic          out_last,
+    input  logic          out_ready,
+
+    output logic          busy_o,
+    output logic          done_o,
+
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // ==================================================================
+    // Payload storage
+    // ==================================================================
+    // Instruction encodings used by the built-in image.
+    localparam logic [31:0] INSN_MFC0_K0_EPC = 32'h401A7000;  // MFC0  $26, $14
+    localparam logic [31:0] INSN_ADDIU_K0_4  = 32'h275A0004;  // ADDIU $26, $26, 4
+    localparam logic [31:0] INSN_JR_K0       = 32'h03400008;  // JR    $26
+    localparam logic [31:0] INSN_RFE         = 32'h42000010;  // RFE (delay slot)
+    localparam logic [31:0] INSN_JR_RA       = 32'h03E00008;  // JR    $31
+    localparam logic [31:0] INSN_NOP         = 32'h00000000;  // NOP
+
+    // Parameter sanity check (Ch55 audit low-1). The built-in image is a
+    // 4-word handler followed by two-word (JR $31; NOP) pairs, so the
+    // word count has to be even and no smaller than the handler itself.
+    initial begin : check_total_words
+        if ((TOTAL_WORDS & 1) != 0 || TOTAL_WORDS < 4)
+            $fatal(1, "boot_install_agent_stub: TOTAL_WORDS must be even and >= 4, got %0d",
+                   TOTAL_WORDS);
+    end
+
+    logic [31:0] payload [0:TOTAL_WORDS-1];
+
+    // Fill the payload once at time zero: either the hardcoded image or
+    // the contents of IMAGE_FILE.
+    initial begin : load_payload
+        if (!USE_IMAGE_FILE) begin
+            // Exception-return handler in words 0..3.
+            payload[0] = INSN_MFC0_K0_EPC;
+            payload[1] = INSN_ADDIU_K0_4;
+            payload[2] = INSN_JR_K0;
+            payload[3] = INSN_RFE;
+            // Remaining words: even index = JR $31, odd index = NOP.
+            for (int w = 4; w < TOTAL_WORDS; w++)
+                payload[w] = ((w % 2) == 0) ? INSN_JR_RA : INSN_NOP;
+        end else begin
+            if (IMAGE_FILE == "")
+                $fatal(1, "boot_install_agent_stub: USE_IMAGE_FILE=1 but IMAGE_FILE is empty");
+            $readmemh(IMAGE_FILE, payload);
+        end
+    end
+
+    // ==================================================================
+    // Streaming state machine
+    // ==================================================================
+    typedef enum logic [1:0] {
+        S_IDLE   = 2'd0,
+        S_STREAM = 2'd1,
+        S_DONE   = 2'd2
+    } state_e;
+
+    state_e      state;
+    logic [31:0] idx;          // index of the word currently offered
+
+    // Everything on the downstream port is a pure decode of state/idx.
+    logic streaming;
+    assign streaming = (state == S_STREAM);
+
+    assign out_valid = streaming;
+    assign busy_o    = streaming;
+    assign done_o    = (state == S_DONE);
+    assign out_last  = streaming && (idx == TOTAL_WORDS - 1);
+    assign out_data  = streaming ? payload[idx[$clog2(TOTAL_WORDS)-1:0]]
+                                 : 32'd0;
+
+    // A beat transfers on any cycle where both sides of the handshake agree.
+    logic accept_beat;
+    assign accept_beat = out_valid && out_ready;
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            idx   <= 32'd0;
+            state <= S_IDLE;
+        end else begin
+            unique case (state)
+                // Wait for the caller's go; restart from word 0.
+                S_IDLE:
+                    if (go_i) begin
+                        idx   <= 32'd0;
+                        state <= S_STREAM;
+                    end
+
+                // Advance one word per accepted beat; the final beat
+                // leaves idx where it is and retires the FSM.
+                S_STREAM:
+                    if (accept_beat) begin
+                        if (out_last) state <= S_DONE;
+                        else          idx   <= idx + 32'd1;
+                    end
+
+                // Terminal: nothing but reset leaves this state.
+                S_DONE: ;
+
+                default: state <= S_IDLE;
+            endcase
+        end
+    end
+
+    // ==================================================================
+    // Trace emission
+    // ==================================================================
+    // go_pulse is decoded combinationally from the idle state, so the
+    // START event is captured on a cycle where no beat can be accepted
+    // (out_valid is low in S_IDLE). An earlier revision flopped the go
+    // indication, which pushed START onto beat 0's cycle and lost one of
+    // the two events to the priority chain below.
+    logic go_pulse;
+    assign go_pulse = (state == S_IDLE) && go_i;
+
+    // done_edge marks the first cycle spent in S_DONE.
+    logic   done_edge;
+    state_e state_prev;
+    always_ff @(posedge clk) begin
+        if (!rst_n) state_prev <= S_IDLE;
+        else        state_prev <= state;
+    end
+    assign done_edge = (state == S_DONE) && (state_prev != S_DONE);
+
+    // Registered event port. Priority is START, then BEAT, then DONE.
+    // On cycles with no event only ev_valid is cleared; the other
+    // fields keep their last value.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_SIF;
+            ev_event  <= EV_DMA_START;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (go_pulse || accept_beat || done_edge) begin
+            // Fields shared by all three event kinds.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_SIF;
+            ev_arg2   <= {56'd0, MASTER_ID};
+            ev_arg3   <= 64'(TOTAL_WORDS);
+
+            if (go_pulse) begin
+                ev_event <= EV_DMA_START;
+                ev_arg0  <= 64'd0;
+                ev_arg1  <= 64'd0;
+                ev_flags <= 32'd0;
+            end else if (accept_beat) begin
+                ev_event <= EV_DMA_BEAT;
+                ev_arg0  <= {32'd0, idx};         // word index
+                ev_arg1  <= {32'd0, out_data};    // word data
+                ev_flags <= {31'd0, out_last};    // bit0 = final beat
+            end else begin
+                ev_event <= EV_DMA_DONE;
+                ev_arg0  <= 64'(TOTAL_WORDS);
+                ev_arg1  <= 64'd0;
+                ev_flags <= 32'd0;
+            end
+        end else begin
+            ev_valid  <= 1'b0;
+        end
+    end
+
+endmodule : boot_install_agent_stub
@@ -0,0 +1,181 @@
+// retroDE_ps2 — sif_dma_ack_peer_stub
+//
+// Protocol combiner for the first combined control+data SIF milestone.
+// Observes a mailbox command doorbell on one seam and the SIF DMA receive
+// endpoint's payload-complete indication on the other; only issues the
+// mailbox ack sequence once BOTH are true.
+//
+// Explicitly NOT an IOP. This module has no code execution, no bus master,
+// and no capability beyond composing two existing SIF primitives. Kept
+// under `rtl/sif/` with the other SIF scaffolding.
+//
+// Contract refs:
+//   docs/contracts/sif.md
+//
+// Layering:
+//   sif_mailbox_stub         — storage primitive
+//   sif_mailbox_peer_stub    — mailbox-only active peer (no DMA awareness)
+//   sif_dma_stub             — data-plane receive endpoint
+//   sif_dma_ack_peer_stub    — THIS module. Wires the two together.
+//
+// Protocol (one-shot):
+//   1. EE writes MSCOM = cmd
+//   2. EE writes MSFLG = CMD_PENDING_BIT    (request doorbell)
+//   3. DMAC transfers bounded payload into sif_dma_stub
+//   4. sif_dma_stub asserts last_seen once the final beat arrives
+//   5. this peer observes (MSFLG & CMD_PENDING_BIT) AND last_seen
+//   6. peer reads MSCOM
+//   7. peer writes SMCOM = cmd
+//   8. peer writes SMFLG = CMD_ACK_BIT
+//   9. terminal DONE (one-shot for this milestone)
+//
+// The peer does NOT clear MSFLG or SMFLG — lifecycle is the TB's
+// responsibility, consistent with sif_mailbox_peer_stub's guardrail.
+//
+// Ports connect to:
+//   obs_*   → sif_mailbox_stub iop_rd_* (peer reads MSFLG, then MSCOM)
+//   resp_*  → sif_mailbox_stub iop_wr_* (peer writes SMCOM, then SMFLG)
+//   payload_complete ← sif_dma_stub.last_seen
+
+`timescale 1ns/1ps
+
+module sif_dma_ack_peer_stub
+#(
+    parameter logic [7:0]  MSCOM_OFF       = 8'h00,
+    parameter logic [7:0]  SMCOM_OFF       = 8'h10,
+    parameter logic [7:0]  MSFLG_OFF       = 8'h20,
+    parameter logic [7:0]  SMFLG_OFF       = 8'h30,
+    parameter logic [31:0] CMD_PENDING_BIT = 32'h0000_0001,
+    parameter logic [31:0] CMD_ACK_BIT     = 32'h0000_0002
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+
+    // Mailbox observation (IOP-side read port)
+    output logic        obs_rd_en,
+    output logic [7:0]  obs_rd_addr,
+    input  logic [31:0] obs_rd_data,
+    input  logic        obs_rd_valid,
+
+    // Mailbox response (IOP-side write port)
+    output logic        resp_wr_en,
+    output logic [7:0]  resp_wr_addr,
+    output logic [31:0] resp_wr_data,
+
+    // Payload completion indication from sif_dma_stub (level)
+    input  logic        payload_complete,
+
+    // Status
+    output logic        done_o,
+    output logic [31:0] ack_count_o
+);
+
+    // Sequencer states, in the order the protocol walks them.
+    typedef enum logic [2:0] {
+        S_POLL_REQ    = 3'd0,  // issue the MSFLG read
+        S_POLL_WAIT   = 3'd1,  // MSFLG data returns; evaluate the gate
+        S_MSCOM_REQ   = 3'd2,  // issue the MSCOM read
+        S_MSCOM_WAIT  = 3'd3,  // MSCOM data returns; capture the command
+        S_WRITE_SMCOM = 3'd4,  // echo the command into SMCOM
+        S_WRITE_SMFLG = 3'd5,  // post the ack bit into SMFLG
+        S_DONE        = 3'd6   // terminal (one-shot for this milestone)
+    } state_e;
+
+    state_e      state;
+    logic [31:0] latched_cmd;
+
+    // ==================================================================
+    // Sequencer
+    // ==================================================================
+    // The only data-dependent decision is in S_POLL_WAIT: the peer moves
+    // on to the MSCOM read only if, on the cycle the MSFLG read data is
+    // valid, the pending bit is set AND payload_complete is high. Any
+    // other outcome re-issues the poll. Every other transition is either
+    // unconditional or simply waits for obs_rd_valid.
+    always_ff @(posedge clk) begin : sequencer
+        if (!rst_n) begin
+            state <= S_POLL_REQ;
+        end else begin
+            unique case (state)
+                S_POLL_REQ:
+                    state <= S_POLL_WAIT;
+
+                S_POLL_WAIT:
+                    if (obs_rd_valid) begin
+                        state <= S_POLL_REQ;           // default: poll again
+                        if (payload_complete &&
+                            ((obs_rd_data & CMD_PENDING_BIT) != 32'd0))
+                            state <= S_MSCOM_REQ;      // both conditions met
+                    end
+
+                S_MSCOM_REQ:
+                    state <= S_MSCOM_WAIT;
+
+                S_MSCOM_WAIT:
+                    if (obs_rd_valid)
+                        state <= S_WRITE_SMCOM;
+
+                S_WRITE_SMCOM:
+                    state <= S_WRITE_SMFLG;
+
+                S_WRITE_SMFLG,
+                S_DONE:
+                    state <= S_DONE;
+
+                default:
+                    state <= S_POLL_REQ;
+            endcase
+        end
+    end
+
+    // Command capture: take MSCOM's value when its read data comes back.
+    always_ff @(posedge clk) begin : cmd_capture
+        if (!rst_n)
+            latched_cmd <= 32'd0;
+        else if ((state == S_MSCOM_WAIT) && obs_rd_valid)
+            latched_cmd <= obs_rd_data;
+    end
+
+    // ==================================================================
+    // Port drive (combinational decode of the current state)
+    // ==================================================================
+
+    // Read port: active only in the two *_REQ states.
+    always_comb begin : read_port
+        obs_rd_en   = 1'b0;
+        obs_rd_addr = 8'd0;
+
+        unique case (state)
+            S_POLL_REQ: begin
+                obs_rd_en   = 1'b1;
+                obs_rd_addr = MSFLG_OFF;
+            end
+            S_MSCOM_REQ: begin
+                obs_rd_en   = 1'b1;
+                obs_rd_addr = MSCOM_OFF;
+            end
+            default: ;
+        endcase
+    end
+
+    // Write port: active only in the two S_WRITE_* states.
+    always_comb begin : write_port
+        resp_wr_en   = 1'b0;
+        resp_wr_addr = 8'd0;
+        resp_wr_data = 32'd0;
+
+        unique case (state)
+            S_WRITE_SMCOM: begin
+                resp_wr_en   = 1'b1;
+                resp_wr_addr = SMCOM_OFF;
+                resp_wr_data = latched_cmd;
+            end
+            S_WRITE_SMFLG: begin
+                resp_wr_en   = 1'b1;
+                resp_wr_addr = SMFLG_OFF;
+                resp_wr_data = CMD_ACK_BIT;
+            end
+            default: ;
+        endcase
+    end
+
+    // ==================================================================
+    // Completion status
+    // ==================================================================
+    // The sequencer spends exactly one cycle in S_WRITE_SMFLG before
+    // parking in S_DONE, so this increments once per completed ack and
+    // done_o rises on the following cycle.
+    always_ff @(posedge clk) begin : ack_status
+        if (!rst_n) begin
+            done_o      <= 1'b0;
+            ack_count_o <= 32'd0;
+        end else if (state == S_WRITE_SMFLG) begin
+            done_o      <= 1'b1;
+            ack_count_o <= ack_count_o + 32'd1;
+        end
+    end
+
+endmodule : sif_dma_ack_peer_stub
@@ -0,0 +1,185 @@
+// retroDE_ps2 — sif_dma_ee_ack_peer_stub
+//
+// Protocol combiner for the first reverse-direction (IOP→EE) combined
+// control+data SIF milestone. Mirror of sif_dma_ack_peer_stub with
+// polarity swapped to observe the IOP's doorbell from the EE side and
+// respond back IOP-ward through the mailbox.
+//
+// Explicitly NOT an EE. This module has no code execution, no bus
+// master, no capability beyond composing two existing SIF primitives.
+// Kept under `rtl/sif/` with the other protocol scaffolding so it
+// does not get misread as EE maturity progress.
+//
+// Contract refs:
+//   docs/contracts/sif.md
+//
+// Layering:
+//   sif_mailbox_stub               — storage primitive
+//   sif_dma_ee_ram_bridge_stub     — reverse-direction data-plane landing
+//   sif_dma_ee_ack_peer_stub       — THIS module. Ties them together on
+//                                    the EE side.
+//
+// Protocol (one-shot, reverse direction):
+//   1. IOP writes SMCOM = cmd                 (what the IOP wants to say)
+//   2. IOP writes SMFLG = CMD_PENDING_BIT     (doorbell IOP→EE)
+//   3. IOP DMAC ch9 transfers bounded payload through the SIF egress
+//      bridge, which lands qwords in EE RAM. Bridge's last_seen_o rises
+//      on the final beat and stays high.
+//   4. this peer observes (SMFLG & CMD_PENDING_BIT) AND payload_complete
+//   5. peer reads SMCOM (captures the command)
+//   6. peer writes MSCOM = cmd                (echo back IOP-ward)
+//   7. peer writes MSFLG = CMD_ACK_BIT        (ack back IOP-ward)
+//   8. terminal DONE (one-shot for this milestone)
+//
+// The peer does NOT clear SMFLG or MSFLG — lifecycle is the TB's
+// responsibility, consistent with sif_mailbox_peer_stub's guardrail.
+//
+// Ordering guarantee: the load-bearing behaviour is that the peer will
+// not advance to the ack write sequence unless BOTH the doorbell AND
+// payload_complete are observed simultaneously. The milestone is about
+// verifying that the ack is gated on data arriving, not just control.
+//
+// Ports connect to:
+//   obs_*   → sif_mailbox_stub ee_rd_* (peer reads SMFLG, then SMCOM)
+//   resp_*  → sif_mailbox_stub ee_wr_* (peer writes MSCOM, then MSFLG)
+//   payload_complete ← sif_dma_ee_ram_bridge_stub.last_seen_o
+
+`timescale 1ns/1ps
+
+module sif_dma_ee_ack_peer_stub
+#(
+    parameter logic [7:0]  MSCOM_OFF       = 8'h00,          // echo target, written in S_WRITE_MSCOM
+    parameter logic [7:0]  SMCOM_OFF       = 8'h10,          // command source, read in S_SMCOM_REQ
+    parameter logic [7:0]  MSFLG_OFF       = 8'h20,          // ack flag, written in S_WRITE_MSFLG
+    parameter logic [7:0]  SMFLG_OFF       = 8'h30,          // doorbell flag, polled in S_POLL_REQ
+    parameter logic [31:0] CMD_PENDING_BIT = 32'h0000_0001,  // mask ANDed with the polled SMFLG word
+    parameter logic [31:0] CMD_ACK_BIT     = 32'h0000_0002   // whole word written to MSFLG as the ack
+) (
+    input  logic        clk,
+    input  logic        rst_n,         // active-low, sampled on posedge clk (synchronous)
+
+    // Mailbox observation (EE-side read port)
+    output logic        obs_rd_en,     // high only in S_POLL_REQ and S_SMCOM_REQ
+    output logic [7:0]  obs_rd_addr,   // SMFLG_OFF or SMCOM_OFF; 0 in every other state
+    input  logic [31:0] obs_rd_data,   // only consumed while obs_rd_valid is high
+    input  logic        obs_rd_valid,  // the two *_WAIT states block until this is seen
+
+    // Mailbox response (EE-side write port)
+    output logic        resp_wr_en,    // high only in S_WRITE_MSCOM and S_WRITE_MSFLG
+    output logic [7:0]  resp_wr_addr,  // MSCOM_OFF or MSFLG_OFF; 0 in every other state
+    output logic [31:0] resp_wr_data,  // latched_cmd or CMD_ACK_BIT; 0 in every other state
+
+    // Payload completion indication from sif_dma_ee_ram_bridge_stub (level)
+    input  logic        payload_complete,  // sampled only in S_POLL_WAIT, together with obs_rd_valid
+
+    // Status
+    output logic        done_o,        // set by the MSFLG ack write; cleared only by reset
+    output logic [31:0] ack_count_o    // +1 for each cycle spent in S_WRITE_MSFLG
+);
+
+    typedef enum logic [2:0] {
+        S_POLL_REQ    = 3'd0,  // pulse rd_en for SMFLG
+        S_POLL_WAIT   = 3'd1,  // wait for rd_valid, gate on BOTH conditions
+        S_SMCOM_REQ   = 3'd2,  // pulse rd_en for SMCOM
+        S_SMCOM_WAIT  = 3'd3,  // wait for rd_valid, latch cmd
+        S_WRITE_MSCOM = 3'd4,  // drive wr_en, addr=MSCOM, data=cmd
+        S_WRITE_MSFLG = 3'd5,  // drive wr_en, addr=MSFLG, data=ACK
+        S_DONE        = 3'd6   // terminal (one-shot for this milestone)
+    } state_e;
+
+    state_e      state;        // current FSM state; reset value is S_POLL_REQ
+    logic [31:0] latched_cmd;  // SMCOM value captured in S_SMCOM_WAIT, echoed to MSCOM
+
+    // ------------------------------------------------------------------
+    // State machine — advance to SMCOM_REQ only when SMFLG pending is set
+    // AND payload_complete is observed simultaneously. This is the
+    // load-bearing guarantee of the whole combiner.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state       <= S_POLL_REQ;
+            latched_cmd <= 32'd0;
+        end else begin
+            unique case (state)
+                S_POLL_REQ: state <= S_POLL_WAIT;          // read request is issued for exactly one cycle
+
+                S_POLL_WAIT: begin
+                    if (obs_rd_valid) begin                // no timeout: stays here until valid arrives
+                        if (((obs_rd_data & CMD_PENDING_BIT) != 32'd0) &&
+                            payload_complete)              // level taken in the same cycle as rd_valid
+                            state <= S_SMCOM_REQ;
+                        else
+                            state <= S_POLL_REQ;       // keep polling
+                    end
+                end
+
+                S_SMCOM_REQ: state <= S_SMCOM_WAIT;
+
+                S_SMCOM_WAIT: begin
+                    if (obs_rd_valid) begin
+                        latched_cmd <= obs_rd_data;        // full 32-bit SMCOM word, no masking
+                        state       <= S_WRITE_MSCOM;
+                    end
+                end
+
+                S_WRITE_MSCOM: state <= S_WRITE_MSFLG;     // echo first, then the ack flag
+
+                S_WRITE_MSFLG: state <= S_DONE;
+
+                S_DONE: state <= S_DONE;                   // absorbing: only reset leaves S_DONE
+
+                default: state <= S_POLL_REQ;              // unused encoding 3'd7 restarts polling
+            endcase
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Output drive (combinational decode of the binary-encoded state)
+    // ------------------------------------------------------------------
+
+    always_comb begin
+        obs_rd_en     = 1'b0;     // defaults: every output idles at 0 unless a state overrides it
+        obs_rd_addr   = 8'd0;
+        resp_wr_en    = 1'b0;
+        resp_wr_addr  = 8'd0;
+        resp_wr_data  = 32'd0;
+
+        unique case (state)
+            S_POLL_REQ: begin
+                obs_rd_en   = 1'b1;
+                obs_rd_addr = SMFLG_OFF;
+            end
+            S_SMCOM_REQ: begin
+                obs_rd_en   = 1'b1;
+                obs_rd_addr = SMCOM_OFF;
+            end
+            S_WRITE_MSCOM: begin
+                resp_wr_en   = 1'b1;
+                resp_wr_addr = MSCOM_OFF;
+                resp_wr_data = latched_cmd;
+            end
+            S_WRITE_MSFLG: begin
+                resp_wr_en   = 1'b1;
+                resp_wr_addr = MSFLG_OFF;
+                resp_wr_data = CMD_ACK_BIT;
+            end
+            default: ;            // *_WAIT, S_DONE and unused encodings keep the idle defaults
+        endcase
+    end
+
+    // ------------------------------------------------------------------
+    // Ack bookkeeping — S_WRITE_MSFLG always moves to S_DONE after one cycle
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ack_count_o <= 32'd0;
+            done_o      <= 1'b0;
+        end else if (state == S_WRITE_MSFLG) begin
+            ack_count_o <= ack_count_o + 32'd1;   // counts the single ack of this one-shot FSM
+            done_o      <= 1'b1;                  // visible the cycle after the MSFLG write is driven
+        end
+    end
+
+endmodule : sif_dma_ee_ack_peer_stub
@@ -0,0 +1,202 @@
+// retroDE_ps2 — sif_dma_ee_ram_bridge_stub
+//
+// Width-adapting bridge from a 32-bit SIF DMA endpoint (IOP→EE egress)
+// to the 128-bit EE memory map. Accumulates four incoming 32-bit beats
+// into a qword and issues one qword write through ee_memory_map_stub's
+// bridge write port.
+//
+// Mirror of sif_dma_iop_ram_bridge_stub, but in the other direction
+// (words → qword, EE-side landing).
+//
+// Contract refs:
+//   docs/contracts/sif.md        (DMA-linked data movement endpoints)
+//   docs/contracts/memory.md     (EE RAM is 128-bit qword-aligned)
+//
+// Handshake (upstream, from DMAC ep_* port or equivalent):
+//   in_valid / in_data[31:0] / in_last / in_ready
+//   Bridge asserts in_ready whenever it is accumulating, i.e. for every
+//   beat of a quad including the 4th. It drops in_ready during the
+//   one-cycle emit that follows the 4th beat (or an in_last beat), so the
+//   DMAC naturally stalls for a single cycle between qwords.
+//
+// Handshake (downstream, to ee_memory_map_stub bridge-write port):
+//   bridge_wr_en / bridge_wr_addr[31:0] / bridge_wr_data[127:0] /
+//   bridge_wr_be[15:0] / bridge_master_id[7:0]
+//
+// Data layout (little-endian):
+//   beat 0 → bridge_wr_data[31:0]
+//   beat 1 → bridge_wr_data[63:32]
+//   beat 2 → bridge_wr_data[95:64]
+//   beat 3 → bridge_wr_data[127:96]
+//   qword address advances DEST_BASE_ADDR by 16 per emit.
+//
+// Partial quad on in_last:
+//   If `in_last` arrives before the 4th beat of a quad, the bridge
+//   emits the partial qword with wr_be masked to cover only the bytes
+//   that were actually accepted. Not exercised by the current TB (BCR
+//   is chosen to be a multiple of 4), but kept defensively.
+//
+// Payload-complete indication (last_seen_o):
+//   Level-held output, set when `in_last && accept_beat` fires on the
+//   upstream handshake. Intended for EE-side protocol combiners that
+//   need to gate an ack on "payload fully moved" independently of when
+//   the IOP posted a control doorbell. Latch stays high until reset —
+//   this mirrors sif_dma_stub.last_seen.
+//
+// Parameters:
+//   DEST_BASE_ADDR  — byte offset where the first qword lands. Advances
+//                     by 16 per emit for the life of the transfer.
+//   MASTER_ID       — bridge's identity for MEM / EE-map trace attribution
+//                     (default 5 = SIF EE-side bridge).
+//
+// Non-goals:
+//   - multiple in-flight qwords
+//   - arbitration against other bridge writers on the EE map's write path
+
+`timescale 1ns/1ps
+
+module sif_dma_ee_ram_bridge_stub
+#(
+    parameter logic [31:0] DEST_BASE_ADDR = 32'h0000_0000,   // byte address of the first emitted qword
+    parameter logic [7:0]  MASTER_ID      = 8'd5             // driven unchanged onto bridge_master_id
+) (
+    input  logic          clk,
+    input  logic          rst_n,             // active-low, sampled on posedge clk (synchronous)
+
+    // Upstream (DMAC endpoint side)
+    input  logic          in_valid,
+    input  logic [31:0]   in_data,
+    input  logic          in_last,           // forces an emit even if the quad is not full
+    output logic          in_ready,          // high exactly while state == S_ACCUM
+
+    // Downstream (EE map bridge-write port)
+    output logic          bridge_wr_en,      // high exactly while state == S_EMIT
+    output logic [31:0]   bridge_wr_addr,    // DEST_BASE_ADDR + wr_offset
+    output logic [127:0]  bridge_wr_data,    // accumulator contents
+    output logic [15:0]   bridge_wr_be,      // 4 enable bits per accepted 32-bit beat
+    output logic [7:0]    bridge_master_id,
+
+    // Payload-complete indication (level, latched). Consumers gate on
+    // "full payload landed" without needing to count beats.
+    output logic          last_seen_o,
+
+    // Ch239 — single-cycle "rewind" pulse. When asserted (and the
+    // bridge is idle in S_ACCUM with no beat in flight), the running
+    // `wr_offset` returns to 0 so the NEXT emit lands at
+    // DEST_BASE_ADDR. Lets a producer that wants single-slot buffer
+    // semantics (e.g. a libpad-style pad packet) overwrite the same
+    // 16-byte slot on every transfer instead of streaming forward.
+    // Existing producers that don't need this leave it tied to 1'b0
+    // and the bridge keeps its streaming behaviour exactly as before.
+    // Pulse must be asserted between transfers; firing mid-transfer
+    // (`state==S_EMIT` or `pos != 0`) is illegal and logged as a
+    // sim-only `$error`. There is no defensive RTL gating: the rewind
+    // always wins, even over the S_EMIT offset increment.
+    input  logic          rewind_i = 1'b0
+);
+
+    typedef enum logic [0:0] {
+        S_ACCUM = 1'b0,                             // collecting beats; in_ready high
+        S_EMIT  = 1'b1                              // one-cycle qword write; in_ready low
+    } state_e;
+
+    state_e       state;
+    logic [127:0] acc_data;                         // qword under construction
+    logic [15:0]  acc_be;                           // byte enables for acc_data
+    logic [1:0]   pos;                              // 0..3 within qword
+    logic [31:0]  wr_offset;                        // running byte offset
+
+    assign in_ready         = (state == S_ACCUM);
+    assign bridge_master_id = MASTER_ID;
+
+    logic accept_beat;                              // one upstream beat transfers this cycle
+    assign accept_beat = in_valid && in_ready;
+
+    // ------------------------------------------------------------------
+    // Accumulator / state machine
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state     <= S_ACCUM;
+            acc_data  <= 128'd0;
+            acc_be    <= 16'd0;
+            pos       <= 2'd0;
+            wr_offset <= 32'd0;
+        end else begin
+            unique case (state)
+                S_ACCUM: begin
+                    if (accept_beat) begin
+                        // Place the incoming word in slot `pos` and mark
+                        // its four bytes enabled.
+                        acc_data[pos*32 +: 32] <= in_data;
+                        acc_be[pos*4  +: 4]    <= 4'b1111;
+
+                        if (pos == 2'd3 || in_last) begin
+                            state <= S_EMIT;        // full quad, or partial quad closed by in_last
+                        end else begin
+                            pos <= pos + 2'd1;
+                        end
+                    end
+                end
+
+                S_EMIT: begin
+                    // Single-cycle emit; bridge_wr_en is combinationally
+                    // tied to state. Advance qword offset, reset slot /
+                    // accumulator for the next quad. If a `rewind_i`
+                    // pulse coincides with this cycle, the rewind
+                    // assignment after the case is the last nonblocking
+                    // write to wr_offset and therefore wins (offset 0,
+                    // no +16) — that is still the illegal "rewind
+                    // mid-transfer" case and the $error below reports it.
+                    wr_offset <= wr_offset + 32'd16;
+                    acc_data  <= 128'd0;
+                    acc_be    <= 16'd0;
+                    pos       <= 2'd0;
+                    state     <= S_ACCUM;
+                end
+
+                default: state <= S_ACCUM;
+            endcase
+
+            // Ch239 — between-transfer rewind. Resets only the
+            // streaming offset; `acc_data`/`acc_be`/`pos` are
+            // already 0 after every emit's tail. Kept AFTER the case
+            // so it is the final nonblocking write to wr_offset and
+            // overrides the S_EMIT +16. No RTL guard on purpose:
+            // misuse is reported by the sim-only $error below.
+            if (rewind_i) wr_offset <= 32'd0;
+        end
+    end
+
+`ifndef SYNTHESIS
+    // Misuse detector — `rewind_i` while a transfer is in flight is
+    // a producer-side bug. Caught here so the path stays clean.
+    always_ff @(posedge clk) begin
+        if (rst_n && rewind_i && (state != S_ACCUM || pos != 2'd0)) begin
+            $error("[sif_dma_ee_ram_bridge_stub] illegal rewind_i mid-transfer (state=%0d pos=%0d)",
+                   state, pos);
+        end
+    end
+`endif
+
+    // ------------------------------------------------------------------
+    // Downstream write-port drive (combinational on state)
+    // ------------------------------------------------------------------
+
+    assign bridge_wr_en   = (state == S_EMIT);
+    assign bridge_wr_addr = DEST_BASE_ADDR + wr_offset;   // current offset; a rewind affects the NEXT emit
+    assign bridge_wr_data = acc_data;
+    assign bridge_wr_be   = acc_be;
+
+    // ------------------------------------------------------------------
+    // last_seen_o: set on the edge that accepts an in_last beat, so it is
+    // first visible during that quad's S_EMIT cycle. Level-held until reset.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n)                      last_seen_o <= 1'b0;
+        else if (accept_beat && in_last) last_seen_o <= 1'b1;
+    end
+
+endmodule : sif_dma_ee_ram_bridge_stub
@@ -0,0 +1,139 @@
+// retroDE_ps2 — sif_dma_iop_ram_bridge_stub
+//
+// Width-adapting bridge from a 128-bit SIF DMA endpoint to the 32-bit
+// IOP memory map. Splits each incoming qword into four 32-bit writes at
+// consecutive physical addresses starting from DEST_BASE_ADDR.
+//
+// First real coupling between the SIF data-plane and the IOP side. NOT an
+// IOP, not a DMAC, not a peer — just a width/ordering adapter.
+//
+// Contract refs:
+//   docs/contracts/sif.md        (DMA-linked data movement endpoints)
+//   docs/contracts/iop.md        (IOP-local RAM/I/O decode; writes land
+//                                 through the IOP memory map)
+//
+// Handshake (upstream, from DMAC ep_* port or equivalent):
+//   in_valid / in_data[127:0] / in_last / in_ready
+//   Bridge asserts in_ready only while idle. During the four-write
+//   expansion of a qword, in_ready drops — natural backpressure onto
+//   whatever's producing qwords.
+//
+// Handshake (downstream, to iop_memory_map_stub's bridge-write port):
+//   bridge_wr_en / bridge_wr_addr[31:0] / bridge_wr_data[31:0] /
+//   bridge_wr_be[3:0] / bridge_master_id[7:0]
+//   Addresses are physical (no kseg stripping) — the IOP map must treat
+//   this port's addresses differently from its CPU-side port.
+//
+// Data layout:
+//   Little-endian unpacking: in_data[31:0]   -> DEST_BASE+0
+//                            in_data[63:32]  -> DEST_BASE+4
+//                            in_data[95:64]  -> DEST_BASE+8
+//                            in_data[127:96] -> DEST_BASE+12
+//   Subsequent qwords append: DEST_BASE+16, +20, +24, +28, ...
+//
+// Parameters:
+//   DEST_BASE_ADDR  — where the bridge starts writing. Persistent across
+//                     the life of the transfer; would become a register in
+//                     a later wave where software programs the target.
+//   MASTER_ID       — bridge's identity in MEM / IOP traces (default 3,
+//                     distinct from EE IFETCH=0, DMAC=1, IOP_CPU=2).
+//
+// Non-goals:
+//   - multiple in-flight qwords
+//   - ack back upstream beyond in_ready / in_last observation
+//   - byte-enable variation per write (all writes are full 32-bit)
+//   - arbitration against other masters on the map's write path
+
+`timescale 1ns/1ps
+
+module sif_dma_iop_ram_bridge_stub
+#(
+    parameter logic [31:0] DEST_BASE_ADDR = 32'h0000_0000,   // byte address of the first 32-bit write
+    parameter logic [7:0]  MASTER_ID      = 8'd3             // driven unchanged onto bridge_master_id
+) (
+    input  logic          clk,
+    input  logic          rst_n,             // active-low, sampled on posedge clk (synchronous)
+
+    // Upstream (DMAC endpoint side)
+    input  logic          in_valid,
+    input  logic [127:0]  in_data,           // captured whole when a qword is accepted
+    input  logic          in_last,           // NOTE(review): not referenced anywhere in this module body
+    output logic          in_ready,          // high exactly while state == S_IDLE
+
+    // Downstream (IOP map bridge-write port)
+    output logic          bridge_wr_en,      // high exactly while state == S_WRITE
+    output logic [31:0]   bridge_wr_addr,    // DEST_BASE_ADDR + wr_offset
+    output logic [31:0]   bridge_wr_data,    // 32-bit slice of the latched qword for this beat
+    output logic [3:0]    bridge_wr_be,      // constant 4'b1111
+    output logic [7:0]    bridge_master_id
+);
+
+    typedef enum logic [1:0] {
+        S_IDLE  = 2'd0,                    // waiting for a qword; in_ready high
+        S_WRITE = 2'd1                     // four consecutive write cycles; in_ready low
+    } state_e;
+
+    state_e       state;
+    logic [127:0] latched_qword;           // qword being split; stable for the whole S_WRITE run
+    logic [1:0]   beat_index;              // 0..3 across the 4 writes
+    logic [31:0]  wr_offset;                // running byte offset; only reset returns it to 0
+
+    assign in_ready         = (state == S_IDLE);
+    assign bridge_master_id = MASTER_ID;
+
+    logic accept_new_qword;                // one upstream qword transfers this cycle
+    assign accept_new_qword = in_valid && in_ready;
+
+    // ------------------------------------------------------------------
+    // State machine
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state         <= S_IDLE;
+            latched_qword <= 128'd0;
+            beat_index    <= 2'd0;
+            wr_offset     <= 32'd0;
+        end else begin
+            unique case (state)
+                S_IDLE: begin
+                    if (accept_new_qword) begin
+                        latched_qword <= in_data;
+                        beat_index    <= 2'd0;
+                        state         <= S_WRITE;   // first write is driven on the following cycle
+                    end
+                end
+
+                S_WRITE: begin
+                    // Each cycle here drives one 32-bit write. After the
+                    // fourth, go idle. wr_offset advances per write.
+                    wr_offset <= wr_offset + 32'd4;
+                    if (beat_index == 2'd3) begin
+                        state      <= S_IDLE;       // one idle cycle separates consecutive qwords
+                        beat_index <= 2'd0;
+                    end else begin
+                        beat_index <= beat_index + 2'd1;
+                    end
+                end
+
+                default: state <= S_IDLE;           // covers the unused encodings 2'd2 and 2'd3
+            endcase
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Downstream write-port drive (combinational on state)
+    // ------------------------------------------------------------------
+
+    // Indexed part-select picks the 32-bit slice for the current beat.
+    // Avoids the constant-select-in-always_comb pattern that trips
+    // portability warnings on some simulators.
+    logic [31:0] beat_data;
+    assign beat_data = latched_qword[beat_index*32 +: 32];   // beat 0 = bits [31:0], beat 3 = bits [127:96]
+
+    assign bridge_wr_en   = (state == S_WRITE);
+    assign bridge_wr_addr = DEST_BASE_ADDR + wr_offset;
+    assign bridge_wr_data = beat_data;
+    assign bridge_wr_be   = 4'b1111;        // full-word writes only
+
+endmodule : sif_dma_iop_ram_bridge_stub
@@ -0,0 +1,154 @@
+// retroDE_ps2 — sif_dma_stub
+//
+// Minimal SIF DMA receive-side endpoint. First data-plane step on the SIF
+// seam. NOT an IOP — this is a bounded receive buffer that accepts qwords
+// from a DMAC channel and exposes them to the TB via a small read port.
+// No IOP CPU, no live peer logic, no directional policy beyond "incoming
+// qwords land in sequential slots."
+//
+// Contract refs:
+//   docs/contracts/sif.md            (DMA-linked data movement endpoints)
+//
+// Receive interface (connects to DMAC's ep_* endpoint):
+//   in_valid / in_data / in_last / in_ready
+//   One-cycle accept per beat when in_ready is high. in_last observed
+//   alongside the final qword of a transfer.
+//
+// Read interface (TB-side verification):
+//   rd_en pulses with rd_idx; rd_data / rd_valid return the stored qword
+//   one cycle later.
+//
+// Stall input:
+//   stall_in (level) forces in_ready low while asserted. Used by the
+//   negative-path test to prove that a not-ready receiver does not let
+//   the DMAC spuriously complete.
+//
+// Buffer:
+//   Small internal array (DEPTH qwords). Full detection is tracked from
+//   `rx_count`: once `rx_count >= DEPTH` the buffer is full and `in_ready`
+//   drops so the DMAC stalls in ACTIVE_SEND. No silent wrap. There is no
+//   consume path yet — once full, the buffer stays full (intentional for
+//   the current scope). `full_o` is exposed for testbench observation.
+//
+// Trace:
+//   One SIF EV_WRITE per accepted beat (one event per cycle).
+//   arg0 = slot index into the receive buffer
+//   arg1 = data[63:0] (low half)
+//   arg2 = source id (hard-wired to 8'd1 = DMAC for Wave 3)
+//   arg3 = 0
+//   flags bit 0 = in_last value for this beat
+//   flags bit 1 = 1 (distinguishes DMA-receive writes from mailbox writes
+//                    if both subsystems are ever instantiated together)
+
+`timescale 1ns/1ps
+
+module sif_dma_stub
+    import trace_pkg::*;             // supplies subsys_e / event_e and the SUBSYS_SIF / EV_WRITE codes used below
+#(
+    parameter int DEPTH = 8          // max qwords buffered
+) (
+    input  logic          clk,
+    input  logic          rst_n,             // active-low, sampled on posedge clk (synchronous)
+
+    // DMAC-facing receive
+    input  logic          in_valid,
+    input  logic [127:0]  in_data,
+    input  logic          in_last,
+    output logic          in_ready,          // low while stall_in is high or the buffer is full
+
+    // TB verification read port
+    input  logic          rd_en,
+    input  logic [$clog2(DEPTH)-1:0] rd_idx, // NOTE(review): if DEPTH is not a power of two, indices >= DEPTH are representable — confirm callers stay in range
+    output logic [127:0]  rd_data,           // holds its previous value while rd_en is low
+    output logic          rd_valid,          // rd_en delayed by one clock
+
+    // Negative-path control
+    input  logic          stall_in,
+
+    // Status
+    output logic [31:0]   rx_count,          // monotonic accepted-beat count
+    output logic          last_seen,         // sticky: in_last observed
+    output logic          full_o,            // buffer full, in_ready=0
+
+    // Trace
+    output logic          ev_valid,          // registered: high the cycle after an accepted beat
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    localparam int IDX_W = $clog2(DEPTH);    // NOTE(review): evaluates to 0 for DEPTH == 1 — presumably DEPTH >= 2 is intended; confirm
+
+    logic [127:0] buf_mem [0:DEPTH-1];       // receive slots, filled in arrival order
+    logic [IDX_W-1:0] wr_ptr;                // next slot to write
+
+    logic beat_accepted;                     // one DMAC beat transfers this cycle
+    assign full_o        = (rx_count >= DEPTH);   // derived from the beat counter; there is no consume path to lower it
+    assign in_ready      = !stall_in && !full_o;
+    assign beat_accepted = in_valid && in_ready;
+
+    // ------------------------------------------------------------------
+    // Receive path
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            wr_ptr    <= '0;
+            rx_count  <= 32'd0;
+            last_seen <= 1'b0;
+            for (int i = 0; i < DEPTH; i++) buf_mem[i] <= 128'd0;   // every slot reads back 0 after reset
+        end else if (beat_accepted) begin
+            buf_mem[wr_ptr] <= in_data;
+            wr_ptr          <= wr_ptr + IDX_W'(1);                  // wraps at 2**IDX_W; full_o blocks further accepts after DEPTH beats
+            rx_count        <= rx_count + 32'd1;
+            if (in_last) last_seen <= 1'b1;                         // never cleared except by reset
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Read port (1-cycle latency)
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            rd_data  <= 128'd0;
+            rd_valid <= 1'b0;
+        end else begin
+            rd_valid <= rd_en;
+            if (rd_en) rd_data <= buf_mem[rd_idx];   // returns the slot contents as of this clock edge
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace — one event per accepted beat
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_SIF;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (beat_accepted) begin
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_SIF;
+            ev_event  <= EV_WRITE;
+            ev_arg0   <= {{(64-IDX_W){1'b0}}, wr_ptr};   // slot being written (wr_ptr before its increment)
+            ev_arg1   <= in_data[63:0];                  // low half only; bits [127:64] are not traced
+            ev_arg2   <= 64'd1;                    // DMAC
+            ev_arg3   <= 64'd0;
+            ev_flags  <= {30'd0, 1'b1, in_last};   // bit1=DMA, bit0=in_last
+        end else begin
+            ev_valid <= 1'b0;                            // payload fields keep their last values while idle
+        end
+    end
+
+endmodule : sif_dma_stub
@@ -0,0 +1,184 @@
+// retroDE_ps2 — sif_mailbox_peer_stub
+//
+// Re-armable active peer for the SIF mailbox. Second step on the two-actor
+// coordination track (A'' — lifecycle). Observes one mailbox/flag pattern
+// and responds with a known acknowledgement pattern. NOT an IOP — does not
+// execute code, does not boot anything, does not claim to be a CPU.
+//
+// Contract refs:
+//   docs/contracts/sif.md           (mailbox/flag-only SIF stub)
+//   docs/stub_module_plan.md        (Wave 2 SIF track)
+//
+// Canonical command-echo protocol:
+//   1. EE writes MSCOM = cmd
+//   2. EE writes MSFLG = CMD_PENDING_BIT (doorbell rising edge)
+//   3. peer polls MSFLG; when it sees CMD_PENDING_BIT set AND it has not
+//      already responded to the current request, it reads MSCOM
+//   4. peer writes SMCOM = <the cmd it just read>
+//   5. peer writes SMFLG = CMD_ACK_BIT
+//   6. peer latches `responded` and resumes polling; it will NOT respond
+//      again until the TB (or EE) clears CMD_PENDING_BIT in MSFLG
+//   7. when the peer observes CMD_PENDING_BIT cleared, `responded` clears
+//      and the next rising edge of CMD_PENDING_BIT triggers a fresh echo
+//
+// The peer still does NOT clear any mailbox state itself. Re-arm is the
+// TB's responsibility; the peer just refuses to double-fire while the
+// doorbell bit is still high.
+//
+// Ports connect directly to sif_mailbox_stub's IOP-side register port:
+//   obs_*  → mailbox iop_rd_* (peer reads MSFLG then MSCOM)
+//   resp_* → mailbox iop_wr_* (peer writes SMCOM then SMFLG)
+//
+// All peer activity is visible through the mailbox's own trace output
+// (side_id=IOP=1). The peer does not emit its own trace; `ack_count_o`
+// provides a testbench synchronisation point.
+
+`timescale 1ns/1ps
+
+module sif_mailbox_peer_stub
+#(
+    parameter logic [7:0]  MSCOM_OFF       = 8'h00,          // command source, read in S_MSCOM_REQ
+    parameter logic [7:0]  SMCOM_OFF       = 8'h10,          // echo target, written in S_WRITE_SMCOM
+    parameter logic [7:0]  MSFLG_OFF       = 8'h20,          // doorbell flag, polled in S_POLL_REQ
+    parameter logic [7:0]  SMFLG_OFF       = 8'h30,          // ack flag, written in S_WRITE_SMFLG
+    parameter logic [31:0] CMD_PENDING_BIT = 32'h0000_0001,  // mask ANDed with the polled MSFLG word
+    parameter logic [31:0] CMD_ACK_BIT     = 32'h0000_0002   // whole word written to SMFLG as the ack
+) (
+    input  logic        clk,
+    input  logic        rst_n,         // active-low, sampled on posedge clk (synchronous)
+
+    // Observation — connects to mailbox iop_rd_*
+    output logic        obs_rd_en,     // high only in S_POLL_REQ and S_MSCOM_REQ
+    output logic [7:0]  obs_rd_addr,   // MSFLG_OFF or MSCOM_OFF; 0 in every other state
+    input  logic [31:0] obs_rd_data,   // only consumed while obs_rd_valid is high
+    input  logic        obs_rd_valid,  // the two *_WAIT states block until this is seen
+
+    // Response — connects to mailbox iop_wr_*
+    output logic        resp_wr_en,    // high only in S_WRITE_SMCOM and S_WRITE_SMFLG
+    output logic [7:0]  resp_wr_addr,  // SMCOM_OFF or SMFLG_OFF; 0 in every other state
+    output logic [31:0] resp_wr_data,  // latched_cmd or CMD_ACK_BIT; 0 in every other state
+
+    // Status
+    output logic        done_o,        // latched high after the first ack
+    output logic [31:0] ack_count_o    // monotonic count of completed acks
+);
+
+    typedef enum logic [2:0] {
+        S_POLL_REQ    = 3'd0,  // drive rd_en for MSFLG
+        S_POLL_WAIT   = 3'd1,  // wait for obs_rd_valid, decide
+        S_MSCOM_REQ   = 3'd2,  // drive rd_en for MSCOM
+        S_MSCOM_WAIT  = 3'd3,  // wait for obs_rd_valid, latch cmd
+        S_WRITE_SMCOM = 3'd4,  // drive wr_en, addr=SMCOM, data=cmd
+        S_WRITE_SMFLG = 3'd5   // drive wr_en, addr=SMFLG, data=ACK
+    } state_e;
+
+    state_e      state;          // current FSM state; reset value is S_POLL_REQ
+    logic [31:0] latched_cmd;    // MSCOM value captured in S_MSCOM_WAIT, echoed to SMCOM
+    logic        responded;      // peer has already acked the current
+                                 // doorbell assertion; suppresses re-fire
+                                 // until the doorbell is observed low
+
+    // ------------------------------------------------------------------
+    // State machine
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            state       <= S_POLL_REQ;
+            latched_cmd <= 32'd0;
+            responded   <= 1'b0;
+        end else begin
+            unique case (state)
+                S_POLL_REQ: state <= S_POLL_WAIT;          // read request is issued for exactly one cycle
+
+                S_POLL_WAIT: begin
+                    if (obs_rd_valid) begin                // no timeout: stays here until valid arrives
+                        if (responded) begin
+                            // Waiting for the TB to clear CMD_PENDING_BIT
+                            // before we arm again.
+                            if ((obs_rd_data & CMD_PENDING_BIT) == 32'd0)
+                                responded <= 1'b0;         // re-armed; the next poll that sees the bit set fires
+                            state <= S_POLL_REQ;           // keep polling whether or not we re-armed
+                        end else begin
+                            if ((obs_rd_data & CMD_PENDING_BIT) != 32'd0)
+                                state <= S_MSCOM_REQ;
+                            else
+                                state <= S_POLL_REQ;
+                        end
+                    end
+                end
+
+                S_MSCOM_REQ: state <= S_MSCOM_WAIT;
+
+                S_MSCOM_WAIT: begin
+                    if (obs_rd_valid) begin
+                        latched_cmd <= obs_rd_data;        // full 32-bit MSCOM word, no masking
+                        state       <= S_WRITE_SMCOM;
+                    end
+                end
+
+                S_WRITE_SMCOM: state <= S_WRITE_SMFLG;     // echo first, then the ack flag
+
+                S_WRITE_SMFLG: begin
+                    responded <= 1'b1;       // refuse to re-fire until MSFLG
+                                             // clears
+                    state     <= S_POLL_REQ;
+                end
+
+                default: state <= S_POLL_REQ;              // unused encodings 3'd6 / 3'd7 restart polling
+            endcase
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Output drive (combinational decode of the binary-encoded state)
+    // ------------------------------------------------------------------
+
+    always_comb begin
+        obs_rd_en     = 1'b0;     // defaults: every output idles at 0 unless a state overrides it
+        obs_rd_addr   = 8'd0;
+        resp_wr_en    = 1'b0;
+        resp_wr_addr  = 8'd0;
+        resp_wr_data  = 32'd0;
+
+        unique case (state)
+            S_POLL_REQ: begin
+                obs_rd_en   = 1'b1;
+                obs_rd_addr = MSFLG_OFF;
+            end
+            S_MSCOM_REQ: begin
+                obs_rd_en   = 1'b1;
+                obs_rd_addr = MSCOM_OFF;
+            end
+            S_WRITE_SMCOM: begin
+                resp_wr_en   = 1'b1;
+                resp_wr_addr = SMCOM_OFF;
+                resp_wr_data = latched_cmd;
+            end
+            S_WRITE_SMFLG: begin
+                resp_wr_en   = 1'b1;
+                resp_wr_addr = SMFLG_OFF;
+                resp_wr_data = CMD_ACK_BIT;
+            end
+            default: ;            // *_WAIT states and unused encodings keep the idle defaults
+        endcase
+    end
+
+    // ------------------------------------------------------------------
+    // Ack bookkeeping
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ack_count_o <= 32'd0;
+            done_o      <= 1'b0;
+        end else if (state == S_WRITE_SMFLG) begin
+            // S_WRITE_SMFLG is a single-cycle state: the state machine
+            // unconditionally transitions to S_POLL_REQ on the next edge,
+            // so this branch is observed exactly once per completed ack.
+            ack_count_o <= ack_count_o + 32'd1;
+            done_o      <= 1'b1;                  // sticky: stays high across later re-arms
+        end
+    end
+
+endmodule : sif_mailbox_peer_stub
@@ -0,0 +1,230 @@
+// retroDE_ps2 — sif_mailbox_stub
+//
+// Minimal EE↔IOP subsystem-interface mailbox shell. First stub on the SIF
+// track. Standalone unit — does not yet integrate with any live IOP core.
+// Testbenches drive both the EE-side port and the IOP-side port directly,
+// playing both roles, to prove the register semantics without requiring
+// a full dual-CPU bring-up.
+//
+// Contract refs:
+//   docs/stub_module_plan.md        (Wave 2, item 10)
+//   docs/contracts/sif.md           (mailbox/flag-only stub is allowed here)
+//
+// Register surface (offsets within the SIF block):
+//   0x00  MSCOM  — 32-bit mailbox, conventionally EE→IOP
+//   0x10  SMCOM  — 32-bit mailbox, conventionally IOP→EE
+//   0x20  MSFLG  — 32-bit flag word, conventionally EE-owned for set,
+//                  IOP-owned for clear (directional semantics deferred)
+//   0x30  SMFLG  — 32-bit flag word, conventionally IOP-owned for set,
+//                  EE-owned for clear (directional semantics deferred)
+//
+// Wave 2 scope intentionally does NOT enforce direction or set/clear
+// semantics. Both ports can read and write any register with plain
+// replace-on-write. The trace records which side initiated each access
+// (side_id in arg2) so future-wave work can layer directional rules on
+// top without changing the storage model.
+//
+// Port semantics:
+//   Each side (EE / IOP) has an independent register port:
+//     wr_en, rd_en, addr[7:0], wr_data[31:0], rd_data[31:0], rd_valid
+//   Reads have 1-cycle latency to match the existing stub ecosystem.
+//
+// Write arbitration (per-register):
+//   - EE and IOP writes to *different* registers on the same cycle both
+//     land. Storage is not serialized across independent registers.
+//   - EE and IOP writes to the *same* register on the same cycle: EE
+//     wins, IOP write is dropped that cycle.
+//   - Trace is limited to one event per cycle by the shared trace bus
+//     (priority EE > IOP). An IOP write that lands silently when EE is
+//     driving a different register will not be traced this wave — future
+//     waves can add a second trace output port if that becomes a gap.
+//
+// Trace payload schema (SUBSYS_SIF, existing EV_READ/EV_WRITE codes):
+//   SIF WRITE  arg0=offset arg1=data arg2=side_id arg3=0  flags[0]=1
+//   SIF READ   arg0=offset arg1=data arg2=side_id arg3=0  flags[0]=0
+//     side_id: 0 = EE, 1 = IOP
+//
+// Trace priority on same cycle: EE write > IOP write > EE read > IOP read.
+// In practice TBs drive at most one operation per cycle.
+
+`timescale 1ns/1ps
+
+module sif_mailbox_stub
+    import trace_pkg::*;
+(
+    input  logic          clk,
+    input  logic          rst_n,
+
+    // EE-side register port
+    input  logic          ee_wr_en,
+    input  logic          ee_rd_en,
+    input  logic [7:0]    ee_addr,
+    input  logic [31:0]   ee_wr_data,
+    output logic [31:0]   ee_rd_data,
+    output logic          ee_rd_valid,
+
+    // IOP-side register port
+    input  logic          iop_wr_en,
+    input  logic          iop_rd_en,
+    input  logic [7:0]    iop_addr,
+    input  logic [31:0]   iop_wr_data,
+    output logic [31:0]   iop_rd_data,
+    output logic          iop_rd_valid,
+
+    // Trace
+    output logic          ev_valid,
+    output subsys_e       ev_subsys,
+    output event_e        ev_event,
+    output logic [63:0]   ev_arg0,
+    output logic [63:0]   ev_arg1,
+    output logic [63:0]   ev_arg2,
+    output logic [63:0]   ev_arg3,
+    output logic [31:0]   ev_flags
+);
+
+    // Offsets of the four registers inside the SIF block.
+    localparam logic [7:0] MSCOM_OFF = 8'h00;
+    localparam logic [7:0] SMCOM_OFF = 8'h10;
+    localparam logic [7:0] MSFLG_OFF = 8'h20;
+    localparam logic [7:0] SMFLG_OFF = 8'h30;
+
+    // side_id encodings reported in trace arg2.
+    localparam logic [63:0] SIDE_EE  = 64'd0;
+    localparam logic [63:0] SIDE_IOP = 64'd1;
+
+    // ------------------------------------------------------------------
+    // Storage and offset decode
+    // ------------------------------------------------------------------
+
+    logic [31:0] mscom;
+    logic [31:0] smcom;
+    logic [31:0] msflg;
+    logic [31:0] smflg;
+
+    // Returns the register value selected by `offset`; any offset other
+    // than the four mapped ones yields the 32'hDEAD_BEEF sentinel.
+    function automatic logic [31:0] select_reg(input logic [7:0] offset,
+                                               input logic [31:0] mscom_v,
+                                               input logic [31:0] smcom_v,
+                                               input logic [31:0] msflg_v,
+                                               input logic [31:0] smflg_v);
+        if      (offset == MSCOM_OFF) select_reg = mscom_v;
+        else if (offset == SMCOM_OFF) select_reg = smcom_v;
+        else if (offset == MSFLG_OFF) select_reg = msflg_v;
+        else if (offset == SMFLG_OFF) select_reg = smflg_v;
+        else                          select_reg = 32'hDEAD_BEEF;
+    endfunction
+
+    // Write strobes, one per (side, register) pair.
+    logic ee_hits_mscom, ee_hits_smcom, ee_hits_msflg, ee_hits_smflg;
+    logic iop_hits_mscom, iop_hits_smcom, iop_hits_msflg, iop_hits_smflg;
+
+    assign ee_hits_mscom = (ee_addr == MSCOM_OFF) && ee_wr_en;
+    assign ee_hits_smcom = (ee_addr == SMCOM_OFF) && ee_wr_en;
+    assign ee_hits_msflg = (ee_addr == MSFLG_OFF) && ee_wr_en;
+    assign ee_hits_smflg = (ee_addr == SMFLG_OFF) && ee_wr_en;
+
+    assign iop_hits_mscom = (iop_addr == MSCOM_OFF) && iop_wr_en;
+    assign iop_hits_smcom = (iop_addr == SMCOM_OFF) && iop_wr_en;
+    assign iop_hits_msflg = (iop_addr == MSFLG_OFF) && iop_wr_en;
+    assign iop_hits_smflg = (iop_addr == SMFLG_OFF) && iop_wr_en;
+
+    // Storage update. Each register takes the IOP write first and the EE
+    // write second; the later nonblocking assignment wins, so EE has
+    // priority on a same-register collision while writes to different
+    // registers land independently. Plain replace-on-write, no masking.
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            mscom <= 32'd0;
+            smcom <= 32'd0;
+            msflg <= 32'd0;
+            smflg <= 32'd0;
+        end else begin
+            if (iop_hits_mscom) mscom <= iop_wr_data;
+            if (ee_hits_mscom)  mscom <= ee_wr_data;
+
+            if (iop_hits_smcom) smcom <= iop_wr_data;
+            if (ee_hits_smcom)  smcom <= ee_wr_data;
+
+            if (iop_hits_msflg) msflg <= iop_wr_data;
+            if (ee_hits_msflg)  msflg <= ee_wr_data;
+
+            if (iop_hits_smflg) smflg <= iop_wr_data;
+            if (ee_hits_smflg)  smflg <= ee_wr_data;
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Read ports (independent; data and valid are registered, 1 cycle)
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            {ee_rd_valid, iop_rd_valid} <= 2'b00;
+            ee_rd_data  <= 32'd0;
+            iop_rd_data <= 32'd0;
+        end else begin
+            // Valid trails the enable by one clock; data holds when idle.
+            {ee_rd_valid, iop_rd_valid} <= {ee_rd_en, iop_rd_en};
+            if (ee_rd_en)
+                ee_rd_data <= select_reg(ee_addr, mscom, smcom, msflg, smflg);
+            if (iop_rd_en)
+                iop_rd_data <= select_reg(iop_addr, mscom, smcom, msflg, smflg);
+        end
+    end
+
+    // ------------------------------------------------------------------
+    // Trace emission, one event per cycle: EE_wr > IOP_wr > EE_rd > IOP_rd.
+    // Read events carry the same pre-write register value that the read
+    // port registers on this edge, so the trace line matches rd_data.
+    // ------------------------------------------------------------------
+
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            ev_valid  <= 1'b0;
+            ev_subsys <= SUBSYS_SIF;
+            ev_event  <= EV_READ;
+            ev_arg0   <= 64'd0;
+            ev_arg1   <= 64'd0;
+            ev_arg2   <= 64'd0;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+        end else if (ee_wr_en || iop_wr_en) begin
+            // Write event; fields common to both sides first.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_SIF;
+            ev_event  <= EV_WRITE;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'h0000_0001;
+            if (ee_wr_en) begin
+                ev_arg0 <= {56'd0, ee_addr};
+                ev_arg1 <= {32'd0, ee_wr_data};
+                ev_arg2 <= SIDE_EE;
+            end else begin
+                ev_arg0 <= {56'd0, iop_addr};
+                ev_arg1 <= {32'd0, iop_wr_data};
+                ev_arg2 <= SIDE_IOP;
+            end
+        end else if (ee_rd_en || iop_rd_en) begin
+            // Read event; only reached when neither side is writing.
+            ev_valid  <= 1'b1;
+            ev_subsys <= SUBSYS_SIF;
+            ev_event  <= EV_READ;
+            ev_arg3   <= 64'd0;
+            ev_flags  <= 32'd0;
+            if (ee_rd_en) begin
+                ev_arg0 <= {56'd0, ee_addr};
+                ev_arg1 <= {32'd0, select_reg(ee_addr, mscom, smcom, msflg, smflg)};
+                ev_arg2 <= SIDE_EE;
+            end else begin
+                ev_arg0 <= {56'd0, iop_addr};
+                ev_arg1 <= {32'd0, select_reg(iop_addr, mscom, smcom, msflg, smflg)};
+                ev_arg2 <= SIDE_IOP;
+            end
+        end else begin
+            // Idle: drop valid only; payload fields keep their last value.
+            ev_valid <= 1'b0;
+        end
+    end
+
+endmodule : sif_mailbox_stub
@@ -0,0 +1,45 @@
+// retroDE_ps2 — de25_nano_pll_stub (Ch151)
+//
+// Sim-friendly stub matching the Quartus IOPLL "pll" module signature
+// used by sibling cores (retroDE_nes/ip/pll/pll_bb.v and
+// retroDE_splash/ip/sys_pll/sys_pll_bb.v). Real synthesis swaps this
+// stub for Terasic-supplied IP via Quartus's IP catalog and a
+// `\`ifdef USE_PLL_IP` gate in the board top.
+//
+// Behavior:
+//   - `outclk_0` is a direct pass-through of `refclk` (no PLL
+//     multiplication; sim doesn't need a different frequency, and a
+//     pass-through still exercises the PLL-gated reset bridge in the
+//     Ch149 board top).
+//   - `locked` rises after a small post-reset delay (~32 cycles),
+//     mimicking real-IP behavior where lock acquires after rst goes
+//     low. Held LOW while `rst` is HIGH.
+//
+// The signature matches Quartus's IOPLL exactly so swapping in the
+// real IP is a single `\`ifdef` at instantiation; the rest of the
+// board top is unchanged.
+
+`timescale 1ns/1ps
+
+module de25_nano_pll_stub (
+    input  wire  refclk,   // reference clock from CLOCK2_50
+    input  wire  rst,      // active-HIGH async reset (Quartus convention)
+    output wire  outclk_0, // pass-through of refclk
+    output wire  locked    // high once "lock" is acquired
+);
+
+    // Count at which the stub declares lock; the counter saturates here.
+    localparam logic [5:0] LOCK_CYCLES = 6'd32;
+    logic [5:0]            lock_cnt;
+
+    always_ff @(posedge refclk or posedge rst) begin
+        if (rst)
+            lock_cnt <= 6'd0;                 // rst high: counter held at 0
+        else if (lock_cnt < LOCK_CYCLES)
+            lock_cnt <= lock_cnt + 6'd1;      // count refclk edges until lock
+    end
+
+    assign outclk_0 = refclk;                 // no frequency synthesis in sim
+    assign locked   = (lock_cnt == LOCK_CYCLES);
+
+endmodule : de25_nano_pll_stub
@@ -0,0 +1,666 @@
+// retroDE_ps2 — top_psmct32_raster_demo (Ch146)
+//
+// First hardware-targeted top wrapper, structured around the Ch123 PSMCT32
+// raster end-to-end demo (the simplest direct-color path; see the Ch144
+// hardware-readiness report in docs/contracts/gif_gs.md for rationale and
+// dep-tree audit). This module is the one a board-level synthesis project
+// would target — board-level concerns (HDMI/VGA PHY, pin constraints,
+// .mem bake tooling, clock-domain crossings) are deliberately deferred to
+// later chapters. Ch146's job is to prove the design can be expressed as
+// a single SystemVerilog module with a sensible top-level shape.
+//
+// Topology mirrors the Ch123 TB exactly — the 11 modules in the Ch144
+// dep tree, all instantiated here, with hardware-friendly tweaks:
+//
+//   bios_rom_stub#(.IMAGE_FILE(BIOS_IMAGE_FILE))                — EE bootlet at 0xBFC0_0000
+//   ee_ram_stub#(.IMAGE_FILE(PAYLOAD_IMAGE_FILE))               — GIF payload at phys 0x100
+//   ee_memory_map_stub#(.USEG_SHADOW_WORDS_PARAM(1024))         — Ch145 BRAM shrink
+//   ee_core_stub#(.PC_RESET(0xBFC00000))                        — MIPS R5900 core
+//   ee_gs_priv_bridge_stub                                      — 32-bit MMIO → 64-bit GS-priv
+//   dmac_reg_stub                                               — DMAC ch2
+//   gif_packed_stub#(.REAL_AD_REG_MAP(1'b1))                    — GIFtag + PACKED A+D parser
+//   gs_stub#(.PSMCT32_SWIZZLE(1'b1))                            — GS register file + raster
+//   gif_image_xfer_stub#(.PSMCT32_SWIZZLE(1'b1))                — TRXDIR/IMAGE engine (idle in Ch123)
+//   vram_stub#(.BYTES(8192))                                    — 8 KiB VRAM (one PSMCT32 page)
+//   gs_pcrtc_stub#(.PSMCT32_SWIZZLE(1'b1))                      — PCRTC scanout
+//
+// Differences from the Ch123 TB:
+//   - No procedural ee_prog_word() / preload_qword() drives. The BIOS
+//     bootlet and GIF payload are preloaded by `$readmemh` from the
+//     IMAGE_FILE parameters (default empty = synthetic NOP-sled fallback
+//     in bios_rom_stub for a "won't crash on power-up" smoke baseline,
+//     and an all-zeros ee_ram_stub which yields no DMAC payload but a
+//     stable PCRTC frame).
+//   - useg_shadow_mem trimmed to 1024 words (4 KiB) via Ch145
+//     parameter — no useg traffic in the Ch123 data plane.
+//   - All trace event outputs left open. Status is exposed as a
+//     debug bundle (core_halt, dma_done_seen, frame_seen) that a
+//     board can wire to LEDs.
+//   - The Ch123 TB's collision-check `$error` and observer counters
+//     are TB-only and do not appear here. (Their checks land in the
+//     focused Ch146 TB tb_top_psmct32_raster_demo.sv instead.)
+//
+// Top-level ports:
+//   clk, rst_n          — single clock domain, active-low synchronous reset
+//   core_go             — pulsed high for one cycle to start the EE bootlet
+//                         (a board reset-release sequencer can tie it high
+//                         after rst_n deasserts)
+//   r/g/b, hsync, vsync, de  — 8-bit RGB scanout (PCRTC active region)
+//   core_halt           — high once SYSCALL halts the EE
+//   dma_done_seen       — sticky: high once DMAC channel-2 fires its DONE event
+//   frame_seen          — sticky: high once the PCRTC end-of-frame event has fired for one full frame
+//
+// Parameters:
+//   H_ACTIVE / V_ACTIVE — PCRTC active region (defaults to the Ch123 16×8)
+//   BIOS_SIZE_BYTES     — bios_rom_stub size (default 4 KiB)
+//   RAM_SIZE_BYTES      — ee_ram_stub size (default 4 KiB)
+//   VRAM_BYTES          — vram_stub size (default 8 KiB)
+//   USEG_SHADOW_WORDS_PARAM — Ch145 useg-shadow size (default 1024 = 4 KiB)
+//
+// Macros (NOT parameters — an iverilog-12 string-parameter forwarding
+// limitation forced them to be macros; see the ifndef/define block
+// just below the timescale directive):
+//   TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE     — path to bios.mem
+//                                                 (one 32-bit hex word/line)
+//   TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE  — path to payload.mem
+//                                                 (one 128-bit hex qword/line)
+// Both default to "" so the wrapper is still elaborable without
+// fixtures (synthetic NOP-sled in bios_rom_stub + zero-init
+// ee_ram_stub, which produces no DMAC payload but a stable PCRTC
+// frame). On synthesis these become FPGA-tool defines.
+//
+// PASS for the Ch146 focused TB matches Ch123 exactly:
+//   dma=(1,24,1) ee_dmac_wr=3 giftags=4 ad_writes=20 xfer_writes=0
+//   ee_priv_wr=4 bridge_fires=4 core_halt=1 emits=128 frame=16x8
+
+`timescale 1ns/1ps
+
+// BIOS / payload image paths are passed via macros (iverilog-12
+// limitation: string parameter forwarding through hierarchy
+// elaborates inconsistently). On synthesis the same macros become
+// FPGA-tool defines pointing at .mem fixtures or board-specific
+// files. The macros default to empty strings (synthetic NOP-sled +
+// zero-RAM fallback in bios_rom_stub / ee_ram_stub) so the wrapper
+// is still elaborable without bake artifacts present.
+`ifndef TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE
+  `define TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE ""
+`endif  // TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE: "" unless defined before this point
+`ifndef TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE
+  `define TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE ""
+`endif  // TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE: "" unless defined before this point
+
+module top_psmct32_raster_demo
+    import trace_pkg::*;
+#(
+    parameter int     H_ACTIVE               = 16,
+    parameter int     V_ACTIVE               = 8,
+    parameter int     BIOS_SIZE_BYTES        = 4 * 1024,
+    parameter int     RAM_SIZE_BYTES         = 4 * 1024,
+    parameter int     VRAM_BYTES             = 8 * 1024,
+    parameter int unsigned USEG_SHADOW_WORDS_PARAM = 1024,
+    // Brick 1 — PSMCT32 page/block swizzle gate. Default 1 preserves
+    // the Ch123/Ch251 swizzled raster+scanout behavior (and every
+    // existing TB that drives this top). A TEXTURED-sprite demo
+    // fixture sets this to 0 so the linear gs_texel_addr fetch and the
+    // BITBLT upload land in the SAME (linear) VRAM layout — avoiding
+    // the swizzle reconciliation the gs_stub TODO flags. The gate is
+    // forwarded to gs_stub / gif_image_xfer_stub / gs_pcrtc_stub
+    // together so all three VRAM views stay consistent.
+    parameter bit     PSMCT32_SWIZZLE        = 1'b1
+) (
+    input  logic        clk,
+    input  logic        rst_n,
+    input  logic        core_go,
+
+    output logic [7:0]  r,
+    output logic [7:0]  g,
+    output logic [7:0]  b,
+    output logic        hsync,
+    output logic        vsync,
+    output logic        de,
+
+    output logic        core_halt,
+    output logic        dma_done_seen,
+    output logic        frame_seen,
+    output logic        raster_overflow,
+    // Ch174 — event toggles for HPS-visible counters. See the
+    // mirror block in top_psmct32_raster_demo_bram.sv for the full
+    // pulse-CDC contract. Toggle, not pulse — by design.
+    output logic        frame_toggle,
+    output logic        dma_done_toggle
+);
+
+    localparam int RAM_ADDR_W  = $clog2(RAM_SIZE_BYTES);
+    localparam int BIOS_ADDR_W = $clog2(BIOS_SIZE_BYTES);
+
+    // ---------------------------------------------------------------------
+    // ee_ram_stub — DMAC-side GIF payload
+    // ---------------------------------------------------------------------
+    logic                  ram_rd_en;
+    logic [RAM_ADDR_W-1:0] ram_rd_addr;
+    logic [127:0]          ram_rd_data;
+    logic                  ram_rd_valid;
+    // Top has no TB-direct write path; the wr_* ports are tied off.
+    logic [7:0]            ram_master_id;
+    assign ram_master_id = ram_rd_en ? 8'd1 : 8'd0;
+
+    ee_ram_stub #(
+        .SIZE_BYTES(RAM_SIZE_BYTES),
+        .IMAGE_FILE(`TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE)
+    ) u_ram (
+        .clk(clk), .rst_n(rst_n),
+        .rd_en(ram_rd_en), .rd_addr(ram_rd_addr),
+        .rd_data(ram_rd_data), .rd_valid(ram_rd_valid),
+        .wr_en(1'b0), .wr_addr('0), .wr_data(128'd0), .wr_be(16'd0),
+        .master_id(ram_master_id),
+        .ev_valid(), .ev_subsys(), .ev_event(),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    // ---------------------------------------------------------------------
+    // bios_rom_stub — EE bootlet at 0xBFC0_0000
+    // ---------------------------------------------------------------------
+    logic                   bios_rd_en;
+    logic [21:0]            bios_rd_addr_full;
+    logic [BIOS_ADDR_W-1:0] bios_rd_addr;
+    logic                   bios_rd_valid;
+    logic [31:0]            bios_rd_data;
+    assign bios_rd_addr = bios_rd_addr_full[BIOS_ADDR_W-1:0];
+
+    bios_rom_stub #(
+        .SIZE_BYTES(BIOS_SIZE_BYTES),
+        .IMAGE_FILE(`TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE)
+    ) u_bios (
+        .clk(clk), .rst_n(rst_n),
+        .rd_en(bios_rd_en),
+        .rd_addr(bios_rd_addr),
+        .rd_data(bios_rd_data),
+        .rd_valid(bios_rd_valid),
+        .ev_valid(), .ev_subsys(), .ev_event(),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    // ---------------------------------------------------------------------
+    // dmac_reg_stub — channel-2 NORMAL transfer
+    // ---------------------------------------------------------------------
+    logic        dmac_reg_wr_en;
+    logic [7:0]  dmac_reg_offset;
+    logic [31:0] dmac_reg_wr_data;
+    logic        dmac_mem_rd_en;
+    logic [31:0] dmac_mem_rd_addr;
+    logic         dmac_gif_valid;
+    logic [127:0] dmac_gif_data;
+    logic         dmac_gif_last;
+    logic         dmac_gif_ready;
+
+    logic                dmac_ev_valid;
+    subsys_e             dmac_ev_subsys;
+    event_e              dmac_ev_event;
+
+    logic [127:0] map_to_dmac_rd_data;
+    logic         map_to_dmac_rd_valid;
+
+    dmac_reg_stub u_dmac (
+        .clk(clk), .rst_n(rst_n),
+        .reg_wr_en(dmac_reg_wr_en), .reg_offset(dmac_reg_offset),
+        .reg_wr_data(dmac_reg_wr_data),
+        .reg_rd_en(1'b0), .reg_rd_data(), .reg_rd_valid(),
+        .mem_rd_en(dmac_mem_rd_en), .mem_rd_addr(dmac_mem_rd_addr),
+        .mem_rd_data(map_to_dmac_rd_data), .mem_rd_valid(map_to_dmac_rd_valid),
+        .ep_valid(dmac_gif_valid), .ep_data(dmac_gif_data),
+        .ep_last(dmac_gif_last), .ep_ready(dmac_gif_ready),
+        .irq_completion_o(),
+        .ev_valid(dmac_ev_valid), .ev_subsys(dmac_ev_subsys),
+        .ev_event(dmac_ev_event),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    // ---------------------------------------------------------------------
+    // ee_memory_map_stub — bus arbiter (USEG_SHADOW shrunk per Ch145)
+    // ---------------------------------------------------------------------
+    logic        ee_cpu_rd_en;
+    logic [31:0] ee_cpu_rd_addr;
+    logic [31:0] ee_cpu_rd_data;
+    logic        ee_cpu_rd_valid;
+    logic        ee_cpu_wr_en;
+    logic [31:0] ee_cpu_wr_addr;
+    logic [31:0] ee_cpu_wr_data;
+    logic [3:0]  ee_cpu_wr_be;
+
+    logic        map_gs_priv_wr_en;
+    logic [15:0] map_gs_priv_wr_addr;
+    logic [31:0] map_gs_priv_wr_data;
+    logic [3:0]  map_gs_priv_wr_be;
+
+    logic         map_ram_rd_en;
+    logic [24:0]  map_ram_rd_addr;
+
+    ee_memory_map_stub #(
+        .USEG_SHADOW_WORDS_PARAM(USEG_SHADOW_WORDS_PARAM)
+    ) u_map (
+        .clk(clk), .rst_n(rst_n),
+        .ee_rd_en  (ee_cpu_rd_en),
+        .ee_rd_addr(ee_cpu_rd_addr),
+        .ee_rd_data(ee_cpu_rd_data),
+        .ee_rd_valid(ee_cpu_rd_valid),
+        .ee_wr_en  (ee_cpu_wr_en),
+        .ee_wr_addr(ee_cpu_wr_addr),
+        .ee_wr_data(ee_cpu_wr_data),
+        .ee_wr_be  (ee_cpu_wr_be),
+        .dmac_rd_en(dmac_mem_rd_en), .dmac_rd_addr(dmac_mem_rd_addr),
+        .dmac_rd_data(map_to_dmac_rd_data),
+        .dmac_rd_valid(map_to_dmac_rd_valid),
+        .bios_rd_en  (bios_rd_en),
+        .bios_rd_addr(bios_rd_addr_full),
+        .bios_rd_data(bios_rd_data),
+        .bios_rd_valid(bios_rd_valid),
+        .ram_rd_en(map_ram_rd_en), .ram_rd_addr(map_ram_rd_addr),
+        .ram_rd_data(ram_rd_data), .ram_rd_valid(ram_rd_valid),
+        .bridge_wr_en(1'b0), .bridge_wr_addr(32'd0),
+        .bridge_wr_data(128'd0), .bridge_wr_be(16'd0),
+        .bridge_master_id(8'd0),
+        .ram_wr_en(), .ram_wr_addr(), .ram_wr_data(),
+        .ram_wr_be(), .ram_master_id(),
+        .ee_dmac_ch2_wr_en  (dmac_reg_wr_en),
+        .ee_dmac_ch2_wr_addr(dmac_reg_offset),
+        .ee_dmac_ch2_wr_data(dmac_reg_wr_data),
+        .ee_dmac_ch2_rd_en(), .ee_dmac_ch2_rd_addr(),
+        .ee_dmac_ch2_rd_data(32'd0), .ee_dmac_ch2_rd_valid(1'b0),
+        .ee_intc_wr_en(), .ee_intc_wr_addr(), .ee_intc_wr_data(),
+        .ee_intc_rd_en(), .ee_intc_rd_addr(),
+        .ee_intc_rd_data(32'd0), .ee_intc_rd_valid(1'b0),
+        .ee_misc_mmio_wr_en(), .ee_misc_mmio_wr_addr(), .ee_misc_mmio_wr_data(), .ee_misc_mmio_wr_be(),
+        .ee_misc_mmio_rd_en(), .ee_misc_mmio_rd_addr(),
+        .ee_misc_mmio_rd_data(32'd0), .ee_misc_mmio_rd_valid(1'b0),
+        .ee_biu_wr_en(), .ee_biu_wr_addr(), .ee_biu_wr_data(), .ee_biu_wr_be(),
+        .ee_biu_rd_en(), .ee_biu_rd_addr(),
+        .ee_biu_rd_data(32'd0), .ee_biu_rd_valid(1'b0),
+        .ee_gs_priv_wr_en  (map_gs_priv_wr_en),
+        .ee_gs_priv_wr_addr(map_gs_priv_wr_addr),
+        .ee_gs_priv_wr_data(map_gs_priv_wr_data),
+        .ee_gs_priv_wr_be  (map_gs_priv_wr_be),
+        .ev_valid(), .ev_subsys(), .ev_event(),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    assign ram_rd_en   = map_ram_rd_en;
+    assign ram_rd_addr = map_ram_rd_addr[RAM_ADDR_W-1:0];
+
+    // ---------------------------------------------------------------------
+    // ee_core_stub
+    // ---------------------------------------------------------------------
+    logic [31:0] core_pc;
+    logic        core_trap;
+
+    ee_core_stub #(
+        .PC_RESET(32'hBFC0_0000),
+        .STRICT_UNSUPPORTED(1'b0)
+    ) u_core (
+        .clk(clk), .rst_n(rst_n),
+        .go_i(core_go),
+        .map_rd_en (ee_cpu_rd_en),
+        .map_rd_addr(ee_cpu_rd_addr),
+        .map_rd_data(ee_cpu_rd_data),
+        .map_rd_valid(ee_cpu_rd_valid),
+        .map_wr_en (ee_cpu_wr_en),
+        .map_wr_addr(ee_cpu_wr_addr),
+        .map_wr_data(ee_cpu_wr_data),
+        .map_wr_be (ee_cpu_wr_be),
+        .cpu_irq(1'b0),
+        .halt_o(core_halt),
+        .pc_o  (core_pc),
+        .trap_o(core_trap),
+        .trap_pc_o(),
+        .trap_instr_o(),
+        .ev_valid(), .ev_subsys(), .ev_event(),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    // ---------------------------------------------------------------------
+    // gif_packed_stub
+    // ---------------------------------------------------------------------
+    logic         gif_in_ready;
+    logic [7:0]   gif_gif_reg_num;
+    logic         gif_gif_reg_wr_en;
+    logic [63:0]  gif_gif_reg_data;
+    logic         gif_image_data_valid;
+    logic [127:0] gif_image_data;
+    logic         gif_image_data_last;
+    logic         xfer_data_ready;
+    // Ch172 — raster FIFO full from gs_stub, fed back into gif_packed_stub.
+    logic         gs_raster_fifo_full;
+
+    gif_packed_stub #(.REAL_AD_REG_MAP(1'b1)) u_gif (
+        .clk(clk), .rst_n(rst_n),
+        .in_valid(dmac_gif_valid), .in_data(dmac_gif_data),
+        .in_last(dmac_gif_last), .in_ready(gif_in_ready),
+        .image_data_valid(gif_image_data_valid),
+        .image_data(gif_image_data),
+        .image_data_last(gif_image_data_last),
+        .image_data_ready(xfer_data_ready),
+        .raster_fifo_full(gs_raster_fifo_full),
+        .gs_wr_en(), .gs_wr_addr(), .gs_wr_data(),
+        .gif_reg_wr_en(gif_gif_reg_wr_en),
+        .gif_reg_num(gif_gif_reg_num),
+        .gif_reg_data(gif_gif_reg_data),
+        .ev_valid(), .ev_subsys(), .ev_event(),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    // DMAC ready follows gif_packed_stub's in_ready directly (Ch110
+    // image-xfer backpressure propagates through gif_packed_stub).
+    assign dmac_gif_ready = gif_in_ready;
+
+    // ---------------------------------------------------------------------
+    // gs_stub — PSMCT32 raster, swizzled
+    // ---------------------------------------------------------------------
+    logic        priv_reg_wr_en;
+    logic [15:0] priv_reg_wr_addr;
+    logic [63:0] priv_reg_wr_data;
+
+    logic [63:0] pmode_q, dispfb1_q, display1_q;
+    logic [63:0] bitbltbuf_q, trxpos_q, trxreg_q, trxdir_q;
+    logic        trxdir_wr_q;
+
+    logic        raster_pixel_emit;
+    logic [63:0] raster_pixel_color_q;
+    logic [31:0] raster_pixel_fb_addr_q;
+    logic [3:0]  raster_pixel_be_q;
+    logic [31:0] raster_pixel_mask_q;
+
+    // Brick 1 — texture-sampler read port out of gs_stub. Wired to
+    // vram_stub's SECOND read port (read2) below. In this top there is
+    // no clut_loader_stub instantiated (clut_enable=0 at the PCRTC), so
+    // read2 is dedicated to the texel fetch; the mux contract (CLUT load
+    // at TEX0 commit vs texel fetch during scan) is documented at the
+    // read2 wiring site.
+    logic        gs_tex_rd_en;
+    logic [31:0] gs_tex_rd_addr;
+    logic [31:0] gs_tex_rd_data;
+
+    // Brick 2a — dest-framebuffer read port for alpha blending. Wired
+    // to vram_stub.read2 below, arbitrated with the texel-fetch port.
+    // vram_stub.read2 is COMBINATIONAL, so FB_RD_REGISTERED defaults to
+    // 0 (dest data valid the same cycle the S2 address is presented).
+    logic        gs_fb_rd_en;
+    logic [31:0] gs_fb_rd_addr;
+    logic [31:0] gs_fb_rd_data;
+
+    // Brick 2b — Z-buffer stored-Z read port. Wired to vram_stub.read2
+    // below, arbitrated with the texel-fetch + alpha dest-fb ports.
+    // vram_stub.read2 is COMBINATIONAL, so Z_RD_REGISTERED defaults to 0.
+    logic        gs_z_rd_en;
+    logic [31:0] gs_z_rd_addr;
+    logic [31:0] gs_z_rd_data;
+
+    gs_stub #(
+        .PSMCT32_SWIZZLE(PSMCT32_SWIZZLE)
+    ) u_gs (
+        .clk(clk), .rst_n(rst_n),
+        .reg_wr_en  (priv_reg_wr_en),
+        .reg_wr_addr(priv_reg_wr_addr),
+        .reg_wr_data(priv_reg_wr_data),
+        .gif_reg_wr_en(gif_gif_reg_wr_en),
+        .gif_reg_num  (gif_gif_reg_num),
+        .gif_reg_data (gif_gif_reg_data),
+        .bg_r(), .bg_g(), .bg_b(),
+        .pmode_q(pmode_q), .dispfb1_q(dispfb1_q), .display1_q(display1_q),
+        .prim_q(), .rgbaq_q(),
+        .xyz2_q(), .xyzf2_q(),
+        .frame_1_q(), .zbuf_1_q(),
+        .tex0_1_q(), .tex0_1_cbp_q(), .tex0_1_cpsm_q(),
+        .tex0_1_csm_q(), .tex0_1_csa_q(), .tex0_1_cld_q(), .tex0_1_wr_q(),
+        .bitbltbuf_q(bitbltbuf_q),
+        .trxpos_q(trxpos_q),
+        .trxreg_q(trxreg_q),
+        .trxdir_q(trxdir_q),
+        .trxdir_wr_q(trxdir_wr_q),
+        .prim_complete(), .prim_complete_count(),
+        .prim_v0_q(), .prim_v1_q(), .prim_v2_q(),
+        .prim_color_q(),
+        .prim_color_v0_q(), .prim_color_v1_q(), .prim_color_v2_q(),
+        .prim_v0_decoded_q(), .prim_v1_decoded_q(), .prim_v2_decoded_q(),
+        .prim_v0_color_decoded_q(), .prim_v1_color_decoded_q(), .prim_v2_color_decoded_q(),
+        .pixel_emit(), .pixel_emit_count(),
+        .pixel_x_q(), .pixel_y_q(),
+        .pixel_color_q(),
+        .pixel_fbp_q(), .pixel_fbw_q(), .pixel_psm_q(), .pixel_fb_addr_q(),
+        .raster_pixel_emit(raster_pixel_emit),
+        .raster_pixel_emit_count(),
+        .raster_pixel_x_q(), .raster_pixel_y_q(),
+        .raster_pixel_color_q(raster_pixel_color_q),
+        .raster_pixel_fb_addr_q(raster_pixel_fb_addr_q),
+        .raster_pixel_be_q(raster_pixel_be_q),
+        .raster_pixel_mask_q(raster_pixel_mask_q),
+        .raster_pixel_psm_q(),
+        .raster_active(),
+        .raster_overflow(raster_overflow),
+        .raster_fifo_full(gs_raster_fifo_full),
+        .raster_degenerate(),
+        .tex_rd_en  (gs_tex_rd_en),
+        .tex_rd_addr(gs_tex_rd_addr),
+        .tex_rd_data(gs_tex_rd_data),
+        // Ch296 — PSMCT32-only top: no CLUT instantiated, the PSMT8 index
+        // path is never selected (s1_tex_active gates it on PSM==0x13).
+        // Tie the lookup data to 0; leave the index output open.
+        .clut_rd_idx (),
+        .clut_rd_data(32'd0),
+        .clut_load_busy(1'b0),
+        .fb_rd_en  (gs_fb_rd_en),
+        .fb_rd_addr(gs_fb_rd_addr),
+        .fb_rd_data(gs_fb_rd_data),
+        .z_rd_en  (gs_z_rd_en),
+        .z_rd_addr(gs_z_rd_addr),
+        .z_rd_data(gs_z_rd_data),
+        .ev_valid(), .ev_subsys(), .ev_event(),
+        .ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
+    );
+
+    // ---------------------------------------------------------------------
+    // u_priv_bridge : ee_gs_priv_bridge_stub
+    //
+    // The ee_wr_* ports are tied to the map_gs_priv_wr_* signals; the
+    // gs_reg_wr_* ports are tied to the priv_reg_wr_* signals.
+    // ---------------------------------------------------------------------
+    ee_gs_priv_bridge_stub u_priv_bridge (
+        .clk            (clk),
+        .rst_n          (rst_n),
+
+        // EE-side write port
+        .ee_wr_en       (map_gs_priv_wr_en),
+        .ee_wr_addr     (map_gs_priv_wr_addr),
+        .ee_wr_data     (map_gs_priv_wr_data),
+        .ee_wr_be       (map_gs_priv_wr_be),
+
+        // GS-register-side write port
+        .gs_reg_wr_en   (priv_reg_wr_en),
+        .gs_reg_wr_addr (priv_reg_wr_addr),
+        .gs_reg_wr_data (priv_reg_wr_data)
+    );
+
+    // ---------------------------------------------------------------------
+    // u_xfer : gif_image_xfer_stub
+    //
+    // Idle in Ch123 (no TRXDIR/IMAGE traffic); kept instantiated for
+    // symmetry. A TRXDIR-driven Ch124 demo would make it load-bearing.
+    // Its vram_* ports and busy flag are wired to the xfer_* signals
+    // declared just below.
+    // ---------------------------------------------------------------------
+    logic        xfer_busy;
+    logic        xfer_we;
+    logic [3:0]  xfer_wbe;
+    logic [31:0] xfer_waddr;
+    logic [31:0] xfer_wdata;
+    logic [31:0] xfer_wmask;
+
+    gif_image_xfer_stub #(
+        .PSMCT32_SWIZZLE (PSMCT32_SWIZZLE)
+    ) u_xfer (
+        .clk             (clk),
+        .rst_n           (rst_n),
+
+        // transfer-setup register values
+        .trxdir_wr_pulse (trxdir_wr_q),
+        .trxdir          (trxdir_q),
+        .bitbltbuf       (bitbltbuf_q),
+        .trxpos          (trxpos_q),
+        .trxreg          (trxreg_q),
+
+        // data_* handshake, tied to the gif_image_* signals
+        .data_valid      (gif_image_data_valid),
+        .data_qword      (gif_image_data),
+        .data_last       (gif_image_data_last),
+        .data_ready      (xfer_data_ready),
+
+        // VRAM write side, consumed by the VRAM write mux
+        .vram_we         (xfer_we),
+        .vram_waddr      (xfer_waddr),
+        .vram_wdata      (xfer_wdata),
+        .vram_wbe        (xfer_wbe),
+        .vram_wmask      (xfer_wmask),
+        .busy            (xfer_busy)
+    );
+
+    // ---------------------------------------------------------------------
+    // VRAM write-port select, keyed on xfer_busy:
+    //   xfer_busy == 1 -> the xfer_* signals own the write port
+    //   xfer_busy == 0 -> the raster_pixel_* signals own the write port
+    //
+    // The two sources are sequenced: in Ch123 raster fills exclusively and
+    // xfer never fires. In a future TRXDIR variant the select still works,
+    // because the payload upload finishes before raster starts.
+    //
+    // NOTE(review): nothing in this block flags raster_pixel_emit arriving
+    // while xfer_busy is high; such a write would not reach vram_we_mux.
+    // ---------------------------------------------------------------------
+    logic vram_we_mux;
+    assign vram_we_mux = xfer_busy ? xfer_we : raster_pixel_emit;
+
+    logic [31:0] vram_waddr_mux;
+    assign vram_waddr_mux = xfer_busy ? xfer_waddr : raster_pixel_fb_addr_q;
+
+    // Only the low 32 bits of raster_pixel_color_q are written.
+    logic [31:0] vram_wdata_mux;
+    assign vram_wdata_mux = xfer_busy ? xfer_wdata : raster_pixel_color_q[31:0];
+
+    logic [3:0] vram_wbe_mux;
+    assign vram_wbe_mux = xfer_busy ? xfer_wbe : raster_pixel_be_q;
+
+    logic [31:0] vram_wmask_mux;
+    assign vram_wmask_mux = xfer_busy ? xfer_wmask : raster_pixel_mask_q;
+
+    // Primary VRAM read port: wired between u_vram.read_addr/read_data and
+    // u_pcrtc.vram_read_addr/vram_read_data.
+    logic [31:0] vram_raddr;
+    logic [31:0] vram_rdata;
+
+    // ---------------------------------------------------------------------
+    // Brick 1 — read2 (second VRAM read port) MUX.
+    //
+    // read2 is shared between two consumers that are sequenced in time:
+    //   - clut_loader_stub : VRAM→CLUT copy at TEX0 commit (BEFORE the
+    //                        raster scan). NOT instantiated in this top
+    //                        (PCRTC clut_enable=0), so it never drives
+    //                        read2 here.
+    //   - gs_stub texel fetch : during the raster SCAN, one read per
+    //                        inside pixel of a textured SPRITE.
+    // Because CLUT load completes before the raster scan begins, a simple
+    // gs_tex_rd_en select is collision-free. When a future variant adds
+    // clut_loader, extend this select: read2_addr = clut_active ?
+    // clut_rd_addr : gs_tex_rd_addr.
+    // vram_stub's read2 is COMBINATIONAL; gs_stub presents the address
+    // from a registered S1 stage and consumes tex_rd_data one cycle
+    // later, so the effective latency matches TEX_RD_LATENCY=1.
+    // ---------------------------------------------------------------------
+    //
+    // Brick 2a — THIRD potential read2 consumer: the alpha-blend
+    // dest-fb read (gs_fb_rd_en/gs_fb_rd_addr). A flat alpha-blended
+    // SPRITE never textures, so gs_tex_rd_en and gs_fb_rd_en are
+    // mutually exclusive (gs_stub.new_abe_active requires
+    // !close_tme_effective). The combinational read2_data is fanned out
+    // to both consumers; only the active one's address selects the mux.
+    //
+    // Brick 2b — FOURTH potential read2 consumer: the Z-buffer stored-Z
+    // read (gs_z_rd_en/gs_z_rd_addr). A flat Z-tested SPRITE never
+    // textures and never alpha-blends (gs_stub.new_zte_active requires
+    // !close_tme_effective && !new_abe_active), so the four read2
+    // consumers are mutually exclusive by feature.
+    logic [31:0] vram_read2_addr;  // read2 address mux output; drives u_vram.read2_addr
+    logic [31:0] vram_read2_data;  // word on u_vram.read2_data, shared by every read2 consumer
+    assign vram_read2_addr = gs_tex_rd_en ? gs_tex_rd_addr  // 1st priority: texel fetch
+                           : gs_fb_rd_en  ? gs_fb_rd_addr   // 2nd priority: alpha-blend dest-fb read
+                           : gs_z_rd_en   ? gs_z_rd_addr    // 3rd priority: stored-Z read
+                                          : 32'd0;          // no enable set: address 0 is presented
+    assign gs_tex_rd_data = vram_read2_data;  // same word fans out to all three consumers;
+    assign gs_fb_rd_data  = vram_read2_data;  // it is only meaningful to the consumer whose
+    assign gs_z_rd_data   = vram_read2_data;  // enable selected the address above
+
+    // synthesis translate_off
+    // Simulation-only guard for the read2 address mux: out of reset, report
+    // any clock edge on which two read2 enables are high together, since
+    // the priority select can forward only one address.
+    always_ff @(posedge clk) begin
+        if (rst_n) begin
+            // Brick 2a: texel fetch vs. alpha dest-fb read.
+            if (gs_tex_rd_en && gs_fb_rd_en) begin
+                $error("Brick2a: read2 overlap @%0t — texel fetch and alpha dest-fb read both active; one read is being dropped (must be mutually exclusive by texturing).",
+                       $time);
+            end
+            // Brick 2b: Z read vs. either of the other two consumers.
+            if (gs_z_rd_en && (gs_tex_rd_en || gs_fb_rd_en)) begin
+                $error("Brick2b: read2 overlap @%0t — Z-buffer read collides with another consumer; one read is being dropped (Z-tested flat sprite must be mutually exclusive with texel/alpha).",
+                       $time);
+            end
+        end
+    end
+    // synthesis translate_on
+
+    // u_vram : vram_stub with its BYTES parameter set from VRAM_BYTES.
+    // Wired to the muxed write port, the primary read port
+    // (vram_raddr/vram_rdata) and the shared read2 port.
+    vram_stub #(
+        .BYTES (VRAM_BYTES)
+    ) u_vram (
+        .clk        (clk),
+        .rst_n      (rst_n),
+
+        // write port, from the VRAM write mux
+        .write_en   (vram_we_mux),
+        .write_addr (vram_waddr_mux),
+        .write_data (vram_wdata_mux),
+        .write_be   (vram_wbe_mux),
+        .write_mask (vram_wmask_mux),
+
+        // primary read port
+        .read_addr  (vram_raddr),
+        .read_data  (vram_rdata),
+
+        // second read port, from the read2 address mux
+        .read2_addr (vram_read2_addr),
+        .read2_data (vram_read2_data)
+    );
+
+    // ---------------------------------------------------------------------
+    // u_pcrtc : gs_pcrtc_stub — PSMCT32 swizzled scanout
+    //
+    // H_ACTIVE/V_ACTIVE come from this module's parameters; the FRONT,
+    // SYNC and BACK parameters are all set to 1. The CLUT inputs are tied
+    // off (clut_enable = 0) and the ev_* trace ports are left open.
+    // ---------------------------------------------------------------------
+    logic end_of_frame;
+
+    gs_pcrtc_stub #(
+        .H_ACTIVE        (H_ACTIVE),
+        .H_FRONT         (1),
+        .H_SYNC          (1),
+        .H_BACK          (1),
+        .V_ACTIVE        (V_ACTIVE),
+        .V_FRONT         (1),
+        .V_SYNC          (1),
+        .V_BACK          (1),
+        .PSMCT32_SWIZZLE (PSMCT32_SWIZZLE)
+    ) u_pcrtc (
+        .clk            (clk),
+        .rst_n          (rst_n),
+
+        // display register values
+        .pmode_q        (pmode_q),
+        .dispfb1_q      (dispfb1_q),
+        .display1_q     (display1_q),
+
+        // primary VRAM read port
+        .vram_read_addr (vram_raddr),
+        .vram_read_data (vram_rdata),
+
+        // CLUT path tied off
+        .clut_enable    (1'b0),
+        .clut_csa       (5'd0),
+        .clut_read_idx  (),
+        .clut_read_data (32'd0),
+
+        // video out
+        .hsync          (hsync),
+        .vsync          (vsync),
+        .de             (de),
+        .r              (r),
+        .g              (g),
+        .b              (b),
+
+        // trace event ports, unconnected
+        .ev_valid       (),
+        .ev_subsys      (),
+        .ev_event       (),
+        .ev_arg0        (),
+        .ev_arg1        (),
+        .ev_arg2        (),
+        .ev_arg3        (),
+        .ev_flags       ()
+    );
+
+    // end_of_frame is not a port of gs_pcrtc_stub (the Ch123 TB reaches it
+    // through a hierarchical reference), so this wrapper derives an
+    // equivalent edge from vsync: end_of_frame is high while vsync is high
+    // and its registered copy from the previous clk edge is still low.
+    logic vsync_d;
+    always_ff @(posedge clk) begin
+        if (!rst_n) begin
+            vsync_d <= 1'b0;
+        end else begin
+            vsync_d <= vsync;
+        end
+    end
+    assign end_of_frame = vsync && !vsync_d;
+
+    // ---------------------------------------------------------------------
+    // Sticky status outputs.
+    //
+    // Each flag clears while rst_n is low and, once set, holds until the
+    // next reset:
+    //   dma_done_seen : set by a valid DMAC event equal to EV_DMA_DONE
+    //   frame_seen    : set by end_of_frame
+    // ---------------------------------------------------------------------
+    logic dma_done_seen_q;
+    always_ff @(posedge clk) begin
+        if (!rst_n)
+            dma_done_seen_q <= 1'b0;
+        else if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE))
+            dma_done_seen_q <= 1'b1;
+    end
+    assign dma_done_seen = dma_done_seen_q;
+
+    logic frame_seen_q;
+    always_ff @(posedge clk) begin
+        if (!rst_n)
+            frame_seen_q <= 1'b0;
+        else if (end_of_frame)
+            frame_seen_q <= 1'b1;
+    end
+    assign frame_seen = frame_seen_q;
+
+    // ---------------------------------------------------------------------
+    // Ch174 — event toggles for HPS-visible counters.
+    //
+    // Each toggle clears while rst_n is low and inverts once per
+    // qualifying clk edge:
+    //   frame_toggle    : inverts on end_of_frame
+    //   dma_done_toggle : inverts on a valid DMAC event equal to EV_DMA_DONE
+    // ---------------------------------------------------------------------
+    logic frame_toggle_q;
+    always_ff @(posedge clk) begin
+        if (!rst_n)
+            frame_toggle_q <= 1'b0;
+        else if (end_of_frame)
+            frame_toggle_q <= ~frame_toggle_q;
+    end
+    assign frame_toggle = frame_toggle_q;
+
+    logic dma_done_toggle_q;
+    always_ff @(posedge clk) begin
+        if (!rst_n)
+            dma_done_toggle_q <= 1'b0;
+        else if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE))
+            dma_done_toggle_q <= ~dma_done_toggle_q;
+    end
+    assign dma_done_toggle = dma_done_toggle_q;
+
+endmodule : top_psmct32_raster_demo
				`@@ -0,0 +1 @@`
				`/home/ubuntu/FPGA_Projects/retroDE_splash/rtl/platform/cp437_8x8.mem`