Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)

RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-29 20:10:50 -04:00
commit ec82764bef
2462 changed files with 2174303 additions and 0 deletions
+44
View File
@@ -0,0 +1,44 @@
# rtl/debug
Trace taps, observability modules, and first-class debug infrastructure.
This directory is intentionally first-class per the debug/validation strategy
in `docs/contracts/validation.md`. Nothing here is ornamental; stubs and real
blocks alike depend on it.
## Wave 1 contents
- `trace_pkg.sv` — shared types (`subsys_e`, `event_e`) and string renderers
used by all trace producers and `trace_sink_stub`.
- `trace_sink_stub.sv` — simulation-only text trace writer. One instance per
output file; each Wave 1 stub wires its event port to its own sink. See
`docs/decisions/0000-trace-format.md` for format rationale.
## Usage pattern for Wave 1 testbenches
```systemverilog
trace_sink_stub #(
.FILENAME ("ee.trace"),
.SINK_LABEL("ee_fetch")
) u_trace_ee (
.clk (clk),
.rst_n (rst_n),
.ev_valid (dut_ev_valid),
.ev_subsys(dut_ev_subsys),
.ev_event (dut_ev_event),
.ev_arg0 (dut_ev_arg0),
.ev_arg1 (dut_ev_arg1),
.ev_arg2 (dut_ev_arg2),
.ev_arg3 (dut_ev_arg3),
.ev_flags (dut_ev_flags)
);
```
## Notes
- Cycle counter is internal to the sink and advances on `clk` while `rst_n`
is high. Cross-clock correlation is a later-wave concern.
- `ev_flags == 0` renders as `-` in the trace line; any non-zero value is
printed as `0x` followed by 8 hex digits.
- Event codes are globally unique in Wave 1 (one shared `event_e` namespace,
not a separate code space per subsystem). Revisit if the namespace gets crowded.
+166
View File
@@ -0,0 +1,166 @@
// retroDE_ps2 — trace package
//
// Defines the shared trace vocabulary used by Wave 1 stubs. Kept small on
// purpose: subsystem IDs, event codes, and string renderers only. See
// docs/decisions/0000-trace-format.md and docs/stub_module_plan.md.
//
// Contract:
// trace line = cycle subsystem event arg0 arg1 arg2 arg3 flags
//
// Event codes are globally unique (not per-subsystem) in Wave 1 to keep the
// renderer trivial. Revisit if the namespace gets crowded.
`ifndef RETRODE_PS2_TRACE_PKG_SV
`define RETRODE_PS2_TRACE_PKG_SV
`timescale 1ns/1ps
package trace_pkg;

    // ------------------------------------------------------------------
    // Subsystem identifiers (4-bit). These are the values carried on
    // every trace producer's ev_subsys port and rendered by subsys_str().
    // ------------------------------------------------------------------
    typedef enum logic [3:0] {
        SUBSYS_EE    = 4'h0,
        SUBSYS_MEM   = 4'h1,
        SUBSYS_GS    = 4'h2,
        SUBSYS_INTC  = 4'h3,
        SUBSYS_SIF   = 4'h4,
        SUBSYS_DMAC  = 4'h5,
        SUBSYS_IOP   = 4'h6,
        SUBSYS_GIF   = 4'h7,
        SUBSYS_PLAT  = 4'h8,
        SUBSYS_OTHER = 4'hF
    } subsys_e;

    // ------------------------------------------------------------------
    // Event codes (8-bit). One flat namespace shared by all subsystems;
    // rendered by event_str().
    // ------------------------------------------------------------------
    typedef enum logic [7:0] {
        // Wave 1 base vocabulary.
        EV_RESET     = 8'h00,
        EV_IFETCH    = 8'h01,
        EV_READ      = 8'h02,
        EV_WRITE     = 8'h03,
        EV_UNMAPPED  = 8'h04,
        EV_IRQ       = 8'h05,
        EV_MODE      = 8'h06,
        EV_BGCOLOR   = 8'h07,

        // Wave 2 additions: DMAC / GIF / GS write-path visibility
        // (see docs/wave2_dma_gif_plan.md).
        EV_DMA_CFG   = 8'h08,
        EV_DMA_START = 8'h09,
        EV_DMA_BEAT  = 8'h0A,
        EV_DMA_DONE  = 8'h0B,
        EV_GIFTAG    = 8'h0C,
        EV_GS_WRITE  = 8'h0D,

        // Ch76: GS primitive-observer "primitive complete" pulse.
        // Fired by gs_stub when an XYZ2/XYZF2 vertex commit closes a
        // discrete primitive (POINT / LINE / TRI / SPRITE) per the
        // currently-latched PRIM[2:0].
        //   arg0 = prim_type
        //   arg1 = vert threshold
        //   arg2 = cumulative prim count after this draw
        //   arg3 = closing vertex data
        // No rasterization yet.
        EV_PRIM_DRAW = 8'h0E,

        EV_OTHER     = 8'hFF
    } event_e;

    // ------------------------------------------------------------------
    // Ch81: structured GIF/GS field decoders
    //
    // Raw 64-bit XYZ2 / XYZF2 / RGBAQ payloads are an awkward contract
    // for the next pipeline layer (rasterizer or pixel emit). The struct
    // types below carry the same data already split into components so
    // that a consumer does not have to re-derive the bit slices.
    //
    // Payload layouts:
    //   XYZ2  (PCSX2 GSRegs.h): [15:0]=X, [31:16]=Y, [63:32]=Z (32-bit)
    //   XYZF2 (PCSX2 GSRegs.h): [15:0]=X, [31:16]=Y, [55:32]=Z (24-bit),
    //                           [63:56]=F (fog byte)
    //   RGBAQ                 : [7:0]=R, [15:8]=G, [23:16]=B, [31:24]=A,
    //                           [63:32]=Q (float)
    //
    // X and Y are PS2 12.4 fixed-point screen coordinates (top 12 bits =
    // integer pixel, low 4 bits = sub-pixel). Z is opaque to this
    // package: depth interpretation depends on the GS framebuffer /
    // zbuffer config, which the recognition layer does not model. Q is
    // the texture-coordinate divisor (IEEE single-precision float) and
    // is carried verbatim.
    //
    // is_xyzf2 records the source format so a consumer can tell the
    // 24-bit-Z + 8-bit-fog packing apart from the full 32-bit-Z packing
    // without re-reading the original register number.
    // ------------------------------------------------------------------
    typedef struct packed {
        logic        is_xyzf2;  // 1 = XYZF2 source, 0 = XYZ2
        logic [7:0]  fog;       // fog byte when is_xyzf2, otherwise 0
        logic [31:0] z;         // 32-bit (XYZ2) or zero-extended 24-bit (XYZF2)
        logic [15:0] y;         // 12.4 fixed-point screen Y
        logic [15:0] x;         // 12.4 fixed-point screen X
    } vertex_t;

    typedef struct packed {
        logic [31:0] q;         // texture-coord divisor (IEEE float), verbatim
        logic [7:0]  a;
        logic [7:0]  b;
        logic [7:0]  g;
        logic [7:0]  r;
    } color_t;

    // Unpack a 64-bit XYZ2 / XYZF2 payload into a vertex_t.
    //   data     : raw register payload
    //   is_xyzf2 : 1 selects the XYZF2 layout (24-bit Z + fog byte),
    //              anything else selects the XYZ2 layout (32-bit Z)
    function automatic vertex_t decode_vertex(input logic [63:0] data,
                                              input logic        is_xyzf2);
        vertex_t vtx;

        // Fields common to both layouts, plus the XYZ2 interpretation of
        // the upper word as the starting point.
        vtx.is_xyzf2 = is_xyzf2;
        vtx.x        = data[15:0];
        vtx.y        = data[31:16];
        vtx.z        = data[63:32];
        vtx.fog      = 8'd0;

        // XYZF2 override: top byte is fog, Z is only 24 bits wide.
        if (is_xyzf2) begin
            vtx.fog = data[63:56];
            vtx.z   = {8'd0, data[55:32]};
        end

        return vtx;
    endfunction

    // Unpack a 64-bit RGBAQ payload into a color_t (pure bit slicing).
    function automatic color_t decode_color(input logic [63:0] data);
        color_t rgbaq;
        rgbaq = '{q: data[63:32],
                  a: data[31:24],
                  b: data[23:16],
                  g: data[15:8],
                  r: data[7:0]};
        return rgbaq;
    endfunction

    // Render a subsystem id as its trace-file token. Any value without a
    // dedicated token (including SUBSYS_OTHER) renders as "OTHER".
    function automatic string subsys_str(input subsys_e s);
        string token;
        case (s)
            SUBSYS_EE:   token = "EE";
            SUBSYS_MEM:  token = "MEM";
            SUBSYS_GS:   token = "GS";
            SUBSYS_INTC: token = "INTC";
            SUBSYS_SIF:  token = "SIF";
            SUBSYS_DMAC: token = "DMAC";
            SUBSYS_IOP:  token = "IOP";
            SUBSYS_GIF:  token = "GIF";
            SUBSYS_PLAT: token = "PLAT";
            default:     token = "OTHER";
        endcase
        return token;
    endfunction

    // Render an event code as its trace-file token. Any value without a
    // dedicated token (including EV_OTHER) renders as "OTHER".
    function automatic string event_str(input event_e e);
        string token;
        case (e)
            EV_RESET:     token = "RESET";
            EV_IFETCH:    token = "IFETCH";
            EV_READ:      token = "READ";
            EV_WRITE:     token = "WRITE";
            EV_UNMAPPED:  token = "UNMAPPED";
            EV_IRQ:       token = "IRQ";
            EV_MODE:      token = "MODE";
            EV_BGCOLOR:   token = "BGCOLOR";
            EV_DMA_CFG:   token = "DMA_CFG";
            EV_DMA_START: token = "DMA_START";
            EV_DMA_BEAT:  token = "DMA_BEAT";
            EV_DMA_DONE:  token = "DMA_DONE";
            EV_GIFTAG:    token = "GIFTAG";
            EV_GS_WRITE:  token = "GS_WRITE";
            EV_PRIM_DRAW: token = "PRIM_DRAW";
            default:      token = "OTHER";
        endcase
        return token;
    endfunction

endpackage : trace_pkg
`endif // RETRODE_PS2_TRACE_PKG_SV
+88
View File
@@ -0,0 +1,88 @@
// retroDE_ps2 — trace_sink_stub
//
// Simulation-only text trace writer for Wave 1 stubs.
//
// Purpose, owns, success condition, replacement path: see
// docs/stub_module_plan.md (Wave 1, item 1)
// docs/contracts/validation.md
// docs/decisions/0000-trace-format.md
//
// Interface shape:
// - One sink instance per output file. A testbench instantiates multiple
// sinks (one per stub under test) and wires each stub's event port to
// its own sink. Offline tooling merges files by cycle when needed.
// - The cycle counter is internal and advances on clk while rst_n is high.
// Multi-clock-domain correlation is a later-wave concern.
//
// Line format (docs/decisions/0000):
// cycle subsys event arg0 arg1 arg2 arg3 flags
// where flags is rendered as `-` when zero.
`timescale 1ns/1ps
// Simulation-only trace writer: opens FILENAME at time 0, writes a two-line
// header, then appends one text line per cycle in which ev_valid is high
// while rst_n is high.
//
// Parameters:
//   FILENAME       - output file path, opened in "w" mode
//   SCHEMA_VERSION - integer printed in the header line
//   SINK_LABEL     - label printed in the header line and in the fatal
//                    message if the file cannot be opened
//
// Ports:
//   clk / rst_n    - cycle counter clock and active-low synchronous reset
//   ev_*           - event port; sampled on posedge clk when ev_valid is 1
module trace_sink_stub
    import trace_pkg::*;
#(
    parameter string FILENAME       = "trace.txt",
    parameter int    SCHEMA_VERSION = 1,
    parameter string SINK_LABEL     = "trace"
) (
    input  logic        clk,
    input  logic        rst_n,
    input  logic        ev_valid,
    input  subsys_e     ev_subsys,
    input  event_e      ev_event,
    input  logic [63:0] ev_arg0,
    input  logic [63:0] ev_arg1,
    input  logic [63:0] ev_arg2,
    input  logic [63:0] ev_arg3,
    input  logic [31:0] ev_flags
);

    integer          fd;

    // Owned exclusively by the always_ff below. It is a 2-state type, so
    // it already starts at 0 without an explicit initial assignment; a
    // second writer (e.g. an initial block) would violate the always_ff
    // single-writer rule.
    longint unsigned cycle_count;

    // Open the output file and emit the header. A failed open is fatal:
    // a silent sink would make a missing trace look like "no events".
    initial begin
        fd = $fopen(FILENAME, "w");
        if (fd == 0) begin
            $fatal(1, "[trace_sink_stub %0s] cannot open %0s", SINK_LABEL, FILENAME);
        end
        $fdisplay(fd, "# retroDE_ps2 trace, schema v%0d, sink=%0s",
                  SCHEMA_VERSION, SINK_LABEL);
        $fdisplay(fd, "# columns: cycle subsystem event arg0 arg1 arg2 arg3 flags");
    end

    // Cycle counter + line writer. The counter is held at 0 during reset
    // and increments every clock otherwise; events presented during
    // reset are not written. The printed cycle value is the counter's
    // value before this edge's increment.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            cycle_count <= 64'd0;
        end else begin
            cycle_count <= cycle_count + 64'd1;
            if (ev_valid) begin
                if (ev_flags == 32'd0) begin
                    // Zero flags render as a bare '-'.
                    $fdisplay(fd,
                        "%0d %0s %0s 0x%016h 0x%016h 0x%016h 0x%016h -",
                        cycle_count,
                        subsys_str(ev_subsys),
                        event_str(ev_event),
                        ev_arg0, ev_arg1, ev_arg2, ev_arg3);
                end else begin
                    $fdisplay(fd,
                        "%0d %0s %0s 0x%016h 0x%016h 0x%016h 0x%016h 0x%08h",
                        cycle_count,
                        subsys_str(ev_subsys),
                        event_str(ev_event),
                        ev_arg0, ev_arg1, ev_arg2, ev_arg3,
                        ev_flags);
                end
            end
        end
    end

    // Close the file at end of simulation if it was opened.
    final begin
        if (fd != 0) $fclose(fd);
    end

endmodule : trace_sink_stub
+40
View File
@@ -0,0 +1,40 @@
# rtl/dmac
EE DMAC. Matches `docs/contracts/dmac.md`.
## Wave 2 / Wave 2.5 contents
- `dmac_reg_stub.sv` — channel-2-focused register shell + single-transfer
state machine. Wave 2.5 revision is memory-backed: DMAC now issues real
memory reads via the `mem_rd_*` port (connected directly to
`ee_ram_stub` in the current topology; routing through
`ee_memory_map_stub` is deferred). State flow: IDLE → FETCH_WAIT →
ACTIVE_SEND → DONE. MADR is the real fetch source address.
See `docs/wave25_memory_backed_dma_plan.md`.
**EE-core chapter 3** added a CPU write path: the EE memory map's
new `ee_dmac_ch2_wr_*` port drives `reg_wr_en` / `reg_offset` /
`reg_wr_data`, so the EE core can program MADR/QWC/CHCR from a
MIPS bootstrap via `SW`.
**EE-core chapter 4** added a CPU read path (`reg_rd_en` /
`reg_rd_data` / `reg_rd_valid`, 1-cycle latency) plus a DONE_COUNT
monotonic counter at offset 0x40. CHCR/MADR/QWC/TADR read back
their stored values; DONE_COUNT increments each time the state
machine enters S_DONE. The EE map forwards CPU reads in the same
DMAC window through a new `ee_dmac_ch2_rd_*` pair, so software
can now poll CHCR.start or compare DONE_COUNT before/after a
transfer without needing INTC.
## Explicit non-goals (Wave 2 / 2.5)
- Multi-channel arbitration or fairness.
- Chain mode (normal / chain / interleaved transfer modes).
- Stall / ring / suspend semantics.
- Interrupt routing to INTC.
- QWC > 1 multi-beat transfers (state machine is shaped for it; initial
signoff is QWC == 1 per Wave 2.5 plan).
- Routing through `ee_memory_map_stub` (current topology is direct to
`ee_ram_stub`).
Each of these is a future-wave concern, not a stub-plan shortcut.
+355
View File
@@ -0,0 +1,355 @@
// retroDE_ps2 — dmac_reg_stub
//
// EE DMAC stub. Channel-agnostic: the module's behaviour is generic across
// PS2 DMA channels and downstream endpoints. The specific channel and path
// id are set via parameters; the downstream endpoint wires (ep_*) are
// valid/data/last/ready regardless of what consumer is connected. Current
// uses: CHANNEL=2 (GIF path), CHANNEL=5 (SIF0 path).
//
// Payload source: memory-backed via the `mem_rd_*` master port, typically
// routed through `ee_memory_map_stub` to `ee_ram_stub`. MADR is the real
// fetch source address.
//
// Contract refs:
// docs/stub_module_plan.md (Wave 2, item 8)
// docs/wave2_dma_gif_plan.md (Wave 2 scope)
// docs/wave25_memory_backed_dma_plan.md (Wave 2.5 scope — THIS REVISION)
// docs/contracts/dmac.md
//
// Register surface (single channel, selected by CHANNEL parameter):
// offset 0x00 CHCR — start bit at [0], other bits recorded
// offset 0x10 MADR — real fetch source address (Wave 2.5)
// offset 0x20 QWC — transfer length in 128-bit qwords (first sign-off
// path requires QWC == 1; state machine is QWC-
// generic for a future Wave 2.6 extension)
// offset 0x30 TADR — recorded for future chain-mode use
// offset 0x40 DONE_COUNT — monotonic completion counter (read-only;
// writes are accepted but ignored). Software reads
// this to distinguish "nth completion" without
// counting interrupts externally. EE-core chapter 4
// addition; mirrors iop_dmac_reg_stub's DONE_COUNT
// but at a new slot (0x0C is occupied on the IOP
// stub; EE stub's 16-byte register spacing puts
// DONE_COUNT at 0x40).
//
// Register reads (EE-core chapter 4, added alongside the original write
// surface): reg_rd_en / reg_rd_data / reg_rd_valid with 1-cycle latency,
// matching the rest of the stub ecosystem. All four config registers plus
// DONE_COUNT are readable; all other offsets return 0.
//
// Memory master interface (to ee_ram_stub in Wave 2.5):
// mem_rd_en / mem_rd_addr drive the request
// mem_rd_valid / mem_rd_data return data one cycle later
//
// Downstream endpoint: ep_{valid,data,last,ready}. The port names are
// channel-agnostic because the DMAC's behaviour is generic across PS2
// channels (ch2 = GIF, ch5 = SIF0, etc.). Connect the endpoint side to
// whichever consumer matches the instantiated CHANNEL/PATH_ID.
//
// State machine:
// IDLE → FETCH_WAIT on CHCR start
// FETCH_WAIT → ACTIVE_SEND on mem_rd_valid (data latched)
// ACTIVE_SEND → FETCH_WAIT on endpoint accept with more beats pending
// → DONE on endpoint accept for the final beat
// DONE → IDLE next cycle (clears CHCR.start)
//
// Trace payload schemas (per wave25_memory_backed_dma_plan.md):
// DMAC DMA_CFG arg0=channel arg1=chcr arg2=madr arg3=qwc
// flags=reg_offset (which reg was written)
// DMAC DMA_START arg0=channel arg1=qwc arg2=MADR arg3=path_id
// DMAC DMA_BEAT arg0=channel arg1=beat arg2=src_addr arg3=remaining
// DMAC DMA_DONE arg0=channel arg1=beats arg2=completion arg3=path_id
// completion code: 0 = OK
`timescale 1ns/1ps
// Single-channel DMAC register shell plus a memory-backed transfer FSM.
// Software programs CHCR/MADR/QWC/TADR through the register port; a 0->1
// write of CHCR[0] starts a transfer that fetches one 128-bit beat at a
// time through mem_rd_* and hands it to the ep_* endpoint. One trace
// event per cycle is produced on ev_*.
//
// Parameters:
//   CHANNEL - channel number reported in trace arg0
//   PATH_ID - path id reported in trace arg3 of DMA_START / DMA_DONE
module dmac_reg_stub
    import trace_pkg::*;
#(
    parameter logic [3:0] CHANNEL = 4'd2,
    parameter logic [3:0] PATH_ID = 4'd2
) (
    input  logic         clk,
    input  logic         rst_n,

    // CPU / testbench register write port (single-channel, see CHANNEL).
    // reg_offset is shared by read and write; callers must not assert both
    // enables in the same cycle (the map ensures this because the EE CPU
    // emits either rd or wr per transaction, never both).
    input  logic         reg_wr_en,
    input  logic [7:0]   reg_offset,
    input  logic [31:0]  reg_wr_data,

    // Register read port (EE-core chapter 4). 1-cycle latency.
    input  logic         reg_rd_en,
    output logic [31:0]  reg_rd_data,
    output logic         reg_rd_valid,

    // Memory master read port (Wave 2.5). mem_rd_en pulses for one cycle
    // with mem_rd_addr valid; the FSM then waits in S_FETCH_WAIT until
    // mem_rd_valid returns the 128-bit beat on mem_rd_data.
    output logic         mem_rd_en,
    output logic [31:0]  mem_rd_addr,
    input  logic [127:0] mem_rd_data,
    input  logic         mem_rd_valid,

    // Downstream endpoint (valid/data/last/ready handshake). The consumer
    // is whatever matches the instantiated CHANNEL / PATH_ID.
    output logic         ep_valid,
    output logic [127:0] ep_data,
    output logic         ep_last,
    input  logic         ep_ready,

    // Completion pulse — one cycle high when the transfer reaches S_DONE.
    // Intended as an INTC source; level-held bit latching happens in the
    // interrupt controller, not here.
    output logic         irq_completion_o,

    // Trace
    output logic         ev_valid,
    output subsys_e      ev_subsys,
    output event_e       ev_event,
    output logic [63:0]  ev_arg0,
    output logic [63:0]  ev_arg1,
    output logic [63:0]  ev_arg2,
    output logic [63:0]  ev_arg3,
    output logic [31:0]  ev_flags
);

    localparam logic [7:0] CHCR_OFFSET       = 8'h00;
    localparam logic [7:0] MADR_OFFSET       = 8'h10;
    localparam logic [7:0] QWC_OFFSET        = 8'h20;
    localparam logic [7:0] TADR_OFFSET       = 8'h30;
    localparam logic [7:0] DONE_COUNT_OFFSET = 8'h40;

    // ------------------------------------------------------------------
    // Transfer FSM state. Declared ahead of the register file because
    // the register logic below references `state` / S_DONE, and
    // SystemVerilog requires declaration before use.
    // ------------------------------------------------------------------
    typedef enum logic [1:0] {
        S_IDLE        = 2'd0,
        S_FETCH_WAIT  = 2'd1,
        S_ACTIVE_SEND = 2'd2,
        S_DONE        = 2'd3
    } state_e;

    state_e state;

    // ------------------------------------------------------------------
    // Register file (the single channel selected by CHANNEL)
    // ------------------------------------------------------------------
    logic [31:0] chcr;
    logic [31:0] madr;
    logic [31:0] qwc;
    logic [31:0] tadr;
    logic [31:0] done_count;

    // A transfer starts on a CHCR write that sets bit 0 while the stored
    // bit 0 is still clear (0 -> 1 edge as seen by software).
    logic start_pulse;
    assign start_pulse = reg_wr_en && (reg_offset == CHCR_OFFSET) &&
                         reg_wr_data[0] && !chcr[0];

    // Single owner for the config regs: software writes win over the
    // S_DONE auto-clear on CHCR[0] in the unlikely same-cycle case
    // (the NBA queue lets the case-statement full-width assign
    // override the partial bit-0 clear). Software writing CHCR while
    // the DMA is completing is not part of any sane flow, so this
    // ordering is defensive — the point is: chcr has one procedural
    // driver, not two.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            chcr <= 32'd0;
            madr <= 32'd0;
            qwc  <= 32'd0;
            tadr <= 32'd0;
        end else begin
            if (state == S_DONE) chcr[0] <= 1'b0;
            if (reg_wr_en) begin
                case (reg_offset)
                    CHCR_OFFSET: chcr <= reg_wr_data;
                    MADR_OFFSET: madr <= reg_wr_data;
                    QWC_OFFSET:  qwc  <= reg_wr_data;
                    TADR_OFFSET: tadr <= reg_wr_data;
                    default: ;  // includes DONE_COUNT: write ignored
                endcase
            end
        end
    end

    // DONE_COUNT: monotonic completion counter. Increments on each cycle
    // spent in S_DONE (the FSM stays there exactly one cycle). Reset-only
    // clear path; writes at the DONE_COUNT offset are silently dropped by
    // the write always_ff above (read-only register).
    always_ff @(posedge clk) begin
        if (!rst_n)               done_count <= 32'd0;
        else if (state == S_DONE) done_count <= done_count + 32'd1;
    end

    // Register read (1-cycle latency, matches rest of stub ecosystem).
    // reg_rd_data holds its previous value when reg_rd_en is low.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reg_rd_data  <= 32'd0;
            reg_rd_valid <= 1'b0;
        end else begin
            reg_rd_valid <= reg_rd_en;
            if (reg_rd_en) begin
                case (reg_offset)
                    CHCR_OFFSET:       reg_rd_data <= chcr;
                    MADR_OFFSET:       reg_rd_data <= madr;
                    QWC_OFFSET:        reg_rd_data <= qwc;
                    TADR_OFFSET:       reg_rd_data <= tadr;
                    DONE_COUNT_OFFSET: reg_rd_data <= done_count;
                    default:           reg_rd_data <= 32'd0;
                endcase
            end
        end
    end

    // ------------------------------------------------------------------
    // Transfer datapath
    // ------------------------------------------------------------------
    logic [31:0]  madr_latched;   // MADR snapshot taken at start
    logic [31:0]  qwc_latched;    // QWC snapshot taken at start
    logic [31:0]  beat_index;     // index of the beat currently in flight
    logic [127:0] beat_payload;   // data latched from mem_rd_data

    logic [31:0] src_addr;
    assign src_addr = madr_latched + (beat_index << 4);  // beat * 16 bytes

    logic beat_accepted;
    assign beat_accepted = ep_valid && ep_ready;

    // Pulse mem_rd_en for one cycle whenever we first enter FETCH_WAIT.
    logic prev_state_fw;
    always_ff @(posedge clk) begin
        if (!rst_n) prev_state_fw <= 1'b0;
        else        prev_state_fw <= (state == S_FETCH_WAIT);
    end

    logic entering_fw;
    assign entering_fw = (state == S_FETCH_WAIT) && !prev_state_fw;

    assign mem_rd_en   = entering_fw;
    assign mem_rd_addr = src_addr;

    // Drive endpoint only in ACTIVE_SEND with the latched payload.
    assign ep_valid = (state == S_ACTIVE_SEND);
    assign ep_data  = beat_payload;
    assign ep_last  = (state == S_ACTIVE_SEND) &&
                      (beat_index + 32'd1 == qwc_latched);

    assign irq_completion_o = (state == S_DONE);

    // ------------------------------------------------------------------
    // Transfer state machine
    //
    // NOTE(review): a start with QWC == 0 is not special-cased. The
    // final-beat compare (beat_index + 1 == qwc_latched) does not match
    // on the first beat in that case, so the transfer only ends after
    // beat_index wraps. Confirm callers never start with QWC == 0.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state        <= S_IDLE;
            madr_latched <= 32'd0;
            qwc_latched  <= 32'd0;
            beat_index   <= 32'd0;
            beat_payload <= 128'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    if (start_pulse) begin
                        // start_pulse is gated by reg_wr_en && reg_offset ==
                        // CHCR_OFFSET, so a same-cycle QWC write is
                        // structurally impossible through this interface.
                        // Latch the currently-visible register state.
                        state        <= S_FETCH_WAIT;
                        madr_latched <= madr;
                        qwc_latched  <= qwc;
                        beat_index   <= 32'd0;
                    end
                end
                S_FETCH_WAIT: begin
                    if (mem_rd_valid) begin
                        beat_payload <= mem_rd_data;
                        state        <= S_ACTIVE_SEND;
                    end
                end
                S_ACTIVE_SEND: begin
                    if (beat_accepted) begin
                        if (beat_index + 32'd1 == qwc_latched) begin
                            state <= S_DONE;
                        end else begin
                            beat_index <= beat_index + 32'd1;
                            state      <= S_FETCH_WAIT;
                        end
                    end
                end
                S_DONE: begin
                    state <= S_IDLE;
                    // chcr[0] auto-clear on S_DONE lives in the
                    // register-ownership always_ff above (single
                    // procedural driver for chcr).
                end
                default: state <= S_IDLE;
            endcase
        end
    end

    // ------------------------------------------------------------------
    // Trace emission — one event per cycle; priority:
    //   DONE pulse > BEAT accept > START on transition > CFG on write
    // ------------------------------------------------------------------
    logic prev_state_fetch_or_later;
    always_ff @(posedge clk) begin
        if (!rst_n) prev_state_fetch_or_later <= 1'b0;
        else        prev_state_fetch_or_later <= (state != S_IDLE);
    end

    logic enter_start;  // first cycle in FETCH_WAIT after leaving IDLE
    assign enter_start = (state == S_FETCH_WAIT) && !prev_state_fetch_or_later;

    logic enter_done;
    assign enter_done = (state == S_DONE);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_CFG;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (enter_done) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_DONE;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, beat_index + 32'd1};  // beats completed
            ev_arg2   <= 64'd0;                         // completion: OK
            ev_arg3   <= {60'd0, PATH_ID};
            ev_flags  <= 32'd0;
        end else if (beat_accepted) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_BEAT;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, beat_index};
            ev_arg2   <= {32'd0, src_addr};             // this beat's source
            ev_arg3   <= {32'd0, qwc_latched - beat_index - 32'd1};
            ev_flags  <= 32'd0;
        end else if (enter_start) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_START;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, qwc_latched};
            ev_arg2   <= {32'd0, madr_latched};         // MADR is the source
            ev_arg3   <= {60'd0, PATH_ID};
            ev_flags  <= 32'd0;
        end else if (reg_wr_en) begin
            // CFG reports the post-write view: the register being written
            // shows the incoming data, the others show their stored value.
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_CFG;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, (reg_offset == CHCR_OFFSET) ? reg_wr_data : chcr};
            ev_arg2   <= {32'd0, (reg_offset == MADR_OFFSET) ? reg_wr_data : madr};
            ev_arg3   <= {32'd0, (reg_offset == QWC_OFFSET)  ? reg_wr_data : qwc};
            ev_flags  <= {24'd0, reg_offset};
        end else begin
            ev_valid  <= 1'b0;
        end
    end

endmodule : dmac_reg_stub
+177
View File
@@ -0,0 +1,177 @@
// retroDE_ps2 — ee_dmac_ctrl_stub
//
// Ch287 — EE DMAC global control/status registers at
// 0x1000_E000..0x1000_E0FF (256 bytes). NOT the per-channel registers
// (those live in dmac_reg_stub at 0x1000_A000+ for channel 2; per-
// channel registers for other channels are not modelled yet).
//
// Surface modelled here (R5900 DMAC global):
//   offset 0x00  D_CTRL — DMAC enable / cycle-stealing / RELE / etc.
//                         Latched write, read returns last-written.
//   offset 0x10  D_STAT — Per-channel interrupt status (CIS) + per-
//                         channel interrupt mask (CIM) + stall / MEIS.
//                         Read returns the current latch (reset = 0 =
//                         no pending interrupts, all masks clear).
//                         Writes are split by half-word:
//                           bits[15:0]  status half — write-1-to-clear:
//                                       a 1 clears the bit, a 0 leaves it.
//                           bits[31:16] mask half — write-1-to-reverse:
//                                       a 1 toggles the bit, a 0 leaves it.
//                         The reverse-on-1 mask behaviour matters for the
//                         common "D_STAT = D_STAT" idiom, which clears
//                         pending status AND turns every enabled mask
//                         off in one store.
//                         With nothing in the stub yet setting status
//                         bits, qbert sees "no interrupts pending" on
//                         every read, which is exactly the wait-for-quiet
//                         pattern its init loop polls for.
//   offset 0x20  D_PCR  — Per-channel priority + W1C enables. Latched
//                         write, read returns last-written.
//   offset 0x30  D_SQWC — Stall/skip cycles. Latched.
//   offset 0x40  D_RBSR — Ring-buffer size. Latched.
//   offset 0x50  D_RBOR — Ring-buffer base. Latched.
//   any other offset    — write traced + dropped; read returns 0.
//
// Codex framing: "If the hot PC is truly a D_STAT poll, read-as-zero
// may or may not be the right 'ready' value. Let the next run tell us.
// If it still loops, the next chapter should decode the branch
// condition and choose the exact D_STAT bit semantics, not guess the
// whole region." The implementation honors that — every offset has
// minimal-sufficient behavior; future chapters can refine specific
// bits once a real ELF surfaces a divergence.
//
// Port interface mirrors the dmac_reg_stub / intc_stub conventions:
//   reg_wr_en / reg_offset / reg_wr_data                 : write port
//   reg_rd_en / reg_offset / reg_rd_data / reg_rd_valid  : read port,
//                                                          1-cycle latency
//   trace_pkg::* : ev_* events tagged SUBSYS_DMAC + EV_READ/EV_WRITE.
//                  Writes carry arg0 = offset, arg1 = data, flags = 1;
//                  reads carry arg0 = offset only (the read data is not
//                  available until the following cycle).
`timescale 1ns/1ps
module ee_dmac_ctrl_stub
    import trace_pkg::*;
(
    input  logic        clk,
    input  logic        rst_n,

    // Write port (single-cycle, shared offset with read).
    input  logic        reg_wr_en,
    input  logic [7:0]  reg_offset,
    input  logic [31:0] reg_wr_data,

    // Read port (1-cycle latency).
    input  logic        reg_rd_en,
    output logic [31:0] reg_rd_data,
    output logic        reg_rd_valid,

    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    localparam logic [7:0] D_CTRL_OFFSET = 8'h00;
    localparam logic [7:0] D_STAT_OFFSET = 8'h10;
    localparam logic [7:0] D_PCR_OFFSET  = 8'h20;
    localparam logic [7:0] D_SQWC_OFFSET = 8'h30;
    localparam logic [7:0] D_RBSR_OFFSET = 8'h40;
    localparam logic [7:0] D_RBOR_OFFSET = 8'h50;

    // ------------------------------------------------------------------
    // Register file
    // ------------------------------------------------------------------
    logic [31:0] d_ctrl;
    logic [31:0] d_stat;  // low half: status (W1C); high half: mask (toggle-on-1)
    logic [31:0] d_pcr;
    logic [31:0] d_sqwc;
    logic [31:0] d_rbsr;
    logic [31:0] d_rbor;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            d_ctrl <= 32'd0;
            d_stat <= 32'd0;
            d_pcr  <= 32'd0;
            d_sqwc <= 32'd0;
            d_rbsr <= 32'd0;
            d_rbor <= 32'd0;
        end else if (reg_wr_en) begin
            unique case (reg_offset)
                D_CTRL_OFFSET: d_ctrl <= reg_wr_data;
                D_STAT_OFFSET: begin
                    // Status half: write-1-to-clear.
                    d_stat[15:0]  <= d_stat[15:0] & ~reg_wr_data[15:0];
                    // Mask half: write-1-to-reverse. A plain overwrite
                    // here would leave masks enabled after software
                    // writes back the value it just read.
                    d_stat[31:16] <= d_stat[31:16] ^ reg_wr_data[31:16];
                end
                D_PCR_OFFSET:  d_pcr  <= reg_wr_data;
                D_SQWC_OFFSET: d_sqwc <= reg_wr_data;
                D_RBSR_OFFSET: d_rbsr <= reg_wr_data;
                D_RBOR_OFFSET: d_rbor <= reg_wr_data;
                default: ;  // unknown offsets: write dropped (traced)
            endcase
        end
    end

    // Read mux (1-cycle latency to match the stub ecosystem).
    // reg_rd_data holds its previous value when reg_rd_en is low.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reg_rd_data  <= 32'd0;
            reg_rd_valid <= 1'b0;
        end else begin
            reg_rd_valid <= reg_rd_en;
            if (reg_rd_en) begin
                unique case (reg_offset)
                    D_CTRL_OFFSET: reg_rd_data <= d_ctrl;
                    D_STAT_OFFSET: reg_rd_data <= d_stat;
                    D_PCR_OFFSET:  reg_rd_data <= d_pcr;
                    D_SQWC_OFFSET: reg_rd_data <= d_sqwc;
                    D_RBSR_OFFSET: reg_rd_data <= d_rbsr;
                    D_RBOR_OFFSET: reg_rd_data <= d_rbor;
                    default:       reg_rd_data <= 32'd0;
                endcase
            end
        end
    end

    // ------------------------------------------------------------------
    // Trace — one event per cycle, write priority over read (consistent
    // with the rest of the stub ecosystem).
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_WRITE;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (reg_wr_en) begin
            // arg1 is the raw write data, not the resulting register value.
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_WRITE;
            ev_arg0   <= {56'd0, reg_offset};
            ev_arg1   <= {32'd0, reg_wr_data};
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'h0000_0001;  // write
        end else if (reg_rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_READ;
            ev_arg0   <= {56'd0, reg_offset};
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else begin
            ev_valid  <= 1'b0;
        end
    end

endmodule : ee_dmac_ctrl_stub
+181
View File
@@ -0,0 +1,181 @@
// retroDE_ps2 — ee_dmac_passive_chan_stub
//
// Ch288 — Lightweight per-channel register surface for the EE DMAC
// channels NOT covered by a dedicated transfer-FSM stub. Hosts the
// four standard per-channel registers (CHCR/MADR/QWC/TADR) for each
// covered channel; reset to zero, writes latch, reads return the
// latched value. NO transfer FSM, NO start-bit side effects, NO
// D_STAT interaction. This is the "init-time channel-clear / quiet
// register surface" Codex framed for Ch288.
//
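// Channels covered (4 KiB window each, starting at 0x1000_8000):
// ch0 (VIF0) 0x1000_8000-0x1000_8FFF
// ch1 (VIF1) 0x1000_9000-0x1000_9FFF
// ch3 (IPU_FROM) 0x1000_B000-0x1000_BFFF
// ch4 (IPU_TO) 0x1000_C000-0x1000_CFFF ← qbert's first hit
// ch5 (SIF0) 0x1000_D000-0x1000_DFFF
//
// NOTE(review): the ch4/ch5 labels above look inconsistent with public
// EE DMAC register maps (e.g. PCSX2), which place IPU_TO (ch4) at
// 0x1000_B400, SIF0/SIF1/SIF2 (ch5/6/7) at 0x1000_C000/C400/C800 and
// SPR_FROM/SPR_TO (ch8/9) at 0x1000_D000/D400. If that map is right,
// the 0x1000_C000 hit is SIF0 and 0x1000_D000 is SPR_FROM, and several
// channels sit at +0x400 sub-offsets this stub treats as unknown
// registers. Confirm before relying on these channel names in traces.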
//
// SKIPPED:
// ch2 (GIF) 0x1000_A000-0x1000_AFFF — already routed
// externally to dmac_reg_stub via the map's
// ee_dmac_ch2_* ports. Do NOT shadow it here.
//
// Channel index extracted from chan_addr[15:12]:
// 0x8 → ch0, 0x9 → ch1, 0xB → ch3, 0xC → ch4, 0xD → ch5
// (0xA / ch2 is filtered by the caller; if chan_addr[15:12]==0xA
// arrives here the module silently drops it.)
//
// Register offsets (chan_addr[11:0], matches dmac_reg_stub layout):
// 0x00 CHCR — control (start bit at [0]); latched, no FSM
// 0x10 MADR — main address
// 0x20 QWC — quadword count
// 0x30 TADR — tag address
// any other offset: read = 0, write dropped + traced
`timescale 1ns/1ps
module ee_dmac_passive_chan_stub
import trace_pkg::*;
(
input logic clk,
input logic rst_n,
// Write port. chan_addr is the 16-bit offset into the entire
// 0x1000_8000-base window: chan_addr[15:12] = channel selector,
// chan_addr[11:0] = register offset within that channel.
input logic reg_wr_en,
input logic [15:0] chan_addr,
input logic [31:0] reg_wr_data,
// Read port (1-cycle latency).
input logic reg_rd_en,
output logic [31:0] reg_rd_data,
output logic reg_rd_valid,
// Trace
output logic ev_valid,
output subsys_e ev_subsys,
output event_e ev_event,
output logic [63:0] ev_arg0,
output logic [63:0] ev_arg1,
output logic [63:0] ev_arg2,
output logic [63:0] ev_arg3,
output logic [31:0] ev_flags
);
localparam logic [11:0] CHCR_OFFSET = 12'h000;
localparam logic [11:0] MADR_OFFSET = 12'h010;
localparam logic [11:0] QWC_OFFSET = 12'h020;
localparam logic [11:0] TADR_OFFSET = 12'h030;
// Channel index from the high nibble of chan_addr. Out-of-range
// nibbles (anything outside 0x8/0x9/0xB/0xC/0xD) get
// chan_valid=0 and the access is dropped.
logic [3:0] chan_nibble;
logic [2:0] chan_idx; // 0..4 packed: 0=ch0, 1=ch1, 2=ch3, 3=ch4, 4=ch5
logic chan_valid;
always_comb begin
chan_nibble = chan_addr[15:12];
chan_idx = 3'd0;
chan_valid = 1'b0;
unique case (chan_nibble)
4'h8: begin chan_idx = 3'd0; chan_valid = 1'b1; end // ch0
4'h9: begin chan_idx = 3'd1; chan_valid = 1'b1; end // ch1
4'hB: begin chan_idx = 3'd2; chan_valid = 1'b1; end // ch3
4'hC: begin chan_idx = 3'd3; chan_valid = 1'b1; end // ch4
4'hD: begin chan_idx = 3'd4; chan_valid = 1'b1; end // ch5
default: ;
endcase
end
logic [11:0] reg_offset;
assign reg_offset = chan_addr[11:0];
// ------------------------------------------------------------------
// Register file: 5 channels × 4 registers
// ------------------------------------------------------------------
logic [31:0] chcr [0:4];
logic [31:0] madr [0:4];
logic [31:0] qwc [0:4];
logic [31:0] tadr [0:4];
always_ff @(posedge clk) begin
if (!rst_n) begin
for (int i = 0; i < 5; i++) begin
chcr[i] <= 32'd0;
madr[i] <= 32'd0;
qwc[i] <= 32'd0;
tadr[i] <= 32'd0;
end
end else if (reg_wr_en && chan_valid) begin
unique case (reg_offset)
CHCR_OFFSET: chcr[chan_idx] <= reg_wr_data;
MADR_OFFSET: madr[chan_idx] <= reg_wr_data;
QWC_OFFSET: qwc[chan_idx] <= reg_wr_data;
TADR_OFFSET: tadr[chan_idx] <= reg_wr_data;
default: ;
endcase
end
end
// Read mux (1-cycle latency). Returns 0 for invalid channel /
// unknown offset.
always_ff @(posedge clk) begin
if (!rst_n) begin
reg_rd_data <= 32'd0;
reg_rd_valid <= 1'b0;
end else begin
reg_rd_valid <= reg_rd_en;
if (reg_rd_en && chan_valid) begin
unique case (reg_offset)
CHCR_OFFSET: reg_rd_data <= chcr[chan_idx];
MADR_OFFSET: reg_rd_data <= madr[chan_idx];
QWC_OFFSET: reg_rd_data <= qwc[chan_idx];
TADR_OFFSET: reg_rd_data <= tadr[chan_idx];
default: reg_rd_data <= 32'd0;
endcase
end else if (reg_rd_en) begin
reg_rd_data <= 32'd0; // invalid channel
end
end
end
// ------------------------------------------------------------------
// Trace — write priority over read; tagged SUBSYS_DMAC with
// arg0 = chan_nibble (0x8/0x9/0xB/0xC/0xD = phys channel), arg1
// = data, arg2 = reg_offset, arg3 = chan_idx (packed 0..4).
// ------------------------------------------------------------------
// Trace emitter. One registered event per access strobe; when both
// strobes are high in the same cycle the write is the one reported.
// The payload is identical for reads and writes except for the event
// code and arg1 (write data on writes, zero on reads). ev_flags bit 0
// carries chan_valid so undecoded accesses are still visible in the
// trace. With no strobe only ev_valid is cleared; the other fields
// keep their previous values.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        ev_valid  <= 1'b0;
        ev_subsys <= SUBSYS_DMAC;
        ev_event  <= EV_WRITE;
        ev_arg0   <= 64'd0;
        ev_arg1   <= 64'd0;
        ev_arg2   <= 64'd0;
        ev_arg3   <= 64'd0;
        ev_flags  <= 32'd0;
    end else begin
        ev_valid <= 1'b0;
        if (reg_wr_en || reg_rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_arg0   <= {60'd0, chan_nibble};
            ev_arg2   <= {52'd0, reg_offset};
            ev_arg3   <= {61'd0, chan_idx};
            ev_flags  <= {31'd0, chan_valid};
            if (reg_wr_en) begin
                // Write has priority over a same-cycle read.
                ev_event <= EV_WRITE;
                ev_arg1  <= {32'd0, reg_wr_data};
            end else begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
            end
        end
    end
end
endmodule : ee_dmac_passive_chan_stub
+222
View File
@@ -0,0 +1,222 @@
# rtl/ee
Emotion Engine-side RTL. Matches `docs/contracts/ee.md`.
## Current contents
- `ee_fetch_stub.sv` — minimal sequential fetcher from the early waves.
On reset, PC = BIOS reset vector (0xBFC00000). Each cycle while
`enable` is high, issues a read at PC and advances PC += 4. No
decode, no branches, no exceptions. Emits `EV_RESET` once at reset
exit and `EV_IFETCH` for each returned response. Retained for the
Milestone-B golden-reference comparison.
- `ee_core_stub.sv` — **first real EE instruction-decoding core.**
Structural mirror of `iop_core_stub`: same multi-cycle FSM, same
R3000 subset (LUI/ORI/ADDIU/LW/SW/BEQ/BNE/J/JR/NOP/SYSCALL/MFC0/MTC0/
RFE), same branch-delay-slot discipline, same minimal COP0 +
exception entry, same `STRICT_UNSUPPORTED` trap gate. Separate file
from the IOP core because the EE is fundamentally an R5900 and will
eventually need 64-bit registers, COP1/COP2, and VU-side plumbing that
the IOP will never grow. Emits traces under `SUBSYS_EE` (vs.
`SUBSYS_IOP` for the IOP core).
## Current status
The EE side has a first real execution primitive (`ee_core_stub`) and
runs hand-assembled bootstraps from the shared BIOS ROM window. The
IOP side is ahead — it has DMAC ch9 data path, real interrupt
exception entry, BIOS reset, and strict-mode BIOS smoke bring-up. The
EE side's next natural growth (in roughly this order) is:
1. ~~CPU-side LW/SW to EE RAM.~~ **Done** (`tb_ee_core_memops`). EE
memory map now routes CPU 32-bit reads and writes into the 128-bit
`ee_ram_stub` with lane-select on reads and byte-enable masking on
writes. CPU wins over DMAC on same-cycle RAM-read collisions and
over the SIF egress bridge on RAM-write collisions.
2. ~~EE DMAC register access from the core.~~ **Done**
(`tb_ee_core_dmac`, `tb_ee_core_dmac_poll`). Chapter 3 added the
write-side: EE map decodes a CPU write at `phys[28:12] ==
17'h1_000A` (0x1000_A000-0x1000_AFFF, ch2 GIF) and routes it
through a new `ee_dmac_ch2_wr_*` port into `dmac_reg_stub`. The
EE core programs MADR/QWC/CHCR via SW; the DMAC fetches from EE
RAM through the map's `dmac_rd_*` port and completes with real
DMA_START/BEAT/DONE events. Chapter 4 added the read-side:
`dmac_reg_stub` grew a `reg_rd_*` surface (CHCR/MADR/QWC/TADR +
DONE_COUNT monotonic counter at 0x40), and the EE map forwards
CPU reads in the same DMAC window via a new `ee_dmac_ch2_rd_*`
port. The core polls CHCR.start until the DMAC clears it, then
reads DONE_COUNT and writes the witness to RAM — no more fixed
NOP padding.
3. ~~EE INTC + exception entry.~~ **Done** (`tb_ee_core_dmac_intc`).
EE map now decodes the EE INTC register window at `phys[28:12] ==
17'h1_000F` (0x1000_F000/0x1000_F010 for STAT/MASK) and carries
both directions through new `ee_intc_{wr,rd}_*` ports. An
`intc_stub` instance on the EE side latches
`dmac_reg_stub.irq_completion_o` and drives `ee_core_stub.cpu_irq`
(which feeds `cause_ip[2]`). Bootstrap enables interrupts
(Status = IEc | IM[2]), programs INTC_MASK, kicks the DMAC, and
waits on DONE_COUNT; a RAM-resident ISR at `EXC_VECTOR=0x80` acks
INTC_STAT via W1C, reads EPC with MFC0, and returns with JR + RFE.
Core takes exactly one
exception + one RFE, strictly after DMA_DONE.
4. ~~EE-side strict BIOS smoke.~~ **Done** (`tb_ee_core_bios_smoke`).
EE mirror of the IOP smoke harness: `ee_core_stub` instantiated
with `STRICT_UNSUPPORTED=1'b1`; synthetic CI bootstrap ends in an
opcode the core doesn't decode (originally `AND`, SPECIAL func 0x24;
now `BREAK`, SPECIAL func 0x0D — see the bench-drift note below), so
`trap_o`/`trap_pc_o`/`trap_instr_o` fire and halt the core loudly.
Swap in a real BIOS via `make tb_ee_core_bios_smoke
BIOS=/path/to/bios.hex` (plusarg-driven `$readmemh` into
`u_bios.mem`, same convention as the IOP target). Output line
includes an inline mnemonic decoder so the iteration loop (drop
in BIOS, read output, add the missing opcode) works without a
separate disassembler.
5. **Widen the core opcode set, driven by real-BIOS smoke.** The
iteration loop is live: drop a BIOS dump in via
`make tb_ee_core_bios_smoke BIOS=...`, read `trap_instr` +
`mnemonic` from the output, implement the op, re-run. Progress
so far (each step landed a dedicated coverage TB and kept
full_checks green):
- **SLTI / SLTIU** (I-type compare, opcodes 0x0A / 0x0B). First
real-BIOS trip at 0xBFC0_0008. TB: `tb_ee_core_slti`.
- **ADDI** (opcode 0x08). Implemented as ADDIU (no overflow
trap — real BIOS doesn't emit ADDI where overflow could
actually happen). TB: `tb_ee_core_addi`.
- **ANDI** (opcode 0x0C, zero-extended). TB: `tb_ee_core_andi`.
- **AND / OR / XOR / NOR** (SPECIAL R-type logic family, func
0x24-0x27; destination = rd). Batched because they share the
R-type ALU plumbing. TB: `tb_ee_core_rtype_logic`.
- **SB** (opcode 0x28, byte store with lane broadcast +
one-hot byte-enable on the map write bus). TB:
`tb_ee_core_sb`. Unlocked a 1500-instruction stretch
(retired=180 → 1704).
- **LB** (opcode 0x20, sign-extended byte load via
`map_rd_data` lane extraction + 24-bit sign-extend in
`S_MEM_WAIT`). TB: `tb_ee_core_lb`.
- **JAL** (opcode 0x03, jump-and-link; writes `$31 = pc+8`).
TB: `tb_ee_core_jal`.
- **ADDU / SUBU** (SPECIAL R-type arith, func 0x21 / 0x23).
Batched, share R-type ALU. TB: `tb_ee_core_rtype_addu`.
Codex pre-approved the grouping.
- **SLT / SLTU** (SPECIAL R-type compare, func 0x2A / 0x2B).
Batched with the R-type ALU; register-form pair of
SLTI/SLTIU. TB: `tb_ee_core_slt`. Unlocked a 5700-
instruction stretch (retired=1717 → 7385).
- **LH / LHU** (opcodes 0x21 / 0x25, halfword load with sign-
and zero-extension respectively). Batched — same lane-
extraction plumbing, differ only in fill semantics. Halfword
addressing uses `ea[1]` (ea[0] must be zero for aligned
access). TBs: `tb_ee_core_lh`, `tb_ee_core_lhu` (each
covers both halfword lanes + the fill discipline for
negative high-lane values). Unlocked retired=7385 → 8207.
- **SLL / SRL / SRA** (SPECIAL R-type shifts, func 0x00 /
0x02 / 0x03). Batched per Codex pre-approval. Destination
= rd, operand = rt, shift amount = `shamt` (bits [10:6]).
SRA uses `$signed(rt_val) >>> shamt` for arithmetic right
shift (sign fill); SRL uses `rt_val >> shamt` (zero fill).
SLL $0,$0,0 is the canonical NOP encoding and flows through
this path harmlessly — the rd_idx=0 writeback guard blocks
any phantom write. TB: `tb_ee_core_shift` (critical probes:
SRL vs SRA on the same negative input to catch sign-vs-zero
fill bugs). Unlocked a **12,000-instruction stretch**
(retired=8207 → 20327).
- **SH** (opcode 0x29, halfword store). Store-side mate to
LH/LHU; same lane-broadcast + byte-enable idiom as SB but
at halfword granularity via `ea[1]`. 2-of-4 byte-enable
(`4'b0011` for low lane, `4'b1100` for high lane) preserves
the non-addressed halfword. TB: `tb_ee_core_sh` — two
chained probes with register values that have distinctive
upper halves (0xCAFE_FACE, 0x1234_5678). If the byte-enable
is wrong or the full register leaks into the map_wr_data
bus, the preservation check catches it (RAM word ends up
0x5678_FACE after both stores; wrong behavior would corrupt
the non-addressed halfword). Unlocked a **56,000-
instruction stretch** (retired=20327 → 76406) once the
RAM-size infra issue was also fixed in the same chapter
— see next bullet.
- **Real-BIOS RAM size (chapter 7.9 infra fix).** Before this
chapter, `tb_ee_core_bios_smoke` used only 4 KiB of EE RAM
— fine for the synthetic CI program (which never writes
beyond the first qword), but destructive once the real
BIOS copies a large chunk of itself into RAM and jumps
there. Addresses beyond 4 KiB silently aliased into the
same window, producing 156k "retires" that were actually
the core executing a scrambled mix of overwritten bytes,
with no trap ever firing because whatever happened to land
at the aliased offset decoded to something supported.
Bumped `EE_RAM_BYTES` in the bench to 4 MiB (real PS2 has
32 MiB; 4 MiB covers BIOS init comfortably without
ballooning sim memory). After the fix, real-BIOS smoke
runs honestly and trapped on JALR at 0xBFC5_29E8.
- **JALR** (SPECIAL func 0x09, register-indirect call). Target
is `rs_val` (same path as JR); link address pc+8 is written
to `rd_idx`. Unlike JAL's hardcoded `$31`, JALR's link
destination is explicit in the instruction, and `rd==0` is
a valid encoding that suppresses the link write. TB:
`tb_ee_core_jalr` — two probes: canonical `jalr $31, $rs`
(what the BIOS used) plus `jalr $20, $rs` with the return
via `jr $20` to prove the rd field is honored and not
accidentally hardcoded to $31. Unlocked retired=76406 →
84112 and the BIOS fully jumped into RAM-resident code
(next trap_pc is `0x0000_060C`, a RAM address, not BIOS).
- **ADD / SUB** (SPECIAL R-type, func 0x20 / 0x22). Batched
per Codex's guidance — same pragmatic policy as ADDI vs
ADDIU: this core does not model the Arithmetic Overflow
exception, so ADD behaves as ADDU and SUB behaves as SUBU.
Merged into the existing `rs_val + rt_val` / `rs_val - rt_val`
arms of `rtype_alu_wb`. TB: `tb_ee_core_add_sub` — four
probes including INT_MAX+1 wrap, which documents the
deferred-exception policy (the wrap is the *expected*
outcome, so the TB will fail loudly if overflow trapping
ever lands without the TB being updated).
- **COP0 Count (reg 9)** — first machine-state chapter after
the iter-14 transition. Free-running 32-bit counter that
increments every clock and resets to 0. Exposed read-only
through MFC0 $9. MTC0 $9 silently dropped (no reset-to-value
yet; revisit if BIOS depends on it). TB:
`tb_ee_core_cop0_count` — two probes covering consecutive-
MFC0 advance and a canonical `while (now < target)` poll
that must exit.
- **Enhanced bios_smoke PC sampler** with `peek_instr(addr)`
helper (hierarchical read through `u_bios.mem` / `u_ee_ram.mem`)
and a parallel `retired_history` array. Timeout now reports
the instruction and retired count at each sample, not just
pc. Timeout window bumped 5 ms → 20 ms for BIOS runway.
- **Sampler pointer snapshots + 80 ms timeout.** After the
instruction-aware sampler showed the loop was a linked-list
walk (not a hardware wait), Codex directed "extend timeout
first, then add pointer snapshots only if still stuck".
Timeout bumped 20 ms → 80 ms: retired grew linearly to
2.46 M, still 100% in the same loop (≈350k iterations — way
beyond any plausible BIOS list length). Added `u_core.regfile[5]`
and `[6]` hierarchical snapshots at each sample. Finding:
- `$5` (sentinel) = `0x00000974` — plausible low-RAM pointer
- `$6` (current) = `0xDEADBEEF` — **the EE map's unmapped-
read poison value**.
The cycle is self-perpetuating: `lw $2, 0($6)` with
`$6 = 0xDEADBEEF` reads address 0xDEADBEEF, which is
unmapped, returning 0xDEADBEEF; the `bne $2, $0` stays
taken forever. The real root cause is an **earlier** BIOS
read from an unmapped address that poisoned a data structure
— the traversal followed the poisoned pointer and locked in.
- *(next-move call is with Codex: add an unmapped-read tracer
to find the first bad address, implement whatever peripheral
the BIOS was reading, change the poison value to 0 so the
loop exits and exposes further BIOS progress, or something
else.)*
- **Bench-drift note (chapter 7.5):** the synthetic BIOS smoke
sentinel was originally AND; once AND was added to the
R-type ALU, the synthetic test silently stopped tripping
and started timing out. Codex caught it; sentinel is now
BREAK (SPECIAL func 0x0D). See project memory for the full
post-mortem. Lesson: avoid using real opcodes as
"unsupported sentinels" in test benches.
## Scope boundary
This directory owns EE CPU execution and its immediate coprocessors
(COP0 minimum; eventually COP1 FPU and COP2 VU macro mode). It does
**not** own:
- memory map / address decode — that's `rtl/memory/ee_memory_map_stub.sv`.
- interrupt controller — that's `rtl/intc/` (generic; the same
`intc_stub` module already serves the IOP side).
- DMAC, VIF/VU, GIF/GS — separate directories.
+142
View File
@@ -0,0 +1,142 @@
// retroDE_ps2 — ee_biu_mmio_stub
//
// Narrow latched-register-file stub for the EE Bus Interface Unit /
// cache-control window at virtual `0xFFFE_0000 - 0xFFFE_0FFF`
// (physical `0x1FFE_0000 - 0x1FFE_0FFF` after kseg1-stripping).
// Architecturally this is the R5900's privileged BIU/control
// register space — the same place the BIOS writes CACHE-control
// and BIU-config values during boot.
//
// Chapter 9: chapter 8 closed the 0x1F80_xxxx hole. The first-
// unmapped observer in tb_ee_core_bios_smoke then showed the next
// unmapped event was a WRITE at 0xFFFE_0130 (pc=0xBFC0_21BC,
// cycle 808). Multiple more writes to that same offset fire later
// with values 0xCC4, 0xCC0, 0x1E988, 0xC04, 0x3202_000F —
// classic cache/BIU config dance. Without a stub, these writes
// land as UNMAPPED events, and the first read back from this window
// would return 0xDEADBEEF and re-poison the pointer chain chapter
// 8 just cleaned up.
//
// Codex's call for chapter 9: give this its own dedicated stub
// with its own region tag, NOT a broad "everything else" fallback.
// Keep architecturally distinct surfaces distinct. If the BIOS
// later touches 0x1FA0_0000 (next unmapped in the observer), that
// will be its own chapter, not folded in here.
//
// Semantics (same shape as ee_bootstrap_mmio_stub):
// - 4 KiB window = 1024 × 32-bit latched registers, zero-init.
// - Writes latch per-byte: for each `wr_be[i]`, byte[i] of the
// addressed register updates; untouched lanes preserve their
// prior value. Makes SB/SH through the window safe.
// - Reads return currently-latched value, one-cycle latency.
// - No side effects. BIOS read-modify-write sequences stay
// self-consistent.
//
// Size cost: 1024 × 32 bits = 4 KiB sim memory. Negligible.
//
// Trace: per-access event on SUBSYS_MEM with region tag
// `REGION_EE_BIU = 10` (distinct from REGION_EE_MISC_MMIO=9 so
// post-run analysis can separate the two windows).
`timescale 1ns/1ps
// ee_biu_mmio_stub — side-effect-free latched register file with a
// byte-enable write port, a 1-cycle-latency read port, and a
// per-access trace event on SUBSYS_MEM.
//
// Ports:
//   reg_wr_*  write strobe, 12-bit byte offset, data, per-byte enables
//   reg_rd_*  read strobe, 12-bit byte offset; data/valid one clock later
//   ev_*      registered trace event (see the trace block for the layout)
module ee_biu_mmio_stub
    import trace_pkg::*;
(
    input  logic        clk,
    input  logic        rst_n,

    // Write port — 12-bit offset within the 4 KiB window
    input  logic        reg_wr_en,
    input  logic [11:0] reg_wr_addr,
    input  logic [31:0] reg_wr_data,
    input  logic [3:0]  reg_wr_be,

    // Read port — 1-cycle latency
    input  logic        reg_rd_en,
    input  logic [11:0] reg_rd_addr,
    output logic [31:0] reg_rd_data,
    output logic        reg_rd_valid,

    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    localparam int          WORDS         = 1024; // 4 KiB / 4
    localparam logic [63:0] REGION_EE_BIU = 64'd10;

    // Backing store. Zeroed once at time 0 by the initial block below;
    // rst_n does NOT clear it (the write process only gates on rst_n).
    logic [31:0] regs [0:WORDS-1];

    initial begin
        for (int i = 0; i < WORDS; i++) regs[i] = 32'd0;
    end

    // Word index = byte offset with the two low bits dropped.
    logic [9:0] wr_idx;
    logic [9:0] rd_idx;
    assign wr_idx = reg_wr_addr[11:2];
    assign rd_idx = reg_rd_addr[11:2];

    // Per-byte write latch. Writes are ignored while rst_n is low.
    //
    // This is a plain clocked `always`, not `always_ff`, on purpose:
    // regs[] is also written by the initial block above, and IEEE 1800
    // forbids any other process from writing a variable that is
    // assigned inside an always_ff procedure. Simulation behavior is
    // unchanged; the construct is simply legal on strict tools.
    always @(posedge clk) begin
        if (rst_n && reg_wr_en) begin
            if (reg_wr_be[0]) regs[wr_idx][ 7: 0] <= reg_wr_data[ 7: 0];
            if (reg_wr_be[1]) regs[wr_idx][15: 8] <= reg_wr_data[15: 8];
            if (reg_wr_be[2]) regs[wr_idx][23:16] <= reg_wr_data[23:16];
            if (reg_wr_be[3]) regs[wr_idx][31:24] <= reg_wr_data[31:24];
        end
    end

    // Read — 1-cycle latency. reg_rd_data holds its previous value
    // when no read is requested. A read in the same cycle as a write
    // to the same word samples the pre-write contents.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reg_rd_data  <= 32'd0;
            reg_rd_valid <= 1'b0;
        end else begin
            reg_rd_valid <= reg_rd_en;
            if (reg_rd_en) reg_rd_data <= regs[rd_idx];
        end
    end

    // Trace — write wins same-cycle collision (defensive; map enforces
    // mutual exclusion).
    //   WRITE: arg0 = byte offset, arg1 = write data, arg2 = byte
    //          enables, arg3 = REGION_EE_BIU, flags bit 0 = 1
    //   READ : arg0 = byte offset, arg1 = value being returned,
    //          arg2 = 0, arg3 = REGION_EE_BIU, flags = 0
    // On idle cycles only ev_valid is cleared; the payload fields hold.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_WRITE;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (reg_wr_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_WRITE;
            ev_arg0   <= {52'd0, reg_wr_addr};
            ev_arg1   <= {32'd0, reg_wr_data};
            ev_arg2   <= {60'd0, reg_wr_be};
            ev_arg3   <= REGION_EE_BIU;
            ev_flags  <= 32'h0000_0001;
        end else if (reg_rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_READ;
            ev_arg0   <= {52'd0, reg_rd_addr};
            ev_arg1   <= {32'd0, regs[rd_idx]};
            ev_arg2   <= 64'd0;
            ev_arg3   <= REGION_EE_BIU;
            ev_flags  <= 32'd0;
        end else begin
            ev_valid  <= 1'b0;
        end
    end

endmodule : ee_biu_mmio_stub
+269
View File
@@ -0,0 +1,269 @@
// retroDE_ps2 — ee_bootstrap_mmio_stub
//
// Latched-register-file stub for the EE "bootstrap MMIO" window at
// physical `0x1F80_0000 - 0x1F80_FFFF` (64 KiB). Covers the real
// PS2 MCH (memory controller), SBUS gateway, and RDRAM init
// registers the BIOS touches very early in boot. This is the
// narrowest thing that closes the poisoned-dataflow hole found by
// chapter 7.99: before this module existed, the EE map returned
// `0xDEADBEEF` for every CPU read in this window, and the BIOS
// laundered that poison into a data structure whose later
// traversal wedged the core forever.
//
// Semantics (deliberately simple, not architecturally accurate):
// - Full window is a 16 Ki-word (16384 × 32-bit = 64 KiB)
// word-addressed register file; all registers are zero-initialised
// at time 0 (regs[] is not cleared again by rst_n).
// - Writes latch per-byte: for each `wr_be[i]` that is asserted,
// `regs[addr[15:2]][8*i +: 8] <= wr_data[8*i +: 8]`. Untouched
// byte lanes preserve their existing value. This makes SB/SH
// write-through-this-window safe — prior chapters added SB/SH
// for BIOS progress, and without be-aware latching a sub-word
// store here would clobber the other three (or two) bytes.
// - Reads return the currently-latched value, one-cycle latency,
// matching the rest of the stub ecosystem.
// - No side effects, no per-register behavior (no ready-bit
// auto-set, no interrupt generation, no state machines).
//
// That keeps BIOS read/modify/write sequences self-consistent:
// if the BIOS reads reg X, ORs a bit, writes back, it sees the
// merged value on the next read. It does NOT emulate real
// hardware semantics (e.g. status bits that flip on their own,
// interrupt latches, FIFO behavior). If the BIOS tripwire-depends
// on any of that, it will reveal itself the same way the 0x14B4
// linked-list wedge did — via a new diagnostic signal, handled
// in a future chapter.
//
// Trace:
// Per-access event on SUBSYS_MEM with the region tag
// `REGION_EE_MISC_MMIO = 9`. arg0 is the 16-bit offset within
// the window (not the full 32-bit address — the map's own
// trace already carries the full address; the stub's finer
// trace carries the offset so downstream analysis can see
// which register was touched without having to mask). arg1 is
// the data (write data, or the value being returned on read).
// arg3 is the region constant. flags bit 0 = write.
//
// Size cost: 16384 × 32 bits ≈ 64 KiB of sim memory. Negligible.
`timescale 1ns/1ps
// ee_bootstrap_mmio_stub — byte-enable latched register file with a
// 1-cycle-latency read port, four read-side overrides (0x1814, 0x10F0,
// 0x1070, 0x1074) and a per-access trace event on SUBSYS_MEM.
module ee_bootstrap_mmio_stub
    import trace_pkg::*;
#(
    // Ch202 — narrow "ready" return for offset 0x1814. Pre-Ch201 the
    // window returned the latched register value (which initialises to
    // 0); the BIOS at PC=0xBFC4FB04..FB30 polls this address waiting
    // for ($read & $mask) != 0 and our zero return left it spinning.
    // Default = 32'hFFFFFFFF satisfies any non-zero mask the BIOS may
    // hold in $a0 — wider than a real PS2 GPUSTAT (typical idle =
    // 0x1C00_0000), but the BIOS has not been observed to USE the
    // value beyond the bit-test so the wider satisfaction is safe.
    // A future chapter can narrow this if a side-effect is observed.
    parameter logic [31:0] MMIO_1814_RDY_VALUE = 32'hFFFF_FFFF,

    // Ch258 — IOP DMAC PCR realism stub. The IOP DMAC Priority Control
    // Register lives at phys 0x1F8010F0 (= EE kseg1 0xBF8010F0). Real
    // PS1/IOP hardware resets this to 0x07654321 (priority 1 for ch0,
    // 2 for ch1, ... 7 for ch6, with bit[31:24]=0x07 as the enable
    // mask). Ch218 observer captured BIOS reading this address three
    // times during the Ch215 longjmp treadmill (PC=0xbfc4d2cc /
    // 0xbfc4d2dc / 0xbfc4d350), all returning 0 from our latched-zero
    // stub. Whether the zero return is the cause of the treadmill or
    // an incidental noise read is open — Ch258's job is to flip the
    // PCR to its real reset value and re-observe.
    //
    // This is a REALISM STUB, not a fix. We are not modelling the
    // IOP DMA channel priority semantics; we are just declining to
    // return poison-zero for a named hardware register with a known
    // reset value. If BIOS escapes the Ch215 treadmill after this
    // change, great. If it does not, Ch258 closes with "PCR was not
    // the gate" and we name the next observed blocker.
    parameter logic [31:0] MMIO_10F0_PCR_VALUE = 32'h0765_4321
)
(
    input  logic        clk,
    input  logic        rst_n,

    // Write port
    input  logic        reg_wr_en,
    input  logic [15:0] reg_wr_addr,
    input  logic [31:0] reg_wr_data,
    input  logic [3:0]  reg_wr_be,

    // Read port — 1-cycle latency, matches rest of stub ecosystem
    input  logic        reg_rd_en,
    input  logic [15:0] reg_rd_addr,
    output logic [31:0] reg_rd_data,
    output logic        reg_rd_valid,

    // Ch259 / Ch260 — DIAGNOSTIC source-injection port for the named
    // IOP INTC view at 0x1F801070/0x1F801074. DEFAULT IS ZERO in every
    // existing instantiation (tb_ee_bootstrap_mmio.sv and
    // tb_ee_core_bios_smoke.sv both tie this to 16'd0 unless the
    // BIOS-long TB's +IOP_INTC_BOOT_SRC plusarg overrides it).
    //
    // When non-zero, each set bit is ORed into I_STAT every cycle so
    // the assertion survives W1C clears (matches the "real device
    // asserts the line until serviced" shape, not a one-shot pulse).
    //
    // This port exists ONLY as a controlled diagnostic knob. Ch259
    // closed the BIOS-mmio-probe arc with the finding that single
    // synthetic source bits do not break the Ch215 treadmill — the
    // multi-state IOP/SBUS/kernel activity is needed instead. Any
    // future use of this port should be similarly scoped (TB-driven,
    // documented intent, default-zero on instantiation).
    input  logic [15:0] iop_intc_inject_src_i,

    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    localparam int          WORDS               = 16384; // 64 KiB / 4
    localparam logic [63:0] REGION_EE_MISC_MMIO = 64'd9;

    // Word indices of the offsets whose reads bypass regs[].
    localparam logic [13:0] OFFSET_1814_WIDX = 14'h0605; // 0x1814 >> 2 (1541)
    localparam logic [13:0] OFFSET_10F0_WIDX = 14'h043C; // 0x10F0 >> 2 (1084)
    localparam logic [13:0] OFFSET_1070_WIDX = 14'h041C; // 0x1070 >> 2 (1052)
    localparam logic [13:0] OFFSET_1074_WIDX = 14'h041D; // 0x1074 >> 2 (1053)

    // Backing store. Zeroed once at time 0 by the initial block below;
    // rst_n does NOT clear it (the write process only gates on rst_n).
    logic [31:0] regs [0:WORDS-1];

    initial begin
        for (int i = 0; i < WORDS; i++) regs[i] = 32'd0;
    end

    // Word index = byte offset with the two low bits dropped.
    logic [13:0] wr_idx;
    logic [13:0] rd_idx;
    assign wr_idx = reg_wr_addr[15:2];
    assign rd_idx = reg_rd_addr[15:2];

    // Per-byte write latch — honors reg_wr_be so SB/SH through this
    // window preserves the untouched byte lanes instead of clobbering
    // the whole 32-bit register. Writes are ignored while rst_n is low.
    //
    // This is a plain clocked `always`, not `always_ff`, on purpose:
    // regs[] is also written by the initial block above, and IEEE 1800
    // forbids any other process from writing a variable that is
    // assigned inside an always_ff procedure. Simulation behavior is
    // unchanged; the construct is simply legal on strict tools.
    always @(posedge clk) begin
        if (rst_n && reg_wr_en) begin
            if (reg_wr_be[0]) regs[wr_idx][ 7: 0] <= reg_wr_data[ 7: 0];
            if (reg_wr_be[1]) regs[wr_idx][15: 8] <= reg_wr_data[15: 8];
            if (reg_wr_be[2]) regs[wr_idx][23:16] <= reg_wr_data[23:16];
            if (reg_wr_be[3]) regs[wr_idx][31:24] <= reg_wr_data[31:24];
        end
    end

    // Ch259 — named IOP INTC state. Independent of the anonymous
    // regs[] (writes to 0x1070/0x1074 still update regs[] via the
    // generic per-byte latch above, but reads bypass it for these
    // offsets, matching the Ch202/Ch258 override pattern).
    // Only full-word writes (all four byte enables set) reach the
    // named state; sub-word writes touch regs[] only.
    logic [15:0] iop_intc_stat_q;
    logic [15:0] iop_intc_mask_q;

    wire [15:0] iop_intc_stat_w1c_mask =
        (reg_wr_en && wr_idx == OFFSET_1070_WIDX && (&reg_wr_be))
            ? reg_wr_data[15:0] : 16'd0;
    wire        iop_intc_mask_wr_en =
        reg_wr_en && wr_idx == OFFSET_1074_WIDX && (&reg_wr_be);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            iop_intc_stat_q <= 16'd0;
            iop_intc_mask_q <= 16'd0;
        end else begin
            // I_STAT: W1C of cleared bits, OR'd with sticky injection.
            // Assertion-wins on same-cycle W1C+source collision —
            // matches `intc_stub.sv` lines ~102-110 so we don't
            // swallow an interrupt that's still held.
            iop_intc_stat_q <= (iop_intc_stat_q & ~iop_intc_stat_w1c_mask)
                             | iop_intc_inject_src_i;
            if (iop_intc_mask_wr_en)
                iop_intc_mask_q <= reg_wr_data[15:0];
        end
    end

    // The STAT view ORs in the live injection bits so a held source is
    // visible even in the cycle before it lands in iop_intc_stat_q.
    wire [31:0] iop_intc_stat_read = {16'd0, iop_intc_stat_q | iop_intc_inject_src_i};
    wire [31:0] iop_intc_mask_read = {16'd0, iop_intc_mask_q};

    // Read value selection — single source of truth shared by the read
    // port and the trace, so the traced value can never drift from the
    // value actually returned.
    //   Ch202: offset 0x1814 ignores the latched register and returns
    //     MMIO_1814_RDY_VALUE so the BIOS bit-test poll satisfies
    //     (read & mask) != 0 on the first read. Writes to 0x1814 still
    //     latch into regs[]; a future chapter can promote 0x1814 to a
    //     true read-write register if BIOS-write semantics matter, but
    //     the current observed behavior is read-only-status.
    //   Ch258 adds the same shape for offset 0x10F0 (IOP DMAC PCR).
    //   Ch259 promotes 0x1070 (IOP INTC I_STAT) and 0x1074 (I_MASK)
    //     OUT of the anonymous regfile into named INTC behavior — W1C
    //     on STAT writes, plain-write on MASK writes, sticky source
    //     injection from `iop_intc_inject_src_i`. Matches the existing
    //     `rtl/intc/intc_stub.sv` shape exactly so the EE-side view of
    //     the IOP INTC behaves like the IOP-side view does.
    logic [31:0] rd_value;
    always_comb begin
        case (rd_idx)
            OFFSET_1814_WIDX: rd_value = MMIO_1814_RDY_VALUE;
            OFFSET_10F0_WIDX: rd_value = MMIO_10F0_PCR_VALUE;
            OFFSET_1070_WIDX: rd_value = iop_intc_stat_read;
            OFFSET_1074_WIDX: rd_value = iop_intc_mask_read;
            default:          rd_value = regs[rd_idx];
        endcase
    end

    // Read — 1-cycle latency. reg_rd_data holds its previous value when
    // no read is requested.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reg_rd_data  <= 32'd0;
            reg_rd_valid <= 1'b0;
        end else begin
            reg_rd_valid <= reg_rd_en;
            if (reg_rd_en) reg_rd_data <= rd_value;
        end
    end

    // Trace emission — one event per cycle, write wins on same-cycle
    // collision (mirrors the rd/wr_en mutual-exclusion at the map level;
    // this is defensive for mechanical safety).
    //   WRITE: arg0 = 16-bit offset, arg1 = write data, arg2 = 0,
    //          arg3 = REGION_EE_MISC_MMIO, flags bit 0 = 1
    //   READ : arg0 = 16-bit offset, arg1 = value being returned
    //          (overrides included), arg2 = 0,
    //          arg3 = REGION_EE_MISC_MMIO, flags = 0
    // On idle cycles only ev_valid is cleared; the payload fields hold.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_WRITE;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (reg_wr_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_WRITE;
            ev_arg0   <= {48'd0, reg_wr_addr};
            ev_arg1   <= {32'd0, reg_wr_data};
            ev_arg2   <= 64'd0;
            ev_arg3   <= REGION_EE_MISC_MMIO;
            ev_flags  <= 32'h0000_0001;
        end else if (reg_rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_READ;
            ev_arg0   <= {48'd0, reg_rd_addr};
            ev_arg1   <= {32'd0, rd_value};
            ev_arg2   <= 64'd0;
            ev_arg3   <= REGION_EE_MISC_MMIO;
            ev_flags  <= 32'd0;
        end else begin
            ev_valid  <= 1'b0;
        end
    end

endmodule : ee_bootstrap_mmio_stub
File diff suppressed because it is too large Load Diff
+128
View File
@@ -0,0 +1,128 @@
// retroDE_ps2 — ee_fetch_stub
//
// Minimal sequential-fetch stand-in for the R5900. Wave 1 scope only: enough
// to drive ee_memory_map_stub → bios_rom_stub for Milestone B.
//
// Contract refs:
// docs/stub_module_plan.md (Wave 1, item 4)
// docs/contracts/ee.md
//
// Behavior:
// - On reset, PC = RESET_VECTOR (default 0xBFC00000, the MIPS BIOS
// reset vector in kseg1).
// - Each cycle while `enable` is high: issue a read at PC, advance
// PC += 4. No decode, no branches, no exceptions, no retirement
// fidelity (all out-of-scope per plan).
// - Responses return 1 cycle later via rd_valid/rd_data from the
// memory map. The issued address is latched so the trace line can
// pair address with data.
//
// Non-goals for this wave (stub plan, explicit):
// - full decode,
// - exceptions beyond deterministic fault handling,
// - FPU/MMI behavior,
// - instruction retirement fidelity.
//
// Trace payload schema (per stub plan):
// EE RESET arg0=reset_vector
// EE IFETCH arg0=pc arg1=data arg2=resp_kind arg3=-
// resp_kind: 0=OK (only path in Wave 1)
`timescale 1ns/1ps
// ee_fetch_stub — free-running sequential fetcher. Issues one read per
// enabled cycle starting at RESET_VECTOR, stepping the PC by 4, and
// reports a reset event plus one IFETCH event per returned response.
module ee_fetch_stub
    import trace_pkg::*;
#(
    parameter logic [31:0] RESET_VECTOR = 32'hBFC00000
) (
    input  logic        clk,
    input  logic        rst_n,
    input  logic        enable,

    // Memory-facing fetch port
    output logic        rd_en,
    output logic [31:0] rd_addr,
    input  logic [31:0] rd_data,
    input  logic        rd_valid,

    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    // resp_kind carried in ev_arg2 of EV_IFETCH; 0 = OK is the only
    // value this stub ever reports.
    localparam logic [63:0] RESP_KIND_OK = 64'd0;

    // ------------------------------------------------------------------
    // Program counter and its one-cycle shadow
    //
    //   pc     drives rd_addr: the address requested in the current cycle
    //   pc_d1  the previously issued address, i.e. the one whose response
    //          is expected on rd_valid in the current cycle
    //
    // Both registers move only on enabled cycles, which keeps pc_d1
    // paired with the request that is still in flight when enable drops.
    // ------------------------------------------------------------------
    logic [31:0] pc;
    logic [31:0] pc_d1;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            pc_d1 <= RESET_VECTOR;
            pc    <= RESET_VECTOR;
        end else begin
            if (enable) begin
                pc    <= pc + 32'd4;
                pc_d1 <= pc;
            end
        end
    end

    // The fetch port is purely combinational off pc / enable.
    assign rd_addr = pc;
    assign rd_en   = enable;

    // ------------------------------------------------------------------
    // Trace
    //   - EV_RESET is emitted exactly once, on the first clock after
    //     reset is released (reset_emit_pending is set during reset and
    //     cleared when the event is emitted).
    //   - EV_IFETCH is registered, so it appears one cycle after the
    //     rd_valid response it describes: arg0 = pc_d1, arg1 = rd_data.
    //   - EV_RESET takes priority: an rd_valid seen on the same clock
    //     as the pending reset event is not traced.
    //   - On idle cycles only ev_valid is cleared; payload fields hold.
    // ------------------------------------------------------------------
    logic reset_emit_pending;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reset_emit_pending <= 1'b1;
            ev_valid           <= 1'b0;
            ev_subsys          <= SUBSYS_EE;
            ev_event           <= EV_RESET;
            ev_arg0            <= 64'd0;
            ev_arg1            <= 64'd0;
            ev_arg2            <= 64'd0;
            ev_arg3            <= 64'd0;
            ev_flags           <= 32'd0;
        end else begin
            // Default: no event this cycle. Overridden below when one fires.
            ev_valid <= 1'b0;

            if (reset_emit_pending) begin
                reset_emit_pending <= 1'b0;
                ev_valid           <= 1'b1;
                ev_subsys          <= SUBSYS_EE;
                ev_event           <= EV_RESET;
                ev_arg0            <= {32'd0, RESET_VECTOR};
                ev_arg1            <= 64'd0;
                ev_arg2            <= 64'd0;
                ev_arg3            <= 64'd0;
                ev_flags           <= 32'd0;
            end else if (rd_valid) begin
                ev_valid  <= 1'b1;
                ev_subsys <= SUBSYS_EE;
                ev_event  <= EV_IFETCH;
                ev_arg0   <= {32'd0, pc_d1};
                ev_arg1   <= {32'd0, rd_data};
                ev_arg2   <= RESP_KIND_OK;
                ev_arg3   <= 64'd0;
                ev_flags  <= 32'd0;
            end
        end
    end

endmodule : ee_fetch_stub
+43
View File
@@ -0,0 +1,43 @@
# rtl/gif_gs
GIF path and Graphics Synthesizer logic. Matches `docs/contracts/gif_gs.md`.
## Current contents
- `gs_stub.sv` — GS shell with **two architecturally distinct write ports**
(Ch75 namespace split):
- `reg_wr_*` — privileged-block writes (16-bit offset within `0x12000000`).
Latches `BGCOLOR` (offset `0x00E0`) into `bg_{r,g,b}`; other offsets emit
`EV_MODE`.
- `gif_reg_*` — GIF A+D register-number writes (8-bit reg# + 64-bit data).
Decodes `PRIM=0x00`, `RGBAQ=0x01`, `XYZF2=0x04`, `XYZ2=0x05`,
`FRAME_1=0x4C`, `ZBUF_1=0x4E` into per-register 64-bit latches; unknown
reg numbers emit `EV_MODE`.
- No VRAM, no drawing yet — that is the next architectural step.
- `gif_path_stub.sv` — Wave 2 minimal GIF packet logger; project-local
single-qword register-write format. Used by `tb_bgcolor_via_dma`.
- `gif_packed_stub.sv` — real PS2 GIFtag parser (Ch72-Ch75). Handles PACKED
(FLG=0), REGLIST (FLG=1), IMAGE (FLG=2), DISABLE (FLG=3). The
`REAL_AD_REG_MAP` parameter selects the A+D dispatch port:
- `REAL_AD_REG_MAP=0` (default, back-compat) — drives `gs_stub.reg_wr_*`
using a project-local 16-bit offset carried in `in_data[79:64]`.
- `REAL_AD_REG_MAP=1` — drives `gs_stub.gif_reg_*` using the real PS2
8-bit reg# carried in `in_data[71:64]`. Source-of-truth: PCSX2
`GSRegs.h`.
## BGCOLOR reset value
At reset, `bg_{r,g,b}` default to `0x40` each (mid-grey) rather than black.
Rationale: this makes "gs_stub reset but no BGCOLOR write yet" visually
distinct from "video output disabled / black frame" in Milestone A. To
override the default, issue a `BGCOLOR` write from the test harness.
## Pitfall: namespace conflation
Ch74 conflated GIF A+D reg numbers with GS privileged-block offsets and
mapped e.g. `0x14`→PMODE@`0x0000`. That is fiction — those are separate
namespaces. Ch75 split them. **`ZBUF_1` is `0x4E`, not `0x4F` (that's
`ZBUF_2`).** When adding a new GIF-context register, source the reg# from
PCSX2 `GSRegs.h`, never from the privileged-block map.
+275
View File
@@ -0,0 +1,275 @@
// retroDE_ps2 — clut_loader_stub (Ch99 + Ch100 + Ch101)
//
// VRAM→CLUT load engine triggered by GIF TEX0.CLD. Watches the
// 1-cycle `tex0_wr_pulse` from gs_stub and starts a 256-entry
// load when the just-written TEX0 satisfies all three:
// - CSM == 1 (CSM2 linear; CSM1 swizzle deferred)
// - CPSM ∈ {PSMCT32, PSMCT16}
// - CLD permits a load under the change-detect policy:
// 0 = never, 1 = always,
// 2 = CBP changed since last load,
// 3 = CBP, CPSM, or CSA changed since last load,
// 4 = always, but write only the 16-entry CSA window
// (Ch102) — destination indices CSA*16..CSA*16+15
// wrap mod 256; the rest of clut_stub is preserved.
// 5..7 = reserved/edge cases at this scope (no-op).
// Per-CPSM stride: PSMCT32 reads 4 bytes/entry from
// VRAM[CBP*256 + i*4]; PSMCT16 reads 2 bytes/entry from
// VRAM[CBP*256 + i*2] and unpacks RGB5A1 → PSMCT32 ABGR with
// 5→8 bit-replicate. clut_stub always sees PSMCT32 entries.
//
// Scope (Ch99 + Ch100):
// - CSM2 (linear addressing) only — entry i lives at byte
// offset i*entry_stride from CBP*256, where entry_stride is
// 4 (PSMCT32) or 2 (PSMCT16). The loader explicitly gates
// start on tex0_csm == 1'b1 (CSM2). A TEX0_1 write with
// CSM=0 (CSM1, 16×16 grid swizzle) is silently ignored at
// this scope rather than performing a wrong linear load.
// - CPSM=PSMCT32 (=0) and CPSM=PSMCT16 (=2) accepted. PSMCT16
// entries are unpacked from RGB5A1 to PSMCT32 ABGR via 5→8
// bit-replicate ({c5, c5[4:2]}) so clut_stub always stores
// PSMCT32 regardless of source format and pcrtc's existing
// PSMT8+CLUT lookup path stays unchanged. Alpha is replicated
// across 8 bits ({8{a1}}). Other CPSM codes (PSMCT24, PSMT8H,
// etc.) are silently ignored.
// - CLD modes (Ch101 + Ch102): full conditional policy
// honored for CLD ∈ {0, 1, 2, 3, 4}.
// 0 = no load.
// 1 = always load — full 256 entries.
// 2 = load only when CBP changed since last load.
// 3 = load when CBP, CPSM, or CSA changed since last load.
// 4 = partial CSA-window load (Ch102) — always fires, but
// writes only 16 entries at clut_stub[CSA*16 + i] for
// i ∈ 0..15 (CSA*16 wraps mod 256). The other 240
// entries are preserved; the VRAM source still starts
// at CBP*256 and uses the same per-CPSM byte stride.
// CLD ∈ {5, 6, 7} silently no-op at this scope (reserved /
// edge cases). The change-detect compares against `prev_*`
// regs latched on entry to S_LOAD; reset clears them to 0,
// so a first CLD=2 with CBP==0 is silently skipped (matches
// the "nothing changed" interpretation).
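// - Reference (kept for posterity): real PS2 CLD encodes:
// 1 = always
// 2 = CBP changed
// 3 = CBP, CPSM, or CSA changed
// 4 = CSA changed (partial 16-entry load at CSA)
// 5..7 = reserved/edge cases
// Modeling those needs a full-CLUT register snapshot for
// change detection — deferred.
// NOTE(review): this table does not appear to match PCSX2's
// GSClut CLD handling, where CLD=2/3 load unconditionally and
// copy CBP into CBP0/CBP1, and CLD=4/5 load only when CBP
// differs from CBP0/CBP1. Confirm against the GS manual /
// PCSX2 before relying on this table as a hardware reference.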
// - CSA is consumed two ways. (a) For CLD=3 it's a
// change-detect input (any prev-vs-new CSA delta triggers
// a full reload). (b) For CLD=4 it picks the destination
// window: load_csa_base = {CSA, 4'd0} (8-bit, so CSA=16..31
// wrap to base 0..240). Full-CLUT loads (CLD ∈ {1,2,3})
// overwrite all 256 entries regardless of CSA.
// - One in-flight load at a time. A new TEX0_1 write while
// `load_busy=1` is silently ignored at this scope.
//
// Timing: full load = 256 clocks; partial (CLD=4) = 16 clocks.
// `load_busy` is high throughout. TBs typically `wait (load_busy == 0)` to
// gate scanout configuration on the load completing.
`timescale 1ns/1ps
module clut_loader_stub #(
    // Ch350 — opt-in CSM1 (16×16 PSMCT32 grid) load path.
    //   0 (default): only CSM2-linear loads can start; a CSM=0 TEX0
    //                commit never triggers a load.
    //   1          : a CSM=0 / CPSM=PSMCT32 commit also starts a load,
    //                with palette entry i fetched from grid position
    //                (x=i[3:0], y=i[7:4]) via addr_offset_csm1 below.
    parameter bit CLUT_CSM1_ENABLE = 1'b0
) (
    input  logic        clk,
    input  logic        rst_n,           // synchronous, active-low
    // TEX0 commit strobe (1 cycle) and the fields decoded from the
    // value that was just written.
    input  logic        tex0_wr_pulse,
    input  logic [13:0] tex0_cbp,        // CLUT base, 256-byte units
    input  logic [3:0]  tex0_cpsm,       // 0 = PSMCT32, 2 = PSMCT16
    input  logic        tex0_csm,        // 1 = CSM2 (linear), 0 = CSM1 (grid)
    input  logic [4:0]  tex0_csa,        // CLD=3 change-detect, CLD=4 window
    input  logic [2:0]  tex0_cld,        // load policy (see cld_match)
    // VRAM read port: byte address out, data back combinationally.
    output logic [31:0] vram_read_addr,
    input  logic [31:0] vram_read_data,
    // CLUT write port: one entry per clock while a load is running.
    output logic        clut_write_en,
    output logic [7:0]  clut_write_idx,
    output logic [31:0] clut_write_data,
    // High for the whole duration of a load.
    output logic        load_busy
);

    // ------------------------------------------------------------
    // State and latched load descriptor
    // ------------------------------------------------------------
    typedef enum logic [0:0] {
        S_IDLE,
        S_LOAD
    } state_e;

    state_e      state;
    logic [7:0]  load_idx;            // entry counter within the load
    logic [13:0] load_cbp;            // CBP captured at start
    logic        load_cpsm_is_ct16;   // source entries are PSMCT16
    logic        load_csm1;           // source uses CSM1 grid addressing
    logic        load_partial;        // CLD=4: 16-entry CSA-window load
    logic [7:0]  load_csa_base;       // first destination index (CLD=4)

    // TEX0 fields of the most recently started load; the reference
    // for the CLD=2 / CLD=3 change tests.
    logic [13:0] prev_cbp;
    logic [3:0]  prev_cpsm;
    logic [4:0]  prev_csa;

    // Last value of load_idx for the current load: 15 for a partial
    // load, 255 for a full one.
    logic [7:0]  load_terminal;
    assign load_terminal = load_partial ? 8'h0F : 8'hFF;

    assign load_busy = (state == S_LOAD);

    // ------------------------------------------------------------
    // VRAM source address
    // ------------------------------------------------------------
    logic [31:0] cbp_bytes;
    logic [31:0] addr_offset_ct32;
    logic [31:0] addr_offset_ct16;
    logic [31:0] addr_offset_csm1;
    logic [31:0] addr_offset;

    // CBP * 256, written as a field placement instead of a shift.
    assign cbp_bytes = {10'd0, load_cbp, 8'd0};

    // Linear (CSM2) offsets: 4 bytes/entry for PSMCT32, 2 for PSMCT16.
    assign addr_offset_ct32 = {22'd0, load_idx, 2'd0};
    assign addr_offset_ct16 = {23'd0, load_idx, 1'd0};

    // CSM1 grid offset with ix = load_idx[3:0], iy = load_idx[7:4]:
    //   block * 256    -> bits [9:8] = {iy[3], ix[3]}
    //   iy[2:0] * 32   -> bits [7:5]
    //   ix[2:0] * 4    -> bits [4:2]
    // The three terms occupy disjoint bit ranges, so their sum is the
    // same value as this concatenation.
    assign addr_offset_csm1 = {22'd0,
                               load_idx[7], load_idx[3],
                               load_idx[6:4],
                               load_idx[2:0],
                               2'd0};

    assign addr_offset = load_csm1         ? addr_offset_csm1
                       : load_cpsm_is_ct16 ? addr_offset_ct16
                       :                     addr_offset_ct32;

    assign vram_read_addr = cbp_bytes + addr_offset;

    // ------------------------------------------------------------
    // PSMCT16 (RGB5A1) -> PSMCT32 (ABGR) expansion
    // ------------------------------------------------------------
    logic [15:0] psm16_entry;
    logic [4:0]  psm16_r5, psm16_g5, psm16_b5;
    logic        psm16_a1;
    logic [7:0]  psm16_r8, psm16_g8, psm16_b8, psm16_a8;
    logic [31:0] write_data_ct16;

    assign psm16_entry = vram_read_data[15:0];

    // Field split, MSB first: A[15] B[14:10] G[9:5] R[4:0].
    assign {psm16_a1, psm16_b5, psm16_g5, psm16_r5} = psm16_entry;

    // 5 -> 8 bits by appending the top three bits; alpha is the
    // single bit copied across the byte.
    assign psm16_r8 = {psm16_r5, psm16_r5[4:2]};
    assign psm16_g8 = {psm16_g5, psm16_g5[4:2]};
    assign psm16_b8 = {psm16_b5, psm16_b5[4:2]};
    assign psm16_a8 = {8{psm16_a1}};

    assign write_data_ct16 = {psm16_a8, psm16_b8, psm16_g8, psm16_r8};

    // ------------------------------------------------------------
    // CLUT write port (purely combinational from the load state)
    // ------------------------------------------------------------
    assign clut_write_en   = (state == S_LOAD);
    // Partial loads land at the CSA window base + counter (8-bit
    // wrap); full loads use the counter directly.
    assign clut_write_idx  = load_partial ? (load_csa_base + load_idx)
                                          : load_idx;
    assign clut_write_data = load_cpsm_is_ct16 ? write_data_ct16
                                               : vram_read_data;

    // ------------------------------------------------------------
    // Start qualification
    // ------------------------------------------------------------
    // Per-field "differs from the last started load" flags.
    logic cbp_changed;
    logic cpsm_changed;
    logic csa_changed;
    assign cbp_changed  = (tex0_cbp  != prev_cbp);
    assign cpsm_changed = (tex0_cpsm != prev_cpsm);
    assign csa_changed  = (tex0_csa  != prev_csa);

    // cld_match: the CLD value by itself allows a load.
    //   1, 4 -> always (4 selects the partial window at start time)
    //   2    -> CBP differs from the last started load
    //   3    -> CBP, CPSM or CSA differs from the last started load
    //   0, 5..7 -> never
    logic cld_match;
    always_comb begin
        cld_match = 1'b0;
        unique case (tex0_cld)
            3'd1, 3'd4: cld_match = 1'b1;
            3'd2:       cld_match = cbp_changed;
            3'd3:       cld_match = cbp_changed || cpsm_changed || csa_changed;
            default:    ;
        endcase
    end

    // Format qualification. CSM2 accepts PSMCT32 and PSMCT16. The
    // CSM1 term accepts PSMCT32 only and is constant 0 unless
    // CLUT_CSM1_ENABLE is set.
    logic start_csm2_ok;
    logic start_csm1_ok;
    logic load_start;
    assign start_csm2_ok = (tex0_csm == 1'b1)
                        && ((tex0_cpsm == 4'd0) || (tex0_cpsm == 4'd2));
    assign start_csm1_ok = CLUT_CSM1_ENABLE
                        && (tex0_csm == 1'b0) && (tex0_cpsm == 4'd0);
    assign load_start    = tex0_wr_pulse
                        && cld_match
                        && (start_csm2_ok || start_csm1_ok);

    // ------------------------------------------------------------
    // Load FSM
    // ------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state             <= S_IDLE;
            load_idx          <= 8'd0;
            load_cbp          <= 14'd0;
            load_cpsm_is_ct16 <= 1'b0;
            load_csm1         <= 1'b0;
            load_partial      <= 1'b0;
            load_csa_base     <= 8'd0;
            prev_cbp          <= 14'd0;
            prev_cpsm         <= 4'd0;
            prev_csa          <= 5'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    // A qualified TEX0 commit captures the load
                    // descriptor and refreshes the change-detect
                    // snapshot. Commits that fail qualification
                    // leave everything untouched.
                    if (load_start) begin
                        state             <= S_LOAD;
                        load_idx          <= 8'd0;
                        load_cbp          <= tex0_cbp;
                        load_cpsm_is_ct16 <= (tex0_cpsm == 4'd2);
                        load_csm1         <= !tex0_csm;
                        load_partial      <= (tex0_cld == 3'd4);
                        // CSA * 16 kept to 8 bits: only CSA[3:0]
                        // contributes, so CSA 16..31 alias 0..15.
                        load_csa_base     <= {tex0_csa[3:0], 4'd0};
                        prev_cbp          <= tex0_cbp;
                        prev_cpsm         <= tex0_cpsm;
                        prev_csa          <= tex0_csa;
                    end
                end

                S_LOAD: begin
                    // The counter advances every clock; the state
                    // returns to idle on the clock where it holds the
                    // terminal index. TEX0 commits are not examined
                    // in this state.
                    load_idx <= load_idx + 8'd1;
                    if (load_idx == load_terminal) begin
                        state <= S_IDLE;
                    end
                end
            endcase
        end
    end

endmodule : clut_loader_stub
+80
View File
@@ -0,0 +1,80 @@
// retroDE_ps2 — clut_stub (Ch97)
//
// Minimal palette RAM for indexed-color scanout. PSMT8 scanout
// (Ch96) currently surfaces the index as grayscale; with this
// CLUT wired in, the index is looked up to produce real RGB.
//
// Scope (intentionally minimal for Ch97):
// - 256 entries × 32 bits (PSMCT32 ABGR per entry). PSMT4
// (16 entries) uses the same RAM with a smaller index range.
// - CSM2 (linear) addressing only. Index N reads entry N. CSA
// (entry offset) is honored OUTSIDE this module — pcrtc
// computes effective_idx = idx + (CSA << 4) and presents it
// as `read_idx`. CSM1 (16×16 grid swizzle inside a CSPM
// block) is deferred.
// - Combinational read port for pcrtc (tight scanout latency).
// - Single registered write port. Two writers exist at this
// scope, picked by the wiring at the TB level:
// (a) TB-direct programming for tests that want to lock
// pcrtc-side decode in isolation (Ch97 PSMT8+CLUT TB,
// Ch98 TEX0_1 CSA-flow TB).
// (b) `clut_loader_stub` (Ch99/Ch100) — a small FSM that
// copies 256 entries from VRAM[CBP*256] into this RAM
// when a TEX0_1 GIF write commits with CLD!=0,
// CSM=CSM2, and CPSM ∈ {PSMCT32, PSMCT16}. PSMCT16
// entries are unpacked from RGB5A1 to PSMCT32 ABGR
// inside the loader, so clut_stub always stores
// PSMCT32 regardless of source. clut_stub doesn't know
// which writer is in play; it just commits whatever
// the wired write_* port carries.
//
// Real PS2 CLUT is held in a 1 KiB internal staging area and
// loaded from VRAM[CBP] when CLD bits in TEX0 fire. The load path
// lives in `clut_loader_stub`: Ch99/Ch100 model CPSM ∈ {PSMCT32,
// PSMCT16} with CSM2, Ch101/Ch102 add the conditional CLD modes
// (2, 3) and the CLD=4 partial CSA-window load, and Ch350 adds an
// opt-in CSM1 grid path (PSMCT32 only). CLD ∈ {5..7} and
// CPSM ∉ {PSMCT32, PSMCT16} stay deferred.
`timescale 1ns/1ps
module clut_stub
#(
    // Number of 32-bit palette entries held in the table.
    parameter int unsigned ENTRIES = 256
) (
    input  logic        clk,
    input  logic        rst_n,
    // Single write port. The module does not care who drives it; it
    // commits whatever write_* carries on a clock edge where
    // write_en is high and rst_n is high.
    input  logic        write_en,
    input  logic [7:0]  write_idx,
    input  logic [31:0] write_data,
    // Combinational read port #1 (scanout consumer).
    input  logic [7:0]  read_idx,
    output logic [31:0] read_data,
    // Combinational read port #2 (texture-sampler consumer). It reads
    // the same table as port #1 and is fully independent of it.
    input  logic [7:0]  tex_read_idx,
    output logic [31:0] tex_read_data
);

    // Palette storage, one 32-bit word per entry.
    logic [31:0] mem [0:ENTRIES-1];

    // Simulation start-up contents: every entry zero. Reset does NOT
    // clear the table afterwards; it only blocks writes.
    initial begin
        for (int unsigned slot = 0; slot < ENTRIES; slot++) begin
            mem[slot] = '0;
        end
    end

    // Both read ports are plain asynchronous lookups.
    assign tex_read_data = mem[tex_read_idx];
    assign read_data     = mem[read_idx];

    // A write commits only when the module is out of reset.
    logic write_commit;
    assign write_commit = rst_n && write_en;

    always_ff @(posedge clk) begin
        if (write_commit) begin
            mem[write_idx] <= write_data;
        end
    end

endmodule : clut_stub
+131
View File
@@ -0,0 +1,131 @@
// retroDE_ps2 — ee_gs_priv_bridge_stub (Ch111)
//
// Bridges 32-bit EE-MMIO writes targeting the GS privileged-
// register window at 0x1200_0000 into the 64-bit gs_stub.reg_wr_*
// port. Real PS2 driver code reaches PMODE / DISPFB1 / DISPLAY1
// (etc.) via 64-bit MIPS `sd` instructions; the EE microarch
// breaks each `sd` into a pair of 32-bit `sw` operations to the
// low+high halves of the 8-byte register slot. This bridge does
// the inverse — it watches the 32-bit EE write stream, latches a
// 64-bit shadow per 8-byte slot, and fires a gs_stub.reg_wr_*
// pulse on EVERY half-write with the running 64-bit shadow value.
//
// Scope:
// - One shared 64-bit shadow + an offset[15:3] tag identifying
// the currently-tracked 8-byte slot. Sequential writes to the
// SAME slot accumulate (low first, then high → final shadow
// has both halves correct on the second fire). Switching
// slots resets the shadow to zero so partial-half writes to
// a fresh slot don't carry stale data from a different reg.
// - Each EE half-write fires a gs_stub.reg_wr_* pulse with the
// 8-byte-aligned offset (`{ee_wr_addr[15:3], 3'b000}`) and
// the FULL 64-bit shadow. Single-half writes (e.g. PMODE
// where only the low byte matters) work because the high
// half stays zero and gs_stub's latch sees the right value.
// - 32-bit EE write width (matches ee_memory_map_stub's
// ee_wr_*-port surface). **Full-word writes only**:
// `ee_wr_be` MUST be 4'b1111 on every accepted write. Byte-
// lane merging into the 64-bit shadow is intentionally NOT
// modelled here — control-plane GS registers (PMODE/
// DISPFB1/DISPLAY1/etc.) are always written as full 32-bit
// halves of an `sd`, and constraining the contract keeps the
// shadow + commit logic small. A simulation-time `$error`
// fires if a non-full be is presented; a future chapter can
// widen the bridge to per-byte merge if/when a real driver
// pattern needs sub-word writes here.
//
// Wiring contract (TB-level for Ch111):
// ee_wr_en ← TB EE-MMIO write strobe at 0x12000000+offset
// ee_wr_addr ← 16-bit offset within the GS priv window (= EE
// phys addr [15:0]; the upper EE-window decode
// lives in the test bench / memory map)
// ee_wr_data ← 32-bit EE data (one of two halves of a 64-bit
// GS register)
// ee_wr_be ← 4-bit per-byte enable (typically 4'b1111)
//
// The bridge does NOT participate in EE reads. The gs_stub
// privileged-register port is write-only at this scope, matching
// the limited read coverage of the GS priv block in the rest of
// the design.
`timescale 1ns/1ps
module ee_gs_priv_bridge_stub
(
    input  logic        clk,
    input  logic        rst_n,          // synchronous, active-low
    // EE-side write port: 32-bit data at a 16-bit offset.
    input  logic        ee_wr_en,
    input  logic [15:0] ee_wr_addr,
    input  logic [31:0] ee_wr_data,
    input  logic [3:0]  ee_wr_be,       // must be 4'b1111 (checked below)
    // GS-side register write port: 8-byte-aligned offset, 64-bit data.
    // All three outputs are registered.
    output logic        gs_reg_wr_en,
    output logic [15:0] gs_reg_wr_addr,
    output logic [63:0] gs_reg_wr_data
);

    // One 64-bit shadow, tagged with offset[15:3] of the 8-byte slot
    // it currently tracks. shadow_valid is low from reset until the
    // first accepted write.
    logic [63:0] shadow;
    logic [12:0] shadow_tag;
    logic        shadow_valid;

    // Decode of the incoming write: which slot, and which half of it.
    logic [12:0] cur_tag;
    logic        cur_is_high;
    assign cur_tag     = ee_wr_addr[15:3];
    assign cur_is_high = ee_wr_addr[2];

    // Value the shadow takes if this write is accepted. Start from
    // the existing shadow when the write continues the tracked slot,
    // otherwise from zero; then drop the incoming word into the half
    // selected by address bit 2. The other half keeps the base value.
    logic [63:0] new_shadow;
    always_comb begin
        new_shadow = (shadow_valid && shadow_tag == cur_tag) ? shadow
                                                            : 64'd0;
        if (cur_is_high)
            new_shadow[63:32] = ee_wr_data;
        else
            new_shadow[31:0]  = ee_wr_data;
    end

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            shadow         <= 64'd0;
            shadow_tag     <= 13'd0;
            shadow_valid   <= 1'b0;
            gs_reg_wr_en   <= 1'b0;
            gs_reg_wr_addr <= 16'd0;
            gs_reg_wr_data <= 64'd0;
        end else if (ee_wr_en) begin
            // Full-word writes only: report any other byte-enable
            // pattern. The write is still committed as if it were a
            // full word.
            if (ee_wr_be !== 4'b1111) begin
                $error("ee_gs_priv_bridge_stub: ee_wr_be=%b — only 4'b1111 supported (full-word writes); offset=0x%04x data=0x%08h",
                       ee_wr_be, ee_wr_addr, ee_wr_data);
            end
            // Every accepted half-write both updates the shadow and
            // forwards the merged 64-bit value to the GS port.
            gs_reg_wr_en   <= 1'b1;
            gs_reg_wr_addr <= {cur_tag, 3'b000};
            gs_reg_wr_data <= new_shadow;
            shadow         <= new_shadow;
            shadow_tag     <= cur_tag;
            shadow_valid   <= 1'b1;
        end else begin
            // No write this cycle: drop the strobe. Address, data and
            // shadow hold their values.
            gs_reg_wr_en <= 1'b0;
        end
    end

endmodule : ee_gs_priv_bridge_stub
+663
View File
@@ -0,0 +1,663 @@
// retroDE_ps2 — gif_image_xfer_stub (Ch110)
//
// Host→local image-transfer engine. On a TRXDIR write that arms a
// host→local upload (XDIR == 0), the engine snapshots the
// already-latched BITBLTBUF / TRXPOS / TRXREG fields and consumes
// IMAGE-mode quadwords from gif_packed_stub, unpacking them into
// per-pixel VRAM writes at the destination region defined by
// (DBP, DBW, DPSM, DSAX, DSAY, RRW, RRH).
//
// Scope (after Ch139):
// - PSMCT32 (DPSM == 6'h00): 4 bytes/pixel, 4 pixels/qword,
// row_stride = DBW * 256, write_be = 4'b1111, mask=0xFFFFFFFF.
// - PSMCT16 (DPSM == 6'h02): 2 bytes/pixel, 8 pixels/qword,
// row_stride = DBW * 128, write_be = 4'b0011, mask=0xFFFFFFFF.
// - PSMT8 (DPSM == 6'h13): 1 byte/pixel (an 8-bit CLUT index),
// 16 pixels/qword, row_stride = DBW * 64, write_be = 4'b0001,
// mask = 0xFFFFFFFF.
// - PSMT4 (DPSM == 6'h14): 0.5 bytes/pixel (a 4-bit CLUT index),
// 32 pixels/qword (2 px/byte × 16 bytes), row_stride = DBW * 32,
// write_be = 4'b0001 with a per-emit nibble mask: 0x0000_000F
// for the LOW nibble of the byte (when (DSAX+x) is even) or
// 0x0000_00F0 for the HIGH nibble (when (DSAX+x) is odd). The
// 4-bit index sits at the matching nibble position in
// write_data[7:0]; vram_stub's per-bit merge commits exactly
// that nibble — the OTHER nibble of the same byte is preserved.
// Back-to-back emits to the same byte (e.g. x=0 + x=1 of the
// same row) chain through NBA semantics without bypass logic
// (same trick the raster channel uses since Ch106).
// - Other PSMs (PSMCT24/PSMZ-*): the engine still consumes
// IMAGE qwords (so gif_packed_stub doesn't desync) but emits
// zero VRAM writes. Lane cadence falls back to PSMCT32
// (4 lanes/qword).
// - Addressing: linear by DEFAULT — the destination address
// math is
// dest_base = DBP * 256
// row_stride = DBW * 64 * bpp
// addr(x, y) = dest_base + (DSAY + y) * row_stride
// + (DSAX + x) * bpp
// Four OPTIONAL per-PSM swizzle paths gated by parameters:
// `PSMCT32_SWIZZLE=1` (Ch121) routes PSMCT32 uploads through
// gs_swizzle_psmct32_stub; `PSMCT16_SWIZZLE=1` (Ch127) routes
// PSMCT16 uploads through gs_swizzle_psmct16_stub;
// `PSMT8_SWIZZLE=1` (Ch133) routes PSMT8 uploads through
// gs_swizzle_psmt8_stub (page=128×64 px, bw_pg=DBW>>1 — DBW
// must be even for PSMT8); `PSMT4_SWIZZLE=1` (Ch139) routes
// PSMT4 uploads through gs_swizzle_psmt4_stub (page=128×128
// px, bw_pg=DBW>>1 — DBW must be even for PSMT4 too; module
// also outputs nibble_hi selector since PSMT4 packs 2 pixels
// per byte). In all four cases the per-pixel byte address is
// `dest_base + swizzle(FBP=0, FBW=DBW, x=DSAX+cur_x,
// y=DSAY+cur_y)`. The PSMT4 path additionally uses the
// swizzle's `nibble_hi` output (instead of the linear
// formula's x_eff[0]) to pick which nibble of the byte gets
// the upload's 4-bit pixel — the existing Ch118 nibble RMW
// write-mask machinery (write_be=4'b0001, write_mask=
// 0x0F or 0xF0) layers on top of the swizzled byte address.
// The four parameters are independent. All four parameter
// defaults are 0 → legacy linear behavior.
// - One pending qword buffer + a 5-bit lane counter (0..3 for
// PSMCT32, 0..7 for PSMCT16, 0..15 for PSMT8, 0..31 for
// PSMT4; the last-lane index is snapshotted at TRXDIR-arm
// time per `lane_last_q`).
// Backpressure to the upstream is exposed via `data_ready`.
// Wired into `gif_packed_stub.image_data_ready` (Ch110), so
// the GIF gates `in_ready` only in S_IMAGE state with FLG=2;
// the DMAC's ep_ready follows gif_in_ready directly. Outside
// S_IMAGE the gate is a no-op.
//
// Wiring contract (TB-level):
// trxdir_wr_pulse ← gs_stub.trxdir_wr_q
// trxdir ← gs_stub.trxdir_q
// bitbltbuf ← gs_stub.bitbltbuf_q
// trxpos ← gs_stub.trxpos_q
// trxreg ← gs_stub.trxreg_q
// data_valid ← gif_packed_stub.image_data_valid
// data_qword ← gif_packed_stub.image_data
// data_last ← gif_packed_stub.image_data_last
// data_ready → gif_packed_stub.image_data_ready (Ch110).
// The GIF FSM uses it to gate in_ready only in
// S_IMAGE+FLG=2; dmac.ep_ready follows
// gif.in_ready directly (no TB-level AND).
// vram_we / waddr / wdata / wbe / wmask → muxed into vram_stub's
// write port (the TB selects between the engine, the raster
// channel, and any TB-direct path).
// busy → high while a transfer is active (between trxdir_wr arm
// and the last lane emit). TB uses this for the vram_stub
// write-port mux.
//
// What this stub does NOT do:
// - Source-direction (local→host or local→local) transfers.
// - PSMCT24 / PSMZ-* image transfers (not currently exercised
// in the demo flow).
// - Mid-transfer TRXDIR re-arm or interleaving with REGLIST.
// - HWREG-side legacy/non-PSM-aware swizzle (out of scope —
// PSMCT32 since Ch121, PSMCT16 since Ch127, PSMT8 since
// Ch133, PSMT4 since Ch139 all support the canonical PCSX2
// swizzle behind their respective parameter gates).
// - HWREG via privileged-MMIO (the real PS2 path that reads
// pixel data through the privileged HWREG register at
// 0x12001000); IMAGE-mode GIF qwords are the only data
// source modelled here.
`timescale 1ns/1ps
module gif_image_xfer_stub
import trace_pkg::*;
#(
// Ch121 — when set, PSMCT32 uploads compute the per-pixel VRAM
// byte address via the real PS2 GS page/block swizzle
// (gs_swizzle_psmct32_stub) instead of the legacy linear formula
// `dest_base + (DSAY+y)*row_stride + (DSAX+x)*4`. Other PSMs
// are not affected by this parameter — PSMCT16 has its own
// gate (PSMCT16_SWIZZLE, Ch127), PSMT8 has PSMT8_SWIZZLE
// (Ch133), PSMT4 has PSMT4_SWIZZLE (Ch139, see below).
// Default 0 keeps every existing PSMCT32 image-xfer TB on
// the original linear addressing — its expectations don't
// change.
parameter bit PSMCT32_SWIZZLE = 1'b0,
// Ch127 — when set, PSMCT16 uploads compute the per-pixel VRAM
// byte address via the canonical PS2 GS page/block/column
// swizzle (gs_swizzle_psmct16_stub) instead of the legacy
// linear formula `dest_base + (DSAY+y)*row_stride +
// (DSAX+x)*2`. PSMCT32 / PSMT8 / PSMT4 are governed by their
// own gates (PSMCT32_SWIZZLE / PSMT8_SWIZZLE / PSMT4_SWIZZLE).
// Default 0 keeps every existing PSMCT16 image-xfer
// TB on the legacy linear path. Mirrors the Ch126 PCRTC
// read-side wiring at the upload write side, completing
// the second integration point for the Ch125 PSMCT16
// primitive.
parameter bit PSMCT16_SWIZZLE = 1'b0,
// Ch133 — when set, PSMT8 uploads compute the per-pixel VRAM
// byte address via the canonical PS2 GS page/block/column
// swizzle (gs_swizzle_psmt8_stub) instead of the legacy
// linear formula `dest_base + (DSAY+y)*row_stride +
// (DSAX+x)*1`. PSMT8 pages are 128 px wide so the swizzle
// internally uses `bw_pg = DBW >> 1` — PCSX2 asserts DBW must
// be even for PSMT8 at GSLocalMemory.h:553. PSMCT32 / PSMCT16
// / PSMT4 are governed by their own gates.
// Default 0 keeps every existing PSMT8 image-xfer TB
// (Ch117 PSMT8, Ch107 PSMT4-via-CT16-CLUT palette path) on
// the legacy linear addressing. Mirrors the Ch132 PCRTC
// read-side wiring at the upload write side, completing
// the second integration point for the Ch131 PSMT8 primitive.
parameter bit PSMT8_SWIZZLE = 1'b0,
// Ch139 — when set, PSMT4 uploads compute the per-pixel VRAM
// byte address via the canonical PS2 GS page/block/column
// swizzle (gs_swizzle_psmt4_stub) instead of the legacy
// linear formula `dest_base + (DSAY+y)*row_stride +
// (DSAX+x)*0.5`. PSMT4 pages are 128 px wide AND 128 px tall;
// the swizzle internally uses `bw_pg = DBW >> 1` — PCSX2
// asserts DBW must be even for PSMT4 at GSLocalMemory.h:560.
// The PSMT4 swizzle module also outputs a `nibble_hi`
// selector that picks which nibble of the byte at the
// swizzled address holds this pixel — the linear formula's
// x_eff[0] selector is wrong under the swizzled layout
// because the canonical PCSX2 column table reorders nibbles
// within a block. The existing Ch118 nibble RMW machinery
// (write_be=4'b0001 + write_mask 0x0F or 0xF0) layers on top
// of the swizzled byte address: the mask is selected by the
// swizzle's nibble_hi when this gate is on, instead of by
// x_eff[0]. PSMCT32 / PSMCT16 / PSMT8 are governed by their
// own gates. Default 0 keeps every existing PSMT4 image-xfer
// TB (Ch118 PSMT4, Ch107 PSMT4-e2e palette path) on the
// legacy linear addressing. Mirrors the Ch138 PCRTC
// read-side wiring at the upload write side, completing the
// second integration point for the Ch137 PSMT4 primitive.
parameter bit PSMT4_SWIZZLE = 1'b0
)(
input logic clk,
input logic rst_n,
// Arm input — pulses for one cycle on TRXDIR commit.
input logic trxdir_wr_pulse,
input logic [63:0] trxdir,
input logic [63:0] bitbltbuf,
input logic [63:0] trxpos,
input logic [63:0] trxreg,
// IMAGE qword stream from gif_packed_stub.
input logic data_valid,
input logic [127:0] data_qword,
input logic data_last,
output logic data_ready,
// VRAM write port. PSM-aware be + per-bit merge mask:
// PSMCT32 (Ch110): be = 4'b1111, mask = 0xFFFFFFFF.
// PSMCT16 (Ch116): be = 4'b0011, mask = 0xFFFFFFFF.
// PSMT8 (Ch117): be = 4'b0001, mask = 0xFFFFFFFF.
// PSMT4 (Ch118): be = 4'b0001, mask = 0x0000_000F (low
// nibble) or 0x0000_00F0 (high nibble),
// keyed by (DSAX+x)[0]. The 4-bit index
// sits at the matching nibble position in
// write_data[7:0]; vram_stub merges only
// the targeted nibble.
output logic vram_we,
output logic [31:0] vram_waddr,
output logic [31:0] vram_wdata,
output logic [3:0] vram_wbe,
output logic [31:0] vram_wmask,
// Engine status.
output logic busy
);
// BITBLTBUF field decode (real PS2 layout, per PCSX2 GSRegs.h):
// [13:0] SBP
// [21:16] SBW
// [29:24] SPSM
// [45:32] DBP
// [53:48] DBW
// [61:56] DPSM
logic [13:0] dbp;
logic [5:0] dbw;
logic [5:0] dpsm;
assign dbp = bitbltbuf[45:32];
assign dbw = bitbltbuf[53:48];
assign dpsm = bitbltbuf[61:56];
// TRXPOS field decode:
// [10:0] SSAX
// [26:16] SSAY
// [42:32] DSAX
// [58:48] DSAY
// [60:59] DIR
logic [10:0] dsax;
logic [10:0] dsay;
assign dsax = trxpos[42:32];
assign dsay = trxpos[58:48];
// TRXREG field decode:
// [11:0] RRW
// [43:32] RRH
logic [11:0] rrw;
logic [11:0] rrh;
assign rrw = trxreg[11:0];
assign rrh = trxreg[43:32];
// TRXDIR field decode (XDIR is bits [1:0]).
logic [1:0] xdir;
assign xdir = trxdir[1:0];
// Snapshotted transfer parameters (latched at trxdir_wr arm).
logic [13:0] dbp_q;
logic [5:0] dbw_q;
logic [5:0] dpsm_q;
logic [10:0] dsax_q;
logic [10:0] dsay_q;
logic [11:0] rrw_q;
logic [11:0] rrh_q;
logic [31:0] dest_base_q; // DBP * 256 (bytes)
logic [31:0] row_stride_q; // DBW * 64 * bpp
logic psmct32_q; // DPSM == 0x00 → 4 bytes/pixel
logic psmct16_q; // DPSM == 0x02 → 2 bytes/pixel (Ch116)
logic psmt8_q; // DPSM == 0x13 → 1 byte/pixel (Ch117)
logic psmt4_q; // DPSM == 0x14 → 0.5 byte/pixel (Ch118)
// Last-lane index for the current PSM (3 for PSMCT32 → 4
// lanes, 7 for PSMCT16 → 8 lanes, 15 for PSMT8 → 16 lanes,
// 31 for PSMT4 → 32 lanes).
// Other PSMs use the PSMCT32 cadence (3) for silent consume.
logic [4:0] lane_last_q;
// Per-emit progression: which qword (0..NLOOP-1) and which
// lane within the qword (0..3 for PSMCT32, 0..7 for PSMCT16,
// 0..15 for PSMT8, 0..31 for PSMT4).
logic [127:0] qword_q;
logic [4:0] lane_q; // widened to 5 bits for PSMT4
logic lane_valid_q; // a buffered qword is being drained
// Pixel cursor (cur_x, cur_y) within the destination rect,
// measured from (DSAX, DSAY). Wrap at RRW.
logic [11:0] cur_x_q;
logic [11:0] cur_y_q;
logic [23:0] pix_total_q; // RRW * RRH (cap 16M)
logic [23:0] pix_done_q;
// FSM.
typedef enum logic [1:0] {
S_IDLE = 2'd0,
S_RUN = 2'd1
} state_e;
state_e state;
assign busy = (state == S_RUN);
// The engine is "ready" for a new qword when no qword is
// currently being drained. In S_IDLE we admit qwords too —
// upstream image_data_valid won't pulse outside an active
// S_IMAGE state, so this is benign.
assign data_ready = !lane_valid_q;
// Combinational pixel address for the in-flight lane.
// PSMCT32: addr = dest_base + (DSAY+cur_y) * row_stride
// + (DSAX+cur_x) * 4
// PSMCT16: addr = dest_base + (DSAY+cur_y) * row_stride
// + (DSAX+cur_x) * 2
// PSMT8 : addr = dest_base + (DSAY+cur_y) * row_stride
// + (DSAX+cur_x) * 1
// PSMT4 : addr = dest_base + (DSAY+cur_y) * row_stride
// + ((DSAX+cur_x) >> 1)
// nibble = (DSAX+cur_x)[0] high vs low
// (row_stride already encodes the bpp factor.)
logic [31:0] cur_addr_c;
logic [31:0] cur_data_c;
logic [3:0] cur_be_c;
logic [31:0] cur_mask_c;
always_comb begin
logic [31:0] x_off;
logic [11:0] x_eff;
logic [3:0] t4_nibble;
x_eff = dsax_q + cur_x_q;
if (psmt4_q)
x_off = {21'd0, x_eff[11:1]}; // (x_eff >> 1)
else if (psmt8_q)
x_off = ({20'd0, dsax_q} + {20'd0, cur_x_q});
else if (psmct16_q)
x_off = ({20'd0, dsax_q} + {20'd0, cur_x_q}) * 32'd2;
else
x_off = ({20'd0, dsax_q} + {20'd0, cur_x_q}) * 32'd4;
cur_addr_c = dest_base_q
+ (32'(dsay_q) + 32'(cur_y_q)) * row_stride_q
+ x_off;
// PSMT4: extract the 4-bit nibble at lane_q from qword_q.
// qword[lane*4 +: 4] for lane in 0..31. iverilog 12 supports
// indexed part-select with variable base + constant width.
t4_nibble = qword_q[(5'(lane_q) * 4) +: 4];
if (psmt4_q) begin
// 32 PSMT4 pixels per qword (2 px/byte × 16 bytes).
// Place the 4-bit index at the matching nibble position
// in write_data[7:0] keyed by the nibble selector.
// Linear (PSMT4_SWIZZLE=0): x_eff[0] is the selector
// (low nibble = even pixel, high = odd pixel).
// Swizzled (Ch139, PSMT4_SWIZZLE=1): the swizzle module
// outputs `nibble_hi` directly — required because the
// canonical PCSX2 columnTable4 reorders nibbles within
// a block, so x_eff[0] is no longer correct. write_be
// is 4'b0001 (single-byte commit) and write_mask
// gates the targeted nibble; vram_stub merges only
// that nibble, preserving the OTHER nibble of the
// same byte.
logic psmt4_nibble_select;
psmt4_nibble_select = PSMT4_SWIZZLE ? swizzle4_nibble_hi
: x_eff[0];
if (psmt4_nibble_select) begin
cur_data_c = {24'd0, t4_nibble, 4'd0}; // high nibble
cur_mask_c = 32'h0000_00F0;
end else begin
cur_data_c = {24'd0, 4'd0, t4_nibble}; // low nibble
cur_mask_c = 32'h0000_000F;
end
cur_be_c = 4'b0001;
end else if (psmt8_q) begin
// 16 PSMT8 pixels per qword. Place the 8-bit index in
// the LOW byte of write_data; vram_stub's per-byte BE
// commits exactly 1 byte at the exact pixel address
// (write_addr = cur_addr_c) at any byte alignment.
cur_mask_c = 32'hFFFF_FFFF;
unique case (lane_q[3:0])
4'd0: cur_data_c = {24'd0, qword_q[ 7: 0]};
4'd1: cur_data_c = {24'd0, qword_q[ 15: 8]};
4'd2: cur_data_c = {24'd0, qword_q[ 23: 16]};
4'd3: cur_data_c = {24'd0, qword_q[ 31: 24]};
4'd4: cur_data_c = {24'd0, qword_q[ 39: 32]};
4'd5: cur_data_c = {24'd0, qword_q[ 47: 40]};
4'd6: cur_data_c = {24'd0, qword_q[ 55: 48]};
4'd7: cur_data_c = {24'd0, qword_q[ 63: 56]};
4'd8: cur_data_c = {24'd0, qword_q[ 71: 64]};
4'd9: cur_data_c = {24'd0, qword_q[ 79: 72]};
4'd10: cur_data_c = {24'd0, qword_q[ 87: 80]};
4'd11: cur_data_c = {24'd0, qword_q[ 95: 88]};
4'd12: cur_data_c = {24'd0, qword_q[103: 96]};
4'd13: cur_data_c = {24'd0, qword_q[111:104]};
4'd14: cur_data_c = {24'd0, qword_q[119:112]};
default: cur_data_c = {24'd0, qword_q[127:120]};
endcase
cur_be_c = 4'b0001;
end else if (psmct16_q) begin
// 8 PSMCT16 pixels per qword. Place the 16-bit value
// in the LOW halfword of write_data; vram_stub's per-
// byte BE commits exactly 2 bytes at the 2-byte-
// aligned pixel address (write_addr = cur_addr_c).
cur_mask_c = 32'hFFFF_FFFF;
unique case (lane_q[2:0])
3'd0: cur_data_c = {16'd0, qword_q[ 15: 0]};
3'd1: cur_data_c = {16'd0, qword_q[ 31: 16]};
3'd2: cur_data_c = {16'd0, qword_q[ 47: 32]};
3'd3: cur_data_c = {16'd0, qword_q[ 63: 48]};
3'd4: cur_data_c = {16'd0, qword_q[ 79: 64]};
3'd5: cur_data_c = {16'd0, qword_q[ 95: 80]};
3'd6: cur_data_c = {16'd0, qword_q[111: 96]};
default: cur_data_c = {16'd0, qword_q[127:112]};
endcase
cur_be_c = 4'b0011;
end else begin
// PSMCT32: 4 pixels per qword, full 32-bit.
cur_mask_c = 32'hFFFF_FFFF;
unique case (lane_q[1:0])
2'd0: cur_data_c = qword_q[ 31: 0];
2'd1: cur_data_c = qword_q[ 63: 32];
2'd2: cur_data_c = qword_q[ 95: 64];
default: cur_data_c = qword_q[127: 96];
endcase
cur_be_c = 4'b1111;
end
end
// Ch121 — optional PSMCT32 swizzled write address.
//
// When PSMCT32_SWIZZLE=1 AND the active PSM is PSMCT32, route
// the per-pixel byte address through gs_swizzle_psmct32_stub
// instead of the linear formula. The swizzle module gives
// a within-FB byte offset relative to FBP=0; we add dest_base_q
// (= DBP*256) to anchor the upload at the same DBP-relative
// base the linear path uses. dbw_q feeds the swizzle's FBW
// input directly (both are in 64-pixel units, matching the
// PSMCT32 page = 64 px wide convention). The per-pixel x and
// y inputs are the FULL effective coordinates (DSAX+cur_x,
// DSAY+cur_y), so the swizzle correctly handles non-zero
// DSAX/DSAY uploads as well.
//
// Other PSMs are governed by their own dispatch branches in
// the per-PSM mux below (PSMCT16 via PSMCT16_SWIZZLE Ch127,
// PSMT8 via PSMT8_SWIZZLE Ch133, PSMT4 via PSMT4_SWIZZLE
// Ch139). With PSMCT32_SWIZZLE=0 the PSMCT32 path falls
// through to cur_addr_c. The swizzle module is purely
// combinational; when its gate is off its output is unused
// and the synthesizer trims it.
logic [31:0] cur_addr_swizzled_c;
logic [11:0] swizzle_x_in;
logic [11:0] swizzle_y_in;
assign swizzle_x_in = dsax_q + cur_x_q;
assign swizzle_y_in = dsay_q + cur_y_q;
logic [31:0] swizzle_addr_off;
gs_swizzle_psmct32_stub u_swizzle (
.fbp (9'd0),
.fbw (dbw_q),
.x (swizzle_x_in),
.y (swizzle_y_in),
.addr(swizzle_addr_off)
);
assign cur_addr_swizzled_c = dest_base_q + swizzle_addr_off;
// Ch127 — optional PSMCT16 swizzled write address. Same shape
// as Ch121 above but uses gs_swizzle_psmct16_stub. The PSMCT16
// page (64×64) and block grid (4 cols × 8 rows of 16×8 blocks)
// and within-block columnTable16 are all baked into that
// module — we just feed it `dbw_q` as FBW and the full
// effective coords. dest_base_q (= DBP*256) is added on top
// so any DBP works; the swizzle module is given FBP=0 so its
// output is the within-FB byte offset only.
logic [31:0] cur_addr_swizzled16_c;
logic [31:0] swizzle16_addr_off;
gs_swizzle_psmct16_stub u_swizzle16 (
.fbp (9'd0),
.fbw (dbw_q),
.x (swizzle_x_in),
.y (swizzle_y_in),
.addr(swizzle16_addr_off)
);
assign cur_addr_swizzled16_c = dest_base_q + swizzle16_addr_off;
// Ch133 — optional PSMT8 swizzled write address. Same shape as
// Ch121 / Ch127 above but uses gs_swizzle_psmt8_stub. PSMT8
// pages are 128 px wide so the swizzle internally uses
// bw_pg = DBW>>1 (PCSX2 asserts DBW must be even for PSMT8).
// dest_base_q (= DBP*256) is added on top so any DBP works;
// the swizzle module is given FBP=0 so its output is the
// within-FB byte offset only.
logic [31:0] cur_addr_swizzled8_c;
logic [31:0] swizzle8_addr_off;
gs_swizzle_psmt8_stub u_swizzle8 (
.fbp (9'd0),
.fbw (dbw_q),
.x (swizzle_x_in),
.y (swizzle_y_in),
.addr(swizzle8_addr_off)
);
assign cur_addr_swizzled8_c = dest_base_q + swizzle8_addr_off;
// Ch139 — optional PSMT4 swizzled write address. Same wiring
// shape as Ch121/Ch127/Ch133 but uses gs_swizzle_psmt4_stub,
// which outputs both an absolute byte address AND a
// `nibble_hi` selector. PSMT4 pages are 128 px wide AND tall;
// the swizzle internally uses bw_pg=DBW>>1 (PCSX2 asserts
// DBW must be even for PSMT4). dest_base_q (= DBP*256) is
// added on top so any DBP works; the swizzle module is given
// FBP=0 so its addr output is the within-FB byte offset only.
// The nibble_hi output threads into the PSMT4 data lane mux
// below: when this gate is on AND psmt4_q, the existing Ch118
// nibble RMW machinery (write_be=4'b0001, write_mask 0x0F or
// 0xF0) keys on the swizzle's nibble_hi instead of x_eff[0].
logic [31:0] cur_addr_swizzled4_c;
logic [31:0] swizzle4_addr_off;
logic swizzle4_nibble_hi;
gs_swizzle_psmt4_stub u_swizzle4 (
.fbp (9'd0),
.fbw (dbw_q),
.x (swizzle_x_in),
.y (swizzle_y_in),
.addr (swizzle4_addr_off),
.nibble_hi(swizzle4_nibble_hi)
);
assign cur_addr_swizzled4_c = dest_base_q + swizzle4_addr_off;
// VRAM write outputs — pulse for one cycle per pixel emit.
// Only fire when DPSM is supported (PSMCT32, PSMCT16, PSMT8,
// or PSMT4). Other PSMs still consume qwords lane-by-lane to
// keep gif_packed_stub from desync, but no VRAM write happens.
logic emit_now;
assign emit_now = lane_valid_q &&
(psmct32_q || psmct16_q || psmt8_q || psmt4_q);
// Per-PSM swizzle dispatch. The four parameters are
// independent; defaults of 0 keep every PSM on the legacy
// linear path.
assign vram_we = emit_now;
assign vram_waddr = (PSMCT32_SWIZZLE && psmct32_q) ? cur_addr_swizzled_c :
(PSMCT16_SWIZZLE && psmct16_q) ? cur_addr_swizzled16_c :
(PSMT8_SWIZZLE && psmt8_q) ? cur_addr_swizzled8_c :
(PSMT4_SWIZZLE && psmt4_q) ? cur_addr_swizzled4_c :
cur_addr_c;
assign vram_wdata = cur_data_c;
assign vram_wbe = cur_be_c;
assign vram_wmask = cur_mask_c;
// Compute target pixel count = RRW * RRH (24-bit cap is fine
// for any palette/texture upload we model here).
logic [23:0] pix_total_calc;
assign pix_total_calc = {12'd0, rrw} * {12'd0, rrh};
// Step / wrap logic (on the cycle a pixel emits).
logic [11:0] next_x;
logic [11:0] next_y;
logic wrap_row;
assign wrap_row = (cur_x_q + 12'd1 == rrw_q);
assign next_x = wrap_row ? 12'd0 : (cur_x_q + 12'd1);
assign next_y = wrap_row ? (cur_y_q + 12'd1) : cur_y_q;
always_ff @(posedge clk) begin
if (!rst_n) begin
state <= S_IDLE;
dbp_q <= 14'd0;
dbw_q <= 6'd0;
dpsm_q <= 6'd0;
dsax_q <= 11'd0;
dsay_q <= 11'd0;
rrw_q <= 12'd0;
rrh_q <= 12'd0;
dest_base_q <= 32'd0;
row_stride_q <= 32'd0;
psmct32_q <= 1'b0;
psmct16_q <= 1'b0;
psmt8_q <= 1'b0;
psmt4_q <= 1'b0;
lane_last_q <= 5'd3;
qword_q <= 128'd0;
lane_q <= 5'd0;
lane_valid_q <= 1'b0;
cur_x_q <= 12'd0;
cur_y_q <= 12'd0;
pix_total_q <= 24'd0;
pix_done_q <= 24'd0;
end else begin
unique case (state)
S_IDLE: begin
if (trxdir_wr_pulse && (xdir == 2'd0)) begin
logic is_ct32, is_ct16, is_t8, is_t4;
is_ct32 = (dpsm == 6'h00);
is_ct16 = (dpsm == 6'h02);
is_t8 = (dpsm == 6'h13);
is_t4 = (dpsm == 6'h14);
// Snapshot all transfer params.
dbp_q <= dbp;
dbw_q <= dbw;
dpsm_q <= dpsm;
dsax_q <= dsax;
dsay_q <= dsay;
rrw_q <= rrw;
rrh_q <= rrh;
dest_base_q <= {18'd0, dbp} << 8;
// row_stride = DBW * 64 * bpp:
// PSMCT32 → DBW * 256 (DBW << 8)
// PSMCT16 → DBW * 128 (DBW << 7)
// PSMT8 → DBW * 64 (DBW << 6)
// PSMT4 → DBW * 32 (DBW << 5)
// other → fall back to PSMCT32-stride
// (no VRAM emit anyway)
row_stride_q <= is_t4 ? ({18'd0, dbw} << 5)
: is_t8 ? ({18'd0, dbw} << 6)
: is_ct16 ? ({18'd0, dbw} << 7)
: ({18'd0, dbw} << 8);
psmct32_q <= is_ct32;
psmct16_q <= is_ct16;
psmt8_q <= is_t8;
psmt4_q <= is_t4;
// Lanes/qword: 4 (PSMCT32) → last=3,
// 8 (PSMCT16) → last=7, 16 (PSMT8) → last=15,
// 32 (PSMT4) → last=31. Other PSMs use the
// PSMCT32 cadence (silent consume).
lane_last_q <= is_t4 ? 5'd31
: is_t8 ? 5'd15
: is_ct16 ? 5'd7
: 5'd3;
cur_x_q <= 12'd0;
cur_y_q <= 12'd0;
pix_total_q <= pix_total_calc;
pix_done_q <= 24'd0;
lane_valid_q <= 1'b0;
state <= S_RUN;
end
end
S_RUN: begin
if (!lane_valid_q && data_valid && data_ready) begin
// Latch a fresh qword to drain.
qword_q <= data_qword;
lane_q <= 5'd0;
lane_valid_q <= 1'b1;
end else if (lane_valid_q) begin
// Drain one lane per cycle. Step the cursor
// when the emit fires (supported PSM or not —
// for unsupported PSM, no VRAM write fires
// but we still consume the lane to keep
// gif_packed_stub from desync).
if (lane_q == lane_last_q) begin
lane_valid_q <= 1'b0;
lane_q <= 5'd0;
end else begin
lane_q <= lane_q + 5'd1;
end
// Pixel-cursor + done-count step.
cur_x_q <= next_x;
cur_y_q <= next_y;
pix_done_q <= pix_done_q + 24'd1;
// Did this lane emit complete the rect?
if (pix_done_q + 24'd1 >= pix_total_q) begin
// End of transfer. Drop any remaining
// unused lanes — for PSMCT32 the rect
// size should be a multiple of 4 px,
// for PSMCT16 a multiple of 8 px, for
// PSMT8 a multiple of 16 px, for PSMT4
// a multiple of 32 px (else the extra
// trailing lanes within the last qword
// are silently swallowed).
// Return to IDLE on the same cycle the
// last lane emits.
state <= S_IDLE;
lane_valid_q <= 1'b0;
lane_q <= 5'd0;
end
end
end
default: state <= S_IDLE;
endcase
end
end
endmodule : gif_image_xfer_stub
+428
View File
@@ -0,0 +1,428 @@
// retroDE_ps2 — gif_packed_stub (Ch72 + Ch73)
//
// Real-format GIF parser. Sits in the same upstream/downstream slot
// as gif_path_stub but accepts a real PS2 GIFtag in front of the
// data, instead of the project-local single-qword register-write
// format that gif_path_stub uses for Wave 2.
//
// Scope:
// - PACKED (FLG=0): NLOOP×NREG PACKED entries, one entry per qword.
// A+D (REGS nibble 0xE) emits a GS register write; other nibbles
// are traced EV_MODE no-ops. (Ch72.)
// - REGLIST (FLG=1): NLOOP×NREG REGLIST entries, two entries per
// qword (low 64 = entry #0, high 64 = entry #1). REGLIST data
// bytes are register values keyed by REGS nibbles in order; only
// A+D (0xE) gets a GS write here, since real REGLIST treats each
// nibble as the register *number* (not A+D), and we don't yet
// have a reg# → gs-offset map. Other nibbles consume the entry
// and trace EV_MODE. (Ch73.)
// - FLG=2 (IMAGE) and FLG=3 (DISABLE): payload is NLOOP qwords,
// consumed silently with EV_MODE traces, no GS effect. (Ch73 —
// prevents the "next data qword is mistaken for a new GIFtag"
// desync flagged in the Ch73 audit.)
// - NLOOP up to 15 bits, NREG up to 16 registers. PS2 docs: a
// NREG field of 0 means 16; we use a 5-bit effective count to
// represent 16 correctly (Ch73 audit-medium fix — was clamped
// to 4'd15, which mis-counted PACKED 16-reg packets by one).
// - EOP carries no behavioral difference here (always-ready
// sink); preserved as trace metadata.
//
// PACKED A+D data qword layout — selectable via REAL_AD_REG_MAP:
//
// REAL_AD_REG_MAP=0 (default, project-local Ch72/Ch73 back-compat):
// bits[ 63: 0] = 64-bit register data
// bits[ 79: 64] = 16-bit project-local GS privileged offset
// (drives gs_stub.reg_wr_*)
// bits[127: 80] = reserved
//
// REAL_AD_REG_MAP=1 (real PS2 layout, Ch75):
// bits[ 63: 0] = 64-bit register data
// bits[ 71: 64] = 8-bit GIF A+D register number per PCSX2 GSRegs.h
// (drives gs_stub.gif_reg_*; gs_stub owns the
// decode into PRIM/RGBAQ/XYZF2/XYZ2/FRAME_1/ZBUF_1)
// bits[127: 72] = reserved
//
// The two namespaces are architecturally distinct. Do NOT add a
// reg# → privileged-offset LUT here — that conflation is the Ch74
// mistake Ch75 corrected. New GIF-context registers belong inside
// gs_stub, keyed by reg#.
//
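// in_ready is NOT unconditionally high in this module: it follows
// image_data_ready while a FLG=2 IMAGE payload is in flight and
// !raster_fifo_full otherwise. The GS write outputs are registered
// one-cycle pulses, the same one-shot style gif_path_stub uses.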
//
// Trace schema:
// On tag accept: EV_GIFTAG arg0={flg,path_id} arg1={eop,flg,nreg,nloop[8:0]}
// arg2=regs_64 arg3=0 flags={in_last,1}
// On PACKED data (A+D): EV_WRITE arg0=path_id arg1=regnib
// arg2={offset16} arg3=data64
// flags={in_last,0}
// On PACKED data (other): EV_MODE same layout, no GS write.
// On REGLIST qword: EV_MODE arg0=path_id arg1=regnib (always REGS nibble 0; low-half entry only)
// arg2=0 arg3=entry64
// flags={in_last,0}
// On IMAGE/DISABLE qword: EV_MODE arg0={flg,path_id}
// arg2=0 arg3=in_data[63:0]
// flags={in_last,0}
`timescale 1ns/1ps
module gif_packed_stub
    import trace_pkg::*;
#(
    parameter logic [3:0] PATH_ID = 4'd2,
    // Ch75 (was Ch74, corrected): switch the PACKED A+D address-source
    // from the project-local 16-bit-offset layout (default,
    // bits[79:64]=gs_offset) to the real PS2 A+D layout where
    // bits[71:64] is the 8-bit GS A+D register *number*. Per PCSX2's
    // GSRegs.h, the GIF A+D register namespace is distinct from the
    // GS privileged-MMIO offset namespace — Ch74's LUT mistakenly
    // mapped one to the other. The corrected design hands the 8-bit
    // reg# to gs_stub via its gif_reg_* port and lets gs_stub own the
    // GIF-context register file decode. When this parameter is 0, the
    // legacy gs_wr_* port (16-bit privileged-style offset) is driven
    // for back-compat with Ch72/Ch73 PACKED-A+D TBs and
    // tb_bgcolor_via_dma.
    parameter bit REAL_AD_REG_MAP = 1'b0
) (
    input  logic         clk,
    input  logic         rst_n,             // synchronous, active-low

    // Upstream from DMAC
    input  logic         in_valid,
    input  logic [127:0] in_data,
    input  logic         in_last,
    output logic         in_ready,

    // Downstream — legacy 16-bit-offset port (REAL_AD_REG_MAP=0).
    // Drives gs_stub's privileged-style reg_wr_* port.
    output logic         gs_wr_en,
    output logic [15:0]  gs_wr_addr,
    output logic [63:0]  gs_wr_data,

    // Ch110 — IMAGE-mode (FLG=2) data passthrough. `image_data_valid`
    // is high on every accepted FLG=2 IMAGE payload qword (state ==
    // S_IMAGE, latched FLG == 2, and the in_valid/in_ready handshake
    // completes). `image_data` is the raw 128-bit qword payload;
    // `image_data_last` mirrors the upstream `in_last`. FLG=3 DISABLE
    // payload qwords share S_IMAGE but are never forwarded. TBs that
    // don't model image transfers may leave these outputs unconnected
    // (named-port instantiation).
    output logic         image_data_valid,
    output logic [127:0] image_data,
    output logic         image_data_last,

    // Ch110 — backpressure from the IMAGE consumer. While a FLG=2
    // IMAGE payload is in flight, in_ready equals image_data_ready so
    // the upstream stalls while the consumer is busy. At all other
    // times this input has no effect and in_ready is governed by
    // raster_fifo_full instead. TBs that don't model image transfers
    // tie this to 1'b1.
    input  logic         image_data_ready,

    // Ch172 — backpressure from the raster command FIFO inside
    // gs_stub. Whenever a FLG=2 IMAGE payload is NOT in flight (tag
    // fetch, PACKED, REGLIST, and FLG=3 DISABLE payload qwords),
    // in_ready is deasserted while this input is high, so the upstream
    // pauses BEFORE the next qword is consumed. The stall point is the
    // qword-acceptance handshake — once a qword is accepted the parser
    // fully processes it; there is no "consumed but not committed"
    // state. During a FLG=2 IMAGE payload this input is ignored. TBs
    // that don't model the raster FIFO tie this to 1'b0.
    input  logic         raster_fifo_full,

    // Ch75 — real-PS2 GIF A+D register-number port (REAL_AD_REG_MAP=1).
    // Drives gs_stub's GIF-context gif_reg_* port. Only one of
    // {gs_wr_*, gif_reg_*} is active per accept depending on the
    // parameter.
    output logic         gif_reg_wr_en,
    output logic [7:0]   gif_reg_num,
    output logic [63:0]  gif_reg_data,

    // Trace
    output logic         ev_valid,
    output subsys_e      ev_subsys,
    output event_e       ev_event,
    output logic [63:0]  ev_arg0,
    output logic [63:0]  ev_arg1,
    output logic [63:0]  ev_arg2,
    output logic [63:0]  ev_arg3,
    output logic [31:0]  ev_flags
);

    // GIFtag parser FSM: S_TAG accepts one tag qword, then the payload
    // is consumed in S_PACKED (one entry per qword), S_REGLIST or
    // S_IMAGE (opaque qword countdown) before returning to S_TAG.

    // Ch73: state widened to cover REGLIST and IMAGE/DISABLE payloads
    // so unsupported FLG packets don't desync onto the next qword.
    typedef enum logic [1:0] {
        S_TAG     = 2'd0,
        S_PACKED  = 2'd1,
        S_REGLIST = 2'd2,
        S_IMAGE   = 2'd3   // also used for FLG=3 DISABLE
    } state_e;

    state_e state;

    // Latched tag context (valid in S_PACKED / S_REGLIST / S_IMAGE).
    // Declared ahead of the in_ready logic below because that logic
    // reads flg_q; keeping declarations before first use avoids
    // use-before-declaration errors in stricter tools.
    logic [14:0] nloop_q;
    logic        eop_q;      // latched from the tag; not read elsewhere in this module
    logic [1:0]  flg_q;
    // Ch73: nreg_eff widened to 5 bits. The NREG field is 4 bits
    // (0..15) and a field value of 0 means 16. Encoding 16 as 5'b10000
    // lets reg_idx + 1 == nreg_eff_q correctly terminate a 16-register
    // packet.
    logic [4:0]  nreg_eff_q;
    logic [63:0] regs_q;
    logic [4:0]  reg_idx;

    // Ch73: REGLIST and IMAGE/DISABLE consume opaque qwords. We track
    // how many qwords are still left in the payload, loaded when the
    // tag is accepted. PACKED keeps its per-entry reg_idx scheme.
    //   REGLIST count = ceil(NLOOP * NREG / 2)   (2 entries / qword)
    //   IMAGE   count = NLOOP                    (1 qword / loop)
    logic [19:0] payload_qwords_left;

    // Ch110 — image_active is true only while an actual FLG=2 IMAGE
    // payload is in flight. S_IMAGE is also reused for FLG=3 DISABLE
    // qwords (per the Ch73 desync fix); those are consume-only and
    // must neither be forwarded on image_data_valid nor be gated by
    // image_data_ready, hence the flg_q == 2 qualifier.
    logic image_active;
    assign image_active = (state == S_IMAGE) && (flg_q == 2'd2);

    // Ch172 — in_ready policy:
    //   1) FLG=2 IMAGE payload in flight → follow image_data_ready.
    //   2) Everything else (tag fetch, PACKED, REGLIST, FLG=3 DISABLE
    //      payload) → stall while raster_fifo_full is high.
    //      Over-stalling on qwords that would not cause a FIFO push is
    //      intentionally conservative and keeps the gate simple.
    //   3) With raster_fifo_full tied to 1'b0 and no IMAGE payload,
    //      in_ready is constantly high.
    assign in_ready = image_active ? image_data_ready
                                   : !raster_fifo_full;

    logic accept;
    assign accept = in_valid && in_ready;

    // Ch110 — IMAGE-mode data passthrough (combinational). The data
    // and last signals mirror the input bus at all times; only
    // image_data_valid is qualified, so FLG=3 DISABLE qwords are NOT
    // flagged as valid to the image consumer.
    assign image_data       = in_data;
    assign image_data_last  = in_last;
    assign image_data_valid = accept && image_active;

    // Combinational tag-field decode for the qword on the wire in S_TAG.
    logic [14:0] tag_nloop;
    logic        tag_eop;
    logic [1:0]  tag_flg;
    logic [3:0]  tag_nreg_field;
    logic [4:0]  tag_nreg_eff;
    logic [63:0] tag_regs;

    assign tag_nloop      = in_data[14:0];
    assign tag_eop        = in_data[15];
    assign tag_flg        = in_data[59:58];
    assign tag_nreg_field = in_data[63:60];
    // NREG field 0 encodes 16 registers.
    assign tag_nreg_eff   = (tag_nreg_field == 4'd0) ? 5'd16
                                                     : {1'b0, tag_nreg_field};
    assign tag_regs       = in_data[127:64];

    // Ch73 audit-low: select the current REGS nibble with a shift
    // rather than a variable indexed part-select. The shift amount
    // (reg_idx * 4) is formed by concatenation so it is a full 7 bits
    // wide and cannot wrap for high reg_idx values (the Ch73 bring-up
    // bug where reg_idx=8 aliased to shift=0).
    logic [6:0]  cur_regnib_shift;
    logic [63:0] regs_shifted;
    logic [3:0]  cur_regnib;

    assign cur_regnib_shift = {reg_idx, 2'b00};   // reg_idx*4 in 7 bits
    assign regs_shifted     = regs_q >> cur_regnib_shift;
    // Explicit low-nibble select instead of relying on implicit
    // 64-to-4-bit truncation of a masked value.
    assign cur_regnib       = regs_shifted[3:0];

    // ------------------------------------------------------------------
    // FSM
    // ------------------------------------------------------------------
    logic packed_last_in_loop;   // current PACKED entry is the last register of this loop
    logic packet_loop_last;      // current loop is the last loop of the packet

    assign packed_last_in_loop = (reg_idx + 5'd1 == nreg_eff_q);
    assign packet_loop_last    = (nloop_q == 15'd1);

    // Ch73: pre-compute REGLIST payload-qword count = ceil(NLOOP *
    // NREG / 2) from the tag on the wire, so the FSM only needs an
    // opaque countdown afterwards.
    logic [19:0] reglist_total_entries;
    logic [19:0] reglist_total_qwords;

    assign reglist_total_entries = tag_nloop * tag_nreg_eff;
    assign reglist_total_qwords  = (reglist_total_entries + 20'd1) >> 1;

    // State and tag context only change on an accepted qword.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state               <= S_TAG;
            nloop_q             <= 15'd0;
            eop_q               <= 1'b0;
            flg_q               <= 2'd0;
            nreg_eff_q          <= 5'd0;
            regs_q              <= 64'd0;
            reg_idx             <= 5'd0;
            payload_qwords_left <= 20'd0;
        end else if (accept) begin
            unique case (state)
                S_TAG: begin
                    nloop_q             <= tag_nloop;
                    eop_q               <= tag_eop;
                    flg_q               <= tag_flg;
                    nreg_eff_q          <= tag_nreg_eff;
                    regs_q              <= tag_regs;
                    reg_idx             <= 5'd0;
                    payload_qwords_left <= 20'd0;
                    if (tag_nloop == 15'd0) begin
                        state <= S_TAG;   // empty tag: no payload follows
                    end else begin
                        unique case (tag_flg)
                            2'd0: state <= S_PACKED;
                            2'd1: begin
                                state               <= S_REGLIST;
                                payload_qwords_left <= reglist_total_qwords;
                            end
                            default: begin
                                state               <= S_IMAGE;   // FLG=2/3
                                payload_qwords_left <= {5'd0, tag_nloop};
                            end
                        endcase
                    end
                end

                S_PACKED: begin
                    // One PACKED entry per accepted qword. reg_idx walks
                    // the REGS nibbles within a loop; nloop_q counts the
                    // remaining loops.
                    if (packed_last_in_loop) begin
                        reg_idx <= 5'd0;
                        if (packet_loop_last) state   <= S_TAG;
                        else                  nloop_q <= nloop_q - 15'd1;
                    end else begin
                        reg_idx <= reg_idx + 5'd1;
                    end
                end

                S_REGLIST, S_IMAGE: begin
                    // Both states consume opaque qwords and only count
                    // them down; individual REGLIST entries are not
                    // decoded here. The point of Ch73 is just: don't
                    // desync onto the next GIFtag.
                    if (payload_qwords_left == 20'd1) state <= S_TAG;
                    else payload_qwords_left <= payload_qwords_left - 20'd1;
                end

                default: state <= S_TAG;
            endcase
        end
    end

    // ------------------------------------------------------------------
    // GS write — fires only on PACKED A+D data accepts (current REGS
    // nibble == 0xE). REGLIST and IMAGE/DISABLE qwords never generate
    // GS writes in this module.
    //
    // Ch75: split into two output ports based on REAL_AD_REG_MAP.
    // Only one enable pulses per accept; the other stays low. Both
    // enables are registered and drop back to 0 on any cycle without a
    // qualifying accept; the addr/num/data registers hold their last
    // value.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            gs_wr_en      <= 1'b0;
            gs_wr_addr    <= 16'd0;
            gs_wr_data    <= 64'd0;
            gif_reg_wr_en <= 1'b0;
            gif_reg_num   <= 8'd0;
            gif_reg_data  <= 64'd0;
        end else if (accept && state == S_PACKED && cur_regnib == 4'hE) begin
            if (REAL_AD_REG_MAP) begin
                gs_wr_en      <= 1'b0;
                gif_reg_wr_en <= 1'b1;
                gif_reg_num   <= in_data[71:64];
                gif_reg_data  <= in_data[63:0];
            end else begin
                gs_wr_en      <= 1'b1;
                gs_wr_addr    <= in_data[79:64];
                gs_wr_data    <= in_data[63:0];
                gif_reg_wr_en <= 1'b0;
            end
        end else begin
            gs_wr_en      <= 1'b0;
            gif_reg_wr_en <= 1'b0;
        end
    end

    // ------------------------------------------------------------------
    // Trace — one registered event per accepted qword; ev_valid drops
    // on cycles without an accept while the other fields hold.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_GIF;
            ev_event  <= EV_GIFTAG;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (accept && state == S_TAG) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_GIF;
            ev_event  <= EV_GIFTAG;
            // arg0[3:0]=path_id, arg0[5:4]=flg → callers can grep by FLG
            ev_arg0   <= {58'd0, tag_flg, PATH_ID};
            // Compact tag summary:
            //   {eop[15], flg[14:13], nreg[12:9], nloop[8:0]}
            // Only the low 9 bits of NLOOP are reported. The zero pad
            // is 48 bits so the concatenation is exactly 64 bits wide.
            ev_arg1   <= {48'd0, tag_eop, tag_flg, tag_nreg_field,
                          tag_nloop[8:0]};
            ev_arg2   <= tag_regs;
            ev_arg3   <= 64'd0;
            ev_flags  <= {30'd0, in_last, 1'b1};   // bit0=is_tag
        end else if (accept && state == S_PACKED) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_GIF;
            // A+D entries are traced as writes, every other nibble as
            // a no-op mode event.
            ev_event  <= (cur_regnib == 4'hE) ? EV_WRITE : EV_MODE;
            ev_arg0   <= {60'd0, PATH_ID};
            ev_arg1   <= {60'd0, cur_regnib};
            ev_arg2   <= {48'd0, in_data[79:64]};
            ev_arg3   <= in_data[63:0];
            ev_flags  <= {30'd0, in_last, 1'b0};   // bit0=is_data
        end else if (accept && state == S_REGLIST) begin
            // One EV_MODE trace per accepted REGLIST qword (no GS
            // write). Only the low 64 bits of the qword are reported
            // in arg3; the high-half entry is not traced in this
            // minimal Ch73 path.
            // NOTE: reg_idx is cleared at tag accept and never
            // advanced in S_REGLIST, so cur_regnib (arg1) is always
            // REGS nibble 0 here, not the nibble of the entry being
            // consumed.
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_GIF;
            ev_event  <= EV_MODE;
            ev_arg0   <= {60'd0, PATH_ID};
            ev_arg1   <= {60'd0, cur_regnib};
            ev_arg2   <= 64'd0;
            ev_arg3   <= in_data[63:0];   // low-half entry data
            ev_flags  <= {30'd0, in_last, 1'b0};
        end else if (accept && state == S_IMAGE) begin
            // Covers both FLG=2 IMAGE and FLG=3 DISABLE payload
            // qwords; arg0 carries the latched FLG to tell them apart.
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_GIF;
            ev_event  <= EV_MODE;
            ev_arg0   <= {58'd0, flg_q, PATH_ID};
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= in_data[63:0];
            ev_flags  <= {30'd0, in_last, 1'b0};
        end else begin
            ev_valid  <= 1'b0;
        end
    end

endmodule : gif_packed_stub
+128
View File
@@ -0,0 +1,128 @@
// retroDE_ps2 — gif_path_stub
//
// Narrow GIF ingress stub for Wave 2. Accepts qword payloads from
// dmac_reg_stub, interprets them as register-write packets (project-local
// format — see below), and forwards register writes into gs_stub.
//
// This is NOT real GIFtag decode. Real GIFtag/PACKED/REGLIST/IMAGE formats
// arrive in Wave 3. The wire format here is a project-local shortcut sized
// to the "programmable BGCOLOR via DMA/GIF" target in
// docs/wave2_dma_gif_plan.md.
//
// Project-local Wave 2 packet format (per qword):
// bits [ 15: 0] = target register offset within GS privileged block
// (e.g., 0x00E0 for BGCOLOR)
// bits [ 79: 16] = 64-bit register value (low 24 bits are RGB for BGCOLOR)
// bits [127: 80] = reserved, must be zero
//
// Each qword produced by the DMAC is treated as one standalone register
// write. This module is stateless with respect to packet framing —
// multi-beat transfers (Wave 2.6 onward) work transparently because every
// accepted qword is independently decoded. `in_last` is preserved as
// trace-visible metadata in ev_flags[0] but does not gate decode. Real
// GIFtag/PACKED/REGLIST/IMAGE format decode, along with tag-phase vs.
// data-phase state, is deferred to Wave 3.
//
// PATH selection is hard-scoped to the DMAC channel-2 path (PATH id 2)
// since no arbitration exists yet.
//
// Trace payload schema:
// GIF GIFTAG arg0=path_id arg1=packet_type arg2=reg_offset arg3=payload_lo
`timescale 1ns/1ps
module gif_path_stub
    import trace_pkg::*;
#(
    parameter logic [3:0] PATH_ID = 4'd2
) (
    input  logic         clk,
    input  logic         rst_n,        // synchronous, active-low

    // Upstream from DMAC
    input  logic         in_valid,
    input  logic [127:0] in_data,
    input  logic         in_last,
    output logic         in_ready,

    // Downstream to gs_stub (register-write style)
    output logic         gs_wr_en,
    output logic [15:0]  gs_wr_addr,
    output logic [63:0]  gs_wr_data,

    // Trace
    output logic         ev_valid,
    output subsys_e      ev_subsys,
    output event_e       ev_event,
    output logic [63:0]  ev_arg0,
    output logic [63:0]  ev_arg1,
    output logic [63:0]  ev_arg2,
    output logic [63:0]  ev_arg3,
    output logic [31:0]  ev_flags
);

    // Every accepted qword is decoded on its own as one register
    // write: bits[15:0] are the target offset and bits[79:16] the
    // 64-bit value. No framing state is kept between qwords.

    // Packet-type code reported in trace arg1. Wave 2 has a single
    // packet kind, the register-write packet.
    localparam logic [15:0] PKT_TYPE_REG_WRITE = 16'hA01A;

    // The stub never applies backpressure.
    assign in_ready = 1'b1;

    // Handshake qualifier for the current cycle.
    logic beat_taken;
    assign beat_taken = in_valid && in_ready;

    // Field extraction from the qword currently on the input bus.
    logic [15:0] reg_offset;
    logic [63:0] reg_value;
    assign reg_value  = in_data[79:16];
    assign reg_offset = in_data[15:0];

    // ------------------------------------------------------------------
    // Register-write output: gs_wr_en is a registered copy of the
    // handshake, so it is high for exactly one cycle per accepted
    // qword. Address and data update only on an accept and otherwise
    // hold their previous value.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            gs_wr_en   <= '0;
            gs_wr_addr <= '0;
            gs_wr_data <= '0;
        end else begin
            if (beat_taken) begin
                gs_wr_data <= reg_value;
                gs_wr_addr <= reg_offset;
            end
            gs_wr_en <= beat_taken;
        end
    end

    // ------------------------------------------------------------------
    // Trace output: one EV_GIFTAG event per accepted qword. ev_valid
    // falls back to 0 on idle cycles; the remaining fields keep the
    // contents of the most recent event.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_flags  <= '0;
            ev_arg3   <= '0;
            ev_arg2   <= '0;
            ev_arg1   <= '0;
            ev_arg0   <= '0;
            ev_event  <= EV_GIFTAG;
            ev_subsys <= SUBSYS_GIF;
            ev_valid  <= 1'b0;
        end else if (beat_taken) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_GIF;
            ev_event  <= EV_GIFTAG;
            // arg0 = path id, arg1 = packet type, arg2 = register
            // offset, arg3 = low 32 bits of the register value.
            ev_arg0   <= {{60{1'b0}}, PATH_ID};
            ev_arg1   <= {{48{1'b0}}, PKT_TYPE_REG_WRITE};
            ev_arg2   <= {{48{1'b0}}, reg_offset};
            ev_arg3   <= {32'd0, reg_value[31:0]};
            // flags[0] carries the upstream in_last marker.
            ev_flags  <= {{31{1'b0}}, in_last};
        end else begin
            ev_valid  <= 1'b0;
        end
    end

endmodule : gif_path_stub
+157
View File
@@ -0,0 +1,157 @@
// retroDE_ps2 — gs_alpha_blend
//
// Brick 2a — GS ALPHA blending (transparency), the source-over case.
//
// Computes, per RGB channel:
// Cv = (((Cs - Cd) * As) >> 7) + Cd (clamped to [0,255])
// where
// Cs = source color (the sprite's RGBAQ color channel)
// Cd = destination (the framebuffer pixel READ back at the write addr)
// As = source alpha (RGBAQ.A; PS2 0..128 scale where 0x80 == 1.0)
//
// This is the standard PS2 GS ALPHA register config A=0(Cs) B=1(Cd)
// C=0(As) D=1(Cd) — i.e. the canonical alpha-over blend. The brick-2a
// scope implements ONLY this config; selecting any other (A,B,C,D)
// tuple is handled by the caller (gs_stub) which falls back to an
// opaque write for unsupported configs, so this unit is always asked
// for the source-over result.
//
// Purely combinational: a subtract, a multiply by As (<= 8 bits), an
// arithmetic shift right by 7, an add, and a clamp. No divide. Fully
// synthesizable — there is NO `// synthesis translate_off` on this
// path. The >>7 is a wire shift; the *As is a single small multiply.
//
// The (Cs - Cd) term is signed (can be negative when the dest is
// brighter than the source), so the multiply and the shift are done
// in signed arithmetic and the final sum is clamped back into the
// unsigned [0,255] byte range. As is treated as unsigned 0..128; the
// caller passes RGBAQ.A[7:0] (real GS uses A[6:0]*2 internally for
// the 0..128 mapping, but A[7:0] already encodes 0x80=1.0 for the
// values our demo programs, and clamping As at 128 keeps a stray
// A>0x80 from over-shooting).
//
// Alpha (the A channel of the output) follows real-GS behavior for a
// framebuffer write: the SOURCE alpha is written through. brick-2a
// keeps the existing emit-lane A byte (= source A) unchanged; only
// R/G/B are blended. The 'a_out' port forwards the source A so the
// caller can repack the 32-bit ABGR.
`timescale 1ns/1ps
// gs_alpha_blend — combinational per-channel GS alpha blend.
//
// ALPHA_MODES = 0 (default): fixed source-over blend
//     Cv = clamp((((Cs - Cd) * As) >>> 7) + Cd)
//   The selector / ad / fix inputs are not used by the outputs in this mode.
// ALPHA_MODES = 1: generic blend
//     Cv = clamp((((A - B) * C) >>> 7) + D)
//   with A/B/D chosen from {Cs, Cd, 0} and C from {As, Ad, FIX}.
//
// a_out always carries the raw (unclamped) source alpha.
module gs_alpha_blend #(
    parameter bit ALPHA_MODES = 1'b0   // 0 = source-over only, 1 = selector-driven datapath
) (
    // Source (sprite) colour and alpha. Alpha uses the 0..128 scale (0x80 == 1.0).
    input  logic [7:0] cs_r,
    input  logic [7:0] cs_g,
    input  logic [7:0] cs_b,
    input  logic [7:0] as,
    // Destination (framebuffer) colour.
    input  logic [7:0] cd_r,
    input  logic [7:0] cd_g,
    input  logic [7:0] cd_b,
    // Operand selectors (consumed only by the ALPHA_MODES=1 datapath):
    //   a_sel / b_sel / d_sel : 0 = Cs, 1 = Cd, anything else = 0
    //   c_sel                 : 0 = As (clamped), 1 = Ad, anything else = FIX
    input  logic [1:0] a_sel,
    input  logic [1:0] b_sel,
    input  logic [1:0] c_sel,
    input  logic [1:0] d_sel,
    input  logic [7:0] ad,    // destination alpha, used when c_sel == 1
    input  logic [7:0] fix,   // fixed coefficient, used when c_sel is 2 or 3
    // Blended colour and the pass-through source alpha.
    output logic [7:0] cv_r,
    output logic [7:0] cv_g,
    output logic [7:0] cv_b,
    output logic [7:0] a_out
);

    // Source alpha saturated at 0x80: values above 1.0 behave as exactly 1.0.
    logic [7:0] as_eff;
    assign as_eff = (as <= 8'd128) ? as : 8'd128;

    // Saturate a signed intermediate into the unsigned byte range [0, 255].
    function automatic logic [7:0] sat_u8(input logic signed [31:0] v);
        if (v < 32'sd0)
            return 8'd0;
        else if (v > 32'sd255)
            return 8'd255;
        else
            return v[7:0];
    endfunction

    // Source-over blend of one channel: dst + (((src - dst) * coeff) >>> 7).
    // coeff is expected in 0..128, so the 18-bit intermediates cannot overflow.
    function automatic logic [7:0] blend_ch(input logic [7:0] src,
                                            input logic [7:0] dst,
                                            input logic [7:0] coeff);
        logic signed [9:0]  delta;    // src - dst, -255..+255
        logic signed [17:0] scaled;   // delta * coeff, then shifted
        logic signed [17:0] mixed;    // scaled + dst
        delta  = $signed({2'b00, src}) - $signed({2'b00, dst});
        scaled = delta * $signed({1'b0, coeff});
        scaled = scaled >>> 7;                     // arithmetic: keeps the sign of delta
        mixed  = scaled + $signed({10'd0, dst});
        return sat_u8(mixed);                      // signed 18 -> 32 extension, then clamp
    endfunction

    // Selector-driven blend of one channel: (((A - B) * coeff) >>> 7) + D.
    // coeff is the full unsigned 8-bit value (0..255).
    function automatic logic [7:0] blend_generic(
            input logic [7:0] src,   input logic [7:0] dst,
            input logic [1:0] sel_a, input logic [1:0] sel_b,
            input logic [1:0] sel_d, input logic [7:0] coeff);
        logic [7:0]         term_a;
        logic [7:0]         term_b;
        logic [7:0]         term_d;
        logic signed [31:0] acc;
        // Selector values 2 and 3 both pick the constant zero.
        term_a = (sel_a == 2'd1) ? dst : (sel_a == 2'd0) ? src : 8'd0;
        term_b = (sel_b == 2'd1) ? dst : (sel_b == 2'd0) ? src : 8'd0;
        term_d = (sel_d == 2'd1) ? dst : (sel_d == 2'd0) ? src : 8'd0;
        acc = $signed({1'b0, term_a}) - $signed({1'b0, term_b});
        acc = acc * $signed({24'd0, coeff});
        acc = acc >>> 7;                           // arithmetic shift
        acc = acc + $signed({24'd0, term_d});
        return sat_u8(acc);
    endfunction

    // One C coefficient shared by the three colour channels.
    logic [7:0] coef_c;
    assign coef_c = (c_sel == 2'd1) ? ad :
                    (c_sel == 2'd0) ? as_eff : fix;

    generate
        if (ALPHA_MODES) begin : g_generic
            assign cv_r = blend_generic(cs_r, cd_r, a_sel, b_sel, d_sel, coef_c);
            assign cv_g = blend_generic(cs_g, cd_g, a_sel, b_sel, d_sel, coef_c);
            assign cv_b = blend_generic(cs_b, cd_b, a_sel, b_sel, d_sel, coef_c);
        end else begin : g_source_over
            // Fixed source-over path: selectors play no part in the result.
            assign cv_r = blend_ch(cs_r, cd_r, as_eff);
            assign cv_g = blend_ch(cs_g, cd_g, as_eff);
            assign cv_b = blend_ch(cs_b, cd_b, as_eff);
        end
    endgenerate

    // Output alpha is the unmodified source alpha (not the clamped copy).
    assign a_out = as;

endmodule : gs_alpha_blend
+89
View File
@@ -0,0 +1,89 @@
// retroDE_ps2 — gs_async_fifo (Ch318)
//
// Generic dual-clock (asynchronous) FIFO with gray-code pointers and 2-FF pointer
// synchronizers — the standard CDC-safe ring buffer. Used by gs_lpddr_axi_master to
// cross 256-bit framebuffer-row packets {addr,data,strb} from the GS clock domain to
// the f2sdram (LPDDR AXI) clock domain. Both domains are treated as GENUINELY async
// even when nominally the same frequency (GS = PLL design_clk; f2sdram = raw board
// clock), per the Ch318 directive.
//
// DEPTH must be a power of two. `wr`/`rd` are single-cycle handshakes gated by
// !full / !empty. Standard caveats: do NOT assert wr when full or rd when empty
// (the wrapper gates both). One-deep gray pointers, single 2-FF synchronizer each
// way — adequate for the modest packet rate (one 32-byte beat per 16 flushed pixels).
// gs_async_fifo — dual-clock FIFO using gray-coded pointers and 2-FF synchronizers.
//
// Write side (wclk): `wr` stores `wdata` and advances the write pointer unless `wfull`
// is set, in which case the write is silently ignored (the gating is internal).
// Read side (rclk): `rdata` continuously shows the entry at the current read pointer;
// `rd` advances the pointer unless `rempty` is set, in which case it is ignored.
//
// DEPTH must be a power of two. The full comparison slices rgray_s2[AW-2:0], so AW
// must be at least 2, i.e. DEPTH >= 4.
module gs_async_fifo #(
parameter int WIDTH = 320, // {addr[31:0], data[255:0], strb[31:0]}
parameter int DEPTH = 16 // power of two (>= 4, see header)
) (
// write domain
input logic wclk,
input logic wrst_n, // async, active-low; clears write pointers, wfull and the read-pointer synchronizer
input logic wr,
input logic [WIDTH-1:0] wdata,
output logic wfull, // registered full flag
// read domain
input logic rclk,
input logic rrst_n, // async, active-low; clears read pointers and the write-pointer synchronizer
input logic rd,
output logic [WIDTH-1:0] rdata, // combinational read of mem at the read pointer
output logic rempty // combinational: read gray == synchronized write gray
);
// Address width of the storage array; pointers carry one extra wrap bit (AW+1 bits).
localparam int AW = $clog2(DEPTH);
// Storage array: written on wclk, read combinationally through rdata.
logic [WIDTH-1:0] mem [0:DEPTH-1];
// ---- binary + gray pointers (one extra MSB for full/empty disambiguation) ----
logic [AW:0] wbin, wgray, wbin_nxt, wgray_nxt;
logic wfull_nxt; // Ch352 — combinational next-value for the now-REGISTERED wfull
logic [AW:0] rbin, rgray, rbin_nxt, rgray_nxt;
// synchronized opposite-domain gray pointers (2-FF)
logic [AW:0] rgray_s1, rgray_s2; // read gray -> write domain
logic [AW:0] wgray_s1, wgray_s2; // write gray -> read domain
// Binary -> gray conversion: each bit XORed with its upper neighbour.
function automatic logic [AW:0] bin2gray(input logic [AW:0] b);
bin2gray = b ^ (b >> 1);
endfunction
// ---------------- write domain ----------------
// The write pointer advances only for an accepted write (wr while not full).
assign wbin_nxt = wbin + (wr && !wfull);
assign wgray_nxt = bin2gray(wbin_nxt);
// full: next write gray == read gray with top two bits inverted. Ch352 — wfull is now a REGISTERED flag
// (Cummings canonical). The previous `assign wfull = (wgray_nxt == ...)` was combinational, and since
// wgray_nxt <- wbin_nxt <- wfull, it formed a wbin_nxt->wgray_nxt->wfull->wbin_nxt COMBINATIONAL LOOP that
// Quartus reports and that made Place churn. Registering it breaks the loop with no overflow-behavior change:
// wfull still asserts the cycle after the filling write (full is computed from wgray_nxt = the pointer AFTER
// the current write), so the (DEPTH+1)th write is still blocked. rempty is intentionally left unchanged.
assign wfull_nxt = (wgray_nxt == {~rgray_s2[AW:AW-1], rgray_s2[AW-2:0]});
// Write-domain registers: pointers, full flag, and the 2-FF synchronizer for the read gray pointer.
always_ff @(posedge wclk or negedge wrst_n) begin
if (!wrst_n) begin
wbin <= '0; wgray <= '0; wfull <= 1'b0;
rgray_s1 <= '0; rgray_s2 <= '0;
end else begin
wbin <= wbin_nxt;
wgray <= wgray_nxt;
wfull <= wfull_nxt;
rgray_s1 <= rgray; // sync read gray into write domain
rgray_s2 <= rgray_s1;
end
end
// Storage write: no reset, gated by the same accept condition as the pointer increment.
always_ff @(posedge wclk) if (wr && !wfull) mem[wbin[AW-1:0]] <= wdata;
// ---------------- read domain ----------------
// The read pointer advances only for an accepted read (rd while not empty).
assign rbin_nxt = rbin + (rd && !rempty);
assign rgray_nxt = bin2gray(rbin_nxt);
// Read-domain registers: pointers and the 2-FF synchronizer for the write gray pointer.
always_ff @(posedge rclk or negedge rrst_n) begin
if (!rrst_n) begin
rbin <= '0; rgray <= '0;
wgray_s1 <= '0; wgray_s2 <= '0;
end else begin
rbin <= rbin_nxt;
rgray <= rgray_nxt;
wgray_s1 <= wgray; // sync write gray into read domain
wgray_s2 <= wgray_s1;
end
end
// Show-ahead read data: valid whenever rempty is low, without needing rd.
assign rdata = mem[rbin[AW-1:0]];
// Empty is derived from the CURRENT read gray pointer (not rgray_nxt), so it is not registered.
assign rempty = (rgray == wgray_s2);
endmodule : gs_async_fifo
+88
View File
@@ -0,0 +1,88 @@
// ============================================================================
// gs_grad_divider.sv (Ch352 — sequential signed divider for the triangle-setup gradient solve)
//
// Replaces the single combinational `grad_num_q[grad_step] / grad_det_q` in gs_stub. That combinational
// divider is a ~6700-cell, ~100ns cone at the 25MHz design clock — the worst setup path, and (the real lesson)
// it CANNOT be covered by any SDC timing exception: both a multicycle and a false_path made the Quartus fitter
// grind on its cone indefinitely (Place stuck <1% for hours). A sequential divider has REGISTERED iterations and
// no combinational cone, so every internal path is an ordinary single-cycle path that closes timing normally —
// no exception needed, no grind.
//
// BIT-EXACT to SystemVerilog signed `/`:
// * truncation toward zero (divide magnitudes, then apply the XOR-of-signs);
// * den == 0 -> quotient 0 (matches the gs_stub `if (grad_det_q==0) grad_quo=0` guard).
// Restoring division of the W-bit magnitudes (W iterations), one iteration per clock.
//
// Handshake: pulse `start` with num/den stable -> `busy` high for the solve -> `done` pulses for one cycle
// with `quo` valid (and stays valid until the next start). The gs_stub gradient FSM waits on `done`.
// ============================================================================
`timescale 1ns/1ps
// gs_grad_divider — iterative signed divider, one quotient bit per clock.
//
// A `start` pulse while idle samples num/den. The quotient magnitude is produced by W
// restoring-division steps on |num| and |den|, then negated when the operand signs
// differ (truncation toward zero). A zero divisor short-circuits to quo = 0 with `done`
// raised on the very next clock. `done` is a single-cycle pulse; `quo` holds its value
// until a later divide overwrites it. `start` is ignored while `busy` is high.
module gs_grad_divider #(
    parameter int W = 56                  // operand / quotient width in bits
)(
    input  logic                clk,
    input  logic                rst_n,    // async, active-low
    input  logic                start,    // begin a divide (honoured only when !busy)
    input  logic signed [W-1:0] num,      // dividend, sampled on the accepted start
    input  logic signed [W-1:0] den,      // divisor, sampled on the accepted start
    output logic signed [W-1:0] quo,      // truncate-toward-zero quotient; 0 when den == 0
    output logic                busy,     // high while the W iterations are running
    output logic                done      // one-clock pulse: quo is valid
);

    // Wide enough to count down from W.
    localparam int CNT_W = $clog2(W + 1);

    // Two's-complement magnitude. The most negative input maps to 2^(W-1), which still
    // fits in W unsigned bits.
    function automatic logic [W-1:0] magnitude(input logic signed [W-1:0] x);
        return x[W-1] ? (~x + 1'b1) : x;
    endfunction

    // ---- datapath state ----
    logic [W:0]       part_rem;     // partial remainder (one guard bit for the compare)
    logic [W-1:0]     shreg;        // dividend bits leave at the top, quotient bits enter at the bottom
    logic [W-1:0]     divisor;      // |den|
    logic             neg_result;   // operand signs differ -> negate the magnitude quotient
    logic [CNT_W-1:0] steps_left;   // iterations still to run
    logic             stepping;     // iteration loop active

    // ---- one restoring step, combinational ----
    logic [W:0]   trial;            // remainder shifted left with the next dividend bit
    logic         fits;             // divisor fits into the trial remainder
    logic [W:0]   part_rem_nxt;
    logic [W-1:0] shreg_nxt;

    assign trial        = {part_rem[W-1:0], shreg[W-1]};
    assign fits         = (trial >= {1'b0, divisor});
    assign part_rem_nxt = fits ? (trial - {1'b0, divisor}) : trial;
    assign shreg_nxt    = {shreg[W-2:0], fits};

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            part_rem   <= '0;
            shreg      <= '0;
            divisor    <= '0;
            neg_result <= 1'b0;
            steps_left <= '0;
            stepping   <= 1'b0;
            busy       <= 1'b0;
            done       <= 1'b0;
            quo        <= '0;
        end else begin
            done <= 1'b0;                          // default: done is a pulse

            if (start && !busy && (den == '0)) begin
                // Divide by zero: answer is defined as 0, no iterations needed.
                quo      <= '0;
                done     <= 1'b1;
                busy     <= 1'b0;
                stepping <= 1'b0;
            end else if (start && !busy) begin
                // Capture magnitudes and the result sign, then arm W iterations.
                part_rem   <= '0;
                shreg      <= magnitude(num);
                divisor    <= magnitude(den);
                neg_result <= num[W-1] ^ den[W-1];
                steps_left <= CNT_W'(W);
                stepping   <= 1'b1;
                busy       <= 1'b1;
            end else if (stepping) begin
                part_rem   <= part_rem_nxt;
                shreg      <= shreg_nxt;
                steps_left <= steps_left - 1'b1;
                if (steps_left == CNT_W'(1)) begin
                    // Last step: shreg_nxt is the complete magnitude quotient.
                    stepping <= 1'b0;
                    busy     <= 1'b0;
                    done     <= 1'b1;
                    quo      <= neg_result ? (~shreg_nxt + 1'b1) : shreg_nxt;
                end
            end
        end
    end

endmodule
+248
View File
@@ -0,0 +1,248 @@
// retroDE_ps2 — gs_lpddr_axi_master (Ch318)
//
// HARDWARE-facing wrapper that takes the PSMCT16 tile-FLUSH pixel stream (GS clock)
// and writes it to real LPDDR over the qsys f2sdram AXI4 port (f2sdram clock). It
// does NOT modify the proven gs_lpddr_fb_writer (the Ch317 sim model) — it is a
// sibling hardware path with the same input stream.
//
// Pipeline (per the Ch318 directive):
// GS clock : PACKER — accumulate 16 PSMCT16 pixels of a tile-row into one 256-bit
// (32-byte) beat {addr, data, strb}. A tile-row is exactly 16 px on a
// 32-byte-aligned line, so a beat completes naturally on its 16th px
// (no dangling partial beat). On completion, push to the async FIFO.
// async FIFO: gray-code CDC, carries {addr[31:0], data[255:0], strb[31:0]} (320b).
// f2sdram : AXI burst FSM — pop a beat and issue a single-beat INCR write
// (AWSIZE=5 = 32 B, AWLEN=0, AWBURST=INCR, full per-byte WSTRB, never
// crossing a 4 KiB boundary since each beat is one 32-byte line). AW
// then W then B, all with backpressure (await ready/valid).
//
// Address: awaddr = FB_BASE + packet_addr (packet_addr is the FB-relative byte addr
// from raster_pixel_fb_addr_q). FB_BASE must point at a LINUX-SAFE reserved LPDDR
// region before any board run — the qsys aperture proves fabric CAN address SDRAM,
// not which physical range is safe to scribble on (Ch318 board gate).
//
// Counters (TB/status readable): beats, bursts, bresp_err and the done-ish `idle` flag
// (FSM idle && fifo empty) are f2sdram-domain; the fifo overflow counter is updated in
// the GS-clock packer. enable=0 → the packer accepts no pixels.
// gs_lpddr_axi_master — packs a PSMCT16 pixel stream (gs_clk) into 32-byte beats, carries them
// through gs_async_fifo, and issues one single-beat AXI4 write per beat (axi_clk).
//
// Clock domains in this module:
//   gs_clk  : packer, fifo write side, fifo_overflow_count, arm_gs (synchronized copy of arm_axi)
//   axi_clk : control snapshot (arm_axi / canary_axi / fb_base_axi), fifo read side, AXI FSM,
//             beats_written / bursts_issued / bresp_err_count / idle
module gs_lpddr_axi_master #(
parameter int FIFO_DEPTH = 16
) (
// GS clock domain — flush pixel stream
input logic gs_clk,
input logic gs_rst_n,
input logic enable,
// ---- RUNTIME controls (from the HPS bridge register; NOT synchronous to axi_clk — see ctrl_commit) ----
// arm: HARD SAFETY GATE — no beat is admitted to the AXI FSM unless the latched copy (arm_axi) is high.
// The latched copy resets LOW, so the booted core is inert until the HPS explicitly arms it.
// arm_axi is additionally 2-FF synchronized into gs_clk (arm_gs) to gate the packer.
input logic arm,
// canary: when high, write ONLY the offset-0 beat (the 32-byte top-of-frame line)
// and discard all others — a deterministic, blast-radius-limited first test.
input logic canary,
// fb_base: LPDDR byte base address for the framebuffer (e.g. 0x8000_0000). awaddr
// = fb_base + frame-relative offset. Runtime so a wrong base is re-targetable
// without a rebuild.
input logic [31:0] fb_base,
// Ch352 CDC (Codex) — {arm,canary,fb_base} arrive RAW from the HPS bridge (CLOCK2_50), not in the
// axi_clk domain. ctrl_commit is a TOGGLE the bridge flips on any control write; we sync it
// into axi_clk and latch the controls on its edge, so the multi-bit fb_base crosses COHERENTLY (the CDC
// lives here, at the receiving boundary, so no caller can supply raw controls into the AW path).
input logic ctrl_commit,
input logic px_emit,
input logic [31:0] px_addr, // FB-relative byte address (raster_pixel_fb_addr_q)
input logic [15:0] px_pix16,
// f2sdram (LPDDR AXI) clock domain
input logic axi_clk,
input logic axi_rst_n,
// AXI4 write-address
output logic [31:0] awaddr,
output logic [7:0] awlen,
output logic [2:0] awsize,
output logic [1:0] awburst,
output logic [4:0] awid,
output logic awvalid,
input logic awready,
// AXI4 write-data
output logic [255:0] wdata,
output logic [31:0] wstrb,
output logic wlast,
output logic wvalid,
input logic wready,
// AXI4 write-response
input logic bvalid,
output logic bready,
input logic [1:0] bresp,
// status / counters. beats_written, bursts_issued, bresp_err_count and idle are registered or
// derived in axi_clk; fifo_overflow_count is updated in the gs_clk packer block below.
output logic [31:0] beats_written,
output logic [31:0] bursts_issued,
output logic [31:0] bresp_err_count,
output logic [31:0] fifo_overflow_count,
output logic idle
);
localparam int PW = 320; // {addr[31:0], data[255:0], strb[31:0]}
// ============================ GS-clock PACKER ============================
// Beat under construction: line address, 16 x 16-bit lanes, and the per-byte strobe mask.
logic [31:0] cur_addr;
logic [255:0] cur_data;
logic [31:0] cur_strb;
logic has_data; // a partial beat is currently held in cur_*
logic fifo_wr; // registered push request (one cycle after the completing pixel)
logic [PW-1:0] fifo_wdata;
logic fifo_wfull;
// Ch352 — axi_clk control snapshot: sync the bridge commit toggle and latch {arm,canary,fb_base} on its
// edge. Init to the bridge's SAFE defaults (arm=0, canary=1, fb_base=0x8000_0000) so the booted core is
// inert until the HPS arms it, even before the first commit. All axi_clk uses + the gs_clk arm-sync read
// these coherent latched copies instead of the raw bridge buses.
logic [2:0] commit_sync;
logic arm_axi, canary_axi;
logic [31:0] fb_base_axi;
always_ff @(posedge axi_clk or negedge axi_rst_n) begin
if (!axi_rst_n) begin
commit_sync <= 3'd0; arm_axi <= 1'b0; canary_axi <= 1'b1; fb_base_axi <= 32'h8000_0000;
end else begin
// 3-stage shift register: [1:0] synchronize the toggle, [2] is the delayed copy for edge detection.
commit_sync <= {commit_sync[1:0], ctrl_commit};
if (commit_sync[2] != commit_sync[1]) begin // commit edge: bridge buses are stable, latch them
arm_axi <= arm;
canary_axi <= canary;
fb_base_axi <= fb_base;
end
end
end
// High for the one cycle the snapshot updates. Admission is blocked then so the FSM never consumes a beat
// straddling a config change (old base/arm on the pop cycle, new on the next).
wire commit_edge = (commit_sync[2] != commit_sync[1]);
// arm crosses from axi_clk into gs_clk — 2-FF synchronizer (from the COHERENT latched arm).
logic arm_s1, arm_gs;
always_ff @(posedge gs_clk or negedge gs_rst_n) begin
if (!gs_rst_n) begin arm_s1 <= 1'b0; arm_gs <= 1'b0; end
else begin arm_s1 <= arm_axi; arm_gs <= arm_s1; end
end
// Packer: merges each emitted pixel into its 16-bit lane of the current 32-byte line.
// A beat is pushed either when all 16 lanes are present, or when a pixel for a DIFFERENT line
// arrives while a partial beat is held. A partial beat with no following pixel stays in cur_*.
always_ff @(posedge gs_clk or negedge gs_rst_n) begin
if (!gs_rst_n) begin
cur_addr <= '0; cur_data <= '0; cur_strb <= '0; has_data <= 1'b0;
fifo_wr <= 1'b0; fifo_wdata <= '0; fifo_overflow_count <= '0;
end else begin
fifo_wr <= 1'b0; // default: push request is a one-cycle pulse
if (enable && arm_gs && px_emit) begin // gate: no accumulation until armed
logic [31:0] abeat; // 32-byte-aligned line address of this pixel
logic [3:0] lane; // 0..15 (which 16-bit lane)
logic [255:0] nd; // beat data with this pixel merged in
logic [31:0] ns; // beat strobes with this pixel merged in
abeat = {px_addr[31:5], 5'd0};
lane = px_addr[4:1];
if (has_data && (abeat != cur_addr)) begin
// line changed before the previous beat filled — flush it, restart
fifo_wdata <= {cur_addr, cur_data, cur_strb};
fifo_wr <= 1'b1;
cur_addr <= abeat;
cur_data <= (256'(px_pix16) << ({28'd0, lane} * 16));
cur_strb <= (32'd3 << ({28'd0, lane} * 2));
has_data <= 1'b1;
end else begin
// same line (or nothing held): merge into the held beat, or into a fresh empty one
nd = has_data ? cur_data : 256'd0;
ns = has_data ? cur_strb : 32'd0;
nd[ ({28'd0, lane} * 16) +: 16 ] = px_pix16;
ns[ ({28'd0, lane} * 2) +: 2 ] = 2'b11;
if (&ns) begin
// beat complete (all 16 lanes) — flush, beat consumed
fifo_wdata <= {abeat, nd, ns};
fifo_wr <= 1'b1;
has_data <= 1'b0;
end else begin
cur_addr <= abeat;
cur_data <= nd;
cur_strb <= ns;
has_data <= 1'b1;
end
end
end
// overflow witness: a push attempt while the FIFO is full (must stay 0)
// The FIFO instance masks that push (wr is fifo_wr && !fifo_wfull), so the beat is dropped.
if (fifo_wr && fifo_wfull)
fifo_overflow_count <= fifo_overflow_count + 32'd1;
end
end
// ============================ async FIFO (CDC) ============================
logic [PW-1:0] fifo_rdata;
logic fifo_rempty;
logic fifo_rd;
// Ch323 — reset BOTH FIFO pointers from the STABLE axi_rst_n (assert async, deassert
// synced into gs_clk). gs_rst_n (= core reset) toggles on every CORE_CTRL re-render; if
// the write pointer reset followed it while the read pointer stayed, the gray pointers
// would desync → FIFO corruption (phantom beats, no commit). Same fix as gs_z_flush_writer.
reg [1:0] wrst_sync;
always_ff @(posedge gs_clk or negedge axi_rst_n) begin
if (!axi_rst_n) wrst_sync <= 2'b00;
else wrst_sync <= {wrst_sync[0], 1'b1};
end
wire fifo_wrst_n = wrst_sync[1];
gs_async_fifo #(.WIDTH(PW), .DEPTH(FIFO_DEPTH)) u_fifo (
.wclk(gs_clk), .wrst_n(fifo_wrst_n), .wr(fifo_wr && !fifo_wfull), .wdata(fifo_wdata), .wfull(fifo_wfull),
.rclk(axi_clk), .rrst_n(axi_rst_n), .rd(fifo_rd), .rdata(fifo_rdata), .rempty(fifo_rempty)
);
// ============================ f2sdram-clock AXI FSM ============================
// One beat at a time: S_IDLE (pop) -> S_AW (address) -> S_W (data) -> S_B (response) -> S_IDLE.
localparam logic [1:0] S_IDLE=2'd0, S_AW=2'd1, S_W=2'd2, S_B=2'd3;
logic [1:0] state;
logic [31:0] beat_addr; // FB-relative address of the popped beat; not read elsewhere in this module
logic [255:0] beat_data;
logic [31:0] beat_strb;
logic [31:0] awaddr_q; // Ch352 — full AW address latched at admission, held stable AW->W->B
assign awsize = 3'd5; // 32 bytes/beat (256-bit)
assign awburst = 2'b01; // INCR
assign awid = 5'd0;
assign awlen = 8'd0; // single beat per line (tile-rows aren't contiguous)
assign awaddr = awaddr_q; // Ch352 — latched at admission; STABLE through AW->W->B (AXI requires it)
assign wdata = beat_data;
assign wstrb = beat_strb;
assign wlast = 1'b1; // 1-beat burst
// Ch352 — AXI transaction stability (Codex): arm_axi/commit gate ADMISSION ONLY (S_IDLE pop). Once a beat is
// admitted, awvalid/wvalid are driven by STATE alone and run to completion, so a later arm-deassert or a
// fb_base commit can never drop VALID mid-handshake or move awaddr while AWVALID && !AWREADY.
assign awvalid = (state == S_AW);
assign wvalid = (state == S_W);
assign bready = (state == S_B);
// The pop condition must match the S_IDLE admission test in the FSM below exactly.
assign fifo_rd = (state == S_IDLE) && !fifo_rempty && arm_axi && !commit_edge;
// idle reflects only the axi_clk side (FSM idle and FIFO read side empty).
assign idle = (state == S_IDLE) && fifo_rempty;
always_ff @(posedge axi_clk or negedge axi_rst_n) begin
if (!axi_rst_n) begin
state <= S_IDLE; beat_addr <= '0; beat_data <= '0; beat_strb <= '0; awaddr_q <= '0;
beats_written <= '0; bursts_issued <= '0; bresp_err_count <= '0;
end else begin
unique case (state)
S_IDLE: if (!fifo_rempty && arm_axi && !commit_edge) begin
beat_addr <= fifo_rdata[319:288]; // {addr, data, strb}
beat_data <= fifo_rdata[287:32];
beat_strb <= fifo_rdata[31:0];
awaddr_q <= fb_base_axi + fifo_rdata[319:288]; // latch FULL AW addr from the STABLE base
// canary: write ONLY the offset-0 (top-of-frame) 32-byte line;
// discard every other beat (fifo_rd still pops it this cycle).
if (canary_axi && (fifo_rdata[319:288] != 32'd0))
state <= S_IDLE;
else
state <= S_AW;
end
// bursts_issued counts accepted address handshakes.
S_AW: if (awready) begin
bursts_issued <= bursts_issued + 32'd1;
state <= S_W;
end
// beats_written counts accepted data handshakes.
S_W: if (wready) begin
beats_written <= beats_written + 32'd1;
state <= S_B;
end
// Any non-zero BRESP is counted as an error; the FSM returns to S_IDLE either way.
default: if (bvalid) begin // S_B
if (bresp != 2'b00) bresp_err_count <= bresp_err_count + 32'd1;
state <= S_IDLE;
end
endcase
end
end
endmodule : gs_lpddr_axi_master
+133
View File
@@ -0,0 +1,133 @@
// retroDE_ps2 — gs_lpddr_fb_writer (Ch317)
//
// FIRST LPDDR-backed-framebuffer step: a write sink that takes the GS tile-FLUSH
// pixel stream (PSMCT16, one pixel per emit) and commits it to an LPDDR-style
// framebuffer, modelling the real EMIF AXI4 write path so the addressing / data /
// stride / burst behaviour can be proven in sim before wiring the hard EMIF.
//
// SCOPE (Ch317, deliberately tight — see doc 0010 Ch317):
// * Tile color/Z stay ON-CHIP; texture stays local. ONLY the framebuffer FLUSH
// is redirected here.
// * Address gen is the simple linear `fb_base + (screen_y*pitch + screen_x)*bpp`
// — which the GS already produces on `raster_pixel_fb_addr_q` for PSMCT16
// ((fbp<<11) + (pixel_index<<1)), so we consume that byte address directly.
// * PSMCT16 (2 bytes/pixel) — lower bandwidth, already-proven format.
// * BURSTS: the flush emits a tile-row's 16 pixels at contiguous +2 byte
// addresses, then jumps by `pitch` to the next row. The burst engine COALESCES
// a contiguous +2 run into one burst, capped at MAX_BURST_BYTES (the doc 0008
// 4 KiB-boundary AXI rule). Real per-tile-row burst = 16 beats = 32 bytes.
// * A staging FIFO decouples the 1-pixel/cycle emit from the burst engine and
// surfaces under/overflow — the realistic shape a hard EMIF AXI master needs.
// * Backing memory `fbmem` is byte-addressed and TB-readable for the
// write/readback PROOF (a later rung swaps it for the EMIF AXI master +
// LPDDR scanout). At enable=0 the whole module is inert (no writes, counters 0).
//
// COUNTERS (Codex acceptance — bandwidth/diag): bytes_written, burst_count,
// busy_cycles (engine draining), fifo_overflow/underflow, fifo_occ_max. The TB
// computes effective GB/s off bytes_written / (busy_cycles * clk_period).
// gs_lpddr_fb_writer — simulation model of an LPDDR framebuffer write sink.
//
// Each emitted PSMCT16 pixel {px_addr, px_pix16} is staged in a small FIFO, then drained one
// entry per clock into the byte-addressed `fbmem` (low byte at px_addr, high byte at px_addr+1).
// Bytes whose address falls outside [0, FB_BYTES) are not stored. Diagnostic counters track
// bytes, coalesced bursts, drain cycles, dropped pushes and peak FIFO occupancy.
// With enable = 0 nothing updates (all state simply holds).
module gs_lpddr_fb_writer #(
    parameter int FB_BYTES        = 8192, // backing FB size (64x64 PSMCT16 = 8 KiB)
    parameter int FIFO_DEPTH      = 32,   // pixel staging FIFO depth (power-of-2)
    parameter int MAX_BURST_BYTES = 4096  // longest run counted as a single burst
) (
    input  logic        clk,
    input  logic        rst_n,                // async, active-low
    input  logic        enable,               // LPDDR_FB_ENABLE; 0 -> no state changes
    // GS tile-flush pixel stream (PSMCT16, one pixel per emit)
    input  logic        px_emit,
    input  logic [31:0] px_addr,              // linear FB byte address (raster_pixel_fb_addr_q)
    input  logic [15:0] px_pix16,             // raster_pixel_color_q[15:0]
    // diagnostics / proof
    output logic [31:0] bytes_written,        // +2 per drained pixel (counted even if out of range)
    output logic [31:0] burst_count,          // number of bursts started
    output logic [31:0] busy_cycles,          // cycles in which an entry was drained
    output logic [31:0] fifo_overflow_count,  // pixels dropped because the FIFO was full
    output logic [31:0] fifo_underflow_count, // never incremented here; held at its reset value
    output logic [15:0] fifo_occ              // running MAXIMUM of the FIFO occupancy
);

    localparam int ADDR_W = $clog2(FB_BYTES);
    localparam int PTR_W  = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1;

    // ---- byte-addressed backing framebuffer (the LPDDR model) ----
    logic [7:0] fbmem [0:FB_BYTES-1];

    // ---- staging FIFO of {addr, pix16} ----
    logic [31:0]      fifo_addr [0:FIFO_DEPTH-1];
    logic [15:0]      fifo_pix  [0:FIFO_DEPTH-1];
    logic [PTR_W-1:0] wptr, rptr;
    logic [PTR_W:0]   count; // 0..FIFO_DEPTH (PTR_W+1 bits)

    // count==FIFO_DEPTH sets the top bit (FIFO_DEPTH is a power of 2 == 1<<PTR_W),
    // so count[PTR_W] alone is the full flag. (Do NOT compare against a PTR_W-wide
    // literal — PTR_W'(FIFO_DEPTH) truncates FIFO_DEPTH to 0 and reads empty as full.)
    wire fifo_full  = count[PTR_W];
    wire fifo_empty = (count == '0);

    // ---- burst engine state (coalesce contiguous +2 runs) ----
    logic        in_burst;    // currently extending a burst
    logic [31:0] last_addr;   // last byte address written
    logic [31:0] burst_bytes; // bytes in the current burst so far

    logic        do_push, do_pop;
    logic [31:0] a;           // popped byte address
    logic [15:0] p;           // popped pixel
    logic        contig;

    always_comb begin
        do_push = px_emit && !fifo_full;
        do_pop  = !fifo_empty; // drain one entry/cycle when available
        a       = fifo_addr[rptr];
        p       = fifo_pix [rptr];
        // A pixel extends the open burst when it sits exactly 2 bytes after the previous one
        // and the run stays within MAX_BURST_BYTES.
        // NOTE(review): this caps burst LENGTH only; it does not detect a run that crosses a
        // 4 KiB-aligned address. Confirm that is sufficient for the intended producer.
        contig  = in_burst && (a == last_addr + 32'd2)
                           && (burst_bytes + 32'd2 <= 32'(MAX_BURST_BYTES));
    end

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            wptr <= '0; rptr <= '0; count <= '0;
            in_burst <= 1'b0; last_addr <= '0; burst_bytes <= '0;
            bytes_written <= '0; burst_count <= '0; busy_cycles <= '0;
            fifo_overflow_count <= '0; fifo_underflow_count <= '0; fifo_occ <= '0;
        end else if (enable) begin
            // ---- push side: one flushed pixel per emit ----
            if (px_emit && fifo_full)
                fifo_overflow_count <= fifo_overflow_count + 32'd1; // dropped — proof must show 0
            if (do_push) begin
                fifo_addr[wptr] <= px_addr;
                fifo_pix [wptr] <= px_pix16;
                wptr            <= wptr + PTR_W'(1);
            end

            // ---- drain side: pop one entry/cycle, commit fbmem, coalesce bursts ----
            if (do_pop) begin
                // Low byte: stored when its address is inside the framebuffer.
                if (a < 32'(FB_BYTES))
                    fbmem[a[ADDR_W-1:0]] <= p[7:0];
                // High byte: stored when a+1 is inside the framebuffer. The test is written
                // as a < FB_BYTES-1 so that nothing is added to `a`: the former (a + 1) wrapped
                // to 0 for a == 32'hFFFF_FFFF, passed the guard, and the wrapped index then
                // overwrote fbmem[0].
                if (a < (32'(FB_BYTES) - 32'd1))
                    fbmem[a[ADDR_W-1:0] + 1'b1] <= p[15:8];
                bytes_written <= bytes_written + 32'd2;
                busy_cycles   <= busy_cycles + 32'd1;
                rptr          <= rptr + PTR_W'(1);
                if (contig)
                    burst_bytes <= burst_bytes + 32'd2; // extend current burst
                else begin
                    burst_count <= burst_count + 32'd1; // start a NEW burst
                    burst_bytes <= 32'd2;
                end
                in_burst  <= 1'b1;
                last_addr <= a;
            end else if (in_burst) begin
                in_burst <= 1'b0; // FIFO drained → close burst
            end

            // single count update (push and pop net correctly)
            if (do_push && !do_pop)      count <= count + 1'b1;
            else if (!do_push && do_pop) count <= count - 1'b1;
            // (both or neither → unchanged)

            // Peak tracker: compares the occupancy BEFORE this cycle's update.
            if (16'(count) > fifo_occ) fifo_occ <= 16'(count);

            // fifo_underflow_count: the engine never pops empty (do_pop gated on
            // !fifo_empty), so it stays 0 here — surfaced for the future EMIF rung
            // where an external AXI master could request beyond the staged data.
        end
    end

endmodule : gs_lpddr_fb_writer
+155
View File
@@ -0,0 +1,155 @@
// ============================================================================
// gs_lpddr_rd_arb.sv (Ch320 Brick 2; Ch322 extended 2:1 -> 3:1)
//
// 3:1 AXI4 READ-channel arbiter for the FPGA-private LPDDR4B EMIF user port.
// Lets the Ch320 scanout reader (port 0, priority), the Ch319 HPS read-probe
// (port 1), and the Ch322 texture-cache fill (port 2, lowest priority) share the
// single EMIF read channel. The write channel is arbitrated separately
// (gs_lpddr_wr_arb). Adapted from ao486 axi_fb_arbiter (read half): grant held
// for a whole transaction, watchdog force-release, idle-drain rready so a late
// response can't wedge the bus. All single-clock (emif_clk).
//
// Port 2 (texture fill) is a ONE-SHOT prefill before raster; scanout (port 0)
// keeps priority. Leave s2_* unconnected (arvalid=0) on builds without a texture
// cache — the arbiter is then bit-for-bit the old 2:1 behavior.
//
// Single-beat transactions (ARLEN=0), so a response completes on rvalid&rlast.
// ============================================================================
`timescale 1ns/1ps
module gs_lpddr_rd_arb (
input logic clk,
input logic rst_n,
// ---- Port 0: scanout reader (priority) ----
input logic [29:0] s0_araddr,
input logic [1:0] s0_arburst,
input logic [6:0] s0_arid,
input logic [7:0] s0_arlen,
input logic [2:0] s0_arsize,
input logic s0_arvalid,
output logic s0_arready,
output logic [255:0] s0_rdata,
output logic [1:0] s0_rresp,
output logic s0_rlast,
output logic s0_rvalid,
input logic s0_rready,
// ---- Port 1: HPS read-probe ----
input logic [29:0] s1_araddr,
input logic [1:0] s1_arburst,
input logic [6:0] s1_arid,
input logic [7:0] s1_arlen,
input logic [2:0] s1_arsize,
input logic s1_arvalid,
output logic s1_arready,
output logic [255:0] s1_rdata,
output logic [1:0] s1_rresp,
output logic s1_rlast,
output logic s1_rvalid,
input logic s1_rready,
// ---- Port 2: texture-cache fill (lowest priority; Ch322) ----
input logic [29:0] s2_araddr,
input logic [1:0] s2_arburst,
input logic [6:0] s2_arid,
input logic [7:0] s2_arlen,
input logic [2:0] s2_arsize,
input logic s2_arvalid,
output logic s2_arready,
output logic [255:0] s2_rdata,
output logic [1:0] s2_rresp,
output logic s2_rlast,
output logic s2_rvalid,
input logic s2_rready,
// ---- Port 3: tile-reload fill (Ch323; priority ABOVE probe/texfill, below scanout) ----
input logic [29:0] s3_araddr,
input logic [1:0] s3_arburst,
input logic [6:0] s3_arid,
input logic [7:0] s3_arlen,
input logic [2:0] s3_arsize,
input logic s3_arvalid,
output logic s3_arready,
output logic [255:0] s3_rdata,
output logic [1:0] s3_rresp,
output logic s3_rlast,
output logic s3_rvalid,
input logic s3_rready,
// ---- Master out: EMIF read channel ----
output logic [29:0] m_araddr,
output logic [1:0] m_arburst,
output logic [6:0] m_arid,
output logic [7:0] m_arlen,
output logic [2:0] m_arsize,
output logic m_arvalid,
input logic m_arready,
input logic [255:0] m_rdata,
input logic [1:0] m_rresp,
input logic m_rlast,
input logic m_rvalid,
output logic m_rready
);
// grant: 0=idle, 1=s0 scanout, 2=s1 probe, 3=s2 texfill, 4=s3 tile-reload.
// EXPLICIT priority (Ch323, Codex): scanout > tile_reload > probe > texture_fill — i.e.
// s0 > s3 > s1 > s2. Render-display (scanout) highest; the render-prep tile reload above
// the debug read-probe so a debug read can never starve a render's Z/color reload.
reg [2:0] grant;
// Ch326 — NON-ABORTING ARBITER (Codex). The OLD design force-released the grant on a
// watchdog (was 2^10 ~3.3us) at ANY point in the transaction; when it fired AFTER the AR had
// handshaked, the idle state's m_rready=1 drained the now-orphaned response and the requester
// hung forever (blank HDMI + stuck HPS probe under the always-on-scanout traffic). Once
// m_arvalid && m_arready, the read is COMMITTED and its response BELONGS to that requester —
// there is no AXI-legal way to abandon it. So: the watchdog gates ONLY the pre-AR wait (no
// transaction committed yet — safe to drop); after AR acceptance the grant is held until
// m_rvalid && m_rlast && selected_rready, regardless of how long the read takes.
reg ar_done; // AR handshake captured for the active grant -> never abort past here
reg [21:0] watchdog; // pre-AR only (waiting for m_arready); ~6.7 ms @ 310 MHz dead-bus backstop
wire wd_expired = watchdog[21];
wire sel_rready = (grant==3'd1)?s0_rready:(grant==3'd2)?s1_rready:
(grant==3'd3)?s2_rready:(grant==3'd4)?s3_rready:1'b1;
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
grant <= 3'd0; ar_done <= 1'b0; watchdog <= '0;
end else if (grant == 3'd0) begin
ar_done <= 1'b0; watchdog <= '0;
if (s0_arvalid) grant <= 3'd1; // scanout (highest)
else if (s3_arvalid) grant <= 3'd4; // tile reload (render-prep)
else if (s1_arvalid) grant <= 3'd2; // read probe (debug)
else if (s2_arvalid) grant <= 3'd3; // texture fill (lowest)
end else begin
if (m_arvalid && m_arready) ar_done <= 1'b1; // AR accepted -> COMMITTED
if (m_rvalid && m_rlast && sel_rready) begin
grant <= 3'd0; ar_done <= 1'b0; watchdog <= '0; // response delivered -> release
end else if (!ar_done) begin // still waiting for AR (nothing owed)
if (wd_expired) begin grant <= 3'd0; ar_done <= 1'b0; watchdog <= '0; end
else watchdog <= watchdog + 22'd1;
end
// ar_done && response not yet complete: HOLD the grant, never abort.
end
end
// AR mux
assign m_araddr = (grant==3'd4)?s3_araddr :(grant==3'd3)?s2_araddr :(grant==3'd2)?s1_araddr :s0_araddr;
assign m_arburst = (grant==3'd4)?s3_arburst:(grant==3'd3)?s2_arburst:(grant==3'd2)?s1_arburst:s0_arburst;
assign m_arid = (grant==3'd4)?s3_arid :(grant==3'd3)?s2_arid :(grant==3'd2)?s1_arid :s0_arid;
assign m_arlen = (grant==3'd4)?s3_arlen :(grant==3'd3)?s2_arlen :(grant==3'd2)?s1_arlen :s0_arlen;
assign m_arsize = (grant==3'd4)?s3_arsize :(grant==3'd3)?s2_arsize :(grant==3'd2)?s1_arsize :s0_arsize;
assign m_arvalid = (grant==3'd1)?s0_arvalid:(grant==3'd2)?s1_arvalid:(grant==3'd3)?s2_arvalid:(grant==3'd4)?s3_arvalid:1'b0;
assign s0_arready = (grant==3'd1)?m_arready:1'b0;
assign s1_arready = (grant==3'd2)?m_arready:1'b0;
assign s2_arready = (grant==3'd3)?m_arready:1'b0;
assign s3_arready = (grant==3'd4)?m_arready:1'b0;
// R demux (idle: rready=1 drains any stale/late response)
assign s0_rdata=m_rdata; assign s1_rdata=m_rdata; assign s2_rdata=m_rdata; assign s3_rdata=m_rdata;
assign s0_rresp=m_rresp; assign s1_rresp=m_rresp; assign s2_rresp=m_rresp; assign s3_rresp=m_rresp;
assign s0_rlast=m_rlast; assign s1_rlast=m_rlast; assign s2_rlast=m_rlast; assign s3_rlast=m_rlast;
assign s0_rvalid = (grant==3'd1)?m_rvalid:1'b0;
assign s1_rvalid = (grant==3'd2)?m_rvalid:1'b0;
assign s2_rvalid = (grant==3'd3)?m_rvalid:1'b0;
assign s3_rvalid = (grant==3'd4)?m_rvalid:1'b0;
assign m_rready = (grant==3'd1)?s0_rready:(grant==3'd2)?s1_rready:(grant==3'd3)?s2_rready:(grant==3'd4)?s3_rready:1'b1;
endmodule
+117
View File
@@ -0,0 +1,117 @@
// ============================================================================
// gs_lpddr_rd_probe.sv (Ch319 Brick 3)
//
// HPS-triggered single-word AXI4 READ probe for FPGA-private LPDDR4B.
//
// Lets the HPS read framebuffer bytes back THROUGH THE HPS BRIDGE (never
// /dev/mem) for checksum + screen-dump. Drives the EMIF user port's READ
// channel (AR/R) only — the write channel (AW/W/B) is the GS tile-flush
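// writer (gs_lpddr_axi_master); read and write channels are independent, so
// the reader and the writer share the one EMIF port with no read-vs-write
// arbitration. (Other READ masters, e.g. the LPDDR scanout, share the AR/R
// channel with this probe through an external read arbiter.)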
//
// Runs on axi_clk (= emif_clk, ~310 MHz). The control input `rd_pulse` is a
// TOGGLE in the bridge (design_clk) domain; it is 2-FF synced + edge-detected
// here. The outputs `rd_done` (toggle) + `rd_data` are produced in axi_clk;
// the bridge syncs `rd_done` and latches `rd_data` on its edge (same return-
// path CDC contract as ao486 ao486_hps_bridge ↔ lpddr4b_loader).
//
// One AXI read = one 32-byte (256-bit) beat; the requested 32-bit word is the
// lane selected by addr[4:2] (8 lanes per beat). araddr is 32-byte aligned.
// ============================================================================
module gs_lpddr_rd_probe #(
    parameter ADDR_W = 30
)(
    input  logic              axi_clk,    // emif_clk
    input  logic              axi_rst_n,  // emif_reset_n (EMIF cal-ready); sampled synchronously
    // ---- control / status (rd_pulse is a design_clk-domain toggle) ----
    input  logic              rd_pulse,   // flips each time the HPS requests a read
    input  logic [31:0]       rd_addr,    // byte address (must be stable when rd_pulse flips)
    output logic              rd_done,    // flips (axi_clk) when a read completes
    output logic [31:0]       rd_data,    // selected 32-bit word (stable after rd_done flips)
    output logic              rd_busy,    // high from request acceptance until the R beat lands
    // ---- AXI4 READ channel to the EMIF user port (axi_clk, 256-bit data) ----
    output logic [ADDR_W-1:0] araddr,
    output logic [1:0]        arburst,
    output logic [6:0]        arid,
    output logic [7:0]        arlen,
    output logic [2:0]        arsize,
    output logic              arvalid,
    input  logic              arready,
    input  logic [255:0]      rdata,
    input  logic [1:0]        rresp,
    input  logic              rlast,
    input  logic              rvalid,
    output logic              rready
);
    // Every request is one full-width beat: fixed ID, length 0, 32 bytes, INCR.
    // rresp and rlast are accepted on the port but not examined by this probe.
    assign arid    = 7'd1;     // distinct from the writer (awid = 0)
    assign arlen   = 8'd0;     // single beat
    assign arsize  = 3'b101;   // 32 bytes (full 256-bit width)
    assign arburst = 2'b01;    // INCR

    // rd_pulse comes from design_clk: shift it through three flops and treat any
    // difference between the two oldest stages as "a new request arrived".
    logic [2:0] req_sync;
    wire        req_seen = req_sync[2] ^ req_sync[1];

    // 32-bit lane of the 256-bit beat that the caller asked for (rd_addr[4:2]).
    logic [2:0] lane_q;

    // P_WAIT: idle, waiting for a request.  P_ADDR: AR pending.  P_DATA: R pending.
    typedef enum logic [1:0] { P_WAIT, P_ADDR, P_DATA } probe_state_t;
    probe_state_t ps;

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            ps       <= P_WAIT;
            req_sync <= 3'd0;
            lane_q   <= 3'd0;
            araddr   <= '0;
            arvalid  <= 1'b0;
            rready   <= 1'b0;
            rd_data  <= 32'd0;
            rd_done  <= 1'b0;
            rd_busy  <= 1'b0;
        end else begin
            req_sync <= {req_sync[1:0], rd_pulse};

            if (ps == P_WAIT) begin
                // A request is only noticed while idle.
                if (req_seen) begin
                    araddr  <= {rd_addr[ADDR_W-1:5], 5'd0};   // 32-byte aligned
                    lane_q  <= rd_addr[4:2];
                    arvalid <= 1'b1;
                    rd_busy <= 1'b1;
                    ps      <= P_ADDR;
                end
            end else if (ps == P_ADDR) begin
                // Hold arvalid until the address is taken, then open the R channel.
                if (arready) begin
                    arvalid <= 1'b0;
                    rready  <= 1'b1;
                    ps      <= P_DATA;
                end
            end else if (ps == P_DATA) begin
                // Single beat: capture the requested lane and signal completion.
                if (rvalid) begin
                    rready  <= 1'b0;
                    rd_data <= rdata[lane_q*32 +: 32];
                    rd_busy <= 1'b0;
                    rd_done <= ~rd_done;
                    ps      <= P_WAIT;
                end
            end else begin
                ps <= P_WAIT;
            end
        end
    end
endmodule
+173
View File
@@ -0,0 +1,173 @@
// ============================================================================
// gs_lpddr_scanout.sv (Ch320 Brick 1)
//
// LPDDR4B-backed scanout for the SMALL framebuffer demo (explicitly scoped to
// a tiny frame — default 64x64 PSMCT16 = 8 KiB). Instead of ao486-style line
// buffering/streaming, it copies the WHOLE framebuffer from LPDDR4B into a
// small on-chip cache once per frame, then serves scanout pixels from the
// cache. When the framebuffer grows past "tiny demo", revisit ao486's
// vga_fb_ddr line-buffer approach.
//
// Pixel mapping is automatic: the GS writer mirrors BRAM VRAM byte-for-byte
// into LPDDR4B, so cache[addr] == BRAM_VRAM[addr]. We index the cache by the
// PCRTC's own `vram_read_addr`, so the decoded pixel is identical to the BRAM
// scanout pixel for the same raster position — the video-source mux is seamless
// regardless of swizzle/MAG.
//
// Two clock domains:
// axi_clk (emif_clk, ~310 MHz) — fill the cache from LPDDR4B over AXI4
// video_clk (design_clk) — index the cache by vram_read_addr -> r/g/b
//
// Fill happens on frame_start (vsync) and completes during vblank (256 single
// beats ~ 1 us); scanout reads the stable cache during the active region. The
// read channel is shared with the Ch319 read-probe via an external arbiter.
// ============================================================================
`timescale 1ns/1ps
module gs_lpddr_scanout #(
    parameter [29:0] FB_BASE     = 30'd0,   // LPDDR byte base to FETCH the framebuffer from
    parameter int    CACHE_BEATS = 256,     // 256 * 32 B = 8 KiB = 64x64 PSMCT16
    // Ch324 — the LPDDR fetch base (FB_BASE) and the PCRTC vram_read_addr base can DIFFER:
    // the spill framebuffer lives at COLOR_SPILL_BASE in LPDDR but the PCRTC addresses it
    // BRAM-relative (0-based). VRAM_BASE is the vram_read_addr origin (defaults to FB_BASE
    // for the Ch320/321 mirror case where they coincide).
    parameter [29:0] VRAM_BASE   = FB_BASE,
    // Ch324 — pixel format: 0 = PSMCT16 (RGBA5551, 16 px/beat), 1 = PSMCT32 (ABGR, 8 px/beat).
    parameter bit    PSMCT32     = 1'b0
)(
    // ---- AXI read clock domain (emif_clk) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    input  logic         enable,          // 1 = refill the cache on frame_start
    input  logic         frame_start,     // video-domain pulse/level (vsync); synced internally
    // ---- video clock domain (design_clk) ----
    input  logic         video_clk,
    input  logic [31:0]  vram_read_addr,
    output logic [7:0]   r,
    output logic [7:0]   g,
    output logic [7:0]   b,
    // ---- status (axi_clk domain; the bridge syncs these) ----
    output logic         cache_valid,     // a full frame has been loaded
    output logic [31:0]  rd_beats,        // beats read (cumulative)
    output logic [31:0]  rd_errs,         // non-OKAY read responses (cumulative)
    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,
    input  logic         rvalid,
    output logic         rready
);
    localparam int BEAT_BITS = $clog2(CACHE_BEATS);   // 8 for 256
    localparam int FB_SPAN   = CACHE_BEATS * 32;      // 8192 bytes for 256 beats

    assign arburst = 2'b01;     // INCR
    assign arid    = 7'd2;      // distinct from writer (0) and read-probe (1)
    assign arlen   = 8'd0;      // single beat
    assign arsize  = 3'b101;    // 32 bytes

    // Frame cache: one 256-bit word per 32-byte beat. Written on axi_clk, read on video_clk.
    logic [255:0] cache [0:CACHE_BEATS-1];

    // ---------------- fill side (axi_clk) ----------------
    // frame_start is shifted through three flops; ANY change of the synced value
    // (rising or falling) counts as a frame-start event, but only while F_IDLE.
    logic [2:0] fs_sync;
    wire        fs_edge = (fs_sync[2] != fs_sync[1]);

    // F_IDLE: wait for frame start.  F_AR: address pending.  F_R: data pending.
    typedef enum logic [1:0] { F_IDLE, F_AR, F_R } fstate_t;
    fstate_t fst;
    logic [BEAT_BITS:0] beat;   // 0..CACHE_BEATS (extra bit for the terminal compare)

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            fs_sync <= 3'd0; fst <= F_IDLE; beat <= '0;
            araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
            cache_valid <= 1'b0; rd_beats <= 32'd0; rd_errs <= 32'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], frame_start};
            case (fst)
                F_IDLE: begin
                    if (fs_edge && enable) begin
                        // Start a full refill; the cache is marked invalid until it completes.
                        beat        <= '0;
                        cache_valid <= 1'b0;
                        araddr      <= FB_BASE;
                        arvalid     <= 1'b1;
                        fst         <= F_AR;
                    end
                end
                F_AR: begin
                    if (arready) begin
                        arvalid <= 1'b0;
                        rready  <= 1'b1;
                        fst     <= F_R;
                    end
                end
                F_R: begin
                    if (rvalid) begin
                        cache[beat[BEAT_BITS-1:0]] <= rdata;
                        rready   <= 1'b0;
                        rd_beats <= rd_beats + 32'd1;
                        // Errors are only counted; the (possibly bad) data is still stored.
                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                        if (beat == CACHE_BEATS-1) begin
                            cache_valid <= 1'b1;
                            fst         <= F_IDLE;
                        end else begin
                            // Issue the next single-beat read, 32 bytes further on.
                            beat    <= beat + 1'b1;
                            araddr  <= FB_BASE + (({{(30-BEAT_BITS-1){1'b0}}, (beat + 1'b1)}) << 5);
                            arvalid <= 1'b1;
                            fst     <= F_AR;
                        end
                    end
                end
                default: fst <= F_IDLE;
            endcase
        end
    end

    // ---------------- scanout side (video_clk) ----------------
    // cache_valid is produced on axi_clk. Bring it into video_clk through a 2-FF
    // synchronizer before it gates pixels, so the pixel path never samples a flop
    // from the other clock domain directly. The added two video_clk cycles of delay
    // only shift when the validity gate opens/closes around a refill.
    logic [1:0] cv_sync;
    always_ff @(posedge video_clk) cv_sync <= {cv_sync[0], cache_valid};
    wire cache_valid_v = cv_sync[1];

    // Byte offset of the requested pixel within the framebuffer (vram_read_addr is BRAM-relative
    // = VRAM_BASE-origin; the cache holds the SAME bytes fetched from FB_BASE in LPDDR).
    wire [31:0]          off     = vram_read_addr - {2'b00, VRAM_BASE};
    wire [BEAT_BITS-1:0] beat_ix = off[BEAT_BITS+4 -: BEAT_BITS];  // off>>5, low BEAT_BITS
    wire [3:0]           hw_sel  = off[4:1];                        // PSMCT16: 16 halfwords / beat
    wire [2:0]           w_sel   = off[4:2];                        // PSMCT32:  8 words / beat
    wire                 in_range = cache_valid_v
                                  && (vram_read_addr >= {2'b00, VRAM_BASE})
                                  && (off < FB_SPAN);

    // Registered (sync-read) cache lookup — 1-cycle latency to match the PCRTC's
    // VRAM_SYNC_READ pixel timing so the muxed output aligns with PCRTC de/sync.
    // Split the array-index and the part-select across the register boundary
    // (chained index+part-select in one expr trips iverilog-12).
    logic [255:0] word_q;
    logic [3:0]   hw_q;
    logic [2:0]   w_q;
    logic         in_range_q;
    always_ff @(posedge video_clk) begin
        word_q     <= cache[beat_ix];
        hw_q       <= hw_sel;
        w_q        <= w_sel;
        in_range_q <= in_range;
    end

    wire [15:0] px16_q = word_q[hw_q*16 +: 16];
    wire [31:0] px32_q = word_q[w_q*32 +: 32];

    // PSMCT16 (RGBA5551): R[4:0] G[9:5] B[14:10], 5->8 by bit-replication ({c5,c5[4:2]}).
    wire [4:0] r5 = px16_q[4:0];
    wire [4:0] g5 = px16_q[9:5];
    wire [4:0] b5 = px16_q[14:10];
    wire [7:0] r16 = {r5, r5[4:2]}, g16 = {g5, g5[4:2]}, b16 = {b5, b5[4:2]};
    // PSMCT32 (ABGR8888): R[7:0] G[15:8] B[23:16] (A discarded) — identical decode to gs_pcrtc.
    wire [7:0] r32 = px32_q[7:0], g32 = px32_q[15:8], b32 = px32_q[23:16];

    // Out-of-range or not-yet-valid lookups render black.
    assign r = !in_range_q ? 8'd0 : (PSMCT32 ? r32 : r16);
    assign g = !in_range_q ? 8'd0 : (PSMCT32 ? g32 : g16);
    assign b = !in_range_q ? 8'd0 : (PSMCT32 ? b32 : b16);
endmodule
+213
View File
@@ -0,0 +1,213 @@
// ============================================================================
// gs_lpddr_scanout_lb.sv (Ch321 Brick 2)
//
// LINE-BUFFER LPDDR4B scanout — the architectural successor to the whole-frame
// cache (gs_lpddr_scanout). Instead of mirroring the entire framebuffer in
// on-chip RAM (which defeats the point of putting the FB in LPDDR), this holds
// just TWO scanlines: it displays row L from one buffer while prefetching row
// L+1 into the other. On-chip cost is O(width), not O(width*height).
//
// NARROW SCOPE (Ch321): the 128x128 PSMCT16 demo. The frame is LINEAR (the GS
// writer mirrors the rasterizer's linear flush addresses), display window at
// origin, 1:1 (MAG off) — so the reader serves pixel (col=pixel_x, line=pixel_y)
// directly when inside the window. No general MAG/window handling beyond that.
//
// Two clock domains:
// axi_clk (emif_clk) — AXI4 burst-read one row (ROW_BEATS beats) into a buffer
// video_clk (design) — pixel_x/pixel_y index the active line buffer -> r/g/b
//
// Prefetch scheme: the video side exposes the current display row (synced into
// axi_clk); the axi side free-runs, fetching rows sequentially into the OTHER
// buffer while staying at most one row ahead of the display, and restarts from
// row 0 at frame start. `underflow` flags any pixel read before its row
// finished loading.
// ============================================================================
`timescale 1ns/1ps
module gs_lpddr_scanout_lb #(
parameter [29:0] FB_BASE = 30'd0, // LPDDR byte address of row 0
parameter int STRIDE_BYTES = 256, // PSMCT16 128px*2B=256; PSMCT32 128px*4B=512
parameter int ROW_BEATS = 8, // STRIDE_BYTES / 32 (PSMCT16 128px=8; PSMCT32 128px=16)
parameter int N_ROWS = 128, // number of framebuffer rows served
// Ch327a — PSMCT32 (ABGR8888, 8 px/256-bit beat) vs the original PSMCT16 (RGBA5551,
// 16 px/beat). The Ch326 LPDDR-only spill framebuffer is PSMCT32 @ COLOR_SPILL_BASE, so the
// line-buffer must decode it — NOT a config flip of the Ch321 PSMCT16/FB-at-0 path.
parameter bit PSMCT32 = 1'b0
)(
// ---- AXI read clock domain (emif_clk) ----
input logic axi_clk,
input logic axi_rst_n,
input logic enable, // 1 = active (prefetch + serve); read by BOTH the axi_clk and video_clk logic
// ---- video clock domain (design_clk) ----
input logic video_clk,
input logic frame_start, // vsync pulse/level (synced internally)
input logic [11:0] pixel_x, // raster column (display)
input logic [11:0] pixel_y, // raster line (display)
input logic in_window, // PCRTC displayed-frame window gate
output logic [7:0] r,
output logic [7:0] g,
output logic [7:0] b,
// ---- status (bridge syncs). line_valid and rd_errs are registered on axi_clk;
// ---- underflow is registered on video_clk (see the underflow block at the end). ----
output logic line_valid, // at least one row has been loaded
output logic underflow, // a pixel was read before its row was ready (sticky)
output logic [31:0] rd_errs, // non-OKAY read responses (cumulative)
// ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
output logic [29:0] araddr,
output logic [1:0] arburst,
output logic [6:0] arid,
output logic [7:0] arlen,
output logic [2:0] arsize,
output logic arvalid,
input logic arready,
input logic [255:0] rdata,
input logic [1:0] rresp,
input logic rlast,
input logic rvalid,
output logic rready
);
localparam int RB_BITS = $clog2(ROW_BEATS); // 3 for 8
assign arburst = 2'b01; // INCR
assign arid = 7'd3; // distinct: writer=0, probe=1, frame-cache=2, line-buf=3
assign arlen = 8'd0; // SINGLE-BEAT per read — the only AXI read pattern proven on this
// EMIF (writer/probe/frame-cache all use arlen=0). A multi-beat
// burst (arlen=ROW_BEATS-1) was untested and garbled on hardware.
assign arsize = 3'b101; // 32 bytes
// Two line buffers, ROW_BEATS x 256-bit each (one display row).
// Written on axi_clk (row fill FSM), read on video_clk (pixel lookup).
logic [255:0] lb0 [0:ROW_BEATS-1];
logic [255:0] lb1 [0:ROW_BEATS-1];
// ================= video side (video_clk) =================
// No miss-prone request toggle. The video side just exposes the current
// in-window display row; the axi side free-runs, fetching rows sequentially
// and staying one row ahead (see below). disp_row_v resets on vsync.
logic [$clog2(N_ROWS):0] disp_row_v;
logic [2:0] fs_sync_v;
// Any change (rising or falling) of the shifted frame_start counts as a frame-start event.
wire fs_edge_v = (fs_sync_v[2] != fs_sync_v[1]);
// The buffer holding display line L is L&1 (row L is fetched into L&1). Select
// it DIRECTLY from pixel_y[0] (tracks the current pixel) — a separately-registered
// "disp_buf" lags by one cycle and corrupts col 0 of each line.
wire disp_buf = pixel_y[0];
always_ff @(posedge video_clk) begin
// !enable acts as a synchronous reset for the video-side tracking state.
if (!enable) begin
disp_row_v <= '0; fs_sync_v <= 3'd0;
end else begin
fs_sync_v <= {fs_sync_v[1:0], frame_start};
// Frame start wins over the row update; otherwise track pixel_y while inside the window.
if (fs_edge_v) disp_row_v <= '0;
else if (in_window && (pixel_y < N_ROWS)) disp_row_v <= ($clog2(N_ROWS)+1)'(pixel_y);
end
end
// Registered (sync-read) pixel: pick buffer + beat + within-beat lane from pixel_x.
// PSMCT32: 8 px/256-bit beat -> beat = pixel_x>>3, lane = pixel_x[2:0] (32-bit).
// PSMCT16: 16 px/beat -> beat = pixel_x>>4, lane = pixel_x[3:0] (16-bit).
localparam int PXSH = PSMCT32 ? 3 : 4; // px-per-beat shift
localparam int PX_PER_ROW = PSMCT32 ? (STRIDE_BYTES/4) : (STRIDE_BYTES/2);
wire [RB_BITS-1:0] col_beat = pixel_x[RB_BITS+PXSH-1 -: RB_BITS];
wire [3:0] col_lane = PSMCT32 ? {1'b0, pixel_x[2:0]} : pixel_x[3:0];
// word_q/lane_q/in_q are registered together so the lane select and the
// black-out gate stay aligned with the looked-up word (1 video_clk of latency).
logic [255:0] word_q; logic [3:0] lane_q; logic in_q;
always_ff @(posedge video_clk) begin
word_q <= disp_buf ? lb1[col_beat] : lb0[col_beat];
lane_q <= col_lane;
in_q <= in_window && (pixel_x < PX_PER_ROW) && (pixel_y < N_ROWS);
end
// PSMCT32 ABGR8888 (r=[7:0],g=[15:8],b=[23:16]) — matches gs_lpddr_scanout (frame-cache).
wire [31:0] px32 = word_q[lane_q[2:0]*32 +: 32]; // 3-bit lane: always in-range (0..224)
wire [7:0] r32 = px32[7:0], g32 = px32[15:8], b32 = px32[23:16];
// PSMCT16 RGBA5551 5-bit lanes expanded to 8-bit.
wire [15:0] px16 = word_q[lane_q*16 +: 16];
wire [4:0] r5 = px16[4:0], g5 = px16[9:5], b5 = px16[14:10];
// Pixels outside the window / row width / row count render black.
assign r = !in_q ? 8'd0 : (PSMCT32 ? r32 : {r5, r5[4:2]});
assign g = !in_q ? 8'd0 : (PSMCT32 ? g32 : {g5, g5[4:2]});
assign b = !in_q ? 8'd0 : (PSMCT32 ? b32 : {b5, b5[4:2]});
// ================= axi side (axi_clk) — row fill FSM =================
// free-running prefetcher: fetch rows sequentially, staying <= disp_row+1 ahead.
// disp_row crosses video->axi (slowly-changing; the +1 throttle tolerates a 1-off
// transient). frame_start is edge-detected here to reset next_fetch every frame.
// NOTE(review): disp_row_v is a multi-bit BINARY value passed through per-bit 2-FF
// stages. When several bits change on the same edge (e.g. 0111 -> 1000) a sample can be
// a value that is neither the old nor the new row, i.e. more than "1-off" — confirm the
// throttle tolerates that, or consider a Gray-coded crossing.
logic [2:0] fs_sync_e;
wire fs_edge_e = (fs_sync_e[2] != fs_sync_e[1]);
logic [$clog2(N_ROWS):0] disp_row_s0, disp_row_e;
logic [$clog2(N_ROWS):0] next_fetch; // next row to load (0..N_ROWS)
// L_IDLE: decide whether to start a row.  L_AR: address pending.  L_R: data pending.
typedef enum logic [1:0] { L_IDLE, L_AR, L_R } lstate_t;
lstate_t lst;
logic [$clog2(N_ROWS):0] cur_row; // row currently being fetched (written here, not read elsewhere in this module)
logic cur_buf; // destination buffer for the row in flight: 0 = lb0, 1 = lb1
logic [RB_BITS:0] beat; // beat index within the row in flight
logic fs_pending; // a vsync restart is pending; applied in L_IDLE (never mid-read)
always_ff @(posedge axi_clk) begin
if (!axi_rst_n) begin
fs_sync_e <= 3'd0; disp_row_s0 <= '0; disp_row_e <= '0; next_fetch <= '0;
lst <= L_IDLE; araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
cur_row <= '0; cur_buf <= 1'b0; beat <= '0;
line_valid <= 1'b0; rd_errs <= 32'd0; fs_pending <= 1'b0;
end else begin
fs_sync_e <= {fs_sync_e[1:0], frame_start};
disp_row_s0 <= disp_row_v; // 2-FF sync of the display row
disp_row_e <= disp_row_s0;
// vsync: mark a prefetch restart. DEFER it to L_IDLE so an in-flight AXI
// read is never aborted mid-handshake (which would deadlock the slave).
// This assignment comes BEFORE the case on purpose: if L_IDLE clears fs_pending
// in the same cycle, the later (clearing) non-blocking assignment wins.
if (fs_edge_e) fs_pending <= 1'b1;
case (lst)
L_IDLE: begin
if (fs_pending) begin
next_fetch <= '0; // restart prefetch sequence from row 0
fs_pending <= 1'b0;
end else if (enable && (next_fetch < N_ROWS) && (next_fetch <= disp_row_e + 1'b1)) begin
// Start fetching row next_fetch into buffer (next_fetch & 1).
cur_row <= next_fetch;
cur_buf <= next_fetch[0];
araddr <= FB_BASE + (next_fetch * STRIDE_BYTES);
beat <= '0;
arvalid <= 1'b1;
lst <= L_AR;
end
end
L_AR: begin
// Hold arvalid until accepted, then open the R channel for the single beat.
if (arready) begin
arvalid <= 1'b0;
rready <= 1'b1;
lst <= L_R;
end
end
L_R: begin
if (rvalid) begin
if (cur_buf) lb1[beat[RB_BITS-1:0]] <= rdata;
else lb0[beat[RB_BITS-1:0]] <= rdata;
// Errors are only counted; the beat is still stored.
if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
rready <= 1'b0;
if (beat == ROW_BEATS-1) begin
line_valid <= 1'b1;
next_fetch <= next_fetch + 1'b1; // advance prefetch (rows 0..next_fetch-1 loaded)
lst <= L_IDLE;
end else begin
// next single-beat read of this row (arlen=0 each).
beat <= beat + 1'b1;
araddr <= araddr + 30'd32;
arvalid <= 1'b1;
lst <= L_AR;
end
end
end
default: lst <= L_IDLE;
endcase
end
end
// underflow (sticky, video domain): an in-window pixel for line pixel_y is read
// before that row was prefetched. The axi side loads rows 0..next_fetch-1, so row
// pixel_y is ready iff pixel_y < next_fetch. next_fetch crosses axi->video synced
// (slowly-changing; a 1-off transient is harmless). Resets on vsync.
// NOTE(review): like disp_row, next_fetch is a multi-bit binary value crossed with
// per-bit 2-FF stages, so a mid-transition sample may be off by more than one —
// confirm that a spurious sticky underflow flag is acceptable here.
logic [$clog2(N_ROWS):0] nf_s0, nf_v;
logic underflow_v;
always_ff @(posedge video_clk) begin
nf_s0 <= next_fetch; nf_v <= nf_s0;
if (!enable || fs_edge_v) underflow_v <= 1'b0;
else if (in_window && (pixel_y < N_ROWS) && (($clog2(N_ROWS)+1)'(pixel_y) >= nf_v))
underflow_v <= 1'b1;
end
assign underflow = underflow_v;
endmodule
+179
View File
@@ -0,0 +1,179 @@
// ============================================================================
// gs_lpddr_wr_arb.sv (Ch322 Brick 3; Ch323 extended 2:1 -> 3:1)
//
// 3:1 AXI4 WRITE-channel arbiter for the FPGA-private LPDDR4B EMIF user port.
// The write twin of gs_lpddr_rd_arb. Lets the GS framebuffer writer
// (gs_lpddr_axi_master, port 0, PRIORITY), the Ch323 tile Z-flush writer
// (gs_z_flush_writer, port 2) and the Ch322 HPS write-probe
// (gs_lpddr_wr_probe, port 1) share the single EMIF write channel.
//
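// EXPLICIT priority (Ch323, Codex): FB-writer > Z-writer > wr-probe — i.e.
// s0 > s2 > s1. The module now also has a fourth port (s3); the order implemented
// by the grant logic is s0 > s2 > s1 > s3. The active render's color (FB) and Z
// spill outrank the debug write-probe so a debug write can never starve a render
// flush. Leave s2_* unused with awvalid tied to 0 on builds without a Z writer
// (likewise any other unused port) — a port that never raises awvalid is never
// granted, so the arbiter is then bit-for-bit the behavior of the smaller arbiter.
//
// Per-transaction grant held AW->W->B (single-beat writes, AWLEN=0, so B
// completes one transaction). The watchdog force-release covers ONLY the wait
// before the first AW/W handshake; once a transaction is committed the grant is
// held until its B response is delivered.
// All single-clock (emif_clk).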
// ============================================================================
`timescale 1ns/1ps
module gs_lpddr_wr_arb (
    input  logic         clk,
    input  logic         rst_n,      // asynchronous, active low
    // ---- Port 0: GS framebuffer writer (highest priority) ----
    input  logic [29:0]  s0_awaddr,
    input  logic [1:0]   s0_awburst,
    input  logic [6:0]   s0_awid,
    input  logic [7:0]   s0_awlen,
    input  logic [2:0]   s0_awsize,
    input  logic         s0_awvalid,
    output logic         s0_awready,
    input  logic [255:0] s0_wdata,
    input  logic [31:0]  s0_wstrb,
    input  logic         s0_wlast,
    input  logic         s0_wvalid,
    output logic         s0_wready,
    output logic [1:0]   s0_bresp,
    output logic         s0_bvalid,
    input  logic         s0_bready,
    // ---- Port 1: third priority. The grant table below calls this "color spill"; the
    // ---- port list used to label it "HPS write-probe".
    // ---- NOTE(review): confirm the actual master against the instantiation. ----
    input  logic [29:0]  s1_awaddr,
    input  logic [1:0]   s1_awburst,
    input  logic [6:0]   s1_awid,
    input  logic [7:0]   s1_awlen,
    input  logic [2:0]   s1_awsize,
    input  logic         s1_awvalid,
    output logic         s1_awready,
    input  logic [255:0] s1_wdata,
    input  logic [31:0]  s1_wstrb,
    input  logic         s1_wlast,
    input  logic         s1_wvalid,
    output logic         s1_wready,
    output logic [1:0]   s1_bresp,
    output logic         s1_bvalid,
    input  logic         s1_bready,
    // ---- Port 2: tile Z-flush writer (Ch323; second priority, just below the FB writer) ----
    input  logic [29:0]  s2_awaddr,
    input  logic [1:0]   s2_awburst,
    input  logic [6:0]   s2_awid,
    input  logic [7:0]   s2_awlen,
    input  logic [2:0]   s2_awsize,
    input  logic         s2_awvalid,
    output logic         s2_awready,
    input  logic [255:0] s2_wdata,
    input  logic [31:0]  s2_wstrb,
    input  logic         s2_wlast,
    input  logic         s2_wvalid,
    output logic         s2_wready,
    output logic [1:0]   s2_bresp,
    output logic         s2_bvalid,
    input  logic         s2_bready,
    // ---- Port 3: HPS write-probe (Ch323 diag; LOWEST priority — debug staging) ----
    input  logic [29:0]  s3_awaddr,
    input  logic [1:0]   s3_awburst,
    input  logic [6:0]   s3_awid,
    input  logic [7:0]   s3_awlen,
    input  logic [2:0]   s3_awsize,
    input  logic         s3_awvalid,
    output logic         s3_awready,
    input  logic [255:0] s3_wdata,
    input  logic [31:0]  s3_wstrb,
    input  logic         s3_wlast,
    input  logic         s3_wvalid,
    output logic         s3_wready,
    output logic [1:0]   s3_bresp,
    output logic         s3_bvalid,
    input  logic         s3_bready,
    // ---- Master out: EMIF write channel ----
    output logic [29:0]  m_awaddr,
    output logic [1:0]   m_awburst,
    output logic [6:0]   m_awid,
    output logic [7:0]   m_awlen,
    output logic [2:0]   m_awsize,
    output logic         m_awvalid,
    input  logic         m_awready,
    output logic [255:0] m_wdata,
    output logic [31:0]  m_wstrb,
    output logic         m_wlast,
    output logic         m_wvalid,
    input  logic         m_wready,
    input  logic [1:0]   m_bresp,
    input  logic         m_bvalid,
    output logic         m_bready
);
    // grant: 0=idle, 1=s0 FB writer, 2=s1 color spill, 3=s2 Z spill, 4=s3 HPS write-probe.
    // EXPLICIT priority: FB-writer > Z-spill > color-spill > wr-probe — i.e. s0 > s2 > s1 > s3.
    // Fixed priority, evaluated only while idle; a grant is never pre-empted.
    reg [2:0] grant;

    // Ch326 — NON-ABORTING ARBITER (Codex), same protocol fix as gs_lpddr_rd_arb. Once
    // m_awvalid && m_awready, the write is COMMITTED (the slave will return B); abandoning it on
    // a watchdog would orphan the B / leave the slave mid-write. So the watchdog gates ONLY the
    // pre-AW wait; after AW acceptance the grant is held until m_bvalid && selected_bready. (The
    // FB/spill writers never tripped the old 2^10 watchdog in practice, but the latent bug is the
    // same — fixed for safety.)
    // "committed" = EITHER the AW or a W beat has handshaked. The current writers send AW-then-W
    // so AW sets it first, but tracking either makes this a GENERAL AXI write arbiter that never
    // abandons a transaction regardless of AW/W ordering (Codex audit note).
    reg        aw_done;    // a write beat/addr accepted for the active grant -> never abort past here
    reg [21:0] watchdog;   // pre-commit only; ~6.7 ms @ 310 MHz dead-bus backstop
    wire       wd_expired = watchdog[21];

    // bready of whichever port currently owns the channel (1 when idle).
    wire sel_bready = (grant==3'd1)?s0_bready:(grant==3'd2)?s1_bready:
                      (grant==3'd3)?s2_bready:(grant==3'd4)?s3_bready:1'b1;

    // An AW or W handshake is completing in THIS cycle. aw_done only reflects it on the
    // next edge, so the abort decision must look at commit_now as well: without it, a
    // handshake that lands in the exact cycle the watchdog expires would be followed by
    // a grant release, orphaning a committed transaction (its B would be drained by the
    // idle m_bready=1 and the requester would wait forever). gs_lpddr_rd_arb has the
    // same window on its AR handshake.
    wire commit_now = (m_awvalid && m_awready) || (m_wvalid && m_wready);

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            grant <= 3'd0; aw_done <= 1'b0; watchdog <= '0;
        end else if (grant == 3'd0) begin
            // Idle: clear per-transaction state and pick the highest-priority requester.
            aw_done <= 1'b0; watchdog <= '0;
            if      (s0_awvalid) grant <= 3'd1;   // FB writer (highest)
            else if (s2_awvalid) grant <= 3'd3;   // Z spill (render-flush)
            else if (s1_awvalid) grant <= 3'd2;   // color spill (render-flush)
            else if (s3_awvalid) grant <= 3'd4;   // HPS write-probe (debug, lowest)
        end else begin
            if (commit_now) aw_done <= 1'b1;                  // AW or W accepted -> COMMITTED
            if (m_bvalid && sel_bready) begin
                // B delivered -> release. These later non-blocking assignments override
                // the aw_done set above when both happen in the same cycle.
                grant <= 3'd0; aw_done <= 1'b0; watchdog <= '0;
            end else if (!aw_done && !commit_now) begin       // nothing accepted yet (nothing owed)
                // Dead-bus backstop. Releasing here drops m_awvalid for the idle cycle even
                // though the slave never accepted it; this is tolerated only because the
                // watchdog is a last-resort escape from a bus that is not responding.
                if (wd_expired) begin grant <= 3'd0; aw_done <= 1'b0; watchdog <= '0; end
                else            watchdog <= watchdog + 22'd1;
            end
            // committed && B not yet seen: HOLD the grant, never abort.
        end
    end

    // AW mux (address/attribute fields fall through to s0 when idle; m_awvalid is 0 then).
    assign m_awaddr  = (grant==3'd4)?s3_awaddr :(grant==3'd3)?s2_awaddr :(grant==3'd2)?s1_awaddr :s0_awaddr;
    assign m_awburst = (grant==3'd4)?s3_awburst:(grant==3'd3)?s2_awburst:(grant==3'd2)?s1_awburst:s0_awburst;
    assign m_awid    = (grant==3'd4)?s3_awid   :(grant==3'd3)?s2_awid   :(grant==3'd2)?s1_awid   :s0_awid;
    assign m_awlen   = (grant==3'd4)?s3_awlen  :(grant==3'd3)?s2_awlen  :(grant==3'd2)?s1_awlen  :s0_awlen;
    assign m_awsize  = (grant==3'd4)?s3_awsize :(grant==3'd3)?s2_awsize :(grant==3'd2)?s1_awsize :s0_awsize;
    assign m_awvalid = (grant==3'd1)?s0_awvalid:(grant==3'd2)?s1_awvalid:(grant==3'd3)?s2_awvalid:(grant==3'd4)?s3_awvalid:1'b0;
    assign s0_awready = (grant==3'd1)?m_awready:1'b0;
    assign s1_awready = (grant==3'd2)?m_awready:1'b0;
    assign s2_awready = (grant==3'd3)?m_awready:1'b0;
    assign s3_awready = (grant==3'd4)?m_awready:1'b0;

    // W mux
    assign m_wdata  = (grant==3'd4)?s3_wdata:(grant==3'd3)?s2_wdata:(grant==3'd2)?s1_wdata:s0_wdata;
    assign m_wstrb  = (grant==3'd4)?s3_wstrb:(grant==3'd3)?s2_wstrb:(grant==3'd2)?s1_wstrb:s0_wstrb;
    assign m_wlast  = (grant==3'd4)?s3_wlast:(grant==3'd3)?s2_wlast:(grant==3'd2)?s1_wlast:s0_wlast;
    assign m_wvalid = (grant==3'd1)?s0_wvalid:(grant==3'd2)?s1_wvalid:(grant==3'd3)?s2_wvalid:(grant==3'd4)?s3_wvalid:1'b0;
    assign s0_wready = (grant==3'd1)?m_wready:1'b0;
    assign s1_wready = (grant==3'd2)?m_wready:1'b0;
    assign s2_wready = (grant==3'd3)?m_wready:1'b0;
    assign s3_wready = (grant==3'd4)?m_wready:1'b0;

    // B demux (idle: bready=1 drains any stale/late response)
    assign s0_bresp = m_bresp; assign s1_bresp = m_bresp; assign s2_bresp = m_bresp; assign s3_bresp = m_bresp;
    assign s0_bvalid = (grant==3'd1)?m_bvalid:1'b0;
    assign s1_bvalid = (grant==3'd2)?m_bvalid:1'b0;
    assign s2_bvalid = (grant==3'd3)?m_bvalid:1'b0;
    assign s3_bvalid = (grant==3'd4)?m_bvalid:1'b0;
    assign m_bready  = (grant==3'd1)?s0_bready:(grant==3'd2)?s1_bready:(grant==3'd3)?s2_bready:(grant==3'd4)?s3_bready:1'b1;
endmodule
+136
View File
@@ -0,0 +1,136 @@
// ============================================================================
// gs_lpddr_wr_probe.sv (Ch322 Brick 3)
//
// HPS-bridge-driven LPDDR4B WRITE probe — the missing PS2-side LPDDR loader,
// cloned from retroDE_ao486/rtl/ao486/lpddr4b_loader.sv (write half). The PS2
// core already has the READ half (gs_lpddr_rd_probe); this is its symmetric
// twin so the HPS can STAGE arbitrary words into FPGA-private LPDDR4B from Linux
// (e.g. a known texture for Ch322), then read them back / hash via the existing
// read-probe before the texture cache fills from them.
//
// This is HPS -> bridge registers -> FPGA EMIF write. NOT HPS direct memory
// access, and NOT the retired f2sdram path. The EMIF write channel is shared
// with the GS framebuffer writer through gs_lpddr_wr_arb (FB writer = priority,
// this probe writes only when the writer is idle).
//
// Runs on emif_clk. The bridge pulse/addr/data come from CLOCK2_50 and are
// toggle-synchronized internally (same CDC as lpddr4b_loader / gs_lpddr_rd_probe).
//
// Each wr_pulse triggers ONE single-beat 32-bit write: the 32-bit lane within
// the 256-bit EMIF word is selected by addr[4:2] with the matching WSTRB nibble.
// ============================================================================
`timescale 1ns/1ps
module gs_lpddr_wr_probe (
    input  logic         emif_clk,
    input  logic         emif_rst_n,   // asynchronous, active low
    // ---- control (from HPS bridge, CLOCK2_50 domain) ----
    input  logic         wr_pulse,     // flips once per HPS data-word write
    input  logic [29:0]  wr_addr,      // EMIF byte address (must be stable when wr_pulse flips)
    input  logic [31:0]  wr_data,      // data word (must be stable when wr_pulse flips)
    input  logic         full_beat,    // Ch323 diag: write ALL 8 lanes (wstrb=0xFFFFFFFF) — tests
                                       // full-width commit through THIS arbiter/profile path
    // ---- status (emif_clk domain; bridge syncs) ----
    output logic         busy,         // high from request acceptance until the B response
    output logic         done_toggle,  // flips on each completed write
    output logic [31:0]  bresp_errs,   // count of non-OKAY write responses
    // ---- AXI4 write channel to the EMIF user port (emif_clk, 256-bit) ----
    output logic [29:0]  awaddr,
    output logic [1:0]   awburst,
    output logic [6:0]   awid,
    output logic [7:0]   awlen,
    output logic [2:0]   awsize,
    output logic         awvalid,
    input  logic         awready,
    output logic [255:0] wdata,
    output logic [31:0]  wstrb,
    output logic         wlast,
    output logic         wvalid,
    input  logic         wready,
    input  logic [1:0]   bresp,
    input  logic         bvalid,
    output logic         bready
);
    // Fixed write attributes: one full-width INCR beat; responses are always accepted.
    assign awid    = 7'd5;     // distinct id: fb-writer/probe ids elsewhere; 5 = wr-probe
    assign awlen   = 8'd0;     // single beat
    assign awsize  = 3'b101;   // 32 bytes (full 256-bit bus)
    assign awburst = 2'b01;    // INCR
    assign bready  = 1'b1;

    // wr_pulse crosses CLOCK2_50 -> emif_clk (same scheme as lpddr4b_loader): shift it
    // through three flops; a difference between the two oldest stages marks a request.
    reg [2:0] req_sync;
    wire      req_seen = req_sync[2] ^ req_sync[1];

    // Address/data captured when the request is accepted.
    reg [29:0] addr_q;
    reg [31:0] data_q;

    // Sequence per write: address phase, data phase, response phase.
    typedef enum logic [1:0] { ST_IDLE, ST_ADDR, ST_DATA, ST_RESP } probe_state_t;
    probe_state_t state;

    always_ff @(posedge emif_clk or negedge emif_rst_n) begin
        if (!emif_rst_n) begin
            state       <= ST_IDLE;
            req_sync    <= 3'd0;
            addr_q      <= 30'd0;
            data_q      <= 32'd0;
            awaddr      <= 30'd0;
            awvalid     <= 1'b0;
            wdata       <= 256'd0;
            wstrb       <= 32'd0;
            wlast       <= 1'b0;
            wvalid      <= 1'b0;
            busy        <= 1'b0;
            done_toggle <= 1'b0;
            bresp_errs  <= 32'd0;
        end else begin
            req_sync <= {req_sync[1:0], wr_pulse};

            if (state == ST_IDLE) begin
                // Requests are only noticed while idle; busy follows acceptance.
                busy <= req_seen;
                if (req_seen) begin
                    addr_q  <= wr_addr;
                    data_q  <= wr_data;
                    awaddr  <= {wr_addr[29:5], 5'd0};   // 32-byte aligned beat
                    awvalid <= 1'b1;
                    state   <= ST_ADDR;
                end
            end else if (state == ST_ADDR) begin
                if (awready) begin
                    awvalid <= 1'b0;
                    if (full_beat) begin
                        // diag: replicate the word across all 8 lanes, full WSTRB.
                        wdata <= {8{data_q}};
                        wstrb <= 32'hFFFF_FFFF;
                    end else begin
                        // Place the word in lane addr[4:2] (32 data bits / 4 strobe bits
                        // per lane); every other lane is zero with its strobes off.
                        wdata <= {224'd0, data_q} << {addr_q[4:2], 5'd0};
                        wstrb <= 32'h0000_000F    << {addr_q[4:2], 2'd0};
                    end
                    wlast  <= 1'b1;
                    wvalid <= 1'b1;
                    state  <= ST_DATA;
                end
            end else if (state == ST_DATA) begin
                if (wready) begin
                    wvalid <= 1'b0;
                    wlast  <= 1'b0;
                    state  <= ST_RESP;
                end
            end else if (state == ST_RESP) begin
                if (bvalid) begin
                    // Non-OKAY responses are counted; the write still reports done.
                    if (bresp != 2'b00) bresp_errs <= bresp_errs + 32'd1;
                    busy        <= 1'b0;
                    done_toggle <= ~done_toggle;
                    state       <= ST_IDLE;
                end
            end else begin
                state <= ST_IDLE;
            end
        end
    end
endmodule
+847
View File
@@ -0,0 +1,847 @@
// retroDE_ps2 — gs_pcrtc_stub (Ch90)
//
// Minimal PCRTC (Programmable CRT Controller) scanout engine.
// Real PS2 PCRTC reads VRAM via a DISPFB (display framebuffer)
// configuration register and feeds the analog video DAC. This
// stub is the SCANOUT side of the GS pipeline — its dual is
// gs_stub, which is the WRITE side. Together they close the loop
// from `raster_pixel_emit` (Ch88) → vram_stub (Ch89) → visible
// pixels (Ch90).
//
// Architectural note. `platform_video_stub` is a flood-fill video
// adapter that always paints BGCOLOR within its active area —
// it predates VRAM persistence and stays as-is for back-compat.
// `gs_pcrtc_stub` is the SCANOUT-AWARE alternative, used by TBs
// that want to verify the round trip "gs_stub writes a pixel →
// vram_stub stores it → pcrtc reads it back as video." We did
// not extend platform_video_stub (which would have rippled
// through 6 existing TBs); pcrtc is a parallel module that owns
// its own raster timing AND vram read addressing, so a TB picks
// the one that fits.
//
// Scope:
// - Single DISPFB context: pcrtc consumes `pmode_q` and
// `dispfb1_q` directly from gs_stub's privileged CPU MMIO
// latches (Ch91). The Ch90 sideband ports
// (scanout_enable / dispfb_fbp / dispfb_fbw) are gone — TBs
// drive scanout configuration the way a real driver would,
// by writing PMODE and DISPFB1 through the gs_stub.reg_wr_*
// port. This means `wait (raster_done); write PMODE.EN1=1`
// is the canonical sequence, not a sideband poke.
// - Addressing: linear by DEFAULT — fb_addr math mirrors
// gs_stub's pixel fb_addr math byte-exactly so a pixel
// written at (x,y) reads back at (x,y) without swizzle
// reconciliation. Four OPTIONAL per-PSM swizzle paths gated
// by parameters: `PSMCT32_SWIZZLE=1` (Ch120) routes PSMCT32
// reads through gs_swizzle_psmct32_stub; `PSMCT16_SWIZZLE=1`
// (Ch126) routes PSMCT16 reads through gs_swizzle_psmct16_stub;
// `PSMT8_SWIZZLE=1` (Ch132) routes PSMT8 reads through
// gs_swizzle_psmt8_stub (page=128×64 px, bw_pg=FBW>>1 — FBW
// must be even for PSMT8); `PSMT4_SWIZZLE=1` (Ch138) routes
// PSMT4 reads through gs_swizzle_psmt4_stub (page=128×128 px,
// bw_pg=FBW>>1 — FBW must be even for PSMT4; module also
// outputs nibble_hi selector since PSMT4 packs 2 pixels/byte).
// The four parameters are independent. All four defaults are
// 0 → existing TBs see legacy linear behavior.
// - PSMCT32 (PSM=0), PSMCT16 (PSM=2), PSMT8 (PSM=0x13), and
// PSMT4 (PSM=0x14) are honored at this scope. Any other
// PSM forces scanout off rather than mis-decoding the byte
// layout. PSMCT16 reads 2 bytes/pixel and unpacks RGB5A1 →
// RGB888 via bit-replicate. PSMT8 reads 1 byte/pixel and
// PSMT4 reads 4 bits/pixel (2 pixels/byte, low nibble =
// even pixel). For PSMT8 / PSMT4, with `clut_enable=1` the
// index is looked up in clut_stub for real RGB; with
// `clut_enable=0`, the index/nibble surfaces as grayscale.
// gs_stub's raster channel emits PSMCT32 + PSMCT16 (Ch95) +
// PSMT8 (Ch105) + PSMT4 (Ch106). CLUT contents come from a
// TB-direct write OR from a VRAM→CLUT load triggered by
// TEX0_1.CLD via clut_loader_stub (Ch99..Ch102).
// - Single CRTC: one display, one DISPFB context. Real PS2 has
// two (DISPFB1/DISPLAY1 and DISPFB2/DISPLAY2) for interlace/
// merge. The PMODE.EN2 + DISPFB2/DISPLAY2 path is deferred.
// - DISPLAY1 DX/DY/DW/DH ARE honored (Ch92): they define the
// display window inside the active area. Outside the window,
// pcrtc emits 0 for r/g/b even with scanout_enable=1.
// MAGH/MAGV ARE honored (Ch93): each VRAM column shows for
// (MAGH+1) consecutive VCK pulses before advancing, and each
// VRAM line shows for (MAGV+1) raster lines. Practically,
// a 4-pixel-wide VRAM sprite with MAGH=1 (2×) appears 8
// pixels wide on screen. The H/V totals still come from
// module parameters at instantiation. Real PS2 driver-
// equivalent bring-up is now "configure DISPFB1 → configure
// DISPLAY1 → render → set PMODE.EN1=1." Note: DISPLAY1=0
// (post-reset default) means a 1×1 window at (0,0); a TB
// MUST configure DISPLAY1 for anything visible to scan out.
// - When scanout_enable
// (= PMODE.EN1 & (PSMCT32 || PSMCT16 || PSMT8 || PSMT4))
// is 0, r/g/b output is forced to 0 across the active area.
// There's no BGCOLOR fallback in this module — that lives in
// platform_video_stub.
//
// Trace payload: one EV_MODE pulse per completed frame, mirroring
// platform_video_stub's schema (arg0=frame_count, arg1=H*V).
// PLAT MODE arg0=frame_number arg1=pixels_per_frame arg2=- arg3=-
`timescale 1ns/1ps
module gs_pcrtc_stub
import trace_pkg::*;
#(
// Horizontal timing (in pixel clocks). Defaults match
// platform_video_stub's tiny-TB convention.
parameter int H_ACTIVE = 16,
parameter int H_FRONT = 2,
parameter int H_SYNC = 4,
parameter int H_BACK = 2,
// Vertical timing (in lines)
parameter int V_ACTIVE = 8,
parameter int V_FRONT = 1,
parameter int V_SYNC = 1,
parameter int V_BACK = 1,
parameter bit HSYNC_ACTIVE_LOW = 1'b1,
parameter bit VSYNC_ACTIVE_LOW = 1'b1,
// Ch120 — when set, PSMCT32 scanout reads VRAM via the real PS2
// GS page/block swizzle (gs_swizzle_psmct32_stub) instead of the
// legacy linear `(FBW*64*y + x)*4` formula. PSMCT16 / PSMT8 / PSMT4
// are governed by their own gates (PSMCT16_SWIZZLE Ch126,
// PSMT8_SWIZZLE Ch132, PSMT4_SWIZZLE Ch138 — see below).
// Default 0 keeps every existing PSMCT32 scanout TB on the
// original linear addressing.
parameter bit PSMCT32_SWIZZLE = 1'b0,
// Ch126 — when set, PSMCT16 scanout reads VRAM via the real PS2
// GS page/block/column swizzle (gs_swizzle_psmct16_stub) instead
// of the legacy linear `(FBW*64*y + x)*2` formula. PSMCT32 / PSMT8
// / PSMT4 are governed by their own gates (PSMCT32_SWIZZLE /
// PSMT8_SWIZZLE / PSMT4_SWIZZLE). Default 0 keeps every existing
// PSMCT16 scanout TB (Ch94 PSM-aware, Ch95 raster, Ch103 PSMT4-
// via-CT16-CLUT, etc.) on the original linear addressing.
parameter bit PSMCT16_SWIZZLE = 1'b0,
// Ch132 — when set, PSMT8 scanout reads VRAM via the real PS2 GS
// page/block/column swizzle (gs_swizzle_psmt8_stub) instead of
// the legacy linear `FBW*64*y + x` formula. PSMT8 pages are 128
// px wide (vs 64 px for CT32/CT16) so the swizzle internally uses
// bw_pg = FBW>>1 — PCSX2 asserts FBW must be even for PSMT8.
// Default 0 keeps every existing PSMT8 scanout TB (Ch96, Ch97,
// Ch103 PSMT4-via-CT16-CLUT, Ch107 PSMT4-e2e palette path, etc.)
// on the original linear addressing. PSMCT32 / PSMCT16 / PSMT4
// are governed by their own gates or stay linear.
parameter bit PSMT8_SWIZZLE = 1'b0,
// Ch138 — when set, PSMT4 scanout reads VRAM via the real PS2 GS
// page/block/column swizzle (gs_swizzle_psmt4_stub) instead of
// the legacy linear `byte_offset = pixel_index >> 1` formula.
// PSMT4 pixels are 4 bits each (2 pixels per byte); the swizzle
// module outputs both an absolute byte address AND a `nibble_hi`
// selector that picks the high or low nibble of the byte at
// that address. PSMT4 pages are 128 px wide (same as PSMT8) so
// the swizzle internally uses bw_pg = FBW>>1 — PCSX2 asserts
// FBW must be even for PSMT4. The grayscale + CLUT lookup paths
// BOTH use the same swizzle output: the byte at `addr` is read
// from VRAM, and `nibble_hi` (instead of pixel_index[0]) picks
// which nibble. Default 0 keeps every existing PSMT4 scanout TB
// (Ch103 PSMT4+CLUT, Ch104 PSMT4 round-trip, Ch107 PSMT4 e2e,
// etc.) on the original linear addressing. PSMCT32 / PSMCT16 /
// PSMT8 are governed by their own gates.
parameter bit PSMT4_SWIZZLE = 1'b0,
// Ch158 — when set, the data-decode + sync-output pipeline is
// delayed by 1 cycle so it aligns with a sync-read VRAM (e.g.
// `vram_bram_stub`, Ch154) whose `read_data` is registered.
// The address-driving stage (`vram_read_addr`) keeps using the
// current `(hcnt, vcnt)` so the read is issued one pixel
// "ahead"; the registered `vram_read_data` returns a cycle
// later, and the decode comb consumes the matching delayed
// counter view via the `*_dec` signals.
//
// Default 0 preserves the legacy combinational-read behavior
// every existing PCRTC TB (Ch90+ scanout TBs) is written
// against — those TBs drive `vram_read_data` via legacy
// `vram_stub` (comb read) and consume r/g/b on the same
// cycle as the addr drive. Set to 1 in the BRAM wrapper /
// board top once `vram_bram_stub` is the storage.
parameter bit VRAM_SYNC_READ = 1'b0,
// Ch163 — bypass the magnification dividers
// `vram_x_unshift = hwin_rel / hmag_factor` and the matching y
// form when the demo locks `MAGH = MAGV = 0`. Quartus infers a
// 32-bit hardware divider from the `/` operators above (the
// Ch162 STA worst path after STRIP_HW_DIVIDER closed the EE-
// core divider). For demos that never write MAGH/MAGV non-zero
// — which includes the PSMCT32 raster demo and every other
// hardware-target wrapper today — the divisors are constant 1
// and the math collapses to a passthrough.
//
// Default 0 keeps the existing divider math live so every
// Ch93-era scanout MAG TB stays green (the TBs that drive
// MAGH != 0 / MAGV != 0 such as `tb_gs_scanout_magh_magv`
// continue to use the default).
//
// When 1, `vram_x_unshift = hwin_rel` / `vram_y_unshift =
// vwin_rel` — equivalent to the MAGH=MAGV=0 case but without
// the divider. The hardware-demo path forwards this parameter
// through `top_psmct32_raster_demo_bram` and the DE25-Nano
// board top sets it to 1'b1.
parameter bit STRIP_PCRTC_MAG_DIV = 1'b0
) (
input logic clk,
input logic rst_n,
// Ch91/Ch92/Ch93/Ch94/Ch96/Ch103 — PMODE + DISPFB1 + DISPLAY1
// latches from gs_stub's privileged CPU MMIO port.
// EN1 (PMODE bit 0) gates scanout. DISPFB1 carries the
// framebuffer base / width / PSM the PCRTC reads from
// (PSMCT32, PSMCT16, PSMT8, and PSMT4 honored at this scope;
// any other PSM forces scanout off). DISPLAY1 carries the
// display window: DX/DY = origin within the active area;
// DW/DH = width/height MINUS one (real PS2 semantics).
// MAGH/MAGV (Ch93) scale the window-relative coordinate so
// each VRAM column/line repeats for (MAGH+1)/(MAGV+1)
// displayed pulses/lines; pcrtc still takes H/V TOTALS from
// module parameters at instantiation, not from registers.
input logic [63:0] pmode_q,
input logic [63:0] dispfb1_q,
input logic [63:0] display1_q,
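// VRAM read port. With VRAM_SYNC_READ=0 (default) this is a
// combinational read from vram_stub: data is consumed in the same
// cycle the address is driven. With VRAM_SYNC_READ=1 the data is
// expected one cycle after the address (registered read, e.g.
// vram_bram_stub).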
output logic [31:0] vram_read_addr,
input logic [31:0] vram_read_data,
// Ch97/Ch103 — CLUT (palette) read port for indexed-color scanout.
// When `clut_enable` is high AND the active PSM is PSMT8 or
// PSMT4, pcrtc presents `clut_read_idx = index + (clut_csa << 4)`
// (index = the PSMT8 byte, or the zero-extended PSMT4 nibble)
// and decodes the returned PSMCT32 RGB entry instead of the
// grayscale fallback. CSM is implicitly CSM2 (linear). CSA
// shifts the lookup window in 16-entry increments and wraps
// mod 256. When `clut_enable` is low, the CLUT is bypassed and
// PSMT8 / PSMT4 still scan out as grayscale (Ch96 default).
input logic clut_enable,
input logic [4:0] clut_csa,
output logic [7:0] clut_read_idx,
input logic [31:0] clut_read_data,
// Video out
output logic hsync,
output logic vsync,
output logic de,
output logic [7:0] r,
output logic [7:0] g,
output logic [7:0] b,
// Ch320 — high exactly when this scanout pixel is inside the displayed frame
// (scanout enabled AND within the DX/DY/DW/DH display window). Aligned to r/g/b.
// An LPDDR4B scanout reader gates its pixels by this so it shows ONE frame, not
// a tiled fill of the whole active line.
output logic pix_window_o,
// Trace
output logic ev_valid,
output subsys_e ev_subsys,
output event_e ev_event,
output logic [63:0] ev_arg0,
output logic [63:0] ev_arg1,
output logic [63:0] ev_arg2,
output logic [63:0] ev_arg3,
output logic [31:0] ev_flags
);
localparam int H_TOTAL = H_ACTIVE + H_FRONT + H_SYNC + H_BACK;
localparam int V_TOTAL = V_ACTIVE + V_FRONT + V_SYNC + V_BACK;
localparam int H_SYNC_START = H_ACTIVE + H_FRONT;
localparam int H_SYNC_END = H_SYNC_START + H_SYNC;
localparam int V_SYNC_START = V_ACTIVE + V_FRONT;
localparam int V_SYNC_END = V_SYNC_START + V_SYNC;
localparam int HCNT_W = $clog2(H_TOTAL);
localparam int VCNT_W = $clog2(V_TOTAL);
logic [HCNT_W-1:0] hcnt;
logic [VCNT_W-1:0] vcnt;
logic end_of_line;
logic end_of_frame;
assign end_of_line = (hcnt == HCNT_W'(H_TOTAL - 1));
assign end_of_frame = end_of_line && (vcnt == VCNT_W'(V_TOTAL - 1));
always_ff @(posedge clk) begin
if (!rst_n) begin
hcnt <= '0;
vcnt <= '0;
end else if (end_of_line) begin
hcnt <= '0;
vcnt <= end_of_frame ? '0 : (vcnt + VCNT_W'(1));
end else begin
hcnt <= hcnt + HCNT_W'(1);
end
end
logic active_h;
logic active_v;
logic in_hsync;
logic in_vsync;
assign active_h = (hcnt < HCNT_W'(H_ACTIVE));
assign active_v = (vcnt < VCNT_W'(V_ACTIVE));
assign in_hsync = (hcnt >= HCNT_W'(H_SYNC_START)) && (hcnt < HCNT_W'(H_SYNC_END));
assign in_vsync = (vcnt >= VCNT_W'(V_SYNC_START)) && (vcnt < VCNT_W'(V_SYNC_END));
// ------------------------------------------------------------------
// Ch158 — decode-stage pipeline. When `VRAM_SYNC_READ=1`, every
// hcnt/vcnt-derived signal that the data-decode stage consumes
// is delayed by 1 cycle so it lines up with `vram_bram_stub`'s
// 1-cycle-late `vram_read_data`. The address-side
// (`vram_read_addr`) keeps using the current `hcnt`/`vcnt` so the
// read is issued one pixel "ahead".
//
// The registers below always exist (zero-cost in sim, optimized
// away when unreached in synthesis); the `*_dec` muxes select
// between the registered view (sync) and the live signal
// (legacy comb-read passthrough).
// ------------------------------------------------------------------
logic in_hsync_q, in_vsync_q;
logic active_h_q, active_v_q;
logic in_display_window_q, scanout_enable_q;
logic dispfb_psm_ct32_q, dispfb_psm_ct16_q, dispfb_psm_t8_q, dispfb_psm_t4_q;
logic psm4_nibble_select_q;
logic end_of_frame_q;
logic in_hsync_dec, in_vsync_dec;
logic active_h_dec, active_v_dec;
logic in_display_window_dec, scanout_enable_dec;
logic dispfb_psm_ct32_dec, dispfb_psm_ct16_dec, dispfb_psm_t8_dec, dispfb_psm_t4_dec;
logic psm4_nibble_select_dec;
logic end_of_frame_dec;
// psm4_nibble_select / dispfb_psm_* / scanout_enable /
// in_display_window are forward-referenced — they are declared
// and assigned later in the file (after the address/decode
// logic that produces them). NOTE: IEEE 1800 requires a variable
// to be declared before any simple (non-hierarchical) reference
// to it, so this ordering depends on tool leniency; if a stricter
// compiler rejects it, hoist those declarations above this point.
// The registers below capture them at every posedge.
always_ff @(posedge clk) begin
if (!rst_n) begin
in_hsync_q <= 1'b0;
in_vsync_q <= 1'b0;
active_h_q <= 1'b0;
active_v_q <= 1'b0;
in_display_window_q <= 1'b0;
scanout_enable_q <= 1'b0;
dispfb_psm_ct32_q <= 1'b0;
dispfb_psm_ct16_q <= 1'b0;
dispfb_psm_t8_q <= 1'b0;
dispfb_psm_t4_q <= 1'b0;
psm4_nibble_select_q <= 1'b0;
end_of_frame_q <= 1'b0;
end else begin
in_hsync_q <= in_hsync;
in_vsync_q <= in_vsync;
active_h_q <= active_h;
active_v_q <= active_v;
in_display_window_q <= in_display_window;
scanout_enable_q <= scanout_enable;
dispfb_psm_ct32_q <= dispfb_psm_ct32;
dispfb_psm_ct16_q <= dispfb_psm_ct16;
dispfb_psm_t8_q <= dispfb_psm_t8;
dispfb_psm_t4_q <= dispfb_psm_t4;
psm4_nibble_select_q <= psm4_nibble_select;
end_of_frame_q <= end_of_frame;
end
end
assign in_hsync_dec = VRAM_SYNC_READ ? in_hsync_q : in_hsync;
assign in_vsync_dec = VRAM_SYNC_READ ? in_vsync_q : in_vsync;
assign active_h_dec = VRAM_SYNC_READ ? active_h_q : active_h;
assign active_v_dec = VRAM_SYNC_READ ? active_v_q : active_v;
assign in_display_window_dec = VRAM_SYNC_READ ? in_display_window_q : in_display_window;
assign scanout_enable_dec = VRAM_SYNC_READ ? scanout_enable_q : scanout_enable;
// Ch320 — same gate the r/g/b output uses (line ~"if (de && scanout_enable_dec &&
// in_display_window_dec)"), minus de (the HDMI path applies de). Lets an external
// LPDDR4B scanout reader blank outside the displayed frame, matching BRAM scanout.
assign pix_window_o = scanout_enable_dec && in_display_window_dec;
assign dispfb_psm_ct32_dec = VRAM_SYNC_READ ? dispfb_psm_ct32_q : dispfb_psm_ct32;
assign dispfb_psm_ct16_dec = VRAM_SYNC_READ ? dispfb_psm_ct16_q : dispfb_psm_ct16;
assign dispfb_psm_t8_dec = VRAM_SYNC_READ ? dispfb_psm_t8_q : dispfb_psm_t8;
assign dispfb_psm_t4_dec = VRAM_SYNC_READ ? dispfb_psm_t4_q : dispfb_psm_t4;
assign psm4_nibble_select_dec = VRAM_SYNC_READ ? psm4_nibble_select_q : psm4_nibble_select;
assign end_of_frame_dec = VRAM_SYNC_READ ? end_of_frame_q : end_of_frame;
assign hsync = HSYNC_ACTIVE_LOW ? ~in_hsync_dec : in_hsync_dec;
assign vsync = VSYNC_ACTIVE_LOW ? ~in_vsync_dec : in_vsync_dec;
assign de = active_h_dec && active_v_dec;
// ------------------------------------------------------------------
// VRAM addressing. Mirror gs_stub's fb_addr math byte-exactly
// so written-then-scanned pixels round-trip without
// reconciliation:
// fbp_bytes = dispfb_fbp << 11 (FBP * 2048)
// pixels_per_row = dispfb_fbw << 6 (FBW * 64)
// effective_x = (hcnt - DX) / (MAGH+1) + DBX (Ch92/Ch93)
// effective_y = (vcnt - DY) / (MAGV+1) + DBY
// pixel_index = effective_y * pixels_per_row + effective_x
// byte_offset = pixel_index << dispfb_bpp_shift
// fb_addr = fbp_bytes + byte_offset
// dispfb_bpp_shift is now PSM-aware (Ch94/Ch96): 2 for
// PSMCT32, 1 for PSMCT16, 0 for PSMT8. PSMT4 (Ch103) does not
// use the shift: its byte_offset is pixel_index >> 1. Other
// PSMs force scanout off rather than mis-decoding bytes.
// ------------------------------------------------------------------
// Decode DISPFB1 sub-fields per real PS2 GS register layout
// (PCSX2 GSRegs.h — DISPFB structure):
// FBP : [8:0] base address in 2048-byte units
// FBW : [14:9] width in 64-pixel units
// PSM : [19:15] pixel storage mode (honored: PSMCT32=0x00,
// PSMCT16=0x02, PSMT8=0x13, PSMT4=0x14)
// DBX : [42:32] display-buffer X origin (Ch91-audit fix)
// DBY : [53:43] display-buffer Y origin (Ch91-audit fix)
//
// DBX/DBY shift the scanout's VRAM origin: the pixel that
// appears at (hcnt=0, vcnt=0) is VRAM (DBX, DBY), not (0, 0).
// Useful for double-buffered framebuffers and offset display
// windows.
logic [8:0] dispfb_fbp;
logic [5:0] dispfb_fbw;
logic [4:0] dispfb_psm;
logic [10:0] dispfb_dbx;
logic [10:0] dispfb_dby;
logic dispfb_psm_ok;
logic pmode_en1;
logic scanout_enable;
assign dispfb_fbp = dispfb1_q[8:0];
assign dispfb_fbw = dispfb1_q[14:9];
assign dispfb_psm = dispfb1_q[19:15];
assign dispfb_dbx = dispfb1_q[42:32];
assign dispfb_dby = dispfb1_q[53:43];
// Ch94/Ch96/Ch97/Ch103 — scanout PSM awareness. Four formats:
// PSMCT32 (5'h00) — 4 bytes/pixel, byte order {A,B,G,R}.
// PSMCT16 (5'h02) — 2 bytes/pixel, RGB5A1 packed:
// R[4:0] G[9:5] B[14:10] A[15].
// PSMT8 (5'h13) — 1 byte/pixel, 8-bit index.
// PSMT4 (5'h14) — 4 bits/pixel = 2 pixels/byte. Byte
// offset = pixel_index >> 1; nibble
// selector = pixel_index[0] (low =
// even, high = odd). The 4-bit nibble
// zero-extends to an 8-bit CLUT index;
// CSA picks the 16-entry palette window.
// For PSMT8/PSMT4, with `clut_enable=1` pcrtc looks up
// CLUT[idx + (CSA << 4)] in the external clut_stub for real
// RGB. With `clut_enable=0`, the index/nibble surfaces as
// grayscale (8-bit replication for PSMT8, 4→8 bit-replicate
// for PSMT4) so the storage lane stays visually verifiable
// without programming a palette.
// 5→8 expansion (PSMCT16) uses bit-replicate ({r5, r5[4:2]}),
// matching PCSX2. Other PSMs still disable scanout rather
// than mis-decode bytes; PSMCT24/PSMCT16S/PSMZ32/etc. force
// scanout off here.
logic dispfb_psm_ct32;
logic dispfb_psm_ct16;
logic dispfb_psm_t8;
logic dispfb_psm_t4;
logic [1:0] dispfb_bpp_shift;
assign dispfb_psm_ct32 = (dispfb_psm == 5'h00);
assign dispfb_psm_ct16 = (dispfb_psm == 5'h02);
assign dispfb_psm_t8 = (dispfb_psm == 5'h13);
assign dispfb_psm_t4 = (dispfb_psm == 5'h14);
assign dispfb_psm_ok = dispfb_psm_ct32 | dispfb_psm_ct16
| dispfb_psm_t8 | dispfb_psm_t4;
assign dispfb_bpp_shift = dispfb_psm_ct32 ? 2'd2 : // 4 bytes/pixel
dispfb_psm_ct16 ? 2'd1 : // 2 bytes/pixel
dispfb_psm_t8 ? 2'd0 : // 1 byte/pixel
2'd2; // PSMT4 uses byte_offset right-shift, not bpp_shift
assign pmode_en1 = pmode_q[0];
assign scanout_enable = pmode_en1 & dispfb_psm_ok;
// Ch92/Ch93 — DISPLAY1 sub-fields per real PS2 GS register
// layout (PCSX2 GSRegs.h — DISPLAY structure):
// DX : [11:0] display window X start (in VCK pulses)
// DY : [22:12] display window Y start (in raster lines)
// MAGH : [26:23] horizontal magnification - 1 (Ch93)
// MAGV : [28:27] vertical magnification - 1 (Ch93)
// DW : [43:32] display width - 1 (in VCK pulses)
// DH : [54:44] display height - 1 (in raster lines)
//
// The display window is the sub-rect (DX..DX+DW, DY..DY+DH)
// inside the active area. Outside the window, r/g/b is 0
// even when scanout_enable is 1. Inside, the VRAM index is
// measured RELATIVE to the window origin, scaled DOWN by the
// magnification factors (MAGH+1 / MAGV+1), then shifted by
// DBX/DBY. This means the pixel at displayed (DX, DY)
// corresponds to VRAM (DBX, DBY); successive displayed
// pixels along H map to the SAME VRAM column for (MAGH+1)
// VCK pulses before advancing.
logic [11:0] display_dx;
logic [10:0] display_dy;
logic [3:0] display_magh;
logic [1:0] display_magv;
logic [11:0] display_dw;
logic [10:0] display_dh;
assign display_dx = display1_q[11:0];
assign display_dy = display1_q[22:12];
assign display_magh = display1_q[26:23];
assign display_magv = display1_q[28:27];
assign display_dw = display1_q[43:32];
assign display_dh = display1_q[54:44];
// Window inside-test: (hcnt - DX) in [0, DW] AND (vcnt - DY)
// in [0, DH]. We do the lower-bound check by comparing >=
// and the upper-bound by computing the relative coord.
logic [11:0] hwin_rel;
logic [11:0] vwin_rel;
logic in_display_window;
assign hwin_rel = {{(12-HCNT_W){1'b0}}, hcnt} - {{0{1'b0}}, display_dx};
assign vwin_rel = {{(12-VCNT_W){1'b0}}, vcnt[VCNT_W-1:0]} - {1'b0, display_dy};
assign in_display_window = ({{(12-HCNT_W){1'b0}}, hcnt} >= {{0{1'b0}}, display_dx})
&& (hwin_rel <= display_dw)
&& ({{(12-VCNT_W){1'b0}}, vcnt[VCNT_W-1:0]} >= {1'b0, display_dy})
&& (vwin_rel <= {1'b0, display_dh});
logic [31:0] fbp_bytes;
logic [31:0] pixels_per_row;
logic [31:0] hmag_factor; // MAGH + 1, range 1..16
logic [31:0] vmag_factor; // MAGV + 1, range 1..4
logic [31:0] vram_x_unshift;
logic [31:0] vram_y_unshift;
logic [31:0] effective_x;
logic [31:0] effective_y;
logic [31:0] pixel_index;
logic [31:0] byte_offset;
// VRAM index is measured from inside the display window and
// SCALED DOWN by the magnification factors:
// effective_x = ((hcnt - DX) / (MAGH+1)) + DBX
// effective_y = ((vcnt - DY) / (MAGV+1)) + DBY
// MAGH=MAGV=0 → factors=1×, math collapses to the pre-Ch93
// form (and the pre-Ch92 form when DISPLAY1 covers the full
// active area). MAGH=N>0 means each VRAM column shows for
// (N+1) consecutive VCK pulses before the next column. SystemVerilog
// `/` truncates toward zero on unsigned 32-bit operands —
// matches PS2 PCRTC behavior since (hcnt-DX) is always
// non-negative inside the window (the window check guards
// hcnt >= DX before VRAM is read).
assign fbp_bytes = {23'd0, dispfb_fbp} << 11;
assign pixels_per_row = {26'd0, dispfb_fbw} << 6;
assign hmag_factor = {28'd0, display_magh} + 32'd1;
assign vmag_factor = {30'd0, display_magv} + 32'd1;
// Ch163 — when STRIP_PCRTC_MAG_DIV is 1, bypass the divisions
// and use the window-relative coords directly. Quartus then has
// nothing to infer for the magnification divider (the Ch162-onwards
// STA worst path on `u_demo|u_pcrtc|div_1_rtl_0|...`). The
// hardware-demo path locks MAGH=MAGV=0 so the divisors are
// constant 1 and this is behavior-neutral. The default 0 keeps
// the live divider math for the existing Ch93 magnification
// scanout TBs (`tb_gs_scanout_magh_magv` etc.).
assign vram_x_unshift = STRIP_PCRTC_MAG_DIV
? {20'd0, hwin_rel}
: ({20'd0, hwin_rel} / hmag_factor);
assign vram_y_unshift = STRIP_PCRTC_MAG_DIV
? {20'd0, vwin_rel}
: ({20'd0, vwin_rel} / vmag_factor);
assign effective_x = vram_x_unshift + {21'd0, dispfb_dbx};
assign effective_y = vram_y_unshift + {21'd0, dispfb_dby};
assign pixel_index = (effective_y * pixels_per_row) + effective_x;
// PSMT4 packs 2 pixels per byte → byte_offset = pixel_index/2;
// all other supported PSMs are integer-bytes-per-pixel and
// use the standard left-shift by bpp_shift.
assign byte_offset = dispfb_psm_t4 ? (pixel_index >> 1)
: (pixel_index << dispfb_bpp_shift);
logic [31:0] vram_linear_addr;
assign vram_linear_addr = fbp_bytes + byte_offset;
// Ch120 — optional PSMCT32 swizzled scanout. The swizzle module
// is purely combinational and reuses dispfb_fbp / dispfb_fbw +
// the per-cycle effective_x / effective_y (already magnification-
// aware via Ch93). When PSMCT32_SWIZZLE=1 AND the active PSM is
// PSMCT32, mux its output into vram_read_addr. Other PSMs (CT16,
// T8, T4) and PSMCT32_SWIZZLE=0 keep the legacy linear address.
logic [31:0] vram_swizzled_addr;
gs_swizzle_psmct32_stub u_swizzle (
.fbp (dispfb_fbp),
.fbw (dispfb_fbw),
.x (effective_x[11:0]),
.y (effective_y[11:0]),
.addr(vram_swizzled_addr)
);
// Ch126 — optional PSMCT16 swizzled scanout. Same wiring shape
// as Ch120 but uses gs_swizzle_psmct16_stub. The PSMCT16 module
// bakes its own page-shape (64×64 vs CT32's 64×32), block grid
// (4 cols × 8 rows vs CT32's 8×4), and within-block column-table
// permutation in. Default PSMCT16_SWIZZLE=0 preserves linear
// PSMCT16 scanout for the legacy TBs (Ch94/Ch95/Ch103/etc.).
logic [31:0] vram_swizzled16_addr;
gs_swizzle_psmct16_stub u_swizzle16 (
.fbp (dispfb_fbp),
.fbw (dispfb_fbw),
.x (effective_x[11:0]),
.y (effective_y[11:0]),
.addr(vram_swizzled16_addr)
);
// Ch132 — optional PSMT8 swizzled scanout. Same wiring shape as
// Ch120/Ch126. PSMT8 pages are 128 px wide so the swizzle
// internally divides FBW by 2 (PCSX2 asserts FBW must be even
// for PSMT8). Default PSMT8_SWIZZLE=0 preserves linear PSMT8
// scanout for the legacy TBs (Ch96, Ch97, Ch103, Ch107, etc.).
logic [31:0] vram_swizzled8_addr;
gs_swizzle_psmt8_stub u_swizzle8 (
.fbp (dispfb_fbp),
.fbw (dispfb_fbw),
.x (effective_x[11:0]),
.y (effective_y[11:0]),
.addr(vram_swizzled8_addr)
);
// Ch138 — optional PSMT4 swizzled scanout. Same wiring shape as
// Ch120/Ch126/Ch132 but uses gs_swizzle_psmt4_stub. PSMT4 is
// 4 bits/pixel, so the module outputs both an absolute byte
// address AND a `nibble_hi` selector. Default PSMT4_SWIZZLE=0
// preserves linear PSMT4 scanout for the legacy TBs (Ch103,
// Ch104, Ch107, etc.) — the linear path uses pixel_index[0] as
// the nibble selector; the swizzled path uses the swizzle
// module's nibble_hi output instead.
logic [31:0] vram_swizzled4_addr;
logic swizzle4_nibble_hi;
gs_swizzle_psmt4_stub u_swizzle4 (
.fbp (dispfb_fbp),
.fbw (dispfb_fbw),
.x (effective_x[11:0]),
.y (effective_y[11:0]),
.addr (vram_swizzled4_addr),
.nibble_hi(swizzle4_nibble_hi)
);
assign vram_read_addr = (PSMCT32_SWIZZLE && dispfb_psm_ct32) ? vram_swizzled_addr :
(PSMCT16_SWIZZLE && dispfb_psm_ct16) ? vram_swizzled16_addr :
(PSMT8_SWIZZLE && dispfb_psm_t8) ? vram_swizzled8_addr :
(PSMT4_SWIZZLE && dispfb_psm_t4) ? vram_swizzled4_addr :
vram_linear_addr;
// PSMCT32 layout in vram_stub: little-endian write of
// raster_pixel_color_q[31:0] = {A, B, G, R}. Read back as:
// data[7:0] = R
// data[15:8] = G
// data[23:16] = B
// data[31:24] = A (alpha, not exposed at the video DAC)
// Ch94/Ch96/Ch97 — PSM-aware color decode.
// PSMCT32: lower 24 bits = {B, G, R}; alpha at [31:24]
// dropped.
// PSMCT16: RGB5A1 in lower 16 bits, 5→8 bit-replicate.
// PSMT8 : index in vram_read_data[7:0]. With clut_enable
// (Ch97), CLUT[idx + (CSA << 4)] is looked up for
// real RGB; without it, the index is emitted as
// grayscale (Ch96 fallback). The vram_stub read
// returns 4 bytes starting at the byte address,
// so [7:0] is the byte at the addressed PSMT8
// pixel regardless of 4-byte alignment.
logic [15:0] psm16_pixel;
logic [4:0] psm16_r5, psm16_g5, psm16_b5;
logic [7:0] psm16_r8, psm16_g8, psm16_b8;
logic [7:0] psm8_idx;
logic [3:0] psm4_nibble;
logic [7:0] psm4_idx;
logic [7:0] psm4_gray;
// Ch158 (audit Medium fix) — sub-word PSM lane selection.
//
// `vram_stub` returns the 4 bytes STARTING at `byte_addr`, so
// for the legacy comb-read shape the sub-word value is always
// at the LOW lane of `vram_read_data` (CT16 → [15:0], T8 → [7:0],
// T4 byte → [7:0]). `vram_bram_stub` is word-addressable
// (returns mem[byte_addr >> 2]), so the sub-word value lives
// at lane `byte_addr[1:0]` within the returned 32-bit word —
// CT16 halfword at byte_addr[1]==1 sits at [31:16] and is
// missed by a fixed-low-lane extract.
//
// The address-LSB register below is a 1-cycle-delayed copy of
// `vram_read_addr[1:0]` matching the `_dec` decode-stage view
// of the registered `vram_read_data`. The `data_lane` mux is
// forced to 0 in legacy mode (so vram_stub's byte-addressable
// semantics keep working) and uses the registered LSBs in
// sync mode (so vram_bram_stub's word-addressable layout
// resolves to the right byte/halfword).
logic [1:0] vram_addr_lane_q;
logic [1:0] vram_addr_lane_dec;
logic [1:0] data_lane;
always_ff @(posedge clk) begin
if (!rst_n) vram_addr_lane_q <= 2'd0;
else vram_addr_lane_q <= vram_read_addr[1:0];
end
assign vram_addr_lane_dec = VRAM_SYNC_READ ? vram_addr_lane_q
: vram_read_addr[1:0];
assign data_lane = VRAM_SYNC_READ ? vram_addr_lane_dec
: 2'd0;
// CT16 halfword: [1] picks low (==0) or high (==1) halfword of
// the 32-bit word. byte_addr[0]==1 is misuse for CT16 (the
// address-stage formula always yields even byte addresses).
assign psm16_pixel = data_lane[1] ? vram_read_data[31:16]
: vram_read_data[15:0];
// PSMT8/T4 byte: [1:0] picks 1 of 4 byte lanes. Used directly
// as `psm8_idx` and as the source byte for the PSMT4 nibble
// extract below.
logic [7:0] vram_byte_lane;
always_comb begin
case (data_lane)
2'b00: vram_byte_lane = vram_read_data[ 7: 0];
2'b01: vram_byte_lane = vram_read_data[15: 8];
2'b10: vram_byte_lane = vram_read_data[23:16];
2'b11: vram_byte_lane = vram_read_data[31:24];
endcase
end
assign psm16_r5 = psm16_pixel[4:0];
assign psm16_g5 = psm16_pixel[9:5];
assign psm16_b5 = psm16_pixel[14:10];
assign psm16_r8 = {psm16_r5, psm16_r5[4:2]};
assign psm16_g8 = {psm16_g5, psm16_g5[4:2]};
assign psm16_b8 = {psm16_b5, psm16_b5[4:2]};
assign psm8_idx = vram_byte_lane;
// Ch103 — PSMT4 nibble extraction. The byte at byte_offset
// holds two pixels: low nibble = even pixel, high nibble =
// odd pixel. pixel_index[0] picks which one this scanout
// cycle is reading. The 4-bit nibble zero-extends to an
// 8-bit CLUT index; the grayscale fallback replicates the
// nibble across both halves of an 8-bit channel value
// (4'hF → 8'hFF, 4'h5 → 8'h55, etc.).
//
// Ch138 — when PSMT4_SWIZZLE=1 AND the active PSM is PSMT4,
// the nibble selector comes from the swizzle module's
// `nibble_hi` output (which is `columnTable4[yb][xb] & 1` —
// the canonical PCSX2 selector under the swizzled layout).
// pixel_index[0] is the linear formula's selector; the
// swizzled formula needs the swizzle's own bit because the
// swizzle reorders pixels within a block.
logic psm4_nibble_select;
assign psm4_nibble_select = (PSMT4_SWIZZLE && dispfb_psm_t4)
? swizzle4_nibble_hi
: pixel_index[0];
// Ch158 — pair the nibble selector with vram_read_data: in
// legacy comb-read mode they are both same-cycle; in sync-read
// mode the selector is registered (psm4_nibble_select_dec) so
// it lines up with the registered VRAM data. The `_dec` mux
// selects between the two views via `VRAM_SYNC_READ`. The
// BYTE that holds the nibble is picked from `vram_byte_lane`
// (the byte_addr[1:0]-keyed lane in sync mode, the low lane
// in legacy mode — see the audit-Medium fix above).
assign psm4_nibble = psm4_nibble_select_dec ? vram_byte_lane[7:4]
: vram_byte_lane[3:0];
assign psm4_idx = {4'd0, psm4_nibble};
assign psm4_gray = {psm4_nibble, psm4_nibble};
// Ch97/Ch103 — CLUT effective index. `clut_csa` shifts the
// lookup window in 16-entry units. The 8-bit add wraps mod
// 256, matching the size of the staging area. The base index
// is the PSMT8 byte index for PSMT8, the zero-extended PSMT4
// nibble for PSMT4, otherwise unused (pcrtc just doesn't
// consume the CLUT output).
// Ch158 — clut_idx_base + clut_read_idx are derived from
// vram_read_data (already aligned with the data-decode stage)
// and from `dispfb_psm_t4_dec` (the registered/passthrough
// PSM flag), so the CLUT lookup happens on the same cycle as
// the pixel-emit decode comb.
logic [7:0] clut_idx_base;
assign clut_idx_base = dispfb_psm_t4_dec ? psm4_idx : psm8_idx;
assign clut_read_idx = clut_idx_base + {clut_csa, 4'd0};
// Pixel-emit decode. Outside the gated display region the outputs are black;
// inside it the active pixel-format flag (checked in priority order CT16,
// T8, T4, then PSMCT32) picks the source of r/g/b.
always_comb begin
    // Blanking / disabled default.
    r = 8'd0;
    g = 8'd0;
    b = 8'd0;
    if (de && scanout_enable_dec && in_display_window_dec) begin
        if (dispfb_psm_ct16_dec) begin
            // 16-bit pixel, already expanded to 8 bits per channel.
            b = psm16_b8;
            g = psm16_g8;
            r = psm16_r8;
        end else if (dispfb_psm_t8_dec) begin
            // Ch96 fallback first: show the 8-bit index as grayscale ...
            b = psm8_idx;
            g = psm8_idx;
            r = psm8_idx;
            // ... then let the CLUT entry override it when the CLUT is on.
            // Entries are PSMCT32 with the same byte order as PSMCT32
            // framebuffer reads: [7:0]=R, [15:8]=G, [23:16]=B, [31:24]=A.
            if (clut_enable) begin
                b = clut_read_data[23:16];
                g = clut_read_data[15:8];
                r = clut_read_data[7:0];
            end
        end else if (dispfb_psm_t4_dec) begin
            // Grayscale fallback: the nibble replicated into both halves of
            // the 8-bit channel value (4'hF -> 8'hFF).
            b = psm4_gray;
            g = psm4_gray;
            r = psm4_gray;
            // Ch103 — PSMT4 + CLUT: the nibble has already been folded into
            // clut_read_idx (clut_idx_base + (CSA<<4)); the entry that comes
            // back is PSMCT32 ABGR.
            if (clut_enable) begin
                b = clut_read_data[23:16];
                g = clut_read_data[15:8];
                r = clut_read_data[7:0];
            end
        end else begin
            // PSMCT32 — the only remaining format that dispfb_psm_ok admits
            // at this scope: channels come straight from the VRAM word.
            b = vram_read_data[23:16];
            g = vram_read_data[15:8];
            r = vram_read_data[7:0];
        end
    end
end
// ------------------------------------------------------------------
// Trace: one EV_MODE per completed frame.
// ------------------------------------------------------------------
// Completed-frame counter; its pre-increment value is reported in ev_arg0.
logic [31:0] frame_count;
always_ff @(posedge clk) begin
    // ev_valid is a one-cycle strobe: low by default (including in reset),
    // overridden below only on the cycle a frame completes.
    ev_valid <= 1'b0;
    if (!rst_n) begin
        ev_flags    <= 32'd0;
        ev_arg3     <= 64'd0;
        ev_arg2     <= 64'd0;
        ev_arg1     <= 64'd0;
        ev_arg0     <= 64'd0;
        ev_event    <= EV_MODE;
        ev_subsys   <= SUBSYS_PLAT;
        frame_count <= 32'd0;
    end else if (end_of_frame_dec) begin
        // Ch158: with VRAM_SYNC_READ=1, end_of_frame_dec trails the
        // counter-side end_of_frame by one cycle, so the event fires when
        // the LAST visible pixel is actually emitted (one cycle after the
        // address stage reaches the last cell). In legacy comb-read mode
        // end_of_frame_dec == end_of_frame, so existing TBs see no change.
        ev_valid    <= 1'b1;
        ev_subsys   <= SUBSYS_PLAT;
        ev_event    <= EV_MODE;
        ev_arg0     <= {32'd0, frame_count};                 // index of the frame that just ended
        ev_arg1     <= {32'd0, 32'(H_ACTIVE * V_ACTIVE)};    // active pixels per frame
        ev_arg2     <= 64'd0;
        ev_arg3     <= 64'd0;
        ev_flags    <= 32'd0;
        frame_count <= frame_count + 32'd1;
    end
end
endmodule : gs_pcrtc_stub
+109
View File
@@ -0,0 +1,109 @@
// retroDE_ps2 — gs_persp_uv (Ch301)
//
// Per-pixel PERSPECTIVE-CORRECT texture-coordinate divide. Given the three
// affinely-interpolated perspective attributes at a pixel —
//
// uq = (u/w) * 2**FRAC (u-over-w, fixed-point)
// vq = (v/w) * 2**FRAC (v-over-w, fixed-point)
// q = (1/w) * 2**FRAC (one-over-w, fixed-point)
//
// — this recovers the integer texel coordinates:
//
// w_recip = 1/q (= w, via the pipelined gs_reciprocal_stub LUT, NO divider)
// u_texel = (uq * w_recip) >> SCALE (= (u/w) * w = u)
// v_texel = (vq * w_recip) >> SCALE (= (v/w) * w = v)
//
// gs_reciprocal_stub returns recip = floor(2**SCALE / q). With q = (1/w)<<FRAC
// that is recip = w << (SCALE-FRAC). Then uq*recip = (u/w<<FRAC)*(w<<(SCALE-FRAC))
// = u << SCALE, so (uq*recip) >> SCALE = u. (The FRAC scaling cancels.)
//
// Pipeline (NO divider, ~1 result/cycle):
// recip: RLAT cycles (gs_reciprocal_stub, 3).
// uq/vq: delayed RLAT cycles to align with recip.
// mul: 1 cycle (uq*recip, vq*recip) + shift + clamp.
// total latency = RLAT + 1.
//
// Output texel coords are clamped to [0, TEXEL_MAX] (saturating), matching the
// integer-coord clamp the affine path already applies.
`timescale 1ns/1ps
// gs_persp_uv — recovers clamped integer texel coordinates from the
// interpolated uq/vq/q attributes: reciprocal of q from gs_reciprocal_stub,
// one registered multiply, right-shift by SCALE, saturate to TEXEL_MAX.
// out_valid follows in_valid by RLAT + 1 clocks.
module gs_persp_uv #(
    parameter int ATTR_W    = 24,   // bit width of uq / vq
    parameter int Q_W       = 24,   // bit width of q
    parameter int FRAC      = 12,   // attribute fraction bits (not referenced by the datapath below)
    parameter int SCALE     = 24,   // reciprocal scale; also the post-multiply right shift
    parameter int RECIP_W   = 25,   // bit width of the reciprocal
    parameter int TEXEL_W   = 11,   // bit width of the u / v outputs
    parameter int TEXEL_MAX = 2047, // saturation ceiling for u / v
    // Ch351 — mantissa (LUT index) width handed to the reciprocal unit.
    // 8 gives a 256-entry table; 11 gives a 2048-entry table.
    parameter int RECIP_IDX_BITS = 8
) (
    input  logic               clk,
    input  logic               rst_n,     // active-low asynchronous reset
    input  logic               in_valid,  // qualifies uq / vq / q this cycle
    input  logic [ATTR_W-1:0]  uq,
    input  logic [ATTR_W-1:0]  vq,
    input  logic [Q_W-1:0]     q,
    output logic               out_valid, // qualifies u / v
    output logic [TEXEL_W-1:0] u,
    output logic [TEXEL_W-1:0] v
);
    // Depth of the uq/vq delay line; must equal the reciprocal unit's latency.
    localparam int RLAT   = 3;
    // Width of the full attribute-times-reciprocal product.
    localparam int PROD_W = ATTR_W + RECIP_W;

    // ---- reciprocal of q (pipelined table lookup, no divider) ----
    logic               recip_valid;
    logic [RECIP_W-1:0] w_recip;
    gs_reciprocal_stub #(
        .Q_W     (Q_W),
        .IDX_BITS(RECIP_IDX_BITS),
        .SCALE   (SCALE),
        .OUT_W   (RECIP_W)
    ) u_recip (
        .clk      (clk),
        .rst_n    (rst_n),
        .in_valid (in_valid),
        .q        (q),
        .out_valid(recip_valid),
        .recip    (w_recip)
    );

    // ---- RLAT-deep shift registers so uq/vq arrive together with w_recip ----
    logic [ATTR_W-1:0] uq_dly [0:RLAT-1];
    logic [ATTR_W-1:0] vq_dly [0:RLAT-1];
    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (int s = 0; s < RLAT; s++) begin
                vq_dly[s] <= '0;
                uq_dly[s] <= '0;
            end
        end else begin
            // Later stages take the previous stage; stage 0 takes the input.
            for (int s = RLAT - 1; s > 0; s--) begin
                vq_dly[s] <= vq_dly[s-1];
                uq_dly[s] <= uq_dly[s-1];
            end
            vq_dly[0] <= vq;
            uq_dly[0] <= uq;
        end
    end

    // ---- full-width products of the aligned attributes and the reciprocal ----
    logic [PROD_W-1:0] u_full;
    logic [PROD_W-1:0] v_full;
    assign u_full = uq_dly[RLAT-1] * w_recip;
    assign v_full = vq_dly[RLAT-1] * w_recip;

    // Drop the SCALE fraction bits, then saturate at TEXEL_MAX.
    function automatic logic [TEXEL_W-1:0] saturate_texel(input logic [PROD_W-1:0] full);
        logic [PROD_W-1:0] texel;
        texel          = full >> SCALE;
        saturate_texel = texel[TEXEL_W-1:0];
        if (texel > PROD_W'(TEXEL_MAX)) saturate_texel = TEXEL_W'(TEXEL_MAX);
    endfunction

    // ---- single output register stage ----
    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            v         <= '0;
            u         <= '0;
            out_valid <= 1'b0;
        end else begin
            v         <= saturate_texel(v_full);
            u         <= saturate_texel(u_full);
            out_valid <= recip_valid;
        end
    end
endmodule : gs_persp_uv
+287
View File
@@ -0,0 +1,287 @@
// gs_prim_list_feeder — Ch330 Brick 1
//
// Runtime primitive-list feeder (minimal). Reads a NORMALIZED combined-TAZ triangle
// list from a small staging RAM and EXPANDS each record into the exact gif_reg_*
// write sequence gs_stub already consumes — reusing the entire proven ingestion
// (vertex window, bbox, attr packing, FIFO/grid, tile renderer). This is NOT GIF A+D
// decode: the format is per-PRIMITIVE records and the feeder knows the fixed
// "combined-TAZ triangle" shape (shared state once, then PRIM + 3 vertices each).
//
// In the Ch330 command-list profile the feeder is the EXCLUSIVE owner of gif_reg_*
// (no arbitration with the GIF unpacker / DMAC). It respects the SAME capacity
// boundary as the baked path: before the prim-completing vertex (vtx2's XYZ2) it
// PAUSES while fifo_full, so a full FIFO stalls the feeder instead of dropping prims.
//
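// Staging layout (64-bit words, word-addressed):
// [0] : header word — tri_count = [15:0] (number of primitive records),
//       rect_count = [31:16] (Ch334; ignored when either format flag is set),
//       perspective format flag = [32] (Ch342), sprite format flag = [33] (Ch345a)
// [1] : FRAME_1 data — shared state, emitted once at start
// [2] : ALPHA_1 data
// [3] : TEST_1 data
// [4] : ZBUF_1 data
// [5] : TEX0_1 data
// [6] : PRIM data — re-issued per triangle
// [7 + 9*i + 0..2] : tri i vtx0 RGBAQ/UV/XYZ2
// [7 + 9*i + 3..5] : tri i vtx1 RGBAQ/UV/XYZ2
// [7 + 9*i + 6..8] : tri i vtx2 RGBAQ/UV/XYZ2
// In the perspective format the middle word of each vertex is emitted as ST
// instead of UV. In the sprite format each record is 6 words (2 vertices), so
// record i starts at 7 + 6*i. Rect records (3 words each: color, corner0 XYZ2,
// corner1 XYZ2) follow immediately after the last primitive record.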
//
// One `start` pulse plays the whole list; `done` pulses when it finishes.
// Boring on purpose: 2 cycles per emitted register (present addr, then drive).
`timescale 1ns/1ps
// gs_prim_list_feeder — plays one staged primitive list into the gif_reg_* write port.
//
// Per `start` pulse: load header words 0..6, emit the five shared-state registers
// once, then for every primitive emit PRIM followed by its vertex words (9 for a
// triangle, 6 for a sprite), then expand any rect records into two triangles each,
// then pulse `done`. The final XYZ2 of every primitive is withheld while
// `fifo_full` is high, so a full FIFO stalls the feeder instead of dropping work.
module gs_prim_list_feeder #(
    parameter int STG_ADDR_W = 12                       // staging RAM word-address width
) (
    input  logic                  clk,
    input  logic                  rst_n,                // active-low asynchronous reset
    input  logic                  start,                // sampled only in S_IDLE
    output logic                  busy,                 // high in every state except S_IDLE / S_DONE
    output logic                  done,                 // one-cycle pulse, registered out of S_DONE
    // Ch330 Brick 3 — observability counters (latched per list, cleared at start).
    output logic [15:0]           records_emitted,      // primitives whose final XYZ2 was emitted
    output logic [31:0]           fifo_wait_cycles,     // cycles paused at a completing kick under fifo_full
    output logic [STG_ADDR_W-1:0] stg_rd_addr,          // staging read address; data is consumed one state later
    input  logic [63:0]           stg_rd_data,
    input  logic                  fifo_full,
    output logic                  gif_reg_wr_en,        // one-cycle write strobe
    output logic [7:0]            gif_reg_num,
    output logic [63:0]           gif_reg_data
);
    localparam logic [7:0] REG_PRIM    = 8'h00;
    localparam logic [7:0] REG_RGBAQ   = 8'h01;
    localparam logic [7:0] REG_ST      = 8'h02; // Ch342 — perspective ST (S/T) for FST=0 tris
    localparam logic [7:0] REG_UV      = 8'h03;
    localparam logic [7:0] REG_XYZ2    = 8'h05;
    localparam logic [7:0] REG_TEX0_1  = 8'h06;
    localparam logic [7:0] REG_ALPHA_1 = 8'h42;
    localparam logic [7:0] REG_TEST_1  = 8'h47;
    localparam logic [7:0] REG_FRAME_1 = 8'h4C;
    localparam logic [7:0] REG_ZBUF_1  = 8'h4E;

    localparam int OFF_COUNT     = 0;
    localparam int OFF_FRAME     = 1; // FRAME,ALPHA,TEST,ZBUF,TEX0,PRIM = words 1..6
    localparam int OFF_TRIS      = 7;
    localparam int WORDS_PER_TRI = 9;

    // Header registers (loaded once).
    logic [15:0] tri_count;
    logic [63:0] hdr_q [0:5]; // [0]=FRAME [1]=ALPHA [2]=TEST [3]=ZBUF [4]=TEX0 [5]=PRIM

    // setup-emit index -> GIF reg num (iverilog-12: no unpacked localparam array).
    function automatic logic [7:0] hdr_reg_num(input logic [2:0] i);
        unique case (i)
            3'd0:    hdr_reg_num = REG_FRAME_1;
            3'd1:    hdr_reg_num = REG_ALPHA_1;
            3'd2:    hdr_reg_num = REG_TEST_1;
            3'd3:    hdr_reg_num = REG_ZBUF_1;
            default: hdr_reg_num = REG_TEX0_1;
        endcase
    endfunction

    typedef enum logic [3:0] {
        S_IDLE,
        S_HDR_RD, S_HDR_LD,     // read words 0..6 into tri_count/rect_count + hdr_q
        S_SETUP,                // emit FRAME/ALPHA/TEST/ZBUF/TEX0 from hdr_q
        S_PRIM,                 // emit PRIM (hdr_q[5]) for the current tri
        S_VTX_RD, S_VTX_EMIT,   // walk the vertex words of the current primitive
        S_AFTER_TRIS,           // Ch334 — tris done; start rects if any, else done
        S_RECT_RD, S_RECT_LD,   // Ch334 — read a rect's 3 words (color, corner0, corner1)
        S_RECT_EMIT,            // Ch334 — emit the 20-step 2-triangle expansion of one rect
        S_DONE
    } state_t;
    localparam int WORDS_PER_RECT = 3; // Ch334 — color + corner0(XYZ2) + corner1(XYZ2)

    state_t      state;
    logic [3:0]  hdr_i;     // 0..6 header-word read index
    logic [2:0]  setup_i;   // 0..4 setup-emit index
    logic [15:0] tri_idx;   // 0..tri_count-1
    logic [3:0]  vtx_word;  // 0..8 within a tri (0..5 within a sprite)

    // Ch334 — native rectangle records (one record -> two colored triangles, expanded HERE).
    logic [15:0] rect_count;        // count[31:16]
    logic [15:0] rect_idx;          // 0..rect_count-1
    logic [1:0]  rect_word;         // 0..2 read index within a rect record
    logic [63:0] rect_color;        // RGBAQ for both triangles
    logic [63:0] rect_c0, rect_c1;  // the two opposite corners (XYZ2-packed)
    logic [4:0]  rect_emit;         // 0..19 emit step

    // Ch342 — PERSPECTIVE format flag (word0[32]). 0 = legacy RGBAQ/UV/XYZ2 per vertex (byte-exact).
    // 1 = RGBAQ/ST/XYZ2: the middle vertex word is emitted as REG_ST (host packs S_fp[23:0]/T_fp[55:32],
    // 24-bit FRAC=12) and RGBAQ carries Q_fp[55:32]; PRIM (hdr_q[5]) must be FST=0. Same 9 words/tri,
    // same 27-tri cap. Rects are not allowed in this format (rect_count forced 0 at header load).
    // The flag is latched as 0 whenever the sprite flag (word0[33]) is set, because the sprite
    // format is affine-UV only.
    logic perspective_mode;

    // Ch345a — SPRITE format flag (word0[33]). 1 = each primitive is a SPRITE record: 2 vertices x
    // (RGBAQ, UV, XYZ2) = 6 words, vs a TRI's 3 vertices = 9 words. PRIM (hdr_q[5]) carries SPRITE+TME+ABE
    // and gs_stub kicks on the 2nd XYZ2 per the PRIM type. Affine UV only (perspective_mode forced 0 with
    // it); rects forced off. Same shared-state setup (FRAME/ALPHA/TEST/ZBUF/TEX0/PRIM). Narrow grammar:
    // PSMCT32 dest+tex, UV affine, ABE source-over, TCC texel alpha — the Ch344-proven subset.
    logic sprite_mode;
    wire [4:0] words_per_prim = sprite_mode ? 5'd6 : 5'd9; // staging words per primitive
    wire [3:0] last_vtx_word  = sprite_mode ? 4'd5 : 4'd8; // final XYZ2 of the primitive (the kick)

    // Register number for the current vertex word: words cycle RGBAQ, UV-or-ST, XYZ2.
    logic [7:0] vtx_reg_num;
    always_comb unique case (vtx_word % 3)
        2'd0:    vtx_reg_num = REG_RGBAQ;
        2'd1:    vtx_reg_num = perspective_mode ? REG_ST : REG_UV;
        default: vtx_reg_num = REG_XYZ2;
    endcase
    wire vtx_completing = (vtx_word == last_vtx_word); // final XYZ2 = the FIFO push / kick

    // Ch334 — corner fields (XYZ2 layout: x=[15:4], y=[31:20], z=[63:32]) + a packer.
    wire [11:0] rx0 = rect_c0[15:4];  wire [11:0] ry0 = rect_c0[31:20];
    wire [11:0] rx1 = rect_c1[15:4];  wire [11:0] ry1 = rect_c1[31:20];
    wire [31:0] rz  = rect_c0[63:32];
    function automatic logic [63:0] mk_xyz2(input logic [11:0] x, input logic [11:0] y, input logic [31:0] z);
        mk_xyz2 = {z, y, 4'd0, x, 4'd0};
    endfunction

    // 20-step expansion: [PRIM, (RGBAQ,UV,XYZ2)x3] x2. Two tris cover the quad (x0,y0)-(x1,y1).
    logic [7:0]  rect_reg;
    logic [63:0] rect_dat;
    always_comb begin
        unique case (rect_emit)
            5'd0, 5'd10:                       rect_reg = REG_PRIM;
            5'd1,5'd4,5'd7,5'd11,5'd14,5'd17:  rect_reg = REG_RGBAQ;
            5'd2,5'd5,5'd8,5'd12,5'd15,5'd18:  rect_reg = REG_UV;
            default:                           rect_reg = REG_XYZ2; // 3,6,9,13,16,19
        endcase
        unique case (rect_emit)
            5'd0, 5'd10:                       rect_dat = hdr_q[5];               // PRIM
            5'd1,5'd4,5'd7,5'd11,5'd14,5'd17:  rect_dat = rect_color;             // RGBAQ
            5'd2,5'd5,5'd8,5'd12,5'd15,5'd18:  rect_dat = 64'd0;                  // UV (uniform texture)
            5'd3:                              rect_dat = mk_xyz2(rx0, ry0, rz);  // tri1 v0
            5'd6:                              rect_dat = mk_xyz2(rx1, ry0, rz);  // tri1 v1
            5'd9:                              rect_dat = mk_xyz2(rx0, ry1, rz);  // tri1 v2 (completes tri1)
            5'd13:                             rect_dat = mk_xyz2(rx1, ry0, rz);  // tri2 v0
            5'd16:                             rect_dat = mk_xyz2(rx0, ry1, rz);  // tri2 v1
            default:                           rect_dat = mk_xyz2(rx1, ry1, rz);  // tri2 v2 (idx 19, completes the rect)
        endcase
    end
    wire rect_completing = (rect_emit == 5'd9) || (rect_emit == 5'd19); // the two FIFO pushes

    assign busy = (state != S_IDLE) && (state != S_DONE);

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            state <= S_IDLE; tri_count <= 0; hdr_i <= 0; setup_i <= 0; perspective_mode <= 1'b0; sprite_mode <= 1'b0;
            tri_idx <= 0; vtx_word <= 0; stg_rd_addr <= '0;
            gif_reg_wr_en <= 1'b0; gif_reg_num <= 8'd0; gif_reg_data <= 64'd0; done <= 1'b0;
            records_emitted <= 16'd0; fifo_wait_cycles <= 32'd0;
            rect_count <= 0; rect_idx <= 0; rect_word <= 0; rect_emit <= 0;
            rect_color <= 64'd0; rect_c0 <= 64'd0; rect_c1 <= 64'd0;
            for (int k=0;k<6;k++) hdr_q[k] <= 64'd0;
        end else begin
            // Strobes default low; the states below raise them for exactly one cycle.
            gif_reg_wr_en <= 1'b0;
            done          <= 1'b0;
            unique case (state)
                S_IDLE: if (start) begin
                    hdr_i <= 4'd0; stg_rd_addr <= STG_ADDR_W'(OFF_COUNT); state <= S_HDR_RD;
                    records_emitted <= 16'd0; fifo_wait_cycles <= 32'd0; // clear per list
                end

                // ---- header load: words 0..6 (count + 6 shared-state words) ----
                S_HDR_RD: state <= S_HDR_LD; // addr presented; data next cycle
                S_HDR_LD: begin
                    if (hdr_i == 4'd0) begin
                        tri_count   <= stg_rd_data[15:0];
                        sprite_mode <= stg_rd_data[33]; // Ch345a
                        // Ch342 — word0[32] = perspective format flag. Sprite records are
                        // affine-UV only, so the sprite flag wins if a header sets both:
                        // otherwise sprite vertices would be emitted as REG_ST.
                        perspective_mode <= stg_rd_data[32] && !stg_rd_data[33];
                        // rects disallowed with the perspective OR sprite format
                        // (force rect_count 0 so the rect-expansion path can never run).
                        rect_count <= (stg_rd_data[32] || stg_rd_data[33]) ? 16'd0 : stg_rd_data[31:16];
                    end
                    else hdr_q[hdr_i-4'd1] <= stg_rd_data;
                    if (hdr_i == 4'd6) begin // all of count + hdr_q[0..5] loaded
                        setup_i <= 3'd0; state <= S_SETUP;
                    end else begin
                        hdr_i       <= hdr_i + 4'd1;
                        stg_rd_addr <= STG_ADDR_W'(OFF_COUNT) + STG_ADDR_W'(hdr_i + 4'd1);
                        state       <= S_HDR_RD;
                    end
                end

                // ---- emit shared state once (from hdr_q, no staging read) ----
                S_SETUP: begin
                    gif_reg_wr_en <= 1'b1;
                    gif_reg_num   <= hdr_reg_num(setup_i);
                    gif_reg_data  <= hdr_q[setup_i];
                    if (setup_i == 3'd4) begin
                        tri_idx <= 16'd0;
                        state   <= (tri_count == 16'd0) ? S_AFTER_TRIS : S_PRIM;
                    end else begin
                        setup_i <= setup_i + 3'd1;
                    end
                end

                // ---- per primitive: PRIM, then its vertex words ----
                S_PRIM: begin
                    gif_reg_wr_en <= 1'b1; gif_reg_num <= REG_PRIM; gif_reg_data <= hdr_q[5];
                    vtx_word <= 4'd0;
                    // Product is formed at 32 bits so the record base is not clipped to
                    // 16 bits ahead of the cast when STG_ADDR_W is wider than 16.
                    stg_rd_addr <= STG_ADDR_W'(OFF_TRIS) + STG_ADDR_W'(32'(tri_idx) * 32'(words_per_prim));
                    state <= S_VTX_RD;
                end
                S_VTX_RD: state <= S_VTX_EMIT; // vert-word addr presented; data next cycle
                S_VTX_EMIT: begin
                    if (vtx_completing && fifo_full) begin
                        // pause: hold addr/data, do not emit, until the FIFO drains a slot
                        fifo_wait_cycles <= fifo_wait_cycles + 32'd1;
                        state <= S_VTX_EMIT;
                    end else begin
                        gif_reg_wr_en <= 1'b1; gif_reg_num <= vtx_reg_num; gif_reg_data <= stg_rd_data;
                        if (vtx_word == last_vtx_word) begin
                            records_emitted <= records_emitted + 16'd1; // a primitive's final XYZ2 emitted
                            if (tri_idx + 16'd1 == tri_count) state <= S_AFTER_TRIS;
                            else begin tri_idx <= tri_idx + 16'd1; state <= S_PRIM; end
                        end else begin
                            vtx_word    <= vtx_word + 4'd1;
                            stg_rd_addr <= stg_rd_addr + STG_ADDR_W'(1);
                            state       <= S_VTX_RD;
                        end
                    end
                end

                // ---- Ch334 — native rectangles: each record = 3 words, expands to 2 tris ----
                S_AFTER_TRIS: begin
                    if (rect_count != 16'd0) begin
                        rect_idx <= 16'd0; rect_word <= 2'd0;
                        // First rect record sits right behind the last primitive record.
                        stg_rd_addr <= STG_ADDR_W'(OFF_TRIS) + STG_ADDR_W'(32'(tri_count) * 32'(words_per_prim));
                        state <= S_RECT_RD;
                    end else state <= S_DONE;
                end
                S_RECT_RD: state <= S_RECT_LD; // rect-word addr presented; data next cycle
                S_RECT_LD: begin
                    unique case (rect_word)
                        2'd0:    rect_color <= stg_rd_data;
                        2'd1:    rect_c0    <= stg_rd_data;
                        default: rect_c1    <= stg_rd_data;
                    endcase
                    stg_rd_addr <= stg_rd_addr + STG_ADDR_W'(1); // advance through every rect word
                    if (rect_word == 2'd2) begin rect_word <= 2'd0; rect_emit <= 5'd0; state <= S_RECT_EMIT; end
                    else begin rect_word <= rect_word + 2'd1; state <= S_RECT_RD; end
                end
                S_RECT_EMIT: begin
                    if (rect_completing && fifo_full) begin
                        fifo_wait_cycles <= fifo_wait_cycles + 32'd1; // pause at a completing XYZ2
                        state <= S_RECT_EMIT;
                    end else begin
                        gif_reg_wr_en <= 1'b1; gif_reg_num <= rect_reg; gif_reg_data <= rect_dat;
                        if (rect_completing) records_emitted <= records_emitted + 16'd1; // one tri done
                        if (rect_emit == 5'd19) begin // whole rect emitted (2 tris)
                            if (rect_idx + 16'd1 == rect_count) state <= S_DONE;
                            else begin rect_idx <= rect_idx + 16'd1; state <= S_RECT_RD; end // addr already at next base
                        end else rect_emit <= rect_emit + 5'd1;
                    end
                end

                S_DONE: begin done <= 1'b1; state <= S_IDLE; end
                default: state <= S_IDLE;
            endcase
        end
    end
endmodule : gs_prim_list_feeder
+127
View File
@@ -0,0 +1,127 @@
// retroDE_ps2 — gs_reciprocal_stub (Ch301)
//
// Pipelined fixed-point reciprocal unit for PERSPECTIVE-CORRECT texture
// interpolation. Computes recip = floor(2**SCALE / q) for an unsigned input
// q, with NO divider in the datapath — a serialized per-pixel divide would
// stall the ~1-pixel/cycle rasterizer (the architect's explicit constraint).
//
// Method — range-reduced table lookup (classic LUT reciprocal):
// 1. e = position of q's most-significant set bit (0..Q_W-1).
// 2. M = q normalized to an IDX_BITS-wide mantissa with its MSB at the top
// (M in [2**(IDX_BITS-1), 2**IDX_BITS - 1]), i.e. q ~= M * 2**(e-(IDX_BITS-1)).
// 3. recip = LUT[M] >> e, where LUT[M] = floor(2**(SCALE+IDX_BITS-1) / M).
// Proof: LUT[M] >> e ~= 2**(SCALE+IDX_BITS-1)/(M * 2**e)
// = 2**SCALE / (M * 2**(e-(IDX_BITS-1)))
// = 2**SCALE / q. ✓ (uniform for all e)
//
// Accuracy is ~1 part in 2**IDX_BITS (relative). For the first perspective
// rung (texel coords <= 63) an 8-bit mantissa gives sub-texel error; bump
// IDX_BITS for tighter precision later if real traces demand it.
//
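// Pipeline: 3 register stages (LAT=3), one result per cycle. There is no
// separate input register: msb-detect + normalize run combinationally on the
// live q and are captured by the first stage.
// S1: register e = msb(q), M = normalize(q), and valid.
// S2: lut_out = LUT[M]; carry e.
// S3: recip = lut_out >> e; out_valid.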
//
// q==0 saturates to all-ones (1/0 -> +inf), which is harmless for the demo
// (q = 1/w with w finite positive is always > 0).
//
// LUT init is a computed `initial` for-loop (Quartus infers ROM from it). If a
// future synth flow rejects it, switch to $readmemh of a generated .mem.
`timescale 1ns/1ps
// gs_reciprocal_stub — three-register-stage table-lookup reciprocal.
// Stage 1 captures the MSB position and normalized mantissa of q, stage 2
// reads the table, stage 3 shifts the entry back down by the MSB position.
// q == 0 yields an all-ones result.
module gs_reciprocal_stub #(
    parameter int Q_W      = 24, // width of the q input
    parameter int IDX_BITS = 8,  // mantissa width = table index width
    parameter int SCALE    = 24, // result approximates 2**SCALE / q
    parameter int OUT_W    = 25  // width of the recip output
) (
    input  logic             clk,
    input  logic             rst_n,     // active-low asynchronous reset
    input  logic             in_valid,
    input  logic [Q_W-1:0]   q,
    output logic             out_valid, // in_valid delayed by three clocks
    output logic [OUT_W-1:0] recip
);
    localparam int LUT_N   = (1 << IDX_BITS);
    localparam int TOP_BIT = IDX_BITS - 1;     // bit position of the mantissa MSB
    localparam int LUT_W   = SCALE + 1;        // holds the largest entry, at M = 2**TOP_BIT
    localparam int E_W     = $clog2(Q_W) + 1;  // width of the carried MSB position
    // Saturation value for the output (every bit set).
    localparam logic [OUT_W-1:0] RECIP_SAT = '1;

    // Table of floor(2**(SCALE+TOP_BIT) / M). After normalization only indices
    // with the top bit set are read; index 0 is filled with zero.
    logic [LUT_W-1:0] lut [0:LUT_N-1];
    initial begin
        longint unsigned num; // 64-bit numerator 2**(SCALE+TOP_BIT)
        num    = (64'd1 << (SCALE + TOP_BIT));
        lut[0] = '0;
        for (int m = 1; m < LUT_N; m++)
            lut[m] = LUT_W'(num / m);
    end

    // Index of the highest set bit of v (0 when no bit is set).
    function automatic int unsigned msb_index(input logic [Q_W-1:0] v);
        int unsigned pos;
        pos = 0;
        for (int b = 1; b < Q_W; b++)
            if (v[b]) pos = b;
        return pos;
    endfunction

    // ---------------- combinational front end on the live input ----------------
    // Shift q so that its highest set bit lands on bit TOP_BIT.
    int unsigned    q_msb;
    logic [Q_W-1:0] q_norm;
    always_comb begin
        q_msb = msb_index(q);
        if (q_msb >= TOP_BIT) q_norm = q >> (q_msb - TOP_BIT);
        else                  q_norm = q << (TOP_BIT - q_msb);
    end

    // ---------------- S1: capture exponent + mantissa ----------------
    logic                s1_valid;
    logic [E_W-1:0]      s1_e;
    logic [IDX_BITS-1:0] s1_m;
    logic                s1_zero;
    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s1_zero  <= 1'b0;
            s1_m     <= '0;
            s1_e     <= '0;
            s1_valid <= 1'b0;
        end else begin
            s1_zero  <= (q == '0);
            s1_m     <= q_norm[IDX_BITS-1:0];
            s1_e     <= E_W'(q_msb);
            s1_valid <= in_valid;
        end
    end

    // ---------------- S2: table read ----------------
    logic             s2_valid;
    logic [E_W-1:0]   s2_e;
    logic [LUT_W-1:0] s2_lut;
    logic             s2_zero;
    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s2_zero  <= 1'b0;
            s2_lut   <= '0;
            s2_e     <= '0;
            s2_valid <= 1'b0;
        end else begin
            s2_zero  <= s1_zero;
            s2_lut   <= lut[s1_m];
            s2_e     <= s1_e;
            s2_valid <= s1_valid;
        end
    end

    // ---------------- S3: shift back + saturate ----------------
    logic [LUT_W-1:0] s3_shifted;
    assign s3_shifted = s2_lut >> s2_e;
    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            recip     <= '0;
            out_valid <= 1'b0;
        end else begin
            out_valid <= s2_valid;
            recip     <= OUT_W'(s3_shifted);
            // A zero input, or a value too large for OUT_W, pins the output to all ones.
            if (s2_zero || (s3_shifted > RECIP_SAT)) recip <= '1;
        end
    end
endmodule : gs_reciprocal_stub
File diff suppressed because it is too large Load Diff
+231
View File
@@ -0,0 +1,231 @@
// retroDE_ps2 — gs_swizzle_psmct16_stub (Ch125)
//
// Pure-combinational PSMCT16 page/block/column swizzle: maps a
// pixel coordinate (x, y) within a framebuffer at (FBP, FBW) to
// its physical VRAM byte address using the real PS2 GS PSMCT16
// layout. Mirrors Ch119's `gs_swizzle_psmct32_stub` shape but
// with PSMCT16's 4-cols × 8-rows page block grid (represented
// as `blockTable16[8][4]` indexed `[block_y][block_x]`) and the
// within-block column table that PSMCT32 didn't need (CT32
// within-block IS row-major halfwords by accident; CT16 is not).
//
// THIS MODULE IS NOT YET WIRED INTO gs_pcrtc_stub /
// gif_image_xfer_stub / gs_stub. Future chapters will wire it
// behind a `PSMCT16_SWIZZLE`-style parameter gate, mirroring the
// PSMCT32 progression (Ch120 read-side → Ch121 image-xfer write
// side → Ch122 raster write side → Ch123/Ch124 e2e demos).
// Default-off keeps the legacy linear PSMCT16 TBs (Ch94, Ch95,
// Ch103, Ch116) on the linear path.
//
// SOURCE-TABLE PROVENANCE (per Codex's Ch125 guidance):
// blockTable16 — pcsx2/GS/GSTables.cpp lines 29–39, master
// HEAD commit 3d71e310 (file-touch commit
// d983b2b0, 2026-01-12). 8 rows × 4 cols,
// indexed [block_y][block_x].
// columnTable16 — pcsx2/GS/GSTables.cpp lines 91–109, same
// commit. 8 rows × 16 cols, indexed [yb][xb],
// values are halfword-within-block (0..127).
// Cross-check — older GSdx (Debian pcsx2 1.5.0~gfc1d9aef0)
// PixelAddressOrg16(x, y, bp, bw) =
// (BlockNumber16(x, y, bp, bw) << 7) +
// columnTable16[y & 7][x & 15], with
// BlockNumber16 = bp + ((y>>1) & ~0x1f)*bw +
// ((x>>1) & ~0x1f) +
// blockTable16[(y>>3)&7][(x>>4)&3].
// The `<< 7` confirms columnTable16 is in
// halfword units (block = 128 halfwords).
// Multiply final value by 2 for byte address.
// PCSX2's `bp` is in 256-byte block-pointer
// units; in our FBP (2048-byte) units,
// bp = FBP * 8, so bp*256 = FBP*2048.
//
// NOTE on PCSX2 license: the PCSX2 project is GPL-3.0+. This
// stub re-expresses the same PSMCT16 swizzle math in
// SystemVerilog as a hardware contract — the values in the
// blockTable16 / columnTable16 case statements come from PCSX2
// source and represent the PS2 hardware layout itself (not
// PCSX2-original creative content). The retroDE_ps2 project
// authors should consider whether this provenance affects
// licensing for downstream consumers; from an engineering
// correctness standpoint, locking against the canonical source
// is the only way to be byte-accurate to real PS2 VRAM.
//
// Real PS2 PSMCT16 layout:
// - VRAM is 4 MiB total, organized in 8 KiB pages.
// - Each page is 64×64 PSMCT16 pixels (= 64*64*2 = 8192 bytes).
// 2× as many pixels per page as PSMCT32 (which has 64×32 px)
// because each PSMCT16 pixel is 2 bytes vs CT32's 4.
// - Each page is divided into a 4×8 grid of blocks (4 cols of
// blocks across, 8 rows down). Each block is 16×8 PSMCT16
// pixels (= 16*8*2 = 256 bytes). 4×8 = 32 blocks/page.
// - Block ordering within a page follows blockTable16, which
// differs from blockTable32 because the grid shape is
// different (8×4 vs 4×8).
// - Within a block, halfword placement follows columnTable16:
// a 16×8 → 128-entry permutation that organizes the 4
// internal 16×2-pixel sub-columns and interleaves the two
// pixel rows per sub-column.
//
// Address formula (FBP in 2048-byte units; FBW in 64-pixel
// units; addr in bytes):
// page_x = x / 64
// page_y = y / 64
// page_index = page_y * FBW + page_x
// page_base = FBP*2048 + page_index*8192
//
// block_x_in_page = (x % 64) / 16 // 0..3
// block_y_in_page = (y % 64) / 8 // 0..7
// block_idx = blockTable16[block_y_in_page][block_x_in_page]
// block_base = page_base + block_idx*256
//
// xb = x % 16
// yb = y % 8
// hw_idx = columnTable16[yb][xb] // 0..127
// addr = block_base + hw_idx*2
`timescale 1ns/1ps
module gs_swizzle_psmct16_stub
(
input logic [8:0] fbp, // FBP — frame base, in 2048-byte units
input logic [5:0] fbw, // FBW — frame width, in 64-pixel units
input logic [11:0] x,
input logic [11:0] y,
output logic [31:0] addr
);
// --------------------------------------------------------------
// blockTable16 (verbatim from pcsx2/GS/GSTables.cpp lines 29–39).
// Indexed [block_y_in_page (0..7)][block_x_in_page (0..3)].
// --------------------------------------------------------------
function automatic logic [4:0] swizzle_psmct16(
input logic [2:0] by,
input logic [1:0] bx);
// Block-within-page lookup. `by` is the block row (0..7), `bx` the block
// column (0..3); the return value is the block's index within the page (0..31).
// The selector is {by, bx} = by*4 + bx; the trailing comment on each arm is
// the (by, bx) pair it encodes. The default arm handles selector 31, i.e. (7,3).
// Every arm below equals the bit permutation {by[2], bx[1], by[1], bx[0], by[0]};
// the table is kept spelled out so it can be diffed against the cited source.
case ({by, bx})
5'd0: return 5'd0; // (0,0)
5'd1: return 5'd2; // (0,1)
5'd2: return 5'd8; // (0,2)
5'd3: return 5'd10; // (0,3)
5'd4: return 5'd1; // (1,0)
5'd5: return 5'd3; // (1,1)
5'd6: return 5'd9; // (1,2)
5'd7: return 5'd11; // (1,3)
5'd8: return 5'd4; // (2,0)
5'd9: return 5'd6; // (2,1)
5'd10: return 5'd12; // (2,2)
5'd11: return 5'd14; // (2,3)
5'd12: return 5'd5; // (3,0)
5'd13: return 5'd7; // (3,1)
5'd14: return 5'd13; // (3,2)
5'd15: return 5'd15; // (3,3)
5'd16: return 5'd16; // (4,0)
5'd17: return 5'd18; // (4,1)
5'd18: return 5'd24; // (4,2)
5'd19: return 5'd26; // (4,3)
5'd20: return 5'd17; // (5,0)
5'd21: return 5'd19; // (5,1)
5'd22: return 5'd25; // (5,2)
5'd23: return 5'd27; // (5,3)
5'd24: return 5'd20; // (6,0)
5'd25: return 5'd22; // (6,1)
5'd26: return 5'd28; // (6,2)
5'd27: return 5'd30; // (6,3)
5'd28: return 5'd21; // (7,0)
5'd29: return 5'd23; // (7,1)
5'd30: return 5'd29; // (7,2)
default: return 5'd31; // (7,3)
endcase
endfunction
// --------------------------------------------------------------
// columnTable16 (verbatim from pcsx2/GS/GSTables.cpp lines 91–109).
// Indexed [yb (0..7)][xb (0..15)] → halfword-within-block 0..127.
// yb=0: 0 2 8 10 16 18 24 26 1 3 9 11 17 19 25 27
// yb=1: 4 6 12 14 20 22 28 30 5 7 13 15 21 23 29 31
// yb=2: 32 34 40 42 48 50 56 58 33 35 41 43 49 51 57 59
// yb=3: 36 38 44 46 52 54 60 62 37 39 45 47 53 55 61 63
// yb=4: 64 66 72 74 80 82 88 90 65 67 73 75 81 83 89 91
// yb=5: 68 70 76 78 84 86 92 94 69 71 77 79 85 87 93 95
// yb=6: 96 98 104 106 112 114 120 122 97 99 105 107 113 115 121 123
// yb=7: 100 102 108 110 116 118 124 126 101 103 109 111 117 119 125 127
// --------------------------------------------------------------
function automatic logic [6:0] col_idx_psmct16(
input logic [2:0] yb,
input logic [3:0] xb);
// Pixel-within-block lookup. `yb` is the pixel row inside the block (0..7),
// `xb` the pixel column (0..15); the return value is the halfword index
// inside the block (0..127). The selector is {yb, xb} = yb*16 + xb, so each
// group of 16 consecutive arms is one row. The default arm handles selector
// 127, i.e. (yb=7, xb=15).
// Every arm below equals the bit permutation
// {yb[2], yb[1], xb[2], xb[1], yb[0], xb[0], xb[3]}; the table is kept
// spelled out so it can be diffed against the cited source.
case ({yb, xb})
// yb=0
7'd0: return 7'd0; 7'd1: return 7'd2; 7'd2: return 7'd8; 7'd3: return 7'd10;
7'd4: return 7'd16; 7'd5: return 7'd18; 7'd6: return 7'd24; 7'd7: return 7'd26;
7'd8: return 7'd1; 7'd9: return 7'd3; 7'd10: return 7'd9; 7'd11: return 7'd11;
7'd12: return 7'd17; 7'd13: return 7'd19; 7'd14: return 7'd25; 7'd15: return 7'd27;
// yb=1
7'd16: return 7'd4; 7'd17: return 7'd6; 7'd18: return 7'd12; 7'd19: return 7'd14;
7'd20: return 7'd20; 7'd21: return 7'd22; 7'd22: return 7'd28; 7'd23: return 7'd30;
7'd24: return 7'd5; 7'd25: return 7'd7; 7'd26: return 7'd13; 7'd27: return 7'd15;
7'd28: return 7'd21; 7'd29: return 7'd23; 7'd30: return 7'd29; 7'd31: return 7'd31;
// yb=2
7'd32: return 7'd32; 7'd33: return 7'd34; 7'd34: return 7'd40; 7'd35: return 7'd42;
7'd36: return 7'd48; 7'd37: return 7'd50; 7'd38: return 7'd56; 7'd39: return 7'd58;
7'd40: return 7'd33; 7'd41: return 7'd35; 7'd42: return 7'd41; 7'd43: return 7'd43;
7'd44: return 7'd49; 7'd45: return 7'd51; 7'd46: return 7'd57; 7'd47: return 7'd59;
// yb=3
7'd48: return 7'd36; 7'd49: return 7'd38; 7'd50: return 7'd44; 7'd51: return 7'd46;
7'd52: return 7'd52; 7'd53: return 7'd54; 7'd54: return 7'd60; 7'd55: return 7'd62;
7'd56: return 7'd37; 7'd57: return 7'd39; 7'd58: return 7'd45; 7'd59: return 7'd47;
7'd60: return 7'd53; 7'd61: return 7'd55; 7'd62: return 7'd61; 7'd63: return 7'd63;
// yb=4
7'd64: return 7'd64; 7'd65: return 7'd66; 7'd66: return 7'd72; 7'd67: return 7'd74;
7'd68: return 7'd80; 7'd69: return 7'd82; 7'd70: return 7'd88; 7'd71: return 7'd90;
7'd72: return 7'd65; 7'd73: return 7'd67; 7'd74: return 7'd73; 7'd75: return 7'd75;
7'd76: return 7'd81; 7'd77: return 7'd83; 7'd78: return 7'd89; 7'd79: return 7'd91;
// yb=5
7'd80: return 7'd68; 7'd81: return 7'd70; 7'd82: return 7'd76; 7'd83: return 7'd78;
7'd84: return 7'd84; 7'd85: return 7'd86; 7'd86: return 7'd92; 7'd87: return 7'd94;
7'd88: return 7'd69; 7'd89: return 7'd71; 7'd90: return 7'd77; 7'd91: return 7'd79;
7'd92: return 7'd85; 7'd93: return 7'd87; 7'd94: return 7'd93; 7'd95: return 7'd95;
// yb=6
7'd96: return 7'd96; 7'd97: return 7'd98; 7'd98: return 7'd104; 7'd99: return 7'd106;
7'd100: return 7'd112; 7'd101: return 7'd114; 7'd102: return 7'd120; 7'd103: return 7'd122;
7'd104: return 7'd97; 7'd105: return 7'd99; 7'd106: return 7'd105; 7'd107: return 7'd107;
7'd108: return 7'd113; 7'd109: return 7'd115; 7'd110: return 7'd121; 7'd111: return 7'd123;
// yb=7
7'd112: return 7'd100; 7'd113: return 7'd102; 7'd114: return 7'd108; 7'd115: return 7'd110;
7'd116: return 7'd116; 7'd117: return 7'd118; 7'd118: return 7'd124; 7'd119: return 7'd126;
7'd120: return 7'd101; 7'd121: return 7'd103; 7'd122: return 7'd109; 7'd123: return 7'd111;
7'd124: return 7'd117; 7'd125: return 7'd119; 7'd126: return 7'd125; default: return 7'd127;
endcase
endfunction
// Address composition for one PSMCT16 pixel: split (x, y) into
// page / block / pixel-in-block fields, look up the swizzled block number
// and the halfword slot inside the block, then sum the byte offsets.
// The field boundaries below imply a 64x64-pixel page made of 16x8-pixel
// blocks (x[5:4] = 4 block columns, y[5:3] = 8 block rows).
logic [11:0] page_x;
logic [11:0] page_y;
logic [2:0] by;
logic [1:0] bx;
logic [3:0] xb;
logic [2:0] yb;
assign page_x = x >> 6; // x / 64
assign page_y = y >> 6; // y / 64
assign by = y[5:3]; // (y % 64) / 8
assign bx = x[5:4]; // (x % 64) / 16
assign xb = x[3:0]; // x % 16
assign yb = y[2:0]; // y % 8
// Swizzled block number within the page (0..31).
logic [4:0] block_idx;
assign block_idx = swizzle_psmct16(by, bx);
// Halfword slot of the pixel within its block (0..127).
logic [6:0] hw_idx;
assign hw_idx = col_idx_psmct16(yb, xb);
logic [31:0] page_base;
logic [31:0] block_base;
logic [31:0] byte_in_block;
logic [31:0] page_index;
// fbw is used directly as the number of pages per framebuffer row
// (one page per 64 pixels of width, matching the x >> 6 split above).
// NOTE(review): the 26'd0 / 23'd0 pads only yield 32-bit operands if fbw is
// 6 bits and fbp is 9 bits — confirm against this module's port list.
assign page_index = ({20'd0, page_y} * {26'd0, fbw}) + {20'd0, page_x};
assign page_base = ({23'd0, fbp} << 11) + (page_index << 13); // FBP*2048 + page_index*8192
assign block_base = page_base + ({27'd0, block_idx} << 8); // + block_idx*256
// 24 + 7 + 1 = 32 bits: appending a zero LSB doubles hw_idx (2 bytes/pixel).
assign byte_in_block = {24'd0, hw_idx, 1'b0}; // hw_idx * 2
assign addr = block_base + byte_in_block;
endmodule : gs_swizzle_psmct16_stub
+148
View File
@@ -0,0 +1,148 @@
// retroDE_ps2 — gs_swizzle_psmct32_stub (Ch119)
//
// Pure-combinational PSMCT32 page/block/column swizzle: maps a pixel
// coordinate (x, y) within a framebuffer at (FBP, FBW) to its
// physical VRAM byte address using the real PS2 GS layout.
//
// THIS MODULE DOES NOT YET REPLACE THE LINEAR ADDRESSING in
// gs_stub / gs_pcrtc_stub / gif_image_xfer_stub. It is the
// math primitive that future chapters will wire into the
// existing address paths to swap "linear FBW*64*y + x*4"
// for the real GS swizzled addressing. Ch119 establishes the
// math, locks it against the canonical PCSX2 PSMCT32 block
// table with a focused TB, and leaves integration to follow-on
// chapters so the existing 109 TBs stay on the linear path.
//
// Real PS2 PSMCT32 layout (per PCSX2 GS source):
//   - VRAM is 4 MiB total, organized in 8 KiB pages.
//   - Each page is 64×32 PSMCT32 pixels (= 64×32×4 = 8192 bytes).
//   - Each page is divided into a 4×8 grid of blocks (4 rows of
//     blocks, 8 cols of blocks per row), each block 8×8 pixels
//     (= 256 bytes). 4×8 = 32 blocks/page.
//   - Block ordering within a page is NOT row-major; it follows
//     the PSMCT32 block table (blockTable32 in PCSX2's GSTables.cpp),
//     a Z-order-like permutation of (block_x, block_y).
//   - Pixel ordering within a block is NOT row-major either; it
//     follows PCSX2's columnTable32. A block is four "columns" of
//     8×2 pixels (64 bytes each); inside a column the two rows are
//     interleaved in 2-pixel groups:
//         yb=0:  0  1  4  5  8  9 12 13
//         yb=1:  2  3  6  7 10 11 14 15
//         yb=2: 16 17 20 21 24 25 28 29
//         yb=3: 18 19 22 23 26 27 30 31
//         yb=4: 32 33 36 37 40 41 44 45
//         yb=5: 34 35 38 39 42 43 46 47
//         yb=6: 48 49 52 53 56 57 60 61
//         yb=7: 50 51 54 55 58 59 62 63
//     i.e. word_idx = {yb[2:1], xb[2:1], yb[0], xb[0]}. Earlier
//     revisions of this stub used yb*32 + xb*4 here, which puts
//     every pixel in the right block but at the wrong word inside
//     it (only xb[0] landed on the correct address bit).
//
// Address formula (linear in pages, swizzled in blocks within a
// page, swizzled in words within a block):
//   pages_per_fbrow = FBW                 // FBW is in 64-px units; PSMCT32 page is 64 px wide
//   page_x          = x / 64
//   page_y          = y / 32
//   page_index      = page_y * pages_per_fbrow + page_x
//   page_base       = FBP*2048 + page_index*8192
//
//   block_x_in_page = (x % 64) / 8        // 0..7
//   block_y_in_page = (y % 32) / 8        // 0..3
//   block_idx       = SWIZZLE[block_y_in_page][block_x_in_page]
//   block_base      = page_base + block_idx*256
//
//   xb              = x % 8
//   yb              = y % 8
//   byte_in_block   = columnTable32[yb][xb] * 4
//
//   addr            = block_base + byte_in_block
//
// COLUMN_SWIZZLE parameter:
//   1 (default) — hardware-accurate columnTable32 placement as above.
//   0           — legacy row-major placement (byte_in_block =
//                 yb*32 + xb*4), kept only so testbenches that pinned
//                 the earlier intra-block offsets can be run unchanged
//                 by overriding the parameter. It does not match real
//                 VRAM contents.
//
// FBP is a 9-bit field at 2048-byte granularity, so FBP*2048
// can land at any 2048-byte boundary in VRAM — including mid-
// page boundaries (FBP[1:0] != 0). The math here treats FBP*2048
// as the literal byte base, with the swizzled page/block/pixel
// offset added on top, which matches real-PS2 behavior. Page-
// aligned FBP (FBP[1:0]==0) is the common case in our demo, but
// the address formula is bit-correct for any 2048-byte-aligned
// FBP, and the focused TB exercises non-page-aligned FBP=1,2,3
// to lock that.
`timescale 1ns/1ps
module gs_swizzle_psmct32_stub
#(
    // 1 = columnTable32 placement inside a block (real GS layout).
    // 0 = legacy row-major placement inside a block (yb*32 + xb*4).
    parameter bit COLUMN_SWIZZLE = 1'b1
)
(
    // Framebuffer config (matches FRAME_1 register fields).
    input  logic [8:0]  fbp,    // FBP — frame base, in 2048-byte units
    input  logic [5:0]  fbw,    // FBW — frame width, in 64-pixel units
    // Pixel coordinate within the framebuffer.
    input  logic [11:0] x,
    input  logic [11:0] y,
    // Resulting VRAM byte address.
    output logic [31:0] addr
);
    // --------------------------------------------------------------
    // Block swizzle for PSMCT32 (PCSX2 blockTable32). Indexed
    // [block_y_in_page (0..3)][block_x_in_page (0..7)]; value is the
    // linear block index within the page (0..31).
    //   by=0: { 0,  1,  4,  5, 16, 17, 20, 21}
    //   by=1: { 2,  3,  6,  7, 18, 19, 22, 23}
    //   by=2: { 8,  9, 12, 13, 24, 25, 28, 29}
    //   by=3: {10, 11, 14, 15, 26, 27, 30, 31}
    // Every entry is the same bit shuffle of {by, bx}, so the table is
    // expressed as wiring. Unlike a case with a catch-all default arm,
    // an X/Z coordinate bit stays unknown instead of aliasing to 31.
    // --------------------------------------------------------------
    function automatic logic [4:0] swizzle_psmct32(
        input logic [1:0] by,    // block_y_in_page (0..3)
        input logic [2:0] bx);   // block_x_in_page (0..7)
        return {bx[2], by[1], bx[1], by[0], bx[0]};
    endfunction

    // --------------------------------------------------------------
    // Column swizzle for PSMCT32 (PCSX2 columnTable32). Indexed
    // [yb (0..7)][xb (0..7)]; value is the 32-bit word slot within the
    // block (0..63). See the table in the file header.
    // --------------------------------------------------------------
    function automatic logic [5:0] col_idx_psmct32(
        input logic [2:0] yb,    // pixel row within the block
        input logic [2:0] xb);   // pixel column within the block
        return {yb[2:1], xb[2:1], yb[0], xb[0]};
    endfunction

    // Decompose pixel coord into page / block / pixel-in-block.
    logic [11:0] page_x;
    logic [11:0] page_y;
    logic [1:0]  by;
    logic [2:0]  bx;
    logic [2:0]  xb;
    logic [2:0]  yb;
    assign page_x = x >> 6;     // x / 64
    assign page_y = y >> 5;     // y / 32
    assign by     = y[4:3];     // (y % 32) / 8
    assign bx     = x[5:3];     // (x % 64) / 8
    assign xb     = x[2:0];     // x % 8
    assign yb     = y[2:0];     // y % 8

    logic [4:0] block_idx;
    assign block_idx = swizzle_psmct32(by, bx);

    logic [5:0] word_idx;
    assign word_idx = col_idx_psmct32(yb, xb);

    // Address composition.
    logic [31:0] page_base;
    logic [31:0] block_base;
    logic [31:0] byte_in_block;
    logic [31:0] page_index;
    assign page_index    = ({20'd0, page_y} * {26'd0, fbw}) + {20'd0, page_x};
    assign page_base     = ({23'd0, fbp} << 11) + (page_index << 13);   // FBP*2048 + page_index*8192
    assign block_base    = page_base + ({27'd0, block_idx} << 8);       // + block_idx*256
    // 24 + 6 + 2 = 32 bits: two zero LSBs scale the word slot to bytes.
    assign byte_in_block = COLUMN_SWIZZLE
                         ? {24'd0, word_idx, 2'b00}                     // columnTable32[yb][xb] * 4
                         : (({29'd0, yb} << 5) + ({29'd0, xb} << 2));   // legacy: yb*32 + xb*4
    assign addr          = block_base + byte_in_block;
endmodule : gs_swizzle_psmct32_stub
+339
View File
@@ -0,0 +1,339 @@
// retroDE_ps2 — gs_swizzle_psmt4_stub (Ch137)
//
// Pure-combinational PSMT4 page/block/column swizzle: maps a
// pixel coordinate (x, y) within a framebuffer at (FBP, FBW) to
// its physical VRAM byte address AND high/low nibble select
// using the real PS2 GS PSMT4 layout. Mirrors Ch119's PSMCT32 +
// Ch125's PSMCT16 + Ch131's PSMT8 stubs but with PSMT4's wider
// 32-px-wide block, 32×16 within-block nibble layout, and the
// half-byte addressing distinction (each PSMT4 pixel is 4 bits;
// two PSMT4 pixels share a byte). Output `nibble_hi` selects
// which nibble of the byte at `addr` the pixel occupies.
//
// THIS MODULE IS NOT YET WIRED INTO gs_pcrtc_stub /
// gif_image_xfer_stub / gs_stub. Future chapters will wire it
// behind a `PSMT4_SWIZZLE`-style parameter gate, mirroring the
// PSMCT32 (Ch120/121/122), PSMCT16 (Ch126/127/128), and PSMT8
// (Ch132/133/134) progressions. Default-off keeps the legacy
// linear PSMT4 TBs (Ch103, Ch106, Ch118, Ch107 e2e palette path)
// on the linear path. The existing per-bit write_mask 0x0F/0xF0
// nibble RMW from Ch106/Ch118 will still apply on top of the
// swizzled byte address — the swizzle doesn't touch the nibble
// merge logic.
//
// SOURCE-TABLE PROVENANCE (per Codex's Ch125/Ch131/Ch137 guidance):
// blockTable4 — pcsx2/GS/GSTables.cpp lines 61-69, master
// HEAD commit 3000e113e2b3a76357c08dfa80d3c747f40e2706
// (file blob SHA 3581209b8217378f473f9de22a9dbc8c45ca49b6).
// 8 rows × 4 cols, indexed [block_y][block_x].
// columnTable4 — pcsx2/GS/GSTables.cpp lines 147-213, same
// commit. 16 rows × 32 cols, indexed [yb][xb],
// values are nibble-within-block (0..511).
// Cross-check — GSLocalMemory.h:558 BlockNumber4 + the
// pxOffset template at GSTables.cpp:247-258
// (blockSize=512, pageSize=16384, pageWidth=128,
// note `blockSize` here is in NIBBLES; byte-
// grain pageSize = 8192 = 16384 nibbles / 2).
// PSMT4 has pageShiftX=7, pageShiftY=7,
// blockShiftX=5, blockShiftY=4,
// m_bwPg = bw >> (pageShiftX - 6) = bw >> 1
// (so FBW must be even for PSMT4 — PCSX2 asserts
// `(bw & 1) == 0` at GSLocalMemory.h:560).
// PCSX2's `bp` is in 256-byte block-pointer
// units; in our FBP (2048-byte) units,
// bp = FBP * 8, so bp*256 = FBP*2048.
//
// NOTE on PCSX2 license: the PCSX2 project is GPL-3.0+. This
// stub re-expresses the same PSMT4 swizzle math in SystemVerilog
// as a hardware contract — the values in the blockTable4 /
// columnTable4 case statements come from PCSX2 source and
// represent the PS2 hardware layout itself (not PCSX2-original
// creative content). The retroDE_ps2 project authors should
// consider whether this provenance affects licensing for
// downstream consumers; from an engineering correctness
// standpoint, locking against the canonical source is the only
// way to be byte-accurate to real PS2 VRAM.
//
// Real PS2 PSMT4 layout:
// - VRAM is 4 MiB total, organized in 8 KiB pages.
// - Each page is 128×128 PSMT4 pixels (= 128*128/2 = 8192
// bytes — 2 PSMT4 pixels per byte). 2× as many pixels per
// page as PSMT8 (128×64) in the same 8192-byte page.
// - Each page is divided into a 4×8 grid of blocks (4 cols of
// blocks across, 8 rows down). Each block is 32×16 PSMT4
// pixels (= 32*16/2 = 256 bytes = 512 nibbles). 4×8 = 32
// blocks/page (same number of blocks as the other PSMs).
// - Block ordering within a page follows blockTable4 (which
// happens to be identical to PSMCT16's blockTable16 — both
// PSMs share the same block grid orientation).
// - Within a block, NIBBLE placement follows columnTable4: a
// 32×16 → 512-entry permutation that organizes the internal
// 8-wide × 4-tall sub-columns and 4-tall row-groups of the
// block.
//
// Address formula (FBP in 2048-byte units; FBW in 64-pixel
// units; addr in BYTES; FBW must be even):
// page_x = x / 128
// page_y = y / 128
// bw_pg = FBW / 2 // pages per row
// page_index = page_y * bw_pg + page_x
// page_base = FBP*2048 + page_index*8192
//
// block_x_in_page = (x % 128) / 32 // 0..3
// block_y_in_page = (y % 128) / 16 // 0..7
// block_idx = blockTable4[block_y_in_page][block_x_in_page]
// block_base = page_base + block_idx*256
//
// xb = x % 32
// yb = y % 16
// nibble_idx = columnTable4[yb][xb] // 0..511
// byte_in_block = nibble_idx >> 1 // 0..255
// addr = block_base + byte_in_block
// nibble_hi = nibble_idx[0] // 0=low nibble, 1=high
`timescale 1ns/1ps
module gs_swizzle_psmt4_stub
(
    input  logic [8:0]  fbp,        // FBP — frame base, in 2048-byte units
    input  logic [5:0]  fbw,        // FBW — frame width, in 64-pixel units (must be even)
    input  logic [11:0] x,          // pixel column within the framebuffer
    input  logic [11:0] y,          // pixel row within the framebuffer
    output logic [31:0] addr,       // VRAM byte address holding the pixel
    output logic        nibble_hi   // 0 = pixel is the low nibble of that byte, 1 = high
);
    // --------------------------------------------------------------
    // blockTable4 (pcsx2/GS/GSTables.cpp lines 61-69).
    // Indexed [block_y_in_page (0..7)][block_x_in_page (0..3)].
    //   by=0: {  0,  2,  8, 10 }
    //   by=1: {  1,  3,  9, 11 }
    //   by=2: {  4,  6, 12, 14 }
    //   by=3: {  5,  7, 13, 15 }
    //   by=4: { 16, 18, 24, 26 }
    //   by=5: { 17, 19, 25, 27 }
    //   by=6: { 20, 22, 28, 30 }
    //   by=7: { 21, 23, 29, 31 }
    // Every entry is the same shuffle of the index bits:
    //   idx = {by[2], bx[1], by[1], bx[0], by[0]}
    // so the table is expressed as wiring. The earlier case-statement
    // form used its `default:` arm as entry 31, which also matched any
    // X/Z selector; here an unknown bit stays unknown in the result.
    // --------------------------------------------------------------
    function automatic logic [4:0] swizzle_psmt4(
        input logic [2:0] by,
        input logic [1:0] bx);
        return {by[2], bx[1], by[1], bx[0], by[0]};
    endfunction

    // --------------------------------------------------------------
    // columnTable4 (pcsx2/GS/GSTables.cpp lines 147-213).
    // Indexed [yb (0..15)][xb (0..31)] → nibble-within-block 0..511.
    //
    //   yb=0:    0   8  32  40  64  72  96 104   2  10  34  42  66  74  98 106
    //            4  12  36  44  68  76 100 108   6  14  38  46  70  78 102 110
    //   yb=1:   16  24  48  56  80  88 112 120  18  26  50  58  82  90 114 122
    //           20  28  52  60  84  92 116 124  22  30  54  62  86  94 118 126
    //   yb=2:   65  73  97 105   1   9  33  41  67  75  99 107   3  11  35  43
    //           69  77 101 109   5  13  37  45  71  79 103 111   7  15  39  47
    //   yb=3:   81  89 113 121  17  25  49  57  83  91 115 123  19  27  51  59
    //           85  93 117 125  21  29  53  61  87  95 119 127  23  31  55  63
    //   yb=4:  192 200 224 232 128 136 160 168 194 202 226 234 130 138 162 170
    //          196 204 228 236 132 140 164 172 198 206 230 238 134 142 166 174
    //   yb=5:  208 216 240 248 144 152 176 184 210 218 242 250 146 154 178 186
    //          212 220 244 252 148 156 180 188 214 222 246 254 150 158 182 190
    //   yb=6:  129 137 161 169 193 201 225 233 131 139 163 171 195 203 227 235
    //          133 141 165 173 197 205 229 237 135 143 167 175 199 207 231 239
    //   yb=7:  145 153 177 185 209 217 241 249 147 155 179 187 211 219 243 251
    //          149 157 181 189 213 221 245 253 151 159 183 191 215 223 247 255
    //   yb=8..15: the yb=0..7 rows with 256 added (idx[8] = yb[3]).
    //
    // Every entry follows one bit-level rule:
    //   idx[8] = yb[3]                   (+256: lower half of the block)
    //   idx[7] = yb[2]                   (+128)
    //   idx[6] = xb[2] ^ yb[1] ^ yb[2]   (+64, swapped on rows where
    //                                     yb[1] != yb[2])
    //   idx[5] = xb[1]                   (+32)
    //   idx[4] = yb[0]                   (+16)
    //   idx[3] = xb[0]                   (+8)
    //   idx[2] = xb[4]                   (+4)
    //   idx[1] = xb[3]                   (+2)
    //   idx[0] = yb[1]                   (+1: selects the high nibble)
    //
    // The earlier 512-entry case used `default:` as entry 511, so an
    // X/Z coordinate bit silently resolved to nibble 511 in simulation.
    // With the wiring form unknown bits propagate to the output.
    // --------------------------------------------------------------
    function automatic logic [8:0] col_idx_psmt4(
        input logic [3:0] yb,
        input logic [4:0] xb);
        return {yb[3],
                yb[2],
                xb[2] ^ yb[1] ^ yb[2],
                xb[1],
                yb[0],
                xb[0],
                xb[4],
                xb[3],
                yb[1]};
    endfunction

    // Decompose pixel coord into page / block / pixel-in-block.
    // Page = 128×128 pixels, block = 32×16 pixels (4 block columns,
    // 8 block rows per page).
    logic [11:0] page_x;
    logic [11:0] page_y;
    logic [2:0]  by;
    logic [1:0]  bx;
    logic [4:0]  xb;
    logic [3:0]  yb;
    logic [5:0]  bw_pg;
    assign page_x = x >> 7;     // x / 128
    assign page_y = y >> 7;     // y / 128
    assign by     = y[6:4];     // (y % 128) / 16
    assign bx     = x[6:5];     // (x % 128) / 32
    assign xb     = x[4:0];     // x % 32
    assign yb     = y[3:0];     // y % 16
    // Pages per framebuffer row. FBW counts 64-pixel units and a PSMT4
    // page is 128 pixels wide; an odd FBW is truncated here.
    assign bw_pg  = fbw >> 1;   // FBW / 2 (FBW must be even)

    logic [4:0] block_idx;
    assign block_idx = swizzle_psmt4(by, bx);

    logic [8:0] nibble_idx;
    assign nibble_idx = col_idx_psmt4(yb, xb);

    // Address composition.
    logic [31:0] page_base;
    logic [31:0] block_base;
    logic [31:0] page_index;
    assign page_index = ({20'd0, page_y} * {26'd0, bw_pg}) + {20'd0, page_x};
    assign page_base  = ({23'd0, fbp} << 11) + (page_index << 13);   // FBP*2048 + page_index*8192
    assign block_base = page_base + ({27'd0, block_idx} << 8);       // + block_idx*256
    // NOTE(review): {23'd0, nibble_idx[8:1]} is 31 bits wide; it is
    // zero-extended to 32 bits by the addition, so the result is correct.
    assign addr       = block_base + {23'd0, nibble_idx[8:1]};       // byte_in_block = nibble_idx >> 1
    assign nibble_hi  = nibble_idx[0];                               // 0=low, 1=high
endmodule : gs_swizzle_psmt4_stub
+259
View File
@@ -0,0 +1,259 @@
// retroDE_ps2 — gs_swizzle_psmt8_stub (Ch131)
//
// Pure-combinational PSMT8 page/block/column swizzle: maps a
// pixel coordinate (x, y) within a framebuffer at (FBP, FBW) to
// its physical VRAM byte address using the real PS2 GS PSMT8
// layout. Mirrors Ch119's `gs_swizzle_psmct32_stub` and Ch125's
// `gs_swizzle_psmct16_stub` shape, but with PSMT8's wider page
// (128 px vs 64 px), 8-cols × 4-rows page block grid, and the
// 16×16 within-block column table.
//
// THIS MODULE IS NOT YET WIRED INTO gs_pcrtc_stub /
// gif_image_xfer_stub / gs_stub. Future chapters will wire it
// behind a `PSMT8_SWIZZLE`-style parameter gate, mirroring the
// PSMCT32 (Ch120/121/122) and PSMCT16 (Ch126/127/128) progressions.
// Default-off keeps the legacy linear PSMT8 TBs (Ch96, Ch97,
// Ch103, Ch105, Ch107, Ch117) on the linear path.
//
// SOURCE-TABLE PROVENANCE (per Codex's Ch125/Ch131 guidance):
// blockTable8 — pcsx2/GS/GSTables.cpp lines 53-59, master
// HEAD commit 3000e113e2b3a76357c08dfa80d3c747f40e2706
// (file blob SHA 3581209b8217378f473f9de22a9dbc8c45ca49b6).
// 4 rows × 8 cols, indexed [block_y][block_x].
// columnTable8 — pcsx2/GS/GSTables.cpp lines 111-145, same
// commit. 16 rows × 16 cols, indexed [yb][xb],
// values are byte-within-block (0..255).
// Cross-check — GSLocalMemory.h line 551 BlockNumber8 +
// pxOffset template at GSTables.cpp lines 247-258
// (blockSize=256, pageSize=8192, pageWidth=128).
// PSMT8 has pageShiftX=7, pageShiftY=6,
// blockShiftX=4, blockShiftY=4,
// m_bwPg = bw >> (pageShiftX - 6) = bw >> 1
// (so FBW must be even for PSMT8 — PCSX2 asserts
// `(bw & 1) == 0` at GSLocalMemory.h:553).
// PCSX2's `bp` is in 256-byte block-pointer
// units; in our FBP (2048-byte) units,
// bp = FBP * 8, so bp*256 = FBP*2048.
//
// NOTE on PCSX2 license: the PCSX2 project is GPL-3.0+. This
// stub re-expresses the same PSMT8 swizzle math in SystemVerilog
// as a hardware contract — the values in the blockTable8 /
// columnTable8 case statements come from PCSX2 source and
// represent the PS2 hardware layout itself (not PCSX2-original
// creative content). The retroDE_ps2 project authors should
// consider whether this provenance affects licensing for
// downstream consumers; from an engineering correctness
// standpoint, locking against the canonical source is the only
// way to be byte-accurate to real PS2 VRAM.
//
// Real PS2 PSMT8 layout:
// - VRAM is 4 MiB total, organized in 8 KiB pages.
// - Each page is 128×64 PSMT8 pixels (= 128*64*1 = 8192 bytes).
// 2× as many pixels per page as PSMCT16 (which has 64×64 px)
// and 4× as many as PSMCT32 (64×32 px) because each PSMT8
// pixel is only 1 byte vs CT16's 2 vs CT32's 4.
// - Each page is divided into a 8×4 grid of blocks (8 cols of
// blocks across, 4 rows down). Each block is 16×16 PSMT8
// pixels (= 16*16*1 = 256 bytes). 8×4 = 32 blocks/page.
// - Block ordering within a page follows blockTable8.
// - Within a block, byte placement follows columnTable8: a
// 16×16 → 256-entry permutation that organizes 4 internal
// columns (4 wide each) × 4 internal row-groups (4 tall each)
// with intra-group y-pair interleaving.
//
// Address formula (FBP in 2048-byte units; FBW in 64-pixel
// units; addr in bytes; FBW must be even):
// page_x = x / 128
// page_y = y / 64
// bw_pg = FBW / 2 // pages per row
// page_index = page_y * bw_pg + page_x
// page_base = FBP*2048 + page_index*8192
//
// block_x_in_page = (x % 128) / 16 // 0..7
// block_y_in_page = (y % 64) / 16 // 0..3
// block_idx = blockTable8[block_y_in_page][block_x_in_page]
// block_base = page_base + block_idx*256
//
// xb = x % 16
// yb = y % 16
// byte_idx = columnTable8[yb][xb] // 0..255
// addr = block_base + byte_idx
`timescale 1ns/1ps
// gs_swizzle_psmt8_stub — purely combinational PSMT8 (1 byte/pixel) address
// generator. Given a pixel coordinate (x, y) and the frame base/width
// (fbp, fbw), it produces the byte address of that pixel in the
// page / block / column swizzled layout described in the header above.
// There is no clock and no state: `addr` follows the inputs through
// continuous assignments and two constant lookup functions.
module gs_swizzle_psmt8_stub
(
// NOTE(review): GS documentation commonly describes FRAME.FBP in units of
// 2048 *words* (8 KiB); this module scales fbp by 2048 *bytes* (fbp << 11,
// see page_base below) — confirm this matches what the callers pass in.
input logic [8:0] fbp, // FBP — frame base, in 2048-byte units
// Bit 0 of fbw is discarded by the `fbw >> 1` below, so an odd value is
// silently treated as (fbw - 1); no error is raised here.
input logic [5:0] fbw, // FBW — frame width, in 64-pixel units (must be even)
input logic [11:0] x, // pixel column
input logic [11:0] y, // pixel row
output logic [31:0] addr // swizzled byte address of pixel (x, y)
);
// --------------------------------------------------------------
// blockTable8 (verbatim from pcsx2/GS/GSTables.cpp lines 53-59).
// Indexed [block_y_in_page (0..3)][block_x_in_page (0..7)].
// by=0: { 0, 1, 4, 5,16,17,20,21}
// by=1: { 2, 3, 6, 7,18,19,22,23}
// by=2: { 8, 9,12,13,24,25,28,29}
// by=3: {10,11,14,15,26,27,30,31}
// --------------------------------------------------------------
// Block-order lookup: returns the block index (0..31) inside a page for
// block row `by` and block column `bx`. The case selector {by, bx} equals
// by*8 + bx, i.e. the row-major position in the table above. The last
// entry (by=3, bx=7 -> 31) is supplied by the `default` arm.
function automatic logic [4:0] swizzle_psmt8(
input logic [1:0] by,
input logic [2:0] bx);
case ({by, bx})
// by=0
5'd0: return 5'd0; 5'd1: return 5'd1; 5'd2: return 5'd4; 5'd3: return 5'd5;
5'd4: return 5'd16; 5'd5: return 5'd17; 5'd6: return 5'd20; 5'd7: return 5'd21;
// by=1
5'd8: return 5'd2; 5'd9: return 5'd3; 5'd10: return 5'd6; 5'd11: return 5'd7;
5'd12: return 5'd18; 5'd13: return 5'd19; 5'd14: return 5'd22; 5'd15: return 5'd23;
// by=2
5'd16: return 5'd8; 5'd17: return 5'd9; 5'd18: return 5'd12; 5'd19: return 5'd13;
5'd20: return 5'd24; 5'd21: return 5'd25; 5'd22: return 5'd28; 5'd23: return 5'd29;
// by=3
5'd24: return 5'd10; 5'd25: return 5'd11; 5'd26: return 5'd14; 5'd27: return 5'd15;
5'd28: return 5'd26; 5'd29: return 5'd27; 5'd30: return 5'd30; default: return 5'd31;
endcase
endfunction
// --------------------------------------------------------------
// columnTable8 (verbatim from pcsx2/GS/GSTables.cpp lines 111-145).
// Indexed [yb (0..15)][xb (0..15)] → byte-within-block 0..255.
// yb=0: 0 4 16 20 32 36 48 52 2 6 18 22 34 38 50 54
// yb=1: 8 12 24 28 40 44 56 60 10 14 26 30 42 46 58 62
// yb=2: 33 37 49 53 1 5 17 21 35 39 51 55 3 7 19 23
// yb=3: 41 45 57 61 9 13 25 29 43 47 59 63 11 15 27 31
// yb=4: 96 100 112 116 64 68 80 84 98 102 114 118 66 70 82 86
// yb=5: 104 108 120 124 72 76 88 92 106 110 122 126 74 78 90 94
// yb=6: 65 69 81 85 97 101 113 117 67 71 83 87 99 103 115 119
// yb=7: 73 77 89 93 105 109 121 125 75 79 91 95 107 111 123 127
// yb=8: 128 132 144 148 160 164 176 180 130 134 146 150 162 166 178 182
// yb=9: 136 140 152 156 168 172 184 188 138 142 154 158 170 174 186 190
// yb=10: 161 165 177 181 129 133 145 149 163 167 179 183 131 135 147 151
// yb=11: 169 173 185 189 137 141 153 157 171 175 187 191 139 143 155 159
// yb=12: 224 228 240 244 192 196 208 212 226 230 242 246 194 198 210 214
// yb=13: 232 236 248 252 200 204 216 220 234 238 250 254 202 206 218 222
// yb=14: 193 197 209 213 225 229 241 245 195 199 211 215 227 231 243 247
// yb=15: 201 205 217 221 233 237 249 253 203 207 219 223 235 239 251 255
// --------------------------------------------------------------
// Byte-in-block lookup: returns the byte offset (0..255) inside a 16x16
// block for pixel row `yb` and pixel column `xb`. The case selector
// {yb, xb} equals yb*16 + xb, i.e. the row-major position in the table
// above. The last entry (yb=15, xb=15 -> 255) is supplied by `default`.
function automatic logic [7:0] col_idx_psmt8(
input logic [3:0] yb,
input logic [3:0] xb);
case ({yb, xb})
// yb=0
8'd0: return 8'd0; 8'd1: return 8'd4; 8'd2: return 8'd16; 8'd3: return 8'd20;
8'd4: return 8'd32; 8'd5: return 8'd36; 8'd6: return 8'd48; 8'd7: return 8'd52;
8'd8: return 8'd2; 8'd9: return 8'd6; 8'd10: return 8'd18; 8'd11: return 8'd22;
8'd12: return 8'd34; 8'd13: return 8'd38; 8'd14: return 8'd50; 8'd15: return 8'd54;
// yb=1
8'd16: return 8'd8; 8'd17: return 8'd12; 8'd18: return 8'd24; 8'd19: return 8'd28;
8'd20: return 8'd40; 8'd21: return 8'd44; 8'd22: return 8'd56; 8'd23: return 8'd60;
8'd24: return 8'd10; 8'd25: return 8'd14; 8'd26: return 8'd26; 8'd27: return 8'd30;
8'd28: return 8'd42; 8'd29: return 8'd46; 8'd30: return 8'd58; 8'd31: return 8'd62;
// yb=2
8'd32: return 8'd33; 8'd33: return 8'd37; 8'd34: return 8'd49; 8'd35: return 8'd53;
8'd36: return 8'd1; 8'd37: return 8'd5; 8'd38: return 8'd17; 8'd39: return 8'd21;
8'd40: return 8'd35; 8'd41: return 8'd39; 8'd42: return 8'd51; 8'd43: return 8'd55;
8'd44: return 8'd3; 8'd45: return 8'd7; 8'd46: return 8'd19; 8'd47: return 8'd23;
// yb=3
8'd48: return 8'd41; 8'd49: return 8'd45; 8'd50: return 8'd57; 8'd51: return 8'd61;
8'd52: return 8'd9; 8'd53: return 8'd13; 8'd54: return 8'd25; 8'd55: return 8'd29;
8'd56: return 8'd43; 8'd57: return 8'd47; 8'd58: return 8'd59; 8'd59: return 8'd63;
8'd60: return 8'd11; 8'd61: return 8'd15; 8'd62: return 8'd27; 8'd63: return 8'd31;
// yb=4
8'd64: return 8'd96; 8'd65: return 8'd100; 8'd66: return 8'd112; 8'd67: return 8'd116;
8'd68: return 8'd64; 8'd69: return 8'd68; 8'd70: return 8'd80; 8'd71: return 8'd84;
8'd72: return 8'd98; 8'd73: return 8'd102; 8'd74: return 8'd114; 8'd75: return 8'd118;
8'd76: return 8'd66; 8'd77: return 8'd70; 8'd78: return 8'd82; 8'd79: return 8'd86;
// yb=5
8'd80: return 8'd104; 8'd81: return 8'd108; 8'd82: return 8'd120; 8'd83: return 8'd124;
8'd84: return 8'd72; 8'd85: return 8'd76; 8'd86: return 8'd88; 8'd87: return 8'd92;
8'd88: return 8'd106; 8'd89: return 8'd110; 8'd90: return 8'd122; 8'd91: return 8'd126;
8'd92: return 8'd74; 8'd93: return 8'd78; 8'd94: return 8'd90; 8'd95: return 8'd94;
// yb=6
8'd96: return 8'd65; 8'd97: return 8'd69; 8'd98: return 8'd81; 8'd99: return 8'd85;
8'd100: return 8'd97; 8'd101: return 8'd101; 8'd102: return 8'd113; 8'd103: return 8'd117;
8'd104: return 8'd67; 8'd105: return 8'd71; 8'd106: return 8'd83; 8'd107: return 8'd87;
8'd108: return 8'd99; 8'd109: return 8'd103; 8'd110: return 8'd115; 8'd111: return 8'd119;
// yb=7
8'd112: return 8'd73; 8'd113: return 8'd77; 8'd114: return 8'd89; 8'd115: return 8'd93;
8'd116: return 8'd105; 8'd117: return 8'd109; 8'd118: return 8'd121; 8'd119: return 8'd125;
8'd120: return 8'd75; 8'd121: return 8'd79; 8'd122: return 8'd91; 8'd123: return 8'd95;
8'd124: return 8'd107; 8'd125: return 8'd111; 8'd126: return 8'd123; 8'd127: return 8'd127;
// yb=8
8'd128: return 8'd128; 8'd129: return 8'd132; 8'd130: return 8'd144; 8'd131: return 8'd148;
8'd132: return 8'd160; 8'd133: return 8'd164; 8'd134: return 8'd176; 8'd135: return 8'd180;
8'd136: return 8'd130; 8'd137: return 8'd134; 8'd138: return 8'd146; 8'd139: return 8'd150;
8'd140: return 8'd162; 8'd141: return 8'd166; 8'd142: return 8'd178; 8'd143: return 8'd182;
// yb=9
8'd144: return 8'd136; 8'd145: return 8'd140; 8'd146: return 8'd152; 8'd147: return 8'd156;
8'd148: return 8'd168; 8'd149: return 8'd172; 8'd150: return 8'd184; 8'd151: return 8'd188;
8'd152: return 8'd138; 8'd153: return 8'd142; 8'd154: return 8'd154; 8'd155: return 8'd158;
8'd156: return 8'd170; 8'd157: return 8'd174; 8'd158: return 8'd186; 8'd159: return 8'd190;
// yb=10
8'd160: return 8'd161; 8'd161: return 8'd165; 8'd162: return 8'd177; 8'd163: return 8'd181;
8'd164: return 8'd129; 8'd165: return 8'd133; 8'd166: return 8'd145; 8'd167: return 8'd149;
8'd168: return 8'd163; 8'd169: return 8'd167; 8'd170: return 8'd179; 8'd171: return 8'd183;
8'd172: return 8'd131; 8'd173: return 8'd135; 8'd174: return 8'd147; 8'd175: return 8'd151;
// yb=11
8'd176: return 8'd169; 8'd177: return 8'd173; 8'd178: return 8'd185; 8'd179: return 8'd189;
8'd180: return 8'd137; 8'd181: return 8'd141; 8'd182: return 8'd153; 8'd183: return 8'd157;
8'd184: return 8'd171; 8'd185: return 8'd175; 8'd186: return 8'd187; 8'd187: return 8'd191;
8'd188: return 8'd139; 8'd189: return 8'd143; 8'd190: return 8'd155; 8'd191: return 8'd159;
// yb=12
8'd192: return 8'd224; 8'd193: return 8'd228; 8'd194: return 8'd240; 8'd195: return 8'd244;
8'd196: return 8'd192; 8'd197: return 8'd196; 8'd198: return 8'd208; 8'd199: return 8'd212;
8'd200: return 8'd226; 8'd201: return 8'd230; 8'd202: return 8'd242; 8'd203: return 8'd246;
8'd204: return 8'd194; 8'd205: return 8'd198; 8'd206: return 8'd210; 8'd207: return 8'd214;
// yb=13
8'd208: return 8'd232; 8'd209: return 8'd236; 8'd210: return 8'd248; 8'd211: return 8'd252;
8'd212: return 8'd200; 8'd213: return 8'd204; 8'd214: return 8'd216; 8'd215: return 8'd220;
8'd216: return 8'd234; 8'd217: return 8'd238; 8'd218: return 8'd250; 8'd219: return 8'd254;
8'd220: return 8'd202; 8'd221: return 8'd206; 8'd222: return 8'd218; 8'd223: return 8'd222;
// yb=14
8'd224: return 8'd193; 8'd225: return 8'd197; 8'd226: return 8'd209; 8'd227: return 8'd213;
8'd228: return 8'd225; 8'd229: return 8'd229; 8'd230: return 8'd241; 8'd231: return 8'd245;
8'd232: return 8'd195; 8'd233: return 8'd199; 8'd234: return 8'd211; 8'd235: return 8'd215;
8'd236: return 8'd227; 8'd237: return 8'd231; 8'd238: return 8'd243; 8'd239: return 8'd247;
// yb=15
8'd240: return 8'd201; 8'd241: return 8'd205; 8'd242: return 8'd217; 8'd243: return 8'd221;
8'd244: return 8'd233; 8'd245: return 8'd237; 8'd246: return 8'd249; 8'd247: return 8'd253;
8'd248: return 8'd203; 8'd249: return 8'd207; 8'd250: return 8'd219; 8'd251: return 8'd223;
8'd252: return 8'd235; 8'd253: return 8'd239; 8'd254: return 8'd251; default: return 8'd255;
endcase
endfunction
// Decompose pixel coord into page / block / pixel-in-block.
// All of these are plain bit-slices / shifts of x and y, because every
// divisor in the layout (128, 64, 16) is a power of two.
logic [11:0] page_x;
logic [11:0] page_y;
logic [1:0] by;
logic [2:0] bx;
logic [3:0] xb;
logic [3:0] yb;
logic [5:0] bw_pg;
assign page_x = x >> 7; // x / 128
assign page_y = y >> 6; // y / 64
assign by = y[5:4]; // (y % 64) / 16
assign bx = x[6:4]; // (x % 128) / 16
assign xb = x[3:0]; // x % 16
assign yb = y[3:0]; // y % 16
assign bw_pg = fbw >> 1; // FBW / 2 (FBW must be even)
// Table lookups: which block inside the page, and which byte inside the block.
logic [4:0] block_idx;
assign block_idx = swizzle_psmt8(by, bx);
logic [7:0] byte_idx;
assign byte_idx = col_idx_psmt8(yb, xb);
// Final address assembly. Every operand is explicitly zero-extended to
// 32 bits before the multiply / shift so no intermediate result is
// truncated to a narrower self-determined width.
logic [31:0] page_base;
logic [31:0] block_base;
logic [31:0] page_index;
assign page_index = ({20'd0, page_y} * {26'd0, bw_pg}) + {20'd0, page_x};
assign page_base = ({23'd0, fbp} << 11) + (page_index << 13); // FBP*2048 + page_index*8192
assign block_base = page_base + ({27'd0, block_idx} << 8); // + block_idx*256
assign addr = block_base + {24'd0, byte_idx}; // + byte_idx (1 byte/pixel)
endmodule : gs_swizzle_psmt8_stub
+74
View File
@@ -0,0 +1,74 @@
// retroDE_ps2 — gs_texel_addr
//
// Texture-sampling address generator (brick 1, step 1 of GS texturing).
//
// Given a texture coordinate (u,v) and the TEX0 texture descriptor, computes
// the LINEAR VRAM byte address of that texel — the read-side mirror of the
// rasterizer's existing framebuffer-address math (gs_stub.sv ~line 530:
// fb_addr = base + (Y*FBW*64 + X) * bytes_per_pixel ).
//
// Linear (non-swizzled) only, on purpose: the swizzle paths in gs_stub are
// param-gated OFF by default, so linear is the baseline. Swizzled texel
// addressing will reuse the existing gs_swizzle_* modules later.
//
// `base_byte_addr` is the texture base in VRAM, ALREADY scaled to bytes by
// the caller from TEX0.TBP0. Keeping the base as a byte input (rather than
// scaling TBP0 here) isolates the one thing that must be reconciled with the
// texture-UPLOAD path (gif_image_xfer_stub / BITBLTBUF) — so we read texels
// from exactly where BITBLT wrote them. That reconciliation is tracked as the
// next integration step; this module's (u,v)->offset math is unambiguous and
// unit-tested below.
`timescale 1ns/1ps
// gs_texel_addr — combinational LINEAR texel address generator.
//
// Converts a texel coordinate (u, v) into a VRAM byte address relative to
// `base_byte_addr`, assuming a row pitch of TBW*64 texels and a per-format
// texel size selected by `psm`. For PSMT4 (two texels per byte) `nibble_hi`
// reports whether the texel occupies the high nibble of the addressed byte.
// Any `psm` value not listed below is addressed as 4 bytes/texel.
module gs_texel_addr #(
    parameter int ADDR_W = 32
) (
    input  logic [31:0]       base_byte_addr,  // texture base in VRAM (bytes)
    input  logic [10:0]       u,               // texel column (0..2047)
    input  logic [10:0]       v,               // texel row    (0..2047)
    input  logic [13:0]       tbw,             // TEX0.TBW — texels-per-row / 64
    input  logic [5:0]        psm,             // pixel storage mode
    output logic [ADDR_W-1:0] texel_byte_addr,
    output logic              nibble_hi        // PSMT4: high nibble of the byte?
);
    // Storage-mode encodings recognised by this module.
    localparam logic [5:0] PSMCT32 = 6'h00;
    localparam logic [5:0] PSMCT16 = 6'h02;
    localparam logic [5:0] PSMT8   = 6'h13;
    localparam logic [5:0] PSMT4   = 6'h14;

    // Row pitch in texels: TBW * 64, formed by appending six zero bits.
    logic [19:0] pitch_texels;
    assign pitch_texels = {tbw, 6'b000000};

    // Row-major texel index: v * pitch + u (evaluated at 32 bits).
    logic [31:0] texel_index;
    assign texel_index = (v * pitch_texels) + {21'd0, u};

    always_comb begin
        // Only PSMT4 packs two texels per byte; every other mode reports 0.
        nibble_hi = 1'b0;

        if (psm == PSMCT32) begin
            texel_byte_addr = base_byte_addr + (texel_index << 2);  // 4 B/texel
        end else if (psm == PSMCT16) begin
            texel_byte_addr = base_byte_addr + (texel_index << 1);  // 2 B/texel
        end else if (psm == PSMT8) begin
            texel_byte_addr = base_byte_addr + texel_index;         // 1 B/texel
        end else if (psm == PSMT4) begin
            texel_byte_addr = base_byte_addr + (texel_index >> 1);  // 4 b/texel
            nibble_hi       = texel_index[0];
        end else begin
            // Unlisted storage modes fall back to 4 B/texel addressing.
            texel_byte_addr = base_byte_addr + (texel_index << 2);
        end
    end
endmodule : gs_texel_addr
+196
View File
@@ -0,0 +1,196 @@
// ============================================================================
// gs_texture_cache.sv (Ch322 — PREFILLED texture cache, correctness-first)
//
// Proves texture bytes can live in FPGA-private LPDDR4B and be consumed by the
// GS sampler through an on-chip RAM at the EXISTING 1-cycle texel latency.
//
// This is a PREFILLED cache, NOT a demand cache. The whole known texture is
// filled from LPDDR into an on-chip RAM ONCE, before rendering (mirroring the
// Ch321 line-buffer trick: warm a bounded buffer, then serve at native latency).
// Every sampler read is therefore a HIT at 1-cycle latency — the nearest-path
// sampler's fixed-latency contract (gs_texture_unit valid_pipe, RD_LATENCY=1) is
// preserved with ZERO pipeline/stall surgery. Demand miss/stall is explicitly
// DEFERRED to a later chapter (it would be a raster-walker pipeline redesign).
//
// Two clock domains (same split as gs_lpddr_scanout_lb):
// axi_clk (emif_clk) — fill FSM: single-beat 256-bit reads (arlen=0, the
// only read pattern proven on this EMIF) from LPDDR.
// sample_clk (design) — the sampler's texel read port; 1-cycle registered,
// byte-identical timing to the vram_bram_stub read2.
//
// The fill is one-shot before raster, so the on-chip RAM is static when the
// sampler reads it (write side idle) — a plain dual-clock simple-dual-port RAM,
// no read/write CDC hazard. `fill_done` is 2-FF synced into sample_clk so the
// read mux only goes live after the texture is fully resident.
// ============================================================================
`timescale 1ns/1ps
// gs_texture_cache — prefilled on-chip texture RAM with an AXI fill engine.
//
// Fill side (axi_clk): every toggle of `fill_start` loads N_BEATS single-beat
// 256-bit AXI reads, starting at LPDDR_TEX_BASE, into `tex_mem`, one 32-bit
// word per cycle. `fill_done` rises when the last word has been written.
//
// Re-arm handling: a `fill_start` toggle that arrives while a fill is already
// in flight is latched (`rearm_pend`) and honoured at the next beat boundary
// by restarting the fill from LPDDR_TEX_BASE. `fill_done` is never asserted
// for the superseded fill, so the host cannot observe "done" for stale data.
// When no toggle arrives mid-fill the sequencing is unchanged.
//
// Sample side (sample_clk): 1-cycle registered 32-bit read of `tex_mem`,
// addressed by (tex_rd_addr - TEX_VRAM_BASE) / 4 truncated to the RAM depth.
module gs_texture_cache #(
    parameter [29:0] LPDDR_TEX_BASE = 30'd0,    // byte base of the texture in LPDDR4B
    parameter [31:0] TEX_VRAM_BASE  = 32'd2048, // VRAM byte base the sampler addresses (TBP0*256)
    parameter int    TEX_BYTES      = 256,      // texture size in bytes (8x8 PSMCT32 = 256)
    parameter int    N_BEATS        = 8         // TEX_BYTES / 32
)(
    // ---- AXI read clock domain (emif_clk) — fill side ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    input  logic         fill_start,   // TOGGLE (bridge domain, CDC-synced): each edge (re)fills
    output logic         fill_done,    // texture fully resident (until the next fill arm)
    output logic [31:0]  fill_beats,   // beats completed in the current fill
    output logic [31:0]  fill_bytes,   // bytes filled in the current fill
    output logic [31:0]  rd_errs,      // non-OKAY read responses in the current fill
    output logic [31:0]  fill_crc,     // Ch352 — sum32 of EVERY 32-bit word actually written into tex_mem. The
                                       // host compares this to the file's sum32 to PROVE tex_mem integrity on
                                       // silicon (the LPDDR readback only proves LPDDR, not the cache contents).
    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,        // not examined: every read is a single beat (arlen=0)
    input  logic         rvalid,
    output logic         rready,
    // ---- sampler clock domain (design_clk) — texel read port ----
    input  logic         sample_clk,
    input  logic         tex_rd_en,    // sampler issues a texel read this cycle
    input  logic [31:0]  tex_rd_addr,  // VRAM byte address (TEX_VRAM_BASE + offset)
    output logic [31:0]  tex_rd_data,  // 1-cycle REGISTERED texel word (matches read2)
    output logic         tex_ready     // fill_done synced into sample_clk (mux gate)
);
    localparam int TEX_WORDS = TEX_BYTES/4;                            // 32-bit words in the cache
    localparam int WIDX_BITS = (TEX_WORDS <= 1) ? 1 : $clog2(TEX_WORDS);
    localparam int BIDX_BITS = (N_BEATS   <= 1) ? 1 : $clog2(N_BEATS);

    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd4;    // distinct id: writer=0, probe=1, frame-cache=2, line-buf=3, tex-fill=4
    assign arlen   = 8'd0;    // SINGLE-BEAT (the only AXI read pattern proven on this EMIF)
    assign arsize  = 3'b101;  // 32 bytes

    // On-chip texture RAM: written by the fill FSM (axi_clk), read by the sampler
    // (sample_clk). One-shot warm fill => static during reads => no CDC hazard.
    //
    // One ordinary 32-bit simple-dual-port RAM. History of cache geometry vs Quartus Place:
    //   flat 8-write array  -> exploded to flops (707k ALUT)
    //   8x 8192x32 banks    -> width fragmentation, 344/358, 9h Place thrash
    //   1x 8192x256 RAM     -> count OK (320/358) but one ~104-M20K macro too WIDE -> Place 40min+, no QDB
    //   2x 8192x128 halves  -> still rigid, Place 50min+ no progress
    //   4x 8192x64 banks    -> 328/358; Place still stalled with ample RAM after read2 was removed.
    //   1x 65536x32 (HERE)  -> latch each AXI beat, drain 8 lanes over 8 axi_clk cycles.
    // Serializing the fill removes the multi-bank/multi-write geometry while preserving the sampler's
    // one-cycle registered 32-bit read. The one-shot fill is still tiny compared with board startup.
    (* ramstyle = "M20K" *) logic [31:0] tex_mem [0:TEX_WORDS-1];

    // ================= fill side (axi_clk) =================
    typedef enum logic [2:0] { F_IDLE, F_AR, F_R, F_DRAIN, F_DONE } fstate_t;
    fstate_t fst;
    logic [$clog2(N_BEATS):0] beat;            // 0..N_BEATS
    logic [255:0]             fill_data_q;     // captured AXI beat being drained
    logic [2:0]               fill_lane;       // which 32-bit lane of fill_data_q is written next
    logic [WIDX_BITS-1:0]     fill_word_base;  // word index of lane 0 for the current beat
    wire  [WIDX_BITS-1:0]     fill_word_idx = fill_word_base + WIDX_BITS'(fill_lane);

    // fill_start is an EDGE/TOGGLE (bridge toggles it on each arm), CDC-synced here so the
    // cache is RE-FILLABLE: each arm reloads the texture (lets the HPS re-stage a different
    // texture without a board reset). 3-FF sync + edge-detect, like the read/write probes.
    logic [2:0] fs_sync;
    wire        fs_edge = (fs_sync[2] != fs_sync[1]);

    // A toggle seen while the FSM is busy is remembered here instead of being lost.
    logic rearm_pend;
    wire  rearm_req = fs_edge || rearm_pend;

    // A (re)start may only be taken where no AXI transaction is outstanding:
    // at rest (IDLE/DONE), or on the cycle the last lane of a beat is drained.
    wire  at_rest   = (fst == F_IDLE)  || (fst == F_DONE);
    wire  beat_end  = (fst == F_DRAIN) && (fill_lane == 3'd7);
    wire  fill_go   = rearm_req && (at_rest || beat_end);

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            fst            <= F_IDLE;
            araddr         <= '0;
            arvalid        <= 1'b0;
            rready         <= 1'b0;
            beat           <= '0;
            fill_done      <= 1'b0;
            fill_beats     <= 32'd0;
            fill_bytes     <= 32'd0;
            rd_errs        <= 32'd0;
            fs_sync        <= 3'd0;
            fill_data_q    <= '0;
            fill_lane      <= 3'd0;
            fill_word_base <= '0;
            fill_crc       <= 32'd0;
            rearm_pend     <= 1'b0;
        end else begin
            fs_sync <= {fs_sync[1:0], fill_start};

            case (fst)
                // At rest: nothing to do until fill_go (handled after the case).
                // F_DONE keeps the texture resident until the next arm.
                F_IDLE, F_DONE: ;

                F_AR: begin
                    if (arready) begin
                        arvalid <= 1'b0;
                        rready  <= 1'b1;
                        fst     <= F_R;
                    end
                end

                F_R: begin
                    if (rvalid) begin
                        // Capture the AXI beat, then issue one M20K-native 32-bit write per cycle.
                        fill_data_q    <= rdata;
                        fill_lane      <= 3'd0;
                        fill_word_base <= {beat[BIDX_BITS-1:0], 3'b000};
                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                        rready <= 1'b0;
                        fst    <= F_DRAIN;
                    end
                end

                F_DRAIN: begin
                    tex_mem[fill_word_idx] <= fill_data_q[fill_lane*32 +: 32];
                    fill_crc <= fill_crc + fill_data_q[fill_lane*32 +: 32];  // sum32 over the words written
                    if (fill_lane == 3'd7) begin
                        fill_beats <= fill_beats + 32'd1;
                        fill_bytes <= fill_bytes + 32'd32;
                        if (beat == N_BEATS-1) begin
                            fill_done <= 1'b1;
                            fst       <= F_DONE;
                        end else begin
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;   // next single-beat read
                            arvalid <= 1'b1;
                            fst     <= F_AR;
                        end
                    end else begin
                        fill_lane <= fill_lane + 3'd1;
                    end
                end

                default: fst <= F_IDLE;
            endcase

            // (Re)start of a fill. Placed AFTER the case so that, on a beat-end
            // restart, these assignments override the ones made in F_DRAIN above
            // (last non-blocking assignment wins) — in particular fill_done stays 0.
            if (fill_go) begin
                araddr     <= LPDDR_TEX_BASE;
                beat       <= '0;
                fill_done  <= 1'b0;     // re-arm: drop ready until reloaded
                fill_beats <= 32'd0;
                fill_bytes <= 32'd0;
                rd_errs    <= 32'd0;
                fill_crc   <= 32'd0;    // Ch352 — restart the tex_mem integrity sum for this fill
                arvalid    <= 1'b1;
                fst        <= F_AR;
                rearm_pend <= 1'b0;
            end else if (fs_edge) begin
                // Toggle arrived mid-transaction: remember it for the next beat boundary.
                rearm_pend <= 1'b1;
            end
        end
    end

    // ================= sampler side (sample_clk) =================
    // 1-cycle REGISTERED read, identical timing to vram_bram_stub.read2:
    //   present (tex_rd_addr) when tex_rd_en, data lands next cycle.
    // The word offset is truncated to WIDX_BITS, so addresses outside the
    // texture alias back into the RAM rather than being rejected.
    wire [31:0]          word_off = (tex_rd_addr - TEX_VRAM_BASE) >> 2;
    wire [WIDX_BITS-1:0] rd_word  = word_off[WIDX_BITS-1:0];
    always_ff @(posedge sample_clk) begin
        if (tex_rd_en) tex_rd_data <= tex_mem[rd_word];
    end

    // fill_done -> sample_clk (2-FF). The read mux only goes live once warm.
    logic [1:0] done_sync;
    always_ff @(posedge sample_clk) done_sync <= {done_sync[0], fill_done};
    assign tex_ready = done_sync[1];
endmodule
+757
View File
@@ -0,0 +1,757 @@
// retroDE_ps2 — gs_texture_unit
//
// Per-pixel texture sampler (brick 1, the texturing datapath core).
//
// Takes a per-pixel texture coordinate (u,v) + the TEX0 descriptor, fetches
// the texel from VRAM through a read port, and outputs the sampled color,
// pipelined to absorb the VRAM read latency.
//
// (u,v,valid) --[gs_texel_addr]--> byte addr --> VRAM read port
// | (RD_LATENCY cyc)
// sampled color <--[decode]-- tex_rd_data
//
// v1 scope (kept deliberately minimal so it's fully verifiable now):
// - PSMCT32 only (32-bit ABGR texels, direct — no CLUT).
// - DECAL texture function (texel replaces fragment color).
//
// Ch296 — PSMT8 indexed texturing (this chapter):
// - When psm==PSMT8 (0x13) the fetched 32-bit word holds FOUR packed
// 8-bit indices. The byte for this texel is selected by the texel
// byte address' low 2 bits (gs_texel_addr emits a 1-byte/texel
// address for PSMT8). That index drives a CLUT lookup port; the
// returned PSMCT32 entry is the texel color (DECAL).
// - The lookup is COMBINATIONAL (clut_stub's read port is comb), so it
// lands in the SAME cycle as the direct PSMCT32 path — the existing
// single S1->S2 register in gs_stub aligns it with emit unchanged.
// - PSMCT32 (psm==0x00) behavior is byte-identical to before.
// Next versions add: PSMCT16 unpack, PSMT4 (nibble) + CLUT, swizzle, and
// MODULATE/HIGHLIGHT tex functions.
//
// The VRAM read port here is generic (byte address out, 32-bit word in,
// fixed RD_LATENCY). Integration wires it to vram_stub's spare read port;
// vram_stub's exact address convention is reconciled at integration time.
`timescale 1ns/1ps
module gs_texture_unit #(
// Ch298 — SWIZZLED PSMT4 texture sampling. When PSMT4_SWIZZLE=1 AND the
// texture psm==PSMT4, the texel byte address + nibble_hi are computed by
// gs_swizzle_psmt4_stub (the SAME proven module already on the framebuffer
// WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT4 block layout,
// instead of the linear gs_texel_addr. LINEAR is the default (0) so every
// existing linear PSMT4/PSMT8/PSMCT32 demo + TB is byte-identical. The
// swizzled address feeds the SAME word-aligned read, byte-lane extract,
// nibble select, and CLUT lookup — only the address GENERATION differs.
// Because the swizzled address (its low 2 bits + nibble_hi) is also
// address-derived, it flows through the SAME SEL_DELAY pipe as the linear
// selectors, so registered-read (TEX_RD_REGISTERED=1) alignment is reused
// verbatim. PSMT8/PSMCT32 always take the linear address (this rung is
// PSMT4-only).
parameter bit PSMT4_SWIZZLE = 1'b0,
// Ch299 — SWIZZLED PSMT8 texture sampling. The sibling of PSMT4_SWIZZLE,
// MINUS the nibble (PSMT8 is 1 byte/texel). When PSMT8_SWIZZLE=1 AND the
// texture psm==PSMT8, the texel byte address is computed by
// gs_swizzle_psmt8_stub (the SAME proven module already on the framebuffer
// WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT8 block layout,
// instead of the linear gs_texel_addr. LINEAR is the default (0) so every
// existing linear PSMT8/PSMT4/PSMCT32 demo + TB is byte-identical. The
// swizzled address feeds the SAME word-aligned read, byte-lane extract, and
// CLUT lookup — only the address GENERATION differs. Because the swizzled
// address' low 2 bits (byte-lane selector) are also address-derived, they
// flow through the SAME SEL_DELAY pipe as the linear selectors, so
// registered-read (TEX_RD_REGISTERED=1) alignment is reused verbatim. NO
// nibble pipe is needed — PSMT8 has no nibble. PSMT4/PSMCT32 always take
// their own address (this rung is PSMT8-only).
parameter bit PSMT8_SWIZZLE = 1'b0,
// Ch300 — SWIZZLED PSMCT32 (direct-color) texture sampling. The closure
// rung of the swizzle layout family. When PSMCT32_SWIZZLE=1 AND the texture
// psm==PSMCT32, the texel byte address is computed by gs_swizzle_psmct32_stub
// (the SAME proven module already on the framebuffer WRITE / SCANOUT / UPLOAD
// paths — Ch119/Ch122) using the real PS2 PSMCT32 page/block layout, instead
// of the linear gs_texel_addr. Unlike PSMT4/PSMT8 this needs NO CLUT and NO
// byte-lane select: PSMCT32 is 4 bytes/texel, so the swizzled address is
// already word-aligned and the fetched 32-bit word IS the color directly
// (tex_color = tex_rd_data). LINEAR is the default (0) so every existing
// linear PSMCT32 demo + TB (textured / tritex) is byte-identical. This is
// the SAME single-param-per-format gate as PSMCT32_SWIZZLE on the FB side,
// so a PSMCT32 texture and a PSMCT32 framebuffer swizzle together.
parameter bit PSMCT32_SWIZZLE = 1'b0,
// Ch294 — GS texture WRAP MODES (REPEAT + CLAMP). When TEX_WRAP_ENABLE=1
// the per-pixel (u,v) are resolved against the texture's power-of-two
// dimensions (width=2^TW, height=2^TH from TEX0) using the CLAMP_1 wrap
// mode (WMS for u/S, WMT for v/T): 0=REPEAT (u & (width-1)), 1=CLAMP
// (u>=width -> width-1). REGION_* (2/3) are NOT modelled and pass through.
// The wrap is applied to u/v BEFORE address generation, so it covers the
// linear path AND every swizzle path. With TEX_WRAP_ENABLE=0 (default)
// u_eff===u and v_eff===v as a compile-time constant, so the wrap logic is
// pruned and every existing consumer is BYTE-IDENTICAL.
parameter bit TEX_WRAP_ENABLE = 1'b0,
// Ch308 — BILINEAR (4-tap) texture filtering, PSMCT32-only this rung.
// When BILINEAR_ENABLE=1 AND psm==PSMCT32 the sampler runs a 4-beat read
// FSM: it fetches the 4 texels surrounding the fractional coord
// (u,v) (u+1,v) (u,v+1) (u+1,v+1)
// — each independently wrapped/clamped through the SAME u_eff/v_eff
// machinery (so edge taps repeat/clamp instead of reading outside the
// texture) — then blends them per channel (R,G,B,A) by the 4-bit
// fractional u_frac/v_frac (0..15, /16) using a >>4 fixed-point lerp.
// For !BILINEAR_ENABLE (default) OR psm!=PSMCT32 the EXACT current
// single-read NEAREST path is used and u_frac/v_frac are ignored, so the
// synthesized logic and every existing consumer is BYTE-IDENTICAL (the
// bilinear FSM, the per-beat coord select, and the blend datapath are all
// pruned as compile-time-dead when BILINEAR_ENABLE=0). Bilinear is
// PSMCT32-only by default; with PALETTE_BILINEAR=1 (Ch314) it also covers
// PSMT8/PSMT4 via per-tap CLUT-before-interp. At PALETTE_BILINEAR=0 the
// indexed textures still take the nearest path even with BILINEAR_ENABLE=1.
//
// ALPHA: the alpha channel is INTERPOLATED with the same 4-tap lerp as
// R/G/B (not pass-through-nearest). For an opaque texture (all taps a=255)
// this returns 255 exactly; for a texel-center sample (u_frac=v_frac=0) it
// returns the (u,v) tap's alpha exactly.
parameter bit BILINEAR_ENABLE = 1'b0,
// Ch314 — BILINEAR for PALETTIZED (indexed) textures. When
// PALETTE_BILINEAR=1 (and BILINEAR_ENABLE=1) the 4-tap path also runs for
// PSMT8 (0x13) and PSMT4 (0x14). The CRITICAL rule is CLUT-BEFORE-INTERP:
// each of the 4 taps fetches an INDEX, that index is CLUT'd to an RGBA
// color (the existing combinational clut_rd_idx/clut_rd_data port), and the
// 4 COLORS are then interpolated — NOT the indices. This falls out of
// capturing `near_color` per tap (clut_rd_data for indexed, tex_rd_data for
// PSMCT32) instead of the raw word. Swizzled addressing + wrap/clamp run in
// the SAME per-tap addr-gen that already feeds the nearest path, so they
// happen BEFORE the index/CLUT lookup. Default 0 → indexed textures stay
// nearest even with BILINEAR_ENABLE=1, so every existing build is
// byte-identical (the combined path only ever fed PSMCT32 textures anyway).
parameter bit PALETTE_BILINEAR = 1'b0,
parameter int RD_LATENCY = 1, // VRAM read latency in clk cycles
// Ch296 — PSMT8 byte-lane realignment. The byte selected from the
// fetched word must use the LOW 2 bits of the address that was ISSUED
// for the returned data. When the texel ADDRESS advances every cycle
// while a read is in flight (gs_stub TEX_RD_REGISTERED=1: address
// taken from the S0 walker, registered read returns 1 cycle later),
// the current `addr` no longer matches the in-flight word, so the
// selector must be delayed by SEL_DELAY cycles to re-pair them. When
// the address is HELD stable across the read (combinational read port,
// address from the stable S1 latch), SEL_DELAY=0 and the current addr
// is correct. Driven from gs_stub as TEX_RD_REGISTERED?TEX_RD_LATENCY:0.
parameter int SEL_DELAY = 0
) (
input logic clk,
input logic rst_n,
// per-pixel texture coordinate in
input logic in_valid,
input logic [10:0] u,
input logic [10:0] v,
// Ch308 — fractional texture coords for BILINEAR (4-bit, 0..15 => /16).
// Unused at default (BILINEAR_ENABLE=0) and for non-PSMCT32 psm.
input logic [3:0] u_frac,
input logic [3:0] v_frac,
// Ch310 — RUNTIME filter select (per-primitive TEX1_1.MMAG). When
// BILINEAR_ENABLE=1 the 4-tap path runs ONLY when (is_ct32 && filter_lin);
// with filter_lin=0 (TEX1.MMAG=0 NEAREST) the sampler falls back to the
// exact nearest single-read path (busy stays 0). Unused at
// BILINEAR_ENABLE=0 (g_nearest), so the default build is byte-identical.
input logic filter_lin,
// Ch294 — wrap-mode controls (CLAMP_1 WMS/WMT + TEX0 TW/TH). Unused at
// default (TEX_WRAP_ENABLE=0) since u_eff/v_eff collapse to u/v.
input logic [1:0] wms,
input logic [1:0] wmt,
input logic [3:0] tw,
input logic [3:0] th,
// TEX0 descriptor
input logic [31:0] tbp0_base_bytes, // texture base in VRAM (bytes)
input logic [13:0] tbw, // TEX0.TBW (texels/row / 64)
input logic [5:0] psm, // pixel storage mode
// VRAM texel read port
output logic tex_rd_en,
output logic [31:0] tex_rd_addr, // byte address
input logic [31:0] tex_rd_data, // 32-bit word, valid RD_LATENCY later
// Ch296 — CLUT lookup port (PSMT8 indexed texturing). The extracted
// 8-bit index drives `clut_rd_idx`; the parent wires this to
// clut_stub's second (combinational) read port and returns the
// PSMCT32 entry on `clut_rd_data`. Unused for PSMCT32 textures.
output logic [7:0] clut_rd_idx,
input logic [31:0] clut_rd_data, // PSMCT32 entry for clut_rd_idx
// sampled color out (aligned with out_valid)
output logic out_valid,
output logic [31:0] tex_color, // ABGR8888
// Ch308 — BILINEAR busy: high while the 4-beat read sequence is in flight
// (the caller must not issue a new in_valid until it drops / out_valid
// pulses). Always 0 on the nearest path (BILINEAR_ENABLE=0 or non-PSMCT32),
// so a caller that ignores it sees byte-identical behavior.
output logic busy
);
localparam logic [5:0] PSM_PSMCT32 = 6'h00;
localparam logic [5:0] PSM_PSMT8 = 6'h13;
localparam logic [5:0] PSM_PSMT4 = 6'h14;
// --- Ch294: wrap-mode resolution (u/v -> u_eff/v_eff) ---
// Applied BEFORE any address generation so it covers the linear path AND
// every swizzle path. width=2^TW, height=2^TH (both powers of two), so
// REPEAT is a mask and CLAMP is a >width-1 saturate. u/v are unsigned so
// there is no negative/underflow case to handle. REGION_* (2/3) pass
// through unchanged (not modelled this rung). At TEX_WRAP_ENABLE=0 this is
// a constant pass-through (u_eff===u, v_eff===v) -> byte-identical.
// Ch308 — the coord that FEEDS the wrap. On the nearest path (bilinear off
// or non-PSMCT32) this is the port u/v UNCHANGED, so the wrap output
// (u_eff/v_eff) and everything downstream is byte-identical. On the
// bilinear path it is the current beat's neighbor coord (u+du[k],v+dv[k]),
// so each of the 4 taps is independently wrapped/clamped. `bili_active` is
// a compile-time constant 0 when BILINEAR_ENABLE=0, so u_in===u / v_in===v
// collapses away at the default build.
logic bili_active; // declared below; bilinear running for this psm
logic [10:0] beat_u, beat_v; // declared below; current beat neighbor coord
logic [10:0] u_in, v_in;
// Select the coordinate that feeds the wrap stage. Default is the port
// coordinate (nearest path); the bilinear path overrides it with the
// current beat's neighbour coordinate.
always_comb begin
    u_in = u;
    v_in = v;
    if (BILINEAR_ENABLE && bili_active) begin
        u_in = beat_u;
        v_in = beat_v;
    end
end
logic [10:0] u_eff, v_eff;
logic [10:0] u_wmask, v_wmask; // width-1 / height-1
logic [10:0] u_wlimit, v_wlimit;
// Wrap-mode resolve: (u_in, v_in) -> (u_eff, v_eff).
//   mask/limit = 2^TW - 1 / 2^TH - 1
//   mode 0 (REPEAT) : coord & mask
//   mode 1 (CLAMP)  : saturate at limit
//   other modes     : pass-through
// With TEX_WRAP_ENABLE=0 both axes are an unconditional pass-through.
always_comb begin
    u_wmask  = (11'd1 << tw) - 11'd1;
    v_wmask  = (11'd1 << th) - 11'd1;
    u_wlimit = u_wmask;
    v_wlimit = v_wmask;
    // Default for every case not handled below.
    u_eff = u_in;
    v_eff = v_in;
    if (TEX_WRAP_ENABLE) begin
        // U axis
        if (wms == 2'd0)
            u_eff = u_in & u_wmask;
        else if (wms == 2'd1)
            u_eff = (u_in > u_wlimit) ? u_wlimit : u_in;
        // V axis
        if (wmt == 2'd0)
            v_eff = v_in & v_wmask;
        else if (wmt == 2'd1)
            v_eff = (v_in > v_wlimit) ? v_wlimit : v_in;
    end
end
// --- linear address (combinational) ---
logic [31:0] lin_addr;
logic lin_nibble_hi; // PSMT4: this texel is the HIGH nibble of its byte
gs_texel_addr #(.ADDR_W(32)) u_addr (
.base_byte_addr (tbp0_base_bytes),
.u (u_eff),
.v (v_eff),
.tbw (tbw),
.psm (psm),
.texel_byte_addr(lin_addr),
.nibble_hi (lin_nibble_hi)
);
// --- swizzled PSMT4 address (combinational) ---
// EXACTLY mirrors the texture-UPLOAD path (gif_image_xfer_stub Ch139):
// the swizzle module is fed FBP=0 so it emits only the WITHIN-TEXTURE
// byte OFFSET, and the texture base (tbp0_base_bytes) is ADDED on top.
// This makes the sampled address bit-identical to the uploaded one for
// ANY 256-byte-aligned base (using the swizzle module's `fbp` input here
// would discard the low 11 bits of a non-2048-aligned base). FBW=TBW (in
// 64-texel units); PSMT4 swizzle needs FBW even (bw_pg = FBW>>1). The
// texture's (u,v) ARE the swizzle (x,y). Output is byte-offset + nibble_hi
// — the SAME shape gs_texel_addr emits for linear PSMT4, so downstream
// (word-align, byte-lane, nibble select, CLUT) is untouched.
logic [31:0] swz_off;
logic [31:0] swz_addr;
logic swz_nibble_hi;
generate
    if (!PSMT4_SWIZZLE) begin : g_no_swizzle4
        // PSMT4 swizzle compiled out: park the unused nets at zero.
        assign swz_off       = 32'd0;
        assign swz_addr      = 32'd0;
        assign swz_nibble_hi = 1'b0;
    end else begin : g_swizzle4
        // fbp is tied to 0, so the stub's addr output is used as an offset
        // that is added to the texture base below.
        gs_swizzle_psmt4_stub u_swizzle4 (
            .x         ({1'b0, u_eff}),
            .y         ({1'b0, v_eff}),
            .fbw       (tbw[5:0]),
            .fbp       (9'd0),
            .addr      (swz_off),
            .nibble_hi (swz_nibble_hi)
        );
        assign swz_addr = tbp0_base_bytes + swz_off;
    end
endgenerate
// --- swizzled PSMT8 address (combinational) ---
// Ch299 — EXACTLY mirrors the PSMT4-swizzle sampler arm above (and the
// PSMT8 UPLOAD path in gif_image_xfer_stub Ch133), MINUS the nibble.
// gs_swizzle_psmt8_stub is fed FBP=0 so it emits only the WITHIN-TEXTURE
// byte OFFSET; the texture base (tbp0_base_bytes) is ADDED on top. This
// makes the sampled address bit-identical to the uploaded one for ANY
// 256-byte-aligned base. FBW=TBW (in 64-texel units); the PSMT8 swizzle
// needs FBW even (bw_pg = FBW>>1). The texture's (u,v) ARE the swizzle
// (x,y). Output is a byte address — the SAME shape gs_texel_addr emits for
// linear PSMT8 — so downstream (word-align, byte-lane, CLUT) is untouched.
// No nibble_hi: PSMT8 is one full byte per texel.
logic [31:0] swz8_off;
logic [31:0] swz8_addr;
generate
    if (!PSMT8_SWIZZLE) begin : g_no_swizzle8
        // PSMT8 swizzle compiled out: park the unused nets at zero.
        assign swz8_off  = 32'd0;
        assign swz8_addr = 32'd0;
    end else begin : g_swizzle8
        // fbp is tied to 0, so the stub's addr output is used as an offset
        // that is added to the texture base below. No nibble output here.
        gs_swizzle_psmt8_stub u_swizzle8 (
            .x    ({1'b0, u_eff}),
            .y    ({1'b0, v_eff}),
            .fbw  (tbw[5:0]),
            .fbp  (9'd0),
            .addr (swz8_off)
        );
        assign swz8_addr = tbp0_base_bytes + swz8_off;
    end
endgenerate
// --- swizzled PSMCT32 address (combinational) ---
// Ch300 — direct-color sibling of the PSMT4/PSMT8 swizzle arms above, using
// the SAME proven gs_swizzle_psmct32_stub already on the FB WRITE / SCANOUT
// / UPLOAD paths. Fed FBP=0 so it emits only the WITHIN-TEXTURE byte OFFSET;
// the texture base (tbp0_base_bytes) is ADDED on top, making the sampled
// address bit-identical to the uploaded one for ANY 2048-byte-aligned base.
// FBW=TBW (in 64-pixel units — PSMCT32 page is 64 px wide, so TBW units
// match the stub's fbw directly, NO >>1). The texture's (u,v) ARE the
// swizzle (x,y). Output is a 4-byte-aligned byte address — gs_texel_addr's
// PSMCT32 shape — so downstream is untouched. NO nibble, NO byte-lane, NO
// CLUT: the fetched word is the color (tex_color = tex_rd_data).
logic [31:0] swz32_off;
logic [31:0] swz32_addr;
generate
    if (!PSMCT32_SWIZZLE) begin : g_no_swizzle32
        // PSMCT32 swizzle compiled out: park the unused nets at zero.
        assign swz32_off  = 32'd0;
        assign swz32_addr = 32'd0;
    end else begin : g_swizzle32
        // fbp is tied to 0, so the stub's addr output is used as an offset
        // that is added to the texture base below.
        gs_swizzle_psmct32_stub u_swizzle32 (
            .x    ({1'b0, u_eff}),
            .y    ({1'b0, v_eff}),
            .fbw  (tbw[5:0]),
            .fbp  (9'd0),
            .addr (swz32_off)
        );
        assign swz32_addr = tbp0_base_bytes + swz32_off;
    end
endgenerate
// --- linear-vs-swizzled select ---
// Swizzle applies to a PSMT4 texture when PSMT4_SWIZZLE is set, to a PSMT8
// texture when PSMT8_SWIZZLE is set, and to a PSMCT32 texture when
// PSMCT32_SWIZZLE is set; every other psm always takes the linear address,
// and the three swizzle gates are mutually exclusive by psm. With all three
// params 0 the selects are constant-false, so the synthesized logic — and
// every linear TB/demo — is byte-identical.
logic use_swizzle4;
logic use_swizzle8;
logic use_swizzle32;
logic [31:0] addr;
logic nibble_hi;
assign use_swizzle4 = (PSMT4_SWIZZLE != 1'b0) && (psm == PSM_PSMT4);
assign use_swizzle8 = (PSMT8_SWIZZLE != 1'b0) && (psm == PSM_PSMT8);
assign use_swizzle32 = (PSMCT32_SWIZZLE != 1'b0) && (psm == PSM_PSMCT32);
// Only the PSMT4 path carries a nibble; PSMT8/PSMCT32 swizzle and the
// linear fallback have none.
assign addr = use_swizzle4 ? swz_addr :
use_swizzle8 ? swz8_addr :
use_swizzle32 ? swz32_addr : lin_addr;
assign nibble_hi = use_swizzle4 ? swz_nibble_hi : lin_nibble_hi;
// Nearest-path read enable / address. These are muxed at the module
// outputs (tex_rd_en/tex_rd_addr) below: on the nearest path they ARE the
// outputs (byte-identical); on the bilinear path the FSM drives the
// outputs instead. The word-align mask is a no-op for PSMCT32.
logic near_rd_en;
logic [31:0] near_rd_addr;
assign near_rd_en = in_valid;
// The VRAM read port is 32-bit WORD-addressed (and vram_bram_stub's
// read2 only returns data for word-aligned addresses). PSMCT32 texel
// addresses are already word-aligned; PSMT8 byte addresses are not, so
// present the word-aligned address and recover the right byte via the
// low 2 bits (sel_lo) in the index extract below. Masking is a no-op
// for PSMCT32, so that path stays byte-identical.
assign near_rd_addr = addr & ~32'd3;
// --- PSMT8 index extract ---
// gs_texel_addr returns a 1-byte/texel address for PSMT8, so the
// fetched 32-bit word (read at addr & ~3 by the word-addressed VRAM
// port) packs 4 indices; the issued address' low 2 bits select which
// byte is THIS texel.
//
// The byte selector uses the addr[1:0] from the issue cycle of the
// returned word. SEL_DELAY (see the param comment) is 0 when the
// address is held stable across the read (current addr is correct) and
// >0 when the address advances while the read is in flight (delay the
// selector to re-pair it with the in-flight word). `sel_lo` carries it.
//
// PSMT4 (Ch297) adds a NIBBLE selector on top of the byte selector.
// gs_texel_addr emits a byte address (texel_offset>>1) plus `nibble_hi`
// (= texel_offset[0]: even texel -> LOW nibble, odd -> HIGH nibble). The
// selected byte (via sel_lo, exactly as PSMT8) holds TWO 4-bit indices;
// nibble_hi picks which. Because nibble_hi is derived from the texel
// ADDRESS — which advances every cycle while a read is in flight under
// TEX_RD_REGISTERED=1 — it must be SEL_DELAY-aligned by the SAME pipe
// depth as sel_lo so it re-pairs with the returned word. (Same class as
// the PSMT8 byte-lane realignment; get it wrong and odd/even texels smear.)
logic [1:0] sel_lo;
logic nib_sel; // SEL_DELAY-aligned nibble_hi
generate
    if (SEL_DELAY != 0) begin : g_sel_reg
        // SEL_DELAY-deep shift registers: stage 0 samples the current
        // addr[1:0] / nibble_hi every clock, the last stage drives the
        // byte-lane and nibble selectors.
        logic [1:0] sel_pipe [0:SEL_DELAY-1];
        logic       nib_pipe [0:SEL_DELAY-1];
        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                for (int s = 0; s < SEL_DELAY; s++) begin
                    sel_pipe[s] <= 2'd0;
                    nib_pipe[s] <= 1'b0;
                end
            end else begin
                // Non-blocking shifts: statement order is irrelevant.
                for (int s = SEL_DELAY-1; s > 0; s--) begin
                    sel_pipe[s] <= sel_pipe[s-1];
                    nib_pipe[s] <= nib_pipe[s-1];
                end
                sel_pipe[0] <= addr[1:0];
                nib_pipe[0] <= nibble_hi;
            end
        end
        assign sel_lo  = sel_pipe[SEL_DELAY-1];
        assign nib_sel = nib_pipe[SEL_DELAY-1];
    end else begin : g_sel_comb
        // No delay: selectors follow the live address directly.
        assign sel_lo  = addr[1:0];
        assign nib_sel = nibble_hi;
    end
endgenerate
// Byte select (shared by PSMT8 and PSMT4): pick the texel's byte lane.
logic [7:0] sel_byte;
// Byte-lane pick: sel_lo chooses which of the four bytes of the fetched
// word is this texel's byte (lane 3 is also the fall-through).
always_comb begin
    if (sel_lo == 2'b00)
        sel_byte = tex_rd_data[ 7: 0];
    else if (sel_lo == 2'b01)
        sel_byte = tex_rd_data[15: 8];
    else if (sel_lo == 2'b10)
        sel_byte = tex_rd_data[23:16];
    else
        sel_byte = tex_rd_data[31:24];
end
// Nibble select for PSMT4 (4-bit index, zero-extended to 8 bits so the
// SAME clut_rd_idx port + clut_stub feed it; CLUT entries 0..15 used).
// iverilog-12: no bit-select on a parenthesized expr, so split into a
// named net first, then index it.
logic [7:0] sel_byte_for_nib;
assign sel_byte_for_nib = sel_byte;
logic [3:0] psmt4_nibble;
assign psmt4_nibble = nib_sel ? sel_byte_for_nib[7:4] : sel_byte_for_nib[3:0];
// Index out: PSMT4 -> zero-extended nibble; PSMT8 -> full byte.
assign clut_rd_idx = (psm == PSM_PSMT4) ? {4'd0, psmt4_nibble} : sel_byte;
// --- valid pipeline matching the read latency ---
// in_valid presented with the address this cycle; tex_rd_data for it
// arrives RD_LATENCY cycles later. Delay valid by the same amount.
logic [RD_LATENCY-1:0] valid_pipe;
// RD_LATENCY-deep valid shift register: stage 0 samples in_valid, each
// later stage copies its predecessor. For RD_LATENCY == 1 the loop body
// never executes, leaving just the stage-0 sample.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        valid_pipe <= '0;
    end else begin
        valid_pipe[0] <= in_valid;
        for (int stage = 1; stage < RD_LATENCY; stage++)
            valid_pipe[stage] <= valid_pipe[stage-1];
    end
end
logic near_out_valid;
assign near_out_valid = valid_pipe[RD_LATENCY-1];
// --- decode (DECAL) ---
// PSMT4 : texel color = CLUT[nibble] (indexed indirection)
// PSMT8 : texel color = CLUT[byte index] (indexed indirection)
// PSMCT32 : texel word IS the color directly (byte-identical to v1)
logic [31:0] near_color;
assign near_color = (psm == PSM_PSMT8 || psm == PSM_PSMT4)
? clut_rd_data : tex_rd_data;
// ========================================================================
// Ch308 — BILINEAR (4-tap) PSMCT32 FILTER
// ========================================================================
// When BILINEAR_ENABLE=1 and psm==PSMCT32 we sample the 4 texels around the
// fractional coord and blend them. The whole block is wrapped in a generate
// that is empty when BILINEAR_ENABLE=0, so the default build is pruned to
// exactly the nearest path and is BYTE-IDENTICAL.
//
// CYCLE SCHEDULE (RD_LATENCY-aware; example RD_LATENCY=L):
// T0 : caller asserts in_valid (with u,v,u_frac,v_frac). FSM in
// IDLE latches u/v/frac, sets beat index k=0, drives
// bili_active=1, busy=1, moves to ISSUE.
// T0+ (ISSUE) : present neighbor[k] coord (beat_u/beat_v -> wrap ->
// gs_texel_addr -> tex_rd_addr) and pulse tex_rd_en for 1
// cycle; start an L-cycle wait; -> WAIT.
// ISSUE+1..+L : WAIT counts L cycles; on the L-th cycle tex_rd_data holds
// beat[k]'s 32-bit ABGR word -> capture into tap[k].
// If k<3: k++ and -> ISSUE (next neighbor). If k==3: -> DONE.
// DONE : combinationally lerp the 4 captured taps by u_frac/v_frac
// per channel; assert out_valid for 1 cycle with tex_color;
// drop busy; -> IDLE.
// => total ~ 4*(1+L)+1 cycles per filtered sample. Throughput is NOT a
// goal here (a later texture-cache pass collapses the 4 reads).
//
// Neighbor table (k -> du,dv): 0->(0,0) 1->(1,0) 2->(0,1) 3->(1,1).
// Each neighbor coord is fed through the SAME u_eff/v_eff wrap (via
// u_in/v_in above) so edge taps repeat/clamp and never read outside the
// texture (proven in the TB clamp/repeat cases).
//
// lerp(a,b,f) = a + (($signed({1'b0,b}) - $signed({1'b0,a})) * $signed({1'b0,f})) >>> 4
// with f the 4-bit frac (0..15 => /16). a,b are 8-bit channels. The
// bracketed product is computed in a SIGNED temp (no bit-select on a
// parenthesized expr — iverilog-12 rule), then arithmetic-shifted >>>4,
// then defensively clamped to 0..255.
generate
if (BILINEAR_ENABLE) begin : g_bilinear
localparam logic [1:0] BS_IDLE = 2'd0;
localparam logic [1:0] BS_ISSUE = 2'd1;
localparam logic [1:0] BS_WAIT = 2'd2;
localparam logic [1:0] BS_DONE = 2'd3;
logic [1:0] state;
logic [1:0] beat; // which neighbor 0..3
logic [31:0] wait_cnt; // counts RD_LATENCY
logic [31:0] tap [0:3]; // captured ABGR per neighbor
logic [10:0] lat_u, lat_v; // latched coord for this sample
logic [3:0] lat_uf, lat_vf; // latched fracs
// is this a PSMCT32 sample? bilinear runs for PSMCT32 always, and (Ch314)
// for PSMT8/PSMT4 when PALETTE_BILINEAR=1; any other psm falls back to the
// nearest path even with BILINEAR_ENABLE=1.
logic is_ct32;
logic is_indexed;
logic bili_psm_ok;
assign is_ct32 = (psm == PSM_PSMCT32);
assign is_indexed = (psm == PSM_PSMT8) || (psm == PSM_PSMT4);
assign bili_psm_ok = is_ct32 || (PALETTE_BILINEAR && is_indexed);
// Ch310 — RUNTIME filter gate. The 4-tap path runs ONLY for a PSMCT32
// texture whose primitive selected LINEAR magnification (filter_lin=1,
// i.e. TEX1.MMAG=1). With filter_lin=0 (NEAREST) we fall back to the
// single-read nearest path even with BILINEAR_ENABLE=1, so an
// MMAG=0 primitive stays nearest. `do_lin` is the single predicate that
// selects the bilinear datapath everywhere below.
//
// NOTE on the `!== 1'b0` test: it makes filter_lin DEFAULT-ON when the
// port is left UNCONNECTED (sim Z). The standalone tb_gs_texture_bilinear
// exercises the 4-tap path directly without driving filter_lin, so an
// unconnected input must keep bilinear running (Z !== 0 → true). A
// driven 0 (gs_stub MMAG=0) gives nearest; a driven 1 gives bilinear.
// In synthesis filter_lin is always driven by gs_stub, so this reduces
// to a plain `bili_psm_ok && filter_lin`.
logic do_lin;
assign do_lin = bili_psm_ok && (filter_lin !== 1'b0);
// bili_active (read by the wrap mux above): high whenever a filtered
// PSMCT32 sample is being processed by the FSM (ISSUE/WAIT/DONE) so the
// wrap consumes the per-beat neighbor coord. When do_lin=0 it is low so
// the wrap uses the port u/v (nearest), byte-identical to the
// non-bilinear coord path.
assign bili_active = do_lin;
// neighbor delta for the current beat
logic [10:0] du, dv;
// Neighbour offset table, indexed by beat:
//   0 -> (0,0)   1 -> (1,0)   2 -> (0,1)   anything else -> (1,1)
always_comb begin
    if (beat == 2'd0) begin
        du = 11'd0; dv = 11'd0;
    end else if (beat == 2'd1) begin
        du = 11'd1; dv = 11'd0;
    end else if (beat == 2'd2) begin
        du = 11'd0; dv = 11'd1;
    end else begin
        du = 11'd1; dv = 11'd1;
    end
end
// beat coord feeds the wrap (u_in/v_in). In IDLE (before latching) use
// the live ports so the first ISSUE sees neighbor 0 of the live coord;
// once latched, use the latched coord.
// Per-beat neighbour coordinate fed to the wrap stage (via u_in/v_in).
// Outside IDLE the base is the latched sample coordinate; in IDLE it is
// the live port coordinate. The offset is always the current beat's
// (du,dv) — note `beat` is only re-armed to 0 on the IDLE->ISSUE edge, so
// in IDLE after a completed sample the offset is whatever beat was left at.
// bi_rd_en is low in IDLE, so no read is issued from that value.
always_comb begin
    beat_u = lat_u + du;
    beat_v = lat_v + dv;
    if (state == BS_IDLE) begin
        beat_u = u + du;
        beat_v = v + dv;
    end
end
// The bilinear read address reuses the SAME addr-gen (gs_texel_addr via
// the u_eff/v_eff wrap fed by beat_u/beat_v). near_rd_addr already is
// (addr & ~3) for the currently-selected coord; for PSMCT32 the linear
// path is used and it is word-aligned. We pulse rd_en only on ISSUE.
logic bi_rd_en;
assign bi_rd_en = (state == BS_ISSUE);
// Bilinear sequencer: IDLE -> (ISSUE -> WAIT) x4 -> DONE -> IDLE.
//   - in_valid is only examined in IDLE; a request presented in any other
//     state is not latched.
//   - `beat` is written only when a sample is armed (-> 0) and when a beat
//     completes with beat != 3 (-> beat+1), so it still reads 3 in DONE and
//     in the IDLE that follows.
//   - wait_cnt is loaded with 1 in ISSUE, so the capture happens in the
//     WAIT cycle in which wait_cnt first reaches RD_LATENCY.
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
state <= BS_IDLE;
beat <= 2'd0;
wait_cnt <= 32'd0;
lat_u <= 11'd0; lat_v <= 11'd0;
lat_uf <= 4'd0; lat_vf <= 4'd0;
for (int i = 0; i < 4; i++) tap[i] <= 32'd0;
end else begin
unique case (state)
BS_IDLE: begin
// Arm a filtered sample: snapshot coord + fracs, restart at neighbor 0.
if (in_valid && do_lin) begin
lat_u <= u; lat_v <= v;
lat_uf <= u_frac; lat_vf <= v_frac;
beat <= 2'd0;
state <= BS_ISSUE;
end
end
BS_ISSUE: begin
// address presented this cycle (combinationally via
// beat -> beat_u/beat_v -> wrap -> addr). Begin the
// RD_LATENCY wait.
wait_cnt <= 32'd1;
state <= BS_WAIT;
end
BS_WAIT: begin
if (wait_cnt >= RD_LATENCY[31:0]) begin
// tex_rd_data now holds beat's word. Capture the
// resolved COLOR (`near_color`): for PSMCT32 that is
// the raw word (byte-identical to the original);
// for PSMT8/PSMT4 (Ch314) it is clut_rd_data — the
// index extracted from this beat's word (sel_byte /
// psmt4_nibble, stable across the held beat) then CLUT'd.
// Capturing the CLUT'd color per tap is what makes the
// downstream lerp interpolate COLORS, not indices.
tap[beat] <= near_color;
if (beat == 2'd3) begin
// Fourth tap captured: all taps are valid next cycle.
state <= BS_DONE;
end else begin
beat <= beat + 2'd1;
state <= BS_ISSUE;
end
end else begin
wait_cnt <= wait_cnt + 32'd1;
end
end
default: begin // BS_DONE
// Single-cycle state; beat and the latched coord/fracs are left as-is.
state <= BS_IDLE;
end
endcase
end
end
// --- 4-tap blend (combinational, on the captured taps) ---
// PSMCT32 word layout: [31:24]=A [23:16]=B [15:8]=G [7:0]=R (ABGR8888).
// tap0=(u,v) tap1=(u+1,v) tap2=(u,v+1) tap3=(u+1,v+1).
// 8-bit linear interpolation with a 4-bit weight:
//   lerp8(a, b, f) = clamp_0_255( a + floor(((b - a) * f) / 16) )
// f = 0 returns a; f = 15 stops 1/16 short of b. All intermediates are
// signed so a descending ramp (b < a) floors via the arithmetic shift.
function automatic logic [7:0] lerp8(input logic [7:0] a,
                                     input logic [7:0] b,
                                     input logic [3:0] f);
    logic signed [21:0] delta;   // b - a            (-255 .. 255)
    logic signed [21:0] scaled;  // delta * f        (-3825 .. 3825)
    logic signed [21:0] step;    // scaled >>> 4     (arithmetic shift)
    logic signed [21:0] sum;     // a + step
    begin
        delta  = $signed({1'b0, b}) - $signed({1'b0, a});
        scaled = delta * $signed({1'b0, f});
        step   = scaled >>> 4;
        sum    = $signed({14'd0, a}) + step;
        // Default is the in-range result; the two overrides are a
        // defensive saturate.
        lerp8 = sum[7:0];
        if (sum < 0)
            lerp8 = 8'd0;
        else if (sum > 22'sd255)
            lerp8 = 8'd255;
    end
endfunction
// per-channel taps
logic [7:0] t0_r, t0_g, t0_b, t0_a;
logic [7:0] t1_r, t1_g, t1_b, t1_a;
logic [7:0] t2_r, t2_g, t2_b, t2_a;
logic [7:0] t3_r, t3_g, t3_b, t3_a;
assign t0_r = tap[0][ 7: 0]; assign t0_g = tap[0][15: 8];
assign t0_b = tap[0][23:16]; assign t0_a = tap[0][31:24];
assign t1_r = tap[1][ 7: 0]; assign t1_g = tap[1][15: 8];
assign t1_b = tap[1][23:16]; assign t1_a = tap[1][31:24];
assign t2_r = tap[2][ 7: 0]; assign t2_g = tap[2][15: 8];
assign t2_b = tap[2][23:16]; assign t2_a = tap[2][31:24];
assign t3_r = tap[3][ 7: 0]; assign t3_g = tap[3][15: 8];
assign t3_b = tap[3][23:16]; assign t3_a = tap[3][31:24];
// top = lerp(tap0,tap1,uf); bot = lerp(tap2,tap3,uf); out = lerp(top,bot,vf)
logic [7:0] top_r, top_g, top_b, top_a;
logic [7:0] bot_r, bot_g, bot_b, bot_a;
logic [7:0] cv_r, cv_g, cv_b, cv_a;
// Separable 4-tap blend, one channel at a time: interpolate the top pair
// (tap0,tap1) and bottom pair (tap2,tap3) along U with lat_uf, then
// interpolate those two results along V with lat_vf.
always_comb begin
    // R
    top_r = lerp8(t0_r, t1_r, lat_uf);
    bot_r = lerp8(t2_r, t3_r, lat_uf);
    cv_r  = lerp8(top_r, bot_r, lat_vf);
    // G
    top_g = lerp8(t0_g, t1_g, lat_uf);
    bot_g = lerp8(t2_g, t3_g, lat_uf);
    cv_g  = lerp8(top_g, bot_g, lat_vf);
    // B
    top_b = lerp8(t0_b, t1_b, lat_uf);
    bot_b = lerp8(t2_b, t3_b, lat_uf);
    cv_b  = lerp8(top_b, bot_b, lat_vf);
    // A
    top_a = lerp8(t0_a, t1_a, lat_uf);
    bot_a = lerp8(t2_a, t3_a, lat_uf);
    cv_a  = lerp8(top_a, bot_a, lat_vf);
end
// Ch310 — HOLD register for the filtered color. The combined-renderer
// FSM (gs_stub CB_TWAIT) may latch the result a cycle or two AFTER the
// out_valid pulse (it steps at half-rate on z_advance beats), so the
// blended ABGR must stay STABLE from out_valid until the next sample.
// tex_color is the LIVE combinational blend during DONE (so an
// out_valid-keyed caller — tb_gs_texture_bilinear — reads the fresh
// value the SAME cycle out_valid pulses, byte-identical to before) and
// the LATCHED copy afterward (so a caller that reads one+ cycles later,
// like CB_TWAIT→CB_T, still sees it). The register captures the blend
// on the clk edge that LEAVES DONE; combining "live during DONE, held
// after" gives a value stable from out_valid until the next sample
// overwrites it at its DONE.
logic [31:0] tex_color_blend;
assign tex_color_blend = {cv_a, cv_b, cv_g, cv_r};
logic [31:0] tex_color_hold;
// Hold register: snapshots the blended color on the clock edge taken while
// the FSM is in DONE, and keeps it until the next DONE.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        tex_color_hold <= 32'd0;
    end else begin
        if (state == BS_DONE)
            tex_color_hold <= tex_color_blend;
    end
end
// live during the DONE pulse, held (last captured) otherwise
logic [31:0] tex_color_lin;
assign tex_color_lin = (state == BS_DONE) ? tex_color_blend : tex_color_hold;
// --- output mux: bilinear FSM owns the outputs for a FILTERED PSMCT32
// sample (do_lin). When do_lin=0 — non-PSMCT32 psm OR MMAG=0 NEAREST —
// we transparently fall back to the nearest single-read path so
// PSMT8/PSMT4/swizzle and nearest PSMCT32 still work with
// BILINEAR_ENABLE=1, and busy stays 0 there.
// tex_color: the HELD blended ABGR (stable from out_valid to next DONE).
assign tex_rd_en = do_lin ? bi_rd_en : near_rd_en;
// tex_rd_addr is the SAME addr-gen output for both paths (the wrap
// selects beat_u/beat_v vs port u/v); the FSM just gates rd_en.
assign tex_rd_addr = near_rd_addr;
assign out_valid = do_lin ? (state == BS_DONE) : near_out_valid;
assign tex_color = do_lin ? tex_color_lin : near_color;
assign busy = do_lin && (state != BS_IDLE);
end else begin : g_nearest
// BYTE-IDENTICAL nearest path: outputs are exactly the original assigns.
assign bili_active = 1'b0; // constant -> wrap uses port u/v
assign beat_u = 11'd0; // unused (pruned)
assign beat_v = 11'd0;
assign tex_rd_en = near_rd_en;
assign tex_rd_addr = near_rd_addr;
assign out_valid = near_out_valid;
assign tex_color = near_color;
assign busy = 1'b0;
end
endgenerate
endmodule : gs_texture_unit
+48
View File
@@ -0,0 +1,48 @@
// retroDE_ps2 — gs_tile_ram (Ch303)
//
// Generic on-chip TILE-LOCAL RAM for the tiled GS renderer: a single-write /
// single-read scratchpad sized to one render tile (e.g. 16x16 = 256 entries).
// Instantiated TWICE by the tile renderer — once as the color tile, once as the
// Z tile — so a combined textured+alpha+depth pixel resolves its color/Z
// read-modify-write entirely ON CHIP (per docs/decisions/0008 §6), with only the
// texture fetch and the per-tile flush crossing to VRAM/LPDDR.
//
// Contract (matches vram_bram_stub.read2 so the raster FSM retarget is minimal):
// - 1 write port (we/waddr/wdata), committed this cycle.
// - 1 read port (raddr), data REGISTERED → valid ONE cycle later (rdata).
// - Same-address read+write in the same cycle is NOT used by the tile renderer
// (the FSM reads Z at beat 0 and writes Z at beat 4 of a 5-beat pixel; color
// read at beat 2, write at beat 3 — never the same cycle), so no R/W-collision
// forwarding is needed; this stays a plain 1W1R inferred BRAM.
//
// The memory is NOT reset (BRAM-friendly); the renderer's CLEAR phase initializes
// every entry (color=clear color, Z=clear/far Z) before the first primitive.
`timescale 1ns/1ps
// Simple 1-write / 1-read tile scratchpad.
//   - Write: mem[waddr] takes wdata on a clock edge with we=1 and rst_n=1.
//   - Read : rdata is registered, returning mem[raddr] one clock after raddr.
//   - Reset: synchronous; clears rdata only and suppresses writes while low.
//            The array contents themselves are never cleared.
module gs_tile_ram #(
    parameter int ADDR_W = 8,    // address bits (default 256 entries)
    parameter int DATA_W = 32    // entry width
) (
    input  logic              clk,
    input  logic              rst_n,
    // write port
    input  logic              we,
    input  logic [ADDR_W-1:0] waddr,
    input  logic [DATA_W-1:0] wdata,
    // read port (registered)
    input  logic [ADDR_W-1:0] raddr,
    output logic [DATA_W-1:0] rdata
);
    localparam int DEPTH = 1 << ADDR_W;

    logic [DATA_W-1:0] mem [0:DEPTH-1];

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            rdata <= '0;
        end else begin
            // Both are non-blocking: the read sees the pre-edge contents.
            rdata <= mem[raddr];
            if (we)
                mem[waddr] <= wdata;
        end
    end
endmodule : gs_tile_ram
+212
View File
@@ -0,0 +1,212 @@
// ============================================================================
// gs_tile_reload.sv (Ch323 Brick 2 — tile color+Z reload staging engine)
//
// The reload counterpart to the GS tile-flush writers, and a DIRECT structural
// clone of the silicon-proven gs_texture_cache (Ch322): an emif_clk fill FSM that
// reads a tile's worth of color+Z from FPGA-private LPDDR4B into on-chip staging
// RAMs, plus a design_clk serve port that returns one (color,Z) per tile index at
// the existing 1-cycle latency. gs_stub's TP_RELOAD phase sweeps the serve port and
// writes the tile color/Z RAMs before rendering. Same CDC shape as gs_texture_cache
// (one-shot warm fill, fill_done 2-FF synced into the serve clock) — NOT a new CDC.
//
// SEPARATE LPDDR bases (Codex): COLOR_BASE (the color framebuffer) and Z_BASE (the
// Z-backing region) are distinct. A 16x16 tile lives at FB stride STRIDE_BYTES per
// row (FBW*64*4 = 256 for FBW=1), so the fill reads ROW_BEATS 256-bit beats per row
// from each base — sparse/strided, exactly like the Ch322 texture (which read a
// 64-texel-stride region). Single-beat reads (arlen=0, the only proven EMIF pattern).
//
// Counters (Codex): color_beats, z_beats, rd_errs — all distinct.
// ============================================================================
`timescale 1ns/1ps
// Tile color+Z reload staging engine.
//   fill side  (axi_clk)  : on each synchronised rising edge of reload_start,
//                           reads N_ROWS rows of ROW_BEATS color beats and
//                           ROW_BEATS Z beats (single-beat AXI reads) into
//                           color_ram / z_ram, then raises reload_done.
//   serve side (serve_clk): 1-cycle registered lookup of (color, Z) by tile
//                           index, plus reload_ready.
// The part-selects used for the staging write index assume ROW_BEATS >= 2 and
// N_ROWS >= 2 ($clog2(...)-1 must be >= 0), and the unpack is fixed at eight
// 32-bit lanes per 256-bit beat.
module gs_tile_reload #(
    parameter [29:0] COLOR_BASE = 30'd0,         // LPDDR byte base of the color framebuffer
    parameter [29:0] Z_BASE     = 30'h0010_0000, // LPDDR byte base of the Z-backing (DISTINCT)
    parameter int TILE_W        = 16,
    parameter int TILE_H        = 16,
    parameter int STRIDE_BYTES  = 256,           // FB row stride (FBW*64 px * 4 B = 256 for FBW=1)
    parameter int ROW_BEATS     = 2,             // 16 words/row * 4 B / 32 B = 2 single-beat reads
    parameter int COLOR_W       = 32
)(
    // ---- AXI read clock domain (emif_clk) — fill side ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    input  logic         reload_start, // STROBE (gs/serve domain, CDC-synced): each RISING edge (re)fills
    // Ch324 — RUNTIME per-tile byte offset into the raster LPDDR framebuffer. Latched at the fill
    // arm (fs_edge) so it is stable for the whole fill. = ((tile_oy*(FBW*64)) + tile_ox)*4, the SAME
    // formula the flush side uses, so reload gathers exactly the tile the spill wrote. 0 = origin
    // tile (byte-identical to the Ch323 single-tile path). Quasi-static: gs_stub holds the current
    // tile constant across TP_RELOAD, so sampling it at the synced strobe needs no extra CDC.
    input  logic [29:0]  reload_base,
    output logic         reload_done,  // tile fully resident (until the next fill arm)
    // The three counters below are PER FILL: they are cleared when a fill is armed
    // (fs_edge) and count up during that fill only.
    output logic [31:0]  color_beats,  // color beats read in the current/last fill
    output logic [31:0]  z_beats,      // Z beats read in the current/last fill
    output logic [31:0]  rd_errs,      // non-OKAY read responses in the current/last fill
    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,        // unused: every read is a single beat
    input  logic         rvalid,
    output logic         rready,
    // ---- serve clock domain (design_clk) — gs_stub TP_RELOAD reads this ----
    input  logic               serve_clk,
    input  logic [7:0]         raddr,       // tile index (row*16 + col), 0..255
    output logic [COLOR_W-1:0] color_o,     // 1-cycle REGISTERED color for raddr
    output logic [31:0]        z_o,         // 1-cycle REGISTERED Z for raddr
    output logic               reload_ready // reload_done synced into serve_clk (TP_RELOAD ready gate)
);
    localparam int N_ENTRIES = TILE_W*TILE_H; // 256
    localparam int N_ROWS    = TILE_H;        // 16
    localparam int WORDS_ROW = TILE_W;        // 16 words/row

    assign arburst = 2'b01;  // INCR
    assign arid    = 7'd6;   // distinct: writer=0/rd-probe=1/fcache=2/linebuf=3/texfill=4/wr-probe=5/tile-reload=6
    assign arlen   = 8'd0;   // single-beat (only proven EMIF read pattern)
    assign arsize  = 3'b101; // 32 bytes

    // On-chip staging RAMs: written by the fill FSM (axi_clk), read by gs_stub (serve_clk).
    // One-shot warm fill => static during reads => no read/write CDC hazard (gs_texture_cache pattern).
    logic [COLOR_W-1:0] color_ram [0:N_ENTRIES-1];
    logic [31:0]        z_ram     [0:N_ENTRIES-1];

    // ================= fill side (axi_clk) =================
    // For each of N_ROWS rows, read ROW_BEATS color beats then ROW_BEATS Z beats. Each 256-bit
    // beat = 8 words; WORDS_ROW=16 spans ROW_BEATS=2 beats. Store the row's 16 words into the
    // staging RAM at indices row*16 + (0..15).
    typedef enum logic [2:0] { R_IDLE, R_C_AR, R_C_R, R_C_W, R_Z_AR, R_Z_R, R_Z_W, R_DONE } rstate_t;
    rstate_t rst_q;
    logic [$clog2(N_ROWS):0]    row;
    logic [$clog2(ROW_BEATS):0] beat;
    logic [2:0]   lane;    // serialized unpack lane 0..7 — ONE RAM write/cycle (M20K, not an 8-wide reg file)
    logic [255:0] beat_q;  // latched 256-bit beat, drained one 32-bit lane per cycle
    logic [29:0]  base_q;  // Ch324 — per-tile byte offset latched at fill arm (stable across the fill)
    logic [2:0]   fs_sync; // [0] = raw sample, [1] = synchronised, [2] = previous synchronised
    // reload_start is a STROBE (gs_stub pulses it once per tile reload): trigger on the
    // RISING edge only — one pulse => exactly one fill. (Was an any-edge toggle, which made
    // a pulse trigger TWO fills; harmless but wasteful and confusing.)
    // NOTE(review): the strobe is sampled directly by axi_clk, so it must stay high long
    // enough for at least one axi_clk edge to see it — confirm against the gs_stub pulse width.
    wire fs_edge = fs_sync[1] & ~fs_sync[2];

    // Byte address of row r relative to `base`.
    function automatic [29:0] row_base(input [29:0] base, input int r);
        row_base = base + r*STRIDE_BYTES;
    endfunction

    // SINGLE write port per RAM (one index, one data, per clock) so Quartus infers M20K
    // instead of the 8-wide register file the old parallel beat-unpack forced (~8.7K ALMs).
    // wa = row*WORDS_ROW + beat*8 + lane. Uses the CURRENT row/beat/lane; the lane==7 branch
    // updates row/beat non-blockingly, so this cycle's write still targets the right entry.
    wire [$clog2(N_ENTRIES)-1:0] wa = row[$clog2(N_ROWS)-1:0]*WORDS_ROW
                                    + beat[$clog2(ROW_BEATS)-1:0]*8 + lane;

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            rst_q <= R_IDLE; araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
            row <= '0; beat <= '0; lane <= '0; reload_done <= 1'b0; base_q <= 30'd0;
            color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0; fs_sync <= 3'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], reload_start};
            case (rst_q)
                // A fill can only be armed while idle/done; an edge seen mid-fill is ignored.
                R_IDLE, R_DONE: begin
                    if (fs_edge) begin
                        reload_done <= 1'b0; color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0;
                        row <= '0; beat <= '0; lane <= '0;
                        base_q  <= reload_base;              // latch this tile's offset for the whole fill
                        araddr  <= COLOR_BASE + reload_base; // row 0 color of THIS tile
                        arvalid <= 1'b1;
                        rst_q   <= R_C_AR;
                    end
                end
                R_C_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_C_R; end
                R_C_R:  if (rvalid) begin // latch the beat; drain it serially in R_C_W
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready <= 1'b0; color_beats <= color_beats + 32'd1;
                    lane   <= '0;
                    rst_q  <= R_C_W;
                end
                R_C_W: begin // 8 cycles: one 32-bit lane -> color_ram per clock
                    color_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == 3'd7) begin
                        if (beat == ROW_BEATS-1) begin // color row done -> Z row
                            beat    <= '0;
                            araddr  <= row_base(Z_BASE + base_q, row);
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end else begin
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;
                            arvalid <= 1'b1;
                            rst_q   <= R_C_AR;
                        end
                    end else lane <= lane + 1'b1;
                end
                R_Z_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_Z_R; end
                R_Z_R:  if (rvalid) begin
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready <= 1'b0; z_beats <= z_beats + 32'd1;
                    lane   <= '0;
                    rst_q  <= R_Z_W;
                end
                R_Z_W: begin // 8 cycles: one 32-bit lane -> z_ram per clock
                    z_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == 3'd7) begin
                        if (beat == ROW_BEATS-1) begin // Z row done -> next row (or finish).
                            // reload_done stays LOW until THIS final-row final-Z-lane write.
                            if (row == N_ROWS-1) begin
                                reload_done <= 1'b1;
                                rst_q       <= R_DONE;
                            end else begin
                                row     <= row + 1'b1;
                                beat    <= '0;
                                araddr  <= row_base(COLOR_BASE + base_q, row + 1);
                                arvalid <= 1'b1;
                                rst_q   <= R_C_AR;
                            end
                        end else begin
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end
                    end else lane <= lane + 1'b1;
                end
                default: rst_q <= R_IDLE;
            endcase
        end
    end

    // ================= serve side (serve_clk) =================
    // 1-cycle REGISTERED read, identical timing to the tile RAM / vram read2.
    always_ff @(posedge serve_clk) begin
        color_o <= color_ram[raddr];
        z_o     <= z_ram[raddr];
    end

    // reload_ready handshake (Ch323 fix): a fresh reload_start MUST drop ready immediately,
    // and ready re-raises only when THIS fill completes. Without this, a back-to-back reload
    // (two tile batches) sees ready still high from the PREVIOUS fill and gs_stub sweeps the
    // stale (pre-fill) z_ram before the new fill populates it — the reloaded Z is lost (the
    // board's "region A wrong color" bug; reproduced in tb_gs_tile_spill_lpddr). reload_start
    // is in the serve_clk (design) domain; reload_done is edge-detected after CDC.
    //
    // CDC: reload_done is an axi_clk signal, so it goes through a full 2-FF synchroniser
    // (done_sync[0] -> done_sync[1]) BEFORE any logic looks at it, and the rising edge is
    // detected between the synchronised value and its one-cycle-delayed copy (done_sync[2]).
    // Edge-detecting on done_sync[0] (the raw first-stage sample) would let a metastable
    // sample be seen differently by done_sync[1] and ready_q, which can drop the rise
    // entirely and leave ready low forever. This mirrors fs_sync on the fill side and costs
    // one extra serve_clk cycle of ready latency.
    logic [2:0] done_sync = 3'b000;
    logic       ready_q   = 1'b0;
    wire        done_rise = done_sync[1] & ~done_sync[2];
    always_ff @(posedge serve_clk) begin
        done_sync <= {done_sync[1:0], reload_done};
        if (reload_start)   ready_q <= 1'b0; // new fill armed -> not ready
        else if (done_rise) ready_q <= 1'b1; // this fill completed
    end
    // COMBINATIONALLY mask ready low while reload_start is asserted: gs_stub pulses
    // reload_start and checks ready in the SAME cycle, so the registered clear above lands
    // one cycle too late — without the mask gs_stub sees the PREVIOUS fill's stale ready=1
    // and sweeps before this fill populates z_ram (the region-A-wrong-color bug).
    assign reload_ready = ready_q & ~reload_start;
endmodule
+236
View File
@@ -0,0 +1,236 @@
// ============================================================================
// gs_z_flush_writer.sv (Ch323 Brick 2 — tile color/Z-flush LPDDR writer; PACKED)
//
// Writes a gs_stub tile-flush stream (one 32-bit word per tile pixel on design_clk —
// either the TP_ZFLUSH Z stream or the TP_FLUSH color stream) to an FPGA-private
// LPDDR4B scratch region (emif_clk). Used twice in the de25 top: once for Z, once for
// the 32-bit color spill (the module is generic — it writes the 32-bit `data` at
// BASE + `addr`).
//
// PACKED (Ch323 board fix): the FIRST cut did ONE single-32-bit-lane AXI write PER
// pixel through a strictly-sequential AW->W->B FSM. The tile sweep emits one pixel per
// design_clk (256 back-to-back), but each isolated write pays the full LPDDR round-trip
// latency, so the drain fell far behind the emit rate and the 16-deep async FIFO
// OVERFLOWED within ~16 px — dropping most of the spilled tile (grey-with-specks on
// HDMI, spill_ovf=1). The proven framebuffer writer (gs_lpddr_axi_master) avoids this by
// PACKING pixels into 256-bit beats; this writer now does the same with 32-bit lanes:
//
// design_clk : PACKER — accumulate 8 consecutive 32-bit pixels of a tile-row into one
// 256-bit (32-byte) beat {block_off, data, strb}, keyed by the 32-byte
// block address (addr[29:5]). A 16-px tile-row is exactly two 32-byte-
// aligned blocks, so each beat completes naturally on its 8th px (no
// dangling partial); a block-address change flushes the in-flight beat.
// One FIFO push per 8 px => 8x fewer AXI writes => the sequential drain
// keeps up with the same small FIFO.
// async FIFO : gray-code CDC, carries {block_off[29:0], data[255:0], strb[31:0]}.
// emif_clk : AXI FSM — pop a beat, issue a single-beat INCR write (AWSIZE=5 = 32 B,
// AWLEN=0, full WSTRB on the populated lanes) at BASE + block_off.
//
// The packed beats land at exactly the offsets gs_tile_reload reads back (row r at
// BASE + r*STRIDE, two 32-byte beats), so the reload side is unchanged.
//
// SEPARATE base (Codex): BASE is distinct from the color FB and the other scratch
// region. A synthesis-off CANARY asserts no beat lands inside the canary-guard regions.
//
// Counters (Codex, distinct per instance): z_write_beats (256-bit beats written),
// z_wr_errs (non-OKAY responses), fifo_overflow (sticky).
//
// NOTE (parity with gs_lpddr_axi_master): assumes the flush stream produces FULL 8-lane
// beats (true for a tile width that is a multiple of 8 — the 16-wide spill tile). A
// trailing partial beat at end-of-stream is NOT flushed.
// ============================================================================
`timescale 1ns/1ps
// gs_z_flush_writer
//
// Purpose: take a per-pixel 32-bit flush stream on gs_clk, pack it into 256-bit beats,
// move the beats across an async FIFO, and commit each one on axi_clk as a single-beat
// AXI4 write at Z_BASE + block offset.
//
// Parameters:
//   Z_BASE               byte base added to every beat's block offset (30-bit add).
//   FB_BASE/FB_BYTES     simulation-only canary region #1 (never used for addressing).
//   TEX_BASE/TEX_BYTES   simulation-only canary region #2 (never used for addressing).
//   FIFO_DEPTH           depth passed to gs_async_fifo.
//
// Counter semantics:
//   z_write_beats, z_wr_errs       cumulative; reset only by axi_rst_n.
//   dbg_emit_count, dbg_push_count gs_clk domain; reset only by gs_rst_n.
//   dbg_pop/aw/w/beat_count        axi_clk domain; reset by axi_rst_n and by trace_clear.
//       trace_clear takes priority over the stored count. An event that lands in the
//       same cycle as trace_clear is counted as the first event after the clear
//       (the counter becomes 1, not old+1 and not 0).
module gs_z_flush_writer #(
    parameter [29:0] Z_BASE     = 30'h0010_0000, // LPDDR byte base of this scratch region (DISTINCT)
    parameter [29:0] FB_BASE    = 30'd0,         // color framebuffer base (canary guard)
    parameter int    FB_BYTES   = 32'h0001_0000, // color framebuffer size (canary guard)
    parameter [29:0] TEX_BASE   = 30'h0020_0000, // other scratch base (canary guard)
    parameter int    TEX_BYTES  = 32'h0000_8000, // other scratch size (canary guard)
    parameter int    FIFO_DEPTH = 16
)(
    // ---- GS / design clock domain: the flush emit stream ----
    input  logic         gs_clk,
    input  logic         gs_rst_n,
    input  logic         enable,          // 1 = accept emits (default off => inert)
    input  logic         z_flush_emit,    // one pulse per tile pixel
    input  logic [31:0]  z_flush_addr,    // scratch-RELATIVE byte offset (pixel_index*4); only [29:2] are used
    input  logic [31:0]  z_flush_data,    // 32-bit word for this pixel (Z or color)
    // ---- status (emif_clk domain unless noted) ----
    output logic [31:0]  z_write_beats,   // 256-bit beats written (cumulative)
    output logic [31:0]  z_wr_errs,       // non-OKAY write responses (cumulative)
    output logic         fifo_overflow,   // sticky (gs domain): a beat push was dropped (FIFO full)
    // Pipeline-split counters (Codex): emit/push (GS, reset by gs_rst_n=per-render core reset) and
    // pop/beats (EMIF, reset by trace_clear) localize any spill divergence: healthy = 512/64/64/64;
    // push>64 = packer partial beats; pop/beats>push = FIFO/reset broken; beats!=pop = AXI-FSM bug.
    // NOTE(review): trace_clear is sampled directly on axi_clk below; it is assumed to be
    // synchronous to axi_clk (or synchronized by the parent) - confirm at the instantiation.
    input  logic         trace_clear,     // resets the EMIF-domain counters (beats/pop/aw/w) per render
    output logic [31:0]  dbg_beat_count,  // beats committed (B handshakes) since the last trace_clear
    output logic [31:0]  dbg_emit_count,  // GS: enable&&z_flush_emit accepted (per render)
    output logic [31:0]  dbg_push_count,  // GS: beats pushed into the FIFO (per render)
    output logic [31:0]  dbg_pop_count,   // EMIF: beats popped from the FIFO (since trace_clear)
    output logic [31:0]  dbg_aw_count,    // EMIF: AW handshakes (since trace_clear)
    output logic [31:0]  dbg_w_count,     // EMIF: W handshakes (since trace_clear)
    // ---- AXI4 write channel to the EMIF user port (emif_clk, 256-bit) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    output logic [29:0]  awaddr,
    output logic [1:0]   awburst,
    output logic [6:0]   awid,
    output logic [7:0]   awlen,
    output logic [2:0]   awsize,
    output logic         awvalid,
    input  logic         awready,
    output logic [255:0] wdata,
    output logic [31:0]  wstrb,
    output logic         wlast,
    output logic         wvalid,
    input  logic         wready,
    input  logic [1:0]   bresp,
    input  logic         bvalid,
    output logic         bready
);

    // Constant AXI attributes: every transaction is one 32-byte INCR beat.
    assign awburst = 2'b01;   // INCR
    // ID 6. Per the original note this differs from the FB writer (0); the reload path
    // reportedly also uses 6, with arbiter priority disambiguating - not visible here.
    assign awid    = 7'd6;
    assign awlen   = 8'd0;    // single beat
    assign awsize  = 3'b101;  // 32 bytes (256-bit)
    assign bready  = 1'b1;    // B responses are always accepted

    localparam int PW = 318;  // {block_off[29:0], data[255:0], strb[31:0]}

    // ============================ design_clk PACKER ============================
    // Accumulate 8 consecutive 32-bit pixels into one 256-bit beat keyed by the 32-byte
    // block address; push a COMPLETE beat to the FIFO (one push per 8 px, not per px).
    logic [29:0]   cur_off;     // block offset of the beat being accumulated
    logic [255:0]  cur_data;    // lanes collected so far
    logic [31:0]   cur_strb;    // byte strobes for the collected lanes
    logic          has_data;    // 1 = cur_* hold a partially filled beat
    logic          fifo_wr;     // registered push request (1-cycle pulse)
    logic [PW-1:0] fifo_wdata;
    wire           fifo_full, fifo_empty;
    wire  [PW-1:0] fifo_rdata;
    logic          fifo_rd;

    always_ff @(posedge gs_clk or negedge gs_rst_n) begin
        if (!gs_rst_n) begin
            cur_off <= '0; cur_data <= '0; cur_strb <= '0; has_data <= 1'b0;
            fifo_wr <= 1'b0; fifo_wdata <= '0; fifo_overflow <= 1'b0;
            dbg_emit_count <= 32'd0; dbg_push_count <= 32'd0;
        end else begin
            fifo_wr <= 1'b0;   // default: no push this cycle
            if (enable && z_flush_emit) dbg_emit_count <= dbg_emit_count + 32'd1;
            // Counts pushes that actually enter the FIFO (same gate as the .wr port below).
            if (fifo_wr && !fifo_full)  dbg_push_count <= dbg_push_count + 32'd1;
            if (enable && z_flush_emit) begin
                logic [29:0]  block_off;
                logic [2:0]   lane;   // 0..7 (which 32-bit lane)
                logic [255:0] nd;
                logic [31:0]  ns;
                block_off = {z_flush_addr[29:5], 5'd0};
                lane      = z_flush_addr[4:2];
                if (has_data && (block_off != cur_off)) begin
                    // block changed before the previous beat filled - flush it (partial
                    // strobes), then restart the accumulator with this pixel only.
                    fifo_wdata <= {cur_off, cur_data, cur_strb};
                    fifo_wr    <= 1'b1;
                    cur_off    <= block_off;
                    cur_data   <= (256'd0 | (256'(z_flush_data) << ({29'd0, lane} * 32)));
                    cur_strb   <= (32'hF << ({29'd0, lane} * 4));
                    has_data   <= 1'b1;
                end else begin
                    // Merge this pixel into the in-flight beat (or into a fresh one).
                    nd = has_data ? cur_data : 256'd0;
                    ns = has_data ? cur_strb : 32'd0;
                    nd[ ({29'd0, lane} * 32) +: 32 ] = z_flush_data;
                    ns[ ({29'd0, lane} * 4)  +: 4  ] = 4'hF;
                    if (&ns) begin
                        // beat complete (all 8 lanes) - flush, beat consumed.
                        fifo_wdata <= {block_off, nd, ns};
                        fifo_wr    <= 1'b1;
                        has_data   <= 1'b0;
                    end else begin
                        cur_off  <= block_off;
                        cur_data <= nd;
                        cur_strb <= ns;
                        has_data <= 1'b1;
                    end
                end
            end
            // overflow witness: a push attempt while the FIFO is full (must stay 0).
            if (fifo_wr && fifo_full) fifo_overflow <= 1'b1;
        end
    end

    // CRITICAL (Ch323 board bug): the async FIFO's two pointers MUST reset together. The
    // packer side uses gs_rst_n (= core reset, which a CORE_CTRL pulse toggles EVERY render);
    // the read side uses axi_rst_n (= EMIF cal, power-on only). If wrst_n followed gs_rst_n,
    // each render's core-reset pulse would reset ONLY the write pointer -> gray-code pointer
    // desync -> FIFO corruption (garbage data, spurious overflow, writes that never commit).
    // Sim missed it (single reset, both sides together). So reset BOTH FIFO sides from the
    // STABLE axi_rst_n: assert async on axi_rst_n, deassert synchronized into gs_clk.
    reg [1:0] wrst_sync;
    always_ff @(posedge gs_clk or negedge axi_rst_n) begin
        if (!axi_rst_n) wrst_sync <= 2'b00;
        else            wrst_sync <= {wrst_sync[0], 1'b1};
    end
    wire fifo_wrst_n = wrst_sync[1];

    gs_async_fifo #(.WIDTH(PW), .DEPTH(FIFO_DEPTH)) u_fifo (
        .wclk(gs_clk),  .wrst_n(fifo_wrst_n), .wr(fifo_wr && !fifo_full), .wdata(fifo_wdata), .wfull(fifo_full),
        .rclk(axi_clk), .rrst_n(axi_rst_n),   .rd(fifo_rd),               .rdata(fifo_rdata), .rempty(fifo_empty)
    );

    // ============================ emif_clk AXI FSM ============================
    // Field unpack of the FIFO head entry. W_IDLE latches these in the cycle BEFORE
    // fifo_rd pulses, so this assumes gs_async_fifo presents the head entry on rdata
    // whenever !rempty (show-ahead) - confirm against gs_async_fifo.
    wire [29:0]  beat_block = fifo_rdata[PW-1 -: 30]; // block_off[29:0]
    wire [255:0] beat_data  = fifo_rdata[287:32];
    wire [31:0]  beat_strb  = fifo_rdata[31:0];
    wire [29:0]  full_addr  = Z_BASE + beat_block;

    typedef enum logic [1:0] { W_IDLE, W_AW, W_W, W_B } wstate_t;
    wstate_t      wst;
    logic [29:0]  lat_addr;   // latched copy of the beat address (not read back by this module)
    logic [255:0] lat_data;
    logic [31:0]  lat_strb;

    // Handshake events that feed the since-trace_clear counters.
    wire aw_fire = awvalid && awready;
    wire w_fire  = wvalid  && wready;
    wire b_fire  = (wst == W_B) && bvalid;

    always_ff @(posedge axi_clk or negedge axi_rst_n) begin
        if (!axi_rst_n) begin
            wst <= W_IDLE; awaddr <= '0; awvalid <= 1'b0; wdata <= '0; wstrb <= '0;
            wlast <= 1'b0; wvalid <= 1'b0; fifo_rd <= 1'b0;
            z_write_beats  <= 32'd0; z_wr_errs <= 32'd0;
            dbg_beat_count <= 32'd0;
            dbg_pop_count  <= 32'd0; dbg_aw_count <= 32'd0; dbg_w_count <= 32'd0;
            lat_addr <= '0; lat_data <= '0; lat_strb <= '0;
        end else begin
            fifo_rd <= 1'b0;   // default: pop is a 1-cycle pulse

            // Since-clear counters. Each counter has exactly one assignment per cycle, so
            // trace_clear can never be overridden by a same-cycle increment: the base is
            // zeroed first and the event (if any) is then added.
            dbg_pop_count  <= (trace_clear ? 32'd0 : dbg_pop_count)  + {31'd0, fifo_rd};
            dbg_aw_count   <= (trace_clear ? 32'd0 : dbg_aw_count)   + {31'd0, aw_fire};
            dbg_w_count    <= (trace_clear ? 32'd0 : dbg_w_count)    + {31'd0, w_fire};
            dbg_beat_count <= (trace_clear ? 32'd0 : dbg_beat_count) + {31'd0, b_fire};

            // Strictly sequential AW -> W -> B; one beat in flight at a time.
            case (wst)
                W_IDLE: if (!fifo_empty) begin
                    lat_addr <= full_addr; lat_data <= beat_data; lat_strb <= beat_strb;
                    fifo_rd  <= 1'b1;                       // pop this beat
                    awaddr   <= {full_addr[29:5], 5'd0};    // 32-byte aligned
                    awvalid  <= 1'b1;
                    wst      <= W_AW;
                    // Simulation-only guard: flag a beat whose start address falls inside
                    // either guard region.
                    // synthesis translate_off
                    if (((full_addr >= FB_BASE)  && (full_addr < FB_BASE  + FB_BYTES[29:0])) ||
                        ((full_addr >= TEX_BASE) && (full_addr < TEX_BASE + TEX_BYTES[29:0])))
                        $error("gs_z_flush_writer CANARY: beat addr 0x%07x overlaps a canary-guard region", full_addr);
                    // synthesis translate_on
                end
                W_AW: if (awready) begin
                    awvalid <= 1'b0; wdata <= lat_data;
                    wstrb <= lat_strb; wlast <= 1'b1; wvalid <= 1'b1; wst <= W_W;
                end
                W_W: if (wready) begin wvalid <= 1'b0; wlast <= 1'b0; wst <= W_B; end
                W_B: if (bvalid) begin
                    if (bresp != 2'b00) z_wr_errs <= z_wr_errs + 32'd1;
                    z_write_beats <= z_write_beats + 32'd1;
                    // dbg_beat_count is updated above via b_fire.
                    wst <= W_IDLE;
                end
                default: wst <= W_IDLE;
            endcase
        end
    end
endmodule
+263
View File
@@ -0,0 +1,263 @@
// retroDE_ps2 — vram_bram_stub (Ch154)
//
// Hardware-friendly sibling of `vram_stub`. Maps cleanly onto Agilex 5
// M20K block-RAM:
// - 2048 × 32-bit word storage (instead of 8192 × 8-bit byte
// storage). Internal width matches Agilex M20K native widths;
// external addressing stays byte-addressable to keep the same
// mental model as `vram_stub`.
// - SYNCHRONOUS reads (registered 32-bit output). One-cycle read
// latency — the `read_valid` pulse fires the cycle the data is on
// read_data.
// - BYTE write enable only (4-bit `write_be`). The Ch106 PSMT4
// per-bit `write_mask` RMW is NOT supported; PSMT4 callers must
// do the nibble splice on the writer side BEFORE issuing the
// write here. Ch155+ task to rework gs_stub.raster_pixel_emit
// and gif_image_xfer_stub for that.
// - Two synchronous read ports. Quartus implements two
// independent read addresses by REPLICATING the M20K storage
// across two RAM blocks rather than using a single native
// dual-read port — exp_c shows 8 RAM Blocks for 8 KB vs
// exp_a's 4 RAM Blocks for the same 8 KB single-port shape.
// Two replicated RAM blocks is still vastly cheaper than the
// 65,536 flip-flops the legacy `vram_stub` shape produced;
// the cost just isn't free.
//
// Empirical motivation (Ch153 forensics):
// The legacy `vram_stub` shape (byte-addressable + combinational
// dual reads + per-bit-mask RMW) failed to fit on Agilex 5 — the
// 8 KB array consumed 65,536 dedicated registers and 261,578
// combinational nodes, dominating Ch152's 331 % ALM overrun.
// `exp_a_bram_friendly` proved that a 2048 × 32-bit sync-read
// byte-WE shape maps to 4 RAM Blocks + 0 registers + 46 ALMs.
//
// External port shape vs `vram_stub`:
// IDENTICAL: clk, rst_n, write_en, write_addr[31:0],
// write_data[31:0], write_be[3:0], read_addr[31:0],
// read_data[31:0], read2_addr[31:0], read2_data[31:0].
// NEW : read_valid + read2_valid (1-cycle pulse with the data).
// DROPPED : write_mask[31:0] (Ch106 per-bit RMW; callers must
// splice nibbles on the writer side).
//
// Address contract:
// - Writes: write_addr is byte-aligned; the low 2 bits MUST be 0
// (4-byte writes only). Each `write_be[i]` independently
// commits byte `i` of the addressed word. Per-byte non-
// wrapping admission: an enabled byte beyond `BYTES`
// drops the WHOLE write (matches vram_stub Ch95 audit).
// - Reads: read_addr is byte-aligned; the low 2 bits MUST be 0.
// `read_data` is the 32-bit word at `read_addr / 4`.
// Byte / halfword extraction is the caller's job
// (matches Ch141 / Ch142 nibble-readback pattern).
//
// Sim behaviour: time-0 mem is power-on-zero matching real M20K (the
// `// synthesis translate_off` initial block matches vram_stub's
// post-Ch152 pattern).
`timescale 1ns/1ps
// vram_bram_stub — word-organized VRAM model: one byte-enable write port
// and up to two registered (1-cycle latency) read ports over `mem`, an
// array of 32-bit words.
//
// Parameters:
//   BYTES        — storage size in bytes. The elaboration checks below
//                  require a multiple of 4 whose word count (BYTES / 4)
//                  is a power of two.
//   ENABLE_READ2 — 1: read port 1 reads `mem`. 0: read2_data and
//                  read2_valid are registered constants 0 and that port
//                  never references `mem`.
//
// Ports:
//   write_*      — a write commits only when write_en is set, rst_n is
//                  high, write_addr[1:0] == 0 and every enabled byte lies
//                  below BYTES; otherwise the whole write is dropped.
//   read_*/read2_* — *_data is the word selected by the address one clock
//                  earlier; *_valid (same delay) is 1 only if that
//                  address was word-aligned and fully in range.
module vram_bram_stub #(
parameter int unsigned BYTES = 8192,
// Ch251.4 — hardware-demo M20K rescue. When ENABLE_READ2 = 0, the
// second sync-read port is FEATURE-STRIPPED: `read2_data` ties to
// 0, `read2_valid` ties to 0, and Quartus no longer infers a
// separate read port on `mem`. This collapses the storage from
// two replicated 1W+1R simple-dual-port M20K banks (~410 M20Ks at
// 512 KiB) to ONE 1W+1R bank (~205 M20Ks) — the savings that get
// the 512 KiB framebuffer to fit on Agilex 5 (358 M20K budget).
//
// Contract caveat: read2 is the PSMT4 RMW old-byte read path. Any
// build that exercises PSMT4 rasterization MUST keep this `1`. The
// PSMCT32-only hardware demo (top_psmct32_raster_demo_bram) sets
// it to `0`; all simulation TBs leave it at the default `1`.
//
// This is a SCOPED build profile, not a general fix — see
// docs/decisions/0006-vram-roadmap.md for the longer-term
// arbitrated / line-buffered VRAM plan.
parameter bit ENABLE_READ2 = 1'b1
) (
input logic clk,
input logic rst_n,
// Write port (byte-WE; 4-byte-aligned write_addr).
input logic write_en,
input logic [31:0] write_addr,
input logic [31:0] write_data,
input logic [3:0] write_be,
// Read port 0 (sync read; 4-byte-aligned read_addr).
input logic [31:0] read_addr,
output logic [31:0] read_data,
output logic read_valid,
// Read port 1 (sync read; 4-byte-aligned).
input logic [31:0] read2_addr,
output logic [31:0] read2_data,
output logic read2_valid
);
// WORDS × 32-bit storage (2048 words at the default BYTES = 8192).
// Index is the WORD index (write_addr / 4).
//
// Parameter contract: `BYTES` MUST be a power-of-two multiple of 4.
// The WORD_AW-bit slice `*_addr[WORD_AW+1:2]` truncates the byte
// address to a word index; for non-power-of-two `WORDS`, an out-
// of-range byte address can map to a slice value that exceeds
// `WORDS-1` and indexes beyond `mem[]`. `read_valid` already
// marks such reads invalid downstream, but the BRAM read template
// still indexes the array unconditionally to satisfy Quartus's
// M20K inference (Ch154 audit), so the index itself must remain
// in bounds. The Ch155 audit-low fix: clamp the read indices
// with `& (WORDS-1)` so a power-of-two depth is required AND any
// bit beyond the legal slice is masked away. Power-of-two also
// matches every Agilex M20K depth target (256/512/1024/2048/...).
//
// NOTE(review): BYTES == 4 passes both elaboration checks below
// (WORDS = 1 is a power of two) but yields WORD_AW = $clog2(1) = 0,
// which makes the `[WORD_AW-1:0]` index declarations and the
// `[WORD_AW+1:2]` address slices degenerate. Presumably never
// instantiated that small — confirm, or tighten the check.
localparam int unsigned WORDS = BYTES / 4;
localparam int unsigned WORD_AW = $clog2(WORDS);
logic [31:0] mem [0:WORDS-1];
// Simulation-only parameter checks + power-on-zero initialisation.
// Nothing in this region is seen by synthesis.
// synthesis translate_off
initial begin
if (BYTES < 4 || (BYTES & 32'd3) != 0)
$error("vram_bram_stub: BYTES (%0d) must be >= 4 and a multiple of 4", BYTES);
// Power-of-two check on WORDS: (WORDS != 0) && ((WORDS & (WORDS-1)) == 0).
if (WORDS == 0 || (WORDS & (WORDS - 1)) != 0)
$error("vram_bram_stub: BYTES (%0d) must yield a power-of-two WORDS depth (got %0d)",
BYTES, WORDS);
// Ch252 — VRAM replication tripwire (simulation/elaboration only).
//
// At BYTES >= 256 KiB, each 1W+1R simple-dual-port replica costs
// ~100 M20Ks. With ENABLE_READ2 = 1, Quartus replicates the
// storage to give the second read its own port, doubling that
// cost (>= 200 M20Ks per pair). Above this threshold a Quartus
// fitter overrun on Agilex 5 (358 M20K budget) becomes likely.
//
// This `$fatal` runs in simulation and elaboration-aware lint
// tools — it is the loud canary. The REAL protection is the
// board-top profile: hardware builds explicitly set
// ENABLE_READ2 = 0 when VRAM_BYTES is large (see
// de25_nano_psmct32_raster_demo_top). Re-enabling read2 on a
// large hardware VRAM requires landing one of the architectural
// follow-ups in docs/decisions/0006-vram-roadmap.md first.
if (ENABLE_READ2 && (BYTES >= 32'd262144)) begin
$display("vram_bram_stub: ENABLE_READ2=1 with BYTES=%0d (>= 256 KiB) trips the replication tripwire.", BYTES);
$display(" The 2nd read port forces Quartus to replicate the storage, ~doubling M20K cost.");
$display(" Either set ENABLE_READ2=0 (PSMCT32-only hardware profile) or land the");
$display(" arbitrated/line-buffered VRAM follow-up before re-enabling read2 at this size.");
$display(" See docs/decisions/0006-vram-roadmap.md.");
$fatal(1, "vram_bram_stub: replication-tripwire fatal exit");
end
// Zero every word at time 0 (the only place `mem` is ever cleared;
// rst_n does not touch the array).
for (int i = 0; i < int'(WORDS); i++) mem[i] = 32'd0;
end
// synthesis translate_on
// ----------------------------------------------------------------
// Write port — per-byte WE, per-byte non-wrapping admission.
// ----------------------------------------------------------------
// The address is zero-extended to 33 bits so `addr + k` cannot wrap
// at the top of the 32-bit space before the compare against BYTES.
logic [32:0] addr33;
logic admit_b0, admit_b1, admit_b2, admit_b3;
logic write_admit;
assign addr33 = {1'b0, write_addr};
assign admit_b0 = (addr33 + 33'd0) < 33'(BYTES);
assign admit_b1 = (addr33 + 33'd1) < 33'(BYTES);
assign admit_b2 = (addr33 + 33'd2) < 33'(BYTES);
assign admit_b3 = (addr33 + 33'd3) < 33'(BYTES);
// A byte that is not enabled never blocks admission; one enabled
// out-of-range byte (or a misaligned address) drops the whole write.
assign write_admit = write_en
&& (write_addr[1:0] == 2'b00) // word-aligned
&& (!write_be[0] || admit_b0)
&& (!write_be[1] || admit_b1)
&& (!write_be[2] || admit_b2)
&& (!write_be[3] || admit_b3);
// Unmasked slice: relies on write_admit + the power-of-two contract
// to keep the index inside [0, WORDS-1].
logic [WORD_AW-1:0] write_word_idx;
assign write_word_idx = write_addr[WORD_AW+1:2];
// BRAM-native byte-WE template — each `if (write_be[i])` slice
// updates a separate 8-bit lane of the 32-bit word. This is the
// canonical Quartus inference shape (proven in Ch153 exp_a).
// Writes are suppressed while rst_n is low.
always_ff @(posedge clk) begin
if (rst_n && write_admit) begin
if (write_be[0]) mem[write_word_idx][ 7: 0] <= write_data[ 7: 0];
if (write_be[1]) mem[write_word_idx][15: 8] <= write_data[15: 8];
if (write_be[2]) mem[write_word_idx][23:16] <= write_data[23:16];
if (write_be[3]) mem[write_word_idx][31:24] <= write_data[31:24];
end
end
// ----------------------------------------------------------------
// Read ports — sync, registered output, 1-cycle latency.
//
// The read path is the CANONICAL Quartus M20K inference template:
// a single unconditional `read_data <= mem[idx]` registered
// assignment, with NO reset on the data register and NO read-side
// gating. Quartus rejected an earlier draft that gated reads on
// `read_addr[1:0]==2'b00 && in-bounds` with
// "Info (276007): RAM logic ... uninferred due to asynchronous
// read logic"
// and synthesized the storage as flip-flops. Bounds + alignment
// checks land on the separate `read_valid` pipeline below where
// they don't poison the data path.
//
// In this RTL a read that shares a clock edge with a write to the
// same word samples `mem` before the nonblocking write lands, i.e.
// it returns the OLD word.
// NOTE(review): confirm the inferred M20K read-during-write mode
// matches if any client depends on that.
// ----------------------------------------------------------------
// Word-index extraction. For a power-of-two `WORDS` depth (the
// parameter contract enforced above), the slice
// `read_addr[WORD_AW+1:2]` is naturally bounded to `[0, WORDS-1]`
// — the high bits beyond WORD_AW+1 represent address ranges
// already rejected by the `read_valid` gate below. The mask
// `& WORD_AW'(WORDS - 1)` is redundant for power-of-two WORDS
// (it just keeps the same bits) but documents the contract: a
// future relaxation that allows non-power-of-two depths could not
// rely on this mask or on the natural truncation; it would have to
// force the mem-read index through a real range-clamp instead.
logic [WORD_AW-1:0] read_word_idx;
assign read_word_idx = read_addr [WORD_AW+1:2] & WORD_AW'(WORDS - 1);
always_ff @(posedge clk) begin
read_data <= mem[read_word_idx];
end
// Out-of-range / misaligned detection on a parallel pipeline so
// it doesn't gate the BRAM read path. read_valid pulses 1 cycle
// late, aligned with read_data. It is forced low while rst_n is low;
// read_data itself has no reset.
logic read_in_range_pre;
assign read_in_range_pre = (read_addr [1:0] == 2'b00) &&
({1'b0, read_addr } + 33'd3 < 33'(BYTES));
always_ff @(posedge clk) begin
if (!rst_n) read_valid <= 1'b0;
else read_valid <= read_in_range_pre;
end
// ----------------------------------------------------------------
// Read port 1 — feature-strippable via ENABLE_READ2 (Ch251.4).
// When ENABLE_READ2=1: full sync read + range gate, matching the
// pre-Ch251.4 behaviour. When ENABLE_READ2=0: NO reference to
// `mem` from this branch, so Quartus does not infer a second M20K
// read port and the VRAM storage stops replicating.
// ----------------------------------------------------------------
generate
if (ENABLE_READ2) begin : g_read2_en
// Same index / data / valid structure as read port 0.
logic [WORD_AW-1:0] read2_word_idx;
assign read2_word_idx = read2_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);
always_ff @(posedge clk) begin
read2_data <= mem[read2_word_idx];
end
logic read2_in_range_pre;
assign read2_in_range_pre = (read2_addr[1:0] == 2'b00) &&
({1'b0, read2_addr} + 33'd3 < 33'(BYTES));
always_ff @(posedge clk) begin
if (!rst_n) read2_valid <= 1'b0;
else read2_valid <= read2_in_range_pre;
end
end else begin : g_read2_dis
// Stripped port: both outputs are registered constants, so they
// read 0 from the first clock edge onward regardless of read2_addr.
always_ff @(posedge clk) begin
read2_data <= 32'd0;
read2_valid <= 1'b0;
end
end
endgenerate
endmodule : vram_bram_stub
+200
View File
@@ -0,0 +1,200 @@
// retroDE_ps2 — vram_normalize_pkg (Ch155)
//
// Writer-side normalization for `vram_bram_stub`. The new BRAM-friendly
// VRAM (Ch154) requires word-aligned writes (`write_addr[1:0] == 0`)
// with payload pre-shifted into the selected byte lane(s) and
// `write_be` set per byte. Today's writer-side RTL emits at sub-word
// boundaries for PSMCT16 (halfword), PSMT8 (byte), and PSMT4 (nibble);
// this package's `normalize_write` function bridges the contract.
//
// Codex Ch155 framing: "Add a small helper module or function for
// VRAM write normalization: input: natural byte address, PSM,
// pixel/index payload, old byte for PSMT4 if needed; output:
// word-aligned write_addr, shifted write_data, write_be."
//
// Scope (Ch155):
// - Function is defined + standalone-verified for all 4 PSMs.
// - NOT yet applied inside `gs_stub.raster_pixel_emit` or
// `gif_image_xfer_stub`. The PSMT4 case needs a read-then-write
// pipeline upstream (to source `old_byte`); that's a Ch156+
// RTL plumbing chapter. CT32/CT16/T8 cases are pure-comb and
// can be plumbed in as soon as the wiring lands.
//
// Pure-comb function — no RTL pipelining inside the function itself.
// Callers that need a read-then-write pipeline (PSMT4) own that
// pipelining and pass the read result as `old_byte`.
`timescale 1ns/1ps
package vram_normalize_pkg;

    // GS pixel-storage-mode codes handled by normalize_write.
    localparam logic [5:0] PSM_PSMCT32 = 6'h00;
    localparam logic [5:0] PSM_PSMCT16 = 6'h02;
    localparam logic [5:0] PSM_PSMT8   = 6'h13;
    localparam logic [5:0] PSM_PSMT4   = 6'h14;

    // One normalized VRAM write request.
    typedef struct packed {
        logic [31:0] write_addr;   // word-aligned
        logic [31:0] write_data;   // payload shifted to lane
        logic [3:0]  write_be;     // per-byte write enable
    } norm_out_t;

    // ----------------------------------------------------------------
    // normalize_write — combinational conversion of a "natural" write
    // (byte address + PSM + LSB-justified payload) into a word-aligned
    // address, lane-shifted data and per-byte enables.
    //
    //   byte_addr  natural byte address of the pixel/index.
    //                CT32: must be word-aligned, otherwise dropped.
    //                CT16: bit 0 must be 0 (else dropped); bit 1 picks
    //                      the low or high halfword.
    //                T8/T4: any byte address; bits [1:0] pick the lane.
    //   psm        one of the PSM_* codes above.
    //   payload    value in the LSBs: [31:0] CT32, [15:0] CT16,
    //              [7:0] T8, [3:0] T4.
    //   nibble_hi  T4 only: 1 = replace the high nibble of the byte,
    //              0 = replace the low nibble.
    //   old_byte   T4 only: current byte at byte_addr; its other nibble
    //              is carried into the written byte.
    //
    // Returns write_addr = byte_addr with bits [1:0] cleared (for every
    // PSM, including dropped writes). A dropped write — misaligned
    // CT32/CT16 or an unrecognised PSM — has write_data = 0 and
    // write_be = 4'b0000.
    // ----------------------------------------------------------------
    function automatic norm_out_t normalize_write(
        input logic [31:0] byte_addr,
        input logic [5:0]  psm,
        input logic [31:0] payload,
        input logic        nibble_hi,
        input logic [7:0]  old_byte
    );
        norm_out_t  res;
        logic [1:0] lane_sel;     // byte lane inside the 32-bit word
        logic [7:0] lane_byte;    // byte to place for the T8 / T4 modes
        logic       single_byte;  // 1 = this PSM writes exactly one byte lane

        lane_sel    = byte_addr[1:0];
        lane_byte   = 8'd0;
        single_byte = 1'b0;

        // Begin from a dropped write; each supported case overrides it.
        res.write_addr = {byte_addr[31:2], 2'b00};
        res.write_data = 32'd0;
        res.write_be   = 4'b0000;

        case (psm)
            // Full-word write; only a word-aligned address is accepted.
            PSM_PSMCT32: begin
                if (lane_sel == 2'b00) begin
                    res.write_data = payload;
                    res.write_be   = 4'b1111;
                end
            end

            // Halfword write; odd byte addresses are dropped.
            PSM_PSMCT16: begin
                if (byte_addr[0] == 1'b0) begin
                    if (byte_addr[1]) begin
                        res.write_data = {payload[15:0], 16'd0};
                        res.write_be   = 4'b1100;
                    end else begin
                        res.write_data = {16'd0, payload[15:0]};
                        res.write_be   = 4'b0011;
                    end
                end
            end

            // One index byte at any byte address.
            PSM_PSMT8: begin
                lane_byte   = payload[7:0];
                single_byte = 1'b1;
            end

            // Nibble splice: keep the untouched half of old_byte and
            // write the merged byte back as a whole.
            PSM_PSMT4: begin
                lane_byte   = nibble_hi ? {payload[3:0], old_byte[3:0]}
                                        : {old_byte[7:4], payload[3:0]};
                single_byte = 1'b1;
            end

            // Any other PSM keeps the dropped-write defaults.
            default: ;
        endcase

        // Shared lane placement for the single-byte modes: shift the
        // byte by 0/8/16/24 bits and raise the matching enable bit.
        if (single_byte) begin
            res.write_data = {24'd0, lane_byte} << {lane_sel, 3'b000};
            res.write_be   = 4'b0001 << lane_sel;
        end

        return res;
    endfunction

endpackage : vram_normalize_pkg
+185
View File
@@ -0,0 +1,185 @@
// retroDE_ps2 — vram_stub (Ch89)
//
// Linear byte-addressable VRAM backing store for gs_stub's
// `raster_pixel_emit` channel. This is the FIRST persistence
// layer the rasterizer has had — pre-Ch89, pixels only pulsed as
// trace-visible events and updated `raster_pixel_color_q` /
// `raster_pixel_fb_addr_q` snapshot regs, then evaporated. Now
// they actually land somewhere a TB (or a future scanout path)
// can read back.
//
// Scope (intentionally minimal for Ch89):
// - Linear byte-addressable: NO page/block VRAM swizzle. Real
// PS2 VRAM is 4 MiB, organized into pages × blocks × columns
// per PSM. The fb_addr math in gs_stub matches the linear-
// framebuffer layout that PCSX2's gs_state pages out for
// "linear" PSM channels; that's what this stub speaks.
// - PSMCT32 only: writes 4 bytes per emitted pixel. PSMCT16
// (2 bytes) and PSMT8 (1 byte) are deferred until a future
// chapter exposes per-pixel PSM at the raster channel.
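// - Combinational debug read port: byte-addressable, returns
// the 4 bytes starting at read_addr packed little-endian.
// Added for TBs to verify pixel storage; as of Ch89 it was not on
// any hardware path (the port list below notes that gs_pcrtc_stub
// also uses it for scanout).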
//
// Wiring contract:
// - write_en ← gs_stub.raster_pixel_emit
// - write_addr ← gs_stub.raster_pixel_fb_addr_q
// - write_data ← gs_stub.raster_pixel_color_q[31:0] (lower 32 bits)
// - write_be ← gs_stub.raster_pixel_be_q (Ch95)
//
// The full 64-bit raster_pixel_color_q carries Q (texture-coord
// IEEE float) in the upper 32 bits — those bits are NOT part of
// the framebuffer pixel and are deliberately discarded here.
//
// `write_be[3:0]` (Ch95): per-byte write enable. byte i (the
// byte at `write_addr + i`) is committed only when
// `write_en && write_be[i]`. PSMCT32 writes use 4'b1111;
// PSMCT16 writes use 4'b0011 (the 2 bytes at write_addr — gs_stub
// passes the actual byte address of the pixel, which is
// 2-byte-aligned but not necessarily 4-byte-aligned). TBs that
// bypass gs_stub (e.g. `tb_vram_stub`, `tb_gs_scanout_psm16`)
// tie write_be to 4'b1111.
//
// `write_mask[31:0]` (Ch106): per-BIT merge mask used to support
// sub-byte writes (PSMT4 — 4-bit nibble per pixel). The committed
// byte i (still gated by write_be[i]) is:
// mem[addr+i] <= (mem[addr+i] & ~mask_i) | (data_i & mask_i)
// where mask_i = write_mask[i*8 +: 8] and data_i =
// write_data[i*8 +: 8]. PSMCT32/16 + PSMT8 writes tie write_mask
// to 32'hFFFFFFFF (full byte writes — equivalent to the pre-Ch106
// behavior). PSMT4 emits use 0x0F (low nibble) or 0xF0 (high
// nibble) on the enabled byte. The merge happens inside the same
// always_ff that commits the byte, so back-to-back nibble writes
// to the SAME byte chain cleanly through NBA semantics: the
// second write samples mem[addr] AFTER the prior NBA committed.
//
// Bounds check (Ch95 audit-medium fix): the write is admitted
// only if EVERY enabled byte's address is in [0, BYTES). This
// uses non-wrapping 33-bit arithmetic so a write near the 32-bit
// address space limit (e.g. write_addr near 0xFFFF_FFFC with
// be=4'b1111) is rejected cleanly. Halfword writes at the last
// valid 2-byte slot (write_addr=BYTES-2 with be=4'b0011) are
// accepted; write_addr=BYTES-1 with be=4'b0011 is rejected
// because byte 1 of that slot is OOB.
`timescale 1ns/1ps
module vram_stub
#(
    // Size of the backing store in bytes. Must be >= 4 (checked at
    // time 0 in simulation).
    parameter int unsigned BYTES = 65536
) (
    input  logic        clk,
    input  logic        rst_n,

    // Write channel. At most one 32-bit slot is committed per clock.
    //   write_addr : byte offset of lane 0 (gs_stub has already applied
    //                its PSM-aware bpp_shift math).
    //   write_be   : lane i (the byte at write_addr + i) is committed
    //                only when write_be[i] is set, so a PSMCT16 pixel
    //                can be stored without stomping the neighbouring
    //                halfword (Ch95).
    //   write_mask : per-bit merge mask; bits that are 0 keep the old
    //                memory contents (Ch106, sub-byte PSMT4 writes).
    input  logic        write_en,
    input  logic [31:0] write_addr,
    input  logic [31:0] write_data,
    input  logic [3:0]  write_be,
    input  logic [31:0] write_mask,

    // Read port 0: combinational, byte-addressed, returns four
    // consecutive bytes little-endian. Used by gs_pcrtc_stub for
    // scanout and by TBs for verification.
    input  logic [31:0] read_addr,
    output logic [31:0] read_data,

    // Read port 1 (Ch99): same semantics as port 0, for clients that
    // must read VRAM while pcrtc scanout is active (canonical user:
    // `clut_loader_stub`, which copies CLUT bytes from VRAM into
    // clut_stub when TEX0.CLD fires). TBs that don't need it tie
    // `read2_addr` to 0 and may leave `read2_data` unconnected.
    input  logic [31:0] read2_addr,
    output logic [31:0] read2_data
);

    // Byte-wide backing store.
    logic [7:0] mem [0:BYTES-1];

    // Highest base address at which a full 4-byte access still fits in
    // the array. Only the READ ports use this (they always fetch four
    // bytes); the write port checks every enabled lane individually so
    // halfword writes near the end of VRAM are not spuriously dropped.
    localparam logic [31:0] MAX_BASE = (BYTES >= 4)
                                     ? (32'(BYTES) - 32'd4)
                                     : 32'd0;

    // Simulation-only zero fill. On FPGA the block RAM powers up as
    // zero after configuration, so the loop is unnecessary for
    // synthesis, and at BYTES=8192 it would trip Quartus's 5000-
    // iteration synthesizable-loop limit (error 13356). The pragma
    // pair hides the block from Quartus; simulators such as iverilog
    // ignore the pragma and run it, giving deterministic time-0 state.
    // synthesis translate_off
    initial begin
        if (BYTES < 4)
            $error("vram_stub: BYTES (%0d) must be >= 4", BYTES);
        for (int idx = 0; idx < BYTES; idx++) begin
            mem[idx] = 8'd0;
        end
    end
    // synthesis translate_on

    // ------------------------------------------------------------------
    // Read port 0. Out-of-range base addresses read back as zero.
    // ------------------------------------------------------------------
    always_comb begin
        read_data = 32'd0;
        if (read_addr <= MAX_BASE) begin
            read_data[7:0]   = mem[read_addr];
            read_data[15:8]  = mem[read_addr + 32'd1];
            read_data[23:16] = mem[read_addr + 32'd2];
            read_data[31:24] = mem[read_addr + 32'd3];
        end
    end

    // ------------------------------------------------------------------
    // Read port 1. Identical behaviour to port 0.
    // ------------------------------------------------------------------
    always_comb begin
        read2_data = 32'd0;
        if (read2_addr <= MAX_BASE) begin
            read2_data[7:0]   = mem[read2_addr];
            read2_data[15:8]  = mem[read2_addr + 32'd1];
            read2_data[23:16] = mem[read2_addr + 32'd2];
            read2_data[31:24] = mem[read2_addr + 32'd3];
        end
    end

    // ------------------------------------------------------------------
    // Write admission.
    //
    // The address is widened to 33 bits before the lane offset is added
    // so an address close to 0xFFFF_FFFF cannot wrap around and pass the
    // range comparison. A lane is in range when its byte address is
    // strictly below BYTES. The write as a whole is admitted only when
    // every ENABLED lane is in range — there are no partial writes at
    // the boundary (Ch89-audit stance).
    // ------------------------------------------------------------------
    logic [32:0] addr33;
    logic [3:0]  byte_in_range;
    logic [3:0]  lane_ok;
    logic        write_admit;

    assign addr33 = {1'b0, write_addr};

    for (genvar b = 0; b < 4; b++) begin : g_lane_range
        assign byte_in_range[b] = (addr33 + 33'(b)) < 33'(BYTES);
    end

    // A lane passes when it is either disabled or in range.
    assign lane_ok     = ~write_be | byte_in_range;
    assign write_admit = write_en && (&lane_ok);

    // ------------------------------------------------------------------
    // Write commit. Each enabled lane is read-modify-written through
    // its slice of write_mask:
    //   new = (old & ~mask) | (data & mask)
    // Nothing is written while rst_n is low.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (rst_n && write_admit) begin
            for (int lane = 0; lane < 4; lane++) begin
                if (write_be[lane]) begin
                    mem[write_addr + lane] <=
                          (mem[write_addr + lane] & ~write_mask[8*lane +: 8])
                        | (write_data[8*lane +: 8] &  write_mask[8*lane +: 8]);
                end
            end
        end
    end

endmodule : vram_stub
+49
View File
@@ -0,0 +1,49 @@
# rtl/intc
Interrupt controller scaffolding. Matches `docs/contracts/intc.md`.
## Current contents
- `intc_stub.sv` — generic PS2-style INTC register shell.
Register-visible INTC_STAT / INTC_MASK (offsets parameterized) plus a
16-source injection port `irq_src[15:0]`. The aggregate output
`cpu_irq` is side-neutral: the same module is instantiated both
as the EE INTC and as the IOP INTC (with appropriate offsets and a
different set of wired sources in each case).
## Register semantics
- `INTC_STAT` (offset is a parameter; default 0x00): W1C on writes;
sticky until cleared. `irq_src` sets bits on each cycle they're
observed; same-cycle inject-over-W1C collisions keep the pending bit
— interrupts are never silently swallowed.
- `INTC_MASK` (offset is a parameter; default 0x10): plain write — the
low 16 bits of the written value replace the current mask. Real PS2
uses XOR/toggle semantics on mask writes; the stub uses a plain
replace for simplicity. Escalate if a BIOS trace demands it.
## Instantiation conventions
- **EE INTC**: default offsets (STAT=0x00, MASK=0x10). Instantiated
stand-alone in most benches; the EE memory map does not route INTC
addresses yet (deferred).
- **IOP INTC**: parameterized to STAT=0x70, MASK=0x74 to match real
PS2 IOP INTC placement. Reached through `iop_memory_map_stub` at
physical address 0x1F80_1070+ (region id = 5).
## Wired sources (current)
- EE INTC bit 0 = EE DMAC completion (`dmac_reg_stub.irq_completion_o`).
- IOP INTC bit 0 = IOP DMAC ch9 completion
(`iop_dmac_reg_stub.irq_completion_o`).
Both are one-cycle pulses driven from the respective DMAC's `S_DONE`
state. The INTC latches them into its own pending bit; software (the
TB, for now) reads STAT through the architectural register port and
acks with a W1C write.
## Scope boundary
Module is side-neutral by design. Source-routing from other real
subsystems (timers, GIF/GS, IPU, SPU2, bridge `last_seen_o`) is the
next natural expansion. Re-arm / re-assertion ordering is already
proven in the integration benches.
+189
View File
@@ -0,0 +1,189 @@
// retroDE_ps2 — intc_stub
//
// Generic PS2-style interrupt controller shell. Register-visible
// status/mask behaviour plus a 16-source injection port; the same
// module is reusable as either the EE-side or IOP-side INTC by picking
// the appropriate address offsets and instantiating with different
// sources. The aggregate output `cpu_irq` is side-neutral.
//
// Contract refs:
// docs/stub_module_plan.md (Wave 1, item 7)
// docs/contracts/intc.md
//
// Register layout (Wave 1; offsets are the parameter defaults, see
// INTC_STAT_OFFSET / INTC_MASK_OFFSET):
// offset 0x00: INTC_STAT read: current pending, write: W1C
// offset 0x10: INTC_MASK read: current mask, write: plain replace
//
// Real PS2 INTC_MASK uses write-to-toggle (XOR) semantics. Wave 1 uses
// plain write semantics for stub simplicity; toggle semantics are a
// Wave 2+ concern if BIOS traces demand them.
//
// Injection:
// irq_src[i] high on any cycle latches bit i in INTC_STAT. Sticky until
// cleared by a W1C write. Sixteen sources are exposed (matches real PS2
// INTC source count); testbenches drive whichever they need.
//
// Trace payload schema (per stub plan):
// INTC IRQ arg0=source_bitmap arg1=masked arg2=pending arg3=ack
// one event per cycle max. Priority if multiple triggers coincide:
// ack (STAT W1C) > new assertion > mask write.
// ack arg3=1 when the event is a W1C ack, 0 otherwise.
// flags bit 0 = register write (vs. source-driven assertion)
`timescale 1ns/1ps
module intc_stub
    import trace_pkg::*;
#(
    // Register offsets as seen on reg_addr. Defaults suit the EE-side
    // instance; other instances override them.
    parameter logic [7:0] INTC_STAT_OFFSET = 8'h00,
    parameter logic [7:0] INTC_MASK_OFFSET = 8'h10
) (
    input  logic        clk,
    input  logic        rst_n,

    // Register port. Reads return data one clock after reg_rd_en.
    input  logic        reg_wr_en,
    input  logic        reg_rd_en,
    input  logic [7:0]  reg_addr,
    input  logic [31:0] reg_wr_data,
    output logic [31:0] reg_rd_data,
    output logic        reg_rd_valid,

    // Interrupt source injection: a high bit latches the matching
    // INTC_STAT bit.
    input  logic [15:0] irq_src,

    // OR of all pending-and-unmasked bits. Generic name because the
    // same module serves both the EE and the IOP side.
    output logic        cpu_irq,

    // Trace event port (at most one event per clock).
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    // Architectural registers.
    logic [15:0] intc_stat;
    logic [15:0] intc_mask;

    // ------------------------------------------------------------------
    // Register read path: select combinationally, register the result.
    // reg_rd_data only changes on a read strobe; reg_rd_valid is the
    // strobe delayed by one clock. Unknown offsets read as zero.
    // ------------------------------------------------------------------
    logic [31:0] rd_select;

    always_comb begin
        case (reg_addr)
            INTC_STAT_OFFSET: rd_select = {16'd0, intc_stat};
            INTC_MASK_OFFSET: rd_select = {16'd0, intc_mask};
            default:          rd_select = 32'd0;
        endcase
    end

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reg_rd_data  <= 32'd0;
            reg_rd_valid <= 1'b0;
        end else begin
            reg_rd_valid <= reg_rd_en;
            if (reg_rd_en) reg_rd_data <= rd_select;
        end
    end

    // ------------------------------------------------------------------
    // Pending / mask state.
    //   - A write to INTC_STAT clears every pending bit whose write-data
    //     bit is 1 (W1C).
    //   - A write to INTC_MASK replaces the mask with the write data.
    //   - irq_src bits are OR-ed into INTC_STAT and stay set until
    //     cleared.
    //   - The OR with irq_src is applied AFTER the W1C clear, so when
    //     both hit the same bit in the same clock the bit stays pending:
    //     an interrupt is never swallowed.
    // ------------------------------------------------------------------
    logic        stat_wr;
    logic        mask_wr;
    logic [15:0] stat_w1c_mask;
    logic [15:0] stat_inject;
    logic [15:0] stat_next;

    assign stat_wr       = reg_wr_en && (reg_addr == INTC_STAT_OFFSET);
    assign mask_wr       = reg_wr_en && (reg_addr == INTC_MASK_OFFSET);
    assign stat_w1c_mask = stat_wr ? reg_wr_data[15:0] : 16'd0;
    assign stat_inject   = irq_src;
    assign stat_next     = (intc_stat & ~stat_w1c_mask) | stat_inject;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            intc_stat <= 16'd0;
            intc_mask <= 16'd0;
        end else begin
            intc_stat <= stat_next;
            if (mask_wr) intc_mask <= reg_wr_data[15:0];
        end
    end

    assign cpu_irq = |(intc_stat & intc_mask);

    // ------------------------------------------------------------------
    // Trace event selection (combinational, on pre-edge state).
    //
    // Priority when several triggers coincide:
    //   ack (STAT W1C hitting a pending bit) > new assertion > mask write
    //
    // Payload:
    //   arg0  = bits involved (acked bits / newly asserted bits / 0)
    //   arg1  = pending & mask
    //   arg2  = pending
    //   arg3  = 1 for an ack event, 0 otherwise
    //   flags = bit 0 set for register-write-driven events
    // ------------------------------------------------------------------
    logic [15:0] new_assertions;   // injected bits not already pending
    logic [15:0] bits_acked;       // W1C bits that hit a pending bit
    logic [15:0] stat_with_inject; // pending view used by assertion events
    logic        had_ack;
    logic        had_assertion;
    logic        had_mask_wr;

    assign new_assertions   = stat_inject & ~intc_stat;
    assign bits_acked       = stat_w1c_mask & intc_stat;
    assign stat_with_inject = intc_stat | stat_inject;
    assign had_ack          = |bits_acked;
    assign had_assertion    = |new_assertions;
    assign had_mask_wr      = mask_wr;

    logic        tr_fire;
    logic [63:0] tr_arg0, tr_arg1, tr_arg2, tr_arg3;
    logic [31:0] tr_flags;

    always_comb begin
        tr_fire  = 1'b1;
        tr_arg0  = 64'd0;
        tr_arg1  = 64'd0;
        tr_arg2  = 64'd0;
        tr_arg3  = 64'd0;
        tr_flags = 32'd0;

        if (had_ack) begin
            // arg1/arg2 show the post-update state, so a bit that is
            // acked and re-injected in the same clock is still reported
            // as pending. arg0 reports the pending bits software acked.
            tr_arg0  = {48'd0, bits_acked};
            tr_arg1  = {48'd0, stat_next & intc_mask};
            tr_arg2  = {48'd0, stat_next};
            tr_arg3  = 64'd1;
            tr_flags = 32'h0000_0001;
        end else if (had_assertion) begin
            tr_arg0  = {48'd0, new_assertions};
            tr_arg1  = {48'd0, stat_with_inject & intc_mask};
            tr_arg2  = {48'd0, stat_with_inject};
        end else if (had_mask_wr) begin
            // arg1 uses the mask being written, not the old mask.
            tr_arg1  = {48'd0, intc_stat & reg_wr_data[15:0]};
            tr_arg2  = {48'd0, intc_stat};
            tr_flags = 32'h0000_0001;
        end else begin
            tr_fire  = 1'b0;
        end
    end

    // Registered trace output. On a clock with no event only ev_valid
    // changes; the payload registers hold their previous contents.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_INTC;
            ev_event  <= EV_IRQ;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else begin
            ev_valid <= tr_fire;
            if (tr_fire) begin
                ev_subsys <= SUBSYS_INTC;
                ev_event  <= EV_IRQ;
                ev_arg0   <= tr_arg0;
                ev_arg1   <= tr_arg1;
                ev_arg2   <= tr_arg2;
                ev_arg3   <= tr_arg3;
                ev_flags  <= tr_flags;
            end
        end
    end

endmodule : intc_stub
+151
View File
@@ -0,0 +1,151 @@
# rtl/iop
IOP subsystem. Matches `docs/contracts/iop.md`.
## Current contents
- `iop_ram_stub.sv` — 32-bit IOP RAM primitive. Default 16 KiB,
parameterizable. Read + write ports with byte-enable granularity,
one-cycle read latency, caller-provided `master_id` for trace
attribution. Emits trace events under `SUBSYS_IOP`.
- `iop_memory_map_stub.sv` — IOP-side address decode. CPU-side port uses
kseg0/kseg1 stripping (`phys = iop_addr[28:0]`). Second write-master
port for DMA bridges (`bridge_wr_*`), physical addressing. Regions now
decoded:
- IOP RAM (phys 0x00000000-0x001FFFFF) → `iop_ram_stub`
- SIF registers (phys 0x1D000000 block) → SIF register shell
(`sif_mailbox_stub` IOP side) via `sif_rd_*` / `sif_wr_*` ports
- IOP DMAC channel 9 (phys 0x1F801520-0x1F80152F) → IOP DMAC
register shell via `iop_dmac_rd_*` / `iop_dmac_wr_*` ports
- IOP INTC (phys 0x1F801070-0x1F80107F) → `intc_stub` (IOP-side
instance) via `iop_intc_rd_*` / `iop_intc_wr_*` ports
- Shared BIOS ROM (phys 0x1FC00000-0x1FFFFFFF, 4 MiB) →
`bios_rom_stub` via `bios_rd_*` port. kseg1 aliasing makes
`0xBFC0_0000` reset fetches land here transparently. BIOS is
read-only; writes to this window trace as UNMAPPED.
- everything else → UNMAPPED with deterministic 0xDEADBEEF
Future regions (other DMAC channels, IOP timers, SPU2) reserved in
comments. Arbitration between CPU and bridge writes on RAM path:
CPU wins on same-cycle collision. SIF, DMAC, INTC, and BIOS are
separate ports and don't contend with RAM.
- `iop_fetch_stub.sv` — minimal sequential 32-bit fetcher. Mirrors
`ee_fetch_stub` in shape: PC-incrementing, no decode, no branches, no
exceptions. Default `RESET_VECTOR` is in IOP RAM (0x00000000), NOT in
BIOS space — explicitly non-BIOS boot. Emits `IOP RESET` once and
`IOP IFETCH` per response. First execution-visible IOP traffic in the
project; fetches route through `iop_memory_map_stub`.
- `iop_core_stub.sv` — **real instruction-decoding IOP core with
minimal COP0, asynchronous interrupt exception entry, and the
architectural MIPS reset vector.** Tiny MIPS R3000 subset,
multi-cycle FSM, speaks the same map / DMAC / INTC protocol as every
previous engine. Default `PC_RESET = 0xBFC0_0000` (kseg1 into the
shared BIOS window; override with a parameter for RAM-only tests).
Supported opcodes:
LUI, ORI, ADDIU, LW, SW, BEQ, BNE, J, JR (SPECIAL func 0x08),
NOP (any other SPECIAL func / unknown opcode), SYSCALL
(SPECIAL func 0x0C, halts), **MFC0 / MTC0 / RFE** (COP0 opcode 0x10).
32-entry register file with `$0` hardwired.
**COP0 subset:** Status (IE/KU triple stack + IM), Cause (ExcCode +
IP reflecting cpu_irq), EPC. Exception entry is sampled at clean
instruction-retire boundaries: if a delay slot is outstanding, the
exception defers until the delay slot resolves. On entry: push IE/KU
stack, ExcCode=0, save EPC=next_pc, PC←EXC_VECTOR (parameter).
**Branch delay slot** honoured from day one; taken-branch and
delay-slot retires are both flagged in the trace.
**Strict mode:** `STRICT_UNSUPPORTED` parameter (default 0). When
set, unsupported opcodes halt the core and latch the offending
pc/instr word into `trap_o` / `trap_pc_o` / `trap_instr_o` instead
of silently retiring as NOPs. The canonical NOP (`instr == 32'h0`,
SLL $0,$0,0) is always treated as a real NOP. Retire trace flag
bit 7 marks strict-trap retires. Used by the BIOS smoke TB; other
benches leave it off for backwards compatibility.
Deferred: BD bit in Cause, nested interrupts, syscall/break
exception dispatch, R-type ALU/shifts/HI-LO.
- `iop_exec_stub.sv` — **RAM-backed IOP execution primitive (bridge
module).**
Not a MIPS core, not an ISA decoder. A tiny FSM sequencer that fetches
its micro-ops from IOP RAM through the real `iop_memory_map_stub`
CPU-side port — the same way a future instruction-fetching CPU will.
The control program is no longer RTL-resident; it lives as data in
RAM that someone (a TB, eventually a BIOS loader) preloads before
pulsing `go_i`.
**Five opcodes**: `HALT`, `WRITE(addr, data)`, `READ(addr)`,
`WAIT_IRQ`, `BNE(target_pc, expected)` — branch if the last READ's
result does not equal `expected`, enabling real loops.
Op layout in RAM: 16 bytes per op (`pc<<4` addressing). Word 0 is
the opcode (low 4 bits), word 1 is addr or branch target, word 2 is
data or expected value, word 3 is reserved. `SCRIPT_BASE` is a
parameter (default 0x0000_0400).
Takes `cpu_irq` from the IOP INTC; `WAIT_IRQ` genuinely blocks until
a real interrupt asserts. One trace event per op completion with
flag bits marking WAIT_IRQ exit (bit 1), HALT entry (bit 2), and
BNE taken (bit 3). When a real MIPS decode primitive eventually
arrives, it replaces this module while keeping the same map / DMA /
INTC hookup verbatim.
- `iop_dmac_reg_stub.sv` — IOP DMAC for one SIF-facing channel
(CHANNEL=9, PATH_ID=9, MASTER_ID=4). Register surface (low-byte
offsets): MADR @ 0x00, BCR @ 0x04, CHCR @ 0x08, DONE_COUNT @ 0x0C
(read-only monotonic counter); start bit is CHCR[0].
Real data path: on start, DMAC latches MADR/BCR, then steps through
IDLE → FETCH_WAIT → ACTIVE_SEND → DONE per beat, sourcing 32-bit words
from IOP RAM through the map's `dma_rd_*` port (src_addr stepping by
4 per beat). Endpoint is a word-granularity ready/valid/last stream
with `ep_ready` back-pressure — no false completion under stall.
Emits DMA_CFG on register writes, DMA_START on arm, DMA_BEAT per
accepted beat (with src_addr + remaining count), DMA_DONE on the
final beat. `done_count_o` is a monotonic visible counter.
`irq_completion_o` is a one-cycle pulse on S_DONE — wired into the
IOP INTC as source bit 0 so software can observe channel completion.
Only reachable through the real IOP map at 0x1F80_1520.
## Explicit non-goals (current step)
- Full MIPS R3000 ISA coverage (the core is still a narrow subset;
strict-mode halts on the first unsupported opcode so the BIOS tells
us what to grow next)
- Full 2 MiB RAM sizing (stub defaults stay small for sim speed; the map
window is 2 MiB and truncates at the connection to the smaller stub)
- IOP I/O beyond the currently decoded regions (SIF registers / DMAC
ch9 / INTC / BIOS); SPU2, timers, and other peripherals are not wired
yet
- IOP DMAC channels other than ch9 (SIF0 IOP→EE)
- Real Sony BIOS execution (the smoke TB's synthetic bootstrap is the
current committed content; swapping in a user-supplied dump is a
drop-in exercise that will reveal the next missing opcode)
## Scope boundary
This directory owns IOP CPU execution, IOP-local RAM/I/O decode, IOP
interrupt intake, IOP DMAC channels, and BIOS-side IOP boot sequencing
behavior (per `docs/contracts/iop.md`).
The IOP side now runs a MIPS R3000 subset from an architecturally
correct BIOS reset vector, with precise interrupt exception entry and
a RAM-resident ISR. The project has crossed five architectural seams:
1. TB-orchestrated → fabric-orchestrated (scripted exec stub)
2. RTL-resident → RAM-resident control (exec stub reads ops from RAM)
3. Micro-op bridge → real ISA decode (iop_core_stub)
4. Polled completion → asynchronous exception-driven control flow
(COP0 + cpu_irq)
5. TB-preloaded RAM as reset source → BIOS ROM at 0xBFC0_0000
(shared BIOS wired through the IOP map; hand-assembled bootstraps
prove the seam before any real Sony BIOS is attempted)
Each seam preserved every prior module — only where code comes from
evolved.
## Planned next increments
These are possibilities, not commitments — order will be decided per the
next architectural question:
- **BIOS-driven core growth:** point `tb_iop_core_bios_smoke` at a
user-supplied BIOS dump (swap the TB's synthetic preload for
`$readmemh` into `u_bios.mem`), observe the first unsupported
opcode, add it to `iop_core_stub`, repeat. Expected near-term
additions: ANDI, ADDU/SUBU, SLL/SRL/SRA, JAL, SLT(U). Do not add
speculatively; let the BIOS trace drive the order.
- Core exception growth as the BIOS path demands it: BD bit in
Cause, nested interrupts, syscall/break exception dispatch.
- Other IOP DMAC channels (CDVD / SPU2 / DEV9 / SIF1-2 / SIO2).
- IOP map expansion: remaining IOP I/O (0x1F800000), SPU2
(0x1F900000).
+711
View File
@@ -0,0 +1,711 @@
// retroDE_ps2 — iop_core_stub
//
// Minimal MIPS R3000 subset for the IOP side, now with real interrupt
// exception entry. The engine sits where `iop_exec_stub` sat, drives
// `iop_memory_map_stub`'s CPU-side port for ifetch and data accesses,
// and finally *uses* `cpu_irq` from the IOP INTC instead of ignoring it.
//
// Wave 1 (decode): LUI/ORI/ADDIU/LW/SW/BEQ/BNE/J/NOP/SYSCALL, honest
// branch delay slots. Programs polled INTC_STAT through the real map.
//
// Wave 2 (this module revision): minimal COP0 + asynchronous interrupt
// exception entry. cpu_irq becomes a real vectoring event when
// enabled through Status. Mainline no longer needs to touch INTC_STAT;
// an ISR at the exception vector handles acknowledgement.
//
// Intentionally still NOT a full R3000:
// - No TLB / cache / HI/LO / R-type ALU / shifts / mul / div.
// - No syscall / break exception *handling* beyond SYSCALL-as-halt.
// - No BD bit in Cause for branch-delay exceptions (we simply
// refuse to take exceptions between a taken branch and its delay
// slot — see "Exception entry semantics" below).
// - No kernel/user mode enforcement: KU state exists on the stack
// for forward compatibility but nothing in the core consults it.
//
// Supported opcodes (MIPS encoding):
// SPECIAL (opcode = 0x00):
// func 0x08 (JR) — pc <= rs_val; has delay slot.
// func 0x0C (SYSCALL) — halt_o asserts; FSM stops fetching.
// any other func — treated as NOP (incl. SLL $0,$0,0).
// 0x02 J — jump; has delay slot.
// 0x04 BEQ / 0x05 BNE — conditional branch; has delay slot.
// 0x09 ADDIU — no overflow trap.
// 0x0D ORI / 0x0F LUI — logical immediate / upper load.
// 0x10 COP0:
// rs 0x00 (MFC0) — rt <= COP0[rd]
// rs 0x04 (MTC0) — COP0[rd] <= rt
// rs 0x10, func 0x10 (RFE)
// — shift IE/KU stack right (pop)
// 0x23 LW / 0x2B SW — word memory access.
// Anything else — treated as NOP.
//
// COP0 register surface (subset):
// 12 Status [0]=IEc [1]=KUc [2]=IEp [3]=KUp [4]=IEo [5]=KUo
// [15:8]=IM (bit 10 = IM2 gates the HW interrupt
// wired to cpu_irq)
// 13 Cause [6:2]=ExcCode, [15:8]=IP. IP[2] reflects cpu_irq.
// Software may write Cause but we only latch SW
// interrupt pending bits IP[1:0] — not load-bearing
// in the first TB.
// 14 EPC saved PC on exception entry.
//
// Exception entry semantics:
// Sampled at *instruction-retire boundaries*, never mid-fetch or
// mid-memory. An exception is taken iff all of the following hold
// at the retire boundary:
// - Status.IEc == 1 (master interrupts enabled)
// - Cause.IP[i] & Status.IM[i] (any unmasked pending source)
// - new_branch_pending == 0 (delay slot already resolved)
// On entry:
// EPC <= next_pc (the pc that would have been
// fetched next; branch_target
// if a delay slot just resolved,
// pc+4 otherwise)
// Cause.ExcCode <= 5'h00 (Int exception)
// Status stack pushes left:
// IEo <= IEp; IEp <= IEc; IEc <= 0
// KUo <= KUp; KUp <= KUc; KUc <= 0
// pc <= EXC_VECTOR (fixed, parameter)
// branch_pending <= 0 (any pending control flow is
// canceled; EPC captured it)
//
// RFE semantics (pop stack, one level):
// IEc <= IEp; IEp <= IEo
// KUc <= KUp; KUp <= KUo
// (IEo, KUo left intact — matches impl-defined R3000 behaviour
// for non-nested use)
//
// Trace (SUBSYS_IOP, EV_IFETCH one-per-retire as before):
// flags bit 0 = SW (write) (unchanged)
// flags bit 1 = LW (read) (unchanged)
// flags bit 2 = branch / jump taken (unchanged)
// flags bit 3 = SYSCALL (halt) (unchanged)
// flags bit 4 = this instruction was in a delay slot
// flags bit 5 = exception taken at the end of this instruction
// (EPC saved = next_pc, PC redirected to EXC_VECTOR)
// flags bit 6 = RFE retired (IE stack popped)
// flags bit 7 = strict trap (unsupported instruction halted the core)
//
// Strict mode (STRICT_UNSUPPORTED parameter):
// Default is 0 (lenient) to preserve every prior bench's regression
// behaviour — any instruction the core doesn't actively decode retires
// as a NOP. When STRICT_UNSUPPORTED=1, the core instead halts on the
// first unsupported opcode it encounters, latches the offending PC +
// instruction word into trap_pc_o / trap_instr_o, asserts trap_o, and
// emits a retire trace with flag bit 7 set. Intended for real-BIOS
// smoke bring-up — "the first missing opcode is the one the core
// needs to grow next." The canonical NOP (32'h0000_0000 =
// SLL $0,$0,0) is always treated as a NOP regardless of strict mode.
`timescale 1ns/1ps
module iop_core_stub
import trace_pkg::*;
#(
// Architectural MIPS R3000 reset vector (kseg1 into the shared BIOS
// window). kseg1 strip in iop_memory_map_stub maps this to physical
// 0x1FC0_0000, which the map now routes to bios_rom_stub.
// Tests that don't have a BIOS image must override PC_RESET.
parameter logic [31:0] PC_RESET = 32'hBFC0_0000,
parameter logic [31:0] EXC_VECTOR = 32'h0000_0080,
// See header comment "Strict mode". Default 0 preserves existing
// regression behaviour; BIOS-oriented benches should set to 1.
parameter bit STRICT_UNSUPPORTED = 1'b0
) (
input logic clk,
input logic rst_n,
input logic go_i,
output logic map_rd_en,
output logic [31:0] map_rd_addr,
input logic [31:0] map_rd_data,
input logic map_rd_valid,
output logic map_wr_en,
output logic [31:0] map_wr_addr,
output logic [31:0] map_wr_data,
output logic [3:0] map_wr_be,
input logic cpu_irq,
output logic halt_o,
output logic [31:0] pc_o,
// Strict-mode trap reporting. `trap_o` rises the cycle the core
// halts on an unsupported instruction; `trap_pc_o` / `trap_instr_o`
// latch the offending fetch. All three stay stable after the halt.
output logic trap_o,
output logic [31:0] trap_pc_o,
output logic [31:0] trap_instr_o,
output logic ev_valid,
output subsys_e ev_subsys,
output event_e ev_event,
output logic [63:0] ev_arg0,
output logic [63:0] ev_arg1,
output logic [63:0] ev_arg2,
output logic [63:0] ev_arg3,
output logic [31:0] ev_flags
);
// ------------------------------------------------------------------
// Opcode / func / COP0 rs constants
// ------------------------------------------------------------------
localparam logic [5:0] OP_SPECIAL = 6'h00;
localparam logic [5:0] OP_J = 6'h02;
localparam logic [5:0] OP_BEQ = 6'h04;
localparam logic [5:0] OP_BNE = 6'h05;
localparam logic [5:0] OP_ADDIU = 6'h09;
localparam logic [5:0] OP_ORI = 6'h0D;
localparam logic [5:0] OP_LUI = 6'h0F;
localparam logic [5:0] OP_COP0 = 6'h10;
localparam logic [5:0] OP_LW = 6'h23;
localparam logic [5:0] OP_SW = 6'h2B;
localparam logic [5:0] FUNC_JR = 6'h08;
localparam logic [5:0] FUNC_SYSCALL = 6'h0C;
localparam logic [5:0] FUNC_RFE = 6'h10;
localparam logic [4:0] COP0_RS_MF = 5'h00;
localparam logic [4:0] COP0_RS_MT = 5'h04;
localparam logic [4:0] COP0_RS_CO = 5'h10;
localparam logic [4:0] COP0_REG_STATUS = 5'd12;
localparam logic [4:0] COP0_REG_CAUSE = 5'd13;
localparam logic [4:0] COP0_REG_EPC = 5'd14;
// ------------------------------------------------------------------
// FSM state
// ------------------------------------------------------------------
typedef enum logic [3:0] {
S_IDLE = 4'd0,
S_IFETCH_REQ = 4'd1,
S_IFETCH_WAIT = 4'd2,
S_EXECUTE = 4'd3,
S_MEM_REQ = 4'd4,
S_MEM_WAIT = 4'd5,
S_MEM_WRITE = 4'd6,
S_HALT = 4'd7
} state_e;
state_e state;
// Architectural state
logic [31:0] pc;
logic [31:0] instr;
logic [31:0] regfile [0:31];
// Branch delay-slot tracking
logic branch_pending;
logic [31:0] branch_target;
logic instr_in_delay_slot;
// COP0 — Status (IE/KU triple stack + IM)
logic status_iec, status_iep, status_ieo;
logic status_kuc, status_kup, status_kuo;
logic [7:0] status_im;
// COP0 — Cause / EPC
logic [4:0] cause_exc_code;
logic [7:0] cause_ip_sw; // software-writable pending bits (IP[1:0])
logic [31:0] epc;
// Combinational composition of IP. IP[2] mirrors cpu_irq directly;
// higher sources are not wired in the current scope.
logic [7:0] cause_ip;
always_comb begin
cause_ip = 8'd0;
cause_ip[1:0] = cause_ip_sw[1:0];
cause_ip[2] = cpu_irq;
end
// Composed Status word (for MFC0) and Cause word (for MFC0)
logic [31:0] status_word;
logic [31:0] cause_word;
always_comb begin
status_word = 32'd0;
status_word[0] = status_iec;
status_word[1] = status_kuc;
status_word[2] = status_iep;
status_word[3] = status_kup;
status_word[4] = status_ieo;
status_word[5] = status_kuo;
status_word[15:8] = status_im;
cause_word = 32'd0;
cause_word[6:2] = cause_exc_code;
cause_word[15:8] = cause_ip;
end
// ------------------------------------------------------------------
// Decode — combinational extraction from `instr`
// ------------------------------------------------------------------
logic [5:0] opcode;
logic [4:0] rs_idx;
logic [4:0] rt_idx;
logic [4:0] rd_idx;
logic [5:0] func;
logic [15:0] imm16;
logic [25:0] imm26;
logic [31:0] imm_sx;
logic [31:0] imm_zx;
logic [31:0] branch_offset;
logic [31:0] branch_tgt;
logic [31:0] j_tgt;
logic [31:0] rs_val;
logic [31:0] rt_val;
logic [31:0] ea;
assign opcode = instr[31:26];
assign rs_idx = instr[25:21];
assign rt_idx = instr[20:16];
assign rd_idx = instr[15:11];
assign imm16 = instr[15:0];
assign imm26 = instr[25:0];
assign func = instr[5:0];
assign imm_sx = {{16{imm16[15]}}, imm16};
assign imm_zx = {16'd0, imm16};
assign branch_offset = {{14{imm16[15]}}, imm16, 2'b00};
assign branch_tgt = pc + 32'd4 + branch_offset;
assign j_tgt = {pc[31:28], imm26, 2'b00};
assign rs_val = (rs_idx == 5'd0) ? 32'd0 : regfile[rs_idx];
assign rt_val = (rt_idx == 5'd0) ? 32'd0 : regfile[rt_idx];
assign ea = rs_val + imm_sx;
// Instruction classification flags, all decoded from opcode / rs / func.
logic is_special, is_syscall, is_jr;
logic is_cop0, is_mfc0, is_mtc0, is_rfe;
logic is_lui, is_ori, is_addiu, is_lw, is_sw, is_beq, is_bne, is_j;
logic is_branch, is_jump;
logic branch_taken;
logic is_taken_branch_or_jump;
logic is_nop_class;
logic is_nop_instr;
logic is_unsupported;
logic strict_trap;

// Primary-opcode matches.
assign is_special = (opcode == OP_SPECIAL);
assign is_cop0 = (opcode == OP_COP0);
assign is_lui = (opcode == OP_LUI);
assign is_ori = (opcode == OP_ORI);
assign is_addiu = (opcode == OP_ADDIU);
assign is_lw = (opcode == OP_LW);
assign is_sw = (opcode == OP_SW);
assign is_beq = (opcode == OP_BEQ);
assign is_bne = (opcode == OP_BNE);
assign is_j = (opcode == OP_J);

// SPECIAL sub-decode (by func).
assign is_syscall = is_special && (func == FUNC_SYSCALL);
assign is_jr = is_special && (func == FUNC_JR);

// COP0 sub-decode (by rs, plus func for RFE).
assign is_mfc0 = is_cop0 && (rs_idx == COP0_RS_MF);
assign is_mtc0 = is_cop0 && (rs_idx == COP0_RS_MT);
assign is_rfe = is_cop0 && (rs_idx == COP0_RS_CO) && (func == FUNC_RFE);

// Control-flow grouping and branch resolution.
assign is_branch = is_beq || is_bne;
assign is_jump = is_j || is_jr;
assign branch_taken = (is_beq && (rs_val == rt_val)) ||
                      (is_bne && (rs_val != rt_val));
assign is_taken_branch_or_jump = branch_taken || is_jump;

// "NOP class" covers every encoding that is not actively decoded: an
// undecoded SPECIAL func, an undecoded COP0 form, or an undecoded primary
// opcode. Lenient mode retires these as NOPs; strict mode halts on them
// (see `is_unsupported` / `strict_trap`).
assign is_nop_class = (is_special && !(is_syscall || is_jr))
                   || (is_cop0 && !(is_mfc0 || is_mtc0 || is_rfe))
                   || !(is_special || is_cop0
                        || is_lui || is_ori || is_addiu
                        || is_lw || is_sw || is_beq || is_bne
                        || is_j);

// The all-zero word (SLL $0,$0,0) is the canonical NOP and is exempt from
// strict-mode trapping, so the bios_rom_stub default NOP sled does not
// read as a run of traps.
assign is_nop_instr = (instr == 32'd0);
assign is_unsupported = is_nop_class && !is_nop_instr;
assign strict_trap = STRICT_UNSUPPORTED && is_unsupported;
// ALU writeback value. Only LUI / ORI / ADDIU produce a result; every
// other instruction leaves the value at zero.
logic [31:0] alu_wb;
always_comb begin
    alu_wb = 32'd0;
    if (is_lui)
        alu_wb = {imm16, 16'd0};      // immediate placed in the upper half
    else if (is_ori)
        alu_wb = rs_val | imm_zx;     // logical op uses the zero-extended imm
    else if (is_addiu)
        alu_wb = rs_val + imm_sx;     // arithmetic op uses the sign-extended imm
end
// MFC0 read mux, selected by rd_idx. Registers other than Status / Cause /
// EPC read back as zero.
logic [31:0] cop0_read_val;
always_comb begin
    cop0_read_val = 32'd0;
    unique case (rd_idx)
        COP0_REG_STATUS: cop0_read_val = status_word;
        COP0_REG_CAUSE:  cop0_read_val = cause_word;
        COP0_REG_EPC:    cop0_read_val = epc;
        default: ;  // keep the zero default
    endcase
end
// Target of a taken branch or jump: JR uses the register value, J uses the
// region-relative jump target, and everything else falls back to the
// pc-relative branch target.
logic [31:0] taken_target;
always_comb begin
    taken_target = branch_tgt;
    if (is_jr)
        taken_target = rs_val;
    else if (is_j)
        taken_target = j_tgt;
end
// ------------------------------------------------------------------
// Trace book-keeping (captured at retire)
//
// These registers are written by the main FSM in the cycle an
// instruction retires and are consumed by the trace-emission block,
// which packs the flags into ev_flags[7:0] as:
//   bit 7 trap, bit 6 rfe, bit 5 except, bit 4 in_delay,
//   bit 3 halt, bit 2 branch, bit 1 read, bit 0 write
// ------------------------------------------------------------------
logic [31:0] retired_pc;      // pc of the retiring instruction   (-> ev_arg0)
logic [31:0] retired_instr;   // raw instruction word             (-> ev_arg1)
logic [31:0] retired_arg2;    // per-op payload, e.g. ea / target / COP0 reg index
logic [31:0] retired_arg3;    // per-op payload, e.g. load/store data / ALU result
logic retired_flag_write;     // SW retired
logic retired_flag_read;      // LW retired
logic retired_flag_branch;    // taken branch or jump retired
logic retired_flag_halt;      // SYSCALL halted the core
logic retired_flag_in_delay;  // instruction was fetched while a branch was pending
logic retired_flag_except;    // retire_advance entered the exception vector
logic retired_flag_rfe;       // RFE retired
logic retired_flag_trap;      // strict-mode unsupported-instruction trap
logic retire_pulse;           // one-cycle strobe: retire bookkeeping is valid
// ------------------------------------------------------------------
// Map-port drive (purely combinational, keyed on the FSM state)
//
//   S_IFETCH_REQ : read at pc (instruction fetch)
//   S_MEM_REQ    : read at ea (LW)
//   S_MEM_WRITE  : full-word write of rt_val at ea (SW)
//   otherwise    : port idle, all outputs zero
// ------------------------------------------------------------------
always_comb begin
    // Idle defaults.
    map_rd_en   = 1'b0;
    map_rd_addr = 32'd0;
    map_wr_en   = 1'b0;
    map_wr_addr = 32'd0;
    map_wr_data = 32'd0;
    map_wr_be   = 4'd0;

    if (state == S_IFETCH_REQ) begin
        map_rd_en   = 1'b1;
        map_rd_addr = pc;
    end else if (state == S_MEM_REQ) begin
        map_rd_en   = 1'b1;
        map_rd_addr = ea;
    end else if (state == S_MEM_WRITE) begin
        map_wr_en   = 1'b1;
        map_wr_addr = ea;
        map_wr_data = rt_val;
        map_wr_be   = 4'b1111;
    end
end
// ------------------------------------------------------------------
// Retire helper — applies pc advance, branch queuing, and
// exception entry at a clean instruction boundary.
//
// Inputs (implicit from decoded state):
// - is_taken_branch_or_jump, taken_target
// - branch_pending (current, pre-advance)
// - branch_target (the pending target if any)
// - Status / Cause state for exception gating
//
// Outputs (all registered on this clock edge):
// - pc, branch_pending, branch_target
// - epc, status/cause on exception
// - retired_flag_except set when exception fires
// ------------------------------------------------------------------
task automatic retire_advance;
    // Where execution continues after this instruction: the queued branch
    // target if this instruction sat in a delay slot, else the next word.
    logic [31:0] resume_pc;
    // True when this instruction is itself a taken branch/jump, i.e. the
    // next instruction is its delay slot.
    logic        opens_delay_slot;
    // An enabled, unmasked interrupt is pending.
    logic        irq_live;
    // Interrupt entry is taken at this boundary.
    logic        take_exception;

    resume_pc        = branch_pending ? branch_target : pc + 32'd4;
    opens_delay_slot = is_taken_branch_or_jump;
    irq_live         = status_iec && (|(cause_ip & status_im));
    // Never vector between a branch and its delay slot.
    take_exception   = irq_live && !opens_delay_slot;

    if (take_exception) begin
        // Redirect to the exception vector and remember where to resume.
        pc                  <= EXC_VECTOR;
        epc                 <= resume_pc;
        branch_pending      <= 1'b0;
        cause_exc_code      <= 5'h00; // Int exception code
        // Push the interrupt-enable stack (current -> previous -> old).
        status_iec          <= 1'b0;
        status_iep          <= status_iec;
        status_ieo          <= status_iep;
        // Push the kernel/user stack the same way.
        status_kuc          <= 1'b0;
        status_kup          <= status_kuc;
        status_kuo          <= status_kup;
        retired_flag_except <= 1'b1;
    end else begin
        // Normal advance; queue the new target if a delay slot follows.
        retired_flag_except <= 1'b0;
        pc                  <= resume_pc;
        branch_pending      <= opens_delay_slot;
        if (opens_delay_slot) branch_target <= taken_target;
    end
endtask
// ------------------------------------------------------------------
// Main FSM
// ------------------------------------------------------------------
// Main sequencer: fetch -> execute -> (optional memory access) -> retire.
// One instruction is in flight at a time. ALU / branch / COP0 / NOP ops
// retire in S_EXECUTE, LW retires in S_MEM_WAIT, and SW retires in
// S_MEM_WRITE. Every retire raises retire_pulse for one cycle and calls
// retire_advance() to move pc (or enter the exception vector).
always_ff @(posedge clk) begin
if (!rst_n) begin
state <= S_IDLE;
pc <= PC_RESET;
instr <= 32'd0;
branch_pending <= 1'b0;
branch_target <= 32'd0;
instr_in_delay_slot <= 1'b0;
// COP0 reset state: interrupts disabled, mask cleared.
status_iec <= 1'b0;
status_iep <= 1'b0;
status_ieo <= 1'b0;
status_kuc <= 1'b0;
status_kup <= 1'b0;
status_kuo <= 1'b0;
status_im <= 8'd0;
cause_exc_code <= 5'd0;
cause_ip_sw <= 8'd0;
epc <= 32'd0;
retire_pulse <= 1'b0;
retired_pc <= 32'd0;
retired_instr <= 32'd0;
retired_arg2 <= 32'd0;
retired_arg3 <= 32'd0;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b0;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= 1'b0;
retired_flag_except <= 1'b0;
retired_flag_rfe <= 1'b0;
retired_flag_trap <= 1'b0;
trap_o <= 1'b0;
trap_pc_o <= 32'd0;
trap_instr_o <= 32'd0;
for (int i = 0; i < 32; i++) regfile[i] <= 32'd0;
end else begin
// retire_pulse is a one-cycle strobe: cleared by default, set only in
// the cycle an instruction retires.
retire_pulse <= 1'b0;
case (state)
// Parked until go_i is seen.
S_IDLE: begin
if (go_i) state <= S_IFETCH_REQ;
end
// The fetch request itself is driven by the map-port block while in
// this state; here we only step to the wait state.
S_IFETCH_REQ: state <= S_IFETCH_WAIT;
S_IFETCH_WAIT: begin
if (map_rd_valid) begin
instr <= map_rd_data;
// A pending branch at fetch time means this instruction is the
// branch's delay slot.
instr_in_delay_slot <= branch_pending;
state <= S_EXECUTE;
end
end
S_EXECUTE: begin
// Defaults for retire bookkeeping
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= 32'd0;
retired_arg3 <= 32'd0;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b0;
retired_flag_branch <= is_taken_branch_or_jump;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_except <= 1'b0;
retired_flag_rfe <= 1'b0;
retired_flag_trap <= 1'b0;
if (is_syscall) begin
// SYSCALL halts the core unconditionally; no
// exception vectoring in this scope.
retired_flag_halt <= 1'b1;
retire_pulse <= 1'b1;
state <= S_HALT;
end else if (strict_trap) begin
// Unsupported instruction under strict mode.
// Halt and latch the offending fetch; no pc
// advance, no regfile write, no COP0 side
// effect. Trap output stays asserted for the
// TB to inspect after halt_o rises.
retired_flag_trap <= 1'b1;
retire_pulse <= 1'b1;
trap_o <= 1'b1;
trap_pc_o <= pc;
trap_instr_o <= instr;
state <= S_HALT;
end else if (is_lw) begin
// Loads retire later, in S_MEM_WAIT, once the data returns.
state <= S_MEM_REQ;
end else if (is_sw) begin
// Stores retire in S_MEM_WRITE, the cycle the write is driven.
state <= S_MEM_WRITE;
end else begin
// ALU / branch / COP0 / NOP: retire in this
// cycle. Handle per-op writebacks and COP0
// side effects, then advance pc.
// Writes to register 0 are suppressed.
if ((is_lui || is_ori || is_addiu) && (rt_idx != 5'd0))
regfile[rt_idx] <= alu_wb;
if (is_mfc0 && (rt_idx != 5'd0))
regfile[rt_idx] <= cop0_read_val;
if (is_mtc0) begin
unique case (rd_idx)
COP0_REG_STATUS: begin
// Status layout used here: bits [5:0] are the KU/IE
// stack (c, p, o pairs) and bits [15:8] the interrupt mask.
status_iec <= rt_val[0];
status_kuc <= rt_val[1];
status_iep <= rt_val[2];
status_kup <= rt_val[3];
status_ieo <= rt_val[4];
status_kuo <= rt_val[5];
status_im <= rt_val[15:8];
end
COP0_REG_CAUSE: begin
// Only the software IP[1:0] bits
// are writable; ExcCode is normally
// written by the core on exception
// entry, but allow SW override too
// since the minimal scope doesn't
// dispatch on ExcCode.
cause_exc_code <= rt_val[6:2];
cause_ip_sw[1:0] <= rt_val[9:8];
end
COP0_REG_EPC: epc <= rt_val;
default: ;
endcase
end
if (is_rfe) begin
// Pop the KU/IE stack one level (previous -> current,
// old -> previous); the "old" pair is left unchanged.
status_iec <= status_iep;
status_iep <= status_ieo;
status_kuc <= status_kup;
status_kup <= status_kuo;
retired_flag_rfe <= 1'b1;
end
// Trace payload for ALU / branch / COP0 / NOP
if (is_mfc0) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= cop0_read_val;
end else if (is_mtc0) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= rt_val;
end else if (is_taken_branch_or_jump) begin
retired_arg2 <= taken_target;
retired_arg3 <= 32'd0;
end else if (is_lui || is_ori || is_addiu) begin
retired_arg3 <= alu_wb;
end
retire_pulse <= 1'b1;
// NOTE: retire_advance() is called after the COP0 writes above, so
// if it enters the exception vector its non-blocking assignments to
// the status stack / epc / cause_exc_code are the ones that take
// effect for this clock edge.
retire_advance();
state <= S_IFETCH_REQ;
end
end
// LW: the read request is driven by the map-port block in this state.
S_MEM_REQ: state <= S_MEM_WAIT;
S_MEM_WAIT: begin
if (map_rd_valid) begin
// Loaded word goes straight to rt (register 0 stays zero).
if (rt_idx != 5'd0) regfile[rt_idx] <= map_rd_data;
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= ea;
retired_arg3 <= map_rd_data;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b1;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_rfe <= 1'b0;
retire_pulse <= 1'b1;
retire_advance();
state <= S_IFETCH_REQ;
end
end
// SW: the write is driven by the map-port block during this single
// cycle; the store retires immediately without waiting for a response.
S_MEM_WRITE: begin
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= ea;
retired_arg3 <= rt_val;
retired_flag_write <= 1'b1;
retired_flag_read <= 1'b0;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_rfe <= 1'b0;
retire_pulse <= 1'b1;
retire_advance();
state <= S_IFETCH_REQ;
end
// Terminal state: only reset leaves it.
S_HALT: state <= S_HALT;
default: state <= S_IDLE;
endcase
end
end
// Status outputs: halted indication and the live program counter.
assign halt_o = (state == S_HALT);
assign pc_o = pc;
// ------------------------------------------------------------------
// Trace emission — exactly one EV_IFETCH event per retire_pulse.
// The payload is the bookkeeping captured by the main FSM in the retire
// cycle; ev_valid is low in every cycle without a retire.
// ------------------------------------------------------------------
always_ff @(posedge clk) begin
    if (!rst_n) begin
        ev_valid  <= 1'b0;
        ev_subsys <= SUBSYS_IOP;
        ev_event  <= EV_IFETCH;
        ev_arg0   <= 64'd0;
        ev_arg1   <= 64'd0;
        ev_arg2   <= 64'd0;
        ev_arg3   <= 64'd0;
        ev_flags  <= 32'd0;
    end else begin
        // No event unless something retired last cycle.
        ev_valid <= 1'b0;
        if (retire_pulse) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_IFETCH;
            // 32-bit payloads are zero-extended into the 64-bit event args.
            ev_arg0   <= {32'd0, retired_pc};
            ev_arg1   <= {32'd0, retired_instr};
            ev_arg2   <= {32'd0, retired_arg2};
            ev_arg3   <= {32'd0, retired_arg3};
            // Flag byte, MSB first: trap, rfe, except, in_delay,
            // halt, branch, read, write.
            ev_flags  <= {24'd0,
                          retired_flag_trap,
                          retired_flag_rfe,
                          retired_flag_except,
                          retired_flag_in_delay,
                          retired_flag_halt,
                          retired_flag_branch,
                          retired_flag_read,
                          retired_flag_write};
        end
    end
end
endmodule : iop_core_stub
+319
View File
@@ -0,0 +1,319 @@
// retroDE_ps2 — iop_dmac_reg_stub
//
// IOP DMAC channel 9 (SIF0 IOP→EE) with a real, bounded data path.
// Upgraded from the earlier register+lifecycle shell: MADR is a real
// source pointer into IOP RAM, BCR is a real word count, and the
// state machine pulls 32-bit beats out of IOP RAM through the IOP map
// and emits them on a word-granularity endpoint with ready/valid/last
// handshake. Mirrors the EE DMAC shape (dmac_reg_stub) at 32-bit width.
//
// Contract refs:
// docs/contracts/iop.md (IOP DMAC ownership)
//
// Register surface (per-channel, low-byte offset):
// 0x00 MADR — real source address in IOP physical space
// 0x04 BCR — transfer length in 32-bit beats
// 0x08 CHCR — channel control; bit[0] is the start bit
// 0x0C DONE_COUNT — monotonic completion counter (read-only; writes
// are accepted but ignored). Software reads this
// to distinguish "nth completion" without needing
// to count interrupts externally.
// Other offsets: writes accepted but ignored; reads return 0.
//
// Memory master interface (to iop_memory_map_stub's dma_rd_* port):
// mem_rd_en / mem_rd_addr issue the request (one cycle)
// mem_rd_valid / mem_rd_data return the word one cycle later
// mem_master_id drives the map trace attribution (convention: 4)
//
// Endpoint (to sif_dma_ee_ram_bridge_stub or similar 32-bit sink):
// ep_valid / ep_data[31:0] / ep_last
// ep_ready is the backpressure signal — when low, the state machine
// holds in ACTIVE_SEND with the current beat. No false completion.
//
// State machine:
// IDLE → FETCH_WAIT on CHCR start
// FETCH_WAIT → ACTIVE_SEND on mem_rd_valid (word latched)
// ACTIVE_SEND → FETCH_WAIT on endpoint accept with more beats left
// → DONE on endpoint accept for the final beat
// DONE → IDLE next cycle (clears CHCR.start)
//
// Source stepping: src_addr = madr_latched + (beat_index * 4).
//
// Trace payload schema (SUBSYS_DMAC):
// DMA_CFG arg0=channel arg1=chcr arg2=madr arg3=bcr flags=reg_offset
// DMA_START arg0=channel arg1=bcr arg2=madr arg3=path_id
// DMA_BEAT arg0=channel arg1=beat_index arg2=src_addr arg3=remaining
// DMA_DONE arg0=channel arg1=beats arg2=completion_code arg3=path_id
// completion_code 0 = OK.
`timescale 1ns/1ps
// IOP DMAC single-channel stub: register file (MADR / BCR / CHCR /
// DONE_COUNT), a fetch-then-send transfer FSM that reads 32-bit beats
// through the memory master port and streams them to the endpoint, a
// one-cycle completion pulse, and one trace event per cycle.
//
// Zero-length transfers: starting the channel with BCR == 0 completes
// immediately (IDLE -> DONE) without touching memory or the endpoint.
// The completion pulse, CHCR.start clear, DONE_COUNT increment and the
// DMA_DONE trace event (beats = 0) still happen; no DMA_START event is
// emitted for such a transfer.
module iop_dmac_reg_stub
    import trace_pkg::*;
#(
    parameter logic [3:0] CHANNEL   = 4'd9,  // SIF0 (IOP → EE)
    parameter logic [3:0] PATH_ID   = 4'd9,
    parameter logic [7:0] MASTER_ID = 8'd4   // for dma_rd trace attribution
) (
    input  logic        clk,
    input  logic        rst_n,
    // IOP-side register access (from the memory map's iop_dmac_* port)
    input  logic        reg_wr_en,
    input  logic        reg_rd_en,
    input  logic [3:0]  reg_offset,
    input  logic [31:0] reg_wr_data,
    output logic [31:0] reg_rd_data,
    output logic        reg_rd_valid,
    // Memory read master (to iop_memory_map_stub dma_rd_* port)
    output logic        mem_rd_en,
    output logic [31:0] mem_rd_addr,
    output logic [7:0]  mem_master_id,
    input  logic [31:0] mem_rd_data,
    input  logic        mem_rd_valid,
    // Endpoint (word-granularity stream to SIF egress bridge)
    output logic        ep_valid,
    output logic [31:0] ep_data,
    output logic        ep_last,
    input  logic        ep_ready,
    // Completion pulse — one cycle high when the channel reaches S_DONE.
    // Intended as an IOP INTC source; latching is the interrupt
    // controller's responsibility.
    output logic        irq_completion_o,
    // Status
    output logic        busy_o,
    output logic [31:0] done_count_o,
    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);
    localparam logic [3:0] MADR_OFFSET       = 4'h0;
    localparam logic [3:0] BCR_OFFSET        = 4'h4;
    localparam logic [3:0] CHCR_OFFSET       = 4'h8;
    localparam logic [3:0] DONE_COUNT_OFFSET = 4'hC;

    typedef enum logic [1:0] {
        S_IDLE        = 2'd0,
        S_FETCH_WAIT  = 2'd1,
        S_ACTIVE_SEND = 2'd2,
        S_DONE        = 2'd3
    } state_e;

    // Software-visible registers.
    logic [31:0] madr;
    logic [31:0] bcr;
    logic [31:0] chcr;

    // Transfer context, latched at start so later register writes cannot
    // disturb a transfer in flight.
    state_e      state;
    logic [31:0] madr_latched;
    logic [31:0] bcr_latched;
    logic [31:0] beat_index;
    logic [31:0] beat_payload;

    // Start is the 0 -> 1 edge of CHCR bit 0 as written by software.
    logic start_pulse;
    assign start_pulse = reg_wr_en && (reg_offset == CHCR_OFFSET)
                         && reg_wr_data[0] && !chcr[0];

    // ------------------------------------------------------------------
    // Register file
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            madr <= 32'd0;
            bcr  <= 32'd0;
            chcr <= 32'd0;
        end else begin
            if (reg_wr_en) begin
                case (reg_offset)
                    MADR_OFFSET: madr <= reg_wr_data;
                    BCR_OFFSET:  bcr  <= reg_wr_data;
                    CHCR_OFFSET: chcr <= reg_wr_data;
                    default: ;  // DONE_COUNT and other offsets: write ignored
                endcase
            end
            // Hardware clear of the start bit on completion. Placed after
            // the write so it wins over a simultaneous CHCR write.
            if (state == S_DONE) chcr[0] <= 1'b0;
        end
    end

    // ------------------------------------------------------------------
    // Register read (1-cycle latency, matches rest of stub ecosystem)
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reg_rd_data  <= 32'd0;
            reg_rd_valid <= 1'b0;
        end else begin
            reg_rd_valid <= reg_rd_en;
            if (reg_rd_en) begin
                case (reg_offset)
                    MADR_OFFSET:       reg_rd_data <= madr;
                    BCR_OFFSET:        reg_rd_data <= bcr;
                    CHCR_OFFSET:       reg_rd_data <= chcr;
                    DONE_COUNT_OFFSET: reg_rd_data <= done_count_o;
                    default:           reg_rd_data <= 32'd0;
                endcase
            end
        end
    end

    // ------------------------------------------------------------------
    // Transfer state machine
    // ------------------------------------------------------------------
    logic [31:0] src_addr;
    assign src_addr = madr_latched + (beat_index << 2); // 4 bytes/beat

    logic beat_accepted;
    assign beat_accepted = ep_valid && ep_ready;

    // Pulse mem_rd_en for one cycle whenever we first enter FETCH_WAIT.
    logic prev_state_fw;
    always_ff @(posedge clk) begin
        if (!rst_n) prev_state_fw <= 1'b0;
        else        prev_state_fw <= (state == S_FETCH_WAIT);
    end
    logic entering_fw;
    assign entering_fw   = (state == S_FETCH_WAIT) && !prev_state_fw;
    assign mem_rd_en     = entering_fw;
    assign mem_rd_addr   = src_addr;
    assign mem_master_id = MASTER_ID;

    // Drive endpoint only in ACTIVE_SEND with the latched payload.
    assign ep_valid = (state == S_ACTIVE_SEND);
    assign ep_data  = beat_payload;
    assign ep_last  = (state == S_ACTIVE_SEND) &&
                      (beat_index + 32'd1 == bcr_latched);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state        <= S_IDLE;
            madr_latched <= 32'd0;
            bcr_latched  <= 32'd0;
            beat_index   <= 32'd0;
            beat_payload <= 32'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    if (start_pulse) begin
                        madr_latched <= madr;
                        bcr_latched  <= bcr;
                        beat_index   <= 32'd0;
                        // A zero-beat transfer has nothing to fetch. Without
                        // this guard the "last beat" comparison below could
                        // not match until beat_index wrapped, streaming
                        // ~2^32 beats.
                        if (bcr == 32'd0) state <= S_DONE;
                        else              state <= S_FETCH_WAIT;
                    end
                end
                S_FETCH_WAIT: begin
                    if (mem_rd_valid) begin
                        beat_payload <= mem_rd_data;
                        state        <= S_ACTIVE_SEND;
                    end
                end
                S_ACTIVE_SEND: begin
                    // Hold the current beat until the endpoint accepts it.
                    if (beat_accepted) begin
                        if (beat_index + 32'd1 == bcr_latched) begin
                            state <= S_DONE;
                        end else begin
                            beat_index <= beat_index + 32'd1;
                            state      <= S_FETCH_WAIT;
                        end
                    end
                end
                S_DONE: begin
                    state <= S_IDLE;
                end
                default: state <= S_IDLE;
            endcase
        end
    end

    assign busy_o           = (state != S_IDLE);
    assign irq_completion_o = (state == S_DONE);

    // ------------------------------------------------------------------
    // Trace emission — one event per cycle. Priority:
    //   DONE > BEAT > START > CFG (register write)
    // ------------------------------------------------------------------
    logic prev_in_transfer;
    always_ff @(posedge clk) begin
        if (!rst_n) prev_in_transfer <= 1'b0;
        else        prev_in_transfer <= (state != S_IDLE);
    end
    logic enter_start;
    assign enter_start = (state == S_FETCH_WAIT) && !prev_in_transfer;
    logic enter_done;
    assign enter_done = (state == S_DONE);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid     <= 1'b0;
            ev_subsys    <= SUBSYS_DMAC;
            ev_event     <= EV_DMA_CFG;
            ev_arg0      <= 64'd0;
            ev_arg1      <= 64'd0;
            ev_arg2      <= 64'd0;
            ev_arg3      <= 64'd0;
            ev_flags     <= 32'd0;
            done_count_o <= 32'd0;
        end else if (enter_done) begin
            ev_valid     <= 1'b1;
            ev_subsys    <= SUBSYS_DMAC;
            ev_event     <= EV_DMA_DONE;
            ev_arg0      <= {60'd0, CHANNEL};
            // Beats completed. S_DONE is only reached with
            // beat_index + 1 == bcr_latched (or with bcr_latched == 0 for a
            // zero-length transfer), so the latched count is the exact total.
            ev_arg1      <= {32'd0, bcr_latched};
            ev_arg2      <= 64'd0;               // completion OK
            ev_arg3      <= {60'd0, PATH_ID};
            ev_flags     <= 32'd0;
            done_count_o <= done_count_o + 32'd1;
        end else if (beat_accepted) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_BEAT;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, beat_index};
            ev_arg2   <= {32'd0, src_addr};
            ev_arg3   <= {32'd0, bcr_latched - beat_index - 32'd1};
            ev_flags  <= 32'd0;
        end else if (enter_start) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_START;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, bcr_latched};
            ev_arg2   <= {32'd0, madr_latched};
            ev_arg3   <= {60'd0, PATH_ID};
            ev_flags  <= 32'd0;
        end else if (reg_wr_en) begin
            // CFG reports the post-write view: the register being written
            // shows the incoming data, the others their current contents.
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_DMAC;
            ev_event  <= EV_DMA_CFG;
            ev_arg0   <= {60'd0, CHANNEL};
            ev_arg1   <= {32'd0, (reg_offset == CHCR_OFFSET) ? reg_wr_data : chcr};
            ev_arg2   <= {32'd0, (reg_offset == MADR_OFFSET) ? reg_wr_data : madr};
            ev_arg3   <= {32'd0, (reg_offset == BCR_OFFSET)  ? reg_wr_data : bcr};
            ev_flags  <= {28'd0, reg_offset};
        end else begin
            ev_valid <= 1'b0;
        end
    end
endmodule : iop_dmac_reg_stub
+320
View File
@@ -0,0 +1,320 @@
// retroDE_ps2 — iop_exec_stub
//
// First RAM-backed IOP execution primitive. Micro-op fetch now comes
// from IOP RAM through the real `iop_memory_map_stub` CPU-side port —
// the same way a future MIPS-class CPU would fetch instructions. The
// control program is no longer RTL-resident; it lives as data in RAM
// that someone (a TB, eventually a BIOS / loader path) preloads before
// pulsing `go_i`.
//
// NOT a MIPS core, NOT an ISA decoder. A tiny FSM sequencer over a
// five-opcode micro-op ISA, designed as the bridge between "testbench
// choreographs everything" and a real instruction-fetching CPU. When
// the real CPU arrives, it replaces this module but keeps the same
// map / DMA / INTC hookup verbatim.
//
// Contract refs:
// docs/contracts/iop.md (IOP-local programming model)
//
// Opcodes (encoded in word 0 low nibble):
// OP_HALT 0x0 — terminal; halt_o rises, no further accesses.
// OP_WRITE 0x1 — pulse map CPU write with (addr, data). pc++
// OP_READ 0x2 — pulse map CPU read; latch into last_read_data.
// pc++
// OP_WAIT_IRQ 0x3 — block until cpu_irq==1. pc++
// OP_BNE 0x4 — if last_read_data != expected, pc <= target;
// else pc++.
// target is in word1[7:0]; expected is in word2.
//
// Micro-op layout in RAM (16 bytes per op, little-endian word order):
// +0 word 0: {28'd0, opcode[3:0]}
// +4 word 1: addr (for WRITE/READ) or target_pc in low 8 bits (for BNE)
// +8 word 2: data (for WRITE) or expected value (for BNE); unused for
// READ/WAIT_IRQ/HALT
// +12 word 3: reserved for future opcodes
//
// Fetch sequence: three map reads per op (words 0/1/2). Word 3 is
// skipped to save a cycle. Each read has one-cycle latency via the
// map — so a full fetch is ~6 cycles, after which dispatch takes one
// more cycle. Negligible in the current scope; swap the engine for a
// real CPU later and the instruction width stops mattering.
//
// Trace payload (SUBSYS_IOP, EV_IFETCH, emitted on each op completion):
// arg0 = pc value of the op that just completed
// arg1 = opcode
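//   arg2 = word 1 of the op as fetched: addr for WRITE/READ, target for BNE,
//          whatever the script stored for WAIT_IRQ; 0 on the HALT event
//   arg3 = data written (WRITE), data read back (READ), otherwise word 2 of
//          the op as fetched (expected value for BNE); 0 on the HALT event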
// flags bit 0 = 1 → write-flavour op
// flags bit 1 = 1 → WAIT_IRQ just exited (IRQ observed)
// flags bit 2 = 1 → HALT entered
// flags bit 3 = 1 → BNE branch taken (pc changed to target, not +1)
`timescale 1ns/1ps
// Micro-op sequencer: fetches three words per op from RAM through the map
// port, dispatches on the low nibble of word 0, and emits one trace event
// per completed op plus one on HALT entry.
//
// Trace notes:
//   - arg0 is the pc of the op the event describes. It is captured in
//     `done_pc` in the completion cycle, because `pc` itself has already
//     advanced (or been redirected by BNE) by the time the event is
//     registered one cycle later.
//   - READ events report `last_read_data`, the value latched with the
//     valid response, rather than re-sampling the map's data bus a cycle
//     after the response.
//   - An unknown opcode halts the sequencer and emits the HALT event
//     (flags bit 2) with arg1 carrying the offending opcode.
module iop_exec_stub
    import trace_pkg::*;
#(
    parameter logic [31:0] SCRIPT_BASE = 32'h0000_0400
) (
    input  logic        clk,
    input  logic        rst_n,
    input  logic        go_i,
    // Drive the IOP memory map's CPU-side port. Both ifetch reads and
    // the script's own WRITE/READ ops flow through here.
    output logic        map_rd_en,
    output logic [31:0] map_rd_addr,
    input  logic [31:0] map_rd_data,
    input  logic        map_rd_valid,
    output logic        map_wr_en,
    output logic [31:0] map_wr_addr,
    output logic [31:0] map_wr_data,
    output logic [3:0]  map_wr_be,
    input  logic        cpu_irq,
    output logic        halt_o,
    output logic [7:0]  pc_o,
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);
    localparam logic [3:0] OP_HALT     = 4'h0;
    localparam logic [3:0] OP_WRITE    = 4'h1;
    localparam logic [3:0] OP_READ     = 4'h2;
    localparam logic [3:0] OP_WAIT_IRQ = 4'h3;
    localparam logic [3:0] OP_BNE      = 4'h4;

    typedef enum logic [3:0] {
        S_IDLE      = 4'd0,
        S_IF0_REQ   = 4'd1,
        S_IF0_WAIT  = 4'd2,
        S_IF1_REQ   = 4'd3,
        S_IF1_WAIT  = 4'd4,
        S_IF2_REQ   = 4'd5,
        S_IF2_WAIT  = 4'd6,
        S_DECODE    = 4'd7,
        S_WRITE     = 4'd8,
        S_READ_REQ  = 4'd9,
        S_READ_WAIT = 4'd10,
        S_WAIT_IRQ  = 4'd11,
        S_BNE       = 4'd12,
        S_HALT      = 4'd13
    } state_e;

    state_e      state;
    logic [7:0]  pc;
    logic [3:0]  cur_opcode;
    logic [31:0] cur_addr;        // word 1: addr, or BNE target in [7:0]
    logic [31:0] cur_data;        // word 2: write data, or BNE expected value
    logic [31:0] last_read_data;  // result of the most recent OP_READ
    logic [7:0]  done_pc;         // pc of the op being reported to the trace

    // Op-completion event triggers (one-cycle pulses)
    logic ev_op_done;
    logic ev_wait_irq_exit;
    logic ev_enter_halt;
    logic ev_bne_taken;

    // Address for the next ifetch word: SCRIPT_BASE + pc*16 + word_offset
    logic [31:0] ifetch_base;
    assign ifetch_base = SCRIPT_BASE + {20'd0, pc, 4'd0}; // pc << 4

    // ------------------------------------------------------------------
    // Map-port drive (combinational on state)
    // ------------------------------------------------------------------
    always_comb begin
        map_wr_en   = 1'b0;
        map_wr_addr = 32'd0;
        map_wr_data = 32'd0;
        map_wr_be   = 4'd0;
        map_rd_en   = 1'b0;
        map_rd_addr = 32'd0;
        case (state)
            S_IF0_REQ: begin
                map_rd_en   = 1'b1;
                map_rd_addr = ifetch_base + 32'd0;
            end
            S_IF1_REQ: begin
                map_rd_en   = 1'b1;
                map_rd_addr = ifetch_base + 32'd4;
            end
            S_IF2_REQ: begin
                map_rd_en   = 1'b1;
                map_rd_addr = ifetch_base + 32'd8;
            end
            S_WRITE: begin
                map_wr_en   = 1'b1;
                map_wr_addr = cur_addr;
                map_wr_data = cur_data;
                map_wr_be   = 4'b1111;
            end
            S_READ_REQ: begin
                map_rd_en   = 1'b1;
                map_rd_addr = cur_addr;
            end
            default: ;
        endcase
    end

    // ------------------------------------------------------------------
    // State machine
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state            <= S_IDLE;
            pc               <= 8'd0;
            cur_opcode       <= 4'd0;
            cur_addr         <= 32'd0;
            cur_data         <= 32'd0;
            last_read_data   <= 32'd0;
            done_pc          <= 8'd0;
            ev_op_done       <= 1'b0;
            ev_wait_irq_exit <= 1'b0;
            ev_enter_halt    <= 1'b0;
            ev_bne_taken     <= 1'b0;
        end else begin
            // Event triggers are one-cycle pulses.
            ev_op_done       <= 1'b0;
            ev_wait_irq_exit <= 1'b0;
            ev_enter_halt    <= 1'b0;
            ev_bne_taken     <= 1'b0;
            case (state)
                S_IDLE: begin
                    if (go_i) begin
                        pc    <= 8'd0;
                        state <= S_IF0_REQ;
                    end
                end
                // Three-word fetch: opcode, then word 1, then word 2.
                S_IF0_REQ:  state <= S_IF0_WAIT;
                S_IF0_WAIT: if (map_rd_valid) begin
                    cur_opcode <= map_rd_data[3:0];
                    state      <= S_IF1_REQ;
                end
                S_IF1_REQ:  state <= S_IF1_WAIT;
                S_IF1_WAIT: if (map_rd_valid) begin
                    cur_addr <= map_rd_data;
                    state    <= S_IF2_REQ;
                end
                S_IF2_REQ:  state <= S_IF2_WAIT;
                S_IF2_WAIT: if (map_rd_valid) begin
                    cur_data <= map_rd_data;
                    state    <= S_DECODE;
                end
                S_DECODE: begin
                    case (cur_opcode)
                        OP_HALT: begin
                            state         <= S_HALT;
                            ev_enter_halt <= 1'b1;
                            done_pc       <= pc;
                        end
                        OP_WRITE:    state <= S_WRITE;
                        OP_READ:     state <= S_READ_REQ;
                        OP_WAIT_IRQ: state <= S_WAIT_IRQ;
                        OP_BNE:      state <= S_BNE;
                        default: begin
                            // Unknown opcode → safe stop. Report it like a
                            // HALT so the stop is visible in the trace
                            // instead of the sequencer going silent.
                            state         <= S_HALT;
                            ev_enter_halt <= 1'b1;
                            done_pc       <= pc;
                        end
                    endcase
                end
                S_WRITE: begin
                    done_pc    <= pc;
                    pc         <= pc + 8'd1;
                    state      <= S_IF0_REQ;
                    ev_op_done <= 1'b1;
                end
                S_READ_REQ:  state <= S_READ_WAIT;
                S_READ_WAIT: if (map_rd_valid) begin
                    last_read_data <= map_rd_data;
                    done_pc        <= pc;
                    pc             <= pc + 8'd1;
                    state          <= S_IF0_REQ;
                    ev_op_done     <= 1'b1;
                end
                S_WAIT_IRQ: begin
                    // Blocks here until cpu_irq is observed high.
                    if (cpu_irq) begin
                        done_pc          <= pc;
                        pc               <= pc + 8'd1;
                        state            <= S_IF0_REQ;
                        ev_op_done       <= 1'b1;
                        ev_wait_irq_exit <= 1'b1;
                    end
                end
                S_BNE: begin
                    // target_pc = cur_addr[7:0], expected = cur_data
                    done_pc <= pc;
                    if (last_read_data != cur_data) begin
                        pc           <= cur_addr[7:0];
                        ev_bne_taken <= 1'b1;
                    end else begin
                        pc <= pc + 8'd1;
                    end
                    state      <= S_IF0_REQ;
                    ev_op_done <= 1'b1;
                end
                S_HALT:  state <= S_HALT;
                default: state <= S_IDLE;
            endcase
        end
    end

    assign halt_o = (state == S_HALT);
    assign pc_o   = pc;

    // ------------------------------------------------------------------
    // Trace emission. One event per op completion + one on HALT entry.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_IFETCH;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (ev_enter_halt) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_IFETCH;
            ev_arg0   <= {56'd0, done_pc};
            ev_arg1   <= {60'd0, cur_opcode};
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'h0000_0004; // halt marker
        end else if (ev_op_done) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_IFETCH;
            // pc of the op that completed, not the already-advanced pc.
            ev_arg0   <= {56'd0, done_pc};
            ev_arg1   <= {60'd0, cur_opcode};
            ev_arg2   <= {32'd0, cur_addr};
            // READ reports the value latched with the valid response.
            ev_arg3   <= (cur_opcode == OP_READ)
                         ? {32'd0, last_read_data}
                         : {32'd0, cur_data};
            ev_flags  <= {28'd0,
                          ev_bne_taken,
                          1'b0, // (was halt; halt has its own path above)
                          ev_wait_irq_exit,
                          (cur_opcode == OP_WRITE)};
        end else begin
            ev_valid <= 1'b0;
        end
    end
endmodule : iop_exec_stub
+128
View File
@@ -0,0 +1,128 @@
// retroDE_ps2 — iop_fetch_stub
//
// Minimal IOP-side sequential fetcher. Mirrors ee_fetch_stub in shape and
// discipline — just the smallest honest primitive that produces visible
// IOP-side execution-flow traffic. Not a CPU. Explicitly NOT a BIOS boot
// stub: the default reset vector lives in IOP RAM, not in BIOS space.
//
// Contract refs:
// docs/contracts/iop.md (IOP CPU execution, required debug
// visibility: PC stream)
//
// Behavior:
// - On reset, PC = RESET_VECTOR (default 0x00000000, the low end of
// IOP RAM).
// - Each cycle while `enable` is high: issue a 32-bit read at PC,
// advance PC += 4. No decode, no branches, no exceptions, no FPU.
// - Responses return 1 cycle later via rd_valid/rd_data from the
// map. The issued address is latched (pc_d1) so trace lines pair
// address with data.
//
// Non-goals:
// - full decode
// - branch / exception / interrupt handling
// - real IOP R3000 pipeline timing
// - BIOS fetch (use a BIOS-pointing RESET_VECTOR param override if
// needed, but that's intentionally not the default)
//
// Trace payload schema (matches ee_fetch_stub structure under SUBSYS_IOP):
// IOP RESET arg0=reset_vector
// IOP IFETCH arg0=pc arg1=data arg2=resp_kind arg3=-
// resp_kind: 0=OK (only path in this scope)
`timescale 1ns/1ps
// Sequential word fetcher: while `enable` is high it issues one 32-bit
// read per cycle at an incrementing address and traces each response
// paired with the address that produced it.
module iop_fetch_stub
    import trace_pkg::*;
#(
    parameter logic [31:0] RESET_VECTOR = 32'h0000_0000
) (
    input  logic        clk,
    input  logic        rst_n,
    input  logic        enable,
    // Map-facing fetch port
    output logic        rd_en,
    output logic [31:0] rd_addr,
    input  logic [31:0] rd_data,
    input  logic        rd_valid,
    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);
    // ------------------------------------------------------------------
    // Fetch address and its one-request-delayed shadow.
    //   pc     : address presented on rd_addr in the current cycle
    //   pc_d1  : address of the previously issued request, i.e. the one a
    //            response arriving now belongs to
    // Both registers move only while `enable` is high, so the shadow keeps
    // tracking the in-flight request across stalls.
    // ------------------------------------------------------------------
    logic [31:0] pc;
    logic [31:0] pc_d1;

    // Request is issued every enabled cycle at the current pc.
    assign rd_addr = pc;
    assign rd_en   = enable;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            pc_d1 <= RESET_VECTOR;
            pc    <= RESET_VECTOR;
        end else if (enable) begin
            pc    <= pc + 32'd4;
            pc_d1 <= pc;
        end
    end

    // ------------------------------------------------------------------
    // Trace
    //   - One EV_RESET event in the first cycle after reset releases.
    //   - One EV_IFETCH event registered for every rd_valid response.
    //   - EV_RESET has priority if both would occur in the same cycle.
    // ------------------------------------------------------------------
    logic reset_emit_pending;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            reset_emit_pending <= 1'b1;
            ev_valid           <= 1'b0;
            ev_subsys          <= SUBSYS_IOP;
            ev_event           <= EV_RESET;
            ev_arg0            <= 64'd0;
            ev_arg1            <= 64'd0;
            ev_arg2            <= 64'd0;
            ev_arg3            <= 64'd0;
            ev_flags           <= 32'd0;
        end else begin
            // Idle unless one of the two event sources fires below.
            ev_valid <= 1'b0;
            if (reset_emit_pending) begin
                reset_emit_pending <= 1'b0;
                ev_valid           <= 1'b1;
                ev_subsys          <= SUBSYS_IOP;
                ev_event           <= EV_RESET;
                ev_arg0            <= {32'd0, RESET_VECTOR};
                ev_arg1            <= 64'd0;
                ev_arg2            <= 64'd0;
                ev_arg3            <= 64'd0;
                ev_flags           <= 32'd0;
            end else if (rd_valid) begin
                ev_valid  <= 1'b1;
                ev_subsys <= SUBSYS_IOP;
                ev_event  <= EV_IFETCH;
                ev_arg0   <= {32'd0, pc_d1};   // address of this response
                ev_arg1   <= {32'd0, rd_data}; // fetched word
                ev_arg2   <= 64'd0;            // resp_kind: 0 = OK
                ev_arg3   <= 64'd0;
                ev_flags  <= 32'd0;
            end
        end
    end
endmodule : iop_fetch_stub
+652
View File
@@ -0,0 +1,652 @@
// retroDE_ps2 — iop_memory_map_stub
//
// IOP-side memory map. Gives IOP-visible addresses architectural meaning.
// Wave 3 first-pass scope is deliberately narrow: only the IOP-RAM window
// is routed; every other address decodes as UNMAPPED with deterministic
// fault data. SIF registers, IOP I/O, SPU2, CDVD, and IOP-side BIOS are
// all intentionally deferred — slots noted in comments so the map can
// grow without re-shaping its interface.
//
// Contract refs:
// docs/contracts/iop.md (IOP-local address decode)
// docs/contracts/memory.md (IOP RAM lives at phys 0x00000000-
// 0x001FFFFF, 2 MiB)
//
// Address semantics:
// IOP CPU side is MIPS R3000-class. kseg0/kseg1 aliases
// (0x80000000/0xA0000000 mirrors of 0x00000000) are modelled via
// `phys = iop_addr[28:0]`, consistent with how ee_memory_map_stub
// strips kseg for EE fetches. Physical window decode then works on
// the low 29 bits.
//
// Bridge-side write port (Wave 3 addition): the SIF-to-IOP-RAM bridge
// writes directly at physical offsets — no kseg strip. The map decodes
// its address bits directly against the same region rules.
//
// DMA read-master port (Wave 3 reverse-direction addition): the IOP
// DMAC (ch9 SIF0 egress, and any other future channel) fetches source
// bytes through this port. Physical addressing; RAM-only decode in
// current scope. Caller provides its own master_id (convention: 4 =
// IOP_DMAC).
//
// Arbitration (Wave 3 scope):
// Two potential write masters on the RAM path: the IOP CPU port and
// the bridge port. Two potential read masters on the RAM path: the
// IOP CPU port and the DMA read master. Collisions within the same
// cycle are not expected in the current TBs (CPU programming / readback
// phases are separate from DMA transfer phases). Policy if they ever
// collide: CPU wins. Documented here rather than hidden in priority
// ordering; RAM port is mux'd accordingly.
//
// Region decode (current):
// - IOP RAM window: phys[28:21] == 8'b00000000
// (0x0000_0000 - 0x001F_FFFF, 2 MiB)
// → route to iop_ram_stub (offset phys[20:0])
// - SIF registers (IOP side): phys[28:24] == 5'b11101
// (0x1D00_0000 block) → route to the SIF
// register shell with offset phys[7:0]. The
// mailbox stub's register surface covers
// offsets 0x00/0x10/0x20/0x30.
// - IOP DMAC channel 9: phys[28:4] == 25'h01F8_0152
// (0x1F80_1520 - 0x1F80_152F, 16 bytes)
// → route to iop_dmac_reg_stub with 4-bit
// offset phys[3:0]. Channel 9 is SIF0
// (IOP→EE) in the real PS2 DMAC map; other
// channels are intentionally not decoded.
// - IOP INTC: phys[28:4] == 25'h01F8_0107
// (0x1F80_1070 - 0x1F80_107F, 16 bytes)
// → route to intc_stub with 8-bit offset
// phys[7:0]. Matches the real PS2 IOP INTC
// placement (I_STAT / I_MASK).
// - Shared BIOS ROM: phys[28:22] == 7'b1111111
// (0x1FC0_0000 - 0x1FFF_FFFF, 4 MiB)
// → route to bios_rom_stub with 22-bit
// offset phys[21:0]. kseg1 aliasing maps
// 0xBFC0_0000 fetches to this window via
// the standard [28:0] strip. The IOP core
// reset vector normally points here.
// Writes to BIOS decode as UNMAPPED
// (read-only ROM).
// - everything else: UNMAPPED, reads return 32'hDEADBEEF
//
// Future regions (reserved in comments, not wired):
// - Other IOP DMAC channels: 0x1F80_1080-0x1F80_156F (partial block)
// - IOP timers / SIO: elsewhere in 0x1F80_0000 block
// - SPU2: 0x1F90_0000 block
//
// Trace semantics (matches ee_memory_map_stub's request-routing pattern):
// Map-layer events describe routing (what was asked for, where it was
// sent). Arg1 is 0 when the request is routed to a backing store that
// will emit its own delivery event; 0xDEADBEEF on unmapped reads; the
// actual write data on unmapped writes (so the TB can see what software
// tried to write).
//
// Latency assumption (mirrors ee_memory_map_stub note):
// Assumes fixed one-cycle backing-store latency. `ram_rd_valid` is not
// consulted — the map asserts its own `iop_rd_valid` one cycle after
// request unconditionally. All Wave 3 backing stubs honour that. If a
// later backing store introduces wait states, the map must grow proper
// response handshaking.
//
// Trace payload schema:
// IOP READ arg0=addr arg1=0 arg2=master_id arg3=region_id
// IOP WRITE arg0=addr arg1=wr_data arg2=master_id arg3=region_id
// IOP UNMAPPED arg0=addr arg1=0xDEADBEEF arg2=master_id arg3=0xFF
// region_id: 2 = IOP_RAM, 3 = SIF_REGS, 4 = IOP_DMAC, 5 = IOP_INTC,
// 6 = IOP_BIOS, 7 = PAD_IO (Ch234), 0xFF = UNMAPPED
// master_id: 2 = IOP_CPU, 3 = SIF bridge (writes), 4 = IOP_DMAC (reads)
// flags bit 0: 1 = write, 0 = read
`timescale 1ns/1ps
module iop_memory_map_stub
    import trace_pkg::*;
(
    input  logic        clk,
    input  logic        rst_n,

    // ------------------------------------------------------------------
    // IOP CPU-side request interface (32-bit data, virtual address)
    // ------------------------------------------------------------------
    input  logic        iop_rd_en,
    input  logic [31:0] iop_rd_addr,
    output logic [31:0] iop_rd_data,
    output logic        iop_rd_valid,
    input  logic        iop_wr_en,
    input  logic [31:0] iop_wr_addr,
    input  logic [31:0] iop_wr_data,
    // Byte enables are forwarded to the RAM path only; the register
    // shells (SIF / DMAC / INTC / pad) receive full-word write data.
    input  logic [3:0]  iop_wr_be,
    // Caller-provided master id for trace attribution. Conventional:
    // 0 = TB direct, 2 = IOP CPU (once a fetch stub exists).
    input  logic [7:0]  master_id,

    // ------------------------------------------------------------------
    // Bridge-side write port (Wave 3). Physical addresses; no kseg strip.
    // Used by sif_dma_iop_ram_bridge_stub and similar DMA-side masters.
    // Caller provides its own master_id (convention: 3 = SIF bridge).
    // ------------------------------------------------------------------
    input  logic        bridge_wr_en,
    input  logic [31:0] bridge_wr_addr,
    input  logic [31:0] bridge_wr_data,
    input  logic [3:0]  bridge_wr_be,
    input  logic [7:0]  bridge_master_id,

    // ------------------------------------------------------------------
    // DMA read-master port (Wave 3). Physical addressing; intended for
    // IOP DMAC ch9 reads out of IOP RAM. One-cycle read latency, same
    // pipeline shape as the CPU read. Caller provides its own master_id
    // (convention: 4 = IOP_DMAC).
    // ------------------------------------------------------------------
    input  logic        dma_rd_en,
    input  logic [31:0] dma_rd_addr,
    input  logic [7:0]  dma_master_id,
    output logic [31:0] dma_rd_data,
    output logic        dma_rd_valid,

    // ------------------------------------------------------------------
    // Downstream to iop_ram_stub.
    // Address presented as a 21-bit offset within the 2 MiB IOP RAM
    // window; consumers may truncate to match their backing-store width.
    // `ram_master_id` identifies the master that owns the RAM access
    // the backing store will trace this cycle (read master when a read
    // is issued, otherwise the write master).
    // ------------------------------------------------------------------
    output logic        ram_rd_en,
    output logic [20:0] ram_rd_addr,
    input  logic [31:0] ram_rd_data,
    input  logic        ram_rd_valid,   // not consulted (fixed 1-cycle latency assumed)
    output logic        ram_wr_en,
    output logic [20:0] ram_wr_addr,
    output logic [31:0] ram_wr_data,
    output logic [3:0]  ram_wr_be,
    output logic [7:0]  ram_master_id,

    // ------------------------------------------------------------------
    // Downstream to the SIF register shell (sif_mailbox_stub IOP-side
    // port). Low byte of the physical address is presented; writes go
    // out with the CPU's data; reads come back with 1-cycle latency
    // consistent with the rest of the stub ecosystem.
    // ------------------------------------------------------------------
    output logic        sif_rd_en,
    output logic [7:0]  sif_rd_addr,
    input  logic [31:0] sif_rd_data,
    input  logic        sif_rd_valid,   // not consulted
    output logic        sif_wr_en,
    output logic [7:0]  sif_wr_addr,
    output logic [31:0] sif_wr_data,

    // ------------------------------------------------------------------
    // Downstream to the IOP DMAC register shell (channel 9). 4-bit
    // offset; data path uses the CPU write data. Read returns with
    // one-cycle latency like the rest of the stub ecosystem.
    // ------------------------------------------------------------------
    output logic        iop_dmac_rd_en,
    output logic [3:0]  iop_dmac_rd_addr,
    input  logic [31:0] iop_dmac_rd_data,
    input  logic        iop_dmac_rd_valid,  // not consulted
    output logic        iop_dmac_wr_en,
    output logic [3:0]  iop_dmac_wr_addr,
    output logic [31:0] iop_dmac_wr_data,

    // ------------------------------------------------------------------
    // Downstream to the IOP INTC register shell (intc_stub reused).
    // 8-bit offset passed downstream; read returns with one-cycle
    // latency consistent with the rest of the stub ecosystem.
    // ------------------------------------------------------------------
    output logic        iop_intc_rd_en,
    output logic [7:0]  iop_intc_rd_addr,
    input  logic [31:0] iop_intc_rd_data,
    input  logic        iop_intc_rd_valid,  // not consulted
    output logic        iop_intc_wr_en,
    output logic [7:0]  iop_intc_wr_addr,
    output logic [31:0] iop_intc_wr_data,

    // ------------------------------------------------------------------
    // Downstream to bios_rom_stub (shared BIOS window).
    // 22-bit byte offset within the 4 MiB window. Writes are never
    // forwarded (BIOS is ROM); the map routes any bios-window write
    // attempt to the UNMAPPED trace event instead.
    // ------------------------------------------------------------------
    output logic        bios_rd_en,
    output logic [21:0] bios_rd_addr,
    input  logic [31:0] bios_rd_data,
    input  logic        bios_rd_valid,      // not consulted

    // ------------------------------------------------------------------
    // Ch234 — bridge-clock-domain pad bitmaps from ps2_hps_bridge
    // (INPUT_P1/P2 latches @ 0x040/0x044). Sync'd into the IOP clock
    // by the internal `sio2_input_stub` instance below. TBs that
    // don't exercise the pad path can tie both ports to `32'd0`.
    // ------------------------------------------------------------------
    input  logic [31:0] input_p1,
    input  logic [31:0] input_p2,

    // ------------------------------------------------------------------
    // Trace
    // ------------------------------------------------------------------
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    // Region ids reported in ev_arg3.
    localparam logic [63:0] REGION_IOP_RAM  = 64'd2;
    localparam logic [63:0] REGION_SIF_REGS = 64'd3;
    localparam logic [63:0] REGION_IOP_DMAC = 64'd4;
    localparam logic [63:0] REGION_IOP_INTC = 64'd5;
    localparam logic [63:0] REGION_IOP_BIOS = 64'd6;
    localparam logic [63:0] REGION_PAD_IO   = 64'd7;   // Ch234
    localparam logic [63:0] REGION_UNMAPPED = 64'hFF;

    localparam logic [28:0] DMAC_CH9_BASE = 29'h1F80_1520;
    localparam logic [28:0] IOP_INTC_BASE = 29'h1F80_1070;
    // Ch234 — retroDE-local pad I/O window (256 bytes), deliberately
    // OUTSIDE the real SIO2 range (0x1F80_8200..0x1F80_82FF) so a
    // faithful SIO2 emulation chapter can land later without collision.
    localparam logic [28:0] PAD_IO_BASE   = 29'h1F80_8500;

    // ------------------------------------------------------------------
    // Region decode (combinational, shared for read + write)
    // ------------------------------------------------------------------
    // CPU-side decode (virtual → physical via kseg strip: keep [28:0]).
    logic [28:0] rd_phys_addr;
    logic [28:0] wr_phys_addr;
    logic        rd_is_ram;
    logic        rd_is_sif;
    logic        rd_is_dmac;
    logic        rd_is_intc;
    logic        rd_is_bios;
    logic        rd_is_pad;        // Ch234
    logic        cpu_wr_is_ram;
    logic        cpu_wr_is_sif;
    logic        cpu_wr_is_dmac;
    logic        cpu_wr_is_intc;
    logic        cpu_wr_is_bios;   // decoded for symmetry; BIOS writes trace as UNMAPPED
    logic        cpu_wr_is_pad;    // Ch234
    logic [20:0] rd_ram_offset;
    logic [20:0] cpu_wr_ram_offset;

    assign rd_phys_addr = iop_rd_addr[28:0];
    assign wr_phys_addr = iop_wr_addr[28:0];

    assign rd_is_ram  = (rd_phys_addr[28:21] == 8'd0);
    assign rd_is_sif  = (rd_phys_addr[28:24] == 5'b11101);
    assign rd_is_dmac = (rd_phys_addr[28:4]  == DMAC_CH9_BASE[28:4]);
    assign rd_is_intc = (rd_phys_addr[28:4]  == IOP_INTC_BASE[28:4]);
    assign rd_is_bios = (rd_phys_addr[28:22] == 7'b1111111);
    // Ch234 — pad I/O region is 256 bytes at PAD_IO_BASE, so the
    // decode is bits [28:8] (= 21 high bits of the 29-bit phys addr).
    assign rd_is_pad  = (rd_phys_addr[28:8]  == PAD_IO_BASE[28:8]);

    assign cpu_wr_is_ram  = (wr_phys_addr[28:21] == 8'd0);
    assign cpu_wr_is_sif  = (wr_phys_addr[28:24] == 5'b11101);
    assign cpu_wr_is_dmac = (wr_phys_addr[28:4]  == DMAC_CH9_BASE[28:4]);
    assign cpu_wr_is_intc = (wr_phys_addr[28:4]  == IOP_INTC_BASE[28:4]);
    assign cpu_wr_is_bios = (wr_phys_addr[28:22] == 7'b1111111);
    assign cpu_wr_is_pad  = (wr_phys_addr[28:8]  == PAD_IO_BASE[28:8]);

    assign rd_ram_offset     = rd_phys_addr[20:0];
    assign cpu_wr_ram_offset = wr_phys_addr[20:0];

    // Bridge-side decode (physical, no strip). Bridge writes are routed
    // to IOP RAM only — no SIF destination from the bridge side yet.
    logic        bridge_wr_is_ram;
    logic [20:0] bridge_wr_ram_offset;
    assign bridge_wr_is_ram     = (bridge_wr_addr[28:21] == 8'd0);
    assign bridge_wr_ram_offset = bridge_wr_addr[20:0];

    // DMA-side read decode (physical, no strip). Scope covers RAM only.
    logic        dma_rd_is_ram;
    logic [20:0] dma_rd_ram_offset;
    assign dma_rd_is_ram     = (dma_rd_addr[28:21] == 8'd0);
    assign dma_rd_ram_offset = dma_rd_addr[20:0];

    // ------------------------------------------------------------------
    // RAM read routing. Ch261 — DMA wins the port on CPU+DMA collision;
    // the CPU's read address is latched into a one-entry pending slot
    // and serviced on the next RAM cycle that the DMA does not consume.
    // Pre-Ch261 the CPU read won the address mux, and the silent
    // consequence was the DMA path sampling `ram_rd_data` from the
    // CPU's address — silent DMA data corruption. The Ch261 SIF-landing
    // TB found it.
    //
    // Single-entry slot is sufficient because every existing CPU
    // client of this map is request-then-wait-for-valid (no second
    // outstanding read in flight): exec stub, iop_core_stub, fetch
    // stub all stall in their own wait state until `iop_rd_valid`
    // asserts. Sim-only assertions below catch any future client that
    // breaks that assumption.
    // ------------------------------------------------------------------
    logic cpu_rd_hit;
    logic dma_rd_hit;
    logic cpu_dma_collision;
    assign cpu_rd_hit        = iop_rd_en && rd_is_ram;
    assign dma_rd_hit        = dma_rd_en && dma_rd_is_ram;
    assign cpu_dma_collision = cpu_rd_hit && dma_rd_hit;

    // One-entry deferred CPU-RAM-read slot. The requester's master id is
    // captured with the address so the RAM-side trace attribution of the
    // deferred access matches the request that caused it.
    logic        cpu_pend_valid;
    logic [20:0] cpu_pend_addr;
    logic [7:0]  cpu_pend_master_id;

    // Service priority (mutually exclusive):
    //   serve_dma     — DMA wins the bus any cycle it asks
    //   serve_cpu_def — deferred CPU read services on the next non-DMA cycle
    //   serve_cpu_now — live CPU read services when neither of the above fires
    logic serve_dma;
    logic serve_cpu_def;
    logic serve_cpu_now;
    assign serve_dma     = dma_rd_hit;
    assign serve_cpu_def = !dma_rd_hit && cpu_pend_valid;
    assign serve_cpu_now = !dma_rd_hit && !cpu_pend_valid && cpu_rd_hit;

    assign ram_rd_en   = serve_dma || serve_cpu_def || serve_cpu_now;
    assign ram_rd_addr = serve_dma     ? dma_rd_ram_offset
                       : serve_cpu_def ? cpu_pend_addr
                       :                 rd_ram_offset;

    // Slot update: latch on collision, clear on service.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            cpu_pend_valid     <= 1'b0;
            cpu_pend_addr      <= 21'd0;
            cpu_pend_master_id <= 8'd0;
        end else begin
            if (cpu_dma_collision && !cpu_pend_valid) begin
                cpu_pend_valid     <= 1'b1;
                cpu_pend_addr      <= rd_ram_offset;
                cpu_pend_master_id <= master_id;
            end else if (serve_cpu_def) begin
                cpu_pend_valid <= 1'b0;
            end
        end
    end

`ifndef SYNTHESIS
    // Protocol checks on the deferred slot (simulation only).
    //  1. A second CPU+DMA collision while the slot is already pending
    //     means the new CPU read is dropped silently.
    //  2. Any other CPU read issued while the slot is pending breaks the
    //     single-outstanding-read assumption: if the deferred read is
    //     being serviced this cycle the new read never gets a response,
    //     otherwise its response overtakes the deferred one.
    // The current set of CPU clients can't trigger either (single
    // outstanding read each), but future producers should fail loudly.
    always_ff @(posedge clk) begin
        if (rst_n && cpu_dma_collision && cpu_pend_valid) begin
            $error("[iop_memory_map_stub] Ch261 deferred-CPU-slot overflow: cpu_dma_collision while cpu_pend_valid (live addr=0x%05h pending addr=0x%05h)",
                   rd_ram_offset, cpu_pend_addr);
        end else if (rst_n && iop_rd_en && cpu_pend_valid) begin
            $error("[iop_memory_map_stub] CPU read issued while a deferred RAM read is still pending (live addr=0x%08h pending addr=0x%05h)",
                   iop_rd_addr, cpu_pend_addr);
        end
    end
`endif

    // SIF register-shell routing. Low byte of the physical address is
    // presented downstream (mailbox uses 8-bit offsets).
    assign sif_rd_en   = iop_rd_en && rd_is_sif;
    assign sif_rd_addr = rd_phys_addr[7:0];
    assign sif_wr_en   = iop_wr_en && cpu_wr_is_sif;
    assign sif_wr_addr = wr_phys_addr[7:0];
    assign sif_wr_data = iop_wr_data;

    // IOP DMAC ch9 routing. Low 4 bits of the physical address select
    // among MADR / BCR / CHCR (and any other in-block offsets).
    assign iop_dmac_rd_en   = iop_rd_en && rd_is_dmac;
    assign iop_dmac_rd_addr = rd_phys_addr[3:0];
    assign iop_dmac_wr_en   = iop_wr_en && cpu_wr_is_dmac;
    assign iop_dmac_wr_addr = wr_phys_addr[3:0];
    assign iop_dmac_wr_data = iop_wr_data;

    // IOP INTC routing. The low byte of the physical address is
    // presented downstream unchanged.
    // NOTE(review): the decoded window is 0x1F80_1070-0x1F80_107F, so
    // phys[7:0] is always 0x70-0x7F here. The earlier comment described
    // the selectable offsets as INTC_STAT (0x00) / INTC_MASK (0x10),
    // which this map can never present — confirm how intc_stub decodes
    // its 8-bit offset in the IOP instantiation.
    assign iop_intc_rd_en   = iop_rd_en && rd_is_intc;
    assign iop_intc_rd_addr = rd_phys_addr[7:0];
    assign iop_intc_wr_en   = iop_wr_en && cpu_wr_is_intc;
    assign iop_intc_wr_addr = wr_phys_addr[7:0];
    assign iop_intc_wr_data = iop_wr_data;

    // Ch234 — pad-I/O region wiring. The map owns a single internal
    // `sio2_input_stub` instance; the bridge's INPUT_P1/P2 latches
    // flow into it directly. `pad_rd_*` / `pad_wr_*` are the
    // map↔stub handshake (4-bit word offset within the 256-byte
    // region, captured from phys_addr[5:2]).
    // Address bits [7:6] take part in neither the window decode result
    // nor the forwarded offset, so the 16-word register file repeats
    // every 64 bytes across the 256-byte window.
    wire        pad_rd_en;
    wire [3:0]  pad_rd_addr;
    wire [31:0] pad_rd_data;
    wire        pad_rd_valid;   // not consulted (fixed 1-cycle latency assumed)
    wire        pad_wr_en;
    wire [3:0]  pad_wr_addr;
    wire [31:0] pad_wr_data;
    assign pad_rd_en   = iop_rd_en && rd_is_pad;
    assign pad_rd_addr = rd_phys_addr[5:2];
    assign pad_wr_en   = iop_wr_en && cpu_wr_is_pad;
    assign pad_wr_addr = wr_phys_addr[5:2];
    assign pad_wr_data = iop_wr_data;

    sio2_input_stub u_sio2_input (
        .clk            (clk),
        .rst_n          (rst_n),
        .input_p1       (input_p1),
        .input_p2       (input_p2),
        .rd_en          (pad_rd_en),
        .rd_addr        (pad_rd_addr),
        .rd_data        (pad_rd_data),
        .rd_valid       (pad_rd_valid),
        .wr_en          (pad_wr_en),
        .wr_addr        (pad_wr_addr),
        .wr_data        (pad_wr_data),
        // Fabric-side Sony-word taps are not used by the map; left
        // explicitly open rather than implicitly unconnected.
        .p1_sony_word_o (),
        .p2_sony_word_o ()
    );

    // BIOS ROM routing. 22-bit byte offset within the 4 MiB window.
    // No write path — BIOS is read-only.
    assign bios_rd_en   = iop_rd_en && rd_is_bios;
    assign bios_rd_addr = rd_phys_addr[21:0];

    // Write-path arbitration for the RAM side: CPU wins on same-cycle
    // collision (the bridge write is not forwarded that cycle). Neither
    // TB nor current design exercises collision; priority is defensive.
    // SIF writes are a separate port and don't contend with RAM writes.
    logic cpu_wr_hit;
    logic bridge_wr_hit;
    assign cpu_wr_hit    = iop_wr_en && cpu_wr_is_ram;
    assign bridge_wr_hit = bridge_wr_en && bridge_wr_is_ram;

    assign ram_wr_en   = cpu_wr_hit || bridge_wr_hit;
    assign ram_wr_addr = cpu_wr_hit ? cpu_wr_ram_offset : bridge_wr_ram_offset;
    assign ram_wr_data = cpu_wr_hit ? iop_wr_data       : bridge_wr_data;
    assign ram_wr_be   = cpu_wr_hit ? iop_wr_be         : bridge_wr_be;

    // RAM-side master id. The backing store has a single master_id input
    // that it uses for both its read and write trace events, and it
    // traces the read when both are active in one cycle. So the id must
    // follow the read master whenever a read is issued, and the write
    // master otherwise. (Previously this was derived from the write
    // masters only, which attributed every RAM read to the bridge id.)
    logic [7:0] ram_rd_master_id;
    logic [7:0] ram_wr_master_id;
    assign ram_rd_master_id = serve_dma     ? dma_master_id
                            : serve_cpu_def ? cpu_pend_master_id
                            :                 master_id;
    assign ram_wr_master_id = cpu_wr_hit ? master_id : bridge_master_id;
    assign ram_master_id    = ram_rd_en ? ram_rd_master_id : ram_wr_master_id;

    // ------------------------------------------------------------------
    // Read response pipeline
    //   cycle N  : iop_rd_en high, request routed downstream (or unmapped)
    //   cycle N+1: iop_rd_valid high, data from the selected store or fault
    // ------------------------------------------------------------------
    logic rd_pending;
    logic rd_was_ram;
    logic rd_was_sif;
    logic rd_was_dmac;
    logic rd_was_intc;
    logic rd_was_bios;
    logic rd_was_pad;   // Ch234

    // Ch261 — rd_pending only pulses when the CPU read is ACTUALLY
    // serviced this cycle. Cases:
    //   1. Non-RAM CPU read: always serviced (separate decode paths,
    //      no arbitration). Pulse rd_pending normally.
    //   2. RAM CPU read, no collision: serviced this cycle (serve_cpu_now
    //      fires above). Pulse rd_pending.
    //   3. RAM CPU read in collision: deferred (cpu_pend_valid latches).
    //      Do NOT pulse rd_pending — iop_rd_valid stays low until the
    //      deferred read finally fires (serve_cpu_def).
    //   4. Deferred RAM read finally serviced (serve_cpu_def): pulse
    //      rd_pending with rd_was_ram=1; the data arrives next cycle.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            rd_pending  <= 1'b0;
            rd_was_ram  <= 1'b0;
            rd_was_sif  <= 1'b0;
            rd_was_dmac <= 1'b0;
            rd_was_intc <= 1'b0;
            rd_was_bios <= 1'b0;
            rd_was_pad  <= 1'b0;
        end else if (serve_cpu_def) begin
            // Deferred RAM read serviced this cycle — data next cycle.
            rd_pending  <= 1'b1;
            rd_was_ram  <= 1'b1;
            rd_was_sif  <= 1'b0;
            rd_was_dmac <= 1'b0;
            rd_was_intc <= 1'b0;
            rd_was_bios <= 1'b0;
            rd_was_pad  <= 1'b0;
        end else if (iop_rd_en && !(rd_is_ram && cpu_dma_collision)) begin
            // Normal read path: live RAM read with no collision, OR
            // any non-RAM CPU read (decoded by rd_is_*, routed via
            // independent paths so no arbitration concern).
            rd_pending  <= 1'b1;
            rd_was_ram  <= rd_is_ram;
            rd_was_sif  <= rd_is_sif;
            rd_was_dmac <= rd_is_dmac;
            rd_was_intc <= rd_is_intc;
            rd_was_bios <= rd_is_bios;
            rd_was_pad  <= rd_is_pad;
        end else begin
            // Collision-deferred OR idle cycle. CPU waits for deferred
            // read to fire; iop_rd_valid stays low.
            rd_pending <= 1'b0;
        end
    end

    assign iop_rd_valid = rd_pending;
    // When no rd_was_* flag is set the read was unmapped: fault pattern.
    assign iop_rd_data  = rd_was_ram  ? ram_rd_data
                        : rd_was_sif  ? sif_rd_data
                        : rd_was_dmac ? iop_dmac_rd_data
                        : rd_was_intc ? iop_intc_rd_data
                        : rd_was_bios ? bios_rd_data
                        : rd_was_pad  ? pad_rd_data
                        :               32'hDEADBEEF;

    // ------------------------------------------------------------------
    // DMA read response pipeline (separate from CPU pipeline). Ch261 —
    // CPU+DMA collision is now handled cleanly by the deferred-CPU-slot
    // above: DMA wins the port immediately, CPU's read is latched and
    // serviced on the next non-DMA cycle. DMA always gets its own word
    // on its expected timing; no silent corruption.
    // ------------------------------------------------------------------
    logic dma_rd_pending;
    logic dma_rd_was_ram;
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            dma_rd_pending <= 1'b0;
            dma_rd_was_ram <= 1'b0;
        end else begin
            dma_rd_pending <= dma_rd_en;
            if (dma_rd_en) dma_rd_was_ram <= dma_rd_is_ram;
        end
    end

    assign dma_rd_valid = dma_rd_pending;
    assign dma_rd_data  = dma_rd_was_ram ? ram_rd_data : 32'hDEADBEEF;

    // ------------------------------------------------------------------
    // Trace emission — one event per cycle. Priority:
    //   CPU read > CPU write > DMA read > bridge write
    // Masters are expected to be sequenced in TBs; priority is defensive
    // for the rare collision case. A CPU read is traced in the cycle it
    // is requested, even when its RAM access is deferred.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_READ;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (iop_rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            if (rd_is_ram) begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
                ev_arg3  <= REGION_IOP_RAM;
            end else if (rd_is_sif) begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
                ev_arg3  <= REGION_SIF_REGS;
            end else if (rd_is_dmac) begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
                ev_arg3  <= REGION_IOP_DMAC;
            end else if (rd_is_intc) begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
                ev_arg3  <= REGION_IOP_INTC;
            end else if (rd_is_bios) begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
                ev_arg3  <= REGION_IOP_BIOS;
            end else if (rd_is_pad) begin
                ev_event <= EV_READ;
                ev_arg1  <= 64'd0;
                ev_arg3  <= REGION_PAD_IO;
            end else begin
                ev_event <= EV_UNMAPPED;
                ev_arg1  <= 64'hDEADBEEF;
                ev_arg3  <= REGION_UNMAPPED;
            end
            ev_arg0  <= {32'd0, iop_rd_addr};
            ev_arg2  <= {56'd0, master_id};
            ev_flags <= 32'd0;
        end else if (iop_wr_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            // BIOS-window writes deliberately fall through to UNMAPPED.
            if (cpu_wr_is_ram) begin
                ev_event <= EV_WRITE;
                ev_arg3  <= REGION_IOP_RAM;
            end else if (cpu_wr_is_sif) begin
                ev_event <= EV_WRITE;
                ev_arg3  <= REGION_SIF_REGS;
            end else if (cpu_wr_is_dmac) begin
                ev_event <= EV_WRITE;
                ev_arg3  <= REGION_IOP_DMAC;
            end else if (cpu_wr_is_intc) begin
                ev_event <= EV_WRITE;
                ev_arg3  <= REGION_IOP_INTC;
            end else if (cpu_wr_is_pad) begin
                ev_event <= EV_WRITE;
                ev_arg3  <= REGION_PAD_IO;
            end else begin
                ev_event <= EV_UNMAPPED;
                ev_arg3  <= REGION_UNMAPPED;
            end
            ev_arg0  <= {32'd0, iop_wr_addr};
            ev_arg1  <= {32'd0, iop_wr_data};
            ev_arg2  <= {56'd0, master_id};
            ev_flags <= 32'h0000_0001;
        end else if (dma_rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= dma_rd_is_ram ? EV_READ : EV_UNMAPPED;
            ev_arg0   <= {32'd0, dma_rd_addr};
            ev_arg1   <= dma_rd_is_ram ? 64'd0 : 64'hDEADBEEF;
            ev_arg2   <= {56'd0, dma_master_id};
            ev_arg3   <= dma_rd_is_ram ? REGION_IOP_RAM : REGION_UNMAPPED;
            ev_flags  <= 32'd0;
        end else if (bridge_wr_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= bridge_wr_is_ram ? EV_WRITE : EV_UNMAPPED;
            ev_arg0   <= {32'd0, bridge_wr_addr};
            ev_arg1   <= {32'd0, bridge_wr_data};
            ev_arg2   <= {56'd0, bridge_master_id};
            ev_arg3   <= bridge_wr_is_ram ? REGION_IOP_RAM : REGION_UNMAPPED;
            ev_flags  <= 32'h0000_0001;
        end else begin
            ev_valid <= 1'b0;
        end
    end

endmodule : iop_memory_map_stub
+153
View File
@@ -0,0 +1,153 @@
// retroDE_ps2 — iop_ram_stub
//
// First narrow IOP-side primitive. 32-bit IOP-RAM stub, architecturally
// honest to the IOP's R3000-class 32-bit bus. NOT an IOP CPU — this is
// pure memory. No fetch, no execution, no BIOS bring-up. Future IOP-side
// work (fetch stub, IOP memory map, DMAC routing) can build on top of it.
//
// Contract refs:
// docs/contracts/iop.md (IOP-local RAM/I/O decode)
// docs/contracts/memory.md (2 MiB IOP RAM in the PS2 memory map)
//
// Scope:
// - read/write 32-bit data
// - byte-enable granularity on writes
// - one-cycle read latency (matches existing stub ecosystem)
// - caller-provided master_id for trace attribution
// - trace events tagged as SUBSYS_IOP so IOP-side memory traffic is
// distinct from EE MEM events even when both are active
//
// Explicit non-goals (Wave 3 IOP first step):
// - IOP CPU execution
// - full 2 MiB sizing (default is 16 KiB — plenty for stub tests)
// - integration into any IOP memory map yet
// - connection to SIF receive path (intentional: kept independent so
// future bridging is explicit, not accidental)
//
// Trace payload schema:
// IOP READ arg0=addr arg1=data arg2=master_id arg3=region_id
// IOP WRITE arg0=addr arg1=data arg2=master_id arg3=region_id
// master_id : caller-provided (e.g. 0 = TB direct, future: 2 = IOP CPU,
// 3 = SIF bridge, etc.)
// region_id : 2 = IOP_RAM (constant for this module)
// flags[0] : 1 = write, 0 = read
`timescale 1ns/1ps
module iop_ram_stub
    import trace_pkg::*;
#(
    parameter int    SIZE_BYTES = 16 * 1024,   // 16 KiB default
    parameter string IMAGE_FILE = ""           // optional $readmemh image; "" = zero-fill
) (
    input  logic                          clk,
    input  logic                          rst_n,

    // Read port (byte address; bits [1:0] are ignored, access is word-wide)
    input  logic                          rd_en,
    input  logic [$clog2(SIZE_BYTES)-1:0] rd_addr,
    output logic [31:0]                   rd_data,
    output logic                          rd_valid,

    // Write port (byte address; per-byte enables select the lanes written)
    input  logic                          wr_en,
    input  logic [$clog2(SIZE_BYTES)-1:0] wr_addr,
    input  logic [31:0]                   wr_data,
    input  logic [3:0]                    wr_be,

    // Caller-provided master id for trace attribution. One id is shared
    // by the read and write trace events of a given cycle.
    input  logic [7:0]                    master_id,

    // Trace
    output logic                          ev_valid,
    output subsys_e                       ev_subsys,
    output event_e                        ev_event,
    output logic [63:0]                   ev_arg0,
    output logic [63:0]                   ev_arg1,
    output logic [63:0]                   ev_arg2,
    output logic [63:0]                   ev_arg3,
    output logic [31:0]                   ev_flags
);

    localparam int ADDR_WIDTH       = $clog2(SIZE_BYTES);
    localparam int WORD_COUNT       = SIZE_BYTES / 4;
    localparam int WORD_INDEX_WIDTH = ADDR_WIDTH - 2;
    localparam logic [63:0] REGION_IOP_RAM = 64'd2;

    // NOTE(review): the word index is ADDR_WIDTH-2 bits wide, so a
    // SIZE_BYTES that is not a power of two leaves index values with no
    // backing word — presumably all instances use power-of-two sizes;
    // confirm against the instantiations.
    logic [31:0] mem [0:WORD_COUNT-1];

    // ------------------------------------------------------------------
    // Initial contents.
    //   - No image: every word is zero.
    //   - Image given: loaded with $readmemh. In simulation the array is
    //     zero-filled first, so words the image does not cover read back
    //     as 0 (matching the no-image case) instead of X leaking into
    //     rd_data and the trace. The pre-fill is kept out of synthesis so
    //     the memory initialisation a synthesis tool sees is unchanged.
    // ------------------------------------------------------------------
    initial begin
        if (IMAGE_FILE != "") begin
`ifndef SYNTHESIS
            for (int i = 0; i < WORD_COUNT; i++) mem[i] = 32'd0;
`endif
            $display("[iop_ram_stub] loading image: %0s", IMAGE_FILE);
            $readmemh(IMAGE_FILE, mem);
        end else begin
            for (int i = 0; i < WORD_COUNT; i++) mem[i] = 32'd0;
            $display("[iop_ram_stub] zero-initialised (%0d words / %0d bytes)",
                     WORD_COUNT, SIZE_BYTES);
        end
    end

    // Word indices: drop the two byte-lane bits of each byte address.
    logic [WORD_INDEX_WIDTH-1:0] rd_word_idx;
    logic [WORD_INDEX_WIDTH-1:0] wr_word_idx;
    assign rd_word_idx = rd_addr[ADDR_WIDTH-1:2];
    assign wr_word_idx = wr_addr[ADDR_WIDTH-1:2];

    // ------------------------------------------------------------------
    // Read + write (one-cycle latency). Memory contents are not touched
    // by reset. A read in the same cycle as a write to the same word
    // returns the pre-write value (non-blocking update).
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            rd_data  <= 32'd0;
            rd_valid <= 1'b0;
        end else begin
            rd_valid <= rd_en;
            if (rd_en) rd_data <= mem[rd_word_idx];
            if (wr_en) begin
                for (int b = 0; b < 4; b++) begin
                    if (wr_be[b]) mem[wr_word_idx][b*8 +: 8] <= wr_data[b*8 +: 8];
                end
            end
        end
    end

    // ------------------------------------------------------------------
    // Trace emission — read wins on same-cycle collision (single-port
    // RAM wouldn't see that anyway in Wave 3). A write event reports the
    // full wr_data word regardless of the byte enables.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_READ;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (rd_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_READ;
            ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, rd_addr};
            ev_arg1   <= {32'd0, mem[rd_word_idx]};   // same word rd_data captures
            ev_arg2   <= {56'd0, master_id};
            ev_arg3   <= REGION_IOP_RAM;
            ev_flags  <= 32'd0;
        end else if (wr_en) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_IOP;
            ev_event  <= EV_WRITE;
            ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, wr_addr};
            ev_arg1   <= {32'd0, wr_data};
            ev_arg2   <= {56'd0, master_id};
            ev_arg3   <= REGION_IOP_RAM;
            ev_flags  <= 32'h0000_0001;
        end else begin
            ev_valid <= 1'b0;
        end
    end

endmodule : iop_ram_stub
+204
View File
@@ -0,0 +1,204 @@
// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (c) 2025-2026 retroDE contributors
// ============================================================================
// sio2_input_stub — Ch234 retroDE-local IOP-readable pad input stub
// ============================================================================
// **Not real SIO2.** A deliberately minimal MMIO surface that translates
// the Ch222 HPS-written `INPUT_P1`/`INPUT_P2` controller bitmaps into a
// Sony-format 16-bit digital pad word, exposed as IOP-readable
// registers in the retroDE-local I/O window
// `0x1F80_8500..0x1F80_85FF`. Real SIO2 emulation (`0x1F80_8200..0x1F80_82FF`,
// FIFO, command/response, IOP DMAC channel 11) is intentionally deferred
// — see `docs/contracts/sio2_pad.md` for the reconnaissance + scoping.
//
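// **Register surface** (PAD_IO_BASE = 0x1F80_8500; the values below are the
// low 12 bits of the absolute address, so 0x500 is byte offset 0x00 within
// the window, 0x504 is 0x04, 0x508 is 0x08):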
//
// 0x500 PAD_P1_STATE (RO) [15:0] = Sony 16-bit pad word for P1
// [31:16] = 0
// 0x504 PAD_P2_STATE (RO) Same shape, sourced from `input_p2`.
// 0x508 PAD_STATUS (RO) [0] = pad path present/valid = 1
// [31:1] = 0
// other reserved reads return 32'd0; writes accepted-and-ignored.
//
// **Sony pad word format (Sony "digital mode" / type 0x41 response,
// bytes 3 and 4 of the libpad/padman struct):**
//
// pad_byte3 (D-pad / start / select / sticks; active-low, 0 = pressed):
// bit 7 LEFT bit 6 DOWN bit 5 RIGHT bit 4 UP
// bit 3 START bit 2 R3 bit 1 L3 bit 0 SELECT
//
// pad_byte4 (face / shoulder buttons; active-low):
// bit 7 □ square bit 6 × cross bit 5 ○ circle bit 4 △ triangle
// bit 3 R1 bit 2 L1 bit 1 R2 bit 0 L2
//
// PAD_P1_STATE[7:0] = pad_byte3
// PAD_P1_STATE[15:8] = pad_byte4
//
// **INPUT_P1 → Sony mapping** (per `docs/contracts/sio2_pad.md`,
// SNES-style 32-bit retroDE bitmap folded onto Sony names by spatial
// face-button layout — matches the convention coco2 / a2600 already use):
//
// INPUT_P1[ 0] JOY_RIGHT → Sony RIGHT (byte3.5)
// INPUT_P1[ 1] JOY_LEFT → Sony LEFT (byte3.7)
// INPUT_P1[ 2] JOY_DOWN → Sony DOWN (byte3.6)
// INPUT_P1[ 3] JOY_UP → Sony UP (byte3.4)
// INPUT_P1[ 4] JOY_START → Sony START (byte3.3)
// INPUT_P1[ 5] JOY_SELECT → Sony SELECT (byte3.0)
// INPUT_P1[ 6] JOY_Y → Sony △ triangle (byte4.4)
// INPUT_P1[ 7] JOY_B → Sony × cross (byte4.6)
// INPUT_P1[ 8] JOY_X → Sony □ square (byte4.7)
// INPUT_P1[ 9] JOY_A → Sony ○ circle (byte4.5)
// INPUT_P1[10] JOY_L → Sony L1 (byte4.2)
// INPUT_P1[11] JOY_R → Sony R1 (byte4.3)
// INPUT_P1[12] JOY_L2 → Sony L2 (byte4.0)
// INPUT_P1[13] JOY_R2 → Sony R2 (byte4.1)
// INPUT_P1[14] JOY_L3 → Sony L3 (byte3.1)
// INPUT_P1[15] JOY_R3 → Sony R3 (byte3.2)
// INPUT_P1[16] JOY_OSD → not forwarded (retrodesd consumes it)
//
// retroDE bitmap is **active-high** (1 = pressed); Sony word is
// **active-low** (0 = pressed). The two `pad_byteN` assigns invert
// per-bit and reorder.
//
// **CDC contract.** `input_p1`/`input_p2` are bridge-clock-domain
// signals (CLOCK2_50). This module runs on the IOP/design clock.
// The 2-FF synchronizer chain inside is the standard retroDE
// single-bit sync; tearing between bits during a partial-write
// settling window is theoretically possible but practically
// vanishingly rare (retrodesd writes the whole 32-bit latch at
// one bridge edge ≤ 1 kHz; the IOP-side read is a small window
// against millions of bridge cycles). A future chapter can promote
// this to "snapshot CDC" (latch + 2-sample coherency) if tearing
// ever becomes observable.
//
// In the focused TB and single-clock sim setups, the 2-FF sync is
// a no-op functionally and adds 2 cycles of read latency from
// input change to readable register update.
// ============================================================================
`timescale 1ns/1ps
// sio2_input_stub — read-only pad-state register block.
//
// Brings the two 32-bit `input_p1` / `input_p2` bitmaps into the `clk`
// domain through a 2-FF chain, folds bits [15:0] of each into an inverted
// 16-bit pad word (see `sony_word`), and serves those words plus a constant
// status word through a registered read port. Bits [31:16] of the inputs
// are synchronised but never consumed by the translation. Writes are
// ignored.
module sio2_input_stub (
input logic clk, // IOP / design clock
input logic rst_n,
// Bridge-clock-domain inputs (sync'd internally).
input logic [31:0] input_p1,
input logic [31:0] input_p2,
// IOP map read port. `rd_addr` is the 4-bit word offset within
// the PAD I/O region (so 0x500 → addr 0x0, 0x504 → 0x1, etc.).
input logic rd_en,
input logic [3:0] rd_addr,
output logic [31:0] rd_data,
output logic rd_valid,
// IOP map write port. Writes are accepted-and-ignored.
input logic wr_en,
input logic [3:0] wr_addr,
input logic [31:0] wr_data,
// Ch250 — surface the post-translation Sony 16-bit pad words for
// fabric consumers that don't go through the IOP read memory map.
// The synth top uses `p1_sony_word_o` bits to drive status LEDs as
// a hardware proof that `bridge_input_p1_raw` actually reaches a
// live fabric consumer. (Ch241 noted those wires terminated at
// unconnected nets that Quartus elided; Ch250 ends that.) Bits
// are still active-LOW per Sony's wire-format convention. Both
// outputs are parallel taps of the same internal logic that feeds
// the 0x500/0x504 read responses — no functional change to the
// existing IOP-side path.
output logic [15:0] p1_sony_word_o,
output logic [15:0] p2_sony_word_o
);
// -----------------------------------------------------------------
// 2-FF sync of each P1/P2 bit into the IOP clock domain.
// Each bit is synchronised independently, so the 32 bits of a word
// are not guaranteed to cross on the same `clk` edge.
// Reset is asynchronous, active-low, and clears both stages to 0
// (which the translation below renders as "all released").
// -----------------------------------------------------------------
logic [31:0] p1_sync_0, p1_sync_1;
logic [31:0] p2_sync_0, p2_sync_1;
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
p1_sync_0 <= 32'd0;
p1_sync_1 <= 32'd0;
p2_sync_0 <= 32'd0;
p2_sync_1 <= 32'd0;
end else begin
p1_sync_0 <= input_p1;
p1_sync_1 <= p1_sync_0;
p2_sync_0 <= input_p2;
p2_sync_1 <= p2_sync_0;
end
end
// Second-stage outputs: the only copies the rest of the module reads.
wire [31:0] p1_q = p1_sync_1;
wire [31:0] p2_q = p2_sync_1;
// -----------------------------------------------------------------
// Sony pad-word translation. Each `pad_byteN` is the *active-low*
// Sony byte; inversion folds the active-high retroDE bitmap.
// Bit positions per `docs/contracts/sio2_pad.md`:
// byte3 = {LEFT, DOWN, RIGHT, UP, START, R3, L3, SELECT} (MSB→LSB)
// byte4 = {□, ×, ○, △, R1, L1, R2, L2}
// -----------------------------------------------------------------
// Returns {byte4, byte3}: byte3 occupies bits [7:0] of the result and
// byte4 occupies bits [15:8]. Only joy[15:0] is referenced.
function automatic logic [15:0] sony_word(input logic [31:0] joy);
logic [7:0] byte3;
logic [7:0] byte4;
// byte3 MSB→LSB: LEFT[1], DOWN[2], RIGHT[0], UP[3], START[4], R3[15], L3[14], SELECT[5]
byte3 = ~{joy[1], joy[2], joy[0], joy[3], joy[4], joy[15], joy[14], joy[5]};
// byte4 MSB→LSB: SQUARE[8], CROSS[7], CIRCLE[9], TRIANGLE[6], R1[11], L1[10], R2[13], L2[12]
byte4 = ~{joy[8], joy[7], joy[9], joy[6], joy[11], joy[10], joy[13], joy[12]};
sony_word = {byte4, byte3};
endfunction
wire [15:0] p1_word = sony_word(p1_q);
wire [15:0] p2_word = sony_word(p2_q);
// Ch250 — surface the post-translation Sony words to fabric.
// These are combinational off the second sync stage (not gated by rd_en).
assign p1_sony_word_o = p1_word;
assign p2_sony_word_o = p2_word;
// -----------------------------------------------------------------
// Register address constants (word-aligned within the PAD I/O
// region; byte-address bits [3:2] land in `rd_addr[1:0]`). The full
// 4-bit `rd_addr` is decoded, so offsets 0x3-0xF read back as zero.
// 0x500 → rd_addr = 4'h0 PAD_P1_STATE
// 0x504 → rd_addr = 4'h1 PAD_P2_STATE
// 0x508 → rd_addr = 4'h2 PAD_STATUS
// -----------------------------------------------------------------
localparam logic [3:0] OFF_P1_STATE = 4'h0;
localparam logic [3:0] OFF_P2_STATE = 4'h1;
localparam logic [3:0] OFF_STATUS = 4'h2;
// -----------------------------------------------------------------
// Read response. Combinational lookup + 1-cycle valid pipeline
// (matches the rest of the IOP map peripherals).
// -----------------------------------------------------------------
logic [31:0] rd_data_c;
always_comb begin
unique case (rd_addr)
// Pad words are zero-extended into the upper half of the 32-bit word.
OFF_P1_STATE: rd_data_c = {16'd0, p1_word};
OFF_P2_STATE: rd_data_c = {16'd0, p2_word};
// Constant 32'h0000_0001.
// NOTE(review): presumably bit 0 is a "pad present/ready" flag — confirm
// against docs/contracts/sio2_pad.md.
OFF_STATUS: rd_data_c = {31'd0, 1'b1};
default: rd_data_c = 32'd0;
endcase
end
// `rd_valid` is `rd_en` delayed by one clock. `rd_data` is captured only
// on cycles where `rd_en` is high and otherwise holds its previous value.
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
rd_data <= 32'd0;
rd_valid <= 1'b0;
end else begin
rd_valid <= rd_en;
if (rd_en)
rd_data <= rd_data_c;
end
end
// -----------------------------------------------------------------
// Writes are accepted-and-ignored. We tie `wr_*` to a placeholder
// wire so lint tools don't flag them as unused.
// The leading 1'b0 in the reduction forces the result to a constant 0.
// -----------------------------------------------------------------
// verilator lint_off UNUSED
wire _wr_unused = &{1'b0, wr_en, wr_addr, wr_data, 1'b0};
// verilator lint_on UNUSED
endmodule : sio2_input_stub
+33
View File
@@ -0,0 +1,33 @@
# rtl/memory
Memory visibility, storage, and arbitration. Matches `docs/contracts/memory.md`.
Per the BIOS-ownership split (memory owns storage, IOP owns behavior), this
directory contains the storage/mapping layer. BIOS boot sequencing
(IOPBOOT / IOPBTCONF parsing) belongs under `rtl/iop/`.
## Wave 1 contents
- `bios_rom_stub.sv` — 4 MiB BIOS ROM adapter. Loads a user-supplied hex
image via `$readmemh` when `IMAGE_FILE` is set, otherwise falls back to a
synthetic NOP sled. One-cycle read latency.
- `ee_memory_map_stub.sv` — EE-side address decode. Wave 2.7 revision adds
a DMAC read-master port (128-bit data, physical addressing) with its own
RAM-window decode at 0x00000000-0x01FFFFFF routing to `ee_ram_stub`. EE
fetch path still uses kseg-aliased decode and is BIOS-only.
## Wave 2.5 addition
- `ee_ram_stub.sv` — small addressable EE-RAM block (default 16 KiB,
128-bit data path). First real memory source for DMAC-backed transfers.
Read port: `rd_en/rd_addr/rd_data/rd_valid`. Write port: `wr_en/wr_addr/
wr_data/wr_be`. Caller-provided `master_id` gets tagged into MEM READ /
WRITE trace events. Not the final 32 MiB EE-RAM model — see
`docs/wave25_memory_backed_dma_plan.md` for scope boundaries.
## BIOS policy note
Per `docs/decisions/0002-bios-policy.md`, no BIOS image is distributed from
this repository. Synthetic fixture is the default so the project can run
stubs without any Sony firmware. Real BIOS usage requires a user-supplied
dump placed at the path passed to `IMAGE_FILE`.
+123
View File
@@ -0,0 +1,123 @@
// retroDE_ps2 — bios_rom_stub
//
// Simulation stub for the 4 MiB BIOS ROM window. Gives Milestone B a
// deterministic instruction source before the rest of the memory system
// exists.
//
// Contract refs:
// docs/stub_module_plan.md (Wave 1, item 2)
// docs/contracts/memory.md (memory owns BIOS storage/visibility)
// docs/contracts/iop.md (IOP owns BIOS behavior — NOT here)
// docs/decisions/0002-bios-policy.md (real BIOS + narrow stubs; this stub
// is the storage adapter, not firmware
// behavior, and needs no stub-policy
// tracking)
//
// Backing store:
// - If IMAGE_FILE is a non-empty string, `$readmemh` loads it from an
// `initial` block at simulation time 0. Caller must supply a hex image produced
// from a user-supplied BIOS dump. No BIOS image is shipped with this
// repository (see third_party/LICENSING.md).
// - If IMAGE_FILE is empty (default), a synthetic fixture is generated:
// mem[word_i] = 32'h00000000 (MIPS NOP: sll $0, $0, 0)
// Rationale: straight-line valid MIPS so the fixture is a legitimate
// execution target for any future emulator comparison. This aligns
// with sim/golden/trace_compare_spec.md ("first comparison target").
// Earlier versions used 32'hBFC00000 | word_index for trace-distinct
// inspection, but the spec explicitly rules out fixtures whose words
// are not a sensible execution target.
//
// Interface:
// - Byte-addressed within the 4 MiB window. The lower 2 bits of rd_addr
// are ignored (word-aligned fetch). Upstream address decode is owned
// by ee_memory_map_stub; this block does not validate the window itself.
// - One-cycle read latency: rd_en pulses on cycle N, rd_data/rd_valid
// present on cycle N+1.
// - Each completed read emits a MEM.READ trace event.
//
// Trace payload schema (per stub plan):
// MEM READ arg0=addr arg1=data arg2=master arg3=region
// master: 0=EE_IFETCH (only source wired in Wave 1)
// region: 0=BIOS
`timescale 1ns/1ps
// bios_rom_stub — word-addressed ROM model with a registered read port and
// a MEM READ trace output.
//
// Parameters:
//   SIZE_BYTES  size of the window in bytes; sets the address width and the
//               number of 32-bit words in the backing array.
//   IMAGE_FILE  hex image handed to `$readmemh`; when empty the array is
//               filled with 32'h0 instead.
// Ports:
//   rd_en/rd_addr     read request (rd_addr[1:0] are not used for indexing).
//   rd_data/rd_valid  registered response, one clock after rd_en.
//   ev_*              trace event, registered on the same edge as rd_data.
module bios_rom_stub
import trace_pkg::*;
#(
parameter int SIZE_BYTES = 4 * 1024 * 1024,
parameter string IMAGE_FILE = ""
) (
input logic clk,
input logic rst_n,
input logic rd_en,
input logic [$clog2(SIZE_BYTES)-1:0] rd_addr,
output logic [31:0] rd_data,
output logic rd_valid,
output logic ev_valid,
output subsys_e ev_subsys,
output event_e ev_event,
output logic [63:0] ev_arg0,
output logic [63:0] ev_arg1,
output logic [63:0] ev_arg2,
output logic [63:0] ev_arg3,
output logic [31:0] ev_flags
);
localparam int WORD_COUNT = SIZE_BYTES / 4;
localparam int ADDR_WIDTH = $clog2(SIZE_BYTES);
// Byte address minus the two word-offset bits.
localparam int WORD_INDEX_WIDTH = ADDR_WIDTH - 2;
logic [31:0] mem [0:WORD_COUNT-1];
// Backing-store initialisation (runs once at time 0).
// The image branch does not pre-fill the array, so any words the image
// does not cover are left uninitialised by this block.
initial begin
if (IMAGE_FILE != "") begin
$display("[bios_rom_stub] loading image: %0s", IMAGE_FILE);
$readmemh(IMAGE_FILE, mem);
end else begin
for (int i = 0; i < WORD_COUNT; i++) begin
mem[i] = 32'h00000000; // MIPS NOP
end
$display("[bios_rom_stub] synthetic NOP sled loaded (%0d words)", WORD_COUNT);
end
end
// Drop rd_addr[1:0]: every access is treated as a word-aligned fetch.
logic [WORD_INDEX_WIDTH-1:0] word_index;
assign word_index = rd_addr[ADDR_WIDTH-1:2];
// Read + trace pipeline. Reset is synchronous and active-low.
// `rd_valid` and `ev_valid` both follow `rd_en` by one clock; `rd_data`
// and the ev_arg* payload hold their last values on idle cycles.
always_ff @(posedge clk) begin
if (!rst_n) begin
rd_data <= 32'd0;
rd_valid <= 1'b0;
ev_valid <= 1'b0;
ev_subsys <= SUBSYS_MEM;
ev_event <= EV_READ;
ev_arg0 <= 64'd0;
ev_arg1 <= 64'd0;
ev_arg2 <= 64'd0;
ev_arg3 <= 64'd0;
ev_flags <= 32'd0;
end else begin
rd_valid <= rd_en;
if (rd_en) begin
rd_data <= mem[word_index];
ev_valid <= 1'b1;
ev_subsys <= SUBSYS_MEM;
ev_event <= EV_READ;
// arg0 carries the full byte address as presented (low bits included).
ev_arg0 <= {{(64-ADDR_WIDTH){1'b0}}, rd_addr};
// arg1 is the same word that is being latched into rd_data.
ev_arg1 <= {32'd0, mem[word_index]};
ev_arg2 <= 64'd0; // master: EE_IFETCH
ev_arg3 <= 64'd0; // region: BIOS
ev_flags <= 32'd0;
end else begin
ev_valid <= 1'b0;
end
end
end
endmodule : bios_rom_stub
File diff suppressed because it is too large Load Diff
+165
View File
@@ -0,0 +1,165 @@
// retroDE_ps2 — ee_ram_stub
//
// Tiny addressable EE-RAM block for Wave 2.5. Provides the first real memory
// source for DMAC-backed transfers. Not the final 32 MiB main-RAM model —
// see docs/wave25_memory_backed_dma_plan.md for explicit scope.
//
// Contract refs:
// docs/wave25_memory_backed_dma_plan.md (ee_ram_stub scope)
// docs/contracts/memory.md (memory subsystem ownership)
//
// Interface:
// - 128-bit wide data path, qword-aligned addressing (low 4 bits ignored).
// - One-cycle read latency: rd_en on cycle N → rd_data / rd_valid on N+1.
// - Write port has per-byte enables (wr_be[15:0]).
// - Optional `$readmemh` preload via IMAGE_FILE parameter.
//
// Trace:
// Emits MEM READ / MEM WRITE events one cycle after the request, matching
// the existing MEM schema. master_id is a caller-provided input (8 bits);
// the integration TB tags reads as 1 (DMAC) while TB-initiated writes are
// tagged 0. Any downstream master can drive its own id without RAM-side
// changes.
//
// Trace payload:
// MEM READ arg0=addr arg1=data_lo arg2=master_id arg3=region_id
// MEM WRITE arg0=addr arg1=data_lo arg2=master_id arg3=region_id
// master_id : caller-provided (e.g. 0 = TB direct, 1 = DMAC)
// region_id : 1 = EE_RAM (constant for this module)
// flags bit 0: 1 = write, 0 = read
`timescale 1ns/1ps
// ee_ram_stub — small 128-bit-wide RAM model with one registered read port,
// one byte-enabled write port, and a MEM READ / MEM WRITE trace output.
//
// Parameters:
//   SIZE_BYTES  capacity in bytes; sets the address width and qword count.
//   IMAGE_FILE  hex image handed to `$readmemh`; when empty the array is
//               zero-filled instead.
// Ports:
//   rd_en/rd_addr            read request (rd_addr[3:0] are not used for indexing).
//   rd_data/rd_valid         registered response, one clock after rd_en.
//   wr_en/wr_addr/wr_data/wr_be
//                            write request; wr_be[b] gates byte lane b.
//   master_id                copied into ev_arg2 of every trace event.
//   ev_*                     trace event, registered one clock after the request.
module ee_ram_stub
    import trace_pkg::*;
#(
    parameter int    SIZE_BYTES = 16 * 1024,  // 16 KiB default
    parameter string IMAGE_FILE = ""
) (
    input  logic                          clk,
    input  logic                          rst_n,
    // Read port (qword-aligned)
    input  logic                          rd_en,
    input  logic [$clog2(SIZE_BYTES)-1:0] rd_addr,
    output logic [127:0]                  rd_data,
    output logic                          rd_valid,
    // Write port (qword-aligned; wr_be provides per-byte granularity)
    input  logic                          wr_en,
    input  logic [$clog2(SIZE_BYTES)-1:0] wr_addr,
    input  logic [127:0]                  wr_data,
    input  logic [15:0]                   wr_be,
    // Optional caller-provided master id for trace attribution. Default tie
    // to 8'd0 (TB direct) if the caller doesn't drive; DMAC drives 8'd1.
    input  logic [7:0]                    master_id,
    // Trace
    output logic                          ev_valid,
    output subsys_e                       ev_subsys,
    output event_e                        ev_event,
    output logic [63:0]                   ev_arg0,
    output logic [63:0]                   ev_arg1,
    output logic [63:0]                   ev_arg2,
    output logic [63:0]                   ev_arg3,
    output logic [31:0]                   ev_flags
);
    localparam int          ADDR_WIDTH     = $clog2(SIZE_BYTES);
    localparam int          QWORD_COUNT    = SIZE_BYTES / 16;
    localparam int          QW_INDEX_WIDTH = $clog2(QWORD_COUNT);
    localparam logic [63:0] REGION_EE_RAM  = 64'd1;

    logic [127:0] mem [0:QWORD_COUNT-1];

    // Backing-store initialisation (runs once at time 0). The image branch
    // does not pre-fill the array, so qwords the image does not cover are
    // left uninitialised by this block.
    initial begin
        if (IMAGE_FILE != "") begin
            $display("[ee_ram_stub] loading image: %0s", IMAGE_FILE);
            $readmemh(IMAGE_FILE, mem);
        end else begin
            for (int i = 0; i < QWORD_COUNT; i++) mem[i] = 128'd0;
            $display("[ee_ram_stub] zero-initialised (%0d qwords / %0d bytes)",
                     QWORD_COUNT, SIZE_BYTES);
        end
    end

    // Drop the low 4 address bits: every access is qword-aligned.
    logic [QW_INDEX_WIDTH-1:0] rd_qw_idx;
    logic [QW_INDEX_WIDTH-1:0] wr_qw_idx;
    assign rd_qw_idx = rd_addr[ADDR_WIDTH-1:4];
    assign wr_qw_idx = wr_addr[ADDR_WIDTH-1:4];

    // ------------------------------------------------------------------
    // Read + write (one-cycle latency). Reads and writes to the same
    // address in the same cycle are not expected in Wave 2.5; if they
    // occur, the read sees pre-write data (standard register-file
    // semantics). Reset is synchronous and active-low; writes presented
    // while rst_n is low are dropped.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            rd_data  <= 128'd0;
            rd_valid <= 1'b0;
        end else begin
            rd_valid <= rd_en;
            if (rd_en) rd_data <= mem[rd_qw_idx];
            if (wr_en) begin
                // Byte lane b of the qword is updated only when wr_be[b] is set.
                for (int b = 0; b < 16; b++) begin
                    if (wr_be[b]) mem[wr_qw_idx][b*8 +: 8] <= wr_data[b*8 +: 8];
                end
            end
        end
    end

    // ------------------------------------------------------------------
    // Trace emission: one event per cycle, read wins over write on the
    // unlikely same-cycle collision (the write still lands in `mem`, but
    // only the read is traced). Registered so ev_valid asserts on the
    // same edge as rd_valid, i.e. one clock after the request.
    //
    // The read payload is taken straight from `mem[rd_qw_idx]` on the
    // request edge — the same value being latched into rd_data — so no
    // separate sampling register is needed.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_MEM;
            ev_event  <= EV_READ;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else begin
            if (rd_en) begin
                ev_valid  <= 1'b1;
                ev_subsys <= SUBSYS_MEM;
                ev_event  <= EV_READ;
                ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, rd_addr};
                ev_arg1   <= mem[rd_qw_idx][63:0];   // low 64 bits only
                ev_arg2   <= {56'd0, master_id};
                ev_arg3   <= REGION_EE_RAM;
                ev_flags  <= 32'd0;
            end else if (wr_en) begin
                ev_valid  <= 1'b1;
                ev_subsys <= SUBSYS_MEM;
                ev_event  <= EV_WRITE;
                ev_arg0   <= {{(64-ADDR_WIDTH){1'b0}}, wr_addr};
                ev_arg1   <= wr_data[63:0];          // raw write data, wr_be not applied
                ev_arg2   <= {56'd0, master_id};
                ev_arg3   <= REGION_EE_RAM;
                ev_flags  <= 32'h0000_0001; // bit 0 = write
            end else begin
                ev_valid <= 1'b0;
            end
        end
    end
endmodule : ee_ram_stub
+234
View File
@@ -0,0 +1,234 @@
// ============================================================================
// I2C_Controller.v — Fixed-frame I2C master (3-byte write transactions)
// ============================================================================
//
// SPDX-License-Identifier: MIT
// Copyright (c) 2026 retroDE contributors
//
// Clean-room implementation, not derived from any GPL upstream. Released
// under the MIT license to allow reuse outside the retroDE project. The
// retroDE project as a whole is distributed under GPLv3 — see ../LICENSE
// for the combined-work terms. See ../LICENSES/MIT.txt for the full MIT
// text.
//
// ----------------------------------------------------------------------------
// Purpose
// Simple I2C master that sends a fixed 24-bit frame per transaction:
// I2C_DATA = { slave_addr[6:0], rw, reg_addr[7:0], data[7:0] }
// Slave ACK is sampled after each of the three bytes. A STOP condition is
// generated at the end of the frame.
//
// Timing contract (compatible with legacy I2C_HDMI_Config parent)
// CLK Fabric clock; state is registered on its rising edge.
// CLK_EN One-cycle pulse at the desired I2C bit rate. State only advances
// when CLK_EN is asserted.
// CLK_PHASE Square wave at the SCL rate. During actively clocked bit cells,
// I2C_SCLK is driven as ~CLK_PHASE. Dedicated START/STOP hold
// phases force SCL high or low for a full cell.
// I2C_SDAT Open-drain: driven low via 1'b0 or released to 1'bz. An
// external or FPGA internal pull-up is required on this line.
// I2C_SCLK Actively driven (not open-drain). This matches the known-good
// DE25-Nano HDMI path and avoids relying on board-side pull-ups.
//
// Interface
// Port list is preserved verbatim from the legacy module so this file is
// a drop-in replacement. W_R is retained as a no-op input for source-
// compatibility; direction is encoded in I2C_DATA[16] by convention.
// SD_COUNTER and SDO are exposed for debug/observation only.
//
// Implementation note
// The transaction is modeled as explicit phases rather than implicit state
// updates. This keeps the START, bit, ACK, and STOP cells easy to inspect
// while preserving the known-good bus waveform used by retroDE_splash.
// ============================================================================
`timescale 1ns/1ps
// I2C_Controller — phase-stepped master for one fixed 24-bit write frame.
//
// Ports:
//   CLK, CLK_EN   state registers update on posedge CLK when CLK_EN is high.
//   CLK_PHASE     inverted onto I2C_SCLK during the bit and ACK phases.
//   I2C_SCLK      driven high, low, or ~CLK_PHASE depending on `phase`.
//   I2C_SDAT      driven low or released (1'bz); also sampled in ACK phases.
//   I2C_DATA      frame, latched when GO is seen in PH_IDLE; sent MSB first.
//   GO            starts a frame from PH_IDLE; must drop to leave PH_DONE.
//   END           set in PH_DONE, cleared again in PH_IDLE.
//   ACK           OR of the three sampled acknowledge slots (1 = some slot
//                 read SDA high, i.e. at least one byte was not acknowledged).
//   RESET         asynchronous, active-LOW despite the name.
//   SD_COUNTER    current `phase` value (debug).
//   SDO           current `sda_release` value (debug).
module I2C_Controller (
input wire CLK,
input wire CLK_EN,
input wire CLK_PHASE,
output wire I2C_SCLK,
inout wire I2C_SDAT,
input wire [23:0] I2C_DATA,
input wire GO,
output reg END,
input wire W_R, // retained for interface compat; unused
output wire ACK,
input wire RESET,
output wire [5:0] SD_COUNTER, // debug: current transaction phase
output wire SDO // debug: current SDA release state
);
// Phase encoding. Phases run strictly in numeric order from PH_START_HOLD
// to PH_DONE; PH_Bn_k is the cell in which bit k of byte n is on SDA.
localparam [5:0] PH_IDLE = 6'd0,
PH_START_HOLD = 6'd1,
PH_START_LOW = 6'd2,
PH_B0_7 = 6'd3,
PH_B0_6 = 6'd4,
PH_B0_5 = 6'd5,
PH_B0_4 = 6'd6,
PH_B0_3 = 6'd7,
PH_B0_2 = 6'd8,
PH_B0_1 = 6'd9,
PH_B0_0 = 6'd10,
PH_ACK0 = 6'd11,
PH_B1_7 = 6'd12,
PH_B1_6 = 6'd13,
PH_B1_5 = 6'd14,
PH_B1_4 = 6'd15,
PH_B1_3 = 6'd16,
PH_B1_2 = 6'd17,
PH_B1_1 = 6'd18,
PH_B1_0 = 6'd19,
PH_ACK1 = 6'd20,
PH_B2_7 = 6'd21,
PH_B2_6 = 6'd22,
PH_B2_5 = 6'd23,
PH_B2_4 = 6'd24,
PH_B2_3 = 6'd25,
PH_B2_2 = 6'd26,
PH_B2_1 = 6'd27,
PH_B2_0 = 6'd28,
PH_ACK2 = 6'd29,
PH_STOP_LOW = 6'd30,
PH_STOP_HIGH = 6'd31,
PH_DONE = 6'd32;
// Registered state.
reg [5:0] phase;
reg [23:0] frame_data; // frame latched from I2C_DATA at GO
reg [2:0] ack_bits; // SDA as sampled in PH_ACK0/1/2 (bit n = byte n)
reg sda_release; // 1 = release SDA (high-Z), 0 = drive SDA low
// Combinational next-state values, committed in the clocked block below.
reg [5:0] phase_next;
reg [23:0] frame_data_next;
reg [2:0] ack_bits_next;
reg sda_release_next;
reg end_next;
// SCL: held high while idle, in the START hold cell, in the final STOP cell
// and when done; held low in the two dedicated low cells; otherwise follows
// the inverse of CLK_PHASE.
assign I2C_SCLK =
(phase == PH_IDLE || phase == PH_START_HOLD || phase == PH_STOP_HIGH || phase == PH_DONE) ? 1'b1 :
(phase == PH_START_LOW || phase == PH_STOP_LOW) ? 1'b0 :
~CLK_PHASE;
// SDA is never driven high: a '1' bit is sent by releasing the line.
assign I2C_SDAT = sda_release ? 1'bz : 1'b0;
assign ACK = |ack_bits;
assign SDO = sda_release;
assign SD_COUNTER = phase;
// Next-state logic. Every register defaults to "hold", so each case arm
// only lists what changes. In each arm `sda_release_next` is the SDA value
// for the NEXT phase (e.g. PH_START_LOW loads frame bit 23 for PH_B0_7).
always @(*) begin
phase_next = phase;
frame_data_next = frame_data;
ack_bits_next = ack_bits;
sda_release_next = sda_release;
end_next = END;
case (phase)
PH_IDLE: begin
end_next = 1'b0;
sda_release_next = 1'b1;
if (GO) begin
phase_next = PH_START_HOLD;
frame_data_next = I2C_DATA;
ack_bits_next = 3'd0;
// Pull SDA low for the next cell while SCL is still high: START.
sda_release_next = 1'b0;
end
end
PH_START_HOLD: begin
phase_next = PH_START_LOW;
sda_release_next = 1'b0;
end
PH_START_LOW: begin
phase_next = PH_B0_7;
sda_release_next = frame_data[23];
end
// Byte 0: frame_data[23:16].
PH_B0_7: begin phase_next = PH_B0_6; sda_release_next = frame_data[22]; end
PH_B0_6: begin phase_next = PH_B0_5; sda_release_next = frame_data[21]; end
PH_B0_5: begin phase_next = PH_B0_4; sda_release_next = frame_data[20]; end
PH_B0_4: begin phase_next = PH_B0_3; sda_release_next = frame_data[19]; end
PH_B0_3: begin phase_next = PH_B0_2; sda_release_next = frame_data[18]; end
PH_B0_2: begin phase_next = PH_B0_1; sda_release_next = frame_data[17]; end
PH_B0_1: begin phase_next = PH_B0_0; sda_release_next = frame_data[16]; end
// Release SDA for the acknowledge cell so the slave can drive it.
PH_B0_0: begin phase_next = PH_ACK0; sda_release_next = 1'b1; end
PH_ACK0: begin
phase_next = PH_B1_7;
// Sample the bus: 0 = acknowledged, 1 = not acknowledged.
ack_bits_next[0] = I2C_SDAT;
sda_release_next = frame_data[15];
end
// Byte 1: frame_data[15:8].
PH_B1_7: begin phase_next = PH_B1_6; sda_release_next = frame_data[14]; end
PH_B1_6: begin phase_next = PH_B1_5; sda_release_next = frame_data[13]; end
PH_B1_5: begin phase_next = PH_B1_4; sda_release_next = frame_data[12]; end
PH_B1_4: begin phase_next = PH_B1_3; sda_release_next = frame_data[11]; end
PH_B1_3: begin phase_next = PH_B1_2; sda_release_next = frame_data[10]; end
PH_B1_2: begin phase_next = PH_B1_1; sda_release_next = frame_data[9]; end
PH_B1_1: begin phase_next = PH_B1_0; sda_release_next = frame_data[8]; end
PH_B1_0: begin phase_next = PH_ACK1; sda_release_next = 1'b1; end
PH_ACK1: begin
phase_next = PH_B2_7;
ack_bits_next[1] = I2C_SDAT;
sda_release_next = frame_data[7];
end
// Byte 2: frame_data[7:0].
PH_B2_7: begin phase_next = PH_B2_6; sda_release_next = frame_data[6]; end
PH_B2_6: begin phase_next = PH_B2_5; sda_release_next = frame_data[5]; end
PH_B2_5: begin phase_next = PH_B2_4; sda_release_next = frame_data[4]; end
PH_B2_4: begin phase_next = PH_B2_3; sda_release_next = frame_data[3]; end
PH_B2_3: begin phase_next = PH_B2_2; sda_release_next = frame_data[2]; end
PH_B2_2: begin phase_next = PH_B2_1; sda_release_next = frame_data[1]; end
PH_B2_1: begin phase_next = PH_B2_0; sda_release_next = frame_data[0]; end
PH_B2_0: begin phase_next = PH_ACK2; sda_release_next = 1'b1; end
PH_ACK2: begin
phase_next = PH_STOP_LOW;
ack_bits_next[2] = I2C_SDAT;
// Hold SDA low through the STOP set-up cells.
sda_release_next = 1'b0;
end
PH_STOP_LOW: begin
phase_next = PH_STOP_HIGH;
sda_release_next = 1'b0;
end
PH_STOP_HIGH: begin
phase_next = PH_DONE;
// Release SDA for the next cell while SCL stays high: STOP.
sda_release_next = 1'b1;
end
PH_DONE: begin
end_next = 1'b1;
sda_release_next = 1'b1;
// Wait here, with END asserted, until the parent drops GO.
if (!GO)
phase_next = PH_IDLE;
end
// Unused encodings (33-63) fall back to idle with the bus released.
default: begin
phase_next = PH_IDLE;
sda_release_next = 1'b1;
end_next = 1'b0;
end
endcase
end
// State commit. RESET is asynchronous and active-low; outside reset the
// registers only move on cycles where CLK_EN is asserted.
always @(posedge CLK or negedge RESET) begin
if (!RESET) begin
phase <= PH_IDLE;
frame_data <= 24'd0;
ack_bits <= 3'd0;
sda_release <= 1'b1;
END <= 1'b0;
end
else if (CLK_EN) begin
phase <= phase_next;
frame_data <= frame_data_next;
ack_bits <= ack_bits_next;
sda_release <= sda_release_next;
END <= end_next;
end
end
endmodule
+236
View File
@@ -0,0 +1,236 @@
// ============================================================================
// I2C_HDMI_Config.v — ADV7513 HDMI transmitter configuration via I2C
// ============================================================================
//
// Derived from Terasic DE-series reference design (I2C_HDMI_Config.v).
// Original copyright belongs to Terasic Technologies Inc.; this file is
// distributed under the terms of the Terasic Reference Design license that
// ships with the DE25-Nano System CD (free use on Terasic hardware,
// copyright notice retained).
//
// retroDE modifications (2025-2026):
// - LUT_SIZE expanded to 38 entries
// - Audio configuration for I2S input @ 48 kHz, MCLK 12.288 MHz
// - HPD override (0xD6 = 0xC0) for monitors that misreport hot-plug
// - AVI InfoFrame configured for full-range RGB 444 output
// - Comments documenting each ADV7513 register write
//
// ============================================================================
`timescale 1ns/1ps
// I2C_HDMI_Config — walks a fixed register table and writes each entry to
// I2C device address byte 0x72 through I2C_Controller.
//
// Ports:
//   iCLK         fabric clock feeding the I2C clock divider.
//   iRST_N       asynchronous active-low reset for every register here.
//   I2C_SCLK     I2C clock, driven by the controller.
//   I2C_SDAT     I2C data (open-drain style; see I2C_Controller).
//   HDMI_TX_INT  when low after the table has completed, the table is
//                replayed from entry 0.
//   READY        high once LUT_INDEX has reached LUT_SIZE.
//   ERROR        sticky; set after NACK_LIMIT consecutive failed attempts
//                on one table entry, cleared only by iRST_N.
module I2C_HDMI_Config (    // Host Side
    iCLK,
    iRST_N,
    // I2C Side
    I2C_SCLK,
    I2C_SDAT,
    HDMI_TX_INT,
    READY,
    // Ch166: sticky NACK watchdog
    ERROR
);
// Host Side
input           iCLK;
input           iRST_N;
// I2C Side: SCL is actively driven by the master; SDA is open-drain
// (master drives low / releases to 1'bz; slave drives ACK).
output          I2C_SCLK;
inout           I2C_SDAT;
input           HDMI_TX_INT;
output          READY;
// Ch166: ERROR latches HIGH if the same LUT entry NACKs
// NACK_LIMIT consecutive times (chip absent, address wrong,
// bus shorted). Sticky until iRST_N. Cleared on reset.
output          ERROR;

// Internal Registers/Wires
reg     [15:0]  mI2C_CLK_DIV;
reg     [23:0]  mI2C_DATA;
reg             mI2C_CTRL_CLK;
reg             mI2C_GO;
wire            mI2C_END;
wire            mI2C_ACK;
reg     [15:0]  LUT_DATA;
reg     [5:0]   LUT_INDEX;
reg     [3:0]   mSetup_ST;
reg             READY;

// Clock Setting
parameter       CLK_Freq    = 50000000;     // 50 MHz
parameter       I2C_Freq    = 20000;        // 20 KHz
// LUT Data Number
parameter       LUT_SIZE    = 38;
// Ch166 - NACK watchdog threshold (consecutive retries on the
// same LUT entry before ERROR latches).
//
// Timing note: the divider below toggles mI2C_CTRL_CLK once every
// (CLK_Freq/I2C_Freq)+1 iCLK cycles, so the controller clock runs at
// roughly I2C_Freq/2 (about 10 kHz with the defaults), not I2C_Freq.
// One 3-byte frame walks about 33 controller clocks (~3.3 ms), so
// 16 retries is on the order of 50 ms before the bus is declared
// dead - generous enough for real-world bus settling but well short
// of a stuck-LED user complaint.
parameter       NACK_LIMIT  = 16;

/////////////////////   I2C Control Clock   ////////////////////////
// Free-running divider: counts 0..CLK_Freq/I2C_Freq, then wraps and
// toggles mI2C_CTRL_CLK.
always @(posedge iCLK or negedge iRST_N)
begin
    if (!iRST_N)
    begin
        mI2C_CTRL_CLK   <= 0;
        mI2C_CLK_DIV    <= 0;
    end
    else
    begin
        if (mI2C_CLK_DIV < (CLK_Freq/I2C_Freq))
            mI2C_CLK_DIV    <= mI2C_CLK_DIV + 1;
        else
        begin
            mI2C_CLK_DIV    <= 0;
            mI2C_CTRL_CLK   <= ~mI2C_CTRL_CLK;
        end
    end
end
////////////////////////////////////////////////////////////////////
I2C_Controller u0 (
    .CLK        (mI2C_CTRL_CLK),    // Controller work clock
    .CLK_EN     (1'b1),             // Advance every controller clock
    .CLK_PHASE  (mI2C_CTRL_CLK),    // Phase for SCL generation
    .I2C_SCLK   (I2C_SCLK),         // I2C CLOCK
    .I2C_SDAT   (I2C_SDAT),         // I2C DATA
    .I2C_DATA   (mI2C_DATA),        // DATA:[SLAVE_ADDR,SUB_ADDR,DATA]
    .GO         (mI2C_GO),          // GO transfer
    .END        (mI2C_END),         // END transfer
    .W_R        (1'b0),             // Ch165 audit: tie the retained-compat port low (always WRITE)
    .ACK        (mI2C_ACK),         // ACK (high = at least one byte NACKed)
    .RESET      (iRST_N)
);
////////////////////////////////////////////////////////////////////
//////////////////////  Config Control  ////////////////////////////
// Per-entry sequence:
//   state 0  load {0x72, LUT_DATA} and raise GO
//   state 1  wait for END; ACKed -> state 2, NACKed -> state 0 (retry)
//   state 2  advance LUT_INDEX
always @(posedge mI2C_CTRL_CLK or negedge iRST_N)
begin
    if (!iRST_N)
    begin
        READY       <= 0;
        LUT_INDEX   <= 0;
        mSetup_ST   <= 0;
        mI2C_GO     <= 0;
    end
    else
    begin
        if (LUT_INDEX < LUT_SIZE)
        begin
            READY <= 0;
            case (mSetup_ST)
            0:  begin
                    mI2C_DATA   <= {8'h72, LUT_DATA};
                    mI2C_GO     <= 1;
                    mSetup_ST   <= 1;
                end
            1:  begin
                    if (mI2C_END)
                    begin
                        if (!mI2C_ACK)
                            mSetup_ST   <= 2;
                        else
                            mSetup_ST   <= 0;
                        mI2C_GO <= 0;
                    end
                end
            2:  begin
                    LUT_INDEX   <= LUT_INDEX + 1;
                    mSetup_ST   <= 0;
                end
            // mSetup_ST is 4 bits wide but only 0-2 are used. Recover from
            // any other value instead of stalling with READY low forever.
            default:
                begin
                    mI2C_GO     <= 0;
                    mSetup_ST   <= 0;
                end
            endcase
        end
        else
        begin
            READY <= 1;
            // Table finished: replay it from the top while the interrupt
            // input is low, otherwise hold.
            if (!HDMI_TX_INT)
            begin
                LUT_INDEX   <= 0;
            end
            else
                LUT_INDEX   <= LUT_INDEX;
        end
    end
end
////////////////////////////////////////////////////////////////////
//////////////////  Ch166 NACK watchdog (sticky)  //////////////////
//
// Counts consecutive NACK retries on the *current* LUT entry.
// In the config FSM above, state 1 sees mI2C_END at the end of
// each I2C transaction; if mI2C_ACK is HIGH (slave didn't drive
// the ACK bit LOW), the FSM bounces back to state 0 and retries
// the same LUT_DATA. State 2 means the byte ACKed and LUT_INDEX
// is about to advance, so we clear the retry count there. Once
// the count hits NACK_LIMIT, ERROR latches HIGH (sticky until
// iRST_N) so the top level can surface a stuck bus on an LED.
// The watchdog only reports; it does not stop the retry loop.
reg [7:0]   nack_retries;
reg         error_latched;
always @(posedge mI2C_CTRL_CLK or negedge iRST_N)
begin
    if (!iRST_N)
    begin
        nack_retries    <= 0;
        error_latched   <= 1'b0;
    end
    else
    begin
        if (mSetup_ST == 1 && mI2C_END && mI2C_ACK)
        begin
            nack_retries <= nack_retries + 1;
            if (nack_retries == NACK_LIMIT - 1)
                error_latched <= 1'b1;
        end
        else if (mSetup_ST == 2)
        begin
            nack_retries <= 0;
        end
    end
end
assign ERROR = error_latched;
////////////////////////////////////////////////////////////////////
/////////////////////   Config Data LUT   //////////////////////////
// Purely combinational lookup: {register address, value} per entry.
// Blocking assignments are used because this is not a clocked block.
always @(*)
begin
    case (LUT_INDEX)
    // Video Config Data
    00 : LUT_DATA = 16'h9803;  //Must be set to 0x03 for proper operation
    01 : LUT_DATA = 16'hD6C0;  //HPD override: force HPD always-high (bits[7:6]=11)
    02 : LUT_DATA = 16'h0100;  //Set 'N' value at 6144
    03 : LUT_DATA = 16'h0218;  //Set 'N' value at 6144
    04 : LUT_DATA = 16'h0300;  //Set 'N' value at 6144
    05 : LUT_DATA = 16'h0a01;  //MCLK ratio = 256x fs (12.288 MHz / 48 kHz)
    06 : LUT_DATA = 16'h0b2e;  //MCLK Active
    07 : LUT_DATA = 16'h0cbc;  //Serial Audio standard i2s, R0x0C[1:0] = '00
    08 : LUT_DATA = 16'h1402;  //Audio Word Length 16 bit, stereo (2 channels)
    09 : LUT_DATA = 16'h1520;  //Input 444 (RGB or YCrCb) with Separate Syncs, 48kHz fs
    10 : LUT_DATA = 16'h1630;  //Output format 444, 24-bit input
    11 : LUT_DATA = 16'h1846;  //Disable CSC
    12 : LUT_DATA = 16'h4080;  //General control packet enable
    13 : LUT_DATA = 16'h4110;  //Power down control
    14 : LUT_DATA = 16'h49A8;  //Set dither mode - 12-to-10 bit
    15 : LUT_DATA = 16'h5510;  //AVI InfoFrame byte 1: Y=RGB, A0=active fmt valid
    16 : LUT_DATA = 16'h5608;  //AVI InfoFrame byte 2: active format aspect
    17 : LUT_DATA = 16'h5708;  //AVI InfoFrame byte 3: Q=10 (full range RGB 0-255)
    18 : LUT_DATA = 16'h94C0;  //INT enable 1: HPD + monitor sense only
    19 : LUT_DATA = 16'h9500;  //INT enable 2: all disabled
    20 : LUT_DATA = 16'h96C0;  //Clear HPD + monitor sense status (matches 0x94 enable mask)
    21 : LUT_DATA = 16'h7301;  //Info frame Ch count = 2 (stereo)
    22 : LUT_DATA = 16'h7600;  //Speaker allocation: FL+FR (stereo)
    23 : LUT_DATA = 16'h9803;  //Must be set to 0x03 for proper operation
    24 : LUT_DATA = 16'h9902;  //Must be set to Default Value
    25 : LUT_DATA = 16'h9ae0;  //Must be set to 0xE0 (0b11100000)
    26 : LUT_DATA = 16'h9c30;  //PLL filter R1 value
    27 : LUT_DATA = 16'h9d61;  //Set clock divide
    28 : LUT_DATA = 16'ha2a4;  //Must be set to 0xA4 for proper operation
    29 : LUT_DATA = 16'ha3a4;  //Must be set to 0xA4 for proper operation
    30 : LUT_DATA = 16'ha504;  //Must be set to Default Value
    31 : LUT_DATA = 16'hab40;  //Must be set to Default Value
    32 : LUT_DATA = 16'haf16;  //Select HDMI mode
    33 : LUT_DATA = 16'hba60;  //No clock delay
    34 : LUT_DATA = 16'hd1ff;  //Must be set to Default Value
    35 : LUT_DATA = 16'hde10;  //Must be set to Default for proper operation
    36 : LUT_DATA = 16'he460;  //Must be set to Default Value
    37 : LUT_DATA = 16'hfa7d;  //Nbr of times to look for good phase
    default: LUT_DATA = 16'h9803;
    endcase
end
////////////////////////////////////////////////////////////////////
endmodule
+29
View File
@@ -0,0 +1,29 @@
# rtl/platform
retroDE-specific platform integration. Matches `docs/contracts/platform.md`.
## Wave 1 contents
- `platform_video_stub.sv` — free-running raster generator. Default VGA
640x480 timing (overridable per-testbench to tiny values for fast sim).
Takes `bg_{r,g,b}` from `gs_stub` and flood-fills the active region.
Emits one `EV_MODE` per completed frame so testbenches can count frames
without sampling raw video.
## Scope boundary
This directory owns:
- clock/reset sequencing entry points,
- retroDE-facing video and audio adaptation,
- HPS bridge plumbing (future),
- top-level wrappers not belonging inside PS2 subsystems.
It does **not** own GS/PCRTC semantics (that's `rtl/gif_gs/`), SPU2 audio
synthesis (`rtl/spu2/`), or any PS2 register behavior.
## Replacement path
`platform_video_stub` stays as the platform adaptation layer. What changes
is the upstream pixel source: Wave 1 → flat BGCOLOR from `gs_stub`,
later waves → fuller GS/PCRTC output including framebuffer scan-out.
+1
View File
@@ -0,0 +1 @@
/home/ubuntu/FPGA_Projects/retroDE_splash/rtl/platform/cp437_8x8.mem
+163
View File
@@ -0,0 +1,163 @@
// retroDE_ps2 — platform_video_stub
//
// Smallest retroDE-facing video adapter needed for Milestone A. Accepts a
// flat pixel source (bg_{r,g,b}) from gs_stub and generates a free-running
// VGA-style raster with configurable timing. Wave 1 produces a flood-fill
// frame at the current BGCOLOR — enough to prove the platform video path
// end-to-end without waiting for real GS/PCRTC behavior.
//
// Contract refs:
// docs/stub_module_plan.md (Wave 1, item 5)
// docs/contracts/platform.md
//
// Default timing is VGA 640x480 @ 25.175 MHz pixel clock. Testbenches
// typically override to tiny values (e.g. 16x8 with minimal porches) to
// keep simulation turnaround short.
//
// Replacement path: this module remains as the platform adaptation layer
// while the upstream pixel source evolves from gs_stub to fuller GS/PCRTC
// output.
//
// Trace payload schema:
// PLAT MODE arg0=frame_number arg1=pixels_per_frame arg2=- arg3=-
// emitted once per frame, on the clock after the last pixel slot of the
// frame (end_of_frame) — not on a vsync edge — so testbenches can count
// frames without sampling raw video signals.
`timescale 1ns/1ps
// platform_video_stub — free-running raster generator with a flat colour
// fill and a once-per-frame trace event.
//
// Parameters:
//   H_ACTIVE/H_FRONT/H_SYNC/H_BACK  horizontal regions, in pixel clocks.
//   V_ACTIVE/V_FRONT/V_SYNC/V_BACK  vertical regions, in lines.
//   HSYNC_ACTIVE_LOW/VSYNC_ACTIVE_LOW  invert the respective sync output.
// Ports:
//   bg_r/bg_g/bg_b  colour passed through to r/g/b while `de` is high.
//   hsync/vsync/de/r/g/b  combinational off the raster counters.
//   ev_*            trace event, pulsed for one clock per completed frame.
module platform_video_stub
    import trace_pkg::*;
#(
    // Horizontal timing (in pixel clocks)
    parameter int H_ACTIVE = 640,
    parameter int H_FRONT  = 16,
    parameter int H_SYNC   = 96,
    parameter int H_BACK   = 48,
    // Vertical timing (in line counts)
    parameter int V_ACTIVE = 480,
    parameter int V_FRONT  = 10,
    parameter int V_SYNC   = 2,
    parameter int V_BACK   = 33,
    // Sync polarity. VGA 640x480 is active-low on both.
    parameter bit HSYNC_ACTIVE_LOW = 1'b1,
    parameter bit VSYNC_ACTIVE_LOW = 1'b1
) (
    input  logic       clk,   // pixel clock
    input  logic       rst_n,
    // Pixel source from gs_stub
    input  logic [7:0] bg_r,
    input  logic [7:0] bg_g,
    input  logic [7:0] bg_b,
    // Platform-facing video
    output logic       hsync,
    output logic       vsync,
    output logic       de,
    output logic [7:0] r,
    output logic [7:0] g,
    output logic [7:0] b,
    // Trace
    output logic       ev_valid,
    output subsys_e    ev_subsys,
    output event_e     ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);
    localparam int H_TOTAL      = H_ACTIVE + H_FRONT + H_SYNC + H_BACK;
    localparam int V_TOTAL      = V_ACTIVE + V_FRONT + V_SYNC + V_BACK;
    localparam int H_SYNC_START = H_ACTIVE + H_FRONT;
    localparam int H_SYNC_END   = H_SYNC_START + H_SYNC;
    localparam int V_SYNC_START = V_ACTIVE + V_FRONT;
    localparam int V_SYNC_END   = V_SYNC_START + V_SYNC;
    localparam int HCNT_W       = $clog2(H_TOTAL);
    localparam int VCNT_W       = $clog2(V_TOTAL);

    logic [HCNT_W-1:0] hcnt;
    logic [VCNT_W-1:0] vcnt;

    // ------------------------------------------------------------------
    // Raster counters
    // ------------------------------------------------------------------
    logic end_of_line;
    logic end_of_frame;
    // H_TOTAL-1 / V_TOTAL-1 always fit in the counter width, so the
    // narrowing casts here are lossless.
    assign end_of_line  = (hcnt == HCNT_W'(H_TOTAL - 1));
    assign end_of_frame = end_of_line && (vcnt == VCNT_W'(V_TOTAL - 1));

    // Synchronous active-low reset; hcnt wraps every line, vcnt every frame.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            hcnt <= '0;
            vcnt <= '0;
        end else if (end_of_line) begin
            hcnt <= '0;
            vcnt <= end_of_frame ? '0 : (vcnt + VCNT_W'(1));
        end else begin
            hcnt <= hcnt + HCNT_W'(1);
        end
    end

    // ------------------------------------------------------------------
    // Sync + data-enable + pixel colour
    //
    // Region boundaries are compared at 32 bits rather than at counter
    // width. A boundary may legitimately equal H_TOTAL / V_TOTAL (zero
    // back porch, or no blanking at all); when that total is a power of
    // two, casting it to HCNT_W / VCNT_W bits truncates it to 0 and the
    // "< boundary" test can never be true. Zero-extending the counter
    // keeps every boundary exact.
    // ------------------------------------------------------------------
    logic [31:0] hpos;
    logic [31:0] vpos;
    assign hpos = 32'(hcnt);
    assign vpos = 32'(vcnt);

    logic active_h;
    logic active_v;
    logic in_hsync;
    logic in_vsync;
    assign active_h = (hpos < 32'(H_ACTIVE));
    assign active_v = (vpos < 32'(V_ACTIVE));
    assign in_hsync = (hpos >= 32'(H_SYNC_START)) && (hpos < 32'(H_SYNC_END));
    assign in_vsync = (vpos >= 32'(V_SYNC_START)) && (vpos < 32'(V_SYNC_END));

    assign hsync = HSYNC_ACTIVE_LOW ? ~in_hsync : in_hsync;
    assign vsync = VSYNC_ACTIVE_LOW ? ~in_vsync : in_vsync;
    assign de    = active_h && active_v;
    // Colour is forced to black outside the active region.
    assign r = de ? bg_r : 8'd0;
    assign g = de ? bg_g : 8'd0;
    assign b = de ? bg_b : 8'd0;

    // ------------------------------------------------------------------
    // Trace: one EV_MODE pulse per completed frame. The pulse is
    // registered off end_of_frame, so it is visible on the first clock
    // of the following frame. arg0 is the zero-based index of the frame
    // that just finished; arg1 is the active pixel count.
    // ------------------------------------------------------------------
    logic [31:0] frame_count;
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            frame_count <= 32'd0;
            ev_valid    <= 1'b0;
            ev_subsys   <= SUBSYS_PLAT;
            ev_event    <= EV_MODE;
            ev_arg0     <= 64'd0;
            ev_arg1     <= 64'd0;
            ev_arg2     <= 64'd0;
            ev_arg3     <= 64'd0;
            ev_flags    <= 32'd0;
        end else if (end_of_frame) begin
            frame_count <= frame_count + 32'd1;
            ev_valid    <= 1'b1;
            ev_subsys   <= SUBSYS_PLAT;
            ev_event    <= EV_MODE;
            ev_arg0     <= {32'd0, frame_count};
            ev_arg1     <= {{(64-32){1'b0}}, 32'(H_ACTIVE * V_ACTIVE)};
            ev_arg2     <= 64'd0;
            ev_arg3     <= 64'd0;
            ev_flags    <= 32'd0;
        end else begin
            ev_valid <= 1'b0;
        end
    end
endmodule : platform_video_stub
File diff suppressed because it is too large Load Diff
+222
View File
@@ -0,0 +1,222 @@
// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (c) 2025-2026 retroDE contributors
// ============================================================================
// ps2_hps_bridge_null — minimal AXI4 slave for the PS2 core's Ch170 shell
// ============================================================================
//
// Purpose: present an AXI4 slave endpoint to the HPS hps2fpga bridge that
// (a) does proper AXI handshake so HPS transactions can't stall the bus,
// and (b) exposes a minimal "core identity" register window at 0x000-0x00F
// so retrodesd / probing utilities can read back who loaded.
//
// This is the Ch170 placeholder — when a real ps2_hps_bridge.sv lands (with
// HPS-driven core_reset, status mirrors, ROM staging, etc.), it should keep
// the same AXI4 port signature so the top-wrapper instantiation doesn't
// need to change.
//
// AXI4 subset (matches splash_hps_bridge.sv):
// - 128-bit data bus; araddr[3:2] selects the 32-bit lane that carries read data (write data is discarded, so awaddr is not decoded)
// - Single-beat only (awlen=0, arlen=0)
// - 4-bit ID echo
// - 38-bit address
//
// Identity register map (ABI v1.0 — read-only):
// 0x000 CORE_ID = 32'h70533200 ("pS2\0" — placeholder, refine later)
// 0x004 ABI_VERSION = 32'h00000100 (v1.0)
// 0x008 CORE_STATUS = 32'h00000001 (bit 0 = loaded)
// 0x00C CORE_CAPS = 32'h00000000 (no caps advertised)
//
// Everything else: reads return 0, writes ACK'd and discarded.
// ============================================================================
`timescale 1ns/1ps
// ps2_hps_bridge_null — placeholder single-beat AXI4 slave.
//
// Reads return one of four constant identity words when the address falls in
// the first 16 bytes of the window and zero otherwise; writes are
// acknowledged with OKAY and their data is discarded. awlen/arlen are not
// examined: every request is answered as a single beat (rlast is tied high
// and bvalid follows the first accepted W beat).
module ps2_hps_bridge_null (
    input  logic         clk,            // qsys clk_100_clk domain
    input  logic         reset_n,        // active-low; asynchronously resets both FSMs
    input  logic         h2f_reset,      // HPS-driven fabric reset (active high) — unused; reserved

    // AXI4 slave — write address channel
    input  logic [3:0]   s_axi_awid,
    input  logic [37:0]  s_axi_awaddr,
    input  logic [7:0]   s_axi_awlen,
    input  logic [2:0]   s_axi_awsize,
    input  logic [1:0]   s_axi_awburst,
    input  logic         s_axi_awlock,
    input  logic [3:0]   s_axi_awcache,
    input  logic [2:0]   s_axi_awprot,
    input  logic         s_axi_awvalid,
    output logic         s_axi_awready,

    // AXI4 slave — write data channel
    input  logic [127:0] s_axi_wdata,
    input  logic [15:0]  s_axi_wstrb,
    input  logic         s_axi_wlast,
    input  logic         s_axi_wvalid,
    output logic         s_axi_wready,

    // AXI4 slave — write response channel
    output logic [3:0]   s_axi_bid,
    output logic [1:0]   s_axi_bresp,
    output logic         s_axi_bvalid,
    input  logic         s_axi_bready,

    // AXI4 slave — read address channel
    input  logic [3:0]   s_axi_arid,
    input  logic [37:0]  s_axi_araddr,
    input  logic [7:0]   s_axi_arlen,
    input  logic [2:0]   s_axi_arsize,
    input  logic [1:0]   s_axi_arburst,
    input  logic         s_axi_arlock,
    input  logic [3:0]   s_axi_arcache,
    input  logic [2:0]   s_axi_arprot,
    input  logic         s_axi_arvalid,
    output logic         s_axi_arready,

    // AXI4 slave — read data channel
    output logic [3:0]   s_axi_rid,
    output logic [127:0] s_axi_rdata,
    output logic [1:0]   s_axi_rresp,
    output logic         s_axi_rlast,
    output logic         s_axi_rvalid,
    input  logic         s_axi_rready
);

    // ----------------------------------------------------------------
    // Identity register window (Ch170 ABI v1.0).
    // ----------------------------------------------------------------
    localparam logic [31:0] CORE_ID     = 32'h70533200;
    localparam logic [31:0] ABI_VERSION = 32'h00000100;
    localparam logic [31:0] CORE_STATUS = 32'h00000001;
    localparam logic [31:0] CORE_CAPS   = 32'h00000000;

    // Returns the identity word addressed by `addr`.
    //   addr[37:4] != 0 -> 0 (outside the 16-byte identity window)
    //   addr[3:2]       -> selects CORE_ID / ABI_VERSION / CORE_STATUS / CORE_CAPS
    //   addr[1:0]       -> ignored
    function automatic logic [31:0] identity_lookup(input logic [37:0] addr);
        if (addr[37:4] != '0)
            return 32'd0;
        case (addr[3:2])
            2'b00:   return CORE_ID;
            2'b01:   return ABI_VERSION;
            2'b10:   return CORE_STATUS;
            default: return CORE_CAPS;
        endcase
    endfunction

    // ----------------------------------------------------------------
    // Write FSM. Single-beat and sequential: awready is high only in
    // W_IDLE and wready only in W_DATA, so the address beat is accepted
    // first and the data beat afterwards. bvalid is raised once the W
    // beat is taken and held in W_RESP until bready, so multi-cycle
    // bready timing from qsys still completes cleanly. Write data and
    // strobes are never stored.
    // ----------------------------------------------------------------
    typedef enum logic [1:0] { W_IDLE, W_DATA, W_RESP } w_state_t;
    w_state_t   w_state;
    logic [3:0] aw_id_q;     // awid captured at AW handshake, echoed on bid

    always_ff @(posedge clk or negedge reset_n) begin
        if (!reset_n) begin
            w_state      <= W_IDLE;
            aw_id_q      <= '0;
            s_axi_bvalid <= 1'b0;
        end else begin
            case (w_state)
                W_IDLE: begin
                    s_axi_bvalid <= 1'b0;
                    if (s_axi_awvalid && s_axi_awready) begin
                        aw_id_q <= s_axi_awid;
                        w_state <= W_DATA;
                    end
                end
                W_DATA: begin
                    if (s_axi_wvalid && s_axi_wready) begin
                        s_axi_bvalid <= 1'b1;
                        w_state      <= W_RESP;
                    end
                end
                W_RESP: begin
                    if (s_axi_bready) begin
                        s_axi_bvalid <= 1'b0;
                        w_state      <= W_IDLE;
                    end
                end
                default: w_state <= W_IDLE;
            endcase
        end
    end

    assign s_axi_awready = (w_state == W_IDLE);
    assign s_axi_wready  = (w_state == W_DATA);
    assign s_axi_bid     = aw_id_q;
    assign s_axi_bresp   = 2'b00;  // OKAY

    // ----------------------------------------------------------------
    // Read FSM. Same shape — accept arvalid, drive rdata + rvalid,
    // hold until rready.
    // ----------------------------------------------------------------
    typedef enum logic [0:0] { R_IDLE, R_RESP } r_state_t;
    r_state_t     r_state;
    logic [3:0]   ar_id_q;   // arid captured at AR handshake, echoed on rid
    logic [127:0] rdata_q;   // registered read response

    // Identity word for the address currently presented on the AR channel.
    // Evaluated once here and steered into a lane below.
    logic [31:0] ar_word;
    assign ar_word = identity_lookup(s_axi_araddr);

    always_ff @(posedge clk or negedge reset_n) begin
        if (!reset_n) begin
            r_state      <= R_IDLE;
            ar_id_q      <= '0;
            rdata_q      <= '0;
            s_axi_rvalid <= 1'b0;
        end else begin
            case (r_state)
                R_IDLE: begin
                    s_axi_rvalid <= 1'b0;
                    if (s_axi_arvalid && s_axi_arready) begin
                        ar_id_q <= s_axi_arid;
                        // Place the 32-bit identity word in the 32-bit
                        // lane selected by araddr[3:2]; the other three
                        // lanes read as zero (mirrors splash_hps_bridge's
                        // lane semantics).
                        case (s_axi_araddr[3:2])
                            2'b00:   rdata_q <= {96'd0, ar_word};
                            2'b01:   rdata_q <= {64'd0, ar_word, 32'd0};
                            2'b10:   rdata_q <= {32'd0, ar_word, 64'd0};
                            default: rdata_q <= {ar_word, 96'd0};
                        endcase
                        s_axi_rvalid <= 1'b1;
                        r_state      <= R_RESP;
                    end
                end
                R_RESP: begin
                    if (s_axi_rready) begin
                        s_axi_rvalid <= 1'b0;
                        r_state      <= R_IDLE;
                    end
                end
                default: r_state <= R_IDLE;
            endcase
        end
    end

    assign s_axi_arready = (r_state == R_IDLE);
    assign s_axi_rid     = ar_id_q;
    assign s_axi_rdata   = rdata_q;
    assign s_axi_rresp   = 2'b00;  // OKAY
    assign s_axi_rlast   = 1'b1;   // single-beat

    // ----------------------------------------------------------------
    // Reference every input this module does not consume so lint does
    // not flag it: awaddr/awlen/awsize/awburst/awlock/awcache/awprot,
    // wdata/wstrb/wlast, arlen/arsize/arburst/arlock/arcache/arprot,
    // h2f_reset.
    // ----------------------------------------------------------------
    // verilator lint_off UNUSED
    wire _unused_ok = &{ 1'b0,
                         s_axi_awaddr,
                         s_axi_awlen, s_axi_awsize, s_axi_awburst,
                         s_axi_awlock, s_axi_awcache, s_axi_awprot,
                         s_axi_wdata, s_axi_wstrb, s_axi_wlast,
                         s_axi_arlen, s_axi_arsize, s_axi_arburst,
                         s_axi_arlock, s_axi_arcache, s_axi_arprot,
                         h2f_reset,
                         1'b0 };
    // verilator lint_on UNUSED

endmodule : ps2_hps_bridge_null
+173
View File
@@ -0,0 +1,173 @@
// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (c) 2025-2026 retroDE contributors
// ============================================================================
// tile_ram_cdc — Ch229 bridge-clock → design-clock tile-RAM shadow
// ============================================================================
// Implements the design-domain side of the Ch229 tile-RAM CDC. Owns a
// 1024 × 32-bit shadow memory in the design clock domain. Bridge-side
// writes arrive as a toggle-based "event" signal plus latched index +
// data; a 2-FF synchronizer + XOR edge detector turns each toggle edge
// into a 1-cycle write pulse against the shadow RAM. Read port is
// purely combinational (the consumer is the Ch245 platform-OSD
// char-BRAM read adapter in the top, which selects high/low 16-bit
// cells from each 32-bit shadow word and feeds them to the platform
// `osd_overlay`. Pre-Ch245 the consumer was the now-retired
// PS2-local `osd_overlay_stub`). No back-pressure — the bridge is assumed
// to space tile writes far enough apart for the sync chain to keep up.
//
// **CDC contract (read carefully before refactoring):**
// - The bridge updates `bclk_wr_toggle`, `bclk_wr_index`, `bclk_wr_data`
// at the same `bclk` edge (one bridge clock cycle).
// - The receiver sees the toggle through a 2-FF synchronizer; the
// edge-detection wire `wr_pulse` fires on the dclk cycle where the
// synchronized toggle has FULLY settled. That guarantees ≥ 2 dclk
// periods of stability on `bclk_wr_index/data` before they're
// sampled into the shadow memory.
// - Multiple bridge writes faster than ~3 dclk periods apart will
// race and may drop or merge events. For the Ch229 use case
// (retrodesd OSD updates at ≤ 1 kHz, design_clk at 25–50 MHz),
// this is many orders of magnitude of slack. **Do not** wire a
// fast-cycling source (e.g. a counter) into the bridge's tile
// write path without first replacing this CDC with an async FIFO.
//
// **Reset behavior:**
// - While `breset_n` is low (reset asserted): bridge clears `bclk_wr_toggle` to 0
// (matching the receiver's post-reset state). When both domains
// reset together (the normal case on FPGA configure), no spurious
// edge fires after release.
// - While `dreset_n` is low (reset asserted): synchronizer chain clears to 0;
// shadow memory contents are NOT cleared (matches Ch227 retention
// semantics — sim `initial` block zeroes for determinism, hardware
// power-up is undefined). The Ch229 contract is "tile RAM survives
// warm reset"; rebooting both sides is a power-cycle scenario and
// the bridge will re-broadcast any written tiles via the next set
// of AXI writes from HPS.
// ============================================================================
`timescale 1ns/1ps
// tile_ram_cdc: design-clock shadow of the bridge tile RAM. A toggle from the
// bridge domain is synchronized into dclk and each toggle edge writes
// bclk_wr_data to shadow_mem[bclk_wr_index]; reads are combinational.
module tile_ram_cdc (
// ---- Bridge clock domain (write port) ----
input logic bclk, // bridge clock; only referenced by the lint tie-off below
input logic breset_n, // bridge reset; only referenced by the lint tie-off below
input logic bclk_wr_toggle, // flips once per tile write; synchronized into dclk below
input logic [9:0] bclk_wr_index, // sampled directly on dclk (not synchronized) when wr_pulse is high
input logic [31:0] bclk_wr_data, // sampled directly on dclk (not synchronized) when wr_pulse is high
// ---- Design clock domain (read port) ----
input logic dclk,
input logic dreset_n, // active-low; asynchronously clears the sync chain and the watchdog
input logic [9:0] dclk_rd_index,
output logic [31:0] dclk_rd_data, // combinational: shadow_mem[dclk_rd_index]
// ---- Ch230 design-domain diagnostic counter ----
// Saturating count of "tile writes too close" events — a wr_pulse
// that arrives while dclk_since_last_pulse < MIN_DCLK_GAP.
// Exposed as an output so the top can route it to a reverse-CDC +
// bridge-readable diagnostic register in a future chapter (Ch231+).
// For Ch230 the top leaves it unconnected; the counter still exists
// in the design domain as a synthesis artifact ready for hookup.
output logic [15:0] tile_wr_too_close_count
);
// Shadow RAM lives in the design clock domain. Matched-size with
// the bridge-side `ps2_hps_bridge.tile_mem` (1024 × 32-bit). The
// `ramstyle = "M20K"` attribute (added in the Ch232 hardware
// bring-up hotfix) asks Quartus to use M20K block RAM
// instead of distributing the storage across LABs.
// NOTE(review): the read port below is combinational — confirm that
// Quartus honours the M20K request with an unregistered read.
(* ramstyle = "M20K" *) logic [31:0] shadow_mem [0:1023];
// Zero the shadow contents at time 0. There is no reset path into
// shadow_mem, so dreset_n does not clear it.
initial begin
for (int i = 0; i < 1024; i++)
shadow_mem[i] = 32'd0;
end
// Toggle synchronizer into the design clock. toggle_sync[0] and [1]
// are the 2-FF synchronizer proper; toggle_sync[2] is a one-cycle
// delayed copy of [1] used only for edge detection, so the edge
// detector ([2] ^ [1]) works on already-resampled bits.
logic [2:0] toggle_sync;
always_ff @(posedge dclk or negedge dreset_n) begin
if (!dreset_n)
toggle_sync <= 3'b000;
else
toggle_sync <= {toggle_sync[1:0], bclk_wr_toggle};
end
// High for the single dclk cycle in which stage [1] already holds the
// new toggle value and stage [2] still holds the old one.
wire wr_pulse = toggle_sync[2] ^ toggle_sync[1];
// Shadow write port. On the dclk edge that ends the wr_pulse cycle,
// sample bclk_wr_index + bclk_wr_data. That edge is two dclk edges
// after the one that first captured the toggle into toggle_sync[0];
// index/data stability over that window relies on the bridge-side
// contract and is not enforced here.
always_ff @(posedge dclk) begin
if (wr_pulse)
shadow_mem[bclk_wr_index] <= bclk_wr_data;
end
// Read port: combinational lookup. The consumer pulls index from
// its pixel position and uses the data to decide overlay vs
// transparent for each pixel.
assign dclk_rd_data = shadow_mem[dclk_rd_index];
// ---- Ch229 / Ch230 tile-write rate watchdog ----
// The CDC contract requires writes to be spaced far enough apart
// that each toggle edge passes through the sync chain cleanly.
// Two consecutive bridge writes that both flip toggle within one
// dclk of each other can be merged into a single transition at
// sync[0] — the first write's bclk_wr_index/bclk_wr_data are
// overwritten before the receiver samples them, and the write is
// silently lost.
//
// The actual minimum gap is ≥ 3 dclk between successive
// wr_pulse events at the receiver:
// - 1 dclk for the synchronizer to fully settle (so the
// second edge is visible as a distinct transition)
// - 1 dclk for the receiver to fire wr_pulse for write 1
// - 1 dclk of margin for jitter / setup time
//
// Production rate enforcer is software-side (retrodesd OSD
// updates at ≤ 1 kHz, i.e. ≥ 1 ms between writes, versus
// 3 dclk @ 25 MHz = 120 ns); the bridge
// does not back-pressure AXI on this constraint. Ch229 added a
// sim-only `$display` warning; Ch230 promotes the gap-tracker to
// a real **saturating counter** (16-bit) exposed as
// `tile_wr_too_close_count` so a future chapter can route it
// through a reverse CDC into a bridge-readable register
// (HDMI_DIAG upper bits or a new diagnostic offset). The
// `$display` aid remains in `\`ifndef SYNTHESIS` for pre-silicon
// log visibility.
//
// NOTE(review): dclk_since_last_pulse is cleared on a pulse and
// incremented on every other cycle, so it reads k-1 when the next
// pulse arrives k dclk cycles later. With the `<` comparison a
// spacing of exactly MIN_DCLK_GAP cycles is therefore also counted
// as too close — confirm this conservative threshold is intended.
localparam int unsigned MIN_DCLK_GAP = 3;
logic [31:0] dclk_since_last_pulse;
wire too_close = wr_pulse && (dclk_since_last_pulse < MIN_DCLK_GAP);
always_ff @(posedge dclk or negedge dreset_n) begin
if (!dreset_n) begin
// Reset to the saturated value so the first pulse after reset
// is never flagged.
dclk_since_last_pulse <= 32'hFFFF_FFFF;
tile_wr_too_close_count <= 16'd0;
end else begin
// Gap tracker: restart on every pulse, otherwise count up and
// stick at all-ones.
if (wr_pulse)
dclk_since_last_pulse <= 32'd0;
else if (dclk_since_last_pulse != 32'hFFFF_FFFF)
dclk_since_last_pulse <= dclk_since_last_pulse + 32'd1;
// Event counter: saturates at 16'hFFFF instead of wrapping.
if (too_close && (tile_wr_too_close_count != 16'hFFFF))
tile_wr_too_close_count <= tile_wr_too_close_count + 16'd1;
end
end
`ifndef SYNTHESIS
// Simulation-only log line for every too-close event seen outside reset.
always_ff @(posedge dclk) begin
if (dreset_n && too_close) begin
$display(
"[tile_ram_cdc] WARN time=%0t: tile writes too close - %0d dclk cycles between toggle edges (CDC needs >= %0d for safe sample).",
$time, dclk_since_last_pulse, MIN_DCLK_GAP);
end
end
`endif
// ---- Lint: bclk + breset_n are not used by any logic in this
// module (the toggle arrives already generated upstream). Tie a
// placeholder reference to silence "unused" warnings on tools
// that don't trace through the upstream toggle source.
// verilator lint_off UNUSED
wire _unused_ok = &{1'b0, bclk, breset_n, 1'b0};
// verilator lint_on UNUSED
endmodule : tile_ram_cdc
+91
View File
@@ -0,0 +1,91 @@
# rtl/sif
EE↔IOP subsystem interface. Matches `docs/contracts/sif.md`.
## Current contents
- `sif_mailbox_stub.sv` — minimal four-register mailbox/flag shell
(MSCOM / SMCOM / MSFLG / SMFLG). Independent EE-side and IOP-side register
ports. Directional set/clear semantics deferred; this phase only proves
that both sides observe consistent storage and that side-of-origin is
trace-visible. Per-register write arbitration: EE wins on same-register
collision, independent writes to different registers coexist.
- `sif_dma_stub.sv` — receive-side DMA endpoint. Accepts qwords from a
DMAC channel's `ep_*` port into a small internal buffer (default DEPTH=8).
Capacity-safe: `in_ready` drops when `rx_count >= DEPTH`, `full_o`
exposed for testbench observation. TB-controlled `stall_in` input for
explicit stall testing. Read port for payload verification. No consume
path yet — once full, stays full. NOT an IOP — purely a bounded receive
buffer with trace emission per accepted beat.
- `sif_mailbox_peer_stub.sv` — tiny active peer used in integration tests
to play "the IOP side" of a specific mailbox protocol. Re-armable
command-echo state machine (poll MSFLG → read MSCOM → write SMCOM →
write SMFLG → wait for TB to clear MSFLG → repeat). Refuses to re-fire
while the doorbell bit stays high, so lifecycle is explicit. Exposes
`ack_count_o` for testbench synchronisation.
Explicitly NOT an IOP core: no code execution, no BIOS bring-up, no
implicit flag clearing (re-arm is the TB's responsibility). Kept under
`rtl/sif/` precisely so it does not get misread as IOP maturity progress.
- `sif_dma_iop_ram_bridge_stub.sv` — width-adapting bridge from a 128-bit
SIF DMA endpoint to 32-bit IOP-side writes. Splits each incoming qword
into four 32-bit writes at consecutive physical addresses from
`DEST_BASE_ADDR`. Little-endian unpacking. Drives the IOP memory map's
bridge-write port (`bridge_wr_*`). In-ready drops while the bridge is
flushing a qword — natural backpressure to the DMAC.
- `sif_dma_ack_peer_stub.sv` — protocol combiner for the first combined
control+data SIF milestone. Observes a mailbox doorbell (MSFLG pending
bit) AND `sif_dma_stub.last_seen` (payload completion); only emits the
ack sequence (SMCOM=cmd + SMFLG=ACK) once both are true. Composes two
existing SIF primitives; does not fatten the plain mailbox peer with
DMA awareness.
Explicitly NOT an IOP.
- `sif_dma_ee_ram_bridge_stub.sv` — width-adapting bridge from a 32-bit
SIF DMA endpoint (IOP→EE egress) to 128-bit EE-side writes. Mirror of
`sif_dma_iop_ram_bridge_stub` in the other direction: accumulates four
consecutive 32-bit beats into a qword (little-endian), then issues
one write through the EE memory map's bridge write port. Drops
`in_ready` during the one-cycle emit for natural back-pressure.
Handles partial-quad on `in_last` via byte-enable masking. Exposes
`last_seen_o` — a level-held latch that rises when the final beat of
a transfer is accepted, so EE-side protocol combiners can gate on
"payload fully landed."
- `sif_dma_ee_ack_peer_stub.sv` — protocol combiner for the first
IOP-driven combined control+data SIF milestone. Polarity mirror of
`sif_dma_ack_peer_stub`: observes the mailbox's EE side for an IOP
doorbell (SMFLG pending bit), gates on
`sif_dma_ee_ram_bridge_stub.last_seen_o`, and only then reads SMCOM
and echoes MSCOM + MSFLG=ACK back IOP-ward. One-shot. Explicitly NOT
an EE core — purely a composition of two existing SIF primitives.
## Current status
The SIF seam is feature-complete for staged bring-up in both directions.
Storage, active peer, lifecycle/re-arm, negative-path, EE→IOP DMA, three
classes of backpressure (start / mid-transfer / full-stop), EE-driven
combined control+data gating, a reverse-direction (IOP→EE) data path
with its own stall semantics, AND the matching IOP-driven combined
control+data handshake are all proven end-to-end. Further SIF-only work
would be symmetry-chasing rather than unlocking new architectural
questions.
## Deferred follow-ons (not gaps)
These are known extension points, intentionally not pursued yet:
- **Re-armable combined control+data handshakes.** Both directions are
currently one-shot; re-arm mostly composes pieces already proven
separately. Nice-to-have.
- **Directional write-ownership + flag set/clear semantics.** Currently
both sides of the mailbox can write any register with plain replace
semantics; real PS2 has directional set/W1C rules.
- **Real EE↔IOP coordination.** Arrives once an IOP-side execution
primitive exists that can observe SIF as "IOP behaviour," not as a
peer stub.
## Scope boundary
This directory owns the SIF register shell and DMA-visible coordination.
It does **not** own:
- IOP CPU execution (`rtl/iop/`, not yet created)
- EE-side addressing / kseg stripping for SIF registers (memory-map work)
- Interrupt routing to INTC on SIF transitions (Wave 3)
+217
View File
@@ -0,0 +1,217 @@
// retroDE_ps2 — boot_install_agent_stub (Ch55 / Ch56)
//
// Minimal external producer that streams a coordinated low-RAM handler
// image into EE RAM through the SIF EE-RAM bridge. Emits 32-bit beats
// on a ready/valid handshake compatible with sif_dma_ee_ram_bridge_stub.
//
// NOT an IOP, NOT a full boot firmware. This is the thinnest possible
// stand-in for "whatever on real PS2 populates EE useg [0x80..0x1FF]
// with exception-entry + safe-return stubs before the EE starts
// faulting" (IOP→EE SIF DMA, BootROM/CDVD handoff, etc.). The point
// is to validate the transport path and the coordinated-install
// thesis, not to model the producer's identity.
//
// Payload source (Ch56):
// USE_IMAGE_FILE=0 (default) — built-in Ch54 image, hardcoded below
// USE_IMAGE_FILE=1 — $readmemh(IMAGE_FILE, payload) once
// at sim start, expects TOTAL_WORDS
// hex words
// Transport (timing, handshake, trace) is identical across both modes.
//
// Built-in image (USE_IMAGE_FILE=0):
// word[0..3] → AdES handler at useg 0x80..0x8C:
// MFC0 $26, $14 (32'h401A7000)
// ADDIU $26, $26, 4 (32'h275A0004)
// JR $26 (32'h03400008)
// RFE (32'h42000010)
// word[4..95] → 46× (JR $31; NOP) safe-return pairs covering
// useg 0x90..0x1FC.
//
// Downstream contract (matches sif_dma_ee_ram_bridge_stub upstream):
// out_valid / out_data[31:0] / out_last / out_ready
// out_last asserted on the final word. One-beat-per-cycle while
// out_ready is high.
//
// Trace:
// SUBSYS_SIF / EV_DMA_START once on go.
// SUBSYS_SIF / EV_DMA_BEAT per accepted beat.
// arg0 = word index, arg1 = word data, arg2 = MASTER_ID,
// arg3 = TOTAL_WORDS, flags bit0 = out_last.
// SUBSYS_SIF / EV_DMA_DONE once on completion.
`timescale 1ns/1ps
// boot_install_agent_stub — streams a fixed-length word image over a
// ready/valid interface after a go pulse, then parks in a terminal done
// state. Emits one trace event for start, one per accepted beat, and one
// for completion.
module boot_install_agent_stub
    import trace_pkg::*;
#(
    parameter int         TOTAL_WORDS    = 96,
    parameter logic [7:0] MASTER_ID      = 8'd6,   // install agent
    parameter bit         USE_IMAGE_FILE = 1'b0,   // 0: built-in ROM, 1: $readmemh
    parameter string      IMAGE_FILE     = ""
) (
    input  logic        clk,
    input  logic        rst_n,       // active-low, synchronous

    input  logic        go_i,        // sampled only while idle

    output logic        out_valid,
    output logic [31:0] out_data,
    output logic        out_last,
    input  logic        out_ready,

    output logic        busy_o,      // high while streaming
    output logic        done_o,      // high once the last word was accepted

    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    localparam int          IDX_W    = $clog2(TOTAL_WORDS);
    localparam logic [31:0] LAST_IDX = 32'(TOTAL_WORDS - 1);

    // ------------------------------------------------------------------
    // Payload image
    // ------------------------------------------------------------------
    // Built-in layout: a 4-word exception-return stub followed by
    // (JR $31; NOP) pairs. That layout needs an even TOTAL_WORDS of at
    // least 4, which is checked before anything is written.
    logic [31:0] payload [0:TOTAL_WORDS-1];

    initial begin
        if (TOTAL_WORDS < 4 || (TOTAL_WORDS & 1) != 0) begin
            $fatal(1, "boot_install_agent_stub: TOTAL_WORDS must be even and >= 4, got %0d",
                   TOTAL_WORDS);
        end

        if (!USE_IMAGE_FILE) begin
            payload[0] = 32'h401A7000;   // MFC0  $26, $14
            payload[1] = 32'h275A0004;   // ADDIU $26, $26, 4
            payload[2] = 32'h03400008;   // JR    $26
            payload[3] = 32'h42000010;   // RFE   (delay slot)
            for (int pair = 0; pair < (TOTAL_WORDS - 4) / 2; pair++) begin
                payload[4 + 2 * pair]     = 32'h03E00008;   // JR $31
                payload[4 + 2 * pair + 1] = 32'h00000000;   // NOP
            end
        end else begin
            if (IMAGE_FILE == "") begin
                $fatal(1, "boot_install_agent_stub: USE_IMAGE_FILE=1 but IMAGE_FILE is empty");
            end
            $readmemh(IMAGE_FILE, payload);
        end
    end

    // ------------------------------------------------------------------
    // Streaming sequencer
    // ------------------------------------------------------------------
    typedef enum logic [1:0] {
        PH_IDLE   = 2'd0,
        PH_STREAM = 2'd1,
        PH_DONE   = 2'd2
    } phase_e;

    phase_e      phase;       // current sequencer phase
    phase_e      phase_d;     // phase delayed by one clock (for done edge detect)
    logic [31:0] word_idx;    // index of the word currently offered

    logic streaming;
    logic final_word;
    logic beat_fire;

    assign streaming  = (phase == PH_STREAM);
    assign final_word = (word_idx == LAST_IDX);

    assign out_valid = streaming;
    assign out_last  = streaming && final_word;
    assign out_data  = streaming ? payload[word_idx[IDX_W-1:0]] : 32'd0;
    assign beat_fire = out_valid && out_ready;

    assign busy_o = streaming;
    assign done_o = (phase == PH_DONE);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            phase    <= PH_IDLE;
            phase_d  <= PH_IDLE;
            word_idx <= 32'd0;
        end else begin
            phase_d <= phase;
            case (phase)
                PH_IDLE:
                    if (go_i) begin
                        phase    <= PH_STREAM;
                        word_idx <= 32'd0;
                    end
                PH_STREAM:
                    if (beat_fire) begin
                        // The index is left on the last word when finishing.
                        if (final_word) phase    <= PH_DONE;
                        else            word_idx <= word_idx + 32'd1;
                    end
                PH_DONE: ;   // terminal: stays here until reset
                default: phase <= PH_IDLE;
            endcase
        end
    end

    // ------------------------------------------------------------------
    // Trace
    // ------------------------------------------------------------------
    // start_fire is taken straight from go_i while idle, a cycle in which
    // no beat can be accepted, so START never competes with a BEAT.
    // done_rise is the first cycle spent in PH_DONE, one cycle after the
    // final beat was accepted.
    logic start_fire;
    logic done_rise;

    assign start_fire = (phase == PH_IDLE) && go_i;
    assign done_rise  = (phase == PH_DONE) && (phase_d != PH_DONE);

    // Select which event (if any) is recorded this cycle.
    // Priority: START, then BEAT, then DONE. The defaults describe START.
    logic        trc_fire;
    event_e      trc_event;
    logic [63:0] trc_arg0;
    logic [63:0] trc_arg1;
    logic [31:0] trc_flags;

    always_comb begin
        trc_fire  = 1'b1;
        trc_event = EV_DMA_START;
        trc_arg0  = 64'd0;
        trc_arg1  = 64'd0;
        trc_flags = 32'd0;

        if (start_fire) begin
            // defaults already hold the START payload
        end else if (beat_fire) begin
            trc_event = EV_DMA_BEAT;
            trc_arg0  = {32'd0, word_idx};
            trc_arg1  = {32'd0, out_data};
            trc_flags = {31'd0, out_last};
        end else if (done_rise) begin
            trc_event = EV_DMA_DONE;
            trc_arg0  = 64'(TOTAL_WORDS);
        end else begin
            trc_fire = 1'b0;
        end
    end

    // Registered event port: payload fields update only when an event
    // fires and otherwise hold their previous value; ev_valid is a
    // one-cycle strobe.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_SIF;
            ev_event  <= EV_DMA_START;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else begin
            ev_valid <= trc_fire;
            if (trc_fire) begin
                ev_subsys <= SUBSYS_SIF;
                ev_event  <= trc_event;
                ev_arg0   <= trc_arg0;
                ev_arg1   <= trc_arg1;
                ev_arg2   <= {56'd0, MASTER_ID};
                ev_arg3   <= 64'(TOTAL_WORDS);
                ev_flags  <= trc_flags;
            end
        end
    end

endmodule : boot_install_agent_stub
+181
View File
@@ -0,0 +1,181 @@
// retroDE_ps2 — sif_dma_ack_peer_stub
//
// Protocol combiner for the first combined control+data SIF milestone.
// Observes a mailbox command doorbell on one seam and the SIF DMA receive
// endpoint's payload-complete indication on the other; only issues the
// mailbox ack sequence once BOTH are true.
//
// Explicitly NOT an IOP. This module has no code execution, no bus master,
// and no capability beyond composing two existing SIF primitives. Kept
// under `rtl/sif/` with the other SIF scaffolding.
//
// Contract refs:
// docs/contracts/sif.md
//
// Layering:
// sif_mailbox_stub — storage primitive
// sif_mailbox_peer_stub — mailbox-only active peer (no DMA awareness)
// sif_dma_stub — data-plane receive endpoint
// sif_dma_ack_peer_stub — THIS module. Wires the two together.
//
// Protocol (one-shot):
// 1. EE writes MSCOM = cmd
// 2. EE writes MSFLG = CMD_PENDING_BIT (request doorbell)
// 3. DMAC transfers bounded payload into sif_dma_stub
// 4. sif_dma_stub asserts last_seen once the final beat arrives
// 5. this peer observes (MSFLG & CMD_PENDING_BIT) AND last_seen
// 6. peer reads MSCOM
// 7. peer writes SMCOM = cmd
// 8. peer writes SMFLG = CMD_ACK_BIT
// 9. terminal DONE (one-shot for this milestone)
//
// The peer does NOT clear MSFLG or SMFLG — lifecycle is the TB's
// responsibility, consistent with sif_mailbox_peer_stub's guardrail.
//
// Ports connect to:
// obs_* → sif_mailbox_stub iop_rd_* (peer reads MSFLG, then MSCOM)
// resp_* → sif_mailbox_stub iop_wr_* (peer writes SMCOM, then SMFLG)
// payload_complete ← sif_dma_stub.last_seen
`timescale 1ns/1ps
// sif_dma_ack_peer_stub — one-shot sequencer that polls a flag register,
// and once the pending bit and payload_complete are both seen on the same
// poll result, reads the command register and writes it back followed by
// an ack flag.
module sif_dma_ack_peer_stub
#(
    parameter logic [7:0]  MSCOM_OFF       = 8'h00,
    parameter logic [7:0]  SMCOM_OFF       = 8'h10,
    parameter logic [7:0]  MSFLG_OFF       = 8'h20,
    parameter logic [7:0]  SMFLG_OFF       = 8'h30,
    parameter logic [31:0] CMD_PENDING_BIT = 32'h0000_0001,
    parameter logic [31:0] CMD_ACK_BIT     = 32'h0000_0002
) (
    input  logic        clk,
    input  logic        rst_n,            // active-low, synchronous

    // Mailbox observation (IOP-side read port)
    output logic        obs_rd_en,
    output logic [7:0]  obs_rd_addr,
    input  logic [31:0] obs_rd_data,
    input  logic        obs_rd_valid,

    // Mailbox response (IOP-side write port)
    output logic        resp_wr_en,
    output logic [7:0]  resp_wr_addr,
    output logic [31:0] resp_wr_data,

    // Payload completion indication from sif_dma_stub (level)
    input  logic        payload_complete,

    // Status
    output logic        done_o,
    output logic [31:0] ack_count_o
);

    typedef enum logic [2:0] {
        ST_POLL_REQ    = 3'd0,   // issue MSFLG read
        ST_POLL_WAIT   = 3'd1,   // await read data, evaluate the gate
        ST_MSCOM_REQ   = 3'd2,   // issue MSCOM read
        ST_MSCOM_WAIT  = 3'd3,   // await read data, capture command
        ST_WRITE_SMCOM = 3'd4,   // write captured command to SMCOM
        ST_WRITE_SMFLG = 3'd5,   // write ack bit to SMFLG
        ST_DONE        = 3'd6    // terminal (one-shot for this milestone)
    } seq_e;

    seq_e        seq_q;      // registered sequencer state
    seq_e        seq_d;      // next sequencer state
    logic [31:0] cmd_q;      // command captured from MSCOM

    // The gate: the pending bit in the returned flag word AND
    // payload_complete, both evaluated on the cycle the poll read data
    // is valid.
    logic ack_gate;
    assign ack_gate = ((obs_rd_data & CMD_PENDING_BIT) != 32'd0) && payload_complete;

    // ------------------------------------------------------------------
    // Next-state logic
    // ------------------------------------------------------------------
    always_comb begin
        seq_d = seq_q;
        unique case (seq_q)
            ST_POLL_REQ:    seq_d = ST_POLL_WAIT;
            ST_POLL_WAIT: begin
                if (obs_rd_valid) begin
                    if (ack_gate) seq_d = ST_MSCOM_REQ;
                    else          seq_d = ST_POLL_REQ;   // poll again
                end
            end
            ST_MSCOM_REQ:   seq_d = ST_MSCOM_WAIT;
            ST_MSCOM_WAIT: begin
                if (obs_rd_valid) seq_d = ST_WRITE_SMCOM;
            end
            ST_WRITE_SMCOM: seq_d = ST_WRITE_SMFLG;
            ST_WRITE_SMFLG: seq_d = ST_DONE;
            ST_DONE:        seq_d = ST_DONE;
            default:        seq_d = ST_POLL_REQ;
        endcase
    end

    // ------------------------------------------------------------------
    // State, captured command, and ack bookkeeping registers
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            seq_q       <= ST_POLL_REQ;
            cmd_q       <= 32'd0;
            ack_count_o <= 32'd0;
            done_o      <= 1'b0;
        end else begin
            seq_q <= seq_d;

            if ((seq_q == ST_MSCOM_WAIT) && obs_rd_valid)
                cmd_q <= obs_rd_data;

            // ST_WRITE_SMFLG lasts exactly one cycle, so this counts one
            // completion per pass through the sequence.
            if (seq_q == ST_WRITE_SMFLG) begin
                ack_count_o <= ack_count_o + 32'd1;
                done_o      <= 1'b1;
            end
        end
    end

    // ------------------------------------------------------------------
    // Read-port drive: active only in the two *_REQ states
    // ------------------------------------------------------------------
    always_comb begin
        obs_rd_en   = 1'b0;
        obs_rd_addr = 8'd0;
        unique case (seq_q)
            ST_POLL_REQ: begin
                obs_rd_en   = 1'b1;
                obs_rd_addr = MSFLG_OFF;
            end
            ST_MSCOM_REQ: begin
                obs_rd_en   = 1'b1;
                obs_rd_addr = MSCOM_OFF;
            end
            default: ;
        endcase
    end

    // ------------------------------------------------------------------
    // Write-port drive: active only in the two WRITE_* states
    // ------------------------------------------------------------------
    always_comb begin
        resp_wr_en   = 1'b0;
        resp_wr_addr = 8'd0;
        resp_wr_data = 32'd0;
        unique case (seq_q)
            ST_WRITE_SMCOM: begin
                resp_wr_en   = 1'b1;
                resp_wr_addr = SMCOM_OFF;
                resp_wr_data = cmd_q;
            end
            ST_WRITE_SMFLG: begin
                resp_wr_en   = 1'b1;
                resp_wr_addr = SMFLG_OFF;
                resp_wr_data = CMD_ACK_BIT;
            end
            default: ;
        endcase
    end

endmodule : sif_dma_ack_peer_stub
+185
View File
@@ -0,0 +1,185 @@
// retroDE_ps2 — sif_dma_ee_ack_peer_stub
//
// Protocol combiner for the first reverse-direction (IOP→EE) combined
// control+data SIF milestone. Mirror of sif_dma_ack_peer_stub with
// polarity swapped to observe the IOP's doorbell from the EE side and
// respond back IOP-ward through the mailbox.
//
// Explicitly NOT an EE. This module has no code execution, no bus
// master, no capability beyond composing two existing SIF primitives.
// Kept under `rtl/sif/` with the other protocol scaffolding so it
// does not get misread as EE maturity progress.
//
// Contract refs:
// docs/contracts/sif.md
//
// Layering:
// sif_mailbox_stub — storage primitive
// sif_dma_ee_ram_bridge_stub — reverse-direction data-plane landing
// sif_dma_ee_ack_peer_stub — THIS module. Ties them together on
// the EE side.
//
// Protocol (one-shot, reverse direction):
// 1. IOP writes SMCOM = cmd (what the IOP wants to say)
// 2. IOP writes SMFLG = CMD_PENDING_BIT (doorbell IOP→EE)
// 3. IOP DMAC ch9 transfers bounded payload through the SIF egress
// bridge, which lands qwords in EE RAM. Bridge's last_seen_o rises
// on the final beat and stays high.
// 4. this peer observes (SMFLG & CMD_PENDING_BIT) AND payload_complete
// 5. peer reads SMCOM (captures the command)
// 6. peer writes MSCOM = cmd (echo back IOP-ward)
// 7. peer writes MSFLG = CMD_ACK_BIT (ack back IOP-ward)
// 8. terminal DONE (one-shot for this milestone)
//
// The peer does NOT clear SMFLG or MSFLG — lifecycle is the TB's
// responsibility, consistent with sif_mailbox_peer_stub's guardrail.
//
// Ordering guarantee: the load-bearing behaviour is that the peer will
// not advance to the ack write sequence unless BOTH the doorbell AND
// payload_complete are observed simultaneously. The milestone is about
// verifying that the ack is gated on data arriving, not just control.
//
// Ports connect to:
// obs_* → sif_mailbox_stub ee_rd_* (peer reads SMFLG, then SMCOM)
// resp_* → sif_mailbox_stub ee_wr_* (peer writes MSCOM, then MSFLG)
// payload_complete ← sif_dma_ee_ram_bridge_stub.last_seen_o
`timescale 1ns/1ps
module sif_dma_ee_ack_peer_stub
#(
    parameter logic [7:0]  MSCOM_OFF       = 8'h00,
    parameter logic [7:0]  SMCOM_OFF       = 8'h10,
    parameter logic [7:0]  MSFLG_OFF       = 8'h20,
    parameter logic [7:0]  SMFLG_OFF       = 8'h30,
    parameter logic [31:0] CMD_PENDING_BIT = 32'h0000_0001,
    parameter logic [31:0] CMD_ACK_BIT     = 32'h0000_0002
) (
    input  logic        clk,
    input  logic        rst_n,
    // Mailbox read port used for observation (SMFLG poll, then SMCOM fetch)
    output logic        obs_rd_en,
    output logic [7:0]  obs_rd_addr,
    input  logic [31:0] obs_rd_data,
    input  logic        obs_rd_valid,
    // Mailbox write port used for the response (MSCOM echo, then MSFLG ack)
    output logic        resp_wr_en,
    output logic [7:0]  resp_wr_addr,
    output logic [31:0] resp_wr_data,
    // Level input: high once the payload has fully landed
    input  logic        payload_complete,
    // Status
    output logic        done_o,
    output logic [31:0] ack_count_o
);
    // Sequencer states. Encoding is plain binary, in protocol order.
    typedef enum logic [2:0] {
        S_POLL_REQ    = 3'd0, // issue read of SMFLG
        S_POLL_WAIT   = 3'd1, // await read data; test doorbell + payload
        S_SMCOM_REQ   = 3'd2, // issue read of SMCOM
        S_SMCOM_WAIT  = 3'd3, // await read data; capture the command
        S_WRITE_MSCOM = 3'd4, // write captured command to MSCOM
        S_WRITE_MSFLG = 3'd5, // write CMD_ACK_BIT to MSFLG
        S_DONE        = 3'd6  // terminal; only reset leaves this state
    } state_e;

    state_e      state;
    state_e      state_nxt;
    logic [31:0] latched_cmd;

    // ------------------------------------------------------------------
    // Next-state decode.
    //
    // The only conditional forward step is S_POLL_WAIT -> S_SMCOM_REQ. It
    // requires, in the same cycle that the SMFLG read data returns, that
    // the pending bit is set AND payload_complete is high. Anything else
    // restarts the poll. This is the ordering guarantee of the combiner.
    // ------------------------------------------------------------------
    always_comb begin
        state_nxt = state; // hold unless a branch below says otherwise
        unique case (state)
            S_POLL_REQ: begin
                state_nxt = S_POLL_WAIT;
            end
            S_POLL_WAIT: begin
                if (obs_rd_valid) begin
                    if (((obs_rd_data & CMD_PENDING_BIT) != 32'd0) &&
                        payload_complete) begin
                        state_nxt = S_SMCOM_REQ;
                    end else begin
                        state_nxt = S_POLL_REQ; // not ready yet: poll again
                    end
                end
            end
            S_SMCOM_REQ: begin
                state_nxt = S_SMCOM_WAIT;
            end
            S_SMCOM_WAIT: begin
                if (obs_rd_valid) state_nxt = S_WRITE_MSCOM;
            end
            S_WRITE_MSCOM: begin
                state_nxt = S_WRITE_MSFLG;
            end
            S_WRITE_MSFLG: begin
                state_nxt = S_DONE;
            end
            S_DONE: begin
                state_nxt = S_DONE;
            end
            default: begin
                state_nxt = S_POLL_REQ;
            end
        endcase
    end

    // ------------------------------------------------------------------
    // Sequencer registers (synchronous, active-low reset).
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state       <= S_POLL_REQ;
            latched_cmd <= 32'd0;
        end else begin
            state <= state_nxt;
            // Capture the command word when the SMCOM read data returns.
            if ((state == S_SMCOM_WAIT) && obs_rd_valid) begin
                latched_cmd <= obs_rd_data;
            end
        end
    end

    // ------------------------------------------------------------------
    // Port drive: purely a function of the current state. Every output
    // idles at zero outside the four request/write states.
    // ------------------------------------------------------------------
    always_comb begin
        obs_rd_en    = 1'b0;
        obs_rd_addr  = 8'd0;
        resp_wr_en   = 1'b0;
        resp_wr_addr = 8'd0;
        resp_wr_data = 32'd0;
        unique case (state)
            S_POLL_REQ: begin
                obs_rd_addr = SMFLG_OFF;
                obs_rd_en   = 1'b1;
            end
            S_SMCOM_REQ: begin
                obs_rd_addr = SMCOM_OFF;
                obs_rd_en   = 1'b1;
            end
            S_WRITE_MSCOM: begin
                resp_wr_addr = MSCOM_OFF;
                resp_wr_data = latched_cmd;
                resp_wr_en   = 1'b1;
            end
            S_WRITE_MSFLG: begin
                resp_wr_addr = MSFLG_OFF;
                resp_wr_data = CMD_ACK_BIT;
                resp_wr_en   = 1'b1;
            end
            default: ;
        endcase
    end

    // ------------------------------------------------------------------
    // Ack bookkeeping. S_WRITE_MSFLG lasts exactly one cycle (it always
    // moves on to S_DONE), so each completed ack is counted once.
    // ------------------------------------------------------------------
    logic ack_fire;
    assign ack_fire = (state == S_WRITE_MSFLG);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            done_o      <= 1'b0;
            ack_count_o <= 32'd0;
        end else if (ack_fire) begin
            done_o      <= 1'b1;
            ack_count_o <= ack_count_o + 32'd1;
        end
    end
endmodule : sif_dma_ee_ack_peer_stub
+202
View File
@@ -0,0 +1,202 @@
// retroDE_ps2 — sif_dma_ee_ram_bridge_stub
//
// Width-adapting bridge from a 32-bit SIF DMA endpoint (IOP→EE egress)
// to the 128-bit EE memory map. Accumulates four incoming 32-bit beats
// into a qword and issues one qword write through ee_memory_map_stub's
// bridge write port.
//
// Mirror of sif_dma_iop_ram_bridge_stub, but in the other direction
// (words → qword, EE-side landing).
//
// Contract refs:
// docs/contracts/sif.md (DMA-linked data movement endpoints)
// docs/contracts/memory.md (EE RAM is 128-bit qword-aligned)
//
// Handshake (upstream, from DMAC ep_* port or equivalent):
// in_valid / in_data[31:0] / in_last / in_ready
// Bridge asserts in_ready whenever it is accumulating (state S_ACCUM),
// i.e. for all four beats of a quad (beat 0 through beat 3). It drops
// in_ready only during the one-cycle emit that follows the final beat
// of a quad (the 4th beat, or an early in_last), so the DMAC naturally
// stalls with back-pressure for a single cycle between qwords.
//
// Handshake (downstream, to ee_memory_map_stub bridge-write port):
// bridge_wr_en / bridge_wr_addr[31:0] / bridge_wr_data[127:0] /
// bridge_wr_be[15:0] / bridge_master_id[7:0]
//
// Data layout (little-endian):
// beat 0 → bridge_wr_data[31:0]
// beat 1 → bridge_wr_data[63:32]
// beat 2 → bridge_wr_data[95:64]
// beat 3 → bridge_wr_data[127:96]
// qword address advances DEST_BASE_ADDR by 16 per emit.
//
// Partial quad on in_last:
// If `in_last` arrives before the 4th beat of a quad, the bridge
// emits the partial qword with wr_be masked to cover only the bytes
// that were actually accepted. Not exercised by the current TB (BCR
// is chosen to be a multiple of 4), but kept defensively.
//
// Payload-complete indication (last_seen_o):
// Level-held output, set when `in_last && accept_beat` fires on the
// upstream handshake. Intended for EE-side protocol combiners that
// need to gate an ack on "payload fully moved" independently of when
// the IOP posted a control doorbell. Latch stays high until reset —
// this mirrors sif_dma_stub.last_seen.
//
// Parameters:
// DEST_BASE_ADDR — byte offset where the first qword lands. Advances
// by 16 per emit for the life of the transfer.
// MASTER_ID — bridge's identity for MEM / EE-map trace attribution
// (default 5 = SIF EE-side bridge).
//
// Non-goals:
// - multiple in-flight qwords
// - arbitration against other bridge writers on the EE map's write path
`timescale 1ns/1ps
module sif_dma_ee_ram_bridge_stub
#(
    parameter logic [31:0] DEST_BASE_ADDR = 32'h0000_0000,
    parameter logic [7:0]  MASTER_ID      = 8'd5
) (
    input  logic         clk,
    input  logic         rst_n,
    // Upstream (DMAC endpoint side)
    input  logic         in_valid,
    input  logic [31:0]  in_data,
    input  logic         in_last,
    output logic         in_ready,
    // Downstream (EE map bridge-write port)
    output logic         bridge_wr_en,
    output logic [31:0]  bridge_wr_addr,
    output logic [127:0] bridge_wr_data,
    output logic [15:0]  bridge_wr_be,
    output logic [7:0]   bridge_master_id,
    // Payload-complete indication (level, latched). Consumers gate on
    // "full payload landed" without needing to count beats.
    output logic         last_seen_o,
    // Ch239 — single-cycle "rewind" pulse. When asserted, the running
    // `wr_offset` returns to 0 so the NEXT emit lands at DEST_BASE_ADDR.
    // Lets a producer that wants single-slot buffer semantics overwrite
    // the same 16-byte slot on every transfer instead of streaming
    // forward. Producers that don't need this leave it at its 1'b0
    // default and the bridge keeps streaming.
    // The pulse must be asserted between transfers; firing mid-transfer
    // (`state==S_EMIT` or `pos != 0`) is illegal and is reported by a
    // sim-only `$error`. The RTL does not gate the rewind in that case.
    input  logic         rewind_i = 1'b0
);
    typedef enum logic [0:0] {
        S_ACCUM = 1'b0, // collecting 32-bit beats into the qword
        S_EMIT  = 1'b1  // one-cycle qword write to the EE map
    } state_e;

    state_e       state;
    logic [127:0] acc_data;  // qword being assembled
    logic [15:0]  acc_be;    // byte enables for the words accepted so far
    logic [1:0]   pos;       // 0..3 word slot within the qword
    logic [31:0]  wr_offset; // running byte offset from DEST_BASE_ADDR

    // Beats are accepted in every S_ACCUM cycle; the emit cycle is the
    // only back-pressure point.
    assign in_ready         = (state == S_ACCUM);
    assign bridge_master_id = MASTER_ID;

    logic accept_beat;
    assign accept_beat = in_valid && in_ready;

    // ------------------------------------------------------------------
    // Accumulator / state machine
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state     <= S_ACCUM;
            acc_data  <= 128'd0;
            acc_be    <= 16'd0;
            pos       <= 2'd0;
            wr_offset <= 32'd0;
        end else begin
            unique case (state)
                S_ACCUM: begin
                    if (accept_beat) begin
                        // Place the incoming word in slot `pos` and mark
                        // its four bytes enabled.
                        acc_data[pos*32 +: 32] <= in_data;
                        acc_be[pos*4 +: 4]     <= 4'b1111;
                        // A full quad or an early in_last both trigger
                        // the emit; a partial quad keeps acc_be masked to
                        // the words actually accepted.
                        if (pos == 2'd3 || in_last) begin
                            state <= S_EMIT;
                        end else begin
                            pos <= pos + 2'd1;
                        end
                    end
                end
                S_EMIT: begin
                    // Single-cycle emit; bridge_wr_en is combinationally
                    // tied to state. Advance the qword offset and clear
                    // the slot / accumulator for the next quad.
                    wr_offset <= wr_offset + 32'd16;
                    acc_data  <= 128'd0;
                    acc_be    <= 16'd0;
                    pos       <= 2'd0;
                    state     <= S_ACCUM;
                end
                default: state <= S_ACCUM;
            endcase

            // Ch239 — rewind. Resets only the streaming offset;
            // `acc_data`/`acc_be`/`pos` are already cleared by the tail
            // of every emit. This assignment is deliberately placed
            // AFTER the case statement: when several nonblocking
            // assignments in one process target the same variable, the
            // last one executed wins. Placed here, a rewind coincident
            // with S_EMIT leaves wr_offset at 0 instead of being
            // overridden by the +16 increment. That combination is still
            // the illegal "rewind mid-transfer" case and is reported by
            // the sim-only check below.
            if (rewind_i) wr_offset <= 32'd0;
        end
    end

`ifndef SYNTHESIS
    // Misuse detector — `rewind_i` while a transfer is in flight is a
    // producer-side bug. Reported here; the RTL path stays ungated.
    always_ff @(posedge clk) begin
        if (rst_n && rewind_i && (state != S_ACCUM || pos != 2'd0)) begin
            $error("[sif_dma_ee_ram_bridge_stub] illegal rewind_i mid-transfer (state=%0d pos=%0d)",
                   state, pos);
        end
    end
`endif

    // ------------------------------------------------------------------
    // Downstream write-port drive (combinational on state). The address
    // uses the current wr_offset, so an emit always lands at the offset
    // held when S_EMIT was entered.
    // ------------------------------------------------------------------
    assign bridge_wr_en   = (state == S_EMIT);
    assign bridge_wr_addr = DEST_BASE_ADDR + wr_offset;
    assign bridge_wr_data = acc_data;
    assign bridge_wr_be   = acc_be;

    // ------------------------------------------------------------------
    // last_seen_o: set once the upstream asserts in_last on a beat that
    // is actually accepted. Level-held until reset.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n)                      last_seen_o <= 1'b0;
        else if (accept_beat && in_last) last_seen_o <= 1'b1;
    end
endmodule : sif_dma_ee_ram_bridge_stub
+139
View File
@@ -0,0 +1,139 @@
// retroDE_ps2 — sif_dma_iop_ram_bridge_stub
//
// Width-adapting bridge from a 128-bit SIF DMA endpoint to the 32-bit
// IOP memory map. Splits each incoming qword into four 32-bit writes at
// consecutive physical addresses starting from DEST_BASE_ADDR.
//
// First real coupling between the SIF data-plane and the IOP side. NOT an
// IOP, not a DMAC, not a peer — just a width/ordering adapter.
//
// Contract refs:
// docs/contracts/sif.md (DMA-linked data movement endpoints)
// docs/contracts/iop.md (IOP-local RAM/I/O decode; writes land
// through the IOP memory map)
//
// Handshake (upstream, from DMAC ep_* port or equivalent):
// in_valid / in_data[127:0] / in_last / in_ready
// Bridge asserts in_ready only while idle. During the four-write
// expansion of a qword, in_ready drops — natural backpressure onto
// whatever's producing qwords.
//
// Handshake (downstream, to iop_memory_map_stub's bridge-write port):
// bridge_wr_en / bridge_wr_addr[31:0] / bridge_wr_data[31:0] /
// bridge_wr_be[3:0] / bridge_master_id[7:0]
// Addresses are physical (no kseg stripping) — the IOP map must treat
// this port's addresses differently from its CPU-side port.
//
// Data layout:
// Little-endian unpacking: in_data[31:0] -> DEST_BASE+0
// in_data[63:32] -> DEST_BASE+4
// in_data[95:64] -> DEST_BASE+8
// in_data[127:96] -> DEST_BASE+12
// Subsequent qwords append: DEST_BASE+16, +20, +24, +28, ...
//
// Parameters:
// DEST_BASE_ADDR — where the bridge starts writing. Persistent across
// the life of the transfer; would become a register in
// a later wave where software programs the target.
// MASTER_ID — bridge's identity in MEM / IOP traces (default 3,
// distinct from EE IFETCH=0, DMAC=1, IOP_CPU=2).
//
// Non-goals:
// - multiple in-flight qwords
// - ack back upstream beyond in_ready / in_last observation
// - byte-enable variation per write (all writes are full 32-bit)
// - arbitration against other masters on the map's write path
`timescale 1ns/1ps
module sif_dma_iop_ram_bridge_stub
#(
    parameter logic [31:0] DEST_BASE_ADDR = 32'h0000_0000,
    parameter logic [7:0]  MASTER_ID      = 8'd3
) (
    input  logic         clk,
    input  logic         rst_n,
    // Upstream: one 128-bit qword per accepted beat
    input  logic         in_valid,
    input  logic [127:0] in_data,
    input  logic         in_last,
    output logic         in_ready,
    // Downstream: 32-bit write port into the IOP memory map
    output logic         bridge_wr_en,
    output logic [31:0]  bridge_wr_addr,
    output logic [31:0]  bridge_wr_data,
    output logic [3:0]   bridge_wr_be,
    output logic [7:0]   bridge_master_id
);
    typedef enum logic [1:0] {
        S_IDLE  = 2'd0, // ready to take a qword
        S_WRITE = 2'd1  // emitting the four 32-bit words
    } state_e;

    state_e       state;
    logic [127:0] latched_qword; // qword currently being split
    logic [1:0]   beat_index;    // which of the four words is on the port
    logic [31:0]  wr_offset;     // running byte offset from DEST_BASE_ADDR

    // A qword is taken only while idle; the four write cycles that follow
    // hold in_ready low, which back-pressures the producer.
    logic accept_new_qword;
    assign in_ready         = (state == S_IDLE);
    assign accept_new_qword = in_valid && in_ready;

    // ------------------------------------------------------------------
    // Sequencer
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state         <= S_IDLE;
            beat_index    <= 2'd0;
            wr_offset     <= 32'd0;
            latched_qword <= 128'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    if (accept_new_qword) begin
                        state         <= S_WRITE;
                        beat_index    <= 2'd0;
                        latched_qword <= in_data;
                    end
                end
                S_WRITE: begin
                    // One 32-bit write per cycle. The 2-bit counter rolls
                    // over from 3 back to 0 on its own, so it is already
                    // zero when the sequencer returns to idle.
                    wr_offset  <= wr_offset + 32'd4;
                    beat_index <= beat_index + 2'd1;
                    if (beat_index == 2'd3) state <= S_IDLE;
                end
                default: state <= S_IDLE;
            endcase
        end
    end

    // ------------------------------------------------------------------
    // Write-port drive (combinational)
    // ------------------------------------------------------------------
    // View the latched qword as four 32-bit lanes; lane 0 is bits [31:0],
    // so words leave in little-endian order.
    logic [3:0][31:0] qword_lanes;
    logic [31:0]      beat_data;
    assign qword_lanes = latched_qword;
    assign beat_data   = qword_lanes[beat_index];

    assign bridge_master_id = MASTER_ID;
    assign bridge_wr_en     = (state == S_WRITE);
    assign bridge_wr_addr   = DEST_BASE_ADDR + wr_offset;
    assign bridge_wr_data   = beat_data;
    assign bridge_wr_be     = 4'b1111; // always a full 32-bit word
endmodule : sif_dma_iop_ram_bridge_stub
+154
View File
@@ -0,0 +1,154 @@
// retroDE_ps2 — sif_dma_stub
//
// Minimal SIF DMA receive-side endpoint. First data-plane step on the SIF
// seam. NOT an IOP — this is a bounded receive buffer that accepts qwords
// from a DMAC channel and exposes them to the TB via a small read port.
// No IOP CPU, no live peer logic, no directional policy beyond "incoming
// qwords land in sequential slots."
//
// Contract refs:
// docs/contracts/sif.md (DMA-linked data movement endpoints)
//
// Receive interface (connects to DMAC's ep_* endpoint):
// in_valid / in_data / in_last / in_ready
// One-cycle accept per beat when in_ready is high. in_last observed
// alongside the final qword of a transfer.
//
// Read interface (TB-side verification):
// rd_en pulses with rd_idx; rd_data / rd_valid return the stored qword
// one cycle later.
//
// Stall input:
// stall_in (level) forces in_ready low while asserted. Used by the
// negative-path test to prove that a not-ready receiver does not let
// the DMAC spuriously complete.
//
// Buffer:
// Small internal array (DEPTH qwords). Full detection is tracked from
// `rx_count`: once `rx_count >= DEPTH` the buffer is full and `in_ready`
// drops so the DMAC stalls in ACTIVE_SEND. No silent wrap. There is no
// consume path yet — once full, the buffer stays full (intentional for
// the current scope). `full_o` is exposed for testbench observation.
//
// Trace:
// One SIF EV_WRITE per accepted beat (one event per cycle).
// arg0 = slot index into the receive buffer
// arg1 = data[63:0] (low half)
// arg2 = source id (hard-wired to 8'd1 = DMAC for Wave 3)
// arg3 = 0
// flags bit 0 = in_last value for this beat
// flags bit 1 = 1 (distinguishes DMA-receive writes from mailbox writes
// if both subsystems are ever instantiated together)
`timescale 1ns/1ps
module sif_dma_stub
    import trace_pkg::*;
#(
    parameter int DEPTH = 8                // max qwords buffered (>= 1)
) (
    input  logic         clk,
    input  logic         rst_n,
    // DMAC-facing receive
    input  logic         in_valid,
    input  logic [127:0] in_data,
    input  logic         in_last,
    output logic         in_ready,
    // TB verification read port. Index width is clamped to at least one
    // bit so DEPTH=1 still yields a legal [0:0] port; for DEPTH >= 2 the
    // width is $clog2(DEPTH), exactly as before.
    input  logic         rd_en,
    input  logic [((DEPTH > 1) ? $clog2(DEPTH) : 1)-1:0] rd_idx,
    output logic [127:0] rd_data,
    output logic         rd_valid,
    // Negative-path control
    input  logic         stall_in,
    // Status
    output logic [31:0]  rx_count,         // monotonic accepted-beat count
    output logic         last_seen,        // sticky: in_last observed
    output logic         full_o,           // buffer full, in_ready=0
    // Trace
    output logic         ev_valid,
    output subsys_e      ev_subsys,
    output event_e       ev_event,
    output logic [63:0]  ev_arg0,
    output logic [63:0]  ev_arg1,
    output logic [63:0]  ev_arg2,
    output logic [63:0]  ev_arg3,
    output logic [31:0]  ev_flags
);
    // Slot-index width. $clog2(1) is 0, which would produce a [-1:0]
    // range and an illegal zero-width size cast below, so the width is
    // clamped to a minimum of one bit.
    localparam int IDX_W = (DEPTH > 1) ? $clog2(DEPTH) : 1;

    logic [127:0]     buf_mem [0:DEPTH-1];
    logic [IDX_W-1:0] wr_ptr;
    logic             beat_accepted;

    // Full is derived from the accepted-beat count. There is no consume
    // path, so once full the buffer stays full until reset and the write
    // pointer never stores past slot DEPTH-1.
    assign full_o        = (rx_count >= DEPTH);
    assign in_ready      = !stall_in && !full_o;
    assign beat_accepted = in_valid && in_ready;

    // ------------------------------------------------------------------
    // Receive path: store each accepted qword in the next slot.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            wr_ptr    <= '0;
            rx_count  <= 32'd0;
            last_seen <= 1'b0;
            for (int i = 0; i < DEPTH; i++) buf_mem[i] <= 128'd0;
        end else if (beat_accepted) begin
            buf_mem[wr_ptr] <= in_data;
            wr_ptr          <= wr_ptr + IDX_W'(1);
            rx_count        <= rx_count + 32'd1;
            if (in_last) last_seen <= 1'b1; // sticky until reset
        end
    end

    // ------------------------------------------------------------------
    // Read port (1-cycle latency). rd_data only updates on rd_en, so it
    // holds the last value read between requests.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            rd_data  <= 128'd0;
            rd_valid <= 1'b0;
        end else begin
            rd_valid <= rd_en;
            if (rd_en) rd_data <= buf_mem[rd_idx];
        end
    end

    // ------------------------------------------------------------------
    // Trace — one event per accepted beat. The payload fields hold their
    // last value between events; only ev_valid drops.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ev_valid  <= 1'b0;
            ev_subsys <= SUBSYS_SIF;
            ev_event  <= EV_WRITE;
            ev_arg0   <= 64'd0;
            ev_arg1   <= 64'd0;
            ev_arg2   <= 64'd0;
            ev_arg3   <= 64'd0;
            ev_flags  <= 32'd0;
        end else if (beat_accepted) begin
            ev_valid  <= 1'b1;
            ev_subsys <= SUBSYS_SIF;
            ev_event  <= EV_WRITE;
            ev_arg0   <= {{(64-IDX_W){1'b0}}, wr_ptr}; // slot being written
            ev_arg1   <= in_data[63:0];                // low half of qword
            ev_arg2   <= 64'd1;                        // DMAC
            ev_arg3   <= 64'd0;
            ev_flags  <= {30'd0, 1'b1, in_last};       // bit1=DMA, bit0=in_last
        end else begin
            ev_valid  <= 1'b0;
        end
    end
endmodule : sif_dma_stub
+184
View File
@@ -0,0 +1,184 @@
// retroDE_ps2 — sif_mailbox_peer_stub
//
// Re-armable active peer for the SIF mailbox. Second step on the two-actor
// coordination track (A'' — lifecycle). Observes one mailbox/flag pattern
// and responds with a known acknowledgement pattern. NOT an IOP — does not
// execute code, does not boot anything, does not claim to be a CPU.
//
// Contract refs:
// docs/contracts/sif.md (mailbox/flag-only SIF stub)
// docs/stub_module_plan.md (Wave 2 SIF track)
//
// Canonical command-echo protocol:
// 1. EE writes MSCOM = cmd
// 2. EE writes MSFLG = CMD_PENDING_BIT (doorbell rising edge)
// 3. peer polls MSFLG; when it sees CMD_PENDING_BIT set AND it has not
// already responded to the current request, it reads MSCOM
// 4. peer writes SMCOM = <the cmd it just read>
// 5. peer writes SMFLG = CMD_ACK_BIT
// 6. peer latches `responded` and resumes polling; it will NOT respond
// again until the TB (or EE) clears CMD_PENDING_BIT in MSFLG
// 7. when the peer observes CMD_PENDING_BIT cleared, `responded` clears
// and the next rising edge of CMD_PENDING_BIT triggers a fresh echo
//
// The peer still does NOT clear any mailbox state itself. Re-arm is the
// TB's responsibility; the peer just refuses to double-fire while the
// doorbell bit is still high.
//
// Ports connect directly to sif_mailbox_stub's IOP-side register port:
// obs_* → mailbox iop_rd_* (peer reads MSFLG then MSCOM)
// resp_* → mailbox iop_wr_* (peer writes SMCOM then SMFLG)
//
// All peer activity is visible through the mailbox's own trace output
// (side_id=IOP=1). The peer does not emit its own trace; `ack_count_o`
// provides a testbench synchronisation point.
`timescale 1ns/1ps
module sif_mailbox_peer_stub
#(
    parameter logic [7:0]  MSCOM_OFF       = 8'h00,
    parameter logic [7:0]  SMCOM_OFF       = 8'h10,
    parameter logic [7:0]  MSFLG_OFF       = 8'h20,
    parameter logic [7:0]  SMFLG_OFF       = 8'h30,
    parameter logic [31:0] CMD_PENDING_BIT = 32'h0000_0001,
    parameter logic [31:0] CMD_ACK_BIT     = 32'h0000_0002
) (
    input  logic        clk,
    input  logic        rst_n,
    // Observation — mailbox read port (MSFLG poll, then MSCOM fetch)
    output logic        obs_rd_en,
    output logic [7:0]  obs_rd_addr,
    input  logic [31:0] obs_rd_data,
    input  logic        obs_rd_valid,
    // Response — mailbox write port (SMCOM echo, then SMFLG ack)
    output logic        resp_wr_en,
    output logic [7:0]  resp_wr_addr,
    output logic [31:0] resp_wr_data,
    // Status
    output logic        done_o,      // sticky: set by the first ack
    output logic [31:0] ack_count_o  // number of acks issued since reset
);
    typedef enum logic [2:0] {
        S_POLL_REQ    = 3'd0, // issue read of MSFLG
        S_POLL_WAIT   = 3'd1, // await read data; decide whether to respond
        S_MSCOM_REQ   = 3'd2, // issue read of MSCOM
        S_MSCOM_WAIT  = 3'd3, // await read data; capture the command
        S_WRITE_SMCOM = 3'd4, // write captured command to SMCOM
        S_WRITE_SMFLG = 3'd5  // write CMD_ACK_BIT to SMFLG
    } state_e;

    state_e      state;
    state_e      state_nxt;
    logic [31:0] latched_cmd;
    logic        responded; // set after an ack; blocks a second ack until
                            // the doorbell bit has been read back as low

    // Doorbell bit as seen in the read data currently on the port.
    logic doorbell_high;
    logic doorbell_low;
    assign doorbell_high = ((obs_rd_data & CMD_PENDING_BIT) != 32'd0);
    assign doorbell_low  = ((obs_rd_data & CMD_PENDING_BIT) == 32'd0);

    // ------------------------------------------------------------------
    // Next-state decode
    // ------------------------------------------------------------------
    always_comb begin
        state_nxt = state; // hold while waiting for read data
        unique case (state)
            S_POLL_REQ: begin
                state_nxt = S_POLL_WAIT;
            end
            S_POLL_WAIT: begin
                if (obs_rd_valid) begin
                    if (responded) begin
                        // Already acked this doorbell: keep polling until
                        // it is seen low (re-arm happens in the register
                        // block below).
                        state_nxt = S_POLL_REQ;
                    end else begin
                        if (doorbell_high) state_nxt = S_MSCOM_REQ;
                        else               state_nxt = S_POLL_REQ;
                    end
                end
            end
            S_MSCOM_REQ: begin
                state_nxt = S_MSCOM_WAIT;
            end
            S_MSCOM_WAIT: begin
                if (obs_rd_valid) state_nxt = S_WRITE_SMCOM;
            end
            S_WRITE_SMCOM: begin
                state_nxt = S_WRITE_SMFLG;
            end
            S_WRITE_SMFLG: begin
                state_nxt = S_POLL_REQ; // back to polling; stays re-armable
            end
            default: begin
                state_nxt = S_POLL_REQ;
            end
        endcase
    end

    // ------------------------------------------------------------------
    // Sequencer registers (synchronous, active-low reset)
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state       <= S_POLL_REQ;
            latched_cmd <= 32'd0;
            responded   <= 1'b0;
        end else begin
            state <= state_nxt;

            // Capture the command word when the MSCOM read data returns.
            if ((state == S_MSCOM_WAIT) && obs_rd_valid) begin
                latched_cmd <= obs_rd_data;
            end

            // Arm the double-fire guard on the ack write; release it only
            // once a poll returns with the doorbell bit low.
            if (state == S_WRITE_SMFLG) begin
                responded <= 1'b1;
            end else if ((state == S_POLL_WAIT) && obs_rd_valid &&
                         responded && doorbell_low) begin
                responded <= 1'b0;
            end
        end
    end

    // ------------------------------------------------------------------
    // Port drive: purely a function of the current state. Every output
    // idles at zero outside the four request/write states.
    // ------------------------------------------------------------------
    always_comb begin
        obs_rd_en    = 1'b0;
        obs_rd_addr  = 8'd0;
        resp_wr_en   = 1'b0;
        resp_wr_addr = 8'd0;
        resp_wr_data = 32'd0;
        unique case (state)
            S_POLL_REQ: begin
                obs_rd_addr = MSFLG_OFF;
                obs_rd_en   = 1'b1;
            end
            S_MSCOM_REQ: begin
                obs_rd_addr = MSCOM_OFF;
                obs_rd_en   = 1'b1;
            end
            S_WRITE_SMCOM: begin
                resp_wr_addr = SMCOM_OFF;
                resp_wr_data = latched_cmd;
                resp_wr_en   = 1'b1;
            end
            S_WRITE_SMFLG: begin
                resp_wr_addr = SMFLG_OFF;
                resp_wr_data = CMD_ACK_BIT;
                resp_wr_en   = 1'b1;
            end
            default: ;
        endcase
    end

    // ------------------------------------------------------------------
    // Ack bookkeeping. S_WRITE_SMFLG lasts exactly one cycle (it always
    // returns to S_POLL_REQ), so each completed ack is counted once.
    // ------------------------------------------------------------------
    logic ack_fire;
    assign ack_fire = (state == S_WRITE_SMFLG);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            done_o      <= 1'b0;
            ack_count_o <= 32'd0;
        end else if (ack_fire) begin
            done_o      <= 1'b1;
            ack_count_o <= ack_count_o + 32'd1;
        end
    end
endmodule : sif_mailbox_peer_stub
+230
View File
@@ -0,0 +1,230 @@
// retroDE_ps2 — sif_mailbox_stub
//
// Minimal EE↔IOP subsystem-interface mailbox shell. First stub on the SIF
// track. Standalone unit — does not yet integrate with any live IOP core.
// Testbenches drive both the EE-side port and the IOP-side port directly,
// playing both roles, to prove the register semantics without requiring
// a full dual-CPU bring-up.
//
// Contract refs:
// docs/stub_module_plan.md (Wave 2, item 10)
// docs/contracts/sif.md (mailbox/flag-only stub is allowed here)
//
// Register surface (offsets within the SIF block):
// 0x00 MSCOM — 32-bit mailbox, conventionally EE→IOP
// 0x10 SMCOM — 32-bit mailbox, conventionally IOP→EE
// 0x20 MSFLG — 32-bit flag word, conventionally EE-owned for set,
// IOP-owned for clear (directional semantics deferred)
// 0x30 SMFLG — 32-bit flag word, conventionally IOP-owned for set,
// EE-owned for clear (directional semantics deferred)
//
// Wave 2 scope intentionally does NOT enforce direction or set/clear
// semantics. Both ports can read and write any register with plain
// replace-on-write. The trace records which side initiated each access
// (side_id in arg2) so future-wave work can layer directional rules on
// top without changing the storage model.
//
// Port semantics:
// Each side (EE / IOP) has an independent register port:
// wr_en, rd_en, addr[7:0], wr_data[31:0], rd_data[31:0], rd_valid
// Reads have 1-cycle latency to match the existing stub ecosystem.
//
// Write arbitration (per-register):
// - EE and IOP writes to *different* registers on the same cycle both
// land. Storage is not serialized across independent registers.
// - EE and IOP writes to the *same* register on the same cycle: EE
// wins, IOP write is dropped that cycle.
// - Trace is limited to one event per cycle by the shared trace bus
// (priority EE > IOP). An IOP write that lands silently when EE is
// driving a different register will not be traced this wave — future
// waves can add a second trace output port if that becomes a gap.
//
// Trace payload schema (SUBSYS_SIF, existing EV_READ/EV_WRITE codes):
// SIF WRITE arg0=offset arg1=data arg2=side_id arg3=0 flags[0]=1
// SIF READ arg0=offset arg1=data arg2=side_id arg3=0 flags[0]=0
// side_id: 0 = EE, 1 = IOP
//
// Trace priority on same cycle: EE write > IOP write > EE read > IOP read.
// In practice TBs drive at most one operation per cycle.
`timescale 1ns/1ps
module sif_mailbox_stub
import trace_pkg::*;
(
input logic clk,
input logic rst_n,
// EE-side register port
input logic ee_wr_en,
input logic ee_rd_en,
input logic [7:0] ee_addr,
input logic [31:0] ee_wr_data,
output logic [31:0] ee_rd_data,
output logic ee_rd_valid,
// IOP-side register port
input logic iop_wr_en,
input logic iop_rd_en,
input logic [7:0] iop_addr,
input logic [31:0] iop_wr_data,
output logic [31:0] iop_rd_data,
output logic iop_rd_valid,
// Trace
output logic ev_valid,
output subsys_e ev_subsys,
output event_e ev_event,
output logic [63:0] ev_arg0,
output logic [63:0] ev_arg1,
output logic [63:0] ev_arg2,
output logic [63:0] ev_arg3,
output logic [31:0] ev_flags
);
localparam logic [7:0] MSCOM_OFF = 8'h00;
localparam logic [7:0] SMCOM_OFF = 8'h10;
localparam logic [7:0] MSFLG_OFF = 8'h20;
localparam logic [7:0] SMFLG_OFF = 8'h30;
localparam logic [63:0] SIDE_EE = 64'd0;
localparam logic [63:0] SIDE_IOP = 64'd1;
// ------------------------------------------------------------------
// Register file
// ------------------------------------------------------------------
logic [31:0] mscom;
logic [31:0] smcom;
logic [31:0] msflg;
logic [31:0] smflg;
function automatic logic [31:0] select_reg(input logic [7:0] offset,
input logic [31:0] mscom_v,
input logic [31:0] smcom_v,
input logic [31:0] msflg_v,
input logic [31:0] smflg_v);
case (offset)
MSCOM_OFF: select_reg = mscom_v;
SMCOM_OFF: select_reg = smcom_v;
MSFLG_OFF: select_reg = msflg_v;
SMFLG_OFF: select_reg = smflg_v;
default: select_reg = 32'hDEAD_BEEF;
endcase
endfunction
// Per-register write arbitration: EE wins on same-register collision,
// but writes to different registers land independently.
logic ee_hits_mscom, ee_hits_smcom, ee_hits_msflg, ee_hits_smflg;
logic iop_hits_mscom, iop_hits_smcom, iop_hits_msflg, iop_hits_smflg;
assign ee_hits_mscom = ee_wr_en && (ee_addr == MSCOM_OFF);
assign ee_hits_smcom = ee_wr_en && (ee_addr == SMCOM_OFF);
assign ee_hits_msflg = ee_wr_en && (ee_addr == MSFLG_OFF);
assign ee_hits_smflg = ee_wr_en && (ee_addr == SMFLG_OFF);
assign iop_hits_mscom = iop_wr_en && (iop_addr == MSCOM_OFF);
assign iop_hits_smcom = iop_wr_en && (iop_addr == SMCOM_OFF);
assign iop_hits_msflg = iop_wr_en && (iop_addr == MSFLG_OFF);
assign iop_hits_smflg = iop_wr_en && (iop_addr == SMFLG_OFF);
always_ff @(posedge clk) begin
if (!rst_n) begin
mscom <= 32'd0;
smcom <= 32'd0;
msflg <= 32'd0;
smflg <= 32'd0;
end else begin
if (ee_hits_mscom) mscom <= ee_wr_data;
else if (iop_hits_mscom) mscom <= iop_wr_data;
if (ee_hits_smcom) smcom <= ee_wr_data;
else if (iop_hits_smcom) smcom <= iop_wr_data;
if (ee_hits_msflg) msflg <= ee_wr_data;
else if (iop_hits_msflg) msflg <= iop_wr_data;
if (ee_hits_smflg) smflg <= ee_wr_data;
else if (iop_hits_smflg) smflg <= iop_wr_data;
end
end
// ------------------------------------------------------------------
// Read path. Each port is registered and independent of the other:
// *_rd_valid is *_rd_en delayed by one clock, and *_rd_data is
// captured in the cycle the request is presented. With no request the
// data register keeps its previous contents.
// ------------------------------------------------------------------
always_ff @(posedge clk) begin
    if (!rst_n) begin
        ee_rd_valid  <= 1'b0;
        iop_rd_valid <= 1'b0;
        ee_rd_data   <= 32'd0;
        iop_rd_data  <= 32'd0;
    end else begin
        // Valid strobes simply follow the request strobes.
        ee_rd_valid  <= ee_rd_en;
        iop_rd_valid <= iop_rd_en;

        // EE port: sample the addressed register.
        if (ee_rd_en) begin
            ee_rd_data <= select_reg(ee_addr, mscom, smcom, msflg, smflg);
        end

        // IOP port: sample the addressed register.
        if (iop_rd_en) begin
            iop_rd_data <= select_reg(iop_addr, mscom, smcom, msflg, smflg);
        end
    end
end
// ------------------------------------------------------------------
// Trace emission. At most one event is registered per clock, chosen
// by fixed priority: EE write, then IOP write, then EE read, then IOP
// read. Lower-priority activity in the same cycle is not traced.
//
// Event layout:
//   ev_arg0  = zero-extended register offset
//   ev_arg1  = zero-extended data (write data, or for reads the value
//              the read path captures in this same cycle)
//   ev_arg2  = SIDE_EE / SIDE_IOP
//   ev_arg3  = 0
//   ev_flags = 1 for writes, 0 for reads
// When nothing is happening only ev_valid is cleared; the remaining
// fields keep their last values.
// ------------------------------------------------------------------
always_ff @(posedge clk) begin
    if (!rst_n) begin
        ev_valid  <= 1'b0;
        ev_subsys <= SUBSYS_SIF;
        ev_event  <= EV_READ;
        ev_flags  <= 32'd0;
        ev_arg0   <= 64'd0;
        ev_arg1   <= 64'd0;
        ev_arg2   <= 64'd0;
        ev_arg3   <= 64'd0;
    end else if (ee_wr_en) begin
        // Priority 1: EE-side write.
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_SIF;
        ev_event  <= EV_WRITE;
        ev_flags  <= 32'h0000_0001;
        ev_arg0   <= {56'd0, ee_addr};
        ev_arg1   <= {32'd0, ee_wr_data};
        ev_arg2   <= SIDE_EE;
        ev_arg3   <= 64'd0;
    end else if (iop_wr_en) begin
        // Priority 2: IOP-side write.
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_SIF;
        ev_event  <= EV_WRITE;
        ev_flags  <= 32'h0000_0001;
        ev_arg0   <= {56'd0, iop_addr};
        ev_arg1   <= {32'd0, iop_wr_data};
        ev_arg2   <= SIDE_IOP;
        ev_arg3   <= 64'd0;
    end else if (ee_rd_en) begin
        // Priority 3: EE-side read.
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_SIF;
        ev_event  <= EV_READ;
        ev_flags  <= 32'd0;
        ev_arg0   <= {56'd0, ee_addr};
        ev_arg1   <= {32'd0, select_reg(ee_addr, mscom, smcom, msflg, smflg)};
        ev_arg2   <= SIDE_EE;
        ev_arg3   <= 64'd0;
    end else if (iop_rd_en) begin
        // Priority 4: IOP-side read.
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_SIF;
        ev_event  <= EV_READ;
        ev_flags  <= 32'd0;
        ev_arg0   <= {56'd0, iop_addr};
        ev_arg1   <= {32'd0, select_reg(iop_addr, mscom, smcom, msflg, smflg)};
        ev_arg2   <= SIDE_IOP;
        ev_arg3   <= 64'd0;
    end else begin
        // Idle cycle: drop valid, leave the payload fields untouched.
        ev_valid  <= 1'b0;
    end
end
endmodule : sif_mailbox_stub
+45
View File
@@ -0,0 +1,45 @@
// retroDE_ps2 — de25_nano_pll_stub (Ch151)
//
// Sim-friendly stub matching the Quartus IOPLL "pll" module signature
// used by sibling cores (retroDE_nes/ip/pll/pll_bb.v and
// retroDE_splash/ip/sys_pll/sys_pll_bb.v). Real synthesis swaps this
// stub for Terasic-supplied IP via Quartus's IP catalog and a
// `\`ifdef USE_PLL_IP` gate in the board top.
//
// Behavior:
// - `outclk_0` is a direct pass-through of `refclk` (no PLL
// multiplication; sim doesn't need a different frequency, and a
// pass-through still exercises the PLL-gated reset bridge in the
// Ch149 board top).
// - `locked` rises after a small post-reset delay (~32 cycles),
// mimicking real-IP behavior where lock acquires after rst goes
// low. Held LOW while `rst` is HIGH.
//
// The signature matches Quartus's IOPLL exactly so swapping in the
// real IP is a single `\`ifdef` at instantiation; the rest of the
// board top is unchanged.
`timescale 1ns/1ps
module de25_nano_pll_stub (
    input  wire refclk,    // reference clock input
    input  wire rst,       // active-HIGH asynchronous reset
    output wire outclk_0,  // same signal as refclk (no frequency change)
    output wire locked     // high once the lock counter has saturated
);
    // Number of refclk rising edges, counted from rst release, after
    // which `locked` asserts.
    localparam logic [5:0] LOCK_CYCLES = 6'd32;

    // Saturating lock counter: cleared asynchronously while rst is
    // high, then counts up to LOCK_CYCLES and stays there.
    logic [5:0] lock_cnt;

    // The "PLL" output is a plain pass-through of the reference clock.
    assign outclk_0 = refclk;

    // Lock is reported exactly when the counter sits at its ceiling,
    // so it drops immediately when rst clears the counter.
    assign locked = (lock_cnt == LOCK_CYCLES);

    always_ff @(posedge refclk or posedge rst) begin
        if (rst) begin
            lock_cnt <= 6'd0;
        end else if (lock_cnt < LOCK_CYCLES) begin
            lock_cnt <= lock_cnt + 6'd1;
        end
    end
endmodule : de25_nano_pll_stub
File diff suppressed because it is too large Load Diff
+666
View File
@@ -0,0 +1,666 @@
// retroDE_ps2 — top_psmct32_raster_demo (Ch146)
//
// First hardware-targeted top wrapper, structured around the Ch123 PSMCT32
// raster end-to-end demo (the simplest direct-color path; see the Ch144
// hardware-readiness report in docs/contracts/gif_gs.md for rationale and
// dep-tree audit). This module is the one a board-level synthesis project
// would target — board-level concerns (HDMI/VGA PHY, pin constraints,
// .mem bake tooling, clock-domain crossings) are deliberately deferred to
// later chapters. Ch146's job is to prove the design can be expressed as
// a single SystemVerilog module with a sensible top-level shape.
//
// Topology mirrors the Ch123 TB exactly — the 11 modules in the Ch144
// dep tree, all instantiated here, with hardware-friendly tweaks:
//
// bios_rom_stub#(.IMAGE_FILE(BIOS_IMAGE_FILE)) — EE bootlet at 0xBFC0_0000
// ee_ram_stub#(.IMAGE_FILE(PAYLOAD_IMAGE_FILE)) — GIF payload at phys 0x100
// ee_memory_map_stub#(.USEG_SHADOW_WORDS_PARAM(1024)) — Ch145 BRAM shrink
// ee_core_stub#(.PC_RESET(0xBFC00000)) — MIPS R5900 core
// ee_gs_priv_bridge_stub — 32-bit MMIO → 64-bit GS-priv
// dmac_reg_stub — DMAC ch2
// gif_packed_stub#(.REAL_AD_REG_MAP(1'b1)) — GIFtag + PACKED A+D parser
// gs_stub#(.PSMCT32_SWIZZLE(1'b1)) — GS register file + raster
// gif_image_xfer_stub#(.PSMCT32_SWIZZLE(1'b1)) — TRXDIR/IMAGE engine (idle in Ch123)
// vram_stub#(.BYTES(8192)) — 8 KiB VRAM (one PSMCT32 page)
// gs_pcrtc_stub#(.PSMCT32_SWIZZLE(1'b1)) — PCRTC scanout
//
// Differences from the Ch123 TB:
// - No procedural ee_prog_word() / preload_qword() drives. The BIOS
// bootlet and GIF payload are preloaded by `$readmemh` from the
// IMAGE_FILE parameters (default empty = synthetic NOP-sled fallback
// in bios_rom_stub for a "won't crash on power-up" smoke baseline,
// and an all-zeros ee_ram_stub which yields no DMAC payload but a
// stable PCRTC frame).
// - useg_shadow_mem trimmed to 1024 words (4 KiB) via Ch145
// parameter — no useg traffic in the Ch123 data plane.
// - All trace event outputs left open. Status is exposed as a
// debug bundle (core_halt, dma_done_seen, frame_seen) that a
// board can wire to LEDs.
// - The Ch123 TB's collision-check `$error` and observer counters
// are TB-only and do not appear here. (Their checks land in the
// focused Ch146 TB tb_top_psmct32_raster_demo.sv instead.)
//
// Top-level ports:
// clk, rst_n — single clock domain, active-low synchronous reset
// core_go — pulsed high for one cycle to start the EE bootlet
// (a board reset-release sequencer can tie it high
// after rst_n deasserts)
// r/g/b, hsync, vsync, de — 8-bit RGB scanout (PCRTC active region)
// core_halt — high once SYSCALL halts the EE
// dma_done_seen — sticky: high once DMAC channel-2 fires its DONE event
// frame_seen — sticky: high once the first PCRTC end-of-frame fires
// raster_overflow — forwarded from gs_stub's raster_overflow output
// frame_toggle — flips once per detected end-of-frame (Ch174)
// dma_done_toggle — flips once per DMAC DONE event (Ch174)
//
// Parameters:
// H_ACTIVE / V_ACTIVE — PCRTC active region (defaults to the Ch123 16×8)
// BIOS_SIZE_BYTES — bios_rom_stub size (default 4 KiB)
// RAM_SIZE_BYTES — ee_ram_stub size (default 4 KiB)
// VRAM_BYTES — vram_stub size (default 8 KiB)
// USEG_SHADOW_WORDS_PARAM — Ch145 useg-shadow size (default 1024 = 4 KiB)
// PSMCT32_SWIZZLE — PSMCT32 swizzle gate, forwarded together to gs_stub,
// gif_image_xfer_stub and gs_pcrtc_stub (default 1)
//
// Macros (NOT parameters — an iverilog-12 string-parameter forwarding
// limitation forced them to be macros; see the ifndef/define block
// that follows the timescale directive):
// TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE — path to bios.mem
// (one 32-bit hex word/line)
// TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE — path to payload.mem
// (one 128-bit hex qword/line)
// Both default to "" so the wrapper is still elaborable without
// fixtures (synthetic NOP-sled in bios_rom_stub + zero-init
// ee_ram_stub, which produces no DMAC payload but a stable PCRTC
// frame). On synthesis these become FPGA-tool defines.
//
// PASS for the Ch146 focused TB matches Ch123 exactly:
// dma=(1,24,1) ee_dmac_wr=3 giftags=4 ad_writes=20 xfer_writes=0
// ee_priv_wr=4 bridge_fires=4 core_halt=1 emits=128 frame=16x8
`timescale 1ns/1ps
// BIOS / payload image paths are passed via macros (iverilog-12
// limitation: string parameter forwarding through hierarchy
// elaborates inconsistently). On synthesis the same macros become
// FPGA-tool defines pointing at .mem fixtures or board-specific
// files. The macros default to empty strings (synthetic NOP-sled +
// zero-RAM fallback in bios_rom_stub / ee_ram_stub) so the wrapper
// is still elaborable without bake artifacts present.
`ifndef TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE
`define TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE ""
`endif
`ifndef TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE
`define TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE ""
`endif
module top_psmct32_raster_demo
import trace_pkg::*;
#(
// PCRTC active-region size, forwarded to gs_pcrtc_stub.
parameter int H_ACTIVE = 16,
parameter int V_ACTIVE = 8,
// Memory sizes in bytes, forwarded to bios_rom_stub, ee_ram_stub and
// vram_stub respectively. The BIOS and RAM sizes also set the local
// address widths (BIOS_ADDR_W / RAM_ADDR_W) below.
parameter int BIOS_SIZE_BYTES = 4 * 1024,
parameter int RAM_SIZE_BYTES = 4 * 1024,
parameter int VRAM_BYTES = 8 * 1024,
// Forwarded to ee_memory_map_stub's parameter of the same name.
parameter int unsigned USEG_SHADOW_WORDS_PARAM = 1024,
// Brick 1 — PSMCT32 page/block swizzle gate. Default 1 preserves
// the Ch123/Ch251 swizzled raster+scanout behavior (and every
// existing TB that drives this top). A TEXTURED-sprite demo
// fixture sets this to 0 so the linear gs_texel_addr fetch and the
// BITBLT upload land in the SAME (linear) VRAM layout — avoiding
// the swizzle reconciliation the gs_stub TODO flags. The gate is
// forwarded to gs_stub / gif_image_xfer_stub / gs_pcrtc_stub
// together so all three VRAM views stay consistent.
parameter bit PSMCT32_SWIZZLE = 1'b1
) (
// Single clock. rst_n is active-low and is only ever sampled on
// posedge clk by the always_ff blocks in this module.
input logic clk,
input logic rst_n,
// Forwarded unchanged to ee_core_stub.go_i.
input logic core_go,
// Video outputs, driven directly by gs_pcrtc_stub.
output logic [7:0] r,
output logic [7:0] g,
output logic [7:0] b,
output logic hsync,
output logic vsync,
output logic de,
// Driven directly by ee_core_stub.halt_o.
output logic core_halt,
// Sticky status flags: set by the first DMAC EV_DMA_DONE event and the
// first detected vsync rising edge respectively; cleared only by reset.
output logic dma_done_seen,
output logic frame_seen,
// Driven directly by gs_stub.raster_overflow.
output logic raster_overflow,
// Ch174 — event toggles for HPS-visible counters. See the
// mirror block in top_psmct32_raster_demo_bram.sv for the full
// pulse-CDC contract. Toggle, not pulse — by design.
output logic frame_toggle,
output logic dma_done_toggle
);
// Address widths derived from the configured memory sizes. They are
// used below to truncate the wider addresses produced by the memory map.
localparam int RAM_ADDR_W = $clog2(RAM_SIZE_BYTES);
localparam int BIOS_ADDR_W = $clog2(BIOS_SIZE_BYTES);
// ---------------------------------------------------------------------
// ee_ram_stub — DMAC-side GIF payload
// ---------------------------------------------------------------------
logic ram_rd_en;
logic [RAM_ADDR_W-1:0] ram_rd_addr;
logic [127:0] ram_rd_data;
logic ram_rd_valid;
// Top has no TB-direct write path; the wr_* ports are tied off.
// master_id reads 1 while a read is being requested and 0 otherwise.
logic [7:0] ram_master_id;
assign ram_master_id = ram_rd_en ? 8'd1 : 8'd0;
ee_ram_stub #(
.SIZE_BYTES(RAM_SIZE_BYTES),
.IMAGE_FILE(`TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE)
) u_ram (
.clk(clk), .rst_n(rst_n),
.rd_en(ram_rd_en), .rd_addr(ram_rd_addr),
.rd_data(ram_rd_data), .rd_valid(ram_rd_valid),
.wr_en(1'b0), .wr_addr('0), .wr_data(128'd0), .wr_be(16'd0),
.master_id(ram_master_id),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// ---------------------------------------------------------------------
// bios_rom_stub — EE bootlet at 0xBFC0_0000
// ---------------------------------------------------------------------
logic bios_rd_en;
logic [21:0] bios_rd_addr_full;
logic [BIOS_ADDR_W-1:0] bios_rd_addr;
logic bios_rd_valid;
logic [31:0] bios_rd_data;
// Only the low BIOS_ADDR_W bits of the memory map's 22-bit BIOS address
// reach the ROM; the upper bits are dropped.
assign bios_rd_addr = bios_rd_addr_full[BIOS_ADDR_W-1:0];
bios_rom_stub #(
.SIZE_BYTES(BIOS_SIZE_BYTES),
.IMAGE_FILE(`TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE)
) u_bios (
.clk(clk), .rst_n(rst_n),
.rd_en(bios_rd_en),
.rd_addr(bios_rd_addr),
.rd_data(bios_rd_data),
.rd_valid(bios_rd_valid),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// ---------------------------------------------------------------------
// dmac_reg_stub — channel-2 NORMAL transfer
//
// Register reads are unused here (reg_rd_en tied low, read outputs
// open). Of the trace outputs only ev_valid / ev_event are consumed
// (by the DMA-done status logic at the bottom of this module);
// dmac_ev_subsys is connected but not examined.
// ---------------------------------------------------------------------
logic dmac_reg_wr_en;
logic [7:0] dmac_reg_offset;
logic [31:0] dmac_reg_wr_data;
logic dmac_mem_rd_en;
logic [31:0] dmac_mem_rd_addr;
logic dmac_gif_valid;
logic [127:0] dmac_gif_data;
logic dmac_gif_last;
logic dmac_gif_ready;
logic dmac_ev_valid;
subsys_e dmac_ev_subsys;
event_e dmac_ev_event;
logic [127:0] map_to_dmac_rd_data;
logic map_to_dmac_rd_valid;
dmac_reg_stub u_dmac (
.clk(clk), .rst_n(rst_n),
.reg_wr_en(dmac_reg_wr_en), .reg_offset(dmac_reg_offset),
.reg_wr_data(dmac_reg_wr_data),
.reg_rd_en(1'b0), .reg_rd_data(), .reg_rd_valid(),
.mem_rd_en(dmac_mem_rd_en), .mem_rd_addr(dmac_mem_rd_addr),
.mem_rd_data(map_to_dmac_rd_data), .mem_rd_valid(map_to_dmac_rd_valid),
.ep_valid(dmac_gif_valid), .ep_data(dmac_gif_data),
.ep_last(dmac_gif_last), .ep_ready(dmac_gif_ready),
.irq_completion_o(),
.ev_valid(dmac_ev_valid), .ev_subsys(dmac_ev_subsys),
.ev_event(dmac_ev_event),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// ---------------------------------------------------------------------
// ee_memory_map_stub — bus arbiter (USEG_SHADOW shrunk per Ch145)
//
// Connected here: the EE CPU port, the DMAC read port, the BIOS and RAM
// read ports, the DMAC ch2 register-write port and the GS-privileged
// write port. The bridge write port and the DMAC-ch2-read / INTC /
// misc-MMIO / BIU read-data inputs are tied to constants, and the
// corresponding outputs are left open.
// ---------------------------------------------------------------------
logic ee_cpu_rd_en;
logic [31:0] ee_cpu_rd_addr;
logic [31:0] ee_cpu_rd_data;
logic ee_cpu_rd_valid;
logic ee_cpu_wr_en;
logic [31:0] ee_cpu_wr_addr;
logic [31:0] ee_cpu_wr_data;
logic [3:0] ee_cpu_wr_be;
logic map_gs_priv_wr_en;
logic [15:0] map_gs_priv_wr_addr;
logic [31:0] map_gs_priv_wr_data;
logic [3:0] map_gs_priv_wr_be;
logic map_ram_rd_en;
logic [24:0] map_ram_rd_addr;
ee_memory_map_stub #(
.USEG_SHADOW_WORDS_PARAM(USEG_SHADOW_WORDS_PARAM)
) u_map (
.clk(clk), .rst_n(rst_n),
.ee_rd_en (ee_cpu_rd_en),
.ee_rd_addr(ee_cpu_rd_addr),
.ee_rd_data(ee_cpu_rd_data),
.ee_rd_valid(ee_cpu_rd_valid),
.ee_wr_en (ee_cpu_wr_en),
.ee_wr_addr(ee_cpu_wr_addr),
.ee_wr_data(ee_cpu_wr_data),
.ee_wr_be (ee_cpu_wr_be),
.dmac_rd_en(dmac_mem_rd_en), .dmac_rd_addr(dmac_mem_rd_addr),
.dmac_rd_data(map_to_dmac_rd_data),
.dmac_rd_valid(map_to_dmac_rd_valid),
.bios_rd_en (bios_rd_en),
.bios_rd_addr(bios_rd_addr_full),
.bios_rd_data(bios_rd_data),
.bios_rd_valid(bios_rd_valid),
.ram_rd_en(map_ram_rd_en), .ram_rd_addr(map_ram_rd_addr),
.ram_rd_data(ram_rd_data), .ram_rd_valid(ram_rd_valid),
.bridge_wr_en(1'b0), .bridge_wr_addr(32'd0),
.bridge_wr_data(128'd0), .bridge_wr_be(16'd0),
.bridge_master_id(8'd0),
.ram_wr_en(), .ram_wr_addr(), .ram_wr_data(),
.ram_wr_be(), .ram_master_id(),
.ee_dmac_ch2_wr_en (dmac_reg_wr_en),
.ee_dmac_ch2_wr_addr(dmac_reg_offset),
.ee_dmac_ch2_wr_data(dmac_reg_wr_data),
.ee_dmac_ch2_rd_en(), .ee_dmac_ch2_rd_addr(),
.ee_dmac_ch2_rd_data(32'd0), .ee_dmac_ch2_rd_valid(1'b0),
.ee_intc_wr_en(), .ee_intc_wr_addr(), .ee_intc_wr_data(),
.ee_intc_rd_en(), .ee_intc_rd_addr(),
.ee_intc_rd_data(32'd0), .ee_intc_rd_valid(1'b0),
.ee_misc_mmio_wr_en(), .ee_misc_mmio_wr_addr(), .ee_misc_mmio_wr_data(), .ee_misc_mmio_wr_be(),
.ee_misc_mmio_rd_en(), .ee_misc_mmio_rd_addr(),
.ee_misc_mmio_rd_data(32'd0), .ee_misc_mmio_rd_valid(1'b0),
.ee_biu_wr_en(), .ee_biu_wr_addr(), .ee_biu_wr_data(), .ee_biu_wr_be(),
.ee_biu_rd_en(), .ee_biu_rd_addr(),
.ee_biu_rd_data(32'd0), .ee_biu_rd_valid(1'b0),
.ee_gs_priv_wr_en (map_gs_priv_wr_en),
.ee_gs_priv_wr_addr(map_gs_priv_wr_addr),
.ee_gs_priv_wr_data(map_gs_priv_wr_data),
.ee_gs_priv_wr_be (map_gs_priv_wr_be),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// RAM read request passes straight through; the map's 25-bit address is
// truncated to the low RAM_ADDR_W bits, so addresses beyond
// RAM_SIZE_BYTES wrap onto the RAM.
assign ram_rd_en = map_ram_rd_en;
assign ram_rd_addr = map_ram_rd_addr[RAM_ADDR_W-1:0];
// ---------------------------------------------------------------------
// ee_core_stub
//
// cpu_irq is tied low. core_pc and core_trap are connected but not
// consumed anywhere else in this module.
// ---------------------------------------------------------------------
logic [31:0] core_pc;
logic core_trap;
ee_core_stub #(
.PC_RESET(32'hBFC0_0000),
.STRICT_UNSUPPORTED(1'b0)
) u_core (
.clk(clk), .rst_n(rst_n),
.go_i(core_go),
.map_rd_en (ee_cpu_rd_en),
.map_rd_addr(ee_cpu_rd_addr),
.map_rd_data(ee_cpu_rd_data),
.map_rd_valid(ee_cpu_rd_valid),
.map_wr_en (ee_cpu_wr_en),
.map_wr_addr(ee_cpu_wr_addr),
.map_wr_data(ee_cpu_wr_data),
.map_wr_be (ee_cpu_wr_be),
.cpu_irq(1'b0),
.halt_o(core_halt),
.pc_o (core_pc),
.trap_o(core_trap),
.trap_pc_o(),
.trap_instr_o(),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// ---------------------------------------------------------------------
// gif_packed_stub
// ---------------------------------------------------------------------
logic gif_in_ready;
logic [7:0] gif_gif_reg_num;
logic gif_gif_reg_wr_en;
logic [63:0] gif_gif_reg_data;
logic gif_image_data_valid;
logic [127:0] gif_image_data;
logic gif_image_data_last;
logic xfer_data_ready;
// Ch172 — raster FIFO full from gs_stub, fed back into gif_packed_stub.
logic gs_raster_fifo_full;
gif_packed_stub #(.REAL_AD_REG_MAP(1'b1)) u_gif (
.clk(clk), .rst_n(rst_n),
.in_valid(dmac_gif_valid), .in_data(dmac_gif_data),
.in_last(dmac_gif_last), .in_ready(gif_in_ready),
.image_data_valid(gif_image_data_valid),
.image_data(gif_image_data),
.image_data_last(gif_image_data_last),
.image_data_ready(xfer_data_ready),
.raster_fifo_full(gs_raster_fifo_full),
.gs_wr_en(), .gs_wr_addr(), .gs_wr_data(),
.gif_reg_wr_en(gif_gif_reg_wr_en),
.gif_reg_num(gif_gif_reg_num),
.gif_reg_data(gif_gif_reg_data),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// DMAC ready follows gif_packed_stub's in_ready directly (Ch110
// image-xfer backpressure propagates through gif_packed_stub).
assign dmac_gif_ready = gif_in_ready;
// ---------------------------------------------------------------------
// gs_stub — PSMCT32 raster (swizzle selected by PSMCT32_SWIZZLE)
// ---------------------------------------------------------------------
logic priv_reg_wr_en;
logic [15:0] priv_reg_wr_addr;
logic [63:0] priv_reg_wr_data;
logic [63:0] pmode_q, dispfb1_q, display1_q;
logic [63:0] bitbltbuf_q, trxpos_q, trxreg_q, trxdir_q;
logic trxdir_wr_q;
logic raster_pixel_emit;
logic [63:0] raster_pixel_color_q;
logic [31:0] raster_pixel_fb_addr_q;
logic [3:0] raster_pixel_be_q;
logic [31:0] raster_pixel_mask_q;
// Brick 1 — texture-sampler read port out of gs_stub. Wired to
// vram_stub's SECOND read port (read2) below. In this top there is
// no clut_loader_stub instantiated (clut_enable=0 at the PCRTC), so
// read2 is dedicated to the texel fetch; the mux contract (CLUT load
// at TEX0 commit vs texel fetch during scan) is documented at the
// read2 wiring site.
logic gs_tex_rd_en;
logic [31:0] gs_tex_rd_addr;
logic [31:0] gs_tex_rd_data;
// Brick 2a — dest-framebuffer read port for alpha blending. Wired
// to vram_stub.read2 below, arbitrated with the texel-fetch port.
// vram_stub.read2 is COMBINATIONAL, so FB_RD_REGISTERED defaults to
// 0 (dest data valid the same cycle the S2 address is presented).
logic gs_fb_rd_en;
logic [31:0] gs_fb_rd_addr;
logic [31:0] gs_fb_rd_data;
// Brick 2b — Z-buffer stored-Z read port. Wired to vram_stub.read2
// below, arbitrated with the texel-fetch + alpha dest-fb ports.
// vram_stub.read2 is COMBINATIONAL, so Z_RD_REGISTERED defaults to 0.
logic gs_z_rd_en;
logic [31:0] gs_z_rd_addr;
logic [31:0] gs_z_rd_data;
gs_stub #(
.PSMCT32_SWIZZLE(PSMCT32_SWIZZLE)
) u_gs (
.clk(clk), .rst_n(rst_n),
.reg_wr_en (priv_reg_wr_en),
.reg_wr_addr(priv_reg_wr_addr),
.reg_wr_data(priv_reg_wr_data),
.gif_reg_wr_en(gif_gif_reg_wr_en),
.gif_reg_num (gif_gif_reg_num),
.gif_reg_data (gif_gif_reg_data),
.bg_r(), .bg_g(), .bg_b(),
.pmode_q(pmode_q), .dispfb1_q(dispfb1_q), .display1_q(display1_q),
.prim_q(), .rgbaq_q(),
.xyz2_q(), .xyzf2_q(),
.frame_1_q(), .zbuf_1_q(),
.tex0_1_q(), .tex0_1_cbp_q(), .tex0_1_cpsm_q(),
.tex0_1_csm_q(), .tex0_1_csa_q(), .tex0_1_cld_q(), .tex0_1_wr_q(),
.bitbltbuf_q(bitbltbuf_q),
.trxpos_q(trxpos_q),
.trxreg_q(trxreg_q),
.trxdir_q(trxdir_q),
.trxdir_wr_q(trxdir_wr_q),
.prim_complete(), .prim_complete_count(),
.prim_v0_q(), .prim_v1_q(), .prim_v2_q(),
.prim_color_q(),
.prim_color_v0_q(), .prim_color_v1_q(), .prim_color_v2_q(),
.prim_v0_decoded_q(), .prim_v1_decoded_q(), .prim_v2_decoded_q(),
.prim_v0_color_decoded_q(), .prim_v1_color_decoded_q(), .prim_v2_color_decoded_q(),
.pixel_emit(), .pixel_emit_count(),
.pixel_x_q(), .pixel_y_q(),
.pixel_color_q(),
.pixel_fbp_q(), .pixel_fbw_q(), .pixel_psm_q(), .pixel_fb_addr_q(),
.raster_pixel_emit(raster_pixel_emit),
.raster_pixel_emit_count(),
.raster_pixel_x_q(), .raster_pixel_y_q(),
.raster_pixel_color_q(raster_pixel_color_q),
.raster_pixel_fb_addr_q(raster_pixel_fb_addr_q),
.raster_pixel_be_q(raster_pixel_be_q),
.raster_pixel_mask_q(raster_pixel_mask_q),
.raster_pixel_psm_q(),
.raster_active(),
.raster_overflow(raster_overflow),
.raster_fifo_full(gs_raster_fifo_full),
.raster_degenerate(),
.tex_rd_en (gs_tex_rd_en),
.tex_rd_addr(gs_tex_rd_addr),
.tex_rd_data(gs_tex_rd_data),
// Ch296 — PSMCT32-only top: no CLUT instantiated, the PSMT8 index
// path is never selected (s1_tex_active gates it on PSM==0x13).
// Tie the lookup data to 0; leave the index output open.
.clut_rd_idx (),
.clut_rd_data(32'd0),
.clut_load_busy(1'b0),
.fb_rd_en (gs_fb_rd_en),
.fb_rd_addr(gs_fb_rd_addr),
.fb_rd_data(gs_fb_rd_data),
.z_rd_en (gs_z_rd_en),
.z_rd_addr(gs_z_rd_addr),
.z_rd_data(gs_z_rd_data),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// ---------------------------------------------------------------------
// ee_gs_priv_bridge_stub — takes the memory map's GS-privileged write
// port (32-bit data + byte enables) and drives gs_stub's 64-bit
// register-write port.
// ---------------------------------------------------------------------
ee_gs_priv_bridge_stub u_priv_bridge (
.clk(clk), .rst_n(rst_n),
.ee_wr_en (map_gs_priv_wr_en),
.ee_wr_addr(map_gs_priv_wr_addr),
.ee_wr_data(map_gs_priv_wr_data),
.ee_wr_be (map_gs_priv_wr_be),
.gs_reg_wr_en (priv_reg_wr_en),
.gs_reg_wr_addr(priv_reg_wr_addr),
.gs_reg_wr_data(priv_reg_wr_data)
);
// ---------------------------------------------------------------------
// gif_image_xfer_stub — idle in Ch123 (no TRXDIR/IMAGE), but
// instantiated for symmetry. The TRXDIR-driven Ch124 demo would
// turn it load-bearing.
// ---------------------------------------------------------------------
logic xfer_we;
logic [31:0] xfer_waddr;
logic [31:0] xfer_wdata;
logic [3:0] xfer_wbe;
logic [31:0] xfer_wmask;
logic xfer_busy;
gif_image_xfer_stub #(
.PSMCT32_SWIZZLE(PSMCT32_SWIZZLE)
) u_xfer (
.clk(clk), .rst_n(rst_n),
.trxdir_wr_pulse(trxdir_wr_q),
.trxdir(trxdir_q),
.bitbltbuf(bitbltbuf_q),
.trxpos(trxpos_q),
.trxreg(trxreg_q),
.data_valid(gif_image_data_valid),
.data_qword(gif_image_data),
.data_last (gif_image_data_last),
.data_ready(xfer_data_ready),
.vram_we (xfer_we),
.vram_waddr(xfer_waddr),
.vram_wdata(xfer_wdata),
.vram_wbe (xfer_wbe),
.vram_wmask(xfer_wmask),
.busy (xfer_busy)
);
// ---------------------------------------------------------------------
// VRAM mux: xfer-OWNED when xfer.busy, raster-OWNED otherwise.
// (Sequenced: in Ch123 raster fills exclusively; xfer never fires.
// In a future TRXDIR variant the mux still works — payload upload
// finishes before raster starts.)
//
// xfer_busy switches all five write-port fields together. There is no
// arbitration: a raster pixel emitted while xfer_busy is high is not
// written. Only the low 32 bits of raster_pixel_color_q reach VRAM.
// ---------------------------------------------------------------------
logic vram_we_mux;
logic [31:0] vram_waddr_mux;
logic [31:0] vram_wdata_mux;
logic [3:0] vram_wbe_mux;
logic [31:0] vram_wmask_mux;
assign vram_we_mux = xfer_busy ? xfer_we : raster_pixel_emit;
assign vram_waddr_mux = xfer_busy ? xfer_waddr : raster_pixel_fb_addr_q;
assign vram_wdata_mux = xfer_busy ? xfer_wdata : raster_pixel_color_q[31:0];
assign vram_wbe_mux = xfer_busy ? xfer_wbe : raster_pixel_be_q;
assign vram_wmask_mux = xfer_busy ? xfer_wmask : raster_pixel_mask_q;
// First VRAM read port: address driven by gs_pcrtc_stub, data returned
// to it (scanout path).
logic [31:0] vram_raddr;
logic [31:0] vram_rdata;
// ---------------------------------------------------------------------
// Brick 1 — read2 (second VRAM read port) MUX.
//
// read2 is shared between two consumers that are sequenced in time:
// - clut_loader_stub : VRAM→CLUT copy at TEX0 commit (BEFORE the
// raster scan). NOT instantiated in this top
// (PCRTC clut_enable=0), so it never drives
// read2 here.
// - gs_stub texel fetch : during the raster SCAN, one read per
// inside pixel of a textured SPRITE.
// Because CLUT load completes before the raster scan begins, a simple
// gs_tex_rd_en select is collision-free. When a future variant adds
// clut_loader, extend this select: read2_addr = clut_active ?
// clut_rd_addr : gs_tex_rd_addr.
// vram_stub's read2 is COMBINATIONAL; gs_stub presents the address
// from a registered S1 stage and consumes tex_rd_data one cycle
// later, so the effective latency matches TEX_RD_LATENCY=1.
// ---------------------------------------------------------------------
//
// Brick 2a — THIRD potential read2 consumer: the alpha-blend
// dest-fb read (gs_fb_rd_en/gs_fb_rd_addr). A flat alpha-blended
// SPRITE never textures, so gs_tex_rd_en and gs_fb_rd_en are
// mutually exclusive (gs_stub.new_abe_active requires
// !close_tme_effective). The combinational read2_data is fanned out
// to both consumers; only the active one's address selects the mux.
//
// Brick 2b — FOURTH potential read2 consumer: the Z-buffer stored-Z
// read (gs_z_rd_en/gs_z_rd_addr). A flat Z-tested SPRITE never
// textures and never alpha-blends (gs_stub.new_zte_active requires
// !close_tme_effective && !new_abe_active), so the four read2
// consumers are mutually exclusive by feature.
//
// As wired below the select is a fixed-priority chain (texel fetch,
// then alpha dest-fb, then Z read); the address is 0 when no enable is
// high. The same read2 data is returned to all three gs_stub ports.
logic [31:0] vram_read2_addr;
logic [31:0] vram_read2_data;
assign vram_read2_addr = gs_tex_rd_en ? gs_tex_rd_addr
: gs_fb_rd_en ? gs_fb_rd_addr
: gs_z_rd_en ? gs_z_rd_addr
: 32'd0;
assign gs_tex_rd_data = vram_read2_data;
assign gs_fb_rd_data = vram_read2_data;
assign gs_z_rd_data = vram_read2_data;
// Simulation-only overlap checks: report (outside reset) any cycle in
// which two read2 consumers are enabled at once, since the priority
// chain above would silently serve only one of them.
// synthesis translate_off
always_ff @(posedge clk) begin
if (rst_n && gs_tex_rd_en && gs_fb_rd_en)
$error("Brick2a: read2 overlap @%0t — texel fetch and alpha dest-fb read both active; one read is being dropped (must be mutually exclusive by texturing).",
$time);
if (rst_n && gs_z_rd_en && (gs_tex_rd_en || gs_fb_rd_en))
$error("Brick2b: read2 overlap @%0t — Z-buffer read collides with another consumer; one read is being dropped (Z-tested flat sprite must be mutually exclusive with texel/alpha).",
$time);
end
// synthesis translate_on
vram_stub #(.BYTES(VRAM_BYTES)) u_vram (
.clk(clk), .rst_n(rst_n),
.write_en (vram_we_mux),
.write_addr(vram_waddr_mux),
.write_data(vram_wdata_mux),
.write_be (vram_wbe_mux),
.write_mask(vram_wmask_mux),
.read_addr (vram_raddr),
.read_data (vram_rdata),
.read2_addr(vram_read2_addr),
.read2_data(vram_read2_data)
);
// ---------------------------------------------------------------------
// gs_pcrtc_stub — PSMCT32 scanout (swizzle selected by PSMCT32_SWIZZLE)
//
// Front porch, sync and back porch are all fixed at 1 in both
// directions; only the active size is parameterized. The CLUT path is
// disabled (clut_enable tied low, lookup data tied to 0).
// ---------------------------------------------------------------------
logic end_of_frame;
gs_pcrtc_stub #(
.H_ACTIVE(H_ACTIVE), .H_FRONT(1), .H_SYNC(1), .H_BACK(1),
.V_ACTIVE(V_ACTIVE), .V_FRONT(1), .V_SYNC(1), .V_BACK(1),
.PSMCT32_SWIZZLE(PSMCT32_SWIZZLE)
) u_pcrtc (
.clk(clk), .rst_n(rst_n),
.pmode_q (pmode_q),
.dispfb1_q (dispfb1_q),
.display1_q (display1_q),
.vram_read_addr(vram_raddr),
.vram_read_data(vram_rdata),
.clut_enable (1'b0),
.clut_csa (5'd0),
.clut_read_idx (),
.clut_read_data(32'd0),
.hsync(hsync), .vsync(vsync), .de(de),
.r(r), .g(g), .b(b),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// gs_pcrtc_stub doesn't expose end_of_frame as a port; the Ch123 TB
// taps it via hierarchical ref. For the top wrapper we synthesize an
// equivalent edge by watching vsync rise.
// end_of_frame is a combinational one-cycle pulse: vsync high now and
// low on the previous clock.
// NOTE(review): vsync_d resets to 0, so this presumably relies on vsync
// being low when reset is released; if vsync idles high, the first
// cycle after reset would register as a frame edge — confirm against
// gs_pcrtc_stub's vsync polarity.
logic vsync_d;
always_ff @(posedge clk) begin
if (!rst_n) vsync_d <= 1'b0;
else vsync_d <= vsync;
end
assign end_of_frame = vsync && !vsync_d;
// ---------------------------------------------------------------------
// Sticky status outputs.
//
// Each flag is set by its event and never cleared except by reset.
// The DMA-done condition matches on the event code only
// (dmac_ev_valid && dmac_ev_event == EV_DMA_DONE).
// ---------------------------------------------------------------------
logic dma_done_seen_q;
logic frame_seen_q;
always_ff @(posedge clk) begin
if (!rst_n) begin
dma_done_seen_q <= 1'b0;
frame_seen_q <= 1'b0;
end else begin
if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE))
dma_done_seen_q <= 1'b1;
if (end_of_frame)
frame_seen_q <= 1'b1;
end
end
assign dma_done_seen = dma_done_seen_q;
assign frame_seen = frame_seen_q;
// ---------------------------------------------------------------------
// Ch174 — event toggles for HPS-visible counters.
//
// Same two event conditions as the sticky flags above, but each
// occurrence inverts the output instead of latching it, so every event
// produces exactly one edge on the corresponding toggle.
// ---------------------------------------------------------------------
logic frame_toggle_q;
logic dma_done_toggle_q;
always_ff @(posedge clk) begin
if (!rst_n) begin
frame_toggle_q <= 1'b0;
dma_done_toggle_q <= 1'b0;
end else begin
if (end_of_frame)
frame_toggle_q <= ~frame_toggle_q;
if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE))
dma_done_toggle_q <= ~dma_done_toggle_q;
end
end
assign frame_toggle = frame_toggle_q;
assign dma_done_toggle = dma_done_toggle_q;
endmodule : top_psmct32_raster_demo
File diff suppressed because it is too large Load Diff