ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
264 lines
13 KiB
Systemverilog
264 lines
13 KiB
Systemverilog
// retroDE_ps2 — vram_bram_stub (Ch154)
|
||
//
|
||
// Hardware-friendly sibling of `vram_stub`. Maps cleanly onto Agilex 5
|
||
// M20K block-RAM:
|
||
// - 2048 × 32-bit word storage (instead of 8192 × 8-bit byte
|
||
// storage). Internal width matches Agilex M20K native widths;
|
||
// external addressing stays byte-addressable to keep the same
|
||
// mental model as `vram_stub`.
|
||
// - SYNCHRONOUS reads (registered 32-bit output). One-cycle read
|
||
// latency — the read_valid flag is set the cycle the data is on
|
||
// read_data.
|
||
// - BYTE write enable only (4-bit `write_be`). The Ch106 PSMT4
|
||
// per-bit `write_mask` RMW is NOT supported; PSMT4 callers must
|
||
// do the nibble splice on the writer side BEFORE issuing the
|
||
// write here. Ch155+ task to rework gs_stub.raster_pixel_emit
|
||
// and gif_image_xfer_stub for that.
|
||
// - Two synchronous read ports. Quartus implements two
|
||
// independent read addresses by REPLICATING the M20K storage
|
||
// across two RAM blocks rather than using a single native
|
||
// dual-read port — exp_c shows 8 RAM Blocks for 8 KB vs
|
||
// exp_a's 4 RAM Blocks for the same 8 KB single-port shape.
|
||
// Two replicated RAM blocks is still vastly cheaper than the
|
||
// 65,536 flip-flops the legacy `vram_stub` shape produced;
|
||
// the cost just isn't free.
|
||
//
|
||
// Empirical motivation (Ch153 forensics):
|
||
// The legacy `vram_stub` shape (byte-addressable + combinational
|
||
// dual reads + per-bit-mask RMW) failed to fit on Agilex 5 — the
|
||
// 8 KB array consumed 65,536 dedicated registers and 261,578
|
||
// combinational nodes, dominating Ch152's 331 % ALM overrun.
|
||
// `exp_a_bram_friendly` proved that a 2048 × 32-bit sync-read
|
||
// byte-WE shape maps to 4 RAM Blocks + 0 registers + 46 ALMs.
|
||
//
|
||
// External port shape vs `vram_stub`:
|
||
// IDENTICAL: clk, rst_n, write_en, write_addr[31:0],
|
||
// write_data[31:0], write_be[3:0], read_addr[31:0],
|
||
// read_data[31:0], read2_addr[31:0], read2_data[31:0].
|
||
// NEW : read_valid + read2_valid (registered one cycle after the
// address, aligned with the data; high only when that address
// was word-aligned and in range).
|
||
// DROPPED : write_mask[31:0] (Ch106 per-bit RMW; callers must
|
||
// splice nibbles on the writer side).
|
||
//
|
||
// Address contract:
|
||
// - Writes: write_addr is byte-aligned; the low 2 bits MUST be 0
|
||
// (4-byte writes only). Each `write_be[i]` independently
|
||
// commits byte `i` of the addressed word. Per-byte non-
|
||
// wrapping admission: an enabled byte beyond `BYTES`
|
||
// drops the WHOLE write (matches vram_stub Ch95 audit).
|
||
// - Reads: read_addr is byte-aligned; the low 2 bits MUST be 0.
|
||
// `read_data` is the 32-bit word at `read_addr / 4`.
|
||
// Byte / halfword extraction is the caller's job
|
||
// (matches Ch141 / Ch142 nibble-readback pattern).
|
||
//
|
||
// Sim behaviour: time-0 mem is power-on-zero matching real M20K (the
|
||
// `// synthesis translate_off` initial block matches vram_stub's
|
||
// post-Ch152 pattern).
|
||
|
||
`timescale 1ns/1ps
|
||
|
||
module vram_bram_stub #(
    // Storage size in bytes. Must be a multiple of 4 whose word count
    // (BYTES / 4) is a power of two; the simulation-only initial block
    // below reports violations.
    parameter int unsigned BYTES = 8192,

    // Ch251.4 — hardware-demo M20K rescue. When ENABLE_READ2 = 0, the
    // second sync-read port is FEATURE-STRIPPED: `read2_data` ties to
    // 0, `read2_valid` ties to 0, and Quartus no longer infers a
    // separate read port on `mem`. This collapses the storage from
    // two replicated 1W+1R simple-dual-port M20K banks (~410 M20Ks at
    // 512 KiB) to ONE 1W+1R bank (~205 M20Ks) — the savings that get
    // the 512 KiB framebuffer to fit on Agilex 5 (358 M20K budget).
    //
    // Contract caveat: read2 is the PSMT4 RMW old-byte read path. Any
    // build that exercises PSMT4 rasterization MUST keep this `1`. The
    // PSMCT32-only hardware demo (top_psmct32_raster_demo_bram) sets
    // it to `0`; all simulation TBs leave it at the default `1`.
    //
    // This is a SCOPED build profile, not a general fix — see
    // docs/decisions/0006-vram-roadmap.md for the longer-term
    // arbitrated / line-buffered VRAM plan.
    parameter bit ENABLE_READ2 = 1'b1
) (
    input  logic        clk,
    input  logic        rst_n,

    // Write port (byte-WE; 4-byte-aligned write_addr).
    input  logic        write_en,
    input  logic [31:0] write_addr,
    input  logic [31:0] write_data,
    input  logic [3:0]  write_be,

    // Read port 0 (sync read; 4-byte-aligned read_addr).
    input  logic [31:0] read_addr,
    output logic [31:0] read_data,
    output logic        read_valid,

    // Read port 1 (sync read; 4-byte-aligned).
    input  logic [31:0] read2_addr,
    output logic [31:0] read2_data,
    output logic        read2_valid
);

    // WORDS × 32-bit storage (2048 words at the default BYTES = 8192).
    // Index is the WORD index (byte address / 4).
    //
    // Parameter contract: `BYTES` MUST be a power-of-two multiple of 4.
    // The WORD_AW-bit slice `*_addr[WORD_AW+1:2]` truncates the byte
    // address to a word index; for non-power-of-two `WORDS`, an out-
    // of-range byte address can map to a slice value that exceeds
    // `WORDS-1` and indexes beyond `mem[]`. `read_valid` already
    // marks such reads invalid downstream, but the BRAM read template
    // still indexes the array unconditionally to satisfy Quartus's
    // M20K inference (Ch154 audit), so the index itself must remain
    // in bounds. The Ch155 audit-low fix: clamp the indices with
    // `& (WORDS-1)` so a power-of-two depth is required AND any
    // bit beyond the legal slice is masked away. Power-of-two also
    // matches every Agilex M20K depth target (256/512/1024/2048/...).
    localparam int unsigned WORDS = BYTES / 4;

    // Word-index width, floored at 1 bit. `$clog2(1)` is 0, which for
    // the minimum legal depth (BYTES = 4, WORDS = 1) would turn the
    // `[WORD_AW-1:0]` declarations into `[-1:0]`, the `[WORD_AW+1:2]`
    // part-selects into the reversed `[1:2]`, and the `WORD_AW'(...)`
    // casts into zero-width casts. With the floor, the single-word
    // case uses a 1-bit index that the `& (WORDS-1)` mask forces to 0.
    // For WORDS >= 2 this is exactly `$clog2(WORDS)`.
    localparam int unsigned WORD_AW = (WORDS > 1) ? $clog2(WORDS) : 1;

    logic [31:0] mem [0:WORDS-1];

    // synthesis translate_off
    initial begin
        if (BYTES < 4 || (BYTES & 32'd3) != 0)
            $error("vram_bram_stub: BYTES (%0d) must be >= 4 and a multiple of 4", BYTES);
        // Power-of-two check on WORDS: (WORDS != 0) && ((WORDS & (WORDS-1)) == 0).
        if (WORDS == 0 || (WORDS & (WORDS - 1)) != 0)
            $error("vram_bram_stub: BYTES (%0d) must yield a power-of-two WORDS depth (got %0d)",
                   BYTES, WORDS);

        // Ch252 — VRAM replication tripwire (simulation/elaboration only).
        //
        // At BYTES >= 256 KiB, each 1W+1R simple-dual-port replica costs
        // ~100 M20Ks. With ENABLE_READ2 = 1, Quartus replicates the
        // storage to give the second read its own port, doubling that
        // cost (>= 200 M20Ks per pair). Above this threshold a Quartus
        // fitter overrun on Agilex 5 (358 M20K budget) becomes likely.
        //
        // This `$fatal` runs in simulation and elaboration-aware lint
        // tools — it is the loud canary. The REAL protection is the
        // board-top profile: hardware builds explicitly set
        // ENABLE_READ2 = 0 when VRAM_BYTES is large (see
        // de25_nano_psmct32_raster_demo_top). Re-enabling read2 on a
        // large hardware VRAM requires landing one of the architectural
        // follow-ups in docs/decisions/0006-vram-roadmap.md first.
        if (ENABLE_READ2 && (BYTES >= 32'd262144)) begin
            $display("vram_bram_stub: ENABLE_READ2=1 with BYTES=%0d (>= 256 KiB) trips the replication tripwire.", BYTES);
            $display(" The 2nd read port forces Quartus to replicate the storage, ~doubling M20K cost.");
            $display(" Either set ENABLE_READ2=0 (PSMCT32-only hardware profile) or land the");
            $display(" arbitrated/line-buffered VRAM follow-up before re-enabling read2 at this size.");
            $display(" See docs/decisions/0006-vram-roadmap.md.");
            $fatal(1, "vram_bram_stub: replication-tripwire fatal exit");
        end

        // Power-on-zero contents for simulation.
        for (int i = 0; i < int'(WORDS); i++) mem[i] = 32'd0;
    end
    // synthesis translate_on

    // ----------------------------------------------------------------
    // Write port — per-byte WE, per-byte non-wrapping admission.
    //
    // The byte address is widened to 33 bits so that `addr + k` cannot
    // wrap around 2^32 and falsely pass the `< BYTES` comparison. A
    // write is admitted only when it is word-aligned and every ENABLED
    // byte lies below `BYTES`; otherwise the whole write is dropped.
    // ----------------------------------------------------------------
    logic [32:0] addr33;
    logic        admit_b0, admit_b1, admit_b2, admit_b3;
    logic        write_admit;

    assign addr33   = {1'b0, write_addr};
    assign admit_b0 = (addr33 + 33'd0) < 33'(BYTES);
    assign admit_b1 = (addr33 + 33'd1) < 33'(BYTES);
    assign admit_b2 = (addr33 + 33'd2) < 33'(BYTES);
    assign admit_b3 = (addr33 + 33'd3) < 33'(BYTES);

    assign write_admit = write_en
                      && (write_addr[1:0] == 2'b00)   // word-aligned
                      && (!write_be[0] || admit_b0)
                      && (!write_be[1] || admit_b1)
                      && (!write_be[2] || admit_b2)
                      && (!write_be[3] || admit_b3);

    // Same `& (WORDS-1)` mask as the read ports. It keeps every bit for
    // WORDS >= 2 and pins the index to 0 for the single-word case.
    logic [WORD_AW-1:0] write_word_idx;
    assign write_word_idx = write_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);

    // BRAM-native byte-WE template — each `if (write_be[i])` slice
    // updates a separate 8-bit lane of the 32-bit word. This is the
    // canonical Quartus inference shape (proven in Ch153 exp_a).
    // Writes are suppressed while rst_n is low.
    always_ff @(posedge clk) begin
        if (rst_n && write_admit) begin
            if (write_be[0]) mem[write_word_idx][ 7: 0] <= write_data[ 7: 0];
            if (write_be[1]) mem[write_word_idx][15: 8] <= write_data[15: 8];
            if (write_be[2]) mem[write_word_idx][23:16] <= write_data[23:16];
            if (write_be[3]) mem[write_word_idx][31:24] <= write_data[31:24];
        end
    end

    // ----------------------------------------------------------------
    // Read ports — sync, registered output, 1-cycle latency.
    //
    // The read path is the CANONICAL Quartus M20K inference template:
    // a single unconditional `read_data <= mem[idx]` registered
    // assignment, with NO reset on the data register and NO read-side
    // gating. Quartus rejected an earlier draft that gated reads on
    // `read_addr[1:0]==2'b00 && in-bounds` with
    //   "Info (276007): RAM logic ... uninferred due to asynchronous
    //    read logic"
    // and synthesized the storage as flip-flops. Bounds + alignment
    // checks land on the separate `read_valid` pipeline below where
    // they don't poison the data path.
    // ----------------------------------------------------------------
    // Word-index extraction. For a power-of-two `WORDS` depth (the
    // parameter contract checked above), the slice
    // `read_addr[WORD_AW+1:2]` is naturally bounded to `[0, WORDS-1]`
    // — the high bits beyond WORD_AW+1 represent address ranges
    // already rejected by the `read_valid` gate below. The mask
    // `& WORD_AW'(WORDS - 1)` is redundant for power-of-two WORDS >= 2
    // (it just keeps the same bits) but documents the contract: a
    // future relaxation that allows non-power-of-two depths would
    // need to replace the mask with a real range clamp on the
    // mem-read index rather than rely on the natural truncation.
    logic [WORD_AW-1:0] read_word_idx;
    assign read_word_idx = read_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);

    always_ff @(posedge clk) begin
        read_data <= mem[read_word_idx];
    end

    // Out-of-range / misaligned detection on a parallel pipeline so
    // it doesn't gate the BRAM read path. read_valid is registered,
    // so it lands one cycle after the address, aligned with read_data.
    logic read_in_range_pre;
    assign read_in_range_pre = (read_addr[1:0] == 2'b00) &&
                               ({1'b0, read_addr} + 33'd3 < 33'(BYTES));

    always_ff @(posedge clk) begin
        if (!rst_n) read_valid <= 1'b0;
        else        read_valid <= read_in_range_pre;
    end

    // ----------------------------------------------------------------
    // Read port 1 — feature-strippable via ENABLE_READ2 (Ch251.4).
    // When ENABLE_READ2=1: full sync read + range gate, matching the
    // pre-Ch251.4 behaviour. When ENABLE_READ2=0: NO reference to
    // `mem` from this branch, so Quartus does not infer a second M20K
    // read port and the VRAM storage stops replicating.
    // ----------------------------------------------------------------
    generate
        if (ENABLE_READ2) begin : g_read2_en
            logic [WORD_AW-1:0] read2_word_idx;
            assign read2_word_idx = read2_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);

            always_ff @(posedge clk) begin
                read2_data <= mem[read2_word_idx];
            end

            logic read2_in_range_pre;
            assign read2_in_range_pre = (read2_addr[1:0] == 2'b00) &&
                                        ({1'b0, read2_addr} + 33'd3 < 33'(BYTES));

            always_ff @(posedge clk) begin
                if (!rst_n) read2_valid <= 1'b0;
                else        read2_valid <= read2_in_range_pre;
            end
        end else begin : g_read2_dis
            // Port stripped: outputs are clocked to constant zero.
            always_ff @(posedge clk) begin
                read2_data  <= 32'd0;
                read2_valid <= 1'b0;
            end
        end
    endgenerate

endmodule : vram_bram_stub
|