ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
186 lines
8.4 KiB
Systemverilog
186 lines
8.4 KiB
Systemverilog
// retroDE_ps2 — vram_stub (Ch89)
|
||
//
|
||
// Linear byte-addressable VRAM backing store for gs_stub's
|
||
// `raster_pixel_emit` channel. This is the FIRST persistence
|
||
// layer the rasterizer has had — pre-Ch89, pixels only pulsed as
|
||
// trace-visible events and updated `raster_pixel_color_q` /
|
||
// `raster_pixel_fb_addr_q` snapshot regs, then evaporated. Now
|
||
// they actually land somewhere a TB (or a future scanout path)
|
||
// can read back.
|
||
//
|
||
// Scope (intentionally minimal for Ch89):
|
||
// - Linear byte-addressable: NO page/block VRAM swizzle. Real
|
||
// PS2 VRAM is 4 MiB, organized into pages × blocks × columns
|
||
// per PSM. The fb_addr math in gs_stub matches the linear-
|
||
// framebuffer layout that PCSX2's gs_state pages out for
|
||
// "linear" PSM channels; that's what this stub speaks.
|
||
//   - PSMCT32 only (as of Ch89): writes 4 bytes per emitted pixel.
//     Narrower formats were deferred at that point; PSMCT16,
//     PSMT8 and PSMT4 writes are now expressed through `write_be`
//     (Ch95) and `write_mask` (Ch106), both described below.
|
||
//   - Combinational read port: byte-addressable, returns
//     the 4 bytes starting at read_addr packed little-endian.
//     Introduced as a TB-only debug port for verifying pixel
//     storage; per the port comments in the module it is now
//     also read by gs_pcrtc_stub for scanout.
|
||
//
|
||
// Wiring contract:
|
||
// - write_en ← gs_stub.raster_pixel_emit
|
||
// - write_addr ← gs_stub.raster_pixel_fb_addr_q
|
||
// - write_data ← gs_stub.raster_pixel_color_q[31:0] (lower 32 bits)
|
||
// - write_be ← gs_stub.raster_pixel_be_q (Ch95)
|
||
//
|
||
// The full 64-bit raster_pixel_color_q carries Q (texture-coord
|
||
// IEEE float) in the upper 32 bits — those bits are NOT part of
|
||
// the framebuffer pixel and are deliberately discarded here.
|
||
//
|
||
// `write_be[3:0]` (Ch95): per-byte write enable. byte i (the
|
||
// byte at `write_addr + i`) is committed only when
|
||
// `write_en && write_be[i]`. PSMCT32 writes use 4'b1111;
|
||
// PSMCT16 writes use 4'b0011 (the 2 bytes at write_addr — gs_stub
|
||
// passes the actual byte address of the pixel, which is
|
||
// 2-byte-aligned but not necessarily 4-byte-aligned). TBs that
|
||
// bypass gs_stub (e.g. `tb_vram_stub`, `tb_gs_scanout_psm16`)
|
||
// tie write_be to 4'b1111.
|
||
//
|
||
// `write_mask[31:0]` (Ch106): per-BIT merge mask used to support
|
||
// sub-byte writes (PSMT4 — 4-bit nibble per pixel). The committed
|
||
// byte i (still gated by write_be[i]) is:
|
||
// mem[addr+i] <= (mem[addr+i] & ~mask_i) | (data_i & mask_i)
|
||
// where mask_i = write_mask[i*8 +: 8] and data_i =
|
||
// write_data[i*8 +: 8]. PSMCT32/16 + PSMT8 writes tie write_mask
|
||
// to 32'hFFFFFFFF (full byte writes — equivalent to the pre-Ch106
|
||
// behavior). PSMT4 emits use 0x0F (low nibble) or 0xF0 (high
|
||
// nibble) on the enabled byte. The merge happens inside the same
|
||
// always_ff that commits the byte, so back-to-back nibble writes
|
||
// to the SAME byte chain cleanly through NBA semantics: the
|
||
// second write samples mem[addr] AFTER the prior NBA committed.
|
||
//
|
||
// Bounds check (Ch95 audit-medium fix): the write is admitted
|
||
// only if EVERY enabled byte's address is in [0, BYTES). This
|
||
// uses non-wrapping 33-bit arithmetic so a write near the 32-bit
|
||
// address space limit (e.g. write_addr near 0xFFFF_FFFC with
|
||
// be=4'b1111) is rejected cleanly. Halfword writes at the last
|
||
// valid 2-byte slot (write_addr=BYTES-2 with be=4'b0011) are
|
||
// accepted; write_addr=BYTES-1 with be=4'b0011 is rejected
|
||
// because byte 1 of that slot is OOB.
|
||
|
||
`timescale 1ns/1ps

// vram_stub — linear, byte-addressable VRAM model.
//
// One masked write port (up to 4 bytes per clock) and two
// combinational 4-byte little-endian read ports. Internal signal
// names (`mem`, `MAX_BASE`, `write_admit`, `admit_b*`) are kept
// stable so hierarchical testbench references keep resolving.
module vram_stub
#(
    // Size of the backing store in bytes. Must be >= 4 (checked by a
    // sim-only $error below).
    parameter int unsigned BYTES = 65536
) (
    input  logic        clk,
    // Active-low reset. It only gates writes: while rst_n is low no
    // byte is committed. Memory contents are NOT cleared by reset.
    input  logic        rst_n,

    // Write side: one 32-bit pixel slot per cycle when write_en
    // pulses. write_addr is a byte offset (already PSM-aware via
    // gs_stub's bpp_shift math). write_be[i] gates byte i — used
    // by Ch95 to commit just the 2 bytes of a PSMCT16 pixel
    // without stomping the adjacent halfword.
    input  logic        write_en,
    input  logic [31:0] write_addr,
    input  logic [31:0] write_data,
    input  logic [3:0]  write_be,
    // Per-bit merge mask (Ch106). For every enabled byte, a set mask
    // bit takes the corresponding write_data bit and a clear mask bit
    // keeps the bit already stored in mem.
    input  logic [31:0] write_mask,

    // Read port 0: combinational, byte-addressable, little-
    // endian 4-byte read. Used by gs_pcrtc_stub for scanout, and
    // by TBs for verification. Returns 0 when the 4-byte window
    // starting at read_addr does not fit inside the array.
    input  logic [31:0] read_addr,
    output logic [31:0] read_data,

    // Ch99 — second combinational read port for clients that
    // need to read VRAM concurrently with pcrtc scanout (the
    // canonical example is `clut_loader_stub`, which copies
    // CLUT bytes from VRAM into clut_stub when TEX0.CLD fires).
    // Same byte-addressed 4-byte semantics as port 0. Tie
    // `read2_addr` to 0 in TBs that don't use it; the unused
    // `read2_data` output can be left unconnected.
    input  logic [31:0] read2_addr,
    output logic [31:0] read2_data
);

    // Backing store, one byte per entry.
    logic [7:0] mem [0:BYTES-1];

    // Largest base address that admits a 4-byte access without
    // overrunning the array. Used by the READ ports (always 4
    // bytes). The write port does per-byte admission below
    // (Ch95 audit-medium fix) so it can accept halfword writes
    // near the end of VRAM that an `addr <= MAX_BASE` gate
    // would spuriously drop.
    localparam logic [31:0] MAX_BASE = (BYTES >= 4)
                                     ? (32'(BYTES) - 32'd4)
                                     : 32'd0;

    // Elaboration-time constant: can a 4-byte window fit at all?
    // When BYTES < 4, MAX_BASE collapses to 0 and the `<= MAX_BASE`
    // test alone would still admit address 0, indexing past the end
    // of `mem`. Folding this constant into the read gate makes the
    // read ports return 0 instead. It is always 1 for legal
    // configurations, so behaviour there is unchanged.
    localparam logic WORD_FITS = (BYTES >= 4);

    // Sim-only memory init. Real Altera/Intel BRAM is power-on-zero
    // on FPGA configuration, so the procedural loop is unnecessary
    // in synthesis — and at BYTES=8192 it exceeds Quartus's 5000-
    // iteration synthesizable-loop limit (Quartus error 13356).
    // The pragma pair tells Quartus to skip this initial block;
    // iverilog and other simulators ignore the pragma and run the
    // init normally so time-0 values are deterministic in sim.
    // synthesis translate_off
    initial begin
        if (BYTES < 4)
            $error("vram_stub: BYTES (%0d) must be >= 4", BYTES);
        for (int i = 0; i < BYTES; i++) mem[i] = 8'd0;
    end
    // synthesis translate_on

    // Read port 0: default to zero, override only when the whole
    // 4-byte window is in range. Bytes are packed little-endian
    // (byte at read_addr lands in bits [7:0]).
    always_comb begin
        read_data = 32'd0;
        if (WORD_FITS && (read_addr <= MAX_BASE)) begin
            read_data = {mem[read_addr + 32'd3],
                         mem[read_addr + 32'd2],
                         mem[read_addr + 32'd1],
                         mem[read_addr]};
        end
    end

    // Read port 1 (Ch99): identical semantics to port 0.
    always_comb begin
        read2_data = 32'd0;
        if (WORD_FITS && (read2_addr <= MAX_BASE)) begin
            read2_data = {mem[read2_addr + 32'd3],
                          mem[read2_addr + 32'd2],
                          mem[read2_addr + 32'd1],
                          mem[read2_addr]};
        end
    end

    // Per-byte admission. We use non-wrapping 33-bit arithmetic
    // for `write_addr + i` so a near-0xFFFFFFFF address can't
    // wrap and falsely pass the comparison. An enabled byte is
    // admitted only if its byte address is strictly less than
    // BYTES; the entire write is dropped if ANY enabled byte
    // would land out of range, matching the Ch89-audit "no
    // partial writes near the boundary" stance. Disabled bytes
    // never veto a write.
    logic [32:0] addr33;
    logic        admit_b0, admit_b1, admit_b2, admit_b3;
    logic        write_admit;

    assign addr33   = {1'b0, write_addr};
    assign admit_b0 = (addr33 + 33'd0) < 33'(BYTES);
    assign admit_b1 = (addr33 + 33'd1) < 33'(BYTES);
    assign admit_b2 = (addr33 + 33'd2) < 33'(BYTES);
    assign admit_b3 = (addr33 + 33'd3) < 33'(BYTES);

    assign write_admit = write_en
                      && (!write_be[0] || admit_b0)
                      && (!write_be[1] || admit_b1)
                      && (!write_be[2] || admit_b2)
                      && (!write_be[3] || admit_b3);

    // Commit stage. Each enabled byte is a read-modify-write:
    //   mem[a+i] <= (mem[a+i] & ~mask_i) | (data_i & mask_i)
    // The right-hand side samples the value stored before this
    // clock edge; the merged byte is committed by the non-blocking
    // assignment. Index arithmetic is 32-bit here, which is safe
    // because write_admit has already proven every enabled byte's
    // address is < BYTES.
    always_ff @(posedge clk) begin
        if (rst_n && write_admit) begin
            if (write_be[0])
                mem[write_addr]         <= (mem[write_addr]         & ~write_mask[7:0])
                                         | (write_data[7:0]         &  write_mask[7:0]);
            if (write_be[1])
                mem[write_addr + 32'd1] <= (mem[write_addr + 32'd1] & ~write_mask[15:8])
                                         | (write_data[15:8]        &  write_mask[15:8]);
            if (write_be[2])
                mem[write_addr + 32'd2] <= (mem[write_addr + 32'd2] & ~write_mask[23:16])
                                         | (write_data[23:16]       &  write_mask[23:16]);
            if (write_be[3])
                mem[write_addr + 32'd3] <= (mem[write_addr + 32'd3] & ~write_mask[31:24])
                                         | (write_data[31:24]       &  write_mask[31:24]);
        end
    end

endmodule : vram_stub