Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

186 lines
8.4 KiB
Systemverilog
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// retroDE_ps2 — vram_stub (Ch89)
//
// Linear byte-addressable VRAM backing store for gs_stub's
// `raster_pixel_emit` channel. This is the FIRST persistence
// layer the rasterizer has had — pre-Ch89, pixels only pulsed as
// trace-visible events and updated `raster_pixel_color_q` /
// `raster_pixel_fb_addr_q` snapshot regs, then evaporated. Now
// they actually land somewhere a TB (or a future scanout path)
// can read back.
//
// Scope (intentionally minimal for Ch89):
// - Linear byte-addressable: NO page/block VRAM swizzle. Real
// PS2 VRAM is 4 MiB, organized into pages × blocks × columns
// per PSM. The fb_addr math in gs_stub matches the linear-
// framebuffer layout that PCSX2's gs_state pages out for
// "linear" PSM channels; that's what this stub speaks.
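// - PSMCT32 only (original Ch89 scope): writes 4 bytes per emitted
// pixel. At Ch89, PSMCT16 (2 bytes) and PSMT8 (1 byte) were
// deferred until per-pixel PSM was exposed at the raster channel;
// Ch95 (`write_be`) and Ch106 (`write_mask`), both described
// below, have since added sub-word writes.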
// - Combinational read port: byte-addressable, returns the 4 bytes
// starting at read_addr packed little-endian. Introduced for TBs
// to verify pixel storage; the port comments below note that
// gs_pcrtc_stub now also reads it for scanout.
//
// Wiring contract:
// - write_en ← gs_stub.raster_pixel_emit
// - write_addr ← gs_stub.raster_pixel_fb_addr_q
// - write_data ← gs_stub.raster_pixel_color_q[31:0] (lower 32 bits)
// - write_be ← gs_stub.raster_pixel_be_q (Ch95)
//
// The full 64-bit raster_pixel_color_q carries Q (texture-coord
// IEEE float) in the upper 32 bits — those bits are NOT part of
// the framebuffer pixel and are deliberately discarded here.
//
// `write_be[3:0]` (Ch95): per-byte write enable. byte i (the
// byte at `write_addr + i`) is committed only when
// `write_en && write_be[i]`. PSMCT32 writes use 4'b1111;
// PSMCT16 writes use 4'b0011 (the 2 bytes at write_addr — gs_stub
// passes the actual byte address of the pixel, which is
// 2-byte-aligned but not necessarily 4-byte-aligned). TBs that
// bypass gs_stub (e.g. `tb_vram_stub`, `tb_gs_scanout_psm16`)
// tie write_be to 4'b1111.
//
// `write_mask[31:0]` (Ch106): per-BIT merge mask used to support
// sub-byte writes (PSMT4 — 4-bit nibble per pixel). The committed
// byte i (still gated by write_be[i]) is:
// mem[addr+i] <= (mem[addr+i] & ~mask_i) | (data_i & mask_i)
// where mask_i = write_mask[i*8 +: 8] and data_i =
// write_data[i*8 +: 8]. PSMCT32/16 + PSMT8 writes tie write_mask
// to 32'hFFFFFFFF (full byte writes — equivalent to the pre-Ch106
// behavior). PSMT4 emits use 0x0F (low nibble) or 0xF0 (high
// nibble) on the enabled byte. The merge happens inside the same
// always_ff that commits the byte, so back-to-back nibble writes
// to the SAME byte chain cleanly through NBA semantics: the
// second write samples mem[addr] AFTER the prior NBA committed.
//
// Bounds check (Ch95 audit-medium fix): the write is admitted
// only if EVERY enabled byte's address is in [0, BYTES). This
// uses non-wrapping 33-bit arithmetic so a write near the 32-bit
// address space limit (e.g. write_addr near 0xFFFF_FFFC with
// be=4'b1111) is rejected cleanly. Halfword writes at the last
// valid 2-byte slot (write_addr=BYTES-2 with be=4'b0011) are
// accepted; write_addr=BYTES-1 with be=4'b0011 is rejected
// because byte 1 of that slot is OOB.
`timescale 1ns/1ps
module vram_stub
#(
    // Size of the backing store in bytes. Must be >= 4 so that one
    // full 4-byte read window fits (checked at sim time 0 below).
    parameter int unsigned BYTES = 65536
) (
    input  logic        clk,
    // Writes commit only while rst_n is high. Nothing in this module
    // clears `mem` when rst_n is low.
    input  logic        rst_n,

    // ---- Write port -------------------------------------------------
    // At most one 32-bit slot per clock, qualified by write_en.
    //   write_addr : byte offset of lane 0 (need not be 4-byte aligned).
    //   write_data : lane i is write_data[8*i +: 8], stored at
    //                write_addr + i.
    //   write_be   : lane i is touched only when write_be[i] is set
    //                (e.g. 4'b0011 for a halfword, so the neighbouring
    //                halfword is left alone).
    //   write_mask : per-bit merge mask; within an enabled lane only
    //                the bits set in write_mask[8*i +: 8] take the new
    //                data, the remaining bits keep the stored value.
    input  logic        write_en,
    input  logic [31:0] write_addr,
    input  logic [31:0] write_data,
    input  logic [3:0]  write_be,
    input  logic [31:0] write_mask,

    // ---- Read port 0 ------------------------------------------------
    // Combinational, byte-addressed, little-endian 4-byte read. Used
    // by gs_pcrtc_stub for scanout and by TBs for verification.
    input  logic [31:0] read_addr,
    output logic [31:0] read_data,

    // ---- Read port 1 (Ch99) -----------------------------------------
    // Second combinational read port with the same semantics as port
    // 0, for clients that must read concurrently with scanout (the
    // canonical example is `clut_loader_stub`). TBs that do not need
    // it tie `read2_addr` to 0 and may leave `read2_data` unconnected.
    input  logic [31:0] read2_addr,
    output logic [31:0] read2_data
);

    // Byte-wide backing store; the array index is the byte address.
    logic [7:0] mem [0:BYTES-1];

    // Highest base address at which a 4-byte window still lies inside
    // `mem`. Only the read ports use it (they always fetch 4 bytes);
    // the write port validates each enabled lane on its own so that a
    // halfword write in the last 2-byte slot is not dropped.
    localparam logic [31:0] MAX_BASE = (BYTES < 4)
                                     ? 32'd0
                                     : (32'(BYTES) - 32'd4);

    // Simulation-only parameter check and zero fill, giving
    // deterministic time-0 contents. The pragma pair hides this block
    // from synthesis: per the original author's note, FPGA block RAM
    // already powers up as zero and the fill loop would exceed
    // Quartus's synthesizable-loop iteration limit (error 13356).
    // Simulators ignore the pragmas and run the block normally.
    // synthesis translate_off
    initial begin
        if (BYTES < 4)
            $error("vram_stub: BYTES (%0d) must be >= 4", BYTES);
        for (int idx = 0; idx < BYTES; idx++) mem[idx] = 8'd0;
    end
    // synthesis translate_on

    // Read port 0: gather four consecutive bytes, lowest address into
    // the least-significant byte. A base address above MAX_BASE (the
    // window would run off the end of `mem`) reads as zero. The `if`
    // polarity is deliberate: an unknown address fails the compare and
    // therefore also reads as zero.
    // NOTE(review): the write port accepts a halfword at BYTES-2, but
    // a read based at BYTES-2 returns 0 because the 4-byte window does
    // not fit; such bytes are only visible via a base <= MAX_BASE.
    always_comb begin
        if (read_addr <= MAX_BASE) begin
            for (int unsigned b = 0; b < 4; b++)
                read_data[8*b +: 8] = mem[read_addr + 32'(b)];
        end else begin
            read_data = 32'd0;
        end
    end

    // Read port 1: identical behaviour to port 0 on its own address.
    always_comb begin
        if (read2_addr <= MAX_BASE) begin
            for (int unsigned b = 0; b < 4; b++)
                read2_data[8*b +: 8] = mem[read2_addr + 32'(b)];
        end else begin
            read2_data = 32'd0;
        end
    end

    // Write admission. byte_in_range[i] says whether write_addr + i is
    // a valid index into `mem`. The sum is formed in 33 bits so that an
    // address close to 0xFFFF_FFFF cannot wrap around and look valid.
    logic [3:0] byte_in_range;
    logic       write_admit;

    genvar lane;
    generate
        for (lane = 0; lane < 4; lane = lane + 1) begin : g_byte_admit
            assign byte_in_range[lane] =
                ({1'b0, write_addr} + 33'(lane)) < 33'(BYTES);
        end
    endgenerate

    // All-or-nothing: the write is admitted only when no enabled lane
    // falls outside `mem`; a single offending lane drops the whole
    // write, so there are never partial writes at the boundary.
    assign write_admit = write_en
                      && ((write_be & ~byte_in_range) == 4'b0000);

    // Commit. Each enabled lane does a read-modify-write of its byte:
    // bits selected by the mask take write_data, the rest keep the
    // stored value. The right-hand side samples `mem` at the clock
    // edge, so back-to-back masked writes to the same byte on
    // consecutive clocks accumulate correctly.
    always_ff @(posedge clk) begin
        if (rst_n && write_admit) begin
            for (int unsigned b = 0; b < 4; b++) begin
                if (write_be[b]) begin
                    mem[write_addr + 32'(b)] <=
                          (mem[write_addr + 32'(b)] & ~write_mask[8*b +: 8])
                        | (write_data[8*b +: 8]     &  write_mask[8*b +: 8]);
                end
            end
        end
    end

endmodule : vram_stub