Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

213 lines
12 KiB
Systemverilog

// ============================================================================
// gs_tile_reload.sv (Ch323 Brick 2 — tile color+Z reload staging engine)
//
// The reload counterpart to the GS tile-flush writers, and a DIRECT structural
// clone of the silicon-proven gs_texture_cache (Ch322): an emif_clk fill FSM that
// reads a tile's worth of color+Z from FPGA-private LPDDR4B into on-chip staging
// RAMs, plus a design_clk serve port that returns one (color,Z) per tile index at
// the existing 1-cycle latency. gs_stub's TP_RELOAD phase sweeps the serve port and
// writes the tile color/Z RAMs before rendering. Same CDC shape as gs_texture_cache
// (one-shot warm fill, fill_done 2-FF synced into the serve clock) — NOT a new CDC.
//
// SEPARATE LPDDR bases (Codex): COLOR_BASE (the color framebuffer) and Z_BASE (the
// Z-backing region) are distinct. A 16x16 tile lives at FB stride STRIDE_BYTES per
// row (FBW*64*4 = 256 for FBW=1), so the fill reads ROW_BEATS 256-bit beats per row
// from each base — sparse/strided, exactly like the Ch322 texture (which read a
// 64-texel-stride region). Single-beat reads (arlen=0, the only proven EMIF pattern).
//
// Counters (Codex): color_beats, z_beats, rd_errs — all distinct.
// ============================================================================
`timescale 1ns/1ps
// gs_tile_reload — reads one tile of color and Z words from two LPDDR regions over a
// single-beat AXI4 read channel (axi_clk) into two on-chip staging RAMs, and serves them
// back one (color, Z) pair per index with a 1-cycle registered read (serve_clk).
//
// Parameter constraints implied by the fill logic below (not enforced by the code):
//   * each 256-bit beat is unpacked as 8 lanes of 32 bits and the next beat of a row is
//     fetched at araddr + 32, so TILE_W is expected to equal ROW_BEATS * 8;
//   * the write-address slice beat[$clog2(ROW_BEATS)-1:0] requires ROW_BEATS >= 2;
//   * color_ram is written from a 32-bit lane, so COLOR_W is expected to be 32.
module gs_tile_reload #(
    parameter [29:0] COLOR_BASE   = 30'd0,          // LPDDR byte base of the color framebuffer
    parameter [29:0] Z_BASE       = 30'h0010_0000,  // LPDDR byte base of the Z-backing (DISTINCT)
    parameter int    TILE_W       = 16,             // tile width in words (entries per row)
    parameter int    TILE_H       = 16,             // tile height in rows
    parameter int    STRIDE_BYTES = 256,            // FB row stride (FBW*64 px * 4 B = 256 for FBW=1)
    parameter int    ROW_BEATS    = 2,              // 16 words/row * 4 B / 32 B = 2 single-beat reads
    parameter int    COLOR_W      = 32              // width of a staged color entry
)(
    // ---- AXI read clock domain (axi_clk) — fill side ----
    input  logic               axi_clk,
    input  logic               axi_rst_n,      // synchronous, active-low reset of the fill side
    // STROBE driven from the serve_clk domain. It is synchronized into axi_clk inside this
    // module (fs_sync); each RISING edge seen while the FSM is idle/done starts one fill.
    input  logic               reload_start,
    // Ch324 — RUNTIME per-tile byte offset into the raster LPDDR framebuffer. Latched at the fill
    // arm (fs_edge) so it is stable for the whole fill. = ((tile_oy*(FBW*64)) + tile_ox)*4, the SAME
    // formula the flush side uses, so reload gathers exactly the tile the spill wrote. 0 = origin
    // tile (byte-identical to the Ch323 single-tile path). Quasi-static: gs_stub holds the current
    // tile constant across TP_RELOAD, so sampling it at the synced strobe needs no extra CDC.
    input  logic [29:0]        reload_base,
    output logic               reload_done,    // tile fully resident (until the next fill arm)
    // The three counters below are cleared at every fill arm and count within the current fill.
    output logic [31:0]        color_beats,    // color beats read since the last fill arm
    output logic [31:0]        z_beats,        // Z beats read since the last fill arm
    output logic [31:0]        rd_errs,        // non-OKAY read responses since the last fill arm
    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]        araddr,
    output logic [1:0]         arburst,
    output logic [6:0]         arid,
    output logic [7:0]         arlen,
    output logic [2:0]         arsize,
    output logic               arvalid,
    input  logic               arready,
    input  logic [255:0]       rdata,
    input  logic [1:0]         rresp,
    input  logic               rlast,          // unused: every read is a single beat (arlen = 0)
    input  logic               rvalid,
    output logic               rready,
    // ---- serve clock domain (design_clk) — gs_stub TP_RELOAD reads this ----
    input  logic               serve_clk,
    input  logic [7:0]         raddr,          // tile index (row*16 + col), 0..255
    output logic [COLOR_W-1:0] color_o,        // 1-cycle REGISTERED color for raddr
    output logic [31:0]        z_o,            // 1-cycle REGISTERED Z for raddr
    output logic               reload_ready    // reload_done synced into serve_clk (TP_RELOAD ready gate)
);
    localparam int N_ENTRIES = TILE_W*TILE_H;   // 256
    localparam int N_ROWS    = TILE_H;          // 16
    localparam int WORDS_ROW = TILE_W;          // 16 words/row

    // Constant AXI read attributes.
    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd6;    // distinct: writer=0/rd-probe=1/fcache=2/linebuf=3/texfill=4/wr-probe=5/tile-reload=6
    assign arlen   = 8'd0;    // single-beat (only proven EMIF read pattern)
    assign arsize  = 3'b101;  // 32 bytes

    // On-chip staging RAMs: written by the fill FSM (axi_clk), read by gs_stub (serve_clk).
    // One-shot warm fill => static during reads => no read/write CDC hazard (gs_texture_cache pattern).
    logic [COLOR_W-1:0] color_ram [0:N_ENTRIES-1];
    logic [31:0]        z_ram     [0:N_ENTRIES-1];

    // ================= fill side (axi_clk) =================
    // For each of N_ROWS rows, read ROW_BEATS color beats then ROW_BEATS Z beats. Each 256-bit
    // beat = 8 words; WORDS_ROW=16 spans ROW_BEATS=2 beats. Store the row's 16 words into the
    // staging RAM at indices row*16 + (0..15).
    //
    // State roles: *_AR = address phase (wait arready), *_R = data phase (wait rvalid and
    // latch the beat), *_W = 8-cycle drain of the latched beat into the staging RAM.
    typedef enum logic [2:0] { R_IDLE, R_C_AR, R_C_R, R_C_W, R_Z_AR, R_Z_R, R_Z_W, R_DONE } rstate_t;
    rstate_t rst_q;
    logic [$clog2(N_ROWS):0]    row;
    logic [$clog2(ROW_BEATS):0] beat;
    logic [2:0]   lane;     // serialized unpack lane 0..7 — ONE RAM write/cycle (M20K, not an 8-wide reg file)
    logic [255:0] beat_q;   // latched 256-bit beat, drained one 32-bit lane per cycle (no reset needed:
                            // always written in *_R before it is read in *_W)
    logic [29:0]  base_q;   // Ch324 — per-tile byte offset latched at fill arm (stable across the fill)
    logic [2:0]   fs_sync;  // reload_start shifted into axi_clk: [0],[1] synchronize, [2] is edge history

    // reload_start is a STROBE (gs_stub pulses it once per tile reload): trigger on the
    // RISING edge only — one pulse => exactly one fill. (Was an any-edge toggle, which made
    // a pulse trigger TWO fills; harmless but wasteful and confusing.)
    wire fs_edge = fs_sync[1] & ~fs_sync[2];

    // Byte address of row r of a region that starts at `base` (result truncated to 30 bits).
    function automatic [29:0] row_base(input [29:0] base, input int r);
        row_base = base + r*STRIDE_BYTES;
    endfunction

    // SINGLE write port per RAM (one index, one data, per clock) so Quartus infers M20K
    // instead of the 8-wide register file the old parallel beat-unpack forced (~8.7K ALMs).
    // wa = row*WORDS_ROW + beat*8 + lane. Uses the CURRENT row/beat/lane; the lane==7 branch
    // updates row/beat non-blockingly, so this cycle's write still targets the right entry.
    wire [$clog2(N_ENTRIES)-1:0] wa = row[$clog2(N_ROWS)-1:0]*WORDS_ROW
                                    + beat[$clog2(ROW_BEATS)-1:0]*8 + lane;

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            rst_q <= R_IDLE; araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
            row <= '0; beat <= '0; lane <= '0; reload_done <= 1'b0; base_q <= 30'd0;
            color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0; fs_sync <= 3'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], reload_start};
            case (rst_q)
                // A start edge is only honoured here; an edge that arrives while a fill is
                // in flight is not acted on by any other state.
                R_IDLE, R_DONE: begin
                    if (fs_edge) begin
                        reload_done <= 1'b0; color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0;
                        row <= '0; beat <= '0; lane <= '0;
                        base_q  <= reload_base;               // latch this tile's offset for the whole fill
                        araddr  <= COLOR_BASE + reload_base;  // row 0 color of THIS tile
                        arvalid <= 1'b1;
                        rst_q   <= R_C_AR;
                    end
                end
                R_C_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_C_R; end
                R_C_R: if (rvalid) begin   // latch the beat; drain it serially in R_C_W
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready <= 1'b0; color_beats <= color_beats + 32'd1;
                    lane   <= '0;
                    rst_q  <= R_C_W;
                end
                R_C_W: begin               // 8 cycles: one 32-bit lane -> color_ram per clock
                    color_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == 3'd7) begin
                        if (beat == ROW_BEATS-1) begin   // color row done -> Z row
                            beat    <= '0;
                            araddr  <= row_base(Z_BASE + base_q, row);
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end else begin                   // next color beat of the same row
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;
                            arvalid <= 1'b1;
                            rst_q   <= R_C_AR;
                        end
                    end else lane <= lane + 1'b1;
                end
                R_Z_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_Z_R; end
                R_Z_R: if (rvalid) begin
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready <= 1'b0; z_beats <= z_beats + 32'd1;
                    lane   <= '0;
                    rst_q  <= R_Z_W;
                end
                R_Z_W: begin               // 8 cycles: one 32-bit lane -> z_ram per clock
                    z_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == 3'd7) begin
                        if (beat == ROW_BEATS-1) begin   // Z row done -> next row (or finish).
                            // reload_done stays LOW until THIS final-row final-Z-lane write.
                            if (row == N_ROWS-1) begin
                                reload_done <= 1'b1;
                                rst_q       <= R_DONE;
                            end else begin
                                row     <= row + 1'b1;
                                beat    <= '0;
                                araddr  <= row_base(COLOR_BASE + base_q, row + 1);
                                arvalid <= 1'b1;
                                rst_q   <= R_C_AR;
                            end
                        end else begin                   // next Z beat of the same row
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end
                    end else lane <= lane + 1'b1;
                end
                default: rst_q <= R_IDLE;
            endcase
        end
    end

    // ================= serve side (serve_clk) =================
    // 1-cycle REGISTERED read, identical timing to the tile RAM / vram read2.
    always_ff @(posedge serve_clk) begin
        color_o <= color_ram[raddr];
        z_o     <= z_ram[raddr];
    end

    // reload_ready handshake (Ch323 fix): a fresh reload_start MUST drop ready immediately,
    // and ready re-raises only when THIS fill completes. Without this, a back-to-back reload
    // (two tile batches) sees ready still high from the PREVIOUS fill and gs_stub sweeps the
    // stale (pre-fill) z_ram before the new fill populates it — the reloaded Z is lost (the
    // board's "region A wrong color" bug; reproduced in tb_gs_tile_spill_lpddr). reload_start
    // is in the serve_clk (design) domain; reload_done is edge-detected after CDC.
    //
    // CDC fix: reload_done is generated in axi_clk, so the first flop that samples it in
    // serve_clk may go metastable and must not drive logic. done_sync is therefore three
    // stages — [0],[1] form the 2-FF synchronizer, [2] is the edge history — and the rising
    // edge is taken from [1]/[2] only, mirroring fs_sync/fs_edge on the fill side. (The
    // previous 2-bit version decoded the edge from stage [0] directly.) Cost: reload_ready
    // rises one serve_clk later.
    logic [2:0] done_sync = 3'b000;
    logic       ready_q   = 1'b0;
    wire        done_rise = done_sync[1] & ~done_sync[2];
    always_ff @(posedge serve_clk) begin
        done_sync <= {done_sync[1:0], reload_done};
        if (reload_start)   ready_q <= 1'b0;   // new fill armed -> not ready
        else if (done_rise) ready_q <= 1'b1;   // this fill completed
    end

    // COMBINATIONALLY mask ready low while reload_start is asserted: gs_stub pulses
    // reload_start and checks ready in the SAME cycle, so the registered clear above lands
    // one cycle too late — without the mask gs_stub sees the PREVIOUS fill's stale ready=1
    // and sweeps before this fill populates z_ram (the region-A-wrong-color bug).
    assign reload_ready = ready_q & ~reload_start;
endmodule