ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
213 lines
12 KiB
Systemverilog
213 lines
12 KiB
Systemverilog
// ============================================================================
// gs_tile_reload.sv  (Ch323 Brick 2 — tile color+Z reload staging engine)
//
// The reload counterpart to the GS tile-flush writers, and a DIRECT structural
// clone of the silicon-proven gs_texture_cache (Ch322): an emif_clk fill FSM that
// reads a tile's worth of color+Z from FPGA-private LPDDR4B into on-chip staging
// RAMs, plus a design_clk serve port that returns one (color,Z) pair per tile index
// at the existing 1-cycle latency. gs_stub's TP_RELOAD phase sweeps the serve port
// and writes the tile color/Z RAMs before rendering. Same CDC shape as
// gs_texture_cache (one-shot warm fill, fill_done 2-FF synced into the serve
// clock) — NOT a new CDC.
//
// SEPARATE LPDDR bases (Codex): COLOR_BASE (the color framebuffer) and Z_BASE (the
// Z-backing region) are distinct. A 16x16 tile lives at FB stride STRIDE_BYTES per
// row (FBW*64*4 = 256 for FBW=1), so the fill reads ROW_BEATS 256-bit beats per row
// from each base — sparse/strided, exactly like the Ch322 texture (which read a
// 64-texel-stride region). Single-beat reads only (arlen=0, the only proven EMIF
// pattern).
//
// Counters (Codex): color_beats, z_beats, rd_errs — all distinct.
// ============================================================================
|
|
`timescale 1ns/1ps
|
|
|
|
// gs_tile_reload — tile color+Z reload staging engine.
//
// Fill side (axi_clk): on each rising edge of the synchronized reload_start strobe, reads
// TILE_H rows of color and Z (ROW_BEATS single-beat 256-bit AXI reads per row, per plane)
// from COLOR_BASE/Z_BASE + reload_base and unpacks them, one 32-bit word per clock, into
// two on-chip staging RAMs.
// Serve side (serve_clk): 1-cycle registered read of (color, Z) at raddr, plus a
// reload_ready gate that drops as soon as reload_start is asserted and re-raises only when
// the fill started by that strobe has completed.
module gs_tile_reload #(
    parameter [29:0] COLOR_BASE   = 30'd0,          // LPDDR byte base of the color framebuffer
    parameter [29:0] Z_BASE       = 30'h0010_0000,  // LPDDR byte base of the Z-backing (DISTINCT)
    parameter int    TILE_W       = 16,             // 32-bit words per tile row (must equal ROW_BEATS*8)
    parameter int    TILE_H       = 16,             // rows per tile
    parameter int    STRIDE_BYTES = 256,            // FB row stride (FBW*64 px * 4 B = 256 for FBW=1)
    parameter int    ROW_BEATS    = 2,              // 16 words/row * 4 B / 32 B = 2 single-beat reads
    parameter int    COLOR_W      = 32
)(
    // ---- AXI read clock domain (emif_clk) — fill side ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    input  logic         reload_start,  // STROBE (gs/serve domain, CDC-synced): each RISING edge (re)fills
    // Ch324 — RUNTIME per-tile byte offset into the raster LPDDR framebuffer. Latched at the fill
    // arm (fs_edge) so it is stable for the whole fill. = ((tile_oy*(FBW*64)) + tile_ox)*4, the SAME
    // formula the flush side uses, so reload gathers exactly the tile the spill wrote. 0 = origin
    // tile (byte-identical to the Ch323 single-tile path). Quasi-static: gs_stub holds the current
    // tile constant across TP_RELOAD, so sampling it at the synced strobe needs no extra CDC.
    input  logic [29:0]  reload_base,
    output logic         reload_done,   // tile fully resident (until the next fill arm)
    // Per-fill counters: all three are cleared at every fill arm and then count up during
    // that fill, so after a clean default-geometry fill color_beats == z_beats == 32.
    output logic [31:0]  color_beats,   // color beats read during the current/last fill
    output logic [31:0]  z_beats,       // Z beats read during the current/last fill
    output logic [31:0]  rd_errs,       // non-OKAY read responses during the current/last fill

    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,         // unused: every read is a single beat (arlen=0)
    input  logic         rvalid,
    output logic         rready,

    // ---- serve clock domain (design_clk) — gs_stub TP_RELOAD reads this ----
    input  logic               serve_clk,
    input  logic [7:0]         raddr,        // tile index (row*16 + col), 0..255
    output logic [COLOR_W-1:0] color_o,      // 1-cycle REGISTERED color for raddr
    output logic [31:0]        z_o,          // 1-cycle REGISTERED Z for raddr
    output logic               reload_ready  // reload_done synced into serve_clk (TP_RELOAD ready gate)
);

    localparam int    N_ENTRIES  = TILE_W*TILE_H;  // 256
    localparam int    N_ROWS     = TILE_H;         // 16
    localparam int    WORDS_ROW  = TILE_W;         // 16 words/row
    localparam int    BEAT_WORDS = 8;              // 32-bit words in one 256-bit beat
    localparam [29:0] BEAT_BYTES = 30'd32;         // byte step between consecutive beats (matches arsize)

    // Simulation-only geometry check. The unpack below assumes each row is exactly
    // ROW_BEATS full beats and that every tile index fits the 8-bit raddr port; a
    // mismatching override would otherwise misfill the staging RAMs silently.
    // synthesis translate_off
    initial begin
        if (ROW_BEATS*BEAT_WORDS != TILE_W)
            $error("gs_tile_reload: ROW_BEATS*8 (%0d) must equal TILE_W (%0d)",
                   ROW_BEATS*BEAT_WORDS, TILE_W);
        if (N_ENTRIES > 256)
            $error("gs_tile_reload: TILE_W*TILE_H (%0d) exceeds the 8-bit raddr range", N_ENTRIES);
    end
    // synthesis translate_on

    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd6;    // distinct: writer=0/rd-probe=1/fcache=2/linebuf=3/texfill=4/wr-probe=5/tile-reload=6
    assign arlen   = 8'd0;    // single-beat (only proven EMIF read pattern)
    assign arsize  = 3'b101;  // 32 bytes

    // On-chip staging RAMs: written by the fill FSM (axi_clk), read by gs_stub (serve_clk).
    // One-shot warm fill => static during reads => no read/write CDC hazard (gs_texture_cache pattern).
    logic [COLOR_W-1:0] color_ram [0:N_ENTRIES-1];
    logic [31:0]        z_ram     [0:N_ENTRIES-1];

    // ================= fill side (axi_clk) =================
    // For each of N_ROWS rows, read ROW_BEATS color beats then ROW_BEATS Z beats. Each 256-bit
    // beat = 8 words; WORDS_ROW=16 spans ROW_BEATS=2 beats. Store the row's 16 words into the
    // staging RAM at indices row*16 + (0..15).
    typedef enum logic [2:0] { R_IDLE, R_C_AR, R_C_R, R_C_W, R_Z_AR, R_Z_R, R_Z_W, R_DONE } rstate_t;
    rstate_t                    rst_q;    // fill FSM state (not a reset signal)
    logic [$clog2(N_ROWS):0]    row;
    logic [$clog2(ROW_BEATS):0] beat;
    logic [2:0]                 lane;     // serialized unpack lane 0..7 — ONE RAM write/cycle (M20K, not an 8-wide reg file)
    logic [255:0]               beat_q;   // latched 256-bit beat, drained one 32-bit lane per cycle
    logic [29:0]                base_q;   // Ch324 — per-tile byte offset latched at fill arm (stable across the fill)
    logic [2:0]                 fs_sync;  // reload_start sampled into axi_clk: [1:0] synchronizer, [2] edge history

    // reload_start is a STROBE (gs_stub pulses it once per tile reload): trigger on the
    // RISING edge only — one pulse => exactly one fill. (Was an any-edge toggle, which made
    // a pulse trigger TWO fills; harmless but wasteful and confusing.)
    // NOTE(review): the strobe is sampled directly by axi_clk, so a pulse narrower than the
    // axi_clk sampling window can be missed — confirm the gs_stub pulse width / clock ratio
    // guarantees capture. Edges that arrive while a fill is in flight are ignored by the FSM.
    wire fs_edge = fs_sync[1] & ~fs_sync[2];

    // Byte address of row r within the plane whose (tile-adjusted) base is `base`.
    function automatic [29:0] row_base(input [29:0] base, input int r);
        row_base = base + r*STRIDE_BYTES;
    endfunction

    // SINGLE write port per RAM (one index, one data, per clock) so Quartus infers M20K
    // instead of the 8-wide register file the old parallel beat-unpack forced (~8.7K ALMs).
    // wa = row*WORDS_ROW + beat*8 + lane. Uses the CURRENT row/beat/lane; the lane==7 branch
    // updates row/beat non-blockingly, so this cycle's write still targets the right entry.
    // The full-width counters are used (no $clog2 part-selects): whenever a write happens
    // row <= N_ROWS-1 and beat <= ROW_BEATS-1, so the value is unchanged for the default
    // geometry, and ROW_BEATS == 1 / non-power-of-two sizes no longer produce an
    // out-of-range or truncating slice.
    wire [$clog2(N_ENTRIES)-1:0] wa = row*WORDS_ROW + beat*BEAT_WORDS + lane;

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            rst_q       <= R_IDLE;
            araddr      <= '0;
            arvalid     <= 1'b0;
            rready      <= 1'b0;
            row         <= '0;
            beat        <= '0;
            lane        <= '0;
            reload_done <= 1'b0;
            base_q      <= 30'd0;
            color_beats <= 32'd0;
            z_beats     <= 32'd0;
            rd_errs     <= 32'd0;
            fs_sync     <= 3'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], reload_start};
            case (rst_q)
                // Idle / tile resident: a synchronized rising edge arms a fresh fill.
                R_IDLE, R_DONE: begin
                    if (fs_edge) begin
                        reload_done <= 1'b0;
                        color_beats <= 32'd0;
                        z_beats     <= 32'd0;
                        rd_errs     <= 32'd0;
                        row         <= '0;
                        beat        <= '0;
                        lane        <= '0;
                        base_q      <= reload_base;               // latch this tile's offset for the whole fill
                        araddr      <= COLOR_BASE + reload_base;  // row 0 color of THIS tile
                        arvalid     <= 1'b1;
                        rst_q       <= R_C_AR;
                    end
                end

                // Color address phase: hold arvalid until the slave accepts it.
                R_C_AR: if (arready) begin
                    arvalid <= 1'b0;
                    rready  <= 1'b1;
                    rst_q   <= R_C_R;
                end

                // Color data phase: latch the beat; drain it serially in R_C_W.
                R_C_R: if (rvalid) begin
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready      <= 1'b0;
                    color_beats <= color_beats + 32'd1;
                    lane        <= '0;
                    rst_q       <= R_C_W;
                end

                // 8 cycles: one 32-bit lane -> color_ram per clock.
                R_C_W: begin
                    color_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == BEAT_WORDS-1) begin
                        if (beat == ROW_BEATS-1) begin  // color row done -> Z row
                            beat    <= '0;
                            araddr  <= row_base(Z_BASE + base_q, row);
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end else begin                  // next color beat of the same row
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + BEAT_BYTES;
                            arvalid <= 1'b1;
                            rst_q   <= R_C_AR;
                        end
                    end else begin
                        lane <= lane + 1'b1;
                    end
                end

                // Z address phase.
                R_Z_AR: if (arready) begin
                    arvalid <= 1'b0;
                    rready  <= 1'b1;
                    rst_q   <= R_Z_R;
                end

                // Z data phase: latch the beat; drain it serially in R_Z_W.
                R_Z_R: if (rvalid) begin
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready  <= 1'b0;
                    z_beats <= z_beats + 32'd1;
                    lane    <= '0;
                    rst_q   <= R_Z_W;
                end

                // 8 cycles: one 32-bit lane -> z_ram per clock.
                R_Z_W: begin
                    z_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == BEAT_WORDS-1) begin
                        if (beat == ROW_BEATS-1) begin  // Z row done -> next row (or finish).
                            // reload_done stays LOW until THIS final-row final-Z-lane write.
                            if (row == N_ROWS-1) begin
                                reload_done <= 1'b1;
                                rst_q       <= R_DONE;
                            end else begin
                                row     <= row + 1'b1;
                                beat    <= '0;
                                araddr  <= row_base(COLOR_BASE + base_q, row + 1);
                                arvalid <= 1'b1;
                                rst_q   <= R_C_AR;
                            end
                        end else begin                  // next Z beat of the same row
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + BEAT_BYTES;
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end
                    end else begin
                        lane <= lane + 1'b1;
                    end
                end

                default: rst_q <= R_IDLE;
            endcase
        end
    end

    // ================= serve side (serve_clk) =================
    // 1-cycle REGISTERED read, identical timing to the tile RAM / vram read2.
    always_ff @(posedge serve_clk) begin
        color_o <= color_ram[raddr];
        z_o     <= z_ram[raddr];
    end

    // reload_ready handshake (Ch323 fix): a fresh reload_start MUST drop ready immediately,
    // and ready re-raises only when THIS fill completes. Without this, a back-to-back reload
    // (two tile batches) sees ready still high from the PREVIOUS fill and gs_stub sweeps the
    // stale (pre-fill) z_ram before the new fill populates it — the reloaded Z is lost (the
    // board's "region A wrong color" bug; reproduced in tb_gs_tile_spill_lpddr). reload_start
    // is in the serve_clk (design) domain; reload_done is edge-detected after CDC.
    // These flops have no reset term: they rely on their declaration initializers, and
    // axi_rst_n does not clear ready_q.
    logic [1:0] done_sync = 2'b00;
    logic       ready_q   = 1'b0;
    wire        done_rise = done_sync[0] & ~done_sync[1];

    always_ff @(posedge serve_clk) begin
        done_sync <= {done_sync[0], reload_done};
        if (reload_start)   ready_q <= 1'b0;  // new fill armed -> not ready
        else if (done_rise) ready_q <= 1'b1;  // this fill completed
    end

    // COMBINATIONALLY mask ready low while reload_start is asserted: gs_stub pulses
    // reload_start and checks ready in the SAME cycle, so the registered clear above lands
    // one cycle too late — without the mask gs_stub sees the PREVIOUS fill's stale ready=1
    // and sweeps before this fill populates z_ram (the region-A-wrong-color bug).
    assign reload_ready = ready_q & ~reload_start;

endmodule
|