Files
retroDE_ps2/rtl/gif_gs/gs_z_flush_writer.sv
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

237 lines
13 KiB
Systemverilog

// ============================================================================
// gs_z_flush_writer.sv (Ch323 Brick 2 — tile color/Z-flush LPDDR writer; PACKED)
//
// Writes a gs_stub tile-flush stream (one 32-bit word per tile pixel on design_clk —
// either the TP_ZFLUSH Z stream or the TP_FLUSH color stream) to an FPGA-private
// LPDDR4B scratch region (emif_clk). Used twice in the de25 top: once for Z, once for
// the 32-bit color spill (the module is generic — it writes the 32-bit `data` at
// BASE + `addr`).
//
// PACKED (Ch323 board fix): the FIRST cut did ONE single-32-bit-lane AXI write PER
// pixel through a strictly-sequential AW->W->B FSM. The tile sweep emits one pixel per
// design_clk (256 back-to-back), but each isolated write pays the full LPDDR round-trip
// latency, so the drain fell far behind the emit rate and the 16-deep async FIFO
// OVERFLOWED within ~16 px — dropping most of the spilled tile (grey-with-specks on
// HDMI, spill_ovf=1). The proven framebuffer writer (gs_lpddr_axi_master) avoids this by
// PACKING pixels into 256-bit beats; this writer now does the same with 32-bit lanes:
//
// design_clk : PACKER — accumulate 8 consecutive 32-bit pixels of a tile-row into one
// 256-bit (32-byte) beat {block_off, data, strb}, keyed by the 32-byte
// block address (addr[29:5]). A 16-px tile-row is exactly two 32-byte-
// aligned blocks, so each beat completes naturally on its 8th px (no
// dangling partial); a block-address change flushes the in-flight beat.
// One FIFO push per 8 px => 8x fewer AXI writes => the sequential drain
// keeps up with the same small FIFO.
// async FIFO : gray-code CDC, carries {block_off[29:0], data[255:0], strb[31:0]}.
// emif_clk : AXI FSM — pop a beat, issue a single-beat INCR write (AWSIZE=5 = 32 B,
// AWLEN=0, full WSTRB on the populated lanes) at BASE + block_off.
//
// The packed beats land at exactly the offsets gs_tile_reload reads back (row r at
// BASE + r*STRIDE, two 32-byte beats), so the reload side is unchanged.
//
// SEPARATE base (Codex): BASE is distinct from the color FB and the other scratch
// region. A synthesis-off CANARY asserts no beat lands inside the canary-guard regions.
//
// Counters (Codex, distinct per instance): z_write_beats (256-bit beats written),
// z_wr_errs (non-OKAY responses), fifo_overflow (sticky).
//
// NOTE (parity with gs_lpddr_axi_master): assumes the flush stream produces FULL 8-lane
// beats (true for a tile width that is a multiple of 8 — the 16-wide spill tile). A
// trailing partial beat at end-of-stream is NOT flushed.
// ============================================================================
`timescale 1ns/1ps
// ----------------------------------------------------------------------------
// gs_z_flush_writer
//
// Purpose: take a stream of 32-bit words (one per `z_flush_emit` pulse on gs_clk),
// pack them into 256-bit beats keyed by their 32-byte block offset, carry each
// beat through an async FIFO, and write it on axi_clk as a single-beat AXI write
// at Z_BASE + block offset.
//
// Parameters
//   Z_BASE     : byte base added to every beat's block offset (30-bit add, wraps).
//   FB_BASE / FB_BYTES, TEX_BASE / TEX_BYTES
//              : two address windows used ONLY by the simulation-time canary
//                check in the AXI FSM (inside synthesis translate_off).
//   FIFO_DEPTH : DEPTH passed to gs_async_fifo.
//
// Clock/reset domains
//   gs_clk  / gs_rst_n  : packer state, fifo_overflow, dbg_emit_count, dbg_push_count.
//   axi_clk / axi_rst_n : AXI FSM, z_write_beats, z_wr_errs, dbg_beat/pop/aw/w counts,
//                         and BOTH sides of the FIFO reset (write side via a
//                         2-flop deassert synchroniser into gs_clk).
//
// Limitations visible in this code
//   * z_flush_addr[31:30] and z_flush_addr[1:0] are ignored.
//   * A partially filled beat is pushed only when a later emit targets a different
//     32-byte block; nothing flushes a trailing partial beat at end of stream.
//   * One AXI write is outstanding at a time (AW -> W -> B strictly in sequence).
// ----------------------------------------------------------------------------
module gs_z_flush_writer #(
    parameter [29:0] Z_BASE     = 30'h0010_0000, // LPDDR byte base of this scratch region (DISTINCT)
    parameter [29:0] FB_BASE    = 30'd0,         // color framebuffer base (canary guard)
    parameter int    FB_BYTES   = 32'h0001_0000, // color framebuffer size (canary guard)
    parameter [29:0] TEX_BASE   = 30'h0020_0000, // other scratch base (canary guard)
    parameter int    TEX_BYTES  = 32'h0000_8000, // other scratch size (canary guard)
    parameter int    FIFO_DEPTH = 16
)(
    // ---- GS / design clock domain: the flush emit stream ----
    input  logic         gs_clk,
    input  logic         gs_rst_n,
    input  logic         enable,         // 1 = accept emits (default off => inert)
    input  logic         z_flush_emit,   // one pulse per tile pixel
    input  logic [31:0]  z_flush_addr,   // scratch-RELATIVE byte offset (pixel_index*4)
    input  logic [31:0]  z_flush_data,   // 32-bit word for this pixel (Z or color)
    // ---- status (emif_clk domain unless noted) ----
    output logic [31:0]  z_write_beats,  // 256-bit beats written (cumulative)
    output logic [31:0]  z_wr_errs,      // non-OKAY write responses (cumulative)
    output logic         fifo_overflow,  // sticky (gs domain): a push was attempted while the FIFO was full
    // Pipeline-split counters (Codex): emit/push (GS, reset by gs_rst_n=per-render core reset) and
    // pop/beats (EMIF, reset by trace_clear) localize any spill divergence: healthy = 512/64/64/64;
    // push>64 = packer partial beats; pop/beats>push = FIFO/reset broken; beats!=pop = AXI-FSM bug.
    input  logic         trace_clear,    // resets the EMIF-domain dbg_* counters; has priority over a same-cycle increment
    output logic [31:0]  dbg_beat_count, // beats committed (B handshakes) since the last trace_clear
    output logic [31:0]  dbg_emit_count, // GS: enable&&z_flush_emit accepted (per render)
    output logic [31:0]  dbg_push_count, // GS: beats pushed into the FIFO (per render)
    output logic [31:0]  dbg_pop_count,  // EMIF: beats popped from the FIFO (since trace_clear)
    output logic [31:0]  dbg_aw_count,   // EMIF: AW handshakes (since trace_clear)
    output logic [31:0]  dbg_w_count,    // EMIF: W handshakes (since trace_clear)
    // ---- AXI4 write channel to the EMIF user port (emif_clk, 256-bit) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    output logic [29:0]  awaddr,
    output logic [1:0]   awburst,
    output logic [6:0]   awid,
    output logic [7:0]   awlen,
    output logic [2:0]   awsize,
    output logic         awvalid,
    input  logic         awready,
    output logic [255:0] wdata,
    output logic [31:0]  wstrb,
    output logic         wlast,
    output logic         wvalid,
    input  logic         wready,
    input  logic [1:0]   bresp,
    input  logic         bvalid,
    output logic         bready
);
    // ---- Constant AXI write attributes: every transaction is one 32-byte INCR beat ----
    assign awburst = 2'b01;  // INCR
    assign awid    = 7'd6;   // fixed write ID 6. Original note: FB writer uses 0; the reload path is
                             // also 6 and arbitration priority disambiguates (not verifiable from here).
    assign awlen   = 8'd0;   // single beat
    assign awsize  = 3'b101; // 32 bytes (256-bit)
    assign bready  = 1'b1;   // a write response is always accepted immediately

    // FIFO payload layout, MSB first: {block_off[29:0], data[255:0], strb[31:0]} = 30+256+32 bits.
    localparam int PW = 318;

    // ============================ design_clk PACKER ============================
    // Accumulate 8 consecutive 32-bit pixels into one 256-bit beat keyed by the 32-byte
    // block address; push a COMPLETE beat to the FIFO (one push per 8 px, not per px).
    logic [29:0]   cur_off;    // 32-byte-aligned block offset of the beat being accumulated
    logic [255:0]  cur_data;   // lanes accumulated so far
    logic [31:0]   cur_strb;   // byte strobes of the lanes accumulated so far (4 bits per lane)
    logic          has_data;   // 1 = cur_* hold a partially filled beat
    logic          fifo_wr;    // registered one-cycle push request
    logic [PW-1:0] fifo_wdata; // registered push payload
    wire           fifo_full, fifo_empty;
    wire [PW-1:0]  fifo_rdata;
    logic          fifo_rd;    // registered one-cycle pop request (driven by the AXI FSM)

    always_ff @(posedge gs_clk or negedge gs_rst_n) begin
        if (!gs_rst_n) begin
            cur_off <= '0; cur_data <= '0; cur_strb <= '0; has_data <= 1'b0;
            fifo_wr <= 1'b0; fifo_wdata <= '0; fifo_overflow <= 1'b0;
            dbg_emit_count <= 32'd0; dbg_push_count <= 32'd0;
        end else begin
            fifo_wr <= 1'b0; // default: no push; overridden below when a beat is flushed

            if (enable && z_flush_emit) dbg_emit_count <= dbg_emit_count + 32'd1;
            // Count only pushes the FIFO actually takes (same gating as the .wr port).
            if (fifo_wr && !fifo_full)  dbg_push_count <= dbg_push_count + 32'd1;

            if (enable && z_flush_emit) begin
                logic [29:0]  block_off;
                logic [2:0]   lane; // 0..7 (which 32-bit lane)
                logic [255:0] nd;   // beat data after merging this pixel
                logic [31:0]  ns;   // beat strobes after merging this pixel
                block_off = {z_flush_addr[29:5], 5'd0};
                lane      = z_flush_addr[4:2];
                if (has_data && (block_off != cur_off)) begin
                    // block changed before the previous beat filled — flush it, restart
                    // the accumulator with just this pixel in its lane.
                    fifo_wdata <= {cur_off, cur_data, cur_strb};
                    fifo_wr    <= 1'b1;
                    cur_off    <= block_off;
                    cur_data   <= (256'd0 | (256'(z_flush_data) << ({29'd0, lane} * 32)));
                    cur_strb   <= (32'hF << ({29'd0, lane} * 4));
                    has_data   <= 1'b1;
                end else begin
                    // Same block (or empty accumulator): merge this pixel into its lane.
                    nd = has_data ? cur_data : 256'd0;
                    ns = has_data ? cur_strb : 32'd0;
                    nd[ ({29'd0, lane} * 32) +: 32 ] = z_flush_data;
                    ns[ ({29'd0, lane} * 4)  +: 4  ] = 4'hF;
                    if (&ns) begin
                        // beat complete (all 8 lanes) — flush, beat consumed.
                        fifo_wdata <= {block_off, nd, ns};
                        fifo_wr    <= 1'b1;
                        has_data   <= 1'b0;
                    end else begin
                        cur_off  <= block_off;
                        cur_data <= nd;
                        cur_strb <= ns;
                        has_data <= 1'b1;
                    end
                end
            end

            // overflow witness: a push attempt while the FIFO is full (must stay 0).
            // The beat in fifo_wdata is dropped in that case (the .wr port is gated by !fifo_full).
            if (fifo_wr && fifo_full) fifo_overflow <= 1'b1;
        end
    end

    // CRITICAL (Ch323 board bug): the async FIFO's two pointers MUST reset together. The
    // packer side uses gs_rst_n (= core reset, which a CORE_CTRL pulse toggles EVERY render);
    // the read side uses axi_rst_n (= EMIF cal, power-on only). If wrst_n followed gs_rst_n,
    // each render's core-reset pulse would reset ONLY the write pointer → gray-code pointer
    // desync → FIFO corruption (garbage data, spurious overflow, writes that never commit).
    // Sim missed it (single reset, both sides together). So reset BOTH FIFO sides from the
    // STABLE axi_rst_n: assert async on axi_rst_n, deassert synchronized into gs_clk.
    reg [1:0] wrst_sync; // 2-flop shift register: releases two gs_clk edges after axi_rst_n rises
    always_ff @(posedge gs_clk or negedge axi_rst_n) begin
        if (!axi_rst_n) wrst_sync <= 2'b00;
        else            wrst_sync <= {wrst_sync[0], 1'b1};
    end
    wire fifo_wrst_n = wrst_sync[1];

    gs_async_fifo #(.WIDTH(PW), .DEPTH(FIFO_DEPTH)) u_fifo (
        .wclk(gs_clk),  .wrst_n(fifo_wrst_n), .wr(fifo_wr && !fifo_full), .wdata(fifo_wdata), .wfull(fifo_full),
        .rclk(axi_clk), .rrst_n(axi_rst_n),   .rd(fifo_rd),               .rdata(fifo_rdata), .rempty(fifo_empty)
    );

    // ============================ emif_clk AXI FSM ============================
    // Unpack the FIFO head word. The FSM samples these in the same cycle it raises fifo_rd,
    // so this assumes gs_async_fifo presents the head entry on rdata whenever !rempty
    // (show-ahead read) — TODO confirm against gs_async_fifo.
    wire [29:0]  beat_block = fifo_rdata[PW-1 -: 30]; // block_off[29:0]
    wire [255:0] beat_data  = fifo_rdata[287:32];
    wire [31:0]  beat_strb  = fifo_rdata[31:0];
    wire [29:0]  full_addr  = Z_BASE + beat_block;

    typedef enum logic [1:0] { W_IDLE, W_AW, W_W, W_B } wstate_t;
    wstate_t      wst;
    logic [29:0]  lat_addr; // latched with the beat; not read elsewhere in this module
    logic [255:0] lat_data; // beat data held from pop until the W phase
    logic [31:0]  lat_strb; // beat strobes held from pop until the W phase

    always_ff @(posedge axi_clk or negedge axi_rst_n) begin
        if (!axi_rst_n) begin
            wst <= W_IDLE; awaddr <= '0; awvalid <= 1'b0; wdata <= '0; wstrb <= '0;
            wlast <= 1'b0; wvalid <= 1'b0; fifo_rd <= 1'b0;
            z_write_beats <= 32'd0; z_wr_errs <= 32'd0;
            dbg_beat_count <= 32'd0;
            dbg_pop_count <= 32'd0; dbg_aw_count <= 32'd0; dbg_w_count <= 32'd0;
            lat_addr <= '0; lat_data <= '0; lat_strb <= '0;
        end else begin
            fifo_rd <= 1'b0; // default: pop request is a one-cycle pulse

            // Debug event counters (zeroed by trace_clear at the end of this block).
            if (fifo_rd)            dbg_pop_count <= dbg_pop_count + 32'd1;
            if (awvalid && awready) dbg_aw_count  <= dbg_aw_count + 32'd1;
            if (wvalid && wready)   dbg_w_count   <= dbg_w_count + 32'd1;

            case (wst)
                // Wait for a beat, latch it, pop it, and start the address phase.
                W_IDLE: if (!fifo_empty) begin
                    lat_addr <= full_addr; lat_data <= beat_data; lat_strb <= beat_strb;
                    fifo_rd  <= 1'b1;                       // pop this beat
                    awaddr   <= {full_addr[29:5], 5'd0};    // 32-byte aligned
                    awvalid  <= 1'b1;
                    wst      <= W_AW;
                    // synthesis translate_off
                    if (((full_addr >= FB_BASE) && (full_addr < FB_BASE + FB_BYTES[29:0])) ||
                        ((full_addr >= TEX_BASE) && (full_addr < TEX_BASE + TEX_BYTES[29:0])))
                        $error("gs_z_flush_writer CANARY: beat addr 0x%07x overlaps a canary-guard region", full_addr);
                    // synthesis translate_on
                end
                // Address accepted: drop AWVALID and present the single data beat.
                W_AW: if (awready) begin
                    awvalid <= 1'b0; wdata <= lat_data;
                    wstrb <= lat_strb; wlast <= 1'b1; wvalid <= 1'b1; wst <= W_W;
                end
                // Data accepted: drop WVALID/WLAST and wait for the response.
                W_W: if (wready) begin wvalid <= 1'b0; wlast <= 1'b0; wst <= W_B; end
                // Response received: count the beat (and any non-OKAY response), go idle.
                W_B: if (bvalid) begin
                    if (bresp != 2'b00) z_wr_errs <= z_wr_errs + 32'd1;
                    z_write_beats  <= z_write_beats + 32'd1;
                    dbg_beat_count <= dbg_beat_count + 32'd1;
                    wst <= W_IDLE;
                end
                default: wst <= W_IDLE;
            endcase

            // trace_clear has PRIORITY over the increments above. It is deliberately the LAST
            // assignment to these registers in this block: with nonblocking assignments the
            // last one wins, so a clear that coincides with a counted event now zeroes the
            // counter instead of being overridden by "old value + 1" (which silently lost the
            // clear and carried the previous render's count forward). The coincident event is
            // not counted in the new window. z_write_beats / z_wr_errs are cumulative and are
            // intentionally not affected by trace_clear.
            if (trace_clear) begin
                dbg_beat_count <= 32'd0;
                dbg_pop_count  <= 32'd0;
                dbg_aw_count   <= 32'd0;
                dbg_w_count    <= 32'd0;
            end
        end
    end
endmodule