ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
237 lines
13 KiB
Systemverilog
237 lines
13 KiB
Systemverilog
// ============================================================================
// gs_z_flush_writer.sv  (Ch323 Brick 2 — tile color/Z-flush LPDDR writer; PACKED)
//
// Writes a gs_stub tile-flush stream (one 32-bit word per tile pixel on design_clk —
// either the TP_ZFLUSH Z stream or the TP_FLUSH color stream) to an FPGA-private
// LPDDR4B scratch region (emif_clk). Used twice in the de25 top: once for Z, once for
// the 32-bit color spill (the module is generic — it writes the 32-bit `data` at
// BASE + `addr`).
//
// PACKED (Ch323 board fix): the FIRST cut did ONE single-32-bit-lane AXI write PER
// pixel through a strictly-sequential AW->W->B FSM. The tile sweep emits one pixel per
// design_clk (256 back-to-back), but each isolated write pays the full LPDDR round-trip
// latency, so the drain fell far behind the emit rate and the 16-deep async FIFO
// OVERFLOWED within ~16 px — dropping most of the spilled tile (grey-with-specks on
// HDMI, spill_ovf=1). The proven framebuffer writer (gs_lpddr_axi_master) avoids this by
// PACKING pixels into 256-bit beats; this writer now does the same with 32-bit lanes:
//
//   design_clk : PACKER — accumulate 8 consecutive 32-bit pixels of a tile-row into one
//                256-bit (32-byte) beat {block_off, data, strb}, keyed by the 32-byte
//                block address (addr[29:5]). A 16-px tile-row is exactly two 32-byte-
//                aligned blocks, so each beat completes naturally on its 8th px (no
//                dangling partial); a block-address change flushes the in-flight beat.
//                One FIFO push per 8 px => 8x fewer AXI writes => the sequential drain
//                keeps up with the same small FIFO.
//   async FIFO : gray-code CDC, carries {block_off[29:0], data[255:0], strb[31:0]}.
//   emif_clk   : AXI FSM — pop a beat, issue a single-beat INCR write (AWSIZE=5 = 32 B,
//                AWLEN=0, full WSTRB on the populated lanes) at BASE + block_off.
//
// The packed beats land at exactly the offsets gs_tile_reload reads back (row r at
// BASE + r*STRIDE, two 32-byte beats), so the reload side is unchanged.
//
// SEPARATE base (Codex): BASE is distinct from the color FB and the other scratch
// region. A synthesis-off CANARY asserts no beat lands inside the canary-guard regions.
//
// Counters (Codex, distinct per instance): z_write_beats (256-bit beats written),
// z_wr_errs (non-OKAY responses), fifo_overflow (sticky).
//
// NOTE (parity with gs_lpddr_axi_master): assumes the flush stream produces FULL 8-lane
// beats (true for a tile width that is a multiple of 8 — the 16-wide spill tile). A
// trailing partial beat at end-of-stream is NOT flushed.
// ============================================================================

`timescale 1ns/1ps
|
|
|
|
// ----------------------------------------------------------------------------
// gs_z_flush_writer
//
// Purpose
//   Packs a stream of 32-bit words (one per `z_flush_emit` pulse, gs_clk domain)
//   into 256-bit beats, carries them across an asynchronous FIFO, and writes each
//   beat as a single-beat AXI4 INCR write (axi_clk domain) at Z_BASE + block offset.
//
// Structure
//   1. PACKER (gs_clk)      : builds {block_off, data, strb} beats, one FIFO push per
//                             completed (or block-changed) beat.
//   2. FIFO write-side reset: derived from axi_rst_n, deassertion synchronized to gs_clk.
//   3. gs_async_fifo        : clock-domain crossing (module defined outside this file).
//   4. AXI FSM (axi_clk)    : strictly sequential IDLE -> AW -> W -> B per beat.
//
// Counters
//   gs_clk domain  (reset by gs_rst_n)   : dbg_emit_count, dbg_push_count, fifo_overflow.
//   axi_clk domain (reset by axi_rst_n)  : z_write_beats, z_wr_errs (cumulative).
//   axi_clk domain (also cleared by trace_clear): dbg_beat_count, dbg_pop_count,
//                                                 dbg_aw_count, dbg_w_count.
//
// Known limitation (unchanged): a partially filled beat is only pushed when a later
// emit targets a different 32-byte block; a trailing partial beat at end-of-stream
// stays in the packer.
// ----------------------------------------------------------------------------
module gs_z_flush_writer #(
    parameter [29:0] Z_BASE     = 30'h0010_0000, // LPDDR byte base of this scratch region (DISTINCT)
    parameter [29:0] FB_BASE    = 30'd0,         // color framebuffer base  (canary guard)
    parameter int    FB_BYTES   = 32'h0001_0000, // color framebuffer size  (canary guard)
    parameter [29:0] TEX_BASE   = 30'h0020_0000, // other scratch base      (canary guard)
    parameter int    TEX_BYTES  = 32'h0000_8000, // other scratch size      (canary guard)
    parameter int    FIFO_DEPTH = 16             // passed to gs_async_fifo .DEPTH
)(
    // ---- GS / design clock domain: the flush emit stream ----
    input  logic         gs_clk,
    input  logic         gs_rst_n,
    input  logic         enable,         // 1 = accept emits (default off => inert)
    input  logic         z_flush_emit,   // one pulse per tile pixel
    input  logic [31:0]  z_flush_addr,   // scratch-RELATIVE byte offset (pixel_index*4);
                                         // only bits [29:2] are used by the packer
    input  logic [31:0]  z_flush_data,   // 32-bit word for this pixel (Z or color)

    // ---- status (emif_clk domain unless noted) ----
    output logic [31:0]  z_write_beats,  // 256-bit beats written (cumulative)
    output logic [31:0]  z_wr_errs,      // non-OKAY write responses (cumulative)
    output logic         fifo_overflow,  // sticky (gs domain): a beat push was dropped (FIFO full)
    // Pipeline-split counters (Codex): emit/push (GS, reset by gs_rst_n=per-render core reset) and
    // pop/beats (EMIF, reset by trace_clear) localize any spill divergence: healthy = 512/64/64/64;
    // push>64 = packer partial beats; pop/beats>push = FIFO/reset broken; beats!=pop = AXI-FSM bug.
    input  logic         trace_clear,    // resets the EMIF-domain counters (beats/pop/aw/w) per render
                                         // NOTE(review): sampled on axi_clk — confirm the driver is
                                         // synchronous to axi_clk.
    output logic [31:0]  dbg_beat_count, // beats committed (B handshakes) since the last trace_clear
    output logic [31:0]  dbg_emit_count, // GS: enable&&z_flush_emit accepted (per render)
    output logic [31:0]  dbg_push_count, // GS: beats pushed into the FIFO (per render)
    output logic [31:0]  dbg_pop_count,  // EMIF: beats popped from the FIFO (since trace_clear)
    output logic [31:0]  dbg_aw_count,   // EMIF: AW handshakes (since trace_clear)
    output logic [31:0]  dbg_w_count,    // EMIF: W handshakes (since trace_clear)

    // ---- AXI4 write channel to the EMIF user port (emif_clk, 256-bit) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    output logic [29:0]  awaddr,
    output logic [1:0]   awburst,
    output logic [6:0]   awid,
    output logic [7:0]   awlen,
    output logic [2:0]   awsize,
    output logic         awvalid,
    input  logic         awready,
    output logic [255:0] wdata,
    output logic [31:0]  wstrb,
    output logic         wlast,
    output logic         wvalid,
    input  logic         wready,
    input  logic [1:0]   bresp,
    input  logic         bvalid,
    output logic         bready
);

    // ---- constant AXI sideband: every transaction is one 32-byte INCR beat ----
    assign awburst = 2'b01;   // INCR
    assign awid    = 7'd6;    // distinct from FB writer(0)/probes/reload(6 too; arb priority disambiguates)
    assign awlen   = 8'd0;    // single beat
    assign awsize  = 3'b101;  // 32 bytes (256-bit)
    assign bready  = 1'b1;    // B responses are always accepted

    // FIFO payload width: 30 + 256 + 32 bits.
    localparam int PW = 318;  // {block_off[29:0], data[255:0], strb[31:0]}

    // ============================ design_clk PACKER ============================
    // Accumulate 8 consecutive 32-bit pixels into one 256-bit beat keyed by the 32-byte
    // block address; push a COMPLETE beat to the FIFO (one push per 8 px, not per px).
    logic [29:0]   cur_off;    // 32-byte-aligned offset of the beat being assembled
    logic [255:0]  cur_data;   // lanes collected so far
    logic [31:0]   cur_strb;   // byte strobes of the collected lanes (4 bits per lane)
    logic          has_data;   // 1 = cur_* hold a partially filled beat
    logic          fifo_wr;    // registered push request (one gs_clk pulse per beat)
    logic [PW-1:0] fifo_wdata; // registered push payload
    wire           fifo_full, fifo_empty;
    wire  [PW-1:0] fifo_rdata;
    logic          fifo_rd;

    always_ff @(posedge gs_clk or negedge gs_rst_n) begin
        if (!gs_rst_n) begin
            cur_off        <= '0;
            cur_data       <= '0;
            cur_strb       <= '0;
            has_data       <= 1'b0;
            fifo_wr        <= 1'b0;
            fifo_wdata     <= '0;
            fifo_overflow  <= 1'b0;
            dbg_emit_count <= 32'd0;
            dbg_push_count <= 32'd0;
        end else begin
            fifo_wr <= 1'b0;   // default: no push this cycle (overridden below)

            if (enable && z_flush_emit) dbg_emit_count <= dbg_emit_count + 32'd1;
            // Counts only pushes the FIFO actually takes (same gating as the .wr port).
            if (fifo_wr && !fifo_full)  dbg_push_count <= dbg_push_count + 32'd1;

            if (enable && z_flush_emit) begin
                logic [29:0]  block_off;
                logic [2:0]   lane;       // 0..7 (which 32-bit lane)
                logic [255:0] nd;         // next beat data  (current beat + this pixel)
                logic [31:0]  ns;         // next beat strobe
                block_off = {z_flush_addr[29:5], 5'd0};
                lane      = z_flush_addr[4:2];

                if (has_data && (block_off != cur_off)) begin
                    // block changed before the previous beat filled — flush it, restart
                    // the accumulator with this pixel as the only populated lane.
                    fifo_wdata <= {cur_off, cur_data, cur_strb};
                    fifo_wr    <= 1'b1;
                    cur_off    <= block_off;
                    cur_data   <= (256'd0 | (256'(z_flush_data) << ({29'd0, lane} * 32)));
                    cur_strb   <= (32'hF << ({29'd0, lane} * 4));
                    has_data   <= 1'b1;
                end else begin
                    // Same block (or empty accumulator): merge this pixel into its lane.
                    nd = has_data ? cur_data : 256'd0;
                    ns = has_data ? cur_strb : 32'd0;
                    nd[ ({29'd0, lane} * 32) +: 32 ] = z_flush_data;
                    ns[ ({29'd0, lane} * 4)  +: 4  ] = 4'hF;
                    if (&ns) begin
                        // beat complete (all 8 lanes) — flush, beat consumed.
                        fifo_wdata <= {block_off, nd, ns};
                        fifo_wr    <= 1'b1;
                        has_data   <= 1'b0;
                    end else begin
                        cur_off  <= block_off;
                        cur_data <= nd;
                        cur_strb <= ns;
                        has_data <= 1'b1;
                    end
                end
            end

            // overflow witness: a push attempt while the FIFO is full (must stay 0).
            // The beat is dropped in that case because the .wr port is gated by !fifo_full.
            if (fifo_wr && fifo_full) fifo_overflow <= 1'b1;
        end
    end

    // CRITICAL (Ch323 board bug): the async FIFO's two pointers MUST reset together. The
    // packer side uses gs_rst_n (= core reset, which a CORE_CTRL pulse toggles EVERY render);
    // the read side uses axi_rst_n (= EMIF cal, power-on only). If wrst_n followed gs_rst_n,
    // each render's core-reset pulse would reset ONLY the write pointer → gray-code pointer
    // desync → FIFO corruption (garbage data, spurious overflow, writes that never commit).
    // Sim missed it (single reset, both sides together). So reset BOTH FIFO sides from the
    // STABLE axi_rst_n: assert async on axi_rst_n, deassert synchronized into gs_clk.
    reg [1:0] wrst_sync;
    always_ff @(posedge gs_clk or negedge axi_rst_n) begin
        if (!axi_rst_n) wrst_sync <= 2'b00;
        else            wrst_sync <= {wrst_sync[0], 1'b1};
    end
    wire fifo_wrst_n = wrst_sync[1];   // released two gs_clk edges after axi_rst_n rises

    gs_async_fifo #(.WIDTH(PW), .DEPTH(FIFO_DEPTH)) u_fifo (
        .wclk(gs_clk),  .wrst_n(fifo_wrst_n), .wr(fifo_wr && !fifo_full), .wdata(fifo_wdata), .wfull(fifo_full),
        .rclk(axi_clk), .rrst_n(axi_rst_n),   .rd(fifo_rd),               .rdata(fifo_rdata), .rempty(fifo_empty)
    );

    // ============================ emif_clk AXI FSM ============================
    // Payload unpack (mirrors the packer's {block_off, data, strb} concatenation).
    // NOTE(review): the FSM samples fifo_rdata in W_IDLE *before* fifo_rd is pulsed, so it
    // relies on gs_async_fifo presenting the head word whenever rempty is low — confirm
    // against gs_async_fifo.
    wire [29:0]  beat_block = fifo_rdata[PW-1 -: 30];   // block_off[29:0]
    wire [255:0] beat_data  = fifo_rdata[287:32];
    wire [31:0]  beat_strb  = fifo_rdata[31:0];
    wire [29:0]  full_addr  = Z_BASE + beat_block;      // 30-bit add (wraps modulo 2^30)

    typedef enum logic [1:0] { W_IDLE, W_AW, W_W, W_B } wstate_t;
    wstate_t      wst;
    logic [29:0]  lat_addr;   // latched copy of full_addr (not consumed elsewhere in this module)
    logic [255:0] lat_data;   // beat data held from pop until the W phase
    logic [31:0]  lat_strb;   // beat strobes held from pop until the W phase

    always_ff @(posedge axi_clk or negedge axi_rst_n) begin
        if (!axi_rst_n) begin
            wst            <= W_IDLE;
            awaddr         <= '0;
            awvalid        <= 1'b0;
            wdata          <= '0;
            wstrb          <= '0;
            wlast          <= 1'b0;
            wvalid         <= 1'b0;
            fifo_rd        <= 1'b0;
            z_write_beats  <= 32'd0;
            z_wr_errs      <= 32'd0;
            dbg_beat_count <= 32'd0;
            dbg_pop_count  <= 32'd0;
            dbg_aw_count   <= 32'd0;
            dbg_w_count    <= 32'd0;
            lat_addr       <= '0;
            lat_data       <= '0;
            lat_strb       <= '0;
        end else begin
            fifo_rd <= 1'b0;   // default: fifo_rd is a one-cycle pulse

            // Per-render debug counters (see trace_clear handling at the end of this block).
            if (fifo_rd)            dbg_pop_count <= dbg_pop_count + 32'd1;
            if (awvalid && awready) dbg_aw_count  <= dbg_aw_count  + 32'd1;
            if (wvalid  && wready)  dbg_w_count   <= dbg_w_count   + 32'd1;

            case (wst)
                // Take the head beat: latch its payload, pop it, and raise AWVALID.
                W_IDLE: if (!fifo_empty) begin
                    lat_addr <= full_addr;
                    lat_data <= beat_data;
                    lat_strb <= beat_strb;
                    fifo_rd  <= 1'b1;                        // pop this beat
                    awaddr   <= {full_addr[29:5], 5'd0};     // 32-byte aligned
                    awvalid  <= 1'b1;
                    wst      <= W_AW;
                    // Simulation-only canary: flag a beat whose start address falls inside
                    // either guard region.
                    // synthesis translate_off
                    if (((full_addr >= FB_BASE)  && (full_addr < FB_BASE  + FB_BYTES[29:0])) ||
                        ((full_addr >= TEX_BASE) && (full_addr < TEX_BASE + TEX_BYTES[29:0])))
                        $error("gs_z_flush_writer CANARY: beat addr 0x%07x overlaps a canary-guard region", full_addr);
                    // synthesis translate_on
                end

                // Address accepted: drop AWVALID and present the single data beat.
                W_AW: if (awready) begin
                    awvalid <= 1'b0;
                    wdata   <= lat_data;
                    wstrb   <= lat_strb;
                    wlast   <= 1'b1;
                    wvalid  <= 1'b1;
                    wst     <= W_W;
                end

                // Data accepted: drop WVALID/WLAST and wait for the response.
                W_W: if (wready) begin
                    wvalid <= 1'b0;
                    wlast  <= 1'b0;
                    wst    <= W_B;
                end

                // Response: count the beat (and any non-OKAY BRESP), then return to idle.
                W_B: if (bvalid) begin
                    if (bresp != 2'b00) z_wr_errs <= z_wr_errs + 32'd1;
                    z_write_beats  <= z_write_beats  + 32'd1;
                    dbg_beat_count <= dbg_beat_count + 32'd1;
                    wst            <= W_IDLE;
                end

                default: wst <= W_IDLE;
            endcase

            // trace_clear is applied LAST so that, as the final nonblocking assignment to
            // these registers in this block, the clear takes priority over an increment in
            // the same cycle. With the clear placed before the increments, a trace_clear
            // coinciding with a pop / AW / W / B event was overridden by the increment and
            // the counter kept its stale (pre-clear) value + 1.
            if (trace_clear) begin
                dbg_beat_count <= 32'd0;
                dbg_pop_count  <= 32'd0;
                dbg_aw_count   <= 32'd0;
                dbg_w_count    <= 32'd0;
            end
        end
    end

endmodule
|