retroDE_ps2/rtl/gif_gs/gs_tile_reload.sv

// ============================================================================
// gs_tile_reload.sv  (Ch323 Brick 2 — tile color+Z reload staging engine)
//
// The reload counterpart to the GS tile-flush writers, and a DIRECT structural
// clone of the silicon-proven gs_texture_cache (Ch322): an emif_clk fill FSM that
// reads a tile's worth of color+Z from FPGA-private LPDDR4B into on-chip staging
// RAMs, plus a design_clk serve port that returns one (color,Z) per tile index at
// the existing 1-cycle latency. gs_stub's TP_RELOAD phase sweeps the serve port and
// writes the tile color/Z RAMs before rendering. Same CDC shape as gs_texture_cache
// (one-shot warm fill, fill_done 2-FF synced into the serve clock) — NOT a new CDC.
//
// SEPARATE LPDDR bases (Codex): COLOR_BASE (the color framebuffer) and Z_BASE (the
// Z-backing region) are distinct. A 16x16 tile lives at FB stride STRIDE_BYTES per
// row (FBW*64*4 = 256 for FBW=1), so the fill reads ROW_BEATS 256-bit beats per row
// from each base — sparse/strided, exactly like the Ch322 texture (which read a
// 64-texel-stride region). Single-beat reads (arlen=0, the only proven EMIF pattern).
//
// Counters (Codex): color_beats, z_beats, rd_errs — all distinct.
// ============================================================================
`timescale 1ns/1ps

// gs_tile_reload — fills two on-chip staging RAMs (color, Z) for one tile from an AXI4 read
// port, then serves them back one (color, Z) pair per index with a 1-cycle registered read.
//
// Fill side (axi_clk): a rising edge of the synchronized reload_start arms one fill. For each
// of TILE_H rows the FSM issues ROW_BEATS single-beat 256-bit reads from COLOR_BASE, then
// ROW_BEATS from Z_BASE, unpacking each beat into the staging RAM one 32-bit lane per clock.
// Serve side (serve_clk): registered RAM read plus the reload_ready gate.
//
// Parameter constraints implied by the write-address expression (wa) below:
//   * ROW_BEATS*8 must equal TILE_W, otherwise rows overlap or leave gaps in the staging RAM.
//   * ROW_BEATS must be >= 2: with ROW_BEATS == 1, $clog2(ROW_BEATS) is 0 and the
//     beat[$clog2(ROW_BEATS)-1:0] slice becomes beat[-1:0].
module gs_tile_reload #(
    parameter [29:0] COLOR_BASE   = 30'd0,        // LPDDR byte base of the color framebuffer
    parameter [29:0] Z_BASE       = 30'h0010_0000,// LPDDR byte base of the Z-backing (DISTINCT)
    parameter int    TILE_W       = 16,
    parameter int    TILE_H       = 16,
    parameter int    STRIDE_BYTES = 256,          // FB row stride (FBW*64 px * 4 B = 256 for FBW=1)
    parameter int    ROW_BEATS    = 2,            // 16 words/row * 4 B / 32 B = 2 single-beat reads
    parameter int    COLOR_W      = 32
)(
    // ---- AXI read clock domain (emif_clk) — fill side ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,     // active-low, sampled synchronously on axi_clk
    // STROBE driven from the serve_clk domain. It is synchronized into axi_clk here (fs_sync)
    // and each RISING edge seen while the FSM is in R_IDLE/R_DONE arms one fill; edges seen
    // while a fill is in flight are ignored.
    // NOTE(review): the synchronizer samples the level, so a single serve_clk-cycle pulse is
    // only guaranteed to be seen if it spans at least one axi_clk period — confirm the clock
    // ratio (or stretch the strobe) at the integration level.
    input  logic         reload_start,
    // Ch324 — RUNTIME per-tile byte offset into the raster LPDDR framebuffer. Latched at the fill
    // arm (fs_edge) so it is stable for the whole fill. = ((tile_oy*(FBW*64)) + tile_ox)*4, the SAME
    // formula the flush side uses, so reload gathers exactly the tile the spill wrote. 0 = origin
    // tile (byte-identical to the Ch323 single-tile path). Quasi-static: gs_stub holds the current
    // tile constant across TP_RELOAD, so sampling it at the synced strobe needs no extra CDC.
    input  logic [29:0]  reload_base,
    output logic         reload_done,   // tile fully resident (until the next fill arm)
    // Diagnostic counters. All three are cleared at every fill arm, so they describe the
    // current / most recent fill only (they are NOT cumulative across fills).
    output logic [31:0]  color_beats,   // color beats read during this fill
    output logic [31:0]  z_beats,       // Z beats read during this fill
    output logic [31:0]  rd_errs,       // non-OKAY read responses during this fill

    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,         // unused: every read is a single beat (arlen = 0)
    input  logic         rvalid,
    output logic         rready,

    // ---- serve clock domain (design_clk) — gs_stub TP_RELOAD reads this ----
    input  logic         serve_clk,
    input  logic [7:0]   raddr,         // tile index (row*16 + col), 0..255
    output logic [COLOR_W-1:0] color_o, // 1-cycle REGISTERED color for raddr
    output logic [31:0]  z_o,           // 1-cycle REGISTERED Z for raddr
    output logic         reload_ready   // reload_done synced into serve_clk (TP_RELOAD ready gate)
);
    localparam int N_ENTRIES = TILE_W*TILE_H;     // 256
    localparam int N_ROWS    = TILE_H;            // 16
    localparam int WORDS_ROW = TILE_W;            // 16 words/row

    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd6;    // distinct: writer=0/rd-probe=1/fcache=2/linebuf=3/texfill=4/wr-probe=5/tile-reload=6
    assign arlen   = 8'd0;    // single-beat (only proven EMIF read pattern)
    assign arsize  = 3'b101;  // 32 bytes

    // On-chip staging RAMs: written by the fill FSM (axi_clk), read by gs_stub (serve_clk).
    // One-shot warm fill => static during reads => no read/write CDC hazard (gs_texture_cache pattern).
    logic [COLOR_W-1:0] color_ram [0:N_ENTRIES-1];
    logic [31:0]        z_ram     [0:N_ENTRIES-1];

    // ================= fill side (axi_clk) =================
    // For each of N_ROWS rows, read ROW_BEATS color beats then ROW_BEATS Z beats. Each 256-bit
    // beat = 8 words; WORDS_ROW=16 spans ROW_BEATS=2 beats. Store the row's 16 words into the
    // staging RAM at indices row*16 + (0..15).
    //
    // State roles: *_AR = address phase (arvalid held until arready), *_R = wait for the data
    // beat and latch it, *_W = drain the latched beat into the RAM over 8 clocks.
    typedef enum logic [2:0] { R_IDLE, R_C_AR, R_C_R, R_C_W, R_Z_AR, R_Z_R, R_Z_W, R_DONE } rstate_t;
    rstate_t rst_q;
    logic [$clog2(N_ROWS):0]   row;
    logic [$clog2(ROW_BEATS):0] beat;
    logic [2:0]                 lane;   // serialized unpack lane 0..7 — ONE RAM write/cycle (M20K, not an 8-wide reg file)
    logic [255:0]               beat_q; // latched 256-bit beat, drained one 32-bit lane per cycle
    logic [29:0]                base_q; // Ch324 — per-tile byte offset latched at fill arm (stable across the fill)
    logic [2:0] fs_sync;                // reload_start shifted through axi_clk; [1]/[2] are the settled samples
    // reload_start is a STROBE (gs_stub pulses it once per tile reload): trigger on the
    // RISING edge only — one pulse => exactly one fill. (Was an any-edge toggle, which made
    // a pulse trigger TWO fills; harmless but wasteful and confusing.)
    wire        fs_edge = fs_sync[1] & ~fs_sync[2];

    // Byte address of row r within the region starting at `base` (rows are STRIDE_BYTES apart).
    function automatic [29:0] row_base(input [29:0] base, input int r);
        row_base = base + r*STRIDE_BYTES;
    endfunction

    // SINGLE write port per RAM (one index, one data, per clock) so Quartus infers M20K
    // instead of the 8-wide register file the old parallel beat-unpack forced (~8.7K ALMs).
    // wa = row*WORDS_ROW + beat*8 + lane. Uses the CURRENT row/beat/lane; the lane==7 branch
    // updates row/beat non-blockingly, so this cycle's write still targets the right entry.
    wire [$clog2(N_ENTRIES)-1:0] wa = row[$clog2(N_ROWS)-1:0]*WORDS_ROW
                                    + beat[$clog2(ROW_BEATS)-1:0]*8 + lane;

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            rst_q <= R_IDLE; araddr <= '0; arvalid <= 1'b0; rready <= 1'b0;
            row <= '0; beat <= '0; lane <= '0; reload_done <= 1'b0; base_q <= 30'd0;
            color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0; fs_sync <= 3'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], reload_start};
            case (rst_q)
                // A fill can only be armed from rest; R_DONE keeps reload_done high until re-armed.
                R_IDLE, R_DONE: begin
                    if (fs_edge) begin
                        reload_done <= 1'b0; color_beats <= 32'd0; z_beats <= 32'd0; rd_errs <= 32'd0;
                        row <= '0; beat <= '0; lane <= '0;
                        base_q  <= reload_base;              // latch this tile's offset for the whole fill
                        araddr  <= COLOR_BASE + reload_base; // row 0 color of THIS tile
                        arvalid <= 1'b1;
                        rst_q   <= R_C_AR;
                    end
                end
                R_C_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_C_R; end
                R_C_R:  if (rvalid) begin            // latch the beat; drain it serially in R_C_W
                    beat_q <= rdata;
                    // An error response is only counted; the beat's data is still stored.
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready <= 1'b0; color_beats <= color_beats + 32'd1;
                    lane   <= '0;
                    rst_q  <= R_C_W;
                end
                R_C_W: begin                         // 8 cycles: one 32-bit lane -> color_ram per clock
                    color_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == 3'd7) begin
                        if (beat == ROW_BEATS-1) begin   // color row done -> Z row
                            beat    <= '0;
                            araddr  <= row_base(Z_BASE + base_q, row);
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end else begin
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;  // next 32-byte beat of the same row
                            arvalid <= 1'b1;
                            rst_q   <= R_C_AR;
                        end
                    end else lane <= lane + 1'b1;
                end
                R_Z_AR: if (arready) begin arvalid <= 1'b0; rready <= 1'b1; rst_q <= R_Z_R; end
                R_Z_R:  if (rvalid) begin
                    beat_q <= rdata;
                    if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                    rready <= 1'b0; z_beats <= z_beats + 32'd1;
                    lane   <= '0;
                    rst_q  <= R_Z_W;
                end
                R_Z_W: begin                         // 8 cycles: one 32-bit lane -> z_ram per clock
                    z_ram[wa] <= beat_q[lane*32 +: 32];
                    if (lane == 3'd7) begin
                        if (beat == ROW_BEATS-1) begin   // Z row done -> next row (or finish).
                            // reload_done stays LOW until THIS final-row final-Z-lane write.
                            if (row == N_ROWS-1) begin
                                reload_done <= 1'b1;
                                rst_q       <= R_DONE;
                            end else begin
                                row     <= row + 1'b1;
                                beat    <= '0;
                                araddr  <= row_base(COLOR_BASE + base_q, row + 1);
                                arvalid <= 1'b1;
                                rst_q   <= R_C_AR;
                            end
                        end else begin
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;  // next 32-byte beat of the same row
                            arvalid <= 1'b1;
                            rst_q   <= R_Z_AR;
                        end
                    end else lane <= lane + 1'b1;
                end
                default: rst_q <= R_IDLE;
            endcase
        end
    end

    // ================= serve side (serve_clk) =================
    // 1-cycle REGISTERED read, identical timing to the tile RAM / vram read2.
    always_ff @(posedge serve_clk) begin
        color_o <= color_ram[raddr];
        z_o     <= z_ram[raddr];
    end
    // reload_ready handshake (Ch323 fix): a fresh reload_start MUST drop ready immediately,
    // and ready re-raises only when THIS fill completes. Without this, a back-to-back reload
    // (two tile batches) sees ready still high from the PREVIOUS fill and gs_stub sweeps the
    // stale (pre-fill) z_ram before the new fill populates it — the reloaded Z is lost (the
    // board's "region A wrong color" bug; reproduced in tb_gs_tile_spill_lpddr). reload_start
    // is in the serve_clk (design) domain; reload_done is edge-detected after CDC.
    //
    // CDC: reload_done is registered in axi_clk. done_sync[0] is the raw capture flop and may
    // go metastable, so it must never feed logic; the rising edge is detected on the settled
    // stages [1]/[2] only — the same 3-stage shape fs_sync uses for the opposite crossing.
    logic [2:0] done_sync = 3'b000;
    logic       ready_q   = 1'b0;
    wire        done_rise = done_sync[1] & ~done_sync[2];
    always_ff @(posedge serve_clk) begin
        done_sync <= {done_sync[1:0], reload_done};
        if      (reload_start) ready_q <= 1'b0;   // new fill armed -> not ready
        else if (done_rise)    ready_q <= 1'b1;   // this fill completed
    end
    // COMBINATIONALLY mask ready low while reload_start is asserted: gs_stub pulses
    // reload_start and checks ready in the SAME cycle, so the registered clear above lands
    // one cycle too late — without the mask gs_stub sees the PREVIOUS fill's stale ready=1
    // and sweeps before this fill populates z_ram (the region-A-wrong-color bug).
    assign reload_ready = ready_q & ~reload_start;
endmodule