retroDE_ps2/rtl/gif_gs/vram_bram_stub.sv

// retroDE_ps2 — vram_bram_stub (Ch154)
//
// Hardware-friendly sibling of `vram_stub`. Maps cleanly onto Agilex 5
// M20K block-RAM:
//   - 2048 × 32-bit word storage (instead of 8192 × 8-bit byte
//     storage). Internal width matches Agilex M20K native widths;
//     external addressing stays byte-addressable to keep the same
//     mental model as `vram_stub`.
//   - SYNCHRONOUS reads (registered 32-bit output). One-cycle read
//     latency — `read_valid` is asserted in the same cycle the data
//     is on `read_data`.
//   - BYTE write enable only (4-bit `write_be`). The Ch106 PSMT4
//     per-bit `write_mask` RMW is NOT supported; PSMT4 callers must
//     do the nibble splice on the writer side BEFORE issuing the
//     write here. Ch155+ task to rework gs_stub.raster_pixel_emit
//     and gif_image_xfer_stub for that.
//   - Two synchronous read ports. Quartus implements two
//     independent read addresses by REPLICATING the storage into
//     two banks of M20K blocks rather than using a single native
//     dual-read port — exp_c shows 8 RAM Blocks for 8 KB vs
//     exp_a's 4 RAM Blocks for the same 8 KB single-port shape.
//     Two replicated banks are still vastly cheaper than the
//     65,536 flip-flops the legacy `vram_stub` shape produced;
//     the cost just isn't free.
//
// Empirical motivation (Ch153 forensics):
//   The legacy `vram_stub` shape (byte-addressable + combinational
//   dual reads + per-bit-mask RMW) failed to fit on Agilex 5 — the
//   8 KB array consumed 65,536 dedicated registers and 261,578
//   combinational nodes, dominating Ch152's 331 % ALM overrun.
//   `exp_a_bram_friendly` proved that a 2048 × 32-bit sync-read
//   byte-WE shape maps to 4 RAM Blocks + 0 registers + 46 ALMs.
//
// External port shape vs `vram_stub`:
//   IDENTICAL: clk, rst_n, write_en, write_addr[31:0],
//              write_data[31:0], write_be[3:0], read_addr[31:0],
//              read_data[31:0], read2_addr[31:0], read2_data[31:0].
//   NEW       : read_valid + read2_valid (1-cycle pulse with the data).
//   DROPPED   : write_mask[31:0] (Ch106 per-bit RMW; callers must
//              splice nibbles on the writer side).
//
// Address contract:
//   - Writes: write_addr is a byte address and MUST be 4-byte aligned
//             (low 2 bits 0; word writes only) — a misaligned write
//             is dropped. Each `write_be[i]` independently commits
//             byte `i` of the addressed word. Per-byte non-wrapping
//             admission: an enabled byte beyond `BYTES` drops the
//             WHOLE write (matches vram_stub Ch95 audit).
//   - Reads:  read_addr is a byte address and MUST be 4-byte aligned
//             (low 2 bits 0); a misaligned or out-of-range read
//             completes with `read_valid` low.
//             `read_data` is the 32-bit word at `read_addr / 4`.
//             Byte / halfword extraction is the caller's job
//             (matches Ch141 / Ch142 nibble-readback pattern).
//
// Sim behaviour: time-0 mem is power-on-zero matching real M20K (the
// `// synthesis translate_off` initial block matches vram_stub's
// post-Ch152 pattern).

`timescale 1ns/1ps

module vram_bram_stub #(
    // Total storage size in bytes. MUST be >= 4 and yield a power-of-two
    // word depth (BYTES / 4); checked by the simulation-only initial
    // block below.
    parameter int unsigned BYTES = 8192,

    // Ch251.4 — hardware-demo M20K rescue. When ENABLE_READ2 = 0, the
    // second sync-read port is FEATURE-STRIPPED: `read2_data` ties to
    // 0, `read2_valid` ties to 0, and Quartus no longer infers a
    // separate read port on `mem`. This collapses the storage from
    // two replicated 1W+1R simple-dual-port M20K banks (~410 M20Ks at
    // 512 KiB) to ONE 1W+1R bank (~205 M20Ks) — the savings that get
    // the 512 KiB framebuffer to fit on Agilex 5 (358 M20K budget).
    //
    // Contract caveat: read2 is the PSMT4 RMW old-byte read path. Any
    // build that exercises PSMT4 rasterization MUST keep this `1`. The
    // PSMCT32-only hardware demo (top_psmct32_raster_demo_bram) sets
    // it to `0`; all simulation TBs leave it at the default `1`.
    //
    // This is a SCOPED build profile, not a general fix — see
    // docs/decisions/0006-vram-roadmap.md for the longer-term
    // arbitrated / line-buffered VRAM plan.
    parameter bit         ENABLE_READ2 = 1'b1
) (
    input  logic        clk,
    input  logic        rst_n,        // active-low; synchronous to clk

    // Write port (byte-WE; 4-byte-aligned write_addr).
    input  logic        write_en,
    input  logic [31:0] write_addr,   // byte address, low 2 bits must be 0
    input  logic [31:0] write_data,
    input  logic [3:0]  write_be,     // write_be[i] commits byte lane i

    // Read port 0 (sync read; 4-byte-aligned read_addr).
    input  logic [31:0] read_addr,
    output logic [31:0] read_data,    // valid only when read_valid is 1
    output logic        read_valid,   // aligned with read_data (1-cycle late)

    // Read port 1 (sync read; 4-byte-aligned).
    input  logic [31:0] read2_addr,
    output logic [31:0] read2_data,   // constant 0 when ENABLE_READ2 = 0
    output logic        read2_valid   // constant 0 when ENABLE_READ2 = 0
);

    // WORDS × 32-bit storage (2048 words at the default BYTES = 8192).
    // Index is the WORD index (byte address / 4).
    //
    // Parameter contract: `BYTES` MUST be a power-of-two multiple of 4.
    // The WORD_AW-bit slice `*_addr[WORD_AW+1:2]` truncates the byte
    // address to a word index; for non-power-of-two `WORDS`, an out-
    // of-range byte address can map to a slice value that exceeds
    // `WORDS-1` and indexes beyond `mem[]`. `read_valid` already
    // marks such reads invalid downstream, but the BRAM read template
    // still indexes the array unconditionally to satisfy Quartus's
    // M20K inference (Ch154 audit), so the index itself must remain
    // in bounds. The Ch155 audit-low fix: clamp the indices with
    // `& (WORDS-1)` so a power-of-two depth is required AND any bit
    // beyond the legal slice is masked away. Power-of-two also
    // matches every Agilex M20K depth target (256/512/1024/2048/...).
    //
    // WORD_AW is floored at 1: for the degenerate single-word memory
    // (BYTES = 4) $clog2(1) is 0, which would otherwise produce a
    // zero-width index vector and the reversed part-select
    // `addr[1:2]`. With the floor, the 1-bit index is forced to 0 by
    // the `& (WORDS-1)` clamp. Depths of 2 words and up are unaffected.
    localparam int unsigned WORDS     = BYTES / 4;
    localparam int unsigned WORD_AW   = (WORDS > 1) ? $clog2(WORDS) : 1;

    logic [31:0] mem [0:WORDS-1];

    // Simulation-only parameter checks and power-on-zero initialisation.
    // Everything between the translate_off/on pragmas is skipped by
    // synthesis, so these checks do NOT protect a hardware build.
    // synthesis translate_off
    initial begin
        if (BYTES < 4 || (BYTES & 32'd3) != 0)
            $error("vram_bram_stub: BYTES (%0d) must be >= 4 and a multiple of 4", BYTES);
        // Power-of-two check on WORDS: (WORDS != 0) && ((WORDS & (WORDS-1)) == 0).
        if (WORDS == 0 || (WORDS & (WORDS - 1)) != 0)
            $error("vram_bram_stub: BYTES (%0d) must yield a power-of-two WORDS depth (got %0d)",
                   BYTES, WORDS);

        // Ch252 — VRAM replication tripwire (simulation/elaboration only).
        //
        // At BYTES >= 256 KiB, each 1W+1R simple-dual-port replica costs
        // ~100 M20Ks. With ENABLE_READ2 = 1, Quartus replicates the
        // storage to give the second read its own port, doubling that
        // cost (>= 200 M20Ks per pair). Above this threshold a Quartus
        // fitter overrun on Agilex 5 (358 M20K budget) becomes likely.
        //
        // This `$fatal` runs in simulation and elaboration-aware lint
        // tools — it is the loud canary. The REAL protection is the
        // board-top profile: hardware builds explicitly set
        // ENABLE_READ2 = 0 when VRAM_BYTES is large (see
        // de25_nano_psmct32_raster_demo_top). Re-enabling read2 on a
        // large hardware VRAM requires landing one of the architectural
        // follow-ups in docs/decisions/0006-vram-roadmap.md first.
        if (ENABLE_READ2 && (BYTES >= 32'd262144)) begin
            $display("vram_bram_stub: ENABLE_READ2=1 with BYTES=%0d (>= 256 KiB) trips the replication tripwire.", BYTES);
            $display("  The 2nd read port forces Quartus to replicate the storage, ~doubling M20K cost.");
            $display("  Either set ENABLE_READ2=0 (PSMCT32-only hardware profile) or land the");
            $display("  arbitrated/line-buffered VRAM follow-up before re-enabling read2 at this size.");
            $display("  See docs/decisions/0006-vram-roadmap.md.");
            $fatal(1, "vram_bram_stub: replication-tripwire fatal exit");
        end

        // Power-on-zero contents for simulation.
        for (int i = 0; i < int'(WORDS); i++) mem[i] = 32'd0;
    end
    // synthesis translate_on

    // ----------------------------------------------------------------
    // Write port — per-byte WE, per-byte non-wrapping admission.
    //
    // The address is widened to 33 bits so `write_addr + k` cannot
    // wrap around 2^32 and alias back into range. A write is admitted
    // only when it is word-aligned and every ENABLED byte lane lies
    // below BYTES; otherwise the whole write is dropped.
    // ----------------------------------------------------------------
    logic [32:0] addr33;
    logic        admit_b0, admit_b1, admit_b2, admit_b3;
    logic        write_admit;
    assign addr33   = {1'b0, write_addr};
    assign admit_b0 = (addr33 + 33'd0) < 33'(BYTES);
    assign admit_b1 = (addr33 + 33'd1) < 33'(BYTES);
    assign admit_b2 = (addr33 + 33'd2) < 33'(BYTES);
    assign admit_b3 = (addr33 + 33'd3) < 33'(BYTES);
    assign write_admit = write_en
                       && (write_addr[1:0] == 2'b00)   // word-aligned
                       && (!write_be[0] || admit_b0)
                       && (!write_be[1] || admit_b1)
                       && (!write_be[2] || admit_b2)
                       && (!write_be[3] || admit_b3);

    // Same `& (WORDS-1)` clamp as the read indices. For any admitted
    // write with at least one enabled byte the clamp is a no-op (the
    // address is already < BYTES); it keeps the index expression in
    // bounds by construction, including the single-word (BYTES = 4)
    // configuration where the 1-bit slice must be forced to 0.
    logic [WORD_AW-1:0] write_word_idx;
    assign write_word_idx = write_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);

    // BRAM-native byte-WE template — each `if (write_be[i])` slice
    // updates a separate 8-bit lane of the 32-bit word. This is the
    // canonical Quartus inference shape (proven in Ch153 exp_a).
    // Writes are suppressed while rst_n is low; the storage itself is
    // never reset.
    always_ff @(posedge clk) begin
        if (rst_n && write_admit) begin
            if (write_be[0]) mem[write_word_idx][ 7: 0] <= write_data[ 7: 0];
            if (write_be[1]) mem[write_word_idx][15: 8] <= write_data[15: 8];
            if (write_be[2]) mem[write_word_idx][23:16] <= write_data[23:16];
            if (write_be[3]) mem[write_word_idx][31:24] <= write_data[31:24];
        end
    end

    // ----------------------------------------------------------------
    // Read ports — sync, registered output, 1-cycle latency.
    //
    // The read path is the CANONICAL Quartus M20K inference template:
    // a single unconditional `read_data <= mem[idx]` registered
    // assignment, with NO reset on the data register and NO read-side
    // gating. Quartus rejected an earlier draft that gated reads on
    // `read_addr[1:0]==2'b00 && in-bounds` with
    //   "Info (276007): RAM logic ... uninferred due to asynchronous
    //   read logic"
    // and synthesized the storage as flip-flops. Bounds + alignment
    // checks land on the separate `read_valid` pipeline below where
    // they don't poison the data path.
    //
    // In simulation, a read of a word that is written on the same
    // clock edge returns the PRE-write contents (the write is a
    // non-blocking update). NOTE(review): confirm the read-during-
    // write mode Quartus infers for the M20K matches this.
    // ----------------------------------------------------------------
    // Word-index extraction. For a power-of-two `WORDS` depth (the
    // parameter contract enforced above), the slice
    // `read_addr[WORD_AW+1:2]` is naturally bounded to `[0, WORDS-1]`
    // — the high bits beyond WORD_AW+1 represent address ranges
    // already rejected by the `read_valid` gate below. The mask
    // `& WORD_AW'(WORDS - 1)` is redundant for power-of-two WORDS >= 2
    // (it just keeps the same bits) but documents the contract: a
    // future relaxation that allows non-power-of-two depths would
    // need to replace the mask with a real range-clamp on the
    // mem-read index rather than relying on the natural truncation.
    logic [WORD_AW-1:0] read_word_idx;
    assign read_word_idx  = read_addr [WORD_AW+1:2] & WORD_AW'(WORDS - 1);

    always_ff @(posedge clk) begin
        read_data <= mem[read_word_idx];
    end

    // Out-of-range / misaligned detection on a parallel pipeline so
    // it doesn't gate the BRAM read path. read_valid is registered, so
    // it lands one cycle after the address, aligned with read_data.
    // The 33-bit add keeps `read_addr + 3` from wrapping past 2^32.
    logic read_in_range_pre;
    assign read_in_range_pre  = (read_addr [1:0] == 2'b00) &&
                                ({1'b0, read_addr } + 33'd3 < 33'(BYTES));
    always_ff @(posedge clk) begin
        if (!rst_n) read_valid <= 1'b0;
        else        read_valid <= read_in_range_pre;
    end

    // ----------------------------------------------------------------
    // Read port 1 — feature-strippable via ENABLE_READ2 (Ch251.4).
    // When ENABLE_READ2=1: full sync read + range gate, matching the
    // pre-Ch251.4 behaviour. When ENABLE_READ2=0: NO reference to
    // `mem` from this branch, so Quartus does not infer a second M20K
    // read port and the VRAM storage stops replicating.
    // ----------------------------------------------------------------
    generate
    if (ENABLE_READ2) begin : g_read2_en
        logic [WORD_AW-1:0] read2_word_idx;
        assign read2_word_idx = read2_addr[WORD_AW+1:2] & WORD_AW'(WORDS - 1);

        always_ff @(posedge clk) begin
            read2_data <= mem[read2_word_idx];
        end

        logic read2_in_range_pre;
        assign read2_in_range_pre = (read2_addr[1:0] == 2'b00) &&
                                    ({1'b0, read2_addr} + 33'd3 < 33'(BYTES));
        always_ff @(posedge clk) begin
            if (!rst_n) read2_valid <= 1'b0;
            else        read2_valid <= read2_in_range_pre;
        end
    end else begin : g_read2_dis
        // True constant ties (not clocked registers): the outputs are
        // 0 from time zero instead of X until the first clock edge,
        // and no flip-flops are requested for a stripped port.
        assign read2_data  = 32'd0;
        assign read2_valid = 1'b0;
    end
    endgenerate

endmodule : vram_bram_stub