// retroDE_ps2/rtl/gif_gs/gs_texture_unit.sv

// retroDE_ps2 — gs_texture_unit
//
// Per-pixel texture sampler (brick 1, the texturing datapath core).
//
// Takes a per-pixel texture coordinate (u,v) + the TEX0 descriptor, fetches
// the texel from VRAM through a read port, and outputs the sampled color,
// pipelined to absorb the VRAM read latency.
//
//   (u,v,valid) --[gs_texel_addr]--> byte addr --> VRAM read port
//                                                     |  (RD_LATENCY cyc)
//                              sampled color <--[decode]-- tex_rd_data
//
// v1 scope (kept deliberately minimal so it's fully verifiable now):
//   - PSMCT32 only (32-bit ABGR texels, direct — no CLUT).
//   - DECAL texture function (texel replaces fragment color).
//
// Ch296 — PSMT8 indexed texturing (this chapter):
//   - When psm==PSMT8 (0x13) the fetched 32-bit word holds FOUR packed
//     8-bit indices. The byte for this texel is selected by the texel
//     byte address' low 2 bits (gs_texel_addr emits a 1-byte/texel
//     address for PSMT8). That index drives a CLUT lookup port; the
//     returned PSMCT32 entry is the texel color (DECAL).
//   - The lookup is COMBINATIONAL (clut_stub's read port is comb), so it
//     lands in the SAME cycle as the direct PSMCT32 path — the existing
//     single S1->S2 register in gs_stub aligns it with emit unchanged.
//   - PSMCT32 (psm==0x00) behavior is byte-identical to before.
// Next versions add: PSMCT16 unpack, PSMT4 (nibble) + CLUT, swizzle, and
// MODULATE/HIGHLIGHT tex functions.
//
// The VRAM read port here is generic (byte address out, 32-bit word in,
// fixed RD_LATENCY). Integration wires it to vram_stub's spare read port;
// vram_stub's exact address convention is reconciled at integration time.

`timescale 1ns/1ps

module gs_texture_unit #(
    // Ch298 — SWIZZLED PSMT4 texture sampling. When PSMT4_SWIZZLE=1 AND the
    // texture psm==PSMT4, the texel byte address + nibble_hi are computed by
    // gs_swizzle_psmt4_stub (the SAME proven module already on the framebuffer
    // WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT4 block layout,
    // instead of the linear gs_texel_addr. LINEAR is the default (0) so every
    // existing linear PSMT4/PSMT8/PSMCT32 demo + TB is byte-identical. The
    // swizzled address feeds the SAME word-aligned read, byte-lane extract,
    // nibble select, and CLUT lookup — only the address GENERATION differs.
    // Because the swizzled address (its low 2 bits + nibble_hi) is also
    // address-derived, it flows through the SAME SEL_DELAY pipe as the linear
    // selectors, so registered-read (TEX_RD_REGISTERED=1) alignment is reused
    // verbatim. PSMT8/PSMCT32 always take the linear address (this rung is
    // PSMT4-only).
    parameter bit PSMT4_SWIZZLE = 1'b0,
    // Ch299 — SWIZZLED PSMT8 texture sampling. The sibling of PSMT4_SWIZZLE,
    // MINUS the nibble (PSMT8 is 1 byte/texel). When PSMT8_SWIZZLE=1 AND the
    // texture psm==PSMT8, the texel byte address is computed by
    // gs_swizzle_psmt8_stub (the SAME proven module already on the framebuffer
    // WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT8 block layout,
    // instead of the linear gs_texel_addr. LINEAR is the default (0) so every
    // existing linear PSMT8/PSMT4/PSMCT32 demo + TB is byte-identical. The
    // swizzled address feeds the SAME word-aligned read, byte-lane extract, and
    // CLUT lookup — only the address GENERATION differs. Because the swizzled
    // address' low 2 bits (byte-lane selector) are also address-derived, they
    // flow through the SAME SEL_DELAY pipe as the linear selectors, so
    // registered-read (TEX_RD_REGISTERED=1) alignment is reused verbatim. NO
    // nibble pipe is needed — PSMT8 has no nibble. PSMT4/PSMCT32 always take
    // their own address (this rung is PSMT8-only).
    parameter bit PSMT8_SWIZZLE = 1'b0,
    // Ch300 — SWIZZLED PSMCT32 (direct-color) texture sampling. The closure
    // rung of the swizzle layout family. When PSMCT32_SWIZZLE=1 AND the texture
    // psm==PSMCT32, the texel byte address is computed by gs_swizzle_psmct32_stub
    // (the SAME proven module already on the framebuffer WRITE / SCANOUT / UPLOAD
    // paths — Ch119/Ch122) using the real PS2 PSMCT32 page/block layout, instead
    // of the linear gs_texel_addr. Unlike PSMT4/PSMT8 this needs NO CLUT and NO
    // byte-lane select: PSMCT32 is 4 bytes/texel, so the swizzled address is
    // already word-aligned and the fetched 32-bit word IS the color directly
    // (tex_color = tex_rd_data). LINEAR is the default (0) so every existing
    // linear PSMCT32 demo + TB (textured / tritex) is byte-identical. This is
    // the SAME single-param-per-format gate as PSMCT32_SWIZZLE on the FB side,
    // so a PSMCT32 texture and a PSMCT32 framebuffer swizzle together.
    parameter bit PSMCT32_SWIZZLE = 1'b0,
    // Ch294 — GS texture WRAP MODES (REPEAT + CLAMP). When TEX_WRAP_ENABLE=1
    // the per-pixel (u,v) are resolved against the texture's power-of-two
    // dimensions (width=2^TW, height=2^TH from TEX0) using the CLAMP_1 wrap
    // mode (WMS for u/S, WMT for v/T): 0=REPEAT (u & (width-1)), 1=CLAMP
    // (u>=width -> width-1). REGION_* (2/3) are NOT modelled and pass through.
    // The wrap is applied to u/v BEFORE address generation, so it covers the
    // linear path AND every swizzle path. With TEX_WRAP_ENABLE=0 (default)
    // u_eff===u and v_eff===v as a compile-time constant, so the wrap logic is
    // pruned and every existing consumer is BYTE-IDENTICAL.
    parameter bit TEX_WRAP_ENABLE = 1'b0,
    // Ch308 — BILINEAR (4-tap) texture filtering, PSMCT32-only this rung.
    // When BILINEAR_ENABLE=1 AND psm==PSMCT32 the sampler runs a 4-beat read
    // FSM: it fetches the 4 texels surrounding the fractional coord
    //   (u,v) (u+1,v) (u,v+1) (u+1,v+1)
    // — each independently wrapped/clamped through the SAME u_eff/v_eff
    // machinery (so edge taps repeat/clamp instead of reading outside the
    // texture) — then blends them per channel (R,G,B,A) by the 4-bit
    // fractional u_frac/v_frac (0..15, /16) using a >>4 fixed-point lerp.
    // For !BILINEAR_ENABLE (default) OR psm!=PSMCT32 the EXACT current
    // single-read NEAREST path is used and u_frac/v_frac are ignored, so the
    // synthesized logic and every existing consumer is BYTE-IDENTICAL (the
    // bilinear FSM, the per-beat coord select, and the blend datapath are all
    // pruned as compile-time-dead when BILINEAR_ENABLE=0). Bilinear is
    // PSMCT32-only by default; with PALETTE_BILINEAR=1 (Ch314) it also covers
    // PSMT8/PSMT4 via per-tap CLUT-before-interp. At PALETTE_BILINEAR=0 the
    // indexed textures still take the nearest path even with BILINEAR_ENABLE=1.
    //
    // ALPHA: the alpha channel is INTERPOLATED with the same 4-tap lerp as
    // R/G/B (not pass-through-nearest). For an opaque texture (all taps a=255)
    // this returns 255 exactly; for a texel-center sample (u_frac=v_frac=0) it
    // returns the (u,v) tap's alpha exactly.
    parameter bit BILINEAR_ENABLE = 1'b0,
    // Ch314 — BILINEAR for PALETTIZED (indexed) textures. When
    // PALETTE_BILINEAR=1 (and BILINEAR_ENABLE=1) the 4-tap path also runs for
    // PSMT8 (0x13) and PSMT4 (0x14). The CRITICAL rule is CLUT-BEFORE-INTERP:
    // each of the 4 taps fetches an INDEX, that index is CLUT'd to an RGBA
    // color (the existing combinational clut_rd_idx/clut_rd_data port), and the
    // 4 COLORS are then interpolated — NOT the indices. This falls out of
    // capturing `near_color` per tap (clut_rd_data for indexed, tex_rd_data for
    // PSMCT32) instead of the raw word. Swizzled addressing + wrap/clamp run in
    // the SAME per-tap addr-gen that already feeds the nearest path, so they
    // happen BEFORE the index/CLUT lookup. Default 0 → indexed textures stay
    // nearest even with BILINEAR_ENABLE=1, so every existing build is
    // byte-identical (the combined path only ever fed PSMCT32 textures anyway).
    parameter bit PALETTE_BILINEAR = 1'b0,
    parameter int RD_LATENCY = 1,      // VRAM read latency in clk cycles
    // Ch296 — PSMT8 byte-lane realignment. The byte selected from the
    // fetched word must use the LOW 2 bits of the address that was ISSUED
    // for the returned data. When the texel ADDRESS advances every cycle
    // while a read is in flight (gs_stub TEX_RD_REGISTERED=1: address
    // taken from the S0 walker, registered read returns 1 cycle later),
    // the current `addr` no longer matches the in-flight word, so the
    // selector must be delayed by SEL_DELAY cycles to re-pair them. When
    // the address is HELD stable across the read (combinational read port,
    // address from the stable S1 latch), SEL_DELAY=0 and the current addr
    // is correct. Driven from gs_stub as TEX_RD_REGISTERED?TEX_RD_LATENCY:0.
    parameter int SEL_DELAY  = 0
) (
    input  logic        clk,
    input  logic        rst_n,

    // per-pixel texture coordinate in
    input  logic        in_valid,
    input  logic [10:0] u,
    input  logic [10:0] v,

    // Ch308 — fractional texture coords for BILINEAR (4-bit, 0..15 => /16).
    // Unused at default (BILINEAR_ENABLE=0) and whenever the nearest path is
    // taken (filter_lin=0, or a psm the bilinear path does not cover).
    input  logic [3:0]  u_frac,
    input  logic [3:0]  v_frac,

    // Ch310 — RUNTIME filter select (per-primitive TEX1_1.MMAG). When
    // BILINEAR_ENABLE=1 the 4-tap path runs ONLY when filter_lin is set AND the
    // psm is bilinear-capable (PSMCT32 always; PSMT8/PSMT4 only when
    // PALETTE_BILINEAR=1). With filter_lin=0 (TEX1.MMAG=0 NEAREST) the sampler
    // falls back to the exact nearest single-read path (busy stays 0). Unused at
    // BILINEAR_ENABLE=0 (g_nearest), so the default build is byte-identical.
    input  logic        filter_lin,

    // Ch294 — wrap-mode controls (CLAMP_1 WMS/WMT + TEX0 TW/TH). Unused at
    // default (TEX_WRAP_ENABLE=0) since u_eff/v_eff collapse to u/v.
    input  logic [1:0]  wms,
    input  logic [1:0]  wmt,
    input  logic [3:0]  tw,
    input  logic [3:0]  th,

    // TEX0 descriptor
    input  logic [31:0] tbp0_base_bytes, // texture base in VRAM (bytes)
    input  logic [13:0] tbw,             // TEX0.TBW (texels/row / 64)
    input  logic [5:0]  psm,             // pixel storage mode

    // VRAM texel read port
    output logic        tex_rd_en,
    output logic [31:0] tex_rd_addr,     // byte address
    input  logic [31:0] tex_rd_data,     // 32-bit word, valid RD_LATENCY later

    // Ch296 — CLUT lookup port (PSMT8 indexed texturing). The extracted
    // 8-bit index drives `clut_rd_idx`; the parent wires this to
    // clut_stub's second (combinational) read port and returns the
    // PSMCT32 entry on `clut_rd_data`. Unused for PSMCT32 textures.
    output logic [7:0]  clut_rd_idx,
    input  logic [31:0] clut_rd_data,    // PSMCT32 entry for clut_rd_idx

    // sampled color out (aligned with out_valid)
    output logic        out_valid,
    output logic [31:0] tex_color,       // ABGR8888

    // Ch308 — BILINEAR busy: high while the 4-beat read sequence is in flight
    // (the caller must not issue a new in_valid until it drops / out_valid
    // pulses). Always 0 on the nearest path (BILINEAR_ENABLE=0, filter_lin=0, or
    // a psm the bilinear path does not cover), so a caller that ignores it sees
    // byte-identical behavior.
    output logic        busy
);

    // Pixel-storage-mode (psm) encodings this sampler tests for. They select
    // the swizzle arm, the nibble path, and the CLUT-vs-direct decode below.
    localparam logic [5:0] PSM_PSMCT32 = 6'h00;   // 32-bit direct colour, fetched word is the texel
    localparam logic [5:0] PSM_PSMT8   = 6'h13;   // 8-bit CLUT index, one byte per texel
    localparam logic [5:0] PSM_PSMT4   = 6'h14;   // 4-bit CLUT index, two texels per byte

    // --- Ch294: wrap-mode resolution (u/v -> u_eff/v_eff) ---
    // Applied BEFORE any address generation so it covers the linear path AND
    // every swizzle path. width=2^TW, height=2^TH (both powers of two), so
    // REPEAT is a mask and CLAMP is a >width-1 saturate. u/v are unsigned so
    // there is no negative/underflow case to handle. REGION_* (2/3) pass
    // through unchanged (not modelled this rung). At TEX_WRAP_ENABLE=0 this is
    // a constant pass-through (u_eff===u, v_eff===v) -> byte-identical.
    // Ch308 — the coord that FEEDS the wrap. On the nearest path (bilinear off
    // or non-PSMCT32) this is the port u/v UNCHANGED, so the wrap output
    // (u_eff/v_eff) and everything downstream is byte-identical. On the
    // bilinear path it is the current beat's neighbor coord (u+du[k],v+dv[k]),
    // so each of the 4 taps is independently wrapped/clamped. `bili_active` is
    // a compile-time constant 0 when BILINEAR_ENABLE=0, so u_in===u / v_in===v
    // collapses away at the default build.
    logic        bili_active;            // assigned inside the bilinear generate further down
    logic [10:0] beat_u, beat_v;         // per-beat neighbor coord, assigned inside the bilinear generate
    logic [10:0] u_in, v_in;             // coordinate handed to the wrap stage
    always_comb begin
        // Start from the port coordinate (the nearest path) ...
        u_in = u;
        v_in = v;
        // ... and substitute the current beat's neighbor coordinate only when
        // bilinear is compiled in AND selected for this sample.
        if (BILINEAR_ENABLE && bili_active) begin
            u_in = beat_u;
            v_in = beat_v;
        end
    end

    logic [10:0] u_eff, v_eff;         // wrapped coordinate fed to every address generator
    logic [10:0] u_wmask, v_wmask;     // width-1 / height-1 (REPEAT mask)
    logic [10:0] u_wlimit, v_wlimit;   // width-1 / height-1 (CLAMP ceiling)
    always_comb begin
        // Texture extents are 2^tw x 2^th, so "size - 1" serves both as the
        // REPEAT mask and as the CLAMP ceiling.
        u_wmask  = (11'd1 << tw) - 11'd1;
        v_wmask  = (11'd1 << th) - 11'd1;
        u_wlimit = u_wmask;
        v_wlimit = v_wmask;

        // Pass-through unless wrapping is compiled in; the arms below override.
        u_eff = u_in;
        v_eff = v_in;

        if (TEX_WRAP_ENABLE) begin
            // S axis (wms)
            unique case (wms)
                2'd0:    u_eff = u_in & u_wmask;                       // REPEAT
                2'd1:    u_eff = (u_in > u_wlimit) ? u_wlimit : u_in;  // CLAMP
                default: u_eff = u_in;                                 // modes 2/3: pass-through
            endcase
            // T axis (wmt)
            unique case (wmt)
                2'd0:    v_eff = v_in & v_wmask;                       // REPEAT
                2'd1:    v_eff = (v_in > v_wlimit) ? v_wlimit : v_in;  // CLAMP
                default: v_eff = v_in;                                 // modes 2/3: pass-through
            endcase
        end
    end

    // --- linear address (combinational) ---
    logic [31:0] lin_addr;        // linear texel byte address from gs_texel_addr
    logic        lin_nibble_hi;   // PSMT4: this texel is the HIGH nibble of its byte
    // Linear address generator. It is fed the wrapped coordinate (u_eff, v_eff)
    // together with the TEX0 base, buffer width and storage mode.
    gs_texel_addr #(.ADDR_W(32)) u_addr (
        // texture descriptor
        .psm            (psm),
        .tbw            (tbw),
        .base_byte_addr (tbp0_base_bytes),
        // wrapped texel coordinate
        .u              (u_eff),
        .v              (v_eff),
        // results
        .nibble_hi      (lin_nibble_hi),
        .texel_byte_addr(lin_addr)
    );

    // --- swizzled PSMT4 address (combinational) ---
    // EXACTLY mirrors the texture-UPLOAD path (gif_image_xfer_stub Ch139):
    // the swizzle module is fed FBP=0 so it emits only the WITHIN-TEXTURE
    // byte OFFSET, and the texture base (tbp0_base_bytes) is ADDED on top.
    // This makes the sampled address bit-identical to the uploaded one for
    // ANY 256-byte-aligned base (using the swizzle module's `fbp` input here
    // would discard the low 11 bits of a non-2048-aligned base). FBW=TBW (in
    // 64-texel units); PSMT4 swizzle needs FBW even (bw_pg = FBW>>1). The
    // texture's (u,v) ARE the swizzle (x,y). Output is byte-offset + nibble_hi
    // — the SAME shape gs_texel_addr emits for linear PSMT4, so downstream
    // (word-align, byte-lane, nibble select, CLUT) is untouched.
    logic [31:0] swz_off;         // within-texture byte offset from the swizzle stub
    logic [31:0] swz_addr;        // texture base + swz_off
    logic        swz_nibble_hi;   // nibble select reported by the swizzle stub
    generate
        if (!PSMT4_SWIZZLE) begin : g_no_swizzle4
            // Swizzle compiled out: tie the nets to constants.
            assign swz_nibble_hi = 1'b0;
            assign swz_addr      = 32'd0;
            assign swz_off       = 32'd0;
        end else begin : g_swizzle4
            // fbp is tied to 0 so the stub returns only an offset; the texture
            // base is added afterwards. (u_eff, v_eff) are zero-extended by one
            // bit into the stub's x/y inputs.
            gs_swizzle_psmt4_stub u_swizzle4 (
                .x         ({1'b0, u_eff}),
                .y         ({1'b0, v_eff}),
                .fbw       (tbw[5:0]),
                .fbp       (9'd0),
                .nibble_hi (swz_nibble_hi),
                .addr      (swz_off)
            );
            assign swz_addr = swz_off + tbp0_base_bytes;
        end
    endgenerate

    // --- swizzled PSMT8 address (combinational) ---
    // Ch299 — EXACTLY mirrors the PSMT4-swizzle sampler arm above (and the
    // PSMT8 UPLOAD path in gif_image_xfer_stub Ch133), MINUS the nibble.
    // gs_swizzle_psmt8_stub is fed FBP=0 so it emits only the WITHIN-TEXTURE
    // byte OFFSET; the texture base (tbp0_base_bytes) is ADDED on top. This
    // makes the sampled address bit-identical to the uploaded one for ANY
    // 256-byte-aligned base. FBW=TBW (in 64-texel units); the PSMT8 swizzle
    // needs FBW even (bw_pg = FBW>>1). The texture's (u,v) ARE the swizzle
    // (x,y). Output is a byte address — the SAME shape gs_texel_addr emits for
    // linear PSMT8 — so downstream (word-align, byte-lane, CLUT) is untouched.
    // No nibble_hi: PSMT8 is one full byte per texel.
    logic [31:0] swz8_off;        // within-texture byte offset from the swizzle stub
    logic [31:0] swz8_addr;       // texture base + swz8_off
    generate
        if (!PSMT8_SWIZZLE) begin : g_no_swizzle8
            // Swizzle compiled out: tie the nets to constants.
            assign swz8_addr = 32'd0;
            assign swz8_off  = 32'd0;
        end else begin : g_swizzle8
            // fbp is tied to 0 so the stub returns only an offset; the texture
            // base is added afterwards. PSMT8 carries no nibble output.
            gs_swizzle_psmt8_stub u_swizzle8 (
                .x    ({1'b0, u_eff}),
                .y    ({1'b0, v_eff}),
                .fbw  (tbw[5:0]),
                .fbp  (9'd0),
                .addr (swz8_off)
            );
            assign swz8_addr = swz8_off + tbp0_base_bytes;
        end
    endgenerate

    // --- swizzled PSMCT32 address (combinational) ---
    // Ch300 — direct-color sibling of the PSMT4/PSMT8 swizzle arms above, using
    // the SAME proven gs_swizzle_psmct32_stub already on the FB WRITE / SCANOUT
    // / UPLOAD paths. Fed FBP=0 so it emits only the WITHIN-TEXTURE byte OFFSET;
    // the texture base (tbp0_base_bytes) is ADDED on top, making the sampled
    // address bit-identical to the uploaded one for ANY 2048-byte-aligned base.
    // FBW=TBW (in 64-pixel units — PSMCT32 page is 64 px wide, so TBW units
    // match the stub's fbw directly, NO >>1). The texture's (u,v) ARE the
    // swizzle (x,y). Output is a 4-byte-aligned byte address — gs_texel_addr's
    // PSMCT32 shape — so downstream is untouched. NO nibble, NO byte-lane, NO
    // CLUT: the fetched word is the color (tex_color = tex_rd_data).
    logic [31:0] swz32_off;       // within-texture byte offset from the swizzle stub
    logic [31:0] swz32_addr;      // texture base + swz32_off
    generate
        if (!PSMCT32_SWIZZLE) begin : g_no_swizzle32
            // Swizzle compiled out: tie the nets to constants.
            assign swz32_addr = 32'd0;
            assign swz32_off  = 32'd0;
        end else begin : g_swizzle32
            // fbp is tied to 0 so the stub returns only an offset; the texture
            // base is added afterwards.
            gs_swizzle_psmct32_stub u_swizzle32 (
                .x    ({1'b0, u_eff}),
                .y    ({1'b0, v_eff}),
                .fbw  (tbw[5:0]),
                .fbp  (9'd0),
                .addr (swz32_off)
            );
            assign swz32_addr = swz32_off + tbp0_base_bytes;
        end
    endgenerate

    // --- linear-vs-swizzled select ---
    // There are three swizzle arms (PSMT4, PSMT8, PSMCT32). Each is taken only
    // when its own *_SWIZZLE parameter is set AND psm names that format. The
    // three psm codes are distinct, so at most one arm can be selected. Any
    // other combination uses the linear address. With all three parameters at
    // 0 every select is constant-false and the linear address is always used.
    logic        use_swizzle4;
    logic        use_swizzle8;
    logic        use_swizzle32;
    logic [31:0] addr;            // texel byte address chosen for this sample
    logic        nibble_hi;       // PSMT4 nibble select paired with addr
    assign use_swizzle4  = PSMT4_SWIZZLE   && (psm == PSM_PSMT4);
    assign use_swizzle8  = PSMT8_SWIZZLE   && (psm == PSM_PSMT8);
    assign use_swizzle32 = PSMCT32_SWIZZLE && (psm == PSM_PSMCT32);
    // Address: swizzled arm when selected, otherwise linear.
    assign addr = use_swizzle4 ? swz_addr
                : (use_swizzle8 ? swz8_addr
                : (use_swizzle32 ? swz32_addr
                : lin_addr));
    // Only the PSMT4 swizzle arm produces its own nibble flag; every other
    // case forwards the linear generator's flag.
    assign nibble_hi = use_swizzle4 ? swz_nibble_hi : lin_nibble_hi;

    // Nearest-path read enable / address. These are muxed at the module
    // outputs (tex_rd_en/tex_rd_addr) below: on the nearest path they ARE the
    // outputs (byte-identical); on the bilinear path the FSM drives the
    // outputs instead. The word-align mask is a no-op for PSMCT32.
    logic        near_rd_en;      // nearest-path read strobe
    logic [31:0] near_rd_addr;    // nearest-path read address (word-aligned)
    // One read is requested for every accepted input coordinate.
    assign near_rd_en   = in_valid;
    // Present a word-aligned byte address by forcing the two low address bits
    // to zero. The dropped bits are recovered separately (sel_lo) to pick the
    // byte lane for sub-word formats. A 4-byte-per-texel address is already
    // aligned, so this is a no-op for PSMCT32.
    assign near_rd_addr = {addr[31:2], 2'b00};

    // --- PSMT8 index extract ---
    // gs_texel_addr returns a 1-byte/texel address for PSMT8, so the
    // fetched 32-bit word (read at addr & ~3 by the word-addressed VRAM
    // port) packs 4 indices; the issued address' low 2 bits select which
    // byte is THIS texel.
    //
    // The byte selector uses the addr[1:0] from the issue cycle of the
    // returned word. SEL_DELAY (see the param comment) is 0 when the
    // address is held stable across the read (current addr is correct) and
    // >0 when the address advances while the read is in flight (delay the
    // selector to re-pair it with the in-flight word). `sel_lo` carries it.
    //
    // PSMT4 (Ch297) adds a NIBBLE selector on top of the byte selector.
    // gs_texel_addr emits a byte address (texel_offset>>1) plus `nibble_hi`
    // (= texel_offset[0]: even texel -> LOW nibble, odd -> HIGH nibble). The
    // selected byte (via sel_lo, exactly as PSMT8) holds TWO 4-bit indices;
    // nibble_hi picks which. Because nibble_hi is derived from the texel
    // ADDRESS — which advances every cycle while a read is in flight under
    // TEX_RD_REGISTERED=1 — it must be SEL_DELAY-aligned by the SAME pipe
    // depth as sel_lo so it re-pairs with the returned word. (Same class as
    // the PSMT8 byte-lane realignment; get it wrong and odd/even texels smear.)
    logic [1:0] sel_lo;           // byte-lane select, aligned to the returned word
    logic       nib_sel;          // nibble_hi after the same SEL_DELAY alignment
    generate
        if (SEL_DELAY != 0) begin : g_sel_reg
            // SEL_DELAY-deep shift registers. Stage 0 samples the live
            // selectors and the last stage is what the extract logic uses.
            logic [1:0] sel_pipe [0:SEL_DELAY-1];
            logic       nib_pipe [0:SEL_DELAY-1];
            always_ff @(posedge clk or negedge rst_n) begin
                if (!rst_n) begin
                    for (int i = 0; i < SEL_DELAY; i++) begin
                        sel_pipe[i] <= 2'd0;
                        nib_pipe[i] <= 1'b0;
                    end
                end else begin
                    for (int i = 0; i < SEL_DELAY; i++) begin
                        if (i == 0) begin
                            // head of the pipe: capture this cycle's selectors
                            sel_pipe[i] <= addr[1:0];
                            nib_pipe[i] <= nibble_hi;
                        end else begin
                            // remaining stages: shift by one
                            sel_pipe[i] <= sel_pipe[i-1];
                            nib_pipe[i] <= nib_pipe[i-1];
                        end
                    end
                end
            end
            assign sel_lo  = sel_pipe[SEL_DELAY-1];
            assign nib_sel = nib_pipe[SEL_DELAY-1];
        end else begin : g_sel_comb
            // No delay: the current address already pairs with the read data.
            assign sel_lo  = addr[1:0];
            assign nib_sel = nibble_hi;
        end
    endgenerate

    // Byte select (shared by PSMT8 and PSMT4): pick the texel's byte lane.
    logic [7:0] sel_byte;         // byte lane of tex_rd_data chosen by sel_lo
    always_comb begin
        if (sel_lo == 2'b00)
            sel_byte = tex_rd_data[ 7: 0];
        else if (sel_lo == 2'b01)
            sel_byte = tex_rd_data[15: 8];
        else if (sel_lo == 2'b10)
            sel_byte = tex_rd_data[23:16];
        else
            sel_byte = tex_rd_data[31:24];   // 2'b11, and any unknown select value
    end

    // Nibble select for PSMT4 (4-bit index, zero-extended to 8 bits so the
    // SAME clut_rd_idx port + clut_stub feed it; CLUT entries 0..15 used).
    // iverilog-12: no bit-select on a parenthesized expr, so split into a
    // named net first, then index it.
    // Plain named copy of sel_byte, so the nibble slices below index a simple
    // net rather than an expression.
    logic [7:0] sel_byte_for_nib;
    logic [3:0] psmt4_nibble;
    assign sel_byte_for_nib = sel_byte;
    // nib_sel = 0 -> low nibble, nib_sel = 1 -> high nibble.
    assign psmt4_nibble     = !nib_sel ? sel_byte_for_nib[3:0] : sel_byte_for_nib[7:4];

    // CLUT index: the whole selected byte, except for PSMT4, where it is the
    // selected nibble zero-extended to 8 bits.
    assign clut_rd_idx = (psm != PSM_PSMT4) ? sel_byte : {4'd0, psmt4_nibble};

    // --- valid pipeline matching the read latency ---
    // in_valid is presented together with the address; tex_rd_data for that
    // address arrives RD_LATENCY cycles later, so valid is delayed by the same
    // number of cycles.
    //
    // RD_LATENCY = 0 is handled explicitly. A `[RD_LATENCY-1:0]` declaration
    // would then be the two-bit vector [-1:0], whose bit -1 is never written,
    // which leaves near_out_valid stuck low. The register is therefore sized to
    // at least one bit, and a zero-latency build forwards in_valid directly.
    // For RD_LATENCY >= 1 the behaviour is unchanged.
    localparam int VALID_PIPE_W = (RD_LATENCY > 0) ? RD_LATENCY : 1;
    logic [VALID_PIPE_W-1:0] valid_pipe;
    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            valid_pipe <= '0;
        end else begin
            // Stage 0 samples in_valid; later stages shift. The loop body does
            // not execute when the pipe is a single stage.
            valid_pipe[0] <= in_valid;
            for (int i = 1; i < VALID_PIPE_W; i++)
                valid_pipe[i] <= valid_pipe[i-1];
        end
    end

    logic        near_out_valid;
    assign near_out_valid = (RD_LATENCY > 0) ? valid_pipe[VALID_PIPE_W-1] : in_valid;

    // --- decode (DECAL) ---
    //   indexed formats (PSMT8, PSMT4): colour comes back from the CLUT port
    //   every other psm               : the fetched word is used as the colour
    logic [31:0] near_color;
    assign near_color = (psm != PSM_PSMT8 && psm != PSM_PSMT4)
                      ? tex_rd_data : clut_rd_data;

    // ========================================================================
    // Ch308 — BILINEAR (4-tap) PSMCT32 FILTER
    // ========================================================================
    // When BILINEAR_ENABLE=1 and psm==PSMCT32 we sample the 4 texels around the
    // fractional coord and blend them. The whole block is wrapped in a generate
    // that is empty when BILINEAR_ENABLE=0, so the default build is pruned to
    // exactly the nearest path and is BYTE-IDENTICAL.
    //
    // CYCLE SCHEDULE (RD_LATENCY-aware; example RD_LATENCY=L):
    //   T0           : caller asserts in_valid (with u,v,u_frac,v_frac). FSM in
    //                  IDLE latches u/v/frac, sets beat index k=0, drives
    //                  bili_active=1, busy=1, moves to ISSUE.
    //   T0+ (ISSUE)  : present neighbor[k] coord (beat_u/beat_v -> wrap ->
    //                  gs_texel_addr -> tex_rd_addr) and pulse tex_rd_en for 1
    //                  cycle; start an L-cycle wait; -> WAIT.
    //   ISSUE+1..+L  : WAIT counts L cycles; on the L-th cycle tex_rd_data holds
    //                  beat[k]'s 32-bit ABGR word -> capture into tap[k].
    //                  If k<3: k++ and -> ISSUE (next neighbor). If k==3: -> DONE.
    //   DONE         : combinationally lerp the 4 captured taps by u_frac/v_frac
    //                  per channel; assert out_valid for 1 cycle with tex_color;
    //                  drop busy; -> IDLE.
    //   => total ~ 4*(1+L)+1 cycles per filtered sample. Throughput is NOT a
    //      goal here (a later texture-cache pass collapses the 4 reads).
    //
    // Neighbor table (k -> du,dv): 0->(0,0) 1->(1,0) 2->(0,1) 3->(1,1).
    // Each neighbor coord is fed through the SAME u_eff/v_eff wrap (via
    // u_in/v_in above) so edge taps repeat/clamp and never read outside the
    // texture (proven in the TB clamp/repeat cases).
    //
    // lerp(a,b,f) = a + (($signed({1'b0,b}) - $signed({1'b0,a})) * $signed({1'b0,f})) >>> 4
    //   with f the 4-bit frac (0..15 => /16). a,b are 8-bit channels. The
    //   bracketed product is computed in a SIGNED temp (no bit-select on a
    //   parenthesized expr — iverilog-12 rule), then arithmetic-shifted >>>4,
    //   then defensively clamped to 0..255.
    generate
    if (BILINEAR_ENABLE) begin : g_bilinear
        localparam logic [1:0] BS_IDLE  = 2'd0;
        localparam logic [1:0] BS_ISSUE = 2'd1;
        localparam logic [1:0] BS_WAIT  = 2'd2;
        localparam logic [1:0] BS_DONE  = 2'd3;

        logic [1:0]  state;
        logic [1:0]  beat;               // which neighbor 0..3
        logic [31:0] wait_cnt;           // counts RD_LATENCY
        logic [31:0] tap [0:3];          // captured ABGR per neighbor
        logic [10:0] lat_u, lat_v;       // latched coord for this sample
        logic [3:0]  lat_uf, lat_vf;     // latched fracs

        // is this a PSMCT32 sample? bilinear runs for PSMCT32 always, and (Ch314)
        // for PSMT8/PSMT4 when PALETTE_BILINEAR=1; any other psm falls back to the
        // nearest path even with BILINEAR_ENABLE=1.
        logic        is_ct32;
        logic        is_indexed;
        logic        bili_psm_ok;
        assign is_ct32     = (psm == PSM_PSMCT32);
        assign is_indexed  = (psm == PSM_PSMT8) || (psm == PSM_PSMT4);
        assign bili_psm_ok = is_ct32 || (PALETTE_BILINEAR && is_indexed);

        // Ch310 — RUNTIME filter gate. The 4-tap path runs ONLY for a
        // bilinear-capable texture (bili_psm_ok: PSMCT32, or PSMT8/PSMT4 when
        // PALETTE_BILINEAR=1) whose primitive selected LINEAR magnification
        // (filter_lin=1, i.e. TEX1.MMAG=1). With filter_lin=0 (NEAREST) we fall
        // back to the single-read nearest path even with BILINEAR_ENABLE=1, so an
        // MMAG=0 primitive stays nearest. `do_lin` is the single predicate that
        // selects the bilinear datapath everywhere below.
        //
        // NOTE on the `!== 1'b0` test: it makes filter_lin DEFAULT-ON when the
        // port is left UNCONNECTED (sim Z). The standalone tb_gs_texture_bilinear
        // exercises the 4-tap path directly without driving filter_lin, so an
        // unconnected input must keep bilinear running (Z !== 0 → true). A
        // driven 0 (gs_stub MMAG=0) gives nearest; a driven 1 gives bilinear.
        // In synthesis filter_lin is always driven by gs_stub, so this reduces
        // to a plain `bili_psm_ok && filter_lin`.
        logic        do_lin;
        assign do_lin = bili_psm_ok && (filter_lin !== 1'b0);

        // bili_active (read by the wrap mux above): a direct copy of do_lin. It
        // is therefore high whenever the current psm / filter_lin select the
        // bilinear datapath — in every FSM state, IDLE included — and the wrap
        // then consumes the per-beat neighbor coord (beat_u/beat_v). When
        // do_lin=0 it is low so the wrap uses the port u/v (nearest),
        // byte-identical to the non-bilinear coord path.
        assign bili_active = do_lin;

        // neighbor delta for the current beat
        logic [10:0] du, dv;
        always_comb begin
            unique case (beat)
                2'd0: begin du = 11'd0; dv = 11'd0; end
                2'd1: begin du = 11'd1; dv = 11'd0; end
                2'd2: begin du = 11'd0; dv = 11'd1; end
                default: begin du = 11'd1; dv = 11'd1; end
            endcase
        end
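        // beat coord feeds the wrap (u_in/v_in). In IDLE (before latching) it is
        // built from the live ports; once latched, from the latched coord.
        // NOTE: `beat` is cleared only at reset and when a sample is accepted,
        // so after a completed sample it still holds 3 while in IDLE and the
        // IDLE coord is then (u+1, v+1), not neighbor 0. bi_rd_en is asserted
        // only in ISSUE, where the latched coord and a freshly cleared beat are
        // used.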
        always_comb begin
            if (state == BS_IDLE) begin
                beat_u = u   + du;       // beat==0 here -> u+0
                beat_v = v   + dv;
            end else begin
                beat_u = lat_u + du;
                beat_v = lat_v + dv;
            end
        end

        // The bilinear read address reuses the SAME addr-gen (gs_texel_addr via
        // the u_eff/v_eff wrap fed by beat_u/beat_v). near_rd_addr already is
        // (addr & ~3) for the currently-selected coord; for PSMCT32 the linear
        // path is used and it is word-aligned. We pulse rd_en only on ISSUE.
        logic        bi_rd_en;
        assign bi_rd_en = (state == BS_ISSUE);

        // Bilinear fetch sequencer: one filtered sample = four sequential reads.
        //   IDLE  : wait for in_valid && do_lin; latch coord + fractions, beat=0
        //   ISSUE : one cycle; the address for `beat` is on the read port
        //   WAIT  : count up to RD_LATENCY, capture the tap, next beat or DONE
        //   DONE  : one cycle, then back to IDLE
        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                state    <= BS_IDLE;
                beat     <= 2'd0;
                wait_cnt <= 32'd0;
                lat_u    <= 11'd0;
                lat_v    <= 11'd0;
                lat_uf   <= 4'd0;
                lat_vf   <= 4'd0;
                for (int k = 0; k < 4; k++) tap[k] <= 32'd0;
            end else begin
                unique case (state)
                    BS_IDLE: begin
                        // Accept a sample only when filtering applies; latch
                        // everything the four beats and the blend will need.
                        if (in_valid && do_lin) begin
                            state  <= BS_ISSUE;
                            beat   <= 2'd0;
                            lat_u  <= u;
                            lat_v  <= v;
                            lat_uf <= u_frac;
                            lat_vf <= v_frac;
                        end
                    end
                    BS_ISSUE: begin
                        // The address is driven combinationally this cycle
                        // (beat -> beat_u/beat_v -> wrap -> addr); start
                        // counting the read latency from 1.
                        state    <= BS_WAIT;
                        wait_cnt <= 32'd1;
                    end
                    BS_WAIT: begin
                        if (wait_cnt < RD_LATENCY[31:0]) begin
                            // read data not back yet
                            wait_cnt <= wait_cnt + 32'd1;
                        end else begin
                            // tex_rd_data now carries this beat's word. Store the
                            // resolved COLOR (near_color): the raw word for
                            // PSMCT32, or clut_rd_data for PSMT8/PSMT4 (Ch314),
                            // i.e. the index picked from the word (sel_byte /
                            // psmt4_nibble, stable while the beat is held) after
                            // the CLUT lookup. Storing colors rather than
                            // indices is what lets the lerp blend colors.
                            tap[beat] <= near_color;
                            if (beat != 2'd3) begin
                                state <= BS_ISSUE;
                                beat  <= beat + 2'd1;
                            end else begin
                                state <= BS_DONE;
                            end
                        end
                    end
                    default: begin // BS_DONE
                        state <= BS_IDLE;
                    end
                endcase
            end
        end

        // --- 4-tap blend (combinational, on the captured taps) ---
        // PSMCT32 word layout: [31:24]=A [23:16]=B [15:8]=G [7:0]=R (ABGR8888).
        // tap0=(u,v) tap1=(u+1,v) tap2=(u,v+1) tap3=(u+1,v+1).
        //
        // lerp8: 8-bit linear interpolation with a 4-bit fraction,
        //   result = a + floor((b - a) * f / 16)
        // f=0 returns a exactly; f=15 stops 1/16 of the span short of b. The
        // division is an arithmetic right shift, so negative spans round
        // toward minus infinity.
        function automatic logic [7:0] lerp8(input logic [7:0] a,
                                             input logic [7:0] b,
                                             input logic [3:0] f);
            logic signed [16:0] span;     // b - a, in -255..255
            logic signed [21:0] scaled;   // span * f
            logic signed [21:0] step;     // scaled / 16, floored
            logic signed [21:0] sum;      // a + step
            begin
                span   = $signed({1'b0, b}) - $signed({1'b0, a});
                scaled = span * $signed({1'b0, f});
                step   = scaled >>> 4;
                sum    = $signed({14'd0, a}) + step;
                // Saturate to 0..255. In-range inputs never leave that range,
                // so this is purely defensive.
                if (sum > 22'sd255) lerp8 = 8'd255;
                else if (sum < 0)   lerp8 = 8'd0;
                else                lerp8 = sum[7:0];
            end
        endfunction

        // Split each captured 32-bit tap into its four 8-bit channels.
        // Word order is {A, B, G, R} from MSB to LSB.
        logic [7:0] t0_r, t0_g, t0_b, t0_a;
        logic [7:0] t1_r, t1_g, t1_b, t1_a;
        logic [7:0] t2_r, t2_g, t2_b, t2_a;
        logic [7:0] t3_r, t3_g, t3_b, t3_a;
        assign {t0_a, t0_b, t0_g, t0_r} = tap[0];
        assign {t1_a, t1_b, t1_g, t1_r} = tap[1];
        assign {t2_a, t2_b, t2_g, t2_r} = tap[2];
        assign {t3_a, t3_b, t3_g, t3_r} = tap[3];

        // Two-stage bilinear blend, evaluated independently per channel:
        //   top = lerp(tap0, tap1, uf)   row v,   interpolated along u
        //   bot = lerp(tap2, tap3, uf)   row v+1, interpolated along u
        //   cv  = lerp(top,  bot,  vf)   the two rows interpolated along v
        // The fractions are the copies latched when the sample was accepted.
        logic [7:0] top_r, bot_r, cv_r;
        logic [7:0] top_g, bot_g, cv_g;
        logic [7:0] top_b, bot_b, cv_b;
        logic [7:0] top_a, bot_a, cv_a;

        // red
        assign top_r = lerp8(t0_r, t1_r, lat_uf);
        assign bot_r = lerp8(t2_r, t3_r, lat_uf);
        assign cv_r  = lerp8(top_r, bot_r, lat_vf);
        // green
        assign top_g = lerp8(t0_g, t1_g, lat_uf);
        assign bot_g = lerp8(t2_g, t3_g, lat_uf);
        assign cv_g  = lerp8(top_g, bot_g, lat_vf);
        // blue
        assign top_b = lerp8(t0_b, t1_b, lat_uf);
        assign bot_b = lerp8(t2_b, t3_b, lat_uf);
        assign cv_b  = lerp8(top_b, bot_b, lat_vf);
        // alpha
        assign top_a = lerp8(t0_a, t1_a, lat_uf);
        assign bot_a = lerp8(t2_a, t3_a, lat_uf);
        assign cv_a  = lerp8(top_a, bot_a, lat_vf);

        // Ch310 — hold register for the filtered color.
        // The combined-renderer FSM (gs_stub CB_TWAIT) steps at half rate on
        // z_advance beats and may sample the result a cycle or two AFTER the
        // out_valid pulse, so the blended ABGR has to stay stable from
        // out_valid until the next sample completes. Two views are combined:
        //   - during DONE: the live combinational blend, so a caller keyed on
        //     out_valid (tb_gs_texture_bilinear) reads the fresh value in the
        //     same cycle, byte-identical to the pre-Ch310 behavior;
        //   - any other state: the registered copy, captured on the clock edge
        //     that leaves DONE, so a late reader (CB_TWAIT -> CB_T) still sees it.
        // The held value is only replaced when the next sample reaches DONE.
        logic [31:0] tex_color_blend;
        logic [31:0] tex_color_hold;
        logic [31:0] tex_color_lin;

        // repack the blended channels in ABGR word order
        assign tex_color_blend = {cv_a, cv_b, cv_g, cv_r};

        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                tex_color_hold <= 32'd0;
            end else begin
                if (state == BS_DONE) begin
                    tex_color_hold <= tex_color_blend;   // snapshot the finished blend
                end
            end
        end

        // held copy by default, live blend only while the DONE pulse is active
        assign tex_color_lin = (state != BS_DONE) ? tex_color_hold : tex_color_blend;

        // --- output mux ---
        // do_lin=0 (non-PSMCT32 psm, or MMAG=0 NEAREST): the nearest single-read
        // path drives every output unchanged, so PSMT8/PSMT4/swizzle and nearest
        // PSMCT32 keep working with BILINEAR_ENABLE=1, and busy stays 0.
        // do_lin=1: the bilinear FSM owns the read strobe, out_valid and
        // tex_color (the blended ABGR, stable from out_valid to the next DONE).
        // The read address comes from the same addr-gen in both cases; the wrap
        // picks beat_u/beat_v or the port u/v, and the FSM only gates rd_en.
        assign tex_rd_addr = near_rd_addr;
        assign tex_rd_en   = !do_lin ? near_rd_en     : bi_rd_en;
        assign out_valid   = !do_lin ? near_out_valid : (state == BS_DONE);
        assign tex_color   = !do_lin ? near_color     : tex_color_lin;
        assign busy        = (state != BS_IDLE) && do_lin;
    end else begin : g_nearest
        // Nearest-only build: every output is wired straight to the nearest
        // path, BYTE-IDENTICAL to the original assigns.
        assign tex_rd_addr = near_rd_addr;
        assign tex_rd_en   = near_rd_en;
        assign tex_color   = near_color;
        assign out_valid   = near_out_valid;
        assign busy        = 1'b0;       // no multi-cycle sampling in this build
        // Bilinear hooks tied off: the wrap always uses the port u/v, and the
        // beat coordinate is unused (pruned).
        assign bili_active = 1'b0;
        assign beat_u      = 11'd0;
        assign beat_v      = 11'd0;
    end
    endgenerate

endmodule : gs_texture_unit