Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

758 lines
39 KiB
Systemverilog

// retroDE_ps2 — gs_texture_unit
//
// Per-pixel texture sampler (brick 1, the texturing datapath core).
//
// Takes a per-pixel texture coordinate (u,v) + the TEX0 descriptor, fetches
// the texel from VRAM through a read port, and outputs the sampled color,
// pipelined to absorb the VRAM read latency.
//
// (u,v,valid) --[gs_texel_addr]--> byte addr --> VRAM read port
// | (RD_LATENCY cyc)
// sampled color <--[decode]-- tex_rd_data
//
// v1 scope (kept deliberately minimal so it's fully verifiable now):
// - PSMCT32 only (32-bit ABGR texels, direct — no CLUT).
// - DECAL texture function (texel replaces fragment color).
//
// Ch296 — PSMT8 indexed texturing (this chapter):
// - When psm==PSMT8 (0x13) the fetched 32-bit word holds FOUR packed
// 8-bit indices. The byte for this texel is selected by the texel
// byte address' low 2 bits (gs_texel_addr emits a 1-byte/texel
// address for PSMT8). That index drives a CLUT lookup port; the
// returned PSMCT32 entry is the texel color (DECAL).
// - The lookup is COMBINATIONAL (clut_stub's read port is comb), so it
// lands in the SAME cycle as the direct PSMCT32 path — the existing
// single S1->S2 register in gs_stub aligns it with emit unchanged.
// - PSMCT32 (psm==0x00) behavior is byte-identical to before.
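// Added since (see the chapter notes below): PSMT4 (nibble) + CLUT (Ch297),
// swizzled addressing (Ch298-Ch300), wrap modes (Ch294) and bilinear
// filtering (Ch308/Ch310/Ch314). Still to come: PSMCT16 unpack and
// MODULATE/HIGHLIGHT tex functions.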
//
// The VRAM read port here is generic (byte address out, 32-bit word in,
// fixed RD_LATENCY). Integration wires it to vram_stub's spare read port;
// vram_stub's exact address convention is reconciled at integration time.
`timescale 1ns/1ps
module gs_texture_unit #(
// Ch298 — SWIZZLED PSMT4 texture sampling. When PSMT4_SWIZZLE=1 AND the
// texture psm==PSMT4, the texel byte address + nibble_hi are computed by
// gs_swizzle_psmt4_stub (the SAME proven module already on the framebuffer
// WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT4 block layout,
// instead of the linear gs_texel_addr. LINEAR is the default (0) so every
// existing linear PSMT4/PSMT8/PSMCT32 demo + TB is byte-identical. The
// swizzled address feeds the SAME word-aligned read, byte-lane extract,
// nibble select, and CLUT lookup — only the address GENERATION differs.
// Because the swizzled address (its low 2 bits + nibble_hi) is also
// address-derived, it flows through the SAME SEL_DELAY pipe as the linear
// selectors, so registered-read (TEX_RD_REGISTERED=1) alignment is reused
// verbatim. This parameter only ever affects PSMT4 textures; PSMT8 and
// PSMCT32 have their own independent gates (PSMT8_SWIZZLE / PSMCT32_SWIZZLE).
parameter bit PSMT4_SWIZZLE = 1'b0,
// Ch299 — SWIZZLED PSMT8 texture sampling. The sibling of PSMT4_SWIZZLE,
// MINUS the nibble (PSMT8 is 1 byte/texel). When PSMT8_SWIZZLE=1 AND the
// texture psm==PSMT8, the texel byte address is computed by
// gs_swizzle_psmt8_stub (the SAME proven module already on the framebuffer
// WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT8 block layout,
// instead of the linear gs_texel_addr. LINEAR is the default (0) so every
// existing linear PSMT8/PSMT4/PSMCT32 demo + TB is byte-identical. The
// swizzled address feeds the SAME word-aligned read, byte-lane extract, and
// CLUT lookup — only the address GENERATION differs. Because the swizzled
// address' low 2 bits (byte-lane selector) are also address-derived, they
// flow through the SAME SEL_DELAY pipe as the linear selectors, so
// registered-read (TEX_RD_REGISTERED=1) alignment is reused verbatim. NO
// nibble pipe is needed — PSMT8 has no nibble. This parameter only ever
// affects PSMT8 textures; PSMT4 and PSMCT32 have their own gates.
parameter bit PSMT8_SWIZZLE = 1'b0,
// Ch300 — SWIZZLED PSMCT32 (direct-color) texture sampling. The closure
// rung of the swizzle layout family. When PSMCT32_SWIZZLE=1 AND the texture
// psm==PSMCT32, the texel byte address is computed by gs_swizzle_psmct32_stub
// (the SAME proven module already on the framebuffer WRITE / SCANOUT / UPLOAD
// paths — Ch119/Ch122) using the real PS2 PSMCT32 page/block layout, instead
// of the linear gs_texel_addr. Unlike PSMT4/PSMT8 this needs NO CLUT and NO
// byte-lane select: PSMCT32 is 4 bytes/texel, so the swizzled address is
// already word-aligned and the fetched 32-bit word IS the color directly
// (tex_color = tex_rd_data). LINEAR is the default (0) so every existing
// linear PSMCT32 demo + TB (textured / tritex) is byte-identical. This is
// the SAME single-param-per-format gate as PSMCT32_SWIZZLE on the FB side,
// so a PSMCT32 texture and a PSMCT32 framebuffer swizzle together.
parameter bit PSMCT32_SWIZZLE = 1'b0,
// Ch294 — GS texture WRAP MODES (REPEAT + CLAMP). When TEX_WRAP_ENABLE=1
// the per-pixel (u,v) are resolved against the texture's power-of-two
// dimensions (width=2^TW, height=2^TH from TEX0) using the CLAMP_1 wrap
// mode (WMS for u/S, WMT for v/T): 0=REPEAT (u & (width-1)), 1=CLAMP
// (u>=width -> width-1). REGION_* (2/3) are NOT modelled and pass through.
// The wrap is applied to u/v BEFORE address generation, so it covers the
// linear path AND every swizzle path. With TEX_WRAP_ENABLE=0 (default)
// u_eff===u and v_eff===v as a compile-time constant, so the wrap logic is
// pruned and every existing consumer is BYTE-IDENTICAL.
parameter bit TEX_WRAP_ENABLE = 1'b0,
// Ch308 — BILINEAR (4-tap) texture filtering, PSMCT32-only this rung.
// When BILINEAR_ENABLE=1 AND psm==PSMCT32 the sampler runs a 4-beat read
// FSM: it fetches the 4 texels surrounding the fractional coord
// (u,v) (u+1,v) (u,v+1) (u+1,v+1)
// — each independently wrapped/clamped through the SAME u_eff/v_eff
// machinery (so edge taps repeat/clamp instead of reading outside the
// texture) — then blends them per channel (R,G,B,A) by the 4-bit
// fractional u_frac/v_frac (0..15, /16) using a >>4 fixed-point lerp.
// For !BILINEAR_ENABLE (default) OR psm!=PSMCT32 the EXACT current
// single-read NEAREST path is used and u_frac/v_frac are ignored, so the
// synthesized logic and every existing consumer is BYTE-IDENTICAL (the
// bilinear FSM, the per-beat coord select, and the blend datapath are all
// pruned as compile-time-dead when BILINEAR_ENABLE=0). Bilinear is
// PSMCT32-only by default; with PALETTE_BILINEAR=1 (Ch314) it also covers
// PSMT8/PSMT4 via per-tap CLUT-before-interp. At PALETTE_BILINEAR=0 the
// indexed textures still take the nearest path even with BILINEAR_ENABLE=1.
// (Ch310: the filtered path is additionally gated at run time by the
// filter_lin port — see that port's comment.)
//
// ALPHA: the alpha channel is INTERPOLATED with the same 4-tap lerp as
// R/G/B (not pass-through-nearest). For an opaque texture (all taps a=255)
// this returns 255 exactly; for a texel-center sample (u_frac=v_frac=0) it
// returns the (u,v) tap's alpha exactly.
parameter bit BILINEAR_ENABLE = 1'b0,
// Ch314 — BILINEAR for PALETTIZED (indexed) textures. When
// PALETTE_BILINEAR=1 (and BILINEAR_ENABLE=1) the 4-tap path also runs for
// PSMT8 (0x13) and PSMT4 (0x14). The CRITICAL rule is CLUT-BEFORE-INTERP:
// each of the 4 taps fetches an INDEX, that index is CLUT'd to an RGBA
// color (the existing combinational clut_rd_idx/clut_rd_data port), and the
// 4 COLORS are then interpolated — NOT the indices. This falls out of
// capturing `near_color` per tap (clut_rd_data for indexed, tex_rd_data for
// PSMCT32) instead of the raw word. Swizzled addressing + wrap/clamp run in
// the SAME per-tap addr-gen that already feeds the nearest path, so they
// happen BEFORE the index/CLUT lookup. Default 0 → indexed textures stay
// nearest even with BILINEAR_ENABLE=1, so every existing build is
// byte-identical (the combined path only ever fed PSMCT32 textures anyway).
parameter bit PALETTE_BILINEAR = 1'b0,
// NOTE(review): the nearest-path valid pipe is declared [RD_LATENCY-1:0]
// and indexed at [RD_LATENCY-1], so this looks like it must be >= 1 —
// confirm no instantiation passes 0.
parameter int RD_LATENCY = 1, // VRAM read latency in clk cycles
// Ch296 — PSMT8 byte-lane realignment. The byte selected from the
// fetched word must use the LOW 2 bits of the address that was ISSUED
// for the returned data. When the texel ADDRESS advances every cycle
// while a read is in flight (gs_stub TEX_RD_REGISTERED=1: address
// taken from the S0 walker, registered read returns 1 cycle later),
// the current `addr` no longer matches the in-flight word, so the
// selector must be delayed by SEL_DELAY cycles to re-pair them. When
// the address is HELD stable across the read (combinational read port,
// address from the stable S1 latch), SEL_DELAY=0 and the current addr
// is correct. Driven from gs_stub as TEX_RD_REGISTERED?TEX_RD_LATENCY:0.
// The same delay is applied to the PSMT4 nibble selector (nibble_hi).
parameter int SEL_DELAY = 0
) (
input logic clk,
input logic rst_n, // active-low; used as an asynchronous reset by every flop here
// per-pixel texture coordinate in
input logic in_valid,
input logic [10:0] u,
input logic [10:0] v,
// Ch308 — fractional texture coords for BILINEAR (4-bit, 0..15 => /16).
// Only latched/used by the bilinear FSM: ignored at BILINEAR_ENABLE=0 and
// whenever the filtered path is not selected for the current psm/filter_lin
// (PSMCT32 always qualifies; PSMT8/PSMT4 only with PALETTE_BILINEAR=1).
input logic [3:0] u_frac,
input logic [3:0] v_frac,
// Ch310 — RUNTIME filter select (per-primitive TEX1_1.MMAG). When
// BILINEAR_ENABLE=1 the 4-tap path runs ONLY when the psm is eligible
// (PSMCT32, or PSMT8/PSMT4 with PALETTE_BILINEAR=1) AND filter_lin is not
// driven 0; with filter_lin=0 (TEX1.MMAG=0 NEAREST) the sampler falls back
// to the exact nearest single-read path (busy stays 0). Unused at
// BILINEAR_ENABLE=0 (g_nearest), so the default build is byte-identical.
input logic filter_lin,
// Ch294 — wrap-mode controls (CLAMP_1 WMS/WMT + TEX0 TW/TH). Unused at
// default (TEX_WRAP_ENABLE=0) since u_eff/v_eff collapse to u/v.
input logic [1:0] wms,
input logic [1:0] wmt,
input logic [3:0] tw,
input logic [3:0] th,
// TEX0 descriptor
input logic [31:0] tbp0_base_bytes, // texture base in VRAM (bytes)
// The full 14 bits go to the linear gs_texel_addr; the swizzle stubs are
// only fed tbw[5:0].
input logic [13:0] tbw, // TEX0.TBW (texels/row / 64)
input logic [5:0] psm, // pixel storage mode
// VRAM texel read port. tex_rd_addr is always presented word-aligned
// (low 2 bits forced to 0); sub-word texels are recovered from the
// returned word by the byte-lane / nibble selectors.
output logic tex_rd_en,
output logic [31:0] tex_rd_addr, // byte address
input logic [31:0] tex_rd_data, // 32-bit word, valid RD_LATENCY later
// Ch296 — CLUT lookup port (indexed texturing). The extracted index
// (full byte for PSMT8, zero-extended nibble for PSMT4) drives
// `clut_rd_idx`; the parent wires this to clut_stub's second
// (combinational) read port and returns the PSMCT32 entry on
// `clut_rd_data`. Unused for PSMCT32 textures.
output logic [7:0] clut_rd_idx,
input logic [31:0] clut_rd_data, // PSMCT32 entry for clut_rd_idx
// sampled color out (aligned with out_valid)
output logic out_valid,
output logic [31:0] tex_color, // ABGR8888
// Ch308 — BILINEAR busy: high while the 4-beat read sequence is in flight
// (the caller must not issue a new in_valid until it drops / out_valid
// pulses). Always 0 at BILINEAR_ENABLE=0 and whenever the filtered path is
// not selected (ineligible psm or filter_lin driven 0), so a caller that
// ignores it sees byte-identical behavior on the nearest path.
output logic busy
);
// Pixel storage mode encodings this sampler recognises (compared against psm).
localparam logic [5:0] PSM_PSMCT32 = 6'h00;
localparam logic [5:0] PSM_PSMT8 = 6'h13;
localparam logic [5:0] PSM_PSMT4 = 6'h14;
// --- Ch294: wrap-mode resolution (u/v -> u_eff/v_eff) ---
// Applied BEFORE any address generation so it covers the linear path AND
// every swizzle path. width=2^TW, height=2^TH (both powers of two), so
// REPEAT is a mask and CLAMP is a >width-1 saturate. u/v are unsigned so
// there is no negative/underflow case to handle. REGION_* (2/3) pass
// through unchanged (not modelled this rung). At TEX_WRAP_ENABLE=0 this is
// a constant pass-through (u_eff===u, v_eff===v) -> byte-identical.
// Ch308 — the coord that FEEDS the wrap. On the nearest path (bilinear off
// or non-PSMCT32) this is the port u/v UNCHANGED, so the wrap output
// (u_eff/v_eff) and everything downstream is byte-identical. On the
// bilinear path it is the current beat's neighbor coord (u+du[k],v+dv[k]),
// so each of the 4 taps is independently wrapped/clamped. `bili_active` is
// a compile-time constant 0 when BILINEAR_ENABLE=0, so u_in===u / v_in===v
// collapses away at the default build.
// Hand-off signals from the bilinear section. Both are driven further down:
// by the g_bilinear generate arm when BILINEAR_ENABLE=1, or tied to constants
// by the g_nearest arm otherwise.
logic        bili_active;     // 1 -> the wrap stage consumes beat_u/beat_v
logic [10:0] beat_u, beat_v;  // neighbour coordinate for the current beat

// Coordinate handed to the wrap stage.
logic [10:0] u_in, v_in;
always_comb begin
    // Default: the port coordinate, untouched (nearest path).
    u_in = u;
    v_in = v;
    // Override with the per-beat neighbour only when the bilinear path is
    // compiled in AND currently selected.
    if (BILINEAR_ENABLE && bili_active) begin
        u_in = beat_u;
        v_in = beat_v;
    end
end
logic [10:0] u_eff, v_eff;       // wrapped coordinate fed to every addr-gen
logic [10:0] u_wmask, v_wmask;   // width-1 / height-1 (REPEAT mask)
logic [10:0] u_wlimit, v_wlimit; // width-1 / height-1 (CLAMP ceiling)

// width = 2^tw, height = 2^th, so "size - 1" serves as both the REPEAT mask
// and the CLAMP ceiling.
assign u_wmask  = (11'd1 << tw) - 11'd1;
assign v_wmask  = (11'd1 << th) - 11'd1;
assign u_wlimit = u_wmask;
assign v_wlimit = v_wmask;

always_comb begin
    // Pass-through unless wrapping is compiled in.
    u_eff = u_in;
    v_eff = v_in;
    if (TEX_WRAP_ENABLE) begin
        // S axis
        unique case (wms)
            2'd0:    u_eff = u_in & u_wmask;                         // REPEAT
            2'd1:    u_eff = (u_in <= u_wlimit) ? u_in : u_wlimit;   // CLAMP
            default: u_eff = u_in;                                   // REGION_*: not modelled
        endcase
        // T axis
        unique case (wmt)
            2'd0:    v_eff = v_in & v_wmask;                         // REPEAT
            2'd1:    v_eff = (v_in <= v_wlimit) ? v_in : v_wlimit;   // CLAMP
            default: v_eff = v_in;                                   // REGION_*: not modelled
        endcase
    end
end
// --- linear address (combinational) ---
// Linear (non-swizzled) address generator, fed the wrapped coordinate.
logic [31:0] lin_addr;       // texel byte address
logic        lin_nibble_hi;  // PSMT4: texel sits in the HIGH nibble of its byte
gs_texel_addr #(
    .ADDR_W (32)
) u_addr (
    // TEX0 descriptor
    .psm             (psm),
    .tbw             (tbw),
    .base_byte_addr  (tbp0_base_bytes),
    // wrapped texel coordinate
    .u               (u_eff),
    .v               (v_eff),
    // results
    .texel_byte_addr (lin_addr),
    .nibble_hi       (lin_nibble_hi)
);
// --- swizzled PSMT4 address (combinational) ---
// EXACTLY mirrors the texture-UPLOAD path (gif_image_xfer_stub Ch139):
// the swizzle module is fed FBP=0 so it emits only the WITHIN-TEXTURE
// byte OFFSET, and the texture base (tbp0_base_bytes) is ADDED on top.
// This makes the sampled address bit-identical to the uploaded one for
// ANY 256-byte-aligned base (using the swizzle module's `fbp` input here
// would discard the low 11 bits of a non-2048-aligned base). FBW=TBW (in
// 64-texel units); PSMT4 swizzle needs FBW even (bw_pg = FBW>>1). The
// texture's (u,v) ARE the swizzle (x,y). Output is byte-offset + nibble_hi
// — the SAME shape gs_texel_addr emits for linear PSMT4, so downstream
// (word-align, byte-lane, nibble select, CLUT) is untouched.
logic [31:0] swz_off;        // within-texture byte offset from the swizzle stub
logic [31:0] swz_addr;       // texture base + swz_off
logic        swz_nibble_hi;  // nibble selector from the swizzle stub
generate
    if (!PSMT4_SWIZZLE) begin : g_no_swizzle4
        // Swizzle not compiled in: tie the stub outputs off.
        assign swz_off       = 32'd0;
        assign swz_nibble_hi = 1'b0;
    end else begin : g_swizzle4
        // fbp is held at 0 so the stub returns an offset only; the base is
        // added outside the stub.
        gs_swizzle_psmt4_stub u_swizzle4 (
            .fbp       (9'd0),
            .fbw       (tbw[5:0]),
            .x         ({1'b0, u_eff}),
            .y         ({1'b0, v_eff}),
            .addr      (swz_off),
            .nibble_hi (swz_nibble_hi)
        );
    end
endgenerate
// Absolute address; constant 0 when the swizzle is not compiled in.
assign swz_addr = PSMT4_SWIZZLE ? (tbp0_base_bytes + swz_off) : 32'd0;
// --- swizzled PSMT8 address (combinational) ---
// Ch299 — EXACTLY mirrors the PSMT4-swizzle sampler arm above (and the
// PSMT8 UPLOAD path in gif_image_xfer_stub Ch133), MINUS the nibble.
// gs_swizzle_psmt8_stub is fed FBP=0 so it emits only the WITHIN-TEXTURE
// byte OFFSET; the texture base (tbp0_base_bytes) is ADDED on top. This
// makes the sampled address bit-identical to the uploaded one for ANY
// 256-byte-aligned base. FBW=TBW (in 64-texel units); the PSMT8 swizzle
// needs FBW even (bw_pg = FBW>>1). The texture's (u,v) ARE the swizzle
// (x,y). Output is a byte address — the SAME shape gs_texel_addr emits for
// linear PSMT8 — so downstream (word-align, byte-lane, CLUT) is untouched.
// No nibble_hi: PSMT8 is one full byte per texel.
logic [31:0] swz8_off;   // within-texture byte offset from the swizzle stub
logic [31:0] swz8_addr;  // texture base + swz8_off
generate
    if (!PSMT8_SWIZZLE) begin : g_no_swizzle8
        // Swizzle not compiled in: tie the stub output off.
        assign swz8_off = 32'd0;
    end else begin : g_swizzle8
        // fbp is held at 0 so the stub returns an offset only; the base is
        // added outside the stub.
        gs_swizzle_psmt8_stub u_swizzle8 (
            .fbp  (9'd0),
            .fbw  (tbw[5:0]),
            .x    ({1'b0, u_eff}),
            .y    ({1'b0, v_eff}),
            .addr (swz8_off)
        );
    end
endgenerate
// Absolute address; constant 0 when the swizzle is not compiled in.
assign swz8_addr = PSMT8_SWIZZLE ? (tbp0_base_bytes + swz8_off) : 32'd0;
// --- swizzled PSMCT32 address (combinational) ---
// Ch300 — direct-color sibling of the PSMT4/PSMT8 swizzle arms above, using
// the SAME proven gs_swizzle_psmct32_stub already on the FB WRITE / SCANOUT
// / UPLOAD paths. Fed FBP=0 so it emits only the WITHIN-TEXTURE byte OFFSET;
// the texture base (tbp0_base_bytes) is ADDED on top, making the sampled
// address bit-identical to the uploaded one for ANY 2048-byte-aligned base.
// FBW=TBW (in 64-pixel units — PSMCT32 page is 64 px wide, so TBW units
// match the stub's fbw directly, NO >>1). The texture's (u,v) ARE the
// swizzle (x,y). Output is a 4-byte-aligned byte address — gs_texel_addr's
// PSMCT32 shape — so downstream is untouched. NO nibble, NO byte-lane, NO
// CLUT: the fetched word is the color (tex_color = tex_rd_data).
logic [31:0] swz32_off;   // within-texture byte offset from the swizzle stub
logic [31:0] swz32_addr;  // texture base + swz32_off
generate
    if (!PSMCT32_SWIZZLE) begin : g_no_swizzle32
        // Swizzle not compiled in: tie the stub output off.
        assign swz32_off = 32'd0;
    end else begin : g_swizzle32
        // fbp is held at 0 so the stub returns an offset only; the base is
        // added outside the stub.
        gs_swizzle_psmct32_stub u_swizzle32 (
            .fbp  (9'd0),
            .fbw  (tbw[5:0]),
            .x    ({1'b0, u_eff}),
            .y    ({1'b0, v_eff}),
            .addr (swz32_off)
        );
    end
endgenerate
// Absolute address; constant 0 when the swizzle is not compiled in.
assign swz32_addr = PSMCT32_SWIZZLE ? (tbp0_base_bytes + swz32_off) : 32'd0;
// --- linear-vs-swizzled select ---
// Each format has its own gate: a swizzled address is used only when that
// format's *_SWIZZLE parameter is set AND psm names that format. The three
// gates test different psm values, so at most one can be true; any other
// case takes the linear address. With all three parameters 0 every gate is
// constant-false and only the linear address remains.
logic        use_swizzle4;
logic        use_swizzle8;
logic        use_swizzle32;
logic [31:0] addr;
logic        nibble_hi;
logic [31:0] addr_ct32_or_lin;  // PSMCT32 swizzle, else linear
logic [31:0] addr_t8_or_rest;   // PSMT8 swizzle, else the above

assign use_swizzle4  = PSMT4_SWIZZLE   && (psm == PSM_PSMT4);
assign use_swizzle8  = PSMT8_SWIZZLE   && (psm == PSM_PSMT8);
assign use_swizzle32 = PSMCT32_SWIZZLE && (psm == PSM_PSMCT32);

// Priority chain, built from the fallback end upwards.
assign addr_ct32_or_lin = use_swizzle32 ? swz32_addr : lin_addr;
assign addr_t8_or_rest  = use_swizzle8  ? swz8_addr  : addr_ct32_or_lin;
assign addr             = use_swizzle4  ? swz_addr   : addr_t8_or_rest;

// Only the PSMT4 swizzle supplies its own nibble selector; every other case
// uses the linear generator's.
assign nibble_hi = use_swizzle4 ? swz_nibble_hi : lin_nibble_hi;
// Nearest-path read enable / address. These are muxed at the module
// outputs (tex_rd_en/tex_rd_addr) below: on the nearest path they ARE the
// outputs (byte-identical); on the bilinear path the FSM drives the
// outputs instead. The word-align mask is a no-op for PSMCT32.
logic        near_rd_en;
logic [31:0] near_rd_addr;
// One read request per accepted coordinate.
assign near_rd_en = in_valid;
// Present the word-aligned address: the low two bits are dropped here and
// used separately (sel_lo) to pick the byte lane out of the returned word.
// An address that is already word-aligned is unchanged.
assign near_rd_addr = {addr[31:2], 2'b00};
// --- PSMT8 index extract ---
// gs_texel_addr returns a 1-byte/texel address for PSMT8, so the
// fetched 32-bit word (read at addr & ~3 by the word-addressed VRAM
// port) packs 4 indices; the issued address' low 2 bits select which
// byte is THIS texel.
//
// The byte selector uses the addr[1:0] from the issue cycle of the
// returned word. SEL_DELAY (see the param comment) is 0 when the
// address is held stable across the read (current addr is correct) and
// >0 when the address advances while the read is in flight (delay the
// selector to re-pair it with the in-flight word). `sel_lo` carries it.
//
// PSMT4 (Ch297) adds a NIBBLE selector on top of the byte selector.
// gs_texel_addr emits a byte address (texel_offset>>1) plus `nibble_hi`
// (= texel_offset[0]: even texel -> LOW nibble, odd -> HIGH nibble). The
// selected byte (via sel_lo, exactly as PSMT8) holds TWO 4-bit indices;
// nibble_hi picks which. Because nibble_hi is derived from the texel
// ADDRESS — which advances every cycle while a read is in flight under
// TEX_RD_REGISTERED=1 — it must be SEL_DELAY-aligned by the SAME pipe
// depth as sel_lo so it re-pairs with the returned word. (Same class as
// the PSMT8 byte-lane realignment; get it wrong and odd/even texels smear.)
logic [1:0] sel_lo;   // byte-lane selector, aligned to the returned word
logic       nib_sel;  // nibble selector, aligned the same way
generate
    if (SEL_DELAY != 0) begin : g_sel_reg
        // Both selectors travel together as one 3-bit word
        // {nibble_hi, addr[1:0]} through SEL_DELAY register stages.
        logic [2:0] sel_stage [0:SEL_DELAY-1];
        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                for (int s = 0; s < SEL_DELAY; s++)
                    sel_stage[s] <= 3'd0;
            end else begin
                sel_stage[0] <= {nibble_hi, addr[1:0]};
                for (int s = 1; s < SEL_DELAY; s++)
                    sel_stage[s] <= sel_stage[s-1];
            end
        end
        assign {nib_sel, sel_lo} = sel_stage[SEL_DELAY-1];
    end else begin : g_sel_comb
        // No delay: use the selectors of the address presented right now.
        assign sel_lo  = addr[1:0];
        assign nib_sel = nibble_hi;
    end
endgenerate
// Byte-lane pick, shared by PSMT8 and PSMT4: sel_lo chooses which of the four
// bytes of the returned word belongs to this texel (lane 0 = bits [7:0]).
logic [7:0] sel_byte;
always_comb begin
    if      (sel_lo == 2'b00) sel_byte = tex_rd_data[ 7: 0];
    else if (sel_lo == 2'b01) sel_byte = tex_rd_data[15: 8];
    else if (sel_lo == 2'b10) sel_byte = tex_rd_data[23:16];
    else                      sel_byte = tex_rd_data[31:24];
end
// PSMT4 index: the selected byte carries two 4-bit indices and nib_sel picks
// the high (1) or low (0) one. The byte is copied to a plain named net first
// so the part-selects below index a simple identifier (iverilog-12 rejects a
// bit-select applied to a parenthesized expression).
logic [7:0] sel_byte_for_nib;
logic [3:0] psmt4_nibble;
logic       idx_from_nibble;
assign sel_byte_for_nib = sel_byte;
assign psmt4_nibble     = nib_sel ? sel_byte_for_nib[7:4]
                                  : sel_byte_for_nib[3:0];
// CLUT index out: zero-extended nibble for PSMT4, the whole byte otherwise.
assign idx_from_nibble  = (psm == PSM_PSMT4);
assign clut_rd_idx      = idx_from_nibble ? {4'd0, psmt4_nibble} : sel_byte;
// --- valid pipeline matching the read latency ---
// in_valid enters at bit 0 and moves up one position per clock; the top bit
// is therefore in_valid delayed by RD_LATENCY cycles, which is when
// tex_rd_data for that request is expected.
logic [RD_LATENCY-1:0] valid_pipe;
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        valid_pipe <= '0;
    else
        // Shift left by one (old top bit falls off) and insert in_valid at
        // bit 0. For RD_LATENCY == 1 the shifted term is all-zero, leaving a
        // single register stage.
        valid_pipe <= (valid_pipe << 1) | in_valid;
end
logic near_out_valid;
assign near_out_valid = valid_pipe[RD_LATENCY-1];
// --- decode (DECAL) ---
// Indexed formats (PSMT8 / PSMT4): the color is the CLUT entry returned for
// clut_rd_idx. Any other psm: the fetched word is used as the color as-is.
logic [31:0] near_color;
logic        near_is_indexed;
assign near_is_indexed = (psm == PSM_PSMT4) || (psm == PSM_PSMT8);
assign near_color      = near_is_indexed ? clut_rd_data : tex_rd_data;
// ========================================================================
// Ch308 — BILINEAR (4-tap) PSMCT32 FILTER
// ========================================================================
// When BILINEAR_ENABLE=1 and psm==PSMCT32 we sample the 4 texels around the
// fractional coord and blend them. The whole block is wrapped in a generate
// that is empty when BILINEAR_ENABLE=0, so the default build is pruned to
// exactly the nearest path and is BYTE-IDENTICAL.
//
// CYCLE SCHEDULE (RD_LATENCY-aware; example RD_LATENCY=L):
// T0 : caller asserts in_valid (with u,v,u_frac,v_frac). FSM in
// IDLE latches u/v/frac, sets beat index k=0, drives
// bili_active=1, busy=1, moves to ISSUE.
// T0+ (ISSUE) : present neighbor[k] coord (beat_u/beat_v -> wrap ->
// gs_texel_addr -> tex_rd_addr) and pulse tex_rd_en for 1
// cycle; start an L-cycle wait; -> WAIT.
// ISSUE+1..+L : WAIT counts L cycles; on the L-th cycle tex_rd_data holds
// beat[k]'s 32-bit ABGR word -> capture into tap[k].
// If k<3: k++ and -> ISSUE (next neighbor). If k==3: -> DONE.
// DONE : combinationally lerp the 4 captured taps by u_frac/v_frac
// per channel; assert out_valid for 1 cycle with tex_color;
// drop busy; -> IDLE.
// => total ~ 4*(1+L)+1 cycles per filtered sample. Throughput is NOT a
// goal here (a later texture-cache pass collapses the 4 reads).
//
// Neighbor table (k -> du,dv): 0->(0,0) 1->(1,0) 2->(0,1) 3->(1,1).
// Each neighbor coord is fed through the SAME u_eff/v_eff wrap (via
// u_in/v_in above) so edge taps repeat/clamp and never read outside the
// texture (proven in the TB clamp/repeat cases).
//
// lerp(a,b,f) = a + (($signed({1'b0,b}) - $signed({1'b0,a})) * $signed({1'b0,f})) >>> 4
// with f the 4-bit frac (0..15 => /16). a,b are 8-bit channels. The
// bracketed product is computed in a SIGNED temp (no bit-select on a
// parenthesized expr — iverilog-12 rule), then arithmetic-shifted >>>4,
// then defensively clamped to 0..255.
generate
if (BILINEAR_ENABLE) begin : g_bilinear
    // FSM state encoding.
    localparam logic [1:0] BS_IDLE  = 2'd0;
    localparam logic [1:0] BS_ISSUE = 2'd1;
    localparam logic [1:0] BS_WAIT  = 2'd2;
    localparam logic [1:0] BS_DONE  = 2'd3;

    // FSM registers.
    logic [1:0]  state;
    logic [1:0]  beat;            // neighbour being fetched, 0..3
    logic [31:0] wait_cnt;        // cycles spent waiting on the current read
    logic [31:0] tap [0:3];       // captured ABGR color per neighbour
    logic [10:0] lat_u, lat_v;    // coordinate latched for this sample
    logic [3:0]  lat_uf, lat_vf;  // fractions latched for this sample

    // Which formats may take the filtered path: PSMCT32 always, and the
    // indexed formats (PSMT8 / PSMT4) only when PALETTE_BILINEAR is set.
    logic is_ct32;
    logic is_indexed;
    logic bili_psm_ok;
    assign is_ct32     = (psm == PSM_PSMCT32);
    assign is_indexed  = (psm == PSM_PSMT4) || (psm == PSM_PSMT8);
    assign bili_psm_ok = (PALETTE_BILINEAR && is_indexed) || is_ct32;

    // Ch310 — run-time filter gate. do_lin is the single predicate that
    // selects the filtered datapath everywhere below: an eligible format AND
    // filter_lin not driven to 0.
    //
    // The test is deliberately `!== 1'b0` rather than a plain truth test: it
    // is true for 1 and also for X/Z, so a testbench that leaves filter_lin
    // unconnected still gets the 4-tap path. A driven 0 selects nearest.
    // With filter_lin always driven to 0 or 1 this is equivalent to
    // `bili_psm_ok && filter_lin`.
    logic do_lin;
    assign do_lin = (filter_lin !== 1'b0) && bili_psm_ok;

    // bili_active steers the wrap input mux near the top of the module. It
    // simply follows do_lin — it does not depend on the FSM state — so the
    // wrap stage takes beat_u/beat_v whenever the filtered path is selected
    // and the port u/v otherwise.
    assign bili_active = do_lin;
// Neighbour offset for the current beat:
//   beat 0 -> (0,0)   beat 1 -> (1,0)   beat 2 -> (0,1)   beat 3 -> (1,1)
logic [10:0] du, dv;
always_comb begin
    unique case (beat)
        2'd0:    begin du = 11'd0; dv = 11'd0; end
        2'd1:    begin du = 11'd1; dv = 11'd0; end
        2'd2:    begin du = 11'd0; dv = 11'd1; end
        default: begin du = 11'd1; dv = 11'd1; end
    endcase
end

// Coordinate handed to the wrap stage (through u_in/v_in) while the filtered
// path is selected.
//
// IDLE: present the live port coordinate with NO neighbour offset. `beat` is
// only cleared when a sample is accepted and is left at 3 once a sample
// completes, so adding du/dv here would present the (u+1, v+1) neighbour's
// address on tex_rd_addr between samples. No read is issued in IDLE
// (bi_rd_en is high only in ISSUE), so this keeps the idle address equal to
// the plain (u, v) address and has no effect on the fetched taps.
//
// Any other state: the coordinate latched at accept time plus the current
// beat's offset. The 11-bit additions wrap modulo 2048.
always_comb begin
    if (state == BS_IDLE) begin
        beat_u = u;
        beat_v = v;
    end else begin
        beat_u = lat_u + du;
        beat_v = lat_v + dv;
    end
end
// The bilinear read address reuses the SAME addr-gen as the nearest path:
// beat_u/beat_v feed the wrap (u_in/v_in -> u_eff/v_eff), the wrapped coord
// feeds the linear and swizzled address generators, and near_rd_addr is the
// selected address with its low 2 bits cleared. The FSM therefore never
// computes an address itself; it only decides WHEN a read is requested.
// We pulse rd_en only on ISSUE (one cycle per beat).
logic bi_rd_en;
assign bi_rd_en = (state == BS_ISSUE);
// 4-beat fetch sequencer: IDLE -> (ISSUE -> WAIT) x4 -> DONE -> IDLE.
// in_valid is sampled only in IDLE; it is ignored in every other state.
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
state <= BS_IDLE;
beat <= 2'd0;
wait_cnt <= 32'd0;
lat_u <= 11'd0; lat_v <= 11'd0;
lat_uf <= 4'd0; lat_vf <= 4'd0;
for (int i = 0; i < 4; i++) tap[i] <= 32'd0;
end else begin
unique case (state)
BS_IDLE: begin
// Accept a sample only when the filtered path is selected; latch
// the coordinate and fractions so the ports may change afterwards.
if (in_valid && do_lin) begin
lat_u <= u; lat_v <= v;
lat_uf <= u_frac; lat_vf <= v_frac;
beat <= 2'd0;
state <= BS_ISSUE;
end
end
BS_ISSUE: begin
// address presented this cycle (combinationally via
// beat -> beat_u/beat_v -> wrap -> addr). Begin the
// RD_LATENCY wait.
wait_cnt <= 32'd1;
state <= BS_WAIT;
end
BS_WAIT: begin
// wait_cnt is 1 on the first WAIT cycle, so the capture below
// happens on the RD_LATENCY-th WAIT cycle (on the first one when
// RD_LATENCY <= 1).
if (wait_cnt >= RD_LATENCY[31:0]) begin
// tex_rd_data now holds beat's word. Capture the
// resolved COLOR (`near_color`): for PSMCT32 that is
// the raw word (byte-identical to the original);
// for PSMT8/PSMT4 (Ch314) it is clut_rd_data — the
// index extracted from this beat's word (sel_byte /
// psmt4_nibble, stable across the held beat) then CLUT'd.
// Capturing the CLUT'd color per tap is what makes the
// downstream lerp interpolate COLORS, not indices.
tap[beat] <= near_color;
if (beat == 2'd3) begin
// Last neighbour captured. Note beat is left at 3 here; it is
// only reset to 0 when the next sample is accepted in IDLE.
state <= BS_DONE;
end else begin
beat <= beat + 2'd1;
state <= BS_ISSUE;
end
end else begin
wait_cnt <= wait_cnt + 32'd1;
end
end
default: begin // BS_DONE
// Single-cycle state; always returns to IDLE on the next clock.
state <= BS_IDLE;
end
endcase
end
end
// --- 4-tap blend (combinational, on the captured taps) ---
// Color word layout: [31:24]=A [23:16]=B [15:8]=G [7:0]=R (ABGR8888).
// tap0=(u,v) tap1=(u+1,v) tap2=(u,v+1) tap3=(u+1,v+1).
//
// lerp8: 8-bit fixed-point interpolation, a + ((b - a) * f) >>> 4, with f a
// 4-bit weight (0..15, i.e. sixteenths). The shift is arithmetic, so a
// negative product rounds toward minus infinity. The result is clamped to
// 0..255 before truncation.
function automatic logic [7:0] lerp8(input logic [7:0] a,
                                     input logic [7:0] b,
                                     input logic [3:0] f);
    logic signed [16:0] delta;    // b - a, in -255..255
    logic signed [21:0] scaled;   // delta * f
    logic signed [21:0] stepped;  // scaled >>> 4
    logic signed [21:0] sum;      // a + stepped
    begin
        delta   = $signed({1'b0, b}) - $signed({1'b0, a});
        scaled  = delta * $signed({1'b0, f});
        stepped = scaled >>> 4;
        sum     = $signed({14'd0, a}) + stepped;
        // Defensive clamp to the 8-bit range.
        if (sum > 22'sd255)
            lerp8 = 8'd255;
        else if (sum < 0)
            lerp8 = 8'd0;
        else
            lerp8 = sum[7:0];
    end
endfunction
// Blend result, one byte per channel (packed into the output word below).
logic [7:0] cv_r, cv_g, cv_b, cv_a;

// Interpolate all four channels of two packed ABGR words with one weight.
function automatic logic [31:0] lerp_abgr(input logic [31:0] p,
                                          input logic [31:0] q,
                                          input logic [3:0]  f);
    begin
        lerp_abgr = {lerp8(p[31:24], q[31:24], f),    // A
                     lerp8(p[23:16], q[23:16], f),    // B
                     lerp8(p[15: 8], q[15: 8], f),    // G
                     lerp8(p[ 7: 0], q[ 7: 0], f)};   // R
    end
endfunction

// Plain named copies of the four captured taps, so the always_comb below
// reads simple nets rather than array elements.
logic [31:0] tap_00, tap_10, tap_01, tap_11;
assign tap_00 = tap[0];   // (u,   v)
assign tap_10 = tap[1];   // (u+1, v)
assign tap_01 = tap[2];   // (u,   v+1)
assign tap_11 = tap[3];   // (u+1, v+1)

// Horizontal pass on each row with the u fraction, then a vertical pass
// between the two rows with the v fraction.
logic [31:0] row_top, row_bot;
always_comb begin
    row_top = lerp_abgr(tap_00, tap_10, lat_uf);
    row_bot = lerp_abgr(tap_01, tap_11, lat_uf);
    {cv_a, cv_b, cv_g, cv_r} = lerp_abgr(row_top, row_bot, lat_vf);
end
// Ch310 — filtered-color hold. The blended color is presented live during
// the DONE cycle (the cycle out_valid pulses on the filtered path) and is
// also captured into a register on the clock edge that leaves DONE.
// Afterwards the registered copy is presented, so a consumer that reads one
// or more cycles after out_valid still sees the same value, until the next
// sample's DONE replaces it.
logic [31:0] tex_color_blend;  // live combinational blend, ABGR
logic [31:0] tex_color_hold;   // last blend captured at DONE
logic [31:0] tex_color_lin;    // live in DONE, held otherwise
logic        blend_live;

assign blend_live      = (state == BS_DONE);
assign tex_color_blend = {cv_a, cv_b, cv_g, cv_r};

always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        tex_color_hold <= 32'd0;
    else if (blend_live)
        tex_color_hold <= tex_color_blend;
end

assign tex_color_lin = blend_live ? tex_color_blend : tex_color_hold;
// --- output mux ---
// When the filtered path is selected (do_lin) the FSM owns rd_en, out_valid,
// tex_color and busy. Otherwise — ineligible psm, or filter_lin driven 0 —
// the nearest single-read signals pass straight through and busy stays 0.
// The read address is common to both: the wrap input mux already picks the
// per-beat neighbour or the port coordinate, so only the enable differs.
assign tex_rd_addr = near_rd_addr;
assign tex_rd_en   = do_lin ? bi_rd_en            : near_rd_en;
assign out_valid   = do_lin ? (state == BS_DONE)  : near_out_valid;
assign tex_color   = do_lin ? tex_color_lin       : near_color;
assign busy        = (state != BS_IDLE) && do_lin;
end else begin : g_nearest
    // Bilinear not compiled in: the outputs are the nearest-path signals and
    // the bilinear hand-off signals are tied to constants.
    assign bili_active = 1'b0;     // wrap always takes the port u/v
    assign beat_u      = 11'd0;    // never selected
    assign beat_v      = 11'd0;
    assign tex_rd_addr = near_rd_addr;
    assign tex_rd_en   = near_rd_en;
    assign out_valid   = near_out_valid;
    assign tex_color   = near_color;
    assign busy        = 1'b0;
end
endgenerate
endmodule : gs_texture_unit