ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
758 lines
39 KiB
Systemverilog
758 lines
39 KiB
Systemverilog
// retroDE_ps2 — gs_texture_unit
|
|
//
|
|
// Per-pixel texture sampler (brick 1, the texturing datapath core).
|
|
//
|
|
// Takes a per-pixel texture coordinate (u,v) + the TEX0 descriptor, fetches
|
|
// the texel from VRAM through a read port, and outputs the sampled color,
|
|
// pipelined to absorb the VRAM read latency.
|
|
//
|
|
// (u,v,valid) --[gs_texel_addr]--> byte addr --> VRAM read port
|
|
// | (RD_LATENCY cyc)
|
|
// sampled color <--[decode]-- tex_rd_data
|
|
//
|
|
// v1 scope (kept deliberately minimal so it's fully verifiable now):
|
|
// - PSMCT32 only (32-bit ABGR texels, direct — no CLUT).
|
|
// - DECAL texture function (texel replaces fragment color).
|
|
//
|
|
// Ch296 — PSMT8 indexed texturing (this chapter):
|
|
// - When psm==PSMT8 (0x13) the fetched 32-bit word holds FOUR packed
|
|
// 8-bit indices. The byte for this texel is selected by the texel
|
|
// byte address' low 2 bits (gs_texel_addr emits a 1-byte/texel
|
|
// address for PSMT8). That index drives a CLUT lookup port; the
|
|
// returned PSMCT32 entry is the texel color (DECAL).
|
|
// - The lookup is COMBINATIONAL (clut_stub's read port is comb), so it
|
|
// lands in the SAME cycle as the direct PSMCT32 path — the existing
|
|
// single S1->S2 register in gs_stub aligns it with emit unchanged.
|
|
// - PSMCT32 (psm==0x00) behavior is byte-identical to before.
|
|
// Next versions add: PSMCT16 unpack, PSMT4 (nibble) + CLUT, swizzle, and
|
|
// MODULATE/HIGHLIGHT tex functions.
|
|
//
|
|
// The VRAM read port here is generic (byte address out, 32-bit word in,
|
|
// fixed RD_LATENCY). Integration wires it to vram_stub's spare read port;
|
|
// vram_stub's exact address convention is reconciled at integration time.
|
|
|
|
`timescale 1ns/1ps
|
|
|
|
module gs_texture_unit #(
|
|
// Ch298 — SWIZZLED PSMT4 texture sampling. When PSMT4_SWIZZLE=1 AND the
|
|
// texture psm==PSMT4, the texel byte address + nibble_hi are computed by
|
|
// gs_swizzle_psmt4_stub (the SAME proven module already on the framebuffer
|
|
// WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT4 block layout,
|
|
// instead of the linear gs_texel_addr. LINEAR is the default (0) so every
|
|
// existing linear PSMT4/PSMT8/PSMCT32 demo + TB is byte-identical. The
|
|
// swizzled address feeds the SAME word-aligned read, byte-lane extract,
|
|
// nibble select, and CLUT lookup — only the address GENERATION differs.
|
|
// Because the swizzled address (its low 2 bits + nibble_hi) is also
|
|
// address-derived, it flows through the SAME SEL_DELAY pipe as the linear
|
|
// selectors, so registered-read (TEX_RD_REGISTERED=1) alignment is reused
|
|
// verbatim. PSMT8/PSMCT32 always take the linear address (this rung is
|
|
// PSMT4-only).
|
|
parameter bit PSMT4_SWIZZLE = 1'b0,
|
|
// Ch299 — SWIZZLED PSMT8 texture sampling. The sibling of PSMT4_SWIZZLE,
|
|
// MINUS the nibble (PSMT8 is 1 byte/texel). When PSMT8_SWIZZLE=1 AND the
|
|
// texture psm==PSMT8, the texel byte address is computed by
|
|
// gs_swizzle_psmt8_stub (the SAME proven module already on the framebuffer
|
|
// WRITE / SCANOUT / UPLOAD paths) using the real PS2 PSMT8 block layout,
|
|
// instead of the linear gs_texel_addr. LINEAR is the default (0) so every
|
|
// existing linear PSMT8/PSMT4/PSMCT32 demo + TB is byte-identical. The
|
|
// swizzled address feeds the SAME word-aligned read, byte-lane extract, and
|
|
// CLUT lookup — only the address GENERATION differs. Because the swizzled
|
|
// address' low 2 bits (byte-lane selector) are also address-derived, they
|
|
// flow through the SAME SEL_DELAY pipe as the linear selectors, so
|
|
// registered-read (TEX_RD_REGISTERED=1) alignment is reused verbatim. NO
|
|
// nibble pipe is needed — PSMT8 has no nibble. PSMT4/PSMCT32 always take
|
|
// their own address (this rung is PSMT8-only).
|
|
parameter bit PSMT8_SWIZZLE = 1'b0,
|
|
// Ch300 — SWIZZLED PSMCT32 (direct-color) texture sampling. The closure
|
|
// rung of the swizzle layout family. When PSMCT32_SWIZZLE=1 AND the texture
|
|
// psm==PSMCT32, the texel byte address is computed by gs_swizzle_psmct32_stub
|
|
// (the SAME proven module already on the framebuffer WRITE / SCANOUT / UPLOAD
|
|
// paths — Ch119/Ch122) using the real PS2 PSMCT32 page/block layout, instead
|
|
// of the linear gs_texel_addr. Unlike PSMT4/PSMT8 this needs NO CLUT and NO
|
|
// byte-lane select: PSMCT32 is 4 bytes/texel, so the swizzled address is
|
|
// already word-aligned and the fetched 32-bit word IS the color directly
|
|
// (tex_color = tex_rd_data). LINEAR is the default (0) so every existing
|
|
// linear PSMCT32 demo + TB (textured / tritex) is byte-identical. This is
|
|
// the SAME single-param-per-format gate as PSMCT32_SWIZZLE on the FB side,
|
|
// so a PSMCT32 texture and a PSMCT32 framebuffer swizzle together.
|
|
parameter bit PSMCT32_SWIZZLE = 1'b0,
|
|
// Ch294 — GS texture WRAP MODES (REPEAT + CLAMP). When TEX_WRAP_ENABLE=1
|
|
// the per-pixel (u,v) are resolved against the texture's power-of-two
|
|
// dimensions (width=2^TW, height=2^TH from TEX0) using the CLAMP_1 wrap
|
|
// mode (WMS for u/S, WMT for v/T): 0=REPEAT (u & (width-1)), 1=CLAMP
|
|
// (u>=width -> width-1). REGION_* (2/3) are NOT modelled and pass through.
|
|
// The wrap is applied to u/v BEFORE address generation, so it covers the
|
|
// linear path AND every swizzle path. With TEX_WRAP_ENABLE=0 (default)
|
|
// u_eff===u and v_eff===v as a compile-time constant, so the wrap logic is
|
|
// pruned and every existing consumer is BYTE-IDENTICAL.
|
|
parameter bit TEX_WRAP_ENABLE = 1'b0,
|
|
// Ch308 — BILINEAR (4-tap) texture filtering, PSMCT32-only this rung.
|
|
// When BILINEAR_ENABLE=1 AND psm==PSMCT32 the sampler runs a 4-beat read
|
|
// FSM: it fetches the 4 texels surrounding the fractional coord
|
|
// (u,v) (u+1,v) (u,v+1) (u+1,v+1)
|
|
// — each independently wrapped/clamped through the SAME u_eff/v_eff
|
|
// machinery (so edge taps repeat/clamp instead of reading outside the
|
|
// texture) — then blends them per channel (R,G,B,A) by the 4-bit
|
|
// fractional u_frac/v_frac (0..15, /16) using a >>4 fixed-point lerp.
|
|
// For !BILINEAR_ENABLE (default) OR psm!=PSMCT32 the EXACT current
|
|
// single-read NEAREST path is used and u_frac/v_frac are ignored, so the
|
|
// synthesized logic and every existing consumer is BYTE-IDENTICAL (the
|
|
// bilinear FSM, the per-beat coord select, and the blend datapath are all
|
|
// pruned as compile-time-dead when BILINEAR_ENABLE=0). Bilinear is
|
|
// PSMCT32-only by default; with PALETTE_BILINEAR=1 (Ch314) it also covers
|
|
// PSMT8/PSMT4 via per-tap CLUT-before-interp. At PALETTE_BILINEAR=0 the
|
|
// indexed textures still take the nearest path even with BILINEAR_ENABLE=1.
|
|
//
|
|
// ALPHA: the alpha channel is INTERPOLATED with the same 4-tap lerp as
|
|
// R/G/B (not pass-through-nearest). For an opaque texture (all taps a=255)
|
|
// this returns 255 exactly; for a texel-center sample (u_frac=v_frac=0) it
|
|
// returns the (u,v) tap's alpha exactly.
|
|
parameter bit BILINEAR_ENABLE = 1'b0,
|
|
// Ch314 — BILINEAR for PALETTIZED (indexed) textures. When
|
|
// PALETTE_BILINEAR=1 (and BILINEAR_ENABLE=1) the 4-tap path also runs for
|
|
// PSMT8 (0x13) and PSMT4 (0x14). The CRITICAL rule is CLUT-BEFORE-INTERP:
|
|
// each of the 4 taps fetches an INDEX, that index is CLUT'd to an RGBA
|
|
// color (the existing combinational clut_rd_idx/clut_rd_data port), and the
|
|
// 4 COLORS are then interpolated — NOT the indices. This falls out of
|
|
// capturing `near_color` per tap (clut_rd_data for indexed, tex_rd_data for
|
|
// PSMCT32) instead of the raw word. Swizzled addressing + wrap/clamp run in
|
|
// the SAME per-tap addr-gen that already feeds the nearest path, so they
|
|
// happen BEFORE the index/CLUT lookup. Default 0 → indexed textures stay
|
|
// nearest even with BILINEAR_ENABLE=1, so every existing build is
|
|
// byte-identical (the combined path only ever fed PSMCT32 textures anyway).
|
|
parameter bit PALETTE_BILINEAR = 1'b0,
|
|
parameter int RD_LATENCY = 1, // VRAM read latency in clk cycles
|
|
// Ch296 — PSMT8 byte-lane realignment. The byte selected from the
|
|
// fetched word must use the LOW 2 bits of the address that was ISSUED
|
|
// for the returned data. When the texel ADDRESS advances every cycle
|
|
// while a read is in flight (gs_stub TEX_RD_REGISTERED=1: address
|
|
// taken from the S0 walker, registered read returns 1 cycle later),
|
|
// the current `addr` no longer matches the in-flight word, so the
|
|
// selector must be delayed by SEL_DELAY cycles to re-pair them. When
|
|
// the address is HELD stable across the read (combinational read port,
|
|
// address from the stable S1 latch), SEL_DELAY=0 and the current addr
|
|
// is correct. Driven from gs_stub as TEX_RD_REGISTERED?TEX_RD_LATENCY:0.
|
|
parameter int SEL_DELAY = 0
|
|
) (
|
|
input logic clk,
|
|
input logic rst_n,
|
|
|
|
// per-pixel texture coordinate in
|
|
input logic in_valid,
|
|
input logic [10:0] u,
|
|
input logic [10:0] v,
|
|
|
|
// Ch308 — fractional texture coords for BILINEAR (4-bit, 0..15 => /16).
|
|
// Unused at default (BILINEAR_ENABLE=0) and for non-PSMCT32 psm.
|
|
input logic [3:0] u_frac,
|
|
input logic [3:0] v_frac,
|
|
|
|
// Ch310 — RUNTIME filter select (per-primitive TEX1_1.MMAG). When
|
|
// BILINEAR_ENABLE=1 the 4-tap path runs ONLY when (bili_psm_ok && filter_lin), i.e. PSMCT32, or PSMT8/PSMT4 when PALETTE_BILINEAR=1;
|
|
// with filter_lin=0 (TEX1.MMAG=0 NEAREST) the sampler falls back to the
|
|
// exact nearest single-read path (busy stays 0). Unused at
|
|
// BILINEAR_ENABLE=0 (g_nearest), so the default build is byte-identical.
|
|
input logic filter_lin,
|
|
|
|
// Ch294 — wrap-mode controls (CLAMP_1 WMS/WMT + TEX0 TW/TH). Unused at
|
|
// default (TEX_WRAP_ENABLE=0) since u_eff/v_eff collapse to u/v.
|
|
input logic [1:0] wms,
|
|
input logic [1:0] wmt,
|
|
input logic [3:0] tw,
|
|
input logic [3:0] th,
|
|
|
|
// TEX0 descriptor
|
|
input logic [31:0] tbp0_base_bytes, // texture base in VRAM (bytes)
|
|
input logic [13:0] tbw, // TEX0.TBW (texels/row / 64)
|
|
input logic [5:0] psm, // pixel storage mode
|
|
|
|
// VRAM texel read port
|
|
output logic tex_rd_en,
|
|
output logic [31:0] tex_rd_addr, // byte address
|
|
input logic [31:0] tex_rd_data, // 32-bit word, valid RD_LATENCY later
|
|
|
|
// Ch296 — CLUT lookup port (PSMT8 indexed texturing). The extracted
|
|
// 8-bit index drives `clut_rd_idx`; the parent wires this to
|
|
// clut_stub's second (combinational) read port and returns the
|
|
// PSMCT32 entry on `clut_rd_data`. Unused for PSMCT32 textures.
|
|
output logic [7:0] clut_rd_idx,
|
|
input logic [31:0] clut_rd_data, // PSMCT32 entry for clut_rd_idx
|
|
|
|
// sampled color out (aligned with out_valid)
|
|
output logic out_valid,
|
|
output logic [31:0] tex_color, // ABGR8888
|
|
|
|
// Ch308 — BILINEAR busy: high while the 4-beat read sequence is in flight
|
|
// (the caller must not issue a new in_valid until it drops / out_valid
|
|
// pulses). Always 0 on the nearest path (BILINEAR_ENABLE=0 or non-PSMCT32),
|
|
// so a caller that ignores it sees byte-identical behavior.
|
|
output logic busy
|
|
);
|
|
|
|
localparam logic [5:0] PSM_PSMCT32 = 6'h00;
|
|
localparam logic [5:0] PSM_PSMT8 = 6'h13;
|
|
localparam logic [5:0] PSM_PSMT4 = 6'h14;
|
|
|
|
// --- Ch294: wrap-mode resolution (u/v -> u_eff/v_eff) ---
|
|
// Applied BEFORE any address generation so it covers the linear path AND
|
|
// every swizzle path. width=2^TW, height=2^TH (both powers of two), so
|
|
// REPEAT is a mask and CLAMP is a >width-1 saturate. u/v are unsigned so
|
|
// there is no negative/underflow case to handle. REGION_* (2/3) pass
|
|
// through unchanged (not modelled this rung). At TEX_WRAP_ENABLE=0 this is
|
|
// a constant pass-through (u_eff===u, v_eff===v) -> byte-identical.
|
|
// Ch308 — the coord that FEEDS the wrap. On the nearest path (bilinear off
|
|
// or non-PSMCT32) this is the port u/v UNCHANGED, so the wrap output
|
|
// (u_eff/v_eff) and everything downstream is byte-identical. On the
|
|
// bilinear path it is the current beat's neighbor coord (u+du[k],v+dv[k]),
|
|
// so each of the 4 taps is independently wrapped/clamped. `bili_active` is
|
|
// a compile-time constant 0 when BILINEAR_ENABLE=0, so u_in===u / v_in===v
|
|
// collapses away at the default build.
|
|
logic bili_active; // declared below; bilinear running for this psm
|
|
logic [10:0] beat_u, beat_v; // declared below; current beat neighbor coord
|
|
logic [10:0] u_in, v_in;
|
|
// Coordinate that feeds the wrap stage. Defaults to the port (u, v); it is
// overridden with the current bilinear beat's neighbor coordinate only when
// the bilinear datapath is compiled in (BILINEAR_ENABLE) and selected
// (bili_active).
always_comb begin
    u_in = u;
    v_in = v;
    if (BILINEAR_ENABLE && bili_active) begin
        u_in = beat_u;
        v_in = beat_v;
    end
end
|
|
|
|
// Wrap-mode resolution: (u_in, v_in) -> (u_eff, v_eff).
logic [10:0] u_eff, v_eff;
logic [10:0] u_wmask, v_wmask;      // (1 << tw) - 1 and (1 << th) - 1
logic [10:0] u_wlimit, v_wlimit;    // clamp ceilings (same value as the masks)
always_comb begin
    u_wmask  = (11'd1 << tw) - 11'd1;
    v_wmask  = (11'd1 << th) - 11'd1;
    u_wlimit = u_wmask;
    v_wlimit = v_wmask;

    // Pass-through by default: covers TEX_WRAP_ENABLE=0 and every wrap mode
    // other than 0 (REPEAT) and 1 (CLAMP).
    u_eff = u_in;
    v_eff = v_in;

    if (TEX_WRAP_ENABLE) begin
        // U axis, selected by wms
        if (wms == 2'd0)
            u_eff = u_in & u_wmask;                             // REPEAT
        else if (wms == 2'd1)
            u_eff = (u_in > u_wlimit) ? u_wlimit : u_in;        // CLAMP

        // V axis, selected by wmt
        if (wmt == 2'd0)
            v_eff = v_in & v_wmask;                             // REPEAT
        else if (wmt == 2'd1)
            v_eff = (v_in > v_wlimit) ? v_wlimit : v_in;        // CLAMP
    end
end
|
|
|
|
// --- linear address (combinational) ---
|
|
// Linear texel address generator. Consumes the wrapped coordinate
// (u_eff, v_eff) plus the TEX0 descriptor fields and produces a byte address
// and a nibble flag.
logic [31:0] lin_addr;        // byte address from gs_texel_addr
logic        lin_nibble_hi;   // nibble flag from gs_texel_addr (used for PSMT4)
gs_texel_addr #(
    .ADDR_W (32)
) u_addr (
    // texture descriptor
    .psm             (psm),
    .tbw             (tbw),
    .base_byte_addr  (tbp0_base_bytes),
    // wrapped coordinate
    .u               (u_eff),
    .v               (v_eff),
    // results
    .texel_byte_addr (lin_addr),
    .nibble_hi       (lin_nibble_hi)
);
|
|
|
|
// --- swizzled PSMT4 address (combinational) ---
|
|
// EXACTLY mirrors the texture-UPLOAD path (gif_image_xfer_stub Ch139):
|
|
// the swizzle module is fed FBP=0 so it emits only the WITHIN-TEXTURE
|
|
// byte OFFSET, and the texture base (tbp0_base_bytes) is ADDED on top.
|
|
// This makes the sampled address bit-identical to the uploaded one for
|
|
// ANY 256-byte-aligned base (using the swizzle module's `fbp` input here
|
|
// would discard the low 11 bits of a non-2048-aligned base). FBW=TBW (in
|
|
// 64-texel units); PSMT4 swizzle needs FBW even (bw_pg = FBW>>1). The
|
|
// texture's (u,v) ARE the swizzle (x,y). Output is byte-offset + nibble_hi
|
|
// — the SAME shape gs_texel_addr emits for linear PSMT4, so downstream
|
|
// (word-align, byte-lane, nibble select, CLUT) is untouched.
|
|
// PSMT4 swizzled address arm. The swizzle stub is given fbp=0, so its output
// is used as an offset and the texture base is added afterwards. Only the
// low 6 bits of tbw are passed as fbw. When PSMT4_SWIZZLE=0 the three nets
// are tied to zero.
logic [31:0] swz_off;         // offset produced by the swizzle stub
logic [31:0] swz_addr;        // tbp0_base_bytes + swz_off
logic        swz_nibble_hi;   // nibble flag from the swizzle stub
generate
    if (!PSMT4_SWIZZLE) begin : g_no_swizzle4
        assign swz_nibble_hi = 1'b0;
        assign swz_addr      = 32'd0;
        assign swz_off       = 32'd0;
    end else begin : g_swizzle4
        gs_swizzle_psmt4_stub u_swizzle4 (
            .x         ({1'b0, u_eff}),
            .y         ({1'b0, v_eff}),
            .fbw       (tbw[5:0]),
            .fbp       (9'd0),
            .addr      (swz_off),
            .nibble_hi (swz_nibble_hi)
        );
        assign swz_addr = tbp0_base_bytes + swz_off;
    end
endgenerate
|
|
|
|
// --- swizzled PSMT8 address (combinational) ---
|
|
// Ch299 — EXACTLY mirrors the PSMT4-swizzle sampler arm above (and the
|
|
// PSMT8 UPLOAD path in gif_image_xfer_stub Ch133), MINUS the nibble.
|
|
// gs_swizzle_psmt8_stub is fed FBP=0 so it emits only the WITHIN-TEXTURE
|
|
// byte OFFSET; the texture base (tbp0_base_bytes) is ADDED on top. This
|
|
// makes the sampled address bit-identical to the uploaded one for ANY
|
|
// 256-byte-aligned base. FBW=TBW (in 64-texel units); the PSMT8 swizzle
|
|
// needs FBW even (bw_pg = FBW>>1). The texture's (u,v) ARE the swizzle
|
|
// (x,y). Output is a byte address — the SAME shape gs_texel_addr emits for
|
|
// linear PSMT8 — so downstream (word-align, byte-lane, CLUT) is untouched.
|
|
// No nibble_hi: PSMT8 is one full byte per texel.
|
|
// PSMT8 swizzled address arm. Same shape as the PSMT4 arm but without a
// nibble output: the stub is given fbp=0, its output is used as an offset,
// and the texture base is added afterwards. Tied to zero when
// PSMT8_SWIZZLE=0.
logic [31:0] swz8_off;    // offset produced by the swizzle stub
logic [31:0] swz8_addr;   // tbp0_base_bytes + swz8_off
generate
    if (!PSMT8_SWIZZLE) begin : g_no_swizzle8
        assign swz8_addr = 32'd0;
        assign swz8_off  = 32'd0;
    end else begin : g_swizzle8
        gs_swizzle_psmt8_stub u_swizzle8 (
            .x    ({1'b0, u_eff}),
            .y    ({1'b0, v_eff}),
            .fbw  (tbw[5:0]),
            .fbp  (9'd0),
            .addr (swz8_off)
        );
        assign swz8_addr = tbp0_base_bytes + swz8_off;
    end
endgenerate
|
|
|
|
// --- swizzled PSMCT32 address (combinational) ---
|
|
// Ch300 — direct-color sibling of the PSMT4/PSMT8 swizzle arms above, using
|
|
// the SAME proven gs_swizzle_psmct32_stub already on the FB WRITE / SCANOUT
|
|
// / UPLOAD paths. Fed FBP=0 so it emits only the WITHIN-TEXTURE byte OFFSET;
|
|
// the texture base (tbp0_base_bytes) is ADDED on top, making the sampled
|
|
// address bit-identical to the uploaded one for ANY 2048-byte-aligned base.
|
|
// FBW=TBW (in 64-pixel units — PSMCT32 page is 64 px wide, so TBW units
|
|
// match the stub's fbw directly, NO >>1). The texture's (u,v) ARE the
|
|
// swizzle (x,y). Output is a 4-byte-aligned byte address — gs_texel_addr's
|
|
// PSMCT32 shape — so downstream is untouched. NO nibble, NO byte-lane, NO
|
|
// CLUT: the fetched word is the color (tex_color = tex_rd_data).
|
|
// PSMCT32 swizzled address arm. The stub is given fbp=0, its output is used
// as an offset, and the texture base is added afterwards. Tied to zero when
// PSMCT32_SWIZZLE=0.
logic [31:0] swz32_off;    // offset produced by the swizzle stub
logic [31:0] swz32_addr;   // tbp0_base_bytes + swz32_off
generate
    if (!PSMCT32_SWIZZLE) begin : g_no_swizzle32
        assign swz32_addr = 32'd0;
        assign swz32_off  = 32'd0;
    end else begin : g_swizzle32
        gs_swizzle_psmct32_stub u_swizzle32 (
            .x    ({1'b0, u_eff}),
            .y    ({1'b0, v_eff}),
            .fbw  (tbw[5:0]),
            .fbp  (9'd0),
            .addr (swz32_off)
        );
        assign swz32_addr = tbp0_base_bytes + swz32_off;
    end
endgenerate
|
|
|
|
// --- linear-vs-swizzled select ---
|
|
// Each of the three swizzle arms (PSMT4, PSMT8, PSMCT32) is selected only
// when its own parameter is set AND psm matches that format. The three psm
// codes are distinct, so at most one gate can be true at a time; any other
// case takes the linear address. With all three parameters at 0 every gate
// is constant-false and the linear address is always used.
logic        use_swizzle4;
logic        use_swizzle8;
logic        use_swizzle32;
logic [31:0] addr;          // selected texel byte address
logic        nibble_hi;     // selected nibble flag

assign use_swizzle4  = (psm == PSM_PSMT4)   && PSMT4_SWIZZLE;
assign use_swizzle8  = (psm == PSM_PSMT8)   && PSMT8_SWIZZLE;
assign use_swizzle32 = (psm == PSM_PSMCT32) && PSMCT32_SWIZZLE;

// Address: PSMT4 arm, then PSMT8 arm, then PSMCT32 arm, else linear.
assign addr = use_swizzle4  ? swz_addr
            : use_swizzle8  ? swz8_addr
            : use_swizzle32 ? swz32_addr
            :                 lin_addr;

// Only the PSMT4 swizzle arm supplies its own nibble flag; every other case
// uses the one from the linear address generator.
assign nibble_hi = use_swizzle4 ? swz_nibble_hi : lin_nibble_hi;
|
|
|
|
// Nearest-path read enable / address. These are muxed at the module
|
|
// outputs (tex_rd_en/tex_rd_addr) below: on the nearest path they ARE the
|
|
// outputs (byte-identical); on the bilinear path the FSM drives the
|
|
// outputs instead. The word-align mask is a no-op for PSMCT32.
|
|
// Nearest-path read request: enable follows in_valid, and the address is
// the selected texel address with its two low bits forced to zero
// (word-aligned). The dropped low bits are recovered by the byte-lane
// selector (sel_lo).
logic        near_rd_en;
logic [31:0] near_rd_addr;
assign near_rd_en   = in_valid;
assign near_rd_addr = {addr[31:2], 2'b00};
|
|
|
|
// --- PSMT8 index extract ---
|
|
// gs_texel_addr returns a 1-byte/texel address for PSMT8, so the
|
|
// fetched 32-bit word (read at addr & ~3 by the word-addressed VRAM
|
|
// port) packs 4 indices; the issued address' low 2 bits select which
|
|
// byte is THIS texel.
|
|
//
|
|
// The byte selector uses the addr[1:0] from the issue cycle of the
|
|
// returned word. SEL_DELAY (see the param comment) is 0 when the
|
|
// address is held stable across the read (current addr is correct) and
|
|
// >0 when the address advances while the read is in flight (delay the
|
|
// selector to re-pair it with the in-flight word). `sel_lo` carries it.
|
|
//
|
|
// PSMT4 (Ch297) adds a NIBBLE selector on top of the byte selector.
|
|
// gs_texel_addr emits a byte address (texel_offset>>1) plus `nibble_hi`
|
|
// (= texel_offset[0]: even texel -> LOW nibble, odd -> HIGH nibble). The
|
|
// selected byte (via sel_lo, exactly as PSMT8) holds TWO 4-bit indices;
|
|
// nibble_hi picks which. Because nibble_hi is derived from the texel
|
|
// ADDRESS — which advances every cycle while a read is in flight under
|
|
// TEX_RD_REGISTERED=1 — it must be SEL_DELAY-aligned by the SAME pipe
|
|
// depth as sel_lo so it re-pairs with the returned word. (Same class as
|
|
// the PSMT8 byte-lane realignment; get it wrong and odd/even texels smear.)
|
|
// Byte-lane selector (sel_lo) and nibble selector (nib_sel).
//   SEL_DELAY == 0 : both are the live addr[1:0] / nibble_hi.
//   SEL_DELAY != 0 : both are delayed by SEL_DELAY clocks through a pair of
//                    shift pipes with identical depth, cleared on reset.
logic [1:0] sel_lo;
logic       nib_sel;
generate
    if (SEL_DELAY != 0) begin : g_sel_reg
        logic [1:0] sel_pipe [0:SEL_DELAY-1];
        logic       nib_pipe [0:SEL_DELAY-1];

        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                for (int stage = 0; stage < SEL_DELAY; stage++) begin
                    nib_pipe[stage] <= 1'b0;
                    sel_pipe[stage] <= 2'd0;
                end
            end else begin
                // Shift the later stages, then load stage 0 from the live
                // selectors (non-blocking, so statement order is irrelevant).
                for (int stage = 1; stage < SEL_DELAY; stage++) begin
                    nib_pipe[stage] <= nib_pipe[stage-1];
                    sel_pipe[stage] <= sel_pipe[stage-1];
                end
                nib_pipe[0] <= nibble_hi;
                sel_pipe[0] <= addr[1:0];
            end
        end

        assign nib_sel = nib_pipe[SEL_DELAY-1];
        assign sel_lo  = sel_pipe[SEL_DELAY-1];
    end else begin : g_sel_comb
        assign nib_sel = nibble_hi;
        assign sel_lo  = addr[1:0];
    end
endgenerate
|
|
|
|
// Byte select (shared by PSMT8 and PSMT4): pick the texel's byte lane.
|
|
// Byte lane of the fetched word chosen by sel_lo (lane 0 = bits [7:0]).
logic [7:0] sel_byte;
always_comb begin
    if (sel_lo == 2'b00)
        sel_byte = tex_rd_data[ 7: 0];
    else if (sel_lo == 2'b01)
        sel_byte = tex_rd_data[15: 8];
    else if (sel_lo == 2'b10)
        sel_byte = tex_rd_data[23:16];
    else
        sel_byte = tex_rd_data[31:24];
end

// Nibble of the selected byte chosen by nib_sel (1 = high nibble). The byte
// is copied to a named net before being sliced.
logic [7:0] sel_byte_for_nib;
logic [3:0] psmt4_nibble;
assign sel_byte_for_nib = sel_byte;
assign psmt4_nibble     = !nib_sel ? sel_byte_for_nib[3:0]
                                   : sel_byte_for_nib[7:4];

// CLUT index: zero-extended nibble for PSMT4, otherwise the full byte.
assign clut_rd_idx = (psm == PSM_PSMT4) ? {4'd0, psmt4_nibble} : sel_byte;
|
|
|
|
// --- valid pipeline matching the read latency ---
|
|
// in_valid presented with the address this cycle; tex_rd_data for it
|
|
// arrives RD_LATENCY cycles later. Delay valid by the same amount.
|
|
// in_valid delayed by RD_LATENCY clocks: stage 0 samples in_valid, each later
// stage samples the previous one, and the last stage is near_out_valid.
logic [RD_LATENCY-1:0] valid_pipe;
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        valid_pipe <= '0;
    end else begin
        // The loop body executes zero times when RD_LATENCY is 1.
        for (int stage = 1; stage < RD_LATENCY; stage++)
            valid_pipe[stage] <= valid_pipe[stage-1];
        valid_pipe[0] <= in_valid;
    end
end

logic near_out_valid;
assign near_out_valid = valid_pipe[RD_LATENCY-1];
|
|
|
|
// --- decode (DECAL) ---
|
|
// PSMT4 : texel color = CLUT[nibble] (indexed indirection)
|
|
// PSMT8 : texel color = CLUT[byte index] (indexed indirection)
|
|
// PSMCT32 : texel word IS the color directly (byte-identical to v1)
|
|
// Nearest-path color: the CLUT entry for PSMT8 / PSMT4, otherwise the
// fetched word itself.
logic [31:0] near_color;
assign near_color = ((psm != PSM_PSMT8) && (psm != PSM_PSMT4))
                    ? tex_rd_data : clut_rd_data;
|
|
|
|
// ========================================================================
|
|
// Ch308 — BILINEAR (4-tap) PSMCT32 FILTER
|
|
// ========================================================================
|
|
// When BILINEAR_ENABLE=1 and psm==PSMCT32 we sample the 4 texels around the
|
|
// fractional coord and blend them. The whole block is wrapped in a generate
|
|
// that is empty when BILINEAR_ENABLE=0, so the default build is pruned to
|
|
// exactly the nearest path and is BYTE-IDENTICAL.
|
|
//
|
|
// CYCLE SCHEDULE (RD_LATENCY-aware; example RD_LATENCY=L):
|
|
// T0 : caller asserts in_valid (with u,v,u_frac,v_frac). FSM in
|
|
// IDLE latches u/v/frac, sets beat index k=0, drives
|
|
// bili_active=1, busy=1, moves to ISSUE.
|
|
// T0+ (ISSUE) : present neighbor[k] coord (beat_u/beat_v -> wrap ->
|
|
// gs_texel_addr -> tex_rd_addr) and pulse tex_rd_en for 1
|
|
// cycle; start an L-cycle wait; -> WAIT.
|
|
// ISSUE+1..+L : WAIT counts L cycles; on the L-th cycle tex_rd_data holds
|
|
// beat[k]'s 32-bit ABGR word -> capture into tap[k].
|
|
// If k<3: k++ and -> ISSUE (next neighbor). If k==3: -> DONE.
|
|
// DONE : combinationally lerp the 4 captured taps by u_frac/v_frac
|
|
// per channel; assert out_valid for 1 cycle with tex_color;
|
|
// drop busy; -> IDLE.
|
|
// => total ~ 4*(1+L)+1 cycles per filtered sample. Throughput is NOT a
|
|
// goal here (a later texture-cache pass collapses the 4 reads).
|
|
//
|
|
// Neighbor table (k -> du,dv): 0->(0,0) 1->(1,0) 2->(0,1) 3->(1,1).
|
|
// Each neighbor coord is fed through the SAME u_eff/v_eff wrap (via
|
|
// u_in/v_in above) so edge taps repeat/clamp and never read outside the
|
|
// texture (proven in the TB clamp/repeat cases).
|
|
//
|
|
// lerp(a,b,f) = a + (($signed({1'b0,b}) - $signed({1'b0,a})) * $signed({1'b0,f})) >>> 4
|
|
// with f the 4-bit frac (0..15 => /16). a,b are 8-bit channels. The
|
|
// bracketed product is computed in a SIGNED temp (no bit-select on a
|
|
// parenthesized expr — iverilog-12 rule), then arithmetic-shifted >>>4,
|
|
// then defensively clamped to 0..255.
|
|
generate
|
|
if (BILINEAR_ENABLE) begin : g_bilinear
|
|
localparam logic [1:0] BS_IDLE = 2'd0;
|
|
localparam logic [1:0] BS_ISSUE = 2'd1;
|
|
localparam logic [1:0] BS_WAIT = 2'd2;
|
|
localparam logic [1:0] BS_DONE = 2'd3;
|
|
|
|
logic [1:0] state;
|
|
logic [1:0] beat; // which neighbor 0..3
|
|
logic [31:0] wait_cnt; // counts RD_LATENCY
|
|
logic [31:0] tap [0:3]; // captured ABGR per neighbor
|
|
logic [10:0] lat_u, lat_v; // latched coord for this sample
|
|
logic [3:0] lat_uf, lat_vf; // latched fracs
|
|
|
|
// is this a PSMCT32 sample? bilinear runs for PSMCT32 always, and (Ch314)
|
|
// for PSMT8/PSMT4 when PALETTE_BILINEAR=1; any other psm falls back to the
|
|
// nearest path even with BILINEAR_ENABLE=1.
|
|
// Format eligibility for the 4-tap path: PSMCT32 always; PSMT8 / PSMT4 only
// when PALETTE_BILINEAR is set.
logic is_ct32;
logic is_indexed;
logic bili_psm_ok;
assign is_ct32     = (psm == PSM_PSMCT32);
assign is_indexed  = (psm == PSM_PSMT4) || (psm == PSM_PSMT8);
assign bili_psm_ok = (is_indexed && PALETTE_BILINEAR) || is_ct32;

// do_lin is the single predicate that selects the bilinear datapath: an
// eligible format AND filter_lin not driven to 0.
//
// The case-inequality (`!== 1'b0`) is deliberate: it evaluates true for an
// undriven (Z) or X filter_lin, so a testbench that leaves the port
// unconnected still gets the 4-tap path. A driven 0 selects nearest and a
// driven 1 selects bilinear; for a driven 0/1 input this is the same as
// `bili_psm_ok && filter_lin`.
logic do_lin;
assign do_lin = (filter_lin !== 1'b0) && bili_psm_ok;

// bili_active is read by the wrap-input mux (u_in / v_in). It is exactly
// do_lin and does not depend on the FSM state.
assign bili_active = do_lin;
|
|
|
|
// neighbor delta for the current beat
|
|
// Neighbor offset for the current beat. The beat index encodes the offsets
// directly: bit 0 is the +1 in u, bit 1 is the +1 in v.
//   beat 0 -> (0,0)   beat 1 -> (1,0)   beat 2 -> (0,1)   beat 3 -> (1,1)
logic [10:0] du, dv;
always_comb begin
    du = {10'd0, beat[0]};
    dv = {10'd0, beat[1]};
end
|
|
// Beat coordinate fed to the wrap stage (through u_in / v_in).
//
// IDLE     : the live port coordinate (u, v), with NO neighbor offset.
//            `beat` is only cleared when a sample is accepted in IDLE; after
//            the 4th tap it is left at 3 through DONE and back into IDLE.
//            Adding du/dv here would therefore present (u+1, v+1) on every
//            sample after the first one, instead of neighbor 0 of the live
//            coordinate.
// otherwise: the latched sample coordinate plus the current beat's
//            neighbor offset.
always_comb begin
    if (state == BS_IDLE) begin
        beat_u = u;
        beat_v = v;
    end else begin
        beat_u = lat_u + du;
        beat_v = lat_v + dv;
    end
end
|
|
|
|
// The bilinear read address reuses the SAME addr-gen (gs_texel_addr via
|
|
// the u_eff/v_eff wrap fed by beat_u/beat_v). near_rd_addr already is
|
|
// (addr & ~3) for the currently-selected coord; for PSMCT32 the linear
|
|
// path is used and it is word-aligned. We pulse rd_en only on ISSUE.
|
|
logic bi_rd_en;
|
|
assign bi_rd_en = (state == BS_ISSUE);
|
|
|
|
// Bilinear read sequencer.
//   IDLE  : on (in_valid && do_lin) latch u/v and the fractions, clear the
//           beat index, go to ISSUE.
//   ISSUE : one cycle; start the wait counter at 1, go to WAIT.
//   WAIT  : count until wait_cnt reaches RD_LATENCY, then capture near_color
//           into tap[beat]. After tap 3 go to DONE, otherwise advance the
//           beat and go back to ISSUE.
//   DONE  : one cycle, then back to IDLE.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        state    <= BS_IDLE;
        beat     <= 2'd0;
        wait_cnt <= 32'd0;
        lat_u    <= 11'd0;
        lat_v    <= 11'd0;
        lat_uf   <= 4'd0;
        lat_vf   <= 4'd0;
        tap[0]   <= 32'd0;
        tap[1]   <= 32'd0;
        tap[2]   <= 32'd0;
        tap[3]   <= 32'd0;
    end else begin
        unique case (state)
            BS_IDLE: begin
                if (in_valid && do_lin) begin
                    state  <= BS_ISSUE;
                    beat   <= 2'd0;
                    lat_u  <= u;
                    lat_v  <= v;
                    lat_uf <= u_frac;
                    lat_vf <= v_frac;
                end
            end

            BS_ISSUE: begin
                // The address for this beat is presented combinationally
                // during this cycle; begin counting the read latency.
                state    <= BS_WAIT;
                wait_cnt <= 32'd1;
            end

            BS_WAIT: begin
                if (wait_cnt >= RD_LATENCY[31:0]) begin
                    // Capture the resolved color for this beat: near_color
                    // is the CLUT entry for PSMT8/PSMT4 and the raw word
                    // otherwise, so the later blend works on colors.
                    tap[beat] <= near_color;
                    if (beat == 2'd3) begin
                        state <= BS_DONE;
                    end else begin
                        state <= BS_ISSUE;
                        beat  <= beat + 2'd1;
                    end
                end else begin
                    wait_cnt <= wait_cnt + 32'd1;
                end
            end

            default: begin   // BS_DONE
                state <= BS_IDLE;
            end
        endcase
    end
end
|
|
|
|
// --- 4-tap blend (combinational, on the captured taps) ---
|
|
// PSMCT32 word layout: [31:24]=A [23:16]=B [15:8]=G [7:0]=R (ABGR8888).
|
|
// tap0=(u,v) tap1=(u+1,v) tap2=(u,v+1) tap3=(u+1,v+1).
|
|
// lerp8: 8-bit linear interpolation with a 4-bit weight.
//   result = a + floor((b - a) * f / 16), saturated to 0..255.
//   a, b : unsigned 8-bit end points
//   f    : weight 0..15 (f = 0 returns a)
// All intermediate math is signed; the divide by 16 is an arithmetic right
// shift, so a negative product rounds toward minus infinity.
function automatic logic [7:0] lerp8(input logic [7:0] a,
                                     input logic [7:0] b,
                                     input logic [3:0] f);
    logic signed [16:0] delta;      // b - a
    logic signed [21:0] weighted;   // delta * f
    logic signed [21:0] step;       // weighted >>> 4
    logic signed [21:0] mixed;      // a + step
    begin
        delta    = $signed({1'b0, b}) - $signed({1'b0, a});
        weighted = delta * $signed({1'b0, f});
        step     = weighted >>> 4;
        mixed    = step + $signed({14'd0, a});

        // Start from the in-range result, then saturate if needed.
        lerp8 = mixed[7:0];
        if (mixed > 22'sd255) lerp8 = 8'd255;
        if (mixed < 0)        lerp8 = 8'd0;
    end
endfunction
|
|
|
|
// Channel split of the four captured taps. Every tap word is laid out as
// [31:24]=A [23:16]=B [15:8]=G [7:0]=R, so one concatenation assignment per
// tap peels off all four bytes at once.
logic [7:0] t0_r, t0_g, t0_b, t0_a;
logic [7:0] t1_r, t1_g, t1_b, t1_a;
logic [7:0] t2_r, t2_g, t2_b, t2_a;
logic [7:0] t3_r, t3_g, t3_b, t3_a;

assign {t0_a, t0_b, t0_g, t0_r} = tap[0];
assign {t1_a, t1_b, t1_g, t1_r} = tap[1];
assign {t2_a, t2_b, t2_g, t2_r} = tap[2];
assign {t3_a, t3_b, t3_g, t3_r} = tap[3];

// Two-pass blend, evaluated independently for each channel:
//   top = lerp8(tap0, tap1, lat_uf)     first pair, weighted by lat_uf
//   bot = lerp8(tap2, tap3, lat_uf)     second pair, weighted by lat_uf
//   cv  = lerp8(top,  bot,  lat_vf)     combine the two, weighted by lat_vf
logic [7:0] top_r, top_g, top_b, top_a;
logic [7:0] bot_r, bot_g, bot_b, bot_a;
logic [7:0] cv_r,  cv_g,  cv_b,  cv_a;

always_comb begin
    // red
    top_r = lerp8(t0_r, t1_r, lat_uf);
    bot_r = lerp8(t2_r, t3_r, lat_uf);
    cv_r  = lerp8(top_r, bot_r, lat_vf);

    // green
    top_g = lerp8(t0_g, t1_g, lat_uf);
    bot_g = lerp8(t2_g, t3_g, lat_uf);
    cv_g  = lerp8(top_g, bot_g, lat_vf);

    // blue
    top_b = lerp8(t0_b, t1_b, lat_uf);
    bot_b = lerp8(t2_b, t3_b, lat_uf);
    cv_b  = lerp8(top_b, bot_b, lat_vf);

    // alpha (interpolated exactly like the color channels)
    top_a = lerp8(t0_a, t1_a, lat_uf);
    bot_a = lerp8(t2_a, t3_a, lat_uf);
    cv_a  = lerp8(top_a, bot_a, lat_vf);
end
|
|
|
|
// Ch310 — HOLD register for the filtered color. The combined-renderer
|
|
// FSM (gs_stub CB_TWAIT) may latch the result a cycle or two AFTER the
|
|
// out_valid pulse (it steps at half-rate on z_advance beats), so the
|
|
// blended ABGR must stay STABLE from out_valid until the next sample.
|
|
// tex_color is the LIVE combinational blend during DONE (so an
|
|
// out_valid-keyed caller — tb_gs_texture_bilinear — reads the fresh
|
|
// value the SAME cycle out_valid pulses, byte-identical to before) and
|
|
// the LATCHED copy afterward (so a caller that reads one+ cycles later,
|
|
// like CB_TWAIT→CB_T, still sees it). The register captures the blend
|
|
// on the clk edge that LEAVES DONE; combining "live during DONE, held
|
|
// after" gives a value stable from out_valid until the next sample
|
|
// overwrites it at its DONE.
|
|
// Blend result repacked as one word: [31:24]=A [23:16]=B [15:8]=G [7:0]=R.
logic [31:0] tex_color_blend;
assign tex_color_blend = {cv_a, cv_b, cv_g, cv_r};

// Registered copy of the blend. It is rewritten on every rising clk edge
// on which the FSM is in BS_DONE and keeps its value in every other state.
// The asynchronous active-low reset clears it to zero.
logic [31:0] tex_color_hold;
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        tex_color_hold <= 32'd0;
    end else begin
        if (state == BS_DONE) begin
            tex_color_hold <= tex_color_blend;
        end
    end
end

// Filtered-path color: the registered copy outside BS_DONE, the live
// combinational blend while the FSM is in BS_DONE.
logic [31:0] tex_color_lin;
assign tex_color_lin = (state != BS_DONE) ? tex_color_hold : tex_color_blend;
|
|
|
|
// --- output mux ---
// `do_lin` selects, output by output, between the bilinear FSM and the
// nearest single-read path. (`do_lin` is derived earlier in the file; per
// the original note here it is 0 for non-PSMCT32 formats and for MMAG=0
// NEAREST, so those cases keep working with BILINEAR_ENABLE=1 — confirm
// against its definition.)
//
//   do_lin = 1 : the FSM drives rd_en, out_valid pulses while the FSM is in
//                BS_DONE, and tex_color is the live-then-held blend.
//   do_lin = 0 : every output is the nearest path's signal and busy is 0.

// The read address comes from the one shared address generator in both
// modes; only the read enable is switched.
assign tex_rd_addr = near_rd_addr;
assign tex_rd_en   = do_lin ? bi_rd_en : near_rd_en;

assign tex_color   = do_lin ? tex_color_lin      : near_color;
assign out_valid   = do_lin ? (state == BS_DONE) : near_out_valid;

// busy is high from the cycle after a filtered sample is accepted until
// the FSM is back in BS_IDLE, and only while do_lin is asserted.
assign busy        = (state != BS_IDLE) && do_lin;
|
|
end else begin : g_nearest
|
|
// Nearest-only branch: each module output is wired straight to the nearest
// single-read path, so behavior is exactly that of the pre-bilinear design.
assign tex_rd_addr = near_rd_addr;
assign tex_rd_en   = near_rd_en;
assign tex_color   = near_color;
assign out_valid   = near_out_valid;
assign busy        = 1'b0;      // no multi-cycle sampling in this branch

// Bilinear helper signals are tied off to constants here.
assign bili_active = 1'b0;      // constant -> wrap uses port u/v
assign beat_u      = 11'd0;     // unused (pruned)
assign beat_v      = 11'd0;     // unused (pruned)
|
|
end
|
|
endgenerate
|
|
|
|
endmodule : gs_texture_unit
|