Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

110 lines
4.2 KiB
Systemverilog

// retroDE_ps2 — gs_persp_uv (Ch301)
//
// Per-pixel PERSPECTIVE-CORRECT texture-coordinate divide. Given the three
// affinely-interpolated perspective attributes at a pixel —
//
// uq = (u/w) * 2**FRAC (u-over-w, fixed-point)
// vq = (v/w) * 2**FRAC (v-over-w, fixed-point)
// q = (1/w) * 2**FRAC (one-over-w, fixed-point)
//
// — this recovers the integer texel coordinates:
//
// w_recip = 1/q (= w, via the pipelined gs_reciprocal_stub LUT, NO divider)
// u_texel = (uq * w_recip) >> SCALE (= (u/w) * w = u)
// v_texel = (vq * w_recip) >> SCALE (= (v/w) * w = v)
//
// gs_reciprocal_stub returns recip = floor(2**SCALE / q). With q = (1/w) << FRAC
// that is recip = w << (SCALE-FRAC). Then uq*recip = ((u/w) << FRAC) * (w << (SCALE-FRAC))
// = u << SCALE, so (uq*recip) >> SCALE = u. (The FRAC scaling cancels.)
//
// Pipeline (NO divider, ~1 result/cycle):
// recip: RLAT cycles (gs_reciprocal_stub, 3).
// uq/vq: delayed RLAT cycles to align with recip.
// mul: 1 cycle (uq*recip, vq*recip) + shift + clamp.
// total latency = RLAT + 1.
//
// Output texel coords are clamped to [0, TEXEL_MAX] (saturating), matching the
// integer-coord clamp the affine path already applies.
`timescale 1ns/1ps
// gs_persp_uv: per-pixel perspective-correct texel-coordinate recovery.
//
// Takes the interpolated attributes uq, vq and q, obtains a reciprocal word for
// q from gs_reciprocal_stub, multiplies the (delayed) uq/vq by that word, shifts
// the product right by SCALE and saturates the result to TEXEL_MAX.
//
// Timing as coded here: the attributes pass through RLAT delay registers and
// one output register, so u/v appear RLAT + 1 clocks after uq/vq are sampled.
// out_valid is recip_valid registered once. The u/v registers update on every
// clock; they are not gated by the valid flag.
module gs_persp_uv #(
    parameter int ATTR_W    = 24,   // width of uq/vq ((u/w)<<FRAC)
    parameter int Q_W       = 24,   // width of q ((1/w)<<FRAC)
    parameter int FRAC      = 12,   // fixed-point fraction bits of the attributes
                                    // (documentation only: not referenced by the logic below)
    parameter int SCALE     = 24,   // gs_reciprocal scale (recip = floor(2**SCALE/q));
                                    // also the right-shift applied to each product
    parameter int RECIP_W   = 25,   // width of the reciprocal word (OUT_W of gs_reciprocal_stub)
    parameter int TEXEL_W   = 11,   // width of the u/v outputs
    parameter int TEXEL_MAX = 2047, // saturation ceiling applied to u/v
    // Ch351 — reciprocal LUT mantissa width, forwarded to gs_reciprocal_stub as IDX_BITS.
    // Default 8 (256-entry) is byte-identical to Ch301/342/348.
    // Far-W perspective draws (small Q at high PERSP_FRAC) want more: 11 (2048-entry) ~ 0.05% rel error.
    parameter int RECIP_IDX_BITS = 8
) (
    input  logic               clk,
    input  logic               rst_n,     // asynchronous, active-low
    input  logic               in_valid,
    input  logic [ATTR_W-1:0]  uq,
    input  logic [ATTR_W-1:0]  vq,
    input  logic [Q_W-1:0]     q,
    output logic               out_valid,
    output logic [TEXEL_W-1:0] u,
    output logic [TEXEL_W-1:0] v
);

    // Depth of the attribute delay line. Expected to equal the latency of
    // gs_reciprocal_stub; nothing in this module checks that.
    localparam int RLAT   = 3;
    // Full width of attribute * reciprocal, so the multiply never truncates.
    localparam int PROD_W = ATTR_W + RECIP_W;

    // One delay-line entry: both attributes of a pixel travel together.
    typedef struct packed {
        logic [ATTR_W-1:0] u_over_w;
        logic [ATTR_W-1:0] v_over_w;
    } attr_pair_t;

    // Shift the product down by SCALE and saturate at TEXEL_MAX.
    // The comparison is done at full product width before any bits are dropped.
    function automatic logic [TEXEL_W-1:0] sat_texel(input logic [PROD_W-1:0] product);
        logic [PROD_W-1:0] whole;
        whole = product >> SCALE;
        if (whole > PROD_W'(TEXEL_MAX)) return TEXEL_W'(TEXEL_MAX);
        return whole[TEXEL_W-1:0];
    endfunction

    // ------------------------------------------------------------------
    // Reciprocal of q (pipelined LUT inside gs_reciprocal_stub, no divider)
    // ------------------------------------------------------------------
    logic               recip_valid;
    logic [RECIP_W-1:0] w_recip;

    gs_reciprocal_stub #(
        .Q_W      (Q_W),
        .IDX_BITS (RECIP_IDX_BITS),
        .SCALE    (SCALE),
        .OUT_W    (RECIP_W)
    ) u_recip (
        .clk       (clk),
        .rst_n     (rst_n),
        .in_valid  (in_valid),
        .q         (q),
        .out_valid (recip_valid),
        .recip     (w_recip)
    );

    // ------------------------------------------------------------------
    // Attribute delay line: RLAT stages, free-running (no enable)
    // ------------------------------------------------------------------
    attr_pair_t attr_dly [RLAT];

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (int s = 0; s < RLAT; s++)
                attr_dly[s] <= '0;
        end else begin
            // Non-blocking assignments, so the walk direction does not matter.
            for (int s = RLAT-1; s > 0; s--)
                attr_dly[s] <= attr_dly[s-1];
            attr_dly[0] <= {uq, vq};
        end
    end

    // ------------------------------------------------------------------
    // Multiply (combinational), then shift + saturate into the output regs
    // ------------------------------------------------------------------
    logic [PROD_W-1:0] u_product;
    logic [PROD_W-1:0] v_product;

    // The PROD_W-wide left-hand side sizes the multiply, so the full product is kept.
    assign u_product = attr_dly[RLAT-1].u_over_w * w_recip;
    assign v_product = attr_dly[RLAT-1].v_over_w * w_recip;

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            out_valid <= 1'b0;
            u         <= '0;
            v         <= '0;
        end else begin
            out_valid <= recip_valid;
            u         <= sat_texel(u_product);
            v         <= sat_texel(v_product);
        end
    end

endmodule : gs_persp_uv