Files
retroDE_ps2/rtl/gif_gs/gs_reciprocal_stub.sv
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

128 lines
5.2 KiB
Systemverilog

// retroDE_ps2 — gs_reciprocal_stub (Ch301)
//
// Pipelined fixed-point reciprocal unit for PERSPECTIVE-CORRECT texture
// interpolation. Computes recip = floor(2**SCALE / q) for an unsigned input
// q, with NO divider in the datapath — a serialized per-pixel divide would
// stall the ~1-pixel/cycle rasterizer (the architect's explicit constraint).
//
// Method — range-reduced table lookup (classic LUT reciprocal):
// 1. e = position of q's most-significant set bit (0..Q_W-1).
// 2. M = q normalized to an IDX_BITS-wide mantissa with its MSB at the top
// (M in [2**(IDX_BITS-1) .. 2**IDX_BITS - 1]), i.e. q ~= M * 2**(e-(IDX_BITS-1)).
// 3. recip = LUT[M] >> e, where LUT[M] = floor(2**(SCALE+IDX_BITS-1) / M).
// Proof: LUT[M] >> e ~= 2**(SCALE+IDX_BITS-1)/(M * 2**e)
// = 2**SCALE / (M * 2**(e-(IDX_BITS-1)))
// = 2**SCALE / q. ✓ (uniform for all e)
//
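// Accuracy: the mantissa is truncated (low bits of q are dropped), so the
// result is never below the exact floor(2**SCALE / q) and the worst-case
// relative error is just under 1 part in 2**(IDX_BITS-1); inputs with
// q < 2**IDX_BITS are exact. For the first perspective
// rung (texel coords <= 63) an 8-bit mantissa gives sub-texel error; bump
// IDX_BITS for tighter precision later if real traces demand it.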
//
// Pipeline: 3 register stages (LAT=3), one result per cycle.
// S0: live input q + in_valid (NOT registered; feeds S1 combinationally).
// S1: e = msb(q); M = normalize(q).
// S2: lut_out = LUT[M]; carry e.
// S3: recip = lut_out >> e; out_valid.
//
// q==0 saturates to all-ones (1/0 -> +inf), which is harmless for the demo
// (q = 1/w with w finite positive is always > 0).
//
// LUT init is a computed `initial` for-loop (Quartus infers ROM from it). If a
// future synth flow rejects it, switch to $readmemh of a generated .mem.
`timescale 1ns/1ps
module gs_reciprocal_stub #(
    parameter int Q_W      = 24,  // input width (q in [1, 2**Q_W))
    parameter int IDX_BITS = 8,   // mantissa / LUT-index width (256 entries)
    parameter int SCALE    = 24,  // output ~= floor(2**SCALE / q)
    parameter int OUT_W    = 25   // output width (recip <= 2**SCALE for q>=1)
) (
    input  logic             clk,
    input  logic             rst_n,      // asynchronous, active-low
    input  logic             in_valid,   // qualifies q; delayed 3 cycles to out_valid
    input  logic [Q_W-1:0]   q,
    output logic             out_valid,
    output logic [OUT_W-1:0] recip       // all-ones when q == 0 or on overflow
);

    // ------------------------------------------------------------------
    // Derived constants
    // ------------------------------------------------------------------
    localparam int LUT_N   = (1 << IDX_BITS);
    localparam int TOP_BIT = IDX_BITS - 1;          // mantissa MSB position
    // LUT entries: floor(2**(SCALE+TOP_BIT) / M). Only M in
    // [2**TOP_BIT .. LUT_N-1] are ever addressed for q != 0 (M always has its
    // MSB set after normalization), so the largest useful entry is 2**SCALE.
    localparam int LUT_W   = SCALE + 1;             // wide enough for M=2**TOP_BIT
    localparam int E_W     = $clog2(Q_W) + 1;       // width of the MSB-index field
    // The normalize datapath must hold both q and an IDX_BITS-wide mantissa.
    // Using max(Q_W, IDX_BITS) keeps the part-select below in range even when
    // the input is narrower than the mantissa (Q_W < IDX_BITS).
    localparam int NORM_W  = (Q_W > IDX_BITS) ? Q_W : IDX_BITS;
    // Explicit all-ones saturation value (avoids relying on how a tool sizes
    // an unbased unsized literal inside a size cast).
    localparam logic [OUT_W-1:0] RECIP_MAX = {OUT_W{1'b1}};

    // Simulation-only parameter sanity checks. The LUT numerator below is a
    // 64-bit value, so 2**(SCALE+TOP_BIT) must fit in 64 bits or every table
    // entry is silently wrong.
    // synthesis translate_off
    initial begin
        if (IDX_BITS < 1)
            $fatal(1, "gs_reciprocal_stub: IDX_BITS must be >= 1");
        if (SCALE + TOP_BIT > 63)
            $fatal(1, "gs_reciprocal_stub: SCALE + IDX_BITS - 1 must be <= 63");
    end
    // synthesis translate_on

    // ------------------------------------------------------------------
    // Reciprocal ROM (contents computed at elaboration/initialization)
    // ------------------------------------------------------------------
    logic [LUT_W-1:0] lut [0:LUT_N-1];

    initial begin
        // 2**(SCALE+TOP_BIT) as a 64-bit constant numerator.
        longint unsigned num;
        num = (64'd1 << (SCALE + TOP_BIT));
        for (int m = 0; m < LUT_N; m++) begin
            // Entry 0 is only reached for q == 0, which is overridden by the
            // saturation path in S3; entries below 2**TOP_BIT are truncated
            // to LUT_W bits and never used for non-zero q.
            if (m == 0) lut[m] = '0;
            else        lut[m] = LUT_W'(num / m);
        end
    end

    // ------------------------------------------------------------------
    // Combinational msb-detect + normalize on the LIVE input q
    // ------------------------------------------------------------------
    // Returns the index of the most-significant set bit of v (0 when v == 0).
    function automatic int unsigned msb_index(input logic [Q_W-1:0] v);
        msb_index = 0;
        for (int i = 0; i < Q_W; i++)
            if (v[i]) msb_index = i;   // last hit wins -> highest set bit
    endfunction

    logic [E_W-1:0]    in_e;      // MSB position of q
    logic [NORM_W-1:0] in_norm;   // q shifted so its MSB sits at bit TOP_BIT

    always_comb begin
        in_e = E_W'(msb_index(q));
        // normalize so the mantissa MSB sits at bit TOP_BIT
        if (in_e >= TOP_BIT) in_norm = NORM_W'(q) >> (in_e - TOP_BIT);
        else                 in_norm = NORM_W'(q) << (TOP_BIT - in_e);
    end

    // ---------------- S1: e + mantissa (from the LIVE input) ----------------
    // The msb-detect + normalize above is combinational on q and registered
    // here, so the whole unit is exactly 3 register stages (S1/S2/S3) -> LAT=3.
    logic                s1_valid;
    logic [E_W-1:0]      s1_e;
    logic [IDX_BITS-1:0] s1_m;
    logic                s1_zero;

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s1_valid <= 1'b0;
            s1_e     <= '0;
            s1_m     <= '0;
            s1_zero  <= 1'b0;
        end else begin
            s1_valid <= in_valid;
            s1_zero  <= (q == '0);
            s1_e     <= in_e;
            s1_m     <= in_norm[IDX_BITS-1:0];
        end
    end

    // ---------------- S2: LUT read ------------------------
    logic             s2_valid;
    logic [E_W-1:0]   s2_e;
    logic [LUT_W-1:0] s2_lut;
    logic             s2_zero;

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            s2_valid <= 1'b0;
            s2_e     <= '0;
            s2_lut   <= '0;
            s2_zero  <= 1'b0;
        end else begin
            s2_valid <= s1_valid;
            s2_e     <= s1_e;       // carry exponent alongside the table read
            s2_lut   <= lut[s1_m];
            s2_zero  <= s1_zero;
        end
    end

    // ---------------- S3: shift back ----------------------
    logic [LUT_W-1:0] s3_shifted;
    assign s3_shifted = s2_lut >> s2_e;   // undo the normalization exponent

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            out_valid <= 1'b0;
            recip     <= '0;
        end else begin
            out_valid <= s2_valid;
            if (s2_zero)                     recip <= RECIP_MAX;          // 1/0 -> saturate
            else if (s3_shifted > RECIP_MAX) recip <= RECIP_MAX;          // clamp to OUT_W
            else                             recip <= OUT_W'(s3_shifted);
        end
    end

endmodule : gs_reciprocal_stub