Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

89 lines
4.4 KiB
Systemverilog

// ============================================================================
// gs_grad_divider.sv (Ch352 — sequential signed divider for the triangle-setup gradient solve)
//
// Replaces the single combinational `grad_num_q[grad_step] / grad_det_q` in gs_stub. That combinational
// divider is a ~6700-cell, ~100ns cone at the 25MHz design clock — the worst setup path, and (the real lesson)
// it CANNOT be covered by any SDC timing exception: both a multicycle and a false_path made the Quartus fitter
// grind on its cone indefinitely (Place stuck <1% for hours). A sequential divider has REGISTERED iterations and
// no combinational cone, so every internal path is an ordinary single-cycle path that closes timing normally —
// no exception needed, no grind.
//
// BIT-EXACT to SystemVerilog signed `/`:
// * truncation toward zero (divide magnitudes, then apply the XOR-of-signs);
// * den == 0 -> quotient 0 (matches the gs_stub `if (grad_det_q==0) grad_quo=0` guard).
// Restoring division of the W-bit magnitudes (W iterations), one iteration per clock.
//
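// Handshake: pulse `start` with num/den stable -> `busy` high for the solve -> `done` pulses for one cycle
// with `quo` valid (`quo` then stays valid until the next start). A `start` that arrives while `busy` is
// high is ignored. When den == 0 there is no solve: `busy` never rises and `done` pulses on the cycle
// right after `start`. The gs_stub gradient FSM waits on `done`.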
// ============================================================================
`timescale 1ns/1ps
module gs_grad_divider #(
    parameter int W = 56 // operand width (gs_stub: grad_num_q / sign-extended grad_det)
)(
    input  logic                clk,
    input  logic                rst_n,
    input  logic                start, // pulse: begin a divide (num/den sampled this cycle)
    input  logic signed [W-1:0] num,
    input  logic signed [W-1:0] den,
    output logic signed [W-1:0] quo,   // truncate-toward-zero quotient (== $signed(num)/$signed(den))
    output logic                busy,
    output logic                done   // 1-cycle pulse when quo is valid
);
    // Width of the iteration counter: it must be able to hold the value W itself.
    localparam int CW = $clog2(W+1);

    // Two's-complement magnitude of a signed operand, returned as an unsigned W-bit value.
    // The most-negative input wraps to 2^(W-1), which still fits in W unsigned bits.
    function automatic logic [W-1:0] absval(input logic signed [W-1:0] v);
        absval = v[W-1] ? (~v + 1'b1) : v;
    endfunction

    // ---- state ------------------------------------------------------------------------------------
    logic [W:0]    rem;     // partial remainder; one extra bit so the shifted value can be compared
    logic [W-1:0]  qbuild;  // starts as |num|; dividend bits leave from the top, quotient bits enter at bit 0
    logic [W-1:0]  den_mag; // |den| captured at start
    logic          qsign;   // sign of the result = sign(num) ^ sign(den)
    logic [CW-1:0] iter;    // iterations still to run (loaded with W, counts down)
    logic          run;     // set when a solve is loaded, cleared on its final step

    // ---- one restoring-division step (combinational) ----------------------------------------------
    logic [W:0]   rem_sh;   // remainder shifted left with the next dividend bit appended
    logic         sub_ok;   // 1 when |den| fits into the shifted remainder -> this quotient bit is 1
    logic [W:0]   rem_nxt;  // remainder after the conditional subtract
    logic [W-1:0] q_nxt;    // qbuild after shifting in this step's quotient bit

    assign rem_sh  = {rem[W-1:0], qbuild[W-1]};
    assign sub_ok  = (rem_sh >= {1'b0, den_mag});
    assign rem_nxt = sub_ok ? (rem_sh - {1'b0, den_mag}) : rem_sh;
    assign q_nxt   = {qbuild[W-2:0], sub_ok};

    // ---- control decodes --------------------------------------------------------------------------
    logic accept;    // a start pulse is honoured only while no solve is in flight
    logic den_zero;  // divide-by-zero request
    logic last_step; // the step being taken now is the final one

    assign accept    = start && !busy;
    assign den_zero  = (den == '0);
    assign last_step = (iter == CW'(1));

    always_ff @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            quo     <= '0;
            busy    <= 1'b0;
            done    <= 1'b0;
            run     <= 1'b0;
            iter    <= '0;
            qsign   <= 1'b0;
            den_mag <= '0;
            qbuild  <= '0;
            rem     <= '0;
        end else begin
            // done is a single-cycle pulse: cleared every clock unless a branch below sets it.
            done <= 1'b0;

            if (accept && den_zero) begin
                // den == 0 -> quotient 0 (matches the gs_stub guard); no iterations, done on the next cycle.
                quo  <= '0;
                done <= 1'b1;
                busy <= 1'b0;
                run  <= 1'b0;
            end else if (accept) begin
                // Capture operand magnitudes and the result sign, then arm W iterations.
                rem     <= '0;
                qbuild  <= absval(num);
                den_mag <= absval(den);
                qsign   <= num[W-1] ^ den[W-1];
                iter    <= CW'(W);
                run     <= 1'b1;
                busy    <= 1'b1;
            end else if (run) begin
                // One restoring step per clock.
                rem    <= rem_nxt;
                qbuild <= q_nxt;
                iter   <= iter - 1'b1;
                if (last_step) begin
                    // q_nxt is the complete W-bit magnitude quotient; negate it when the signs differed.
                    run  <= 1'b0;
                    busy <= 1'b0;
                    done <= 1'b1;
                    quo  <= qsign ? (~q_nxt + 1'b1) : q_nxt;
                end
            end
        end
    end
endmodule