ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
7219 lines
414 KiB
Systemverilog
7219 lines
414 KiB
Systemverilog
// retroDE_ps2 — gs_stub
|
||
//
|
||
// GS shell with two architecturally distinct write ports (Ch75), a
|
||
// primitive-shape observer (Ch76–Ch81), a minimal interior
|
||
// rasterizer (Ch84), a D3D-style top-left fill rule on triangle
|
||
// edges (Ch85), per-pixel Gouraud color interpolation across the
|
||
// triangle interior (Ch86), a 2-entry primitive queue feeding
|
||
// the SCAN FSM (Ch87 — absorbs up to 3 back-to-back closes
|
||
// before raster_overflow), and a 3-stage pixel pipeline
|
||
// (Ch88 — S0 coord gen / S1 edge test + bary / S2 color +
|
||
// fb_addr + emit, with R_DRAIN to flush before next pop). The
|
||
// TEX0_1 (GIF reg# 0x06) is latched (Ch98) with sub-field
|
||
// decoder outputs for CBP/CPSM/CSM/CSA/CLD; CSA flows into
|
||
// gs_pcrtc_stub.clut_csa for the indexed-color CLUT lookup
|
||
// path. Ch99..Ch102 added a 1-cycle `tex0_1_wr_q` pulse on
|
||
// every TEX0_1 commit; clut_loader_stub watches it and (when
|
||
// CSM=CSM2, CPSM ∈ {PSMCT32, PSMCT16}, AND the CLD-mode policy
|
||
// permits — CLD=0 never, =1 always full, =2 if CBP changed,
|
||
// =3 if CBP/CPSM/CSA any-changed, =4 always but only 16-entry
|
||
// CSA window, ∈ {5..7} reserved no-op) walks the entries
|
||
// copying CLUT bytes from VRAM into clut_stub via vram_stub's
|
||
// second read port. PSMCT16 entries are RGB5A1-unpacked to
|
||
// PSMCT32 ABGR with 5→8 bit-replicate so clut_stub stays a
|
||
// single PSMCT32 staging area regardless of source format.
|
||
// The raster_pixel_emit channel is a streaming write port for
|
||
// vram_stub (Ch89 — first persistence layer; PSMCT32 lane only,
|
||
// linear byte addressing); gs_pcrtc_stub (Ch90) is the dual
|
||
// scanout engine that reads vram_stub back as r/g/b for video,
|
||
// configured by PMODE / DISPFB1 (Ch91) and DISPLAY1 (Ch92, Ch93)
|
||
// latches written through the privileged CPU MMIO port — no TB
|
||
// sideband. DISPLAY1.DX/DY/DW/DH gate the display window inside
|
||
// the active area; DISPLAY1.MAGH/MAGV (Ch93) scale each VRAM
|
||
// column/line across (MAGH+1)/(MAGV+1) VCK pulses/lines.
|
||
// DISPFB1.PSM picks the scanout pixel format (Ch94 / Ch96 /
|
||
// Ch103 — PSMCT32, PSMCT16, PSMT8, and PSMT4 supported; other
|
||
// PSMs force scanout off). PSMT8 / PSMT4 surface the
|
||
// index/nibble as grayscale by default, and as CLUT-decoded
|
||
// RGB when `clut_enable=1` with a programmed clut_stub (Ch97). The raster channel emits
|
||
// PSMCT32 OR PSMCT16 (Ch95 — pack ABGR → RGB5A1 in S2;
|
||
// raster_pixel_be_q gates which bytes vram_stub commits).
|
||
// FRAME_1.PSM at the time of close picks the format. Ch105 adds
|
||
// PSMT8 raster-emit: when ras_bpp_shift==0 (FRAME_1.PSM=0x13), the
|
||
// rasterizer takes the natural ABGR's R channel (low 8 bits) as
|
||
// the PSMT8 index — the same lane real PS2 hardware writes when
|
||
// the destination FB is PSMT8 — places it in the LOW byte of the
|
||
// emit lane, and sets raster_pixel_be_q=4'b0001 so vram_stub
|
||
// commits exactly the 1 byte at fb_addr without stomping
|
||
// neighbouring pixels in the same 32-bit word. Ch106 closes the
|
||
// indexed-write gap with PSMT4: 2 pixels per byte, byte address =
|
||
// pixel_index >> 1, low/high nibble keyed by pixel_index[0]. The
|
||
// natural ABGR's low 4 bits become the PSMT4 index; emit places
|
||
// it in the targeted nibble position with be=4'b0001 and a
|
||
// per-bit mask (raster_pixel_mask_q = 0x0F or 0xF0) so vram_stub
|
||
// merges only that nibble — the OTHER nibble of the same byte is
|
||
// preserved. Back-to-back same-byte emits (e.g. PSMT4 pixels at
|
||
// x=0 and x=1) chain through NBA semantics without bypass logic.
|
||
// Ch110 — host→local image-transfer (TRX) register set.
|
||
// BITBLTBUF (0x50), TRXPOS (0x51), TRXREG (0x52), TRXDIR (0x53)
|
||
// are latched as 64-bit raw quadwords; `trxdir_wr_q` pulses for
|
||
// one cycle when TRXDIR commits, arming gif_image_xfer_stub for
|
||
// a host→local upload. The transfer engine consumes IMAGE-mode
|
||
// qwords (FLG=2) coming out of gif_packed_stub and writes the
|
||
// pixels into vram_stub at the BITBLTBUF/TRXPOS/TRXREG-described
|
||
// destination — driver-shaped palette/texture upload, no TB
|
||
// sideband on the framebuffer side.
|
||
// What this stub still does NOT do:
|
||
// model sub-pixel edge adjustment; expose per-pixel PSM at the
|
||
// raster channel. ALL FOUR common GS PSMs (CT32 / CT16 / T8 / T4)
|
||
// have OPTIONAL canonical-swizzle paths gated by `PSMCT32_SWIZZLE`
|
||
// (Ch122), `PSMCT16_SWIZZLE` (Ch128), `PSMT8_SWIZZLE` (Ch134),
|
||
// and `PSMT4_SWIZZLE` (Ch140) respectively — all default OFF, so
|
||
// fb_addr is linear by default and any TB that doesn't set those
|
||
// parameters sees the legacy linear framebuffer layout. (PSMT4's
|
||
// swizzle module also outputs a `nibble_hi` selector that
|
||
// replaces `s2_pixel_index[0]` in the existing Ch106 nibble RMW
|
||
// machinery when its gate is on.) R/G/B/A
|
||
// interpolation is integer truncation (not real-GS fixed-point
|
||
// rounding), Q is passed through from the closing vertex's RGBAQ
|
||
// (no float interp), and the queue is shallow enough that
|
||
// sustained STRIP/FAN streams beyond ~3 simultaneous closes still
|
||
// trigger raster_overflow.
|
||
//
|
||
// Contract refs:
|
||
// docs/stub_module_plan.md (Wave 1, item 6)
|
||
// docs/contracts/gif_gs.md (authoritative for port + event taxonomy)
|
||
//
|
||
// Write ports (separate namespaces — do not conflate):
|
||
// - reg_wr_* : privileged-block writes, 16-bit offset within
|
||
// 0x12000000. CPU MMIO path. BGCOLOR (offset 0x00E0)
|
||
// latches into bg_{r,g,b}; other offsets emit EV_MODE.
|
||
// - gif_reg_* : GIF A+D / REGLIST writes, 8-bit reg# per PCSX2's
|
||
// GSRegs.h. gif_packed_stub drives this with
|
||
// REAL_AD_REG_MAP=1. Decodes PRIM/RGBAQ/XYZF2/XYZ2/
|
||
// FRAME_1/ZBUF_1 into per-register 64-bit latches;
|
||
// unknown reg numbers emit EV_MODE.
|
||
//
|
||
// Primitive observer (Ch76–Ch88 — recognition + Gouraud raster + 2-entry primitive queue + 3-stage pixel pipeline):
|
||
// - PRIM[2:0] selects vertex threshold and discrete-vs-strip mode:
|
||
// POINT=1 (discrete), LINE=2 (discrete), LINE_STRIP=2 (strip),
|
||
// TRIANGLE=3 (discrete), TRI_STRIP=3 (strip), TRI_FAN=3 (strip),
|
||
// SPRITE=2 (discrete), reserved=no draw.
|
||
// - XYZ2 (reg# 0x05) and XYZF2 (reg# 0x04) are vertex commits.
|
||
// - Discrete primitives: count vertices to threshold, fire a draw,
|
||
// reset counter to 0. One primitive per N vertices.
|
||
// - Strip / fan primitives (Ch77): count vertices to anchor
|
||
// threshold, fire a draw, then SATURATE the counter at the
|
||
// threshold. Every subsequent vertex commit fires another draw,
|
||
// so the cadence becomes "first N vertices anchor + one primitive
|
||
// per additional vertex." TRI_FAN's pivot-vertex semantics are
|
||
// fully modeled at the recognition layer (Ch78–Ch81): saturated
|
||
// fan extensions PIN the v0 slot to the pivot vertex's identity
|
||
// (raw, color, decoded fields, format flag). The Ch84 raster
|
||
// engine consumes those pinned slots directly; Ch85 added the
|
||
// D3D-style top-left fill rule on triangle edges, and Ch86
|
||
// added per-pixel Gouraud R/G/B/A interpolation across the
|
||
// interior. What stays out of scope from the real GS raster
|
||
// rules: sub-pixel edge adjustment (4-bit fractional X/Y is
|
||
// discarded), real-PS2 fixed-point rounding (R/G/B/A use
|
||
// integer truncation toward zero). ALL FOUR common GS PSMs
|
||
// (CT32 / CT16 / T8 / T4) raster emits are linear by DEFAULT
|
||
// but support the canonical PS2 GS page/block swizzle behind
|
||
// parameter gates: `PSMCT32_SWIZZLE=1` (Ch122) routes through
|
||
// `gs_swizzle_psmct32_stub`; `PSMCT16_SWIZZLE=1` (Ch128)
|
||
// through `gs_swizzle_psmct16_stub`; `PSMT8_SWIZZLE=1`
|
||
// (Ch134) through `gs_swizzle_psmt8_stub` (page=128×64 px,
|
||
// bw_pg=FBW>>1); `PSMT4_SWIZZLE=1` (Ch140) through
|
||
// `gs_swizzle_psmt4_stub` (page=128×128 px, bw_pg=FBW>>1).
|
||
// PSMT4's swizzle module additionally outputs a `nibble_hi`
|
||
// selector that replaces `s2_pixel_index[0]` in the existing
|
||
// Ch106 nibble RMW machinery when its gate is on. The four
|
||
// parameters are independent. Default 0 on all four keeps
|
||
// every existing PSMCT32 / PSMCT16 / PSMT8 / PSMT4 raster TB
|
||
// on the legacy linear formula.
|
||
// - When a draw fires, prim_complete pulses one cycle,
|
||
// prim_complete_count increments, and EV_PRIM_DRAW preempts the
|
||
// cycle's EV_WRITE in the trace (the xyz2_q / xyzf2_q latch still
|
||
// updates).
|
||
// - Ch78: a rolling vertex-identity window (v_curr / v_prev /
|
||
// v_prev_prev / v_pivot) is exposed via prim_v0_q / prim_v1_q /
|
||
// prim_v2_q so a TB can verify *which* vertices form each closed
|
||
// primitive — distinguishes TRI_STRIP rolling triangles
|
||
// {v_n-2, v_n-1, v_n} from TRI_FAN pivot triangles
|
||
// {v_pivot, v_n-1, v_n}. Snapshot is registered in lockstep with
|
||
// the EV_PRIM_DRAW pulse; held between draws.
|
||
// - Ch79: prim_color_q snapshots rgbaq_q at close so each closed
|
||
// primitive carries its draw-time color (the closing-vertex
|
||
// color, equivalent to prim_color_v_close in Ch80 terms).
|
||
// - Ch80: per-vertex Gouraud color — a parallel rolling color
|
||
// window (c_curr / c_prev / c_prev_prev / c_pivot) sampled at
|
||
// each vertex commit. Snapshot outputs prim_color_v0_q /
|
||
// prim_color_v1_q / prim_color_v2_q carry the per-vertex color
|
||
// tuple aligned with prim_v0_q / v1_q / v2_q. Distinguishes
|
||
// flat-shaded (all slots equal) from Gouraud (slots differ)
|
||
// primitives at trace level.
|
||
// - Ch81: structured-field decode. A parallel format-flag
|
||
// rolling window (xyzf2_curr / xyzf2_prev / xyzf2_prev_prev /
|
||
// xyzf2_pivot) records whether each vertex came from XYZ2 or
|
||
// XYZF2. Decoded snapshot outputs prim_v0_decoded_q /
|
||
// prim_v1_decoded_q / prim_v2_decoded_q (vertex_t) and
|
||
// prim_v0_color_decoded_q / v1 / v2 (color_t) carry per-channel
|
||
// fields (x/y/z/fog/is_xyzf2 + r/g/b/a/q) so a downstream
|
||
// rasterizer or pixel-emit path doesn't have to re-derive them
|
||
// from raw 64-bit payloads. Cleared on PRIM write same as the
|
||
// vertex/color windows.
|
||
// - Ch82: minimal pixel emit. One pixel per closed primitive
|
||
// (the closing vertex), addressed by frame_1_q.FBP / FBW and
|
||
// the decoded screen X/Y. pixel_emit pulses on the same edge
|
||
// as prim_complete; pixel_x_q / pixel_y_q / pixel_color_q /
|
||
// pixel_fb_addr_q snapshot the resolved fb destination.
|
||
// - Ch83: PSM-aware byte-width. fb_addr math uses a per-PSM
|
||
// bpp_shift (PSMCT32 / PSMZ32 / host-word formats → shift 2;
|
||
// PSMCT16 / PSMZ16 → shift 1; PSMT8 → shift 0). PSMT4 stays
|
||
// out of scope (host-word fallback).
|
||
// - Ch84: minimal interior rasterizer. SPRITE → axis-aligned
|
||
// rectangle fill; TRI / TRI_STRIP / TRI_FAN → edge-function
|
||
// half-plane test inside the bounding box. POINT / LINE /
|
||
// LINE_STRIP keep Ch82 single-closing-pixel behavior.
|
||
// Separate raster_pixel_emit strobe channel — existing
|
||
// pixel_emit (one-per-close) is unchanged. Flat-shaded;
|
||
// Gouraud across the interior arrives in Ch86 (see below).
|
||
// - Ch85: D3D-style top-left fill rule on triangle edges. At
|
||
// IDLE→SCAN, signed area picks winding (auto-swap to CCW if
|
||
// CW); per-edge top-or-left flag → 0/1 bias such that
|
||
// `inside = all (e[i] + bias[i] <= 0)`. Top edges
|
||
// (dy==0 & dx>0) and left edges (dy>0) are inclusive; right
|
||
// and bottom edges are exclusive. Adjacent triangles sharing
|
||
// an edge no longer double-paint. Degenerate triangles
|
||
// (signed area == 0) skip SCAN and latch raster_degenerate.
|
||
// - Ch86: per-pixel Gouraud color interpolation for triangles.
|
||
// Three per-vertex colors (ras_c0/c1/c2) latched at SCAN
|
||
// init with the same v1↔v2 swap mirror as the vertex coords.
|
||
// Inside a TRI, raster_pixel_color_q is computed as
|
||
// λ0*c0 + λ1*c1 + λ2*c2 per channel, where λ_i are the
|
||
// barycentric weights derived from the unbiased edge
|
||
// functions and divided by the post-swap signed area. R/G/B/A
|
||
// are integer-truncated; Q passes through from the closing
|
||
// vertex (it's an IEEE float and doesn't interp as integer).
|
||
// SPRITE keeps flat shading.
|
||
// - Ch87: 2-entry raster command FIFO. Captures the per-prim
|
||
// context at every close cycle and feeds the FSM one
|
||
// primitive per scan. Effective concurrency = 1 in-flight
|
||
// + 2 queued = absorbs up to 3 back-to-back closes without
|
||
// raster_overflow. raster_overflow now latches when the
|
||
// FIFO is FULL on a new push attempt; degenerate triangles
|
||
// are filtered at enqueue (set raster_degenerate, not
|
||
// pushed). Pop happens at IDLE→SCAN AND at end-of-scan when
|
||
// the FIFO has more work, so back-to-back scans run
|
||
// contiguously without an IDLE bubble.
|
||
// - Ch88: 3-stage pixel pipeline + R_DRAIN. The SCAN body is
|
||
// split into S0 (current x/y from the bbox walker), S1 (edge
|
||
// functions + top-left bias decision), and S2 (Gouraud
|
||
// interp + fb_addr math + emit). Throughput is one candidate
|
||
// pixel per cycle; latency from pop_ok to first emit is 3
|
||
// pipeline stages. After S0 generates the bbox corner the
|
||
// FSM transitions R_SCAN→R_DRAIN so S1/S2 can drain without
|
||
// S0 producing more valids. The next pop fires only when
|
||
// drain_done = (state==R_DRAIN) & !s1_valid_q & !s2_valid_q,
|
||
// so back-to-back scans don't lose pipeline-tail pixels.
|
||
// - PRIM write resets the vertex counter AND the rolling window so
|
||
// a fresh primitive context starts clean.
|
||
//
|
||
// Trace payload schema (this stub):
|
||
// GS MODE arg0=offset arg1=value arg2=- arg3=-
|
||
// GS BGCOLOR arg0=r arg1=g arg2=b arg3=-
|
||
// GS WRITE arg0=reg# arg1=data arg2=selector arg3=-
|
||
// (selector: 1=PRIM 2=RGBAQ 3=XYZF2 4=XYZ2 5=FRAME_1 6=ZBUF_1
|
||
// 7=TEX0_1 (Ch98); 0 for unknown reg#)
|
||
// GS PRIM_DRAW arg0=prim_type arg1=threshold arg2=count arg3=vertex
|
||
// (Ch78: prim_v0_q / prim_v1_q / prim_v2_q outputs
|
||
// carry the per-primitive vertex tuple alongside the
|
||
// event — sample on the same cycle as ev_valid.
|
||
// Ch79: prim_color_q output carries the draw-time
|
||
// rgbaq_q value, latched on the same edge.
|
||
// Ch80: prim_color_v0_q / v1_q / v2_q outputs
|
||
// carry the per-vertex Gouraud color tuple.)
|
||
|
||
`timescale 1ns/1ps
|
||
|
||
module gs_stub
|
||
import trace_pkg::*;
|
||
#(
|
||
parameter logic [15:0] BGCOLOR_OFFSET = 16'h00E0,
|
||
// Ch91 — privileged display registers. PMODE selects which
|
||
// CRTCs are active and how their outputs blend; DISPFB1 holds
|
||
// the framebuffer base/width/PSM that the PCRTC scanout
|
||
// engine reads from. gs_pcrtc_stub consumes these latches as
|
||
// its DISPFB-equivalent inputs (replacing Ch90's TB sideband
|
||
// ports). DISPFB2 / DISPLAY2 / SMODE / SYNC (DISPLAY1 landed in Ch92, below) are
|
||
// out of scope until a dual-display / interlace chapter.
|
||
parameter logic [15:0] PMODE_OFFSET = 16'h0000,
|
||
parameter logic [15:0] DISPFB1_OFFSET = 16'h0070,
|
||
// Ch92 — DISPLAY1 (display window/crop) at offset 0x0080.
|
||
// gs_pcrtc_stub honors DX/DY (window origin) and DW/DH
|
||
// (window extents-1) to gate scanout to a sub-rect of the
|
||
// active area. MAGH/MAGV (scaling) was deferred here and added in Ch93; pcrtc still
|
||
// takes H/V totals from module parameters at instantiation.
|
||
parameter logic [15:0] DISPLAY1_OFFSET = 16'h0080,
|
||
// Ch75: the Ch74-era PMODE/SMODE2/FRAME_1/ZBUF_1 offset
|
||
// parameters were wrong — they conflated GIF A+D register
|
||
// numbers with privileged CRTC/MMIO offsets. PMODE/SMODE2 are
|
||
// privileged-only (CPU MMIO); FRAME_1/ZBUF_1 belong to the GIF
|
||
// A+D namespace, not the privileged one. Removed; replaced with
|
||
// a separate GIF-A+D port below that uses real PS2 reg# from
|
||
// PCSX2's GSRegs.h.
|
||
|
||
// Ch122 — when set, PSMCT32 raster emit (the per-pixel write
|
||
// path through `raster_pixel_fb_addr_q`) computes the VRAM
|
||
// byte address via the canonical PS2 GS page/block swizzle
|
||
// (gs_swizzle_psmct32_stub) instead of the legacy linear
|
||
// formula `(FBW*64*y + x) * 4`. Other PSMs are not affected
|
||
// by this parameter — PSMCT16 has its own gate
|
||
// (PSMCT16_SWIZZLE, Ch128), PSMT8 has PSMT8_SWIZZLE (Ch134),
|
||
// PSMT4 has PSMT4_SWIZZLE (Ch140 — see below).
|
||
// Default 0 keeps every existing PSMCT32 raster TB on the
|
||
// original linear addressing.
|
||
parameter bit PSMCT32_SWIZZLE = 1'b0,
|
||
|
||
// Ch128 — when set, PSMCT16 raster emit computes the VRAM
|
||
// byte address via the canonical PS2 GS page/block/column
|
||
// swizzle (gs_swizzle_psmct16_stub) instead of the legacy
|
||
// linear formula `(FBW*64*y + x) * 2`. Mirrors the Ch122
|
||
// PSMCT32 raster gate at the same emit stage. PSMCT32 / PSMT8 / PSMT4
|
||
// are governed by their own gates (PSMCT32_SWIZZLE /
|
||
// PSMT8_SWIZZLE / PSMT4_SWIZZLE). Default 0 keeps
|
||
// every existing PSMCT16 raster TB (Ch95 raster_psmct16, etc.)
|
||
// on the legacy linear path. Closes the third PSMCT16
|
||
// integration point — together with Ch126 (read-side
|
||
// pcrtc) and Ch127 (image-xfer write-side), all three
|
||
// major PSMCT16 paths can be byte-consistent under the
|
||
// canonical swizzle when their gates are flipped on.
|
||
parameter bit PSMCT16_SWIZZLE = 1'b0,
|
||
|
||
// Ch134 — when set, PSMT8 raster emit computes the VRAM byte
|
||
// address via the canonical PS2 GS page/block/column swizzle
|
||
// (gs_swizzle_psmt8_stub) instead of the legacy linear
|
||
// formula `(FBW*64*y + x) * 1`. PSMT8 pages are 128 px wide
|
||
// (vs 64 px for direct-color PSMs) so the swizzle internally
|
||
// divides FBW by 2 — PCSX2 asserts FBW must be even for
|
||
// PSMT8 at GSLocalMemory.h:553. Mirrors the Ch122/Ch128
|
||
// raster gates at the same emit stage. PSMCT32 / PSMCT16 /
|
||
// PSMT4 are governed by their own gates. Default 0 keeps
|
||
// every existing PSMT8 raster TB (Ch105 raster_psmt8, Ch107
|
||
// PSMT4-via-CT16-CLUT palette path, etc.) on the legacy
|
||
// linear addressing. Closes the third PSMT8 integration
|
||
// point — together with Ch132 (read-side pcrtc) and Ch133
|
||
// (image-xfer write-side), all three major PSMT8 paths can
|
||
// be byte-consistent under the canonical swizzle when their
|
||
// gates are flipped on.
|
||
parameter bit PSMT8_SWIZZLE = 1'b0,
|
||
|
||
// Ch140 — when set, PSMT4 raster emit computes the VRAM byte
|
||
// address via the canonical PS2 GS page/block/column swizzle
|
||
// (gs_swizzle_psmt4_stub) instead of the legacy linear
|
||
// formula `(FBW*64*y + x) >> 1`. PSMT4 is 4 bits/pixel so the
|
||
// swizzle module also outputs a `nibble_hi` selector that
|
||
// picks which nibble of the byte at the swizzled address
|
||
// holds this pixel — the linear formula's s2_pixel_index[0]
|
||
// selector is wrong under the swizzled layout because the
|
||
// canonical PCSX2 columnTable4 reorders nibbles within a
|
||
// block. The existing Ch106 nibble RMW machinery
|
||
// (write_be=4'b0001 + write_mask 0x0F or 0xF0) layers on top
|
||
// of the swizzled byte address: the mask + low/high data
|
||
// placement key on the swizzle's nibble_hi when this gate is
|
||
// on, instead of on s2_pixel_index[0]. PSMT4 pages are 128
|
||
// px wide AND 128 px tall (different from PSMT8's 128×64);
|
||
// the swizzle internally uses bw_pg = ras_fbw>>1 — PCSX2
|
||
// asserts FBW must be even for PSMT4 at GSLocalMemory.h:560.
|
||
// Mirrors the Ch122/Ch128/Ch134 raster gates at the same emit
|
||
// stage. PSMCT32 / PSMCT16 / PSMT8 are governed by their own
|
||
// gates. Default 0 keeps every existing PSMT4 raster TB
|
||
// (Ch106 raster_psmt4, Ch107 PSMT4 e2e, Ch104 round-trip,
|
||
// etc.) on the legacy linear addressing. Closes the third
|
||
// PSMT4 integration point — together with Ch138 (read-side
|
||
// pcrtc) and Ch139 (image-xfer write-side), all three major
|
||
// PSMT4 paths can be byte-consistent under the canonical
|
||
// swizzle when their gates are flipped on, completing the
|
||
// four-PSM byte-accuracy foundation for ALL three integration
|
||
// points.
|
||
parameter bit PSMT4_SWIZZLE = 1'b0,
|
||
|
||
// Brick 1 (texturing) — VRAM texel read latency, in clk cycles,
|
||
// for the texture read port (tex_rd_*). vram_stub's second read
|
||
// port (read2) is COMBINATIONAL, but gs_stub presents the texel
|
||
// address from a registered pipeline stage and consumes the data
|
||
// one cycle later, so the effective round-trip seen by the
|
||
// sampler is 1 cycle. Kept as a parameter so a future registered/
|
||
// BRAM-backed VRAM (vram_bram_stub, RD_LATENCY 1+) can bump it
|
||
// without touching the datapath.
|
||
parameter int TEX_RD_LATENCY = 1,
|
||
|
||
// Ch296 — when set, the FIFO pop (textured-scan start) is held off
|
||
// while `clut_load_busy` is high, so the textured texel fetch never
|
||
// collides with a VRAM->CLUT load on the shared read2 port. Only a top
|
||
// with a CLUT loader sets this (and drives clut_load_busy); default 0
|
||
// ignores clut_load_busy entirely (it may float), keeping every other
|
||
// instance byte-identical to pre-Ch296.
|
||
parameter bit CLUT_STALL = 1'b0,
|
||
|
||
// Ch295 — texel-read port timing model. Selects WHICH pipeline
|
||
// stage drives the texel address presented on `tex_rd_addr`, so
|
||
// the registered `tex_rd_data` lands in the SAME cycle the
|
||
// existing single S1->S2 texel register samples it:
|
||
//
|
||
// 0 (default) : COMBINATIONAL read port (vram_stub.read2). The
|
||
// address is generated from the S1 coords
|
||
// (s1_x_q/s1_y_q); read2 returns mem[addr] in the
|
||
// SAME (S1) cycle, so `s1_tex_color` is valid in S1
|
||
// and the single S1->S2 register aligns it with the
|
||
// S2 pixel. This is the Brick-1 contract every
|
||
// existing TB + the vram_stub demo top verified.
|
||
//
|
||
// 1 : REGISTERED 1-cycle read port (vram_bram_stub.read2,
|
||
// the BRAM board VRAM). The address is generated one
|
||
// stage EARLIER, from the S0 coords (ras_cur_x/
|
||
// ras_cur_y), and presented on the S0 cycle. The BRAM
|
||
// registers the read and returns mem[addr] one cycle
|
||
// later — exactly at the S1 cycle — so `s1_tex_color`
|
||
// is again valid in S1 and the SAME single S1->S2
|
||
// register (unchanged) aligns it with the S2 pixel.
|
||
// No emit-stage timing changes; only the address
|
||
// source + read-enable move up one stage.
|
||
//
|
||
// The S0 coord/valid used for addressing is exactly what latches
|
||
// into s1_x_q/s1_valid_q at the next edge (see the S0->S1 latch),
|
||
// so the texel sampled for S0 pixel P is the texel for the S1
|
||
// instance of pixel P — the invariant the single register relies on.
|
||
parameter bit TEX_RD_REGISTERED = 1'b0,
|
||
|
||
// Brick 2a — ALPHA blending (transparency). When a FLAT (non-
|
||
// textured) PSMCT32 SPRITE closes with PRIM.ABE=1 AND the ALPHA_1
|
||
// register selects the canonical source-over config
|
||
// (A=0/Cs, B=1/Cd, C=0/As, D=1/Cd), the raster emit becomes a
|
||
// read-MODIFY-write against the destination framebuffer:
|
||
// Cv = (((Cs - Cd) * As) >> 7) + Cd per R/G/B, clamp [0,255]
|
||
// The dest pixel Cd is READ back from VRAM at the SAME byte address
|
||
// that is about to be written, through a dedicated dest-fb read
|
||
// port (fb_rd_*). gs_stub does NOT own VRAM — the wrapper arbitrates
|
||
// fb_rd_* onto the spare second read port (read2), mutually
|
||
// exclusive with the texel-fetch port (a FLAT blend never textures)
|
||
// and with the PSMT4 RMW path (blend is PSMCT32). See the wrapper
|
||
// read2 arbitration note.
|
||
//
|
||
// When PRIM.ABE=0 (or the primitive is textured, or PSM != PSMCT32,
|
||
// or ALPHA_1 selects a non-source-over config), the emit path is
|
||
// BYTE-IDENTICAL to the pre-Brick-2a opaque write — fb_rd_en stays
|
||
// low and no extra emit-pipeline latency is introduced.
|
||
//
|
||
// FB_RD_REGISTERED selects the dest-read timing model, mirroring
|
||
// TEX_RD_REGISTERED:
|
||
// 0 (default) : COMBINATIONAL read port (vram_stub.read2). fb_rd_addr
|
||
// is presented in the S2 stage; fb_rd_data is valid the
|
||
// SAME cycle and latched into the S3 blend stage.
|
||
// 1 : REGISTERED 1-cycle read port (vram_bram_stub.read2).
|
||
// fb_rd_addr is presented in the S2 stage; the BRAM
|
||
// returns fb_rd_data one cycle later — exactly at the
|
||
// S3 stage — so S3 blends against the LIVE fb_rd_data.
|
||
// Either way the blended pixel commits one cycle later than an opaque
|
||
// pixel would (the added S3 stage); since ABE is constant within a
|
||
// primitive, no pixel mixes the two latencies.
|
||
parameter bit FB_RD_REGISTERED = 1'b0,
|
||
|
||
// Brick 2b — Z-buffer (depth test) dest-Z read timing model,
|
||
// mirroring FB_RD_REGISTERED. The Z read shares the per-pixel
|
||
// S2-address / S3-use shape:
|
||
// 0 (default) : COMBINATIONAL read port (vram_stub.read2). z_rd_addr
|
||
// presented in the S2 stage; z_rd_data valid the SAME
|
||
// cycle and latched into the S3 compare stage.
|
||
// 1 : REGISTERED 1-cycle read port (vram_bram_stub.read2).
|
||
// z_rd_addr presented in S2; z_rd_data returns one
|
||
// cycle later (S3) and is compared live.
|
||
// The stored-Z read uses the SAME read2 port the alpha dest-fb read
|
||
// and the texel fetch use — mutually exclusive by feature (a flat
|
||
// Z-tested sprite never textures and never alpha-blends in v1; see
|
||
// new_zte_active). A passing Z pixel commits BOTH the framebuffer
|
||
// color AND the Z value through the single VRAM write port, so a
|
||
// Z-tested primitive runs the pixel pipeline at HALF rate (one
|
||
// candidate pixel every 2 cycles): beat 0 writes the fb color (on
|
||
// pass) and beat 1 writes the Z value (on pass, ZMSK=0). When ZTE
|
||
// is inactive for the primitive, the pipeline runs full rate and
|
||
// the emit path is byte-identical to pre-Brick-2b.
|
||
parameter bit Z_RD_REGISTERED = 1'b0,
|
||
|
||
// Ch301 — PERSPECTIVE-CORRECT textured triangles. When 1, a TME TRIANGLE
|
||
// whose texture coords were supplied via the ST register (S=u/w, T=v/w) plus
|
||
// RGBAQ.Q (=1/w) interpolates those three perspective attributes affinely
|
||
// (extra shared-divider gradient steps) and recovers per-pixel integer texel
|
||
// (u,v) = (S/Q, T/Q) through the pipelined gs_reciprocal_stub LUT + multiply
|
||
// (gs_persp_uv) — NO per-pixel divider. UV-supplied triangles + SPRITEs keep
|
||
// the existing AFFINE path. Default 0 generate-guards ALL perspective logic
|
||
// out, so every existing TB / the flat production build is byte-identical
|
||
// (zero DSP/ALM/M20K cost). Only a perspective demo build sets it 1.
|
||
parameter bit PERSPECTIVE_CORRECT = 1'b0,
|
||
// Ch351 — perspective reciprocal LUT width (forwarded to gs_persp_uv). Default 8 = byte-identical to
|
||
// Ch342/348. Far-W draws (high PERSP_FRAC, small Q) set 11 for finer 1/w. Inert when PERSPECTIVE_CORRECT=0.
|
||
parameter int PERSP_RECIP_IDX_BITS = 8,
|
||
// Ch352 — triangle-setup divider settle cycles. The gradient solve uses ONE shared combinational signed
|
||
// divide (grad_num_q[grad_step]/grad_det) advanced one grad_step per cycle. At 30 MHz that divide cone is
|
||
// ~100 ns (−66 ns setup slack) and only matters for perspective/gradient-heavy draws (SH3). GRAD_DIV_CYCLES holds
|
||
// each grad_step for N cycles so the divide settles, capturing the FIFO only on the last cycle (paired with a
|
||
// multicycle SDC constraint). The gradient VALUES are unchanged — bit-exact, only the per-triangle setup is
|
||
// slower (once per primitive, free). Default 1 = byte-identical to today; board fits set 4.
|
||
parameter int GRAD_DIV_CYCLES = 1,
|
||
// Ch352 — use the SEQUENTIAL gs_grad_divider for the triangle-setup divide instead of the combinational `/`.
|
||
// Default 0 = combinational + grad_settle (byte-identical to today; sim regression unchanged). Board fits set
|
||
// 1: registered iterations, no combinational cone -> closes timing with NO SDC exception (the combinational
|
||
// divider's cone could not be multicycle'd or false-pathed without the Quartus fitter grinding for hours).
|
||
parameter bit GRAD_SEQ_DIVIDER = 1'b0,
|
||
|
||
// COMBINED architectural probe — when 1, a TRIANGLE that is
|
||
// textured (PSMCT32 DECAL) AND alpha-blended (source-over) AND
|
||
// depth-tested (GEQUAL/GREATER, PSMZ32) at once runs through a
|
||
// dedicated walker-stalling 5-beat per-pixel FSM (comb_*). This
|
||
// exists to PROVE the per-pixel read/write SCHEDULE for a fully
|
||
// featured primitive (one read2 consumer per beat: Z, then texel,
|
||
// then dest-fb; then two writes: color, then Z). Speed is NOT a
|
||
// goal — the walker HOLDS at each candidate pixel until the FSM
|
||
// completes. Default 0 generate-guards ALL combined logic out and
|
||
// ties ras_combined constant 0, so every existing TB / production
|
||
// build is BYTE-IDENTICAL (zero added cost). Only a combined-mode
|
||
// probe demo sets it 1.
|
||
parameter bit COMBINED_TAZ = 1'b0,
|
||
|
||
// Ch344 — TEXTURED + source-over ALPHA SPRITE path. When 1, a SPRITE that is
|
||
// textured (PSMCT32 TEX0) AND alpha-blended (PRIM.ABE=1, ALPHA_1 source-over)
|
||
// renders through an ISOLATED half-rate per-pixel schedule (NOT the combined-TAZ
|
||
// compositor, which stays triangle/tile/Z-shaped and load-bearing): the existing
|
||
// texel fetch (S1, read2) and the alpha dest-read (S2, read2) are placed on
|
||
// SEPARATE half-rate beats so read2 keeps exactly one consumer per cycle, then
|
||
// gs_alpha_blend composites Cs=texel(*vertex color if TFX=0 MODULATE), As=texel
|
||
// alpha, over the dest. Default 0 generate-guards ALL of it out and ties
|
||
// new_tex_abe_active to 0, so a textured ABE sprite still falls back to opaque
|
||
// DECAL exactly as pre-Ch344 — every existing TB / production build byte-identical.
|
||
parameter bit SPRITE_TEX_ALPHA = 1'b0,
|
||
// Ch347 — also admit PSMT8 (CLUT-indexed) textures into the textured-alpha SPRITE path. Default 0
|
||
// keeps the proven Ch344 PSMCT32 sprite path byte-identical; when 1, a PSMT8 (TEX0.PSM=0x13) source-
|
||
// over sprite takes the same half-rate texel+dest+blend path, with gs_texture_unit doing the
|
||
// index->CLUT->ABGR decode upstream of the s1_tex_color capture (the CLUT must be loaded via a CLD!=0
|
||
// TEX0 before the draw, same precondition as the proven indexed-DECAL path). PSMT4 stays out for v1.
|
||
parameter bit SPRITE_TEX_ALPHA_CLUT = 1'b0,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch304 TILE-LOCAL render mode — first architectural step toward
|
||
// tiled VRAM (docs/decisions/0008). When 1 (AND COMBINED_TAZ=1, AND
|
||
// the in-flight primitive is the combined TME+ABE+ZTE triangle), the
|
||
// combined per-pixel render targets an ON-CHIP 16x16 color+Z tile
|
||
// scratchpad instead of VRAM: the raster sequence becomes
|
||
// R_IDLE -> T_CLEAR -> R_SCAN(tile-local) -> R_DRAIN -> T_FLUSH -> R_IDLE.
|
||
// - T_CLEAR : sweep all 256 tile entries, color<=TILE_CLEAR_COLOR,
|
||
// Z<=TILE_CLEAR_Z (one entry / cycle, no FB emit).
|
||
// - R_SCAN : the EXISTING combined FSM runs, but its Z read, dest
|
||
// read, color write and Z write are rerouted to the two
|
||
// on-chip tile RAMs (u_tile_z / u_tile_color). The
|
||
// TEXTURE fetch is UNCHANGED on the VRAM read2 path.
|
||
// The combined raster_pixel_emit color/Z writes are
|
||
// SUPPRESSED (they land in tile RAM instead). z_rd_en /
|
||
// fb_rd_en stay LOW (the Z/dest reads are tile-local).
|
||
// - T_FLUSH : sweep all 256 tile entries, emit each tile color word
|
||
// to its linear PSMCT32 FB address via raster_pixel_emit.
|
||
// raster_active stays high across CLEAR+SCAN+DRAIN+FLUSH so external
|
||
// waits see no gap. The tile is at screen origin, so the tile-local
|
||
// index of pixel (x,y) is {y[3:0], x[3:0]}.
|
||
//
|
||
// Default 0 generate-guards ALL tile logic out and ties tile_active
|
||
// constant 0, so every existing TB / production build is BYTE-
|
||
// IDENTICAL — including a COMBINED_TAZ=1, TILE_LOCAL=0 build, which
|
||
// takes the unchanged VRAM combined path.
|
||
parameter bit TILE_LOCAL = 1'b0,
|
||
// Tile clear values (ABGR color / Z word). Defaults per task spec.
|
||
parameter logic [31:0] TILE_CLEAR_COLOR = 32'hFF00_8000, // green ABGR
|
||
parameter logic [31:0] TILE_CLEAR_Z = 32'h0000_4000,
|
||
// Ch305 TILE GRID — render the popped combined primitive across a small
|
||
// grid of 16x16 tiles (TILE_COLS x TILE_ROWS), re-rendering (re-clipping)
|
||
// the SAME primitive against each tile. The primitive is popped + its
|
||
// affine gradients solved ONCE; the grid loop only re-clips the walker
|
||
// bbox + re-targets the FLUSH framebuffer offset per tile.
|
||
//
|
||
// With TILE_COLS = TILE_ROWS = 1 the behavior is BYTE-IDENTICAL to the Ch303 single-tile
|
||
// path: tile_ox=tile_oy=0, the clip = (primitive bbox) ∩ [0..15] = the
|
||
// primitive bbox (the Ch303 prim is already 16-aligned at the origin),
|
||
// the FLUSH offset is 0, and there is exactly ONE tile iteration.
|
||
parameter int TILE_COLS = 1,
|
||
parameter int TILE_ROWS = 1,
|
||
// Ch323 TEST HOOK (default 0): force every tile's spill_valid set at reset, so the FIRST
|
||
// batch RELOADS instead of booting from clear. Used ONLY by the negative regression test to
|
||
// prove the clean-Z bootstrap matters (forcing valid + garbage backing => depth rejects all).
|
||
parameter bit SPILL_FORCE_VALID = 1'b0,
|
||
// Ch305+ MULTI-PRIMITIVE TILED SCENE — when set, each tile re-renders a
|
||
// LIST of TILE_PRIM_COUNT primitives (all buffered in the FIFO) in order,
|
||
// so later primitives depth-test / alpha-blend against earlier ones within
|
||
// the tile (no inter-primitive clear). Default OFF → byte-identical: every
|
||
// multiprim branch below is guarded by `if (TILE_MULTIPRIM)` and the normal
|
||
// streaming pop / grid path is reached UNCHANGED.
|
||
parameter bit TILE_MULTIPRIM = 1'b0,
|
||
parameter int TILE_PRIM_COUNT = 1,
|
||
// Ch331 — feeder "end-of-list" grid trigger. When 1, the grid starts on the LATCHED
|
||
// prim_list_flush_i (render whatever 1..FIFO_DEPTH prims are buffered) instead of the
|
||
// fixed fifo_count>=TILE_PRIM_COUNT threshold — so a runtime list of ANY size (incl.
|
||
// <TILE_PRIM_COUNT or >TILE_PRIM_COUNT, up to FIFO_DEPTH) renders in ONE pass without the
|
||
// threshold splitting it across framebuffer-clearing passes. Default 0 = legacy threshold.
|
||
parameter bit MP_FLUSH_ONLY = 1'b0,
|
||
// Ch336 — FRAMEBUFFER ACCUMULATION across >FIFO_DEPTH scenes. When 1 (feeder mode), the grid
|
||
// also fires when the FIFO is FULL (mid-list batch, not just end-of-list), and only the FIRST
|
||
// batch of a scene clears+flushes the full tile; later batches SPARSE-flush only the pixels they
|
||
// drew, so a multi-batch scene accumulates into one framebuffer instead of each batch wiping the
|
||
// last. v1 = COLOR accumulation with PER-BATCH Z (cross-batch Z ordering is a follow-on). Default
|
||
// 0 = legacy single-pass-per-grid (byte-identical).
|
||
parameter bit TILE_ACCUM_ENABLE = 1'b0,
|
||
// Ch338 — PERSISTENT cross-batch Z. With TILE_ACCUM_ENABLE the on-chip tile_z is cleared every
|
||
// batch, so a >FIFO_DEPTH scene whose overlapping prims SPLIT across FIFO batches loses Z ordering
|
||
// (each batch starts from a fresh Z → a later batch's FAR prim overwrites an earlier batch's NEAR
|
||
// prim). When 1, tile_z is widened to the WHOLE grid (NTILES*256, indexed by tile_id) and cleared
|
||
// ONLY on the first (full-flush) batch of a scene; later batches read/update the resident Z, so a
|
||
// tile re-rendered in a later batch depth-tests against ALL prior batches. Color still rides the
|
||
// Ch336 sparse flush (Z-fail pixels aren't written → not flushed → the accumulated FB survives), so
|
||
// ONLY Z needs to persist. Default 0 = legacy per-batch Z (tile_z stays 256-deep, byte-identical).
|
||
// On-chip only (no VRAM Z traffic) — sized for the small feeder FB; large FBs use the Ch323
|
||
// LPDDR spill/reload path instead.
|
||
parameter bit TILE_Z_PERSIST = 1'b0,
|
||
// Ch315 — PRIMITIVE/BIN CAPACITY. TILE_FIFO_DEPTH sizes the primitive FIFO
|
||
// (the full per-primitive attribute storage — the ~40 fifo_* arrays) AND, in
|
||
// the bin-buffer renderer, the per-tile bin depth (bin_prim[NTILES][DEPTH] of
|
||
// prim INDICES). So one knob scales both the prim-list capacity (N) and the
|
||
// per-tile bin capacity (M = N here: a tile's bin can hold every queued prim,
|
||
// so the single capacity limit is the FIFO). MUST be a power of two (the
|
||
// wptr/rptr wrap relies on $clog2(DEPTH) bits — see the FIFO_DEPTH comment).
|
||
// Default 4 → byte-identical to every pre-Ch315 build. The dominant cost is
|
||
// the fifo_* attribute storage (~hundreds of register bits/slot); the bins
|
||
// add only NTILES*FIFO_CNT_W index bits per depth (negligible) — so register
|
||
// bins stay reasonable far beyond the prim FIFO's practical limit.
|
||
parameter int TILE_FIFO_DEPTH = 4,
|
||
// Bin-buffer renderer. Default OFF -> BYTE-IDENTICAL: the TP_BIN phase is
|
||
// never entered (mp_grid_start goes straight to TP_CLEAR as before), the
|
||
// render uses the original mp_next_nonempty re-test path, and every bin_*
|
||
// register is dead (synth-pruned). When 1 (with TILE_MULTIPRIM=1), a
|
||
// pre-render classification phase materialises a per-tile ordered list of
|
||
// primitive indices (the BIN), and each tile's render walks ONLY its bin
|
||
// list instead of re-scanning every primitive at render time. The overlap
|
||
// test and clip math are IDENTICAL to mp_next_nonempty, so the rendered
|
||
// image is bit-for-bit the same as the Ch305 re-test path.
|
||
parameter bit BIN_BUFFER_ENABLE = 1'b0,
|
||
// GS SCISSOR_1 rectangular clip. Default OFF → byte-identical: the
|
||
// eff_sc* bounds below collapse to 0 / 0xFFF so intersecting them into
|
||
// every walker-bbox clip site is a provable no-op. When ENABLEd, the
|
||
// inclusive screen rect [SCAX0..SCAX1]x[SCAY0..SCAY1] is folded into the
|
||
// walker bounding box — exact for a rectangular scissor, no per-pixel test.
|
||
parameter bit SCISSOR_ENABLE = 1'b0,
|
||
// Ch294 — GS texture WRAP MODES (REPEAT + CLAMP). Default OFF →
|
||
// byte-identical: the per-primitive wrap snapshot (ras_wms/wmt/tw/th) is
|
||
// forwarded to gs_texture_unit, but with TEX_WRAP_ENABLE=0 the sampler
|
||
// collapses u_eff/v_eff to u/v as a compile-time constant, so the wrap
|
||
// plumbing is unused dead state and every existing demo/TB is identical.
|
||
parameter bit TEX_WRAP_ENABLE = 1'b0,
|
||
// Ch305+ TILE COLOR PSMCT16 — when set, the on-chip tile COLOR RAM
|
||
// stores 16-bit PSMCT16 (RGB5A1) instead of 32-bit PSMCT32, halving
|
||
// its footprint. The RMW packs blended ABGR8888 -> pix16 on write,
|
||
// unpacks pix16 -> ABGR8888 on the blend dest-read, and the FLUSH
|
||
// emits PSMCT16-formatted framebuffer writes. The Z tile RAM is
|
||
// UNCHANGED (stays 32-bit). Default OFF -> TILE_COLOR_W folds to 32
|
||
// and every pack/unpack/PSMCT16-flush branch is a compile-time-
|
||
// constant dead arm, so every existing build is BYTE-IDENTICAL.
|
||
parameter bit TILE_COLOR_PSMCT16 = 1'b0,
|
||
// Brick-2c — generic GS ALPHA blend selector. When 0 (default) the
|
||
// combined blender u_comb_blend is hardwired to source-over and the
|
||
// per-primitive selector snapshots (fifo_alpha_*/ras_alpha_*) are
|
||
// loaded but UNREAD by the blender → every existing build is
|
||
// BYTE-IDENTICAL. When 1, u_comb_blend uses the generic A/B/C/D + FIX
|
||
// datapath driven by the snapshotted per-primitive ALPHA_1 selectors.
|
||
parameter bit ALPHA_MODES_ENABLE = 1'b0,
|
||
// Ch310 — BILINEAR (4-tap) texture filtering in the COMBINED tiled
|
||
// renderer, PSMCT32-only. When 1 AND the per-primitive TEX1.MMAG=1
|
||
// (LINEAR magnification) AND the texture psm==PSMCT32, the combined
|
||
// per-pixel FSM stalls in a new CB_TWAIT beat while the (already-proven)
|
||
// 4-tap gs_texture_unit bilinear sampler runs its ~9-cycle multi-beat
|
||
// read, then latches the filtered ABGR. Default OFF → CB_TWAIT is never
|
||
// entered (bili_now is a compile-time-constant 0), the per-primitive
|
||
// filter snapshot (fifo_filter_lin/ras_filter_lin) and the fractional UV
|
||
// (s1_u_frac/s1_v_frac) are loaded/wired but UNREAD by the nearest texel
|
||
// path, and u_tex is built with BILINEAR_ENABLE=0 → g_nearest. So every
|
||
// existing build is BYTE-IDENTICAL.
|
||
parameter bit BILINEAR_ENABLE = 1'b0,
|
||
// Ch314 — BILINEAR for PALETTIZED textures. When 1 (with BILINEAR_ENABLE=1)
|
||
// the combined tile path ALSO accepts a PSMT8 or PSMT4 DECAL texture as a
|
||
// filtered primitive: close_combined's texture-PSM gate widens to admit
|
||
// them, and the shared gs_texture_unit gets PALETTE_BILINEAR=1 so each of
|
||
// the 4 taps fetches an index, CLUTs it to a color, then interpolates the
|
||
// COLORS (CLUT-before-interp). Default OFF → close_combined still requires a
|
||
// PSMCT32 texture and the sampler keeps indexed textures nearest, so every
|
||
// existing build is BYTE-IDENTICAL.
|
||
parameter bit PALETTE_BILINEAR = 1'b0,
|
||
// Ch323 — TILE COLOR+Z SPILL/RELOAD to LPDDR. Default OFF → byte-identical: the
|
||
// TP_RELOAD / TP_ZFLUSH phases below are never entered, no Z-flush stream is driven,
|
||
// and the synthesised state set stays TP_OFF..TP_BIN. When ENABLEd (TILE_LOCAL build),
|
||
// the tile FSM gains a RELOAD phase (load color+Z from a staging RAM into the tile RAMs
|
||
// before RENDER) and a ZFLUSH phase (emit the tile Z RAM to an LPDDR Z-backing region
|
||
// after the color FLUSH), so tile-local color/Z state can leave the chip and return.
|
||
parameter bit TILE_SPILL_ENABLE = 1'b0
|
||
) (
|
||
input logic clk,
|
||
input logic rst_n,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Privileged CRTC/MMIO write port. Reaches gs_stub directly from
|
||
// CPU MMIO at 0x12000000 — NOT through GIF. BGCOLOR lives here.
|
||
// ------------------------------------------------------------------
|
||
input logic reg_wr_en,
|
||
input logic [15:0] reg_wr_addr,
|
||
input logic [63:0] reg_wr_data,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch75 — GIF A+D / REGLIST register port. 8-bit register number
|
||
// (PS2-real, per PCSX2 GSRegs.h). gif_packed_stub drives this
|
||
// with REAL_AD_REG_MAP=1. Architecturally distinct from the
|
||
// privileged port: a GIF-side write to reg# 0x4C goes to the
|
||
// GIF-context FRAME_1 latch; a privileged write to offset
|
||
// 0x04C0 would be a different register. Today only the GIF-
|
||
// context lane is meaningfully populated.
|
||
// ------------------------------------------------------------------
|
||
input logic gif_reg_wr_en,
|
||
input logic [7:0] gif_reg_num,
|
||
input logic [63:0] gif_reg_data,
|
||
|
||
// Ch331 — runtime command-list "end of list" flush (feeder mode). A 1-cycle pulse
|
||
// says "the current list is fully staged"; gs_stub LATCHES it (mp_flush_pending) and
|
||
// starts the grid when the renderer is genuinely ready (all_grad_done / idle), rendering
|
||
// whatever 1..FIFO_DEPTH prims are buffered. Only consulted when MP_FLUSH_ONLY=1; tied 0
|
||
// (left open) by every non-feeder consumer, so default builds are byte-identical.
|
||
input logic prim_list_flush_i,
|
||
|
||
// Pixel source for platform_video_stub
|
||
output logic [7:0] bg_r,
|
||
output logic [7:0] bg_g,
|
||
output logic [7:0] bg_b,
|
||
|
||
// Ch91/Ch92: privileged display register latches.
|
||
// gs_pcrtc_stub consumes these directly — there's no separate
|
||
// sideband path. PMODE.EN1 (bit 0) gates scanout. DISPFB1
|
||
// sets {FBP[8:0], FBW[14:9], PSM[19:15], DBX[42:32], DBY[53:43]}.
|
||
// DISPLAY1 (Ch92) sets {DX[11:0], DY[22:12], MAGH[26:23],
|
||
// MAGV[28:27], DW[43:32], DH[54:44]} — the display window
|
||
// origin/extents within the active area. All reset to 0:
|
||
// EN1=0 keeps scanout disabled; DISPLAY1=0 means a 1×1 window
|
||
// at (0,0), so pcrtc effectively shows nothing until a real
|
||
// driver-style configuration sequence runs.
|
||
output logic [63:0] pmode_q,
|
||
output logic [63:0] dispfb1_q,
|
||
output logic [63:0] display1_q,
|
||
|
||
// Ch75: GIF-context register latches (real PS2 namespace). TBs
|
||
// can verify reg# routing by reading these directly.
|
||
output logic [63:0] prim_q,
|
||
output logic [63:0] rgbaq_q,
|
||
output logic [63:0] xyz2_q,
|
||
output logic [63:0] xyzf2_q,
|
||
output logic [63:0] frame_1_q,
|
||
output logic [63:0] zbuf_1_q,
|
||
|
||
// Ch98: TEX0_1 latch + decoded sub-fields. Real PS2 TEX0
|
||
// carries texture-base + CLUT plumbing fields. Only the
|
||
// CLUT-side fields are routed downstream at this scope:
|
||
// CBP[50:37] — CLUT buffer pointer (14 bits, source for
|
||
// VRAM→CLUT load; consumed by clut_loader_stub)
|
||
// CPSM[54:51] — CLUT pixel storage mode (4 bits;
|
||
// PSMCT32 = 0 and PSMCT16 = 2 honored, others deferred)
|
||
// CSM[55] — CLUT storage mode (1 bit; CSM2=1 is the
|
||
// scope here, CSM1 swizzle deferred)
|
||
// CSA[60:56] — CLUT entry offset in 16-entry units
|
||
// (5 bits; matches pcrtc.clut_csa)
|
||
// CLD[63:61] — CLUT load control (3 bits; latched but
|
||
// acted on only by clut_loader_stub, per its CLD-mode policy)
|
||
// The texture-side fields (TBP0/TBW/PSM/TW/TH/TCC/TFX) are
|
||
// also in tex0_1_q if a TB needs them, but no decode/output
|
||
// is wired yet.
|
||
output logic [63:0] tex0_1_q,
|
||
output logic [13:0] tex0_1_cbp_q,
|
||
output logic [3:0] tex0_1_cpsm_q,
|
||
output logic tex0_1_csm_q,
|
||
output logic [4:0] tex0_1_csa_q,
|
||
output logic [2:0] tex0_1_cld_q,
|
||
// Ch99: 1-cycle pulse on the cycle a TEX0_1 GIF write commits.
|
||
// clut_loader_stub samples the just-written tex0_1_* fields
|
||
// and starts a VRAM→CLUT copy when CSM=CSM2, CPSM ∈ {PSMCT32, PSMCT16} and the CLD mode permits.
|
||
output logic tex0_1_wr_q,
|
||
|
||
// Ch110 — host→local image-transfer (TRX) register set. Real
|
||
// PS2 driver path for VRAM uploads:
|
||
// BITBLTBUF (0x50) — SBP/SBW/SPSM (source) + DBP/DBW/DPSM (dest)
|
||
// TRXPOS (0x51) — SSAX/SSAY/DSAX/DSAY/DIR
|
||
// TRXREG (0x52) — RRW/RRH (transfer rect width/height in pixels)
|
||
// TRXDIR (0x53) — XDIR (0=host→local, 1=local→host,
|
||
// 2=local→local, 3=deactivate)
|
||
// The transfer is triggered by the TRXDIR write (`trxdir_wr_q`
|
||
// pulses at that posedge). gif_image_xfer_stub samples the
|
||
// already-latched BITBLTBUF/TRXPOS/TRXREG fields and consumes
|
||
// the IMAGE qword stream coming out of gif_packed_stub when
|
||
// a host→local upload is armed.
|
||
output logic [63:0] bitbltbuf_q,
|
||
output logic [63:0] trxpos_q,
|
||
output logic [63:0] trxreg_q,
|
||
output logic [63:0] trxdir_q,
|
||
output logic trxdir_wr_q,
|
||
|
||
// Ch76/Ch77: primitive observer. `prim_complete` pulses for one
|
||
// cycle when an XYZ2/XYZF2 commit closes a primitive under the
|
||
// latched PRIM[2:0]; `prim_complete_count` is the running total.
|
||
// Discrete primitives (POINT/LINE/TRI/SPRITE) draw once per N
|
||
// vertices. Strip / fan primitives (LINE_STRIP/TRI_STRIP/TRI_FAN)
|
||
// anchor on the first N vertices, then draw on every additional
|
||
// vertex commit (Ch77 saturate cadence). Reserved (PRIM=7) never
|
||
// closes a primitive. Ch84 wires this strobe into the interior
|
||
// rasterizer for SPRITE / TRI / TRI_STRIP / TRI_FAN; POINT /
|
||
// LINE / LINE_STRIP still emit only the Ch82 closing pixel. See
|
||
// body for the threshold table and saturate logic.
|
||
output logic prim_complete,
|
||
output logic [31:0] prim_complete_count,
|
||
|
||
// Ch78: per-primitive vertex-identity snapshot, latched in
|
||
// lockstep with the EV_PRIM_DRAW that closes a primitive (one
|
||
// cycle after the closing vertex commits, same cycle as
|
||
// ev_valid). Layout depends on the latched PRIM[2:0]:
|
||
// POINT (1-vert) : v0=closing
|
||
// LINE/LINE_STRIP/SPRITE (2) : v0=previous, v1=closing
|
||
// TRI/TRI_STRIP (3) : v0=v_n-2, v1=v_n-1, v2=closing
|
||
// TRI_FAN anchor draw (3) : v0=1st vertex (pivot), v1=2nd vertex, v2=3rd vertex (closing)
|
||
// TRI_FAN saturated draw (3) : v0=v_pivot, v1=v_n-1, v2=closing
|
||
// Consumers know the valid-slot count from prim_q[2:0]. Slots
|
||
// not used by the current primitive type read 0. PRIM-write
|
||
// resets the underlying rolling window so a fresh primitive
|
||
// context starts clean.
|
||
output logic [63:0] prim_v0_q,
|
||
output logic [63:0] prim_v1_q,
|
||
output logic [63:0] prim_v2_q,
|
||
|
||
// Ch79: per-primitive color snapshot — the value of rgbaq_q at
|
||
// the closing-vertex commit. Equivalent to prim_color_v2_q for
|
||
// 3-vertex prims, prim_color_v1_q for 2-vertex, prim_color_v0_q
|
||
// for POINT. Kept as a convenience output for consumers that
|
||
// only care about flat shading. Reads 0 if no RGBAQ has been
|
||
// written since reset; rgbaq_q is NOT cleared on PRIM write —
|
||
// color carries forward across PRIM context switches like real
|
||
// GS — and only resets on rst_n.
|
||
output logic [63:0] prim_color_q,
|
||
|
||
// Ch80: per-vertex Gouraud color snapshot. Each slot holds the
|
||
// value of rgbaq_q at the moment the corresponding vertex
|
||
// committed (sampled into a parallel rolling color window
|
||
// c_curr_q / c_prev_q / c_prev_prev_q / c_pivot_q on each
|
||
// vertex commit). Slot layout mirrors prim_v0_q / v1_q / v2_q:
|
||
// POINT (1-vert) : v0=rgbaq_q at commit
|
||
// LINE/LINE_STRIP/SPRITE (2) : v0=c_curr_q (prev), v1=rgbaq_q (close)
|
||
// TRI/TRI_STRIP (3) : v0=c_prev_q, v1=c_curr_q, v2=rgbaq_q
|
||
// TRI_FAN saturated draw (3) : v0=c_pivot_q, v1=c_curr_q, v2=rgbaq_q
|
||
// For a flat-shaded primitive (RGBAQ written once before the
|
||
// strip), all per-vertex slots equal each other and equal
|
||
// prim_color_q. For Gouraud (RGBAQ rewritten between vertex
|
||
// commits) they may differ, capturing the per-vertex color
|
||
// identity real game streams need. Slots not used by the
|
||
// current primitive type read 0.
|
||
output logic [63:0] prim_color_v0_q,
|
||
output logic [63:0] prim_color_v1_q,
|
||
output logic [63:0] prim_color_v2_q,
|
||
|
||
// Ch81: structured-field decoded snapshots, latched alongside
|
||
// the raw 64-bit slots. vertex_t carries x / y / z / fog /
|
||
// is_xyzf2 already unpacked; color_t carries r / g / b / a / q.
|
||
// gs_stub tracks the per-vertex source format (XYZ2 vs XYZF2)
|
||
// in a parallel rolling flag window so each slot's vertex_t can
|
||
// be decoded with the right Z/fog interpretation. Slots not
|
||
// used by the current primitive type read 0. Consumers downstream
|
||
// (rasterizer / pixel emit) can pick up these decoded fields
|
||
// instead of re-deriving them from the raw 64-bit payloads.
|
||
output trace_pkg::vertex_t prim_v0_decoded_q,
|
||
output trace_pkg::vertex_t prim_v1_decoded_q,
|
||
output trace_pkg::vertex_t prim_v2_decoded_q,
|
||
output trace_pkg::color_t prim_v0_color_decoded_q,
|
||
output trace_pkg::color_t prim_v1_color_decoded_q,
|
||
output trace_pkg::color_t prim_v2_color_decoded_q,
|
||
|
||
// Ch82 / Ch83 — minimal pixel emit. One pixel per closed
|
||
// primitive, sourced from the closing vertex's screen X/Y plus
|
||
// the latched FRAME_1 register. pixel_emit pulses for one cycle
|
||
// on the same edge prim_complete pulses (separate strobe channel
|
||
// — does not multiplex with the main ev_valid trace stream).
|
||
// Ch83 added a PSM-aware byte-width lookup so PSMCT16, PSMT8,
|
||
// PSMZ16 (etc.) compute the correct fb_addr.
|
||
//
|
||
// NOTE: this `pixel_emit` channel is the legacy single-pixel-
|
||
// per-primitive observer. Sub-byte PSMT4 addressing is treated
|
||
// as host-word here (bpp_shift=2) because pixel_emit does not
|
||
// commit to vram_stub — it is a debug strobe only. The raster
|
||
// channel (`raster_pixel_emit`) is the real write port; Ch106
|
||
// gave it full PSMT4 support with `pixel_index >> 1` byte
|
||
// addressing and a per-bit nibble mask. Do not conflate the
|
||
// two channels.
|
||
// fb_addr = FBP * 2048 + (Y * FBW * 64 + X) * bytes_per_pixel
|
||
// FRAME_1 bit layout (per PCSX2 GSRegs.h):
|
||
// [8:0] FBP framebuffer base / 2048
|
||
// [21:16] FBW framebuffer width / 64 (in pixels)
|
||
// [29:24] PSM pixel storage format
|
||
// [63:32] FBMSK write mask (not used by emit)
|
||
output logic pixel_emit,
|
||
output logic [31:0] pixel_emit_count,
|
||
output logic [11:0] pixel_x_q, // closing vertex integer X (12 of 12.4)
|
||
output logic [11:0] pixel_y_q, // closing vertex integer Y (12 of 12.4)
|
||
output logic [63:0] pixel_color_q, // RGBAQ at emit (= prim_color_q)
|
||
output logic [8:0] pixel_fbp_q, // FRAME_1 FBP field
|
||
output logic [5:0] pixel_fbw_q, // FRAME_1 FBW field
|
||
output logic [5:0] pixel_psm_q, // FRAME_1 PSM field
|
||
output logic [31:0] pixel_fb_addr_q, // computed VRAM byte offset (Ch83 PSM-aware bpp)
|
||
|
||
// Ch84/Ch85/Ch86/Ch87/Ch88 — interior rasterizer. SEPARATE
|
||
// strobe channel from pixel_emit; the existing per-primitive-
|
||
// close emit is unchanged (still pulses once per closed
|
||
// primitive). The raster channel pulses once *per interior
|
||
// pixel* of TRIANGLE / TRI_STRIP / TRI_FAN (edge-function +
|
||
// top-left fill rule, Ch85) and SPRITE (axis-aligned rectangle
|
||
// fill). POINT / LINE / LINE_STRIP do NOT use the raster
|
||
// channel — they emit only the Ch82 closing-vertex pixel.
|
||
// For TRI/STRIP/FAN, color is per-pixel Gouraud-interpolated
|
||
// (Ch86) from the three per-vertex RGBAQ snapshots; SPRITE
|
||
// stays flat (== prim_color_q at close). Coordinates use the
|
||
// integer (12.4 top-12) screen X/Y; sub-pixel discarded.
|
||
// Closed primitives feed a 2-entry FIFO (Ch87) and emerge
|
||
// through a 3-stage pixel pipeline (Ch88) at one candidate
|
||
// pixel per cycle; raster_overflow latches only when a push
|
||
// arrives while the FIFO is full and no concurrent pop frees
|
||
// a slot.
|
||
//
|
||
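// NOTE(review): the rest of this comment appears to describe the original Ch84
// single-primitive FSM and conflicts with the FIFO overflow rule above — confirm and prune.
// FSM is IDLE / SCAN (one pixel per cycle inside the box).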
|
||
// raster_active is high during SCAN. raster_overflow latches if
|
||
// a new primitive closes while a previous one is still
|
||
// scanning — the new one is dropped, documented limitation.
|
||
// The TB design keeps bounding boxes small (≤ 4×4) to stay
|
||
// ahead of back-to-back vertex commits.
|
||
output logic raster_pixel_emit,
|
||
output logic [31:0] raster_pixel_emit_count,
|
||
output logic [11:0] raster_pixel_x_q,
|
||
output logic [11:0] raster_pixel_y_q,
|
||
output logic [63:0] raster_pixel_color_q,
|
||
output logic [31:0] raster_pixel_fb_addr_q,
|
||
// Ch95: per-byte write enable + PSM tag for the raster
|
||
// emit. PSMCT32 raster: be=4'b1111. PSMCT16 raster: be is
|
||
// 4'b0011 or 4'b1100 depending on the addressed halfword
|
||
// within the enclosing 32-bit word, and the lower 32 bits
|
||
// of raster_pixel_color_q carry the duplicated 16-bit
|
||
// RGB5A1 packing so vram_stub commits the right halfword
|
||
// without stomping its neighbor.
|
||
output logic [3:0] raster_pixel_be_q,
|
||
// Ch106: per-bit merge mask for sub-byte writes (PSMT4
|
||
// nibbles). 32'hFFFFFFFF for PSMCT32/16/PSMT8 (full byte
|
||
// writes — NO behavior change vs Ch95). 0x0000_000F (low
|
||
// nibble) or 0x0000_00F0 (high nibble) for PSMT4. vram_stub
|
||
// commits each enabled byte as
|
||
// mem[i] <= (mem[i] & ~mask[i]) | (data[i] & mask[i])
|
||
output logic [31:0] raster_pixel_mask_q,
|
||
output logic [5:0] raster_pixel_psm_q,
|
||
// Ch323 — tile Z-FLUSH stream (TILE_SPILL_ENABLE only). One emit per tile pixel during
|
||
// TP_ZFLUSH, on a SEPARATE channel from the color flush (raster_pixel_*) so the color
|
||
// flush path/counts are byte-identical. z_flush_addr_o is the Z-BACKING-relative byte
|
||
// offset (pixel_index*4, 32-bit Z); the de25 Z-writer adds the LPDDR Z-backing base.
|
||
// Tied 0 at the default (TP_ZFLUSH never entered).
|
||
output logic z_flush_emit_o,
|
||
output logic [31:0] z_flush_addr_o,
|
||
output logic [31:0] z_flush_data_o,
|
||
// Ch323 — DEDICATED tile COLOR-FLUSH spill stream (TILE_SPILL_ENABLE only). One emit per
|
||
// tile pixel during TP_FLUSH ONLY — a SEPARATE channel from the generic raster_pixel_emit /
|
||
// flush_emit_o (which ALSO fires for every RENDER-phase pixel write and so over-feeds the
|
||
// spill writer). Mirror of the Z-flush channel: tile_color_flush_addr_o is the spill-region-
|
||
// relative byte offset (pixel_index*4); the de25 color-writer adds the LPDDR base.
|
||
output logic tile_color_flush_emit_o,
|
||
output logic [31:0] tile_color_flush_addr_o,
|
||
output logic [31:0] tile_color_flush_data_o,
|
||
// Ch323 — tile RELOAD staging interface (TILE_SPILL_ENABLE only). reload_start_o pulses to
|
||
// arm the de25 staging engine's fill from LPDDR (color FB + Z-backing for this tile);
|
||
// tile_reload_ready_i = staging warm; then tile_reload_raddr_o sweeps 0..255 and the engine
|
||
// returns tile_reload_color_i/z_i (1-cyc). Inert at the default (TP_RELOAD never entered).
|
||
output logic reload_start_o,
|
||
output logic [7:0] tile_reload_raddr_o,
|
||
// Ch324 — current tile's byte offset into the raster LPDDR framebuffer (same formula as the
|
||
// flush side). gs_tile_reload latches it at the reload arm so it gathers exactly this tile.
|
||
// At TILE_COLS=TILE_ROWS=1 the tile is at the origin → 0 (byte-identical to Ch323).
|
||
output logic [29:0] reload_base_o,
|
||
input logic tile_reload_ready_i,
|
||
input logic [TILE_COLOR_W-1:0] tile_reload_color_i,
|
||
input logic [31:0] tile_reload_z_i,
|
||
// Ch323 diag — current tile phase (TP_OFF/CLEAR/RENDER/FLUSH/BIN/RELOAD/ZFLUSH). The
|
||
// de25 edge-detects entries into each phase as bring-up event counters. Inert otherwise.
|
||
output logic [2:0] tile_phase_o,
|
||
output logic raster_active,
|
||
// Ch337 — SCENE-LEVEL busy: high while ANY work for the in-flight scene remains, INCLUDING the
|
||
// inter-batch gaps that raster_active dips through on a >FIFO_DEPTH (multi-batch) scene. The
|
||
// control FSM gates its retrigger 'ready' on this (NOT raster_active) so a >FIFO_DEPTH scene can't be
|
||
// retriggered mid-render. = raster_active || tile still flushing || prims still queued || an
|
||
// end-of-list flush still pending. For a <=FIFO_DEPTH single-batch scene it tracks raster_active
|
||
// closely (FIFO/flush drain within the one batch), so the Ch330-335 path is unaffected.
|
||
output logic raster_scene_busy,
|
||
output logic raster_overflow,
|
||
// Ch172 — backpressure handshake to the upstream GIF parser.
|
||
// High when the raster command FIFO is full (asserted iff
|
||
// count == FIFO_DEPTH). The wrapper routes this to
|
||
// gif_packed_stub.raster_fifo_full so the GIF deasserts in_ready
|
||
// BEFORE accepting the next qword; the DMAC then pauses cleanly
|
||
// and resumes when raster pops the next FIFO entry. With this
|
||
// path, no primitive can ever silently drop, and FIFO_DEPTH can
|
||
// be set independently of "how many sprites we might draw."
|
||
output logic raster_fifo_full,
|
||
// Ch85: latches when a TRI/TRI_STRIP/TRI_FAN closes with zero
|
||
// signed area (3 colinear vertices). The rasterizer skips SCAN
|
||
// for that primitive and stays IDLE — no raster pixels emit.
|
||
// pixel_emit (Ch82 channel) and prim_complete still pulse on
|
||
// the close cycle as usual; only the interior raster is
|
||
// suppressed. SPRITE never sets this flag (rectangle fill
|
||
// doesn't depend on signed area).
|
||
output logic raster_degenerate,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Texturing (brick 1) — VRAM texel read port.
|
||
//
|
||
// gs_stub does NOT instantiate vram_stub; the wrapper/TB owns that.
|
||
// For textured SPRITEs the rasterizer fetches a texel per inside
|
||
// pixel through this read port. The TB wires it to vram_stub's
|
||
// SECOND read port (read2_addr/read2_data), muxed with
|
||
// clut_loader_stub's read2 access so CLUT-loading (which happens at
|
||
// TEX0 commit, before the raster scan) and texel-fetch (during the
|
||
// scan) don't collide.
|
||
//
|
||
// tex_rd_en / tex_rd_addr drive the read port; tex_rd_data returns
|
||
// the 32-bit word TEX_RD_LATENCY cycles later (default 1, matching
|
||
// vram_stub's combinational read viewed through gs_stub's own S2
|
||
// register stage — see the latency note at the sampler instance).
|
||
//
|
||
// Scope: PSMCT32 DECAL + (Ch296) PSMT8 indexed DECAL textured
|
||
// SPRITEs/TRIANGLEs. When PRIM.TME=0 (or PSM is neither PSMCT32 nor
|
||
// PSMT8) the texturing path is bypassed and emit is byte-identical to
|
||
// the pre-texturing Gouraud/flat path.
|
||
// ------------------------------------------------------------------
|
||
output logic tex_rd_en,
|
||
output logic [31:0] tex_rd_addr,
|
||
input logic [31:0] tex_rd_data,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch296 — CLUT lookup port for PSMT8 indexed texturing. The texel
|
||
// fetch yields an 8-bit index; `clut_rd_idx` drives clut_stub's
|
||
// second (combinational) read port and `clut_rd_data` returns the
|
||
// PSMCT32 entry. The wrapper/TB instantiates clut_stub (+ the
|
||
// clut_loader that fills it at TEX0 commit) and wires these two
|
||
// signals. Tie clut_rd_data=0 for PSMCT32-only tops (the index path
|
||
// is never selected there).
|
||
// ------------------------------------------------------------------
|
||
output logic [7:0] clut_rd_idx,
|
||
input logic [31:0] clut_rd_data,
|
||
|
||
// Ch296 — CLUT-load-busy backpressure. The CLUT load (clut_loader_stub,
|
||
// fired at TEX0 commit) and the textured raster's texel fetch share the
|
||
// single VRAM read2 port. They must be mutually exclusive in time. The
|
||
// load runs ~256 cycles and can outlast the few GIF cycles between the
|
||
// TEX0 commit and the textured primitive reaching the raster head, so
|
||
// hold off the FIFO pop (don't START the textured scan) while the load
|
||
// is in flight. Tie LOW for tops with no CLUT loader (no effect). This
|
||
// gates only the pop, so in-flight primitives drain normally.
|
||
input logic clut_load_busy,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Brick 2a — ALPHA blend dest-framebuffer read port.
|
||
//
|
||
// For a FLAT alpha-blended SPRITE pixel, gs_stub reads the existing
|
||
// destination pixel back from VRAM at the write address (fb_rd_addr)
|
||
// and blends the source color against it. The wrapper wires this to
|
||
// vram_stub/vram_bram_stub's SECOND read port (read2), arbitrated
|
||
// with the texel-fetch port — a FLAT blend never textures, so the
|
||
// two enables are mutually exclusive (documented in the wrapper).
|
||
//
|
||
// fb_rd_en / fb_rd_addr drive the read; fb_rd_data returns the
|
||
// 32-bit PSMCT32 word (the dest pixel) — combinationally
|
||
// (FB_RD_REGISTERED=0) or one cycle later (FB_RD_REGISTERED=1).
|
||
// ------------------------------------------------------------------
|
||
output logic fb_rd_en,
|
||
output logic [31:0] fb_rd_addr,
|
||
input logic [31:0] fb_rd_data,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Brick 2b — Z-buffer (depth test) stored-Z read port.
|
||
//
|
||
// For a Z-tested FLAT PSMCT32 SPRITE pixel, gs_stub reads the stored
|
||
// Z value back from VRAM at the Z-buffer address for (x,y) and
|
||
// compares the fragment Z against it per TEST_1.ZTST. The wrapper
|
||
// wires this to the SAME spare second read port (read2) the texel
|
||
// fetch / alpha dest-fb read use — mutually exclusive by feature.
|
||
//
|
||
// z_rd_en / z_rd_addr drive the read; z_rd_data returns the 32-bit
|
||
// stored Z (PSMZ32) — combinationally (Z_RD_REGISTERED=0) or one
|
||
// cycle later (Z_RD_REGISTERED=1).
|
||
// ------------------------------------------------------------------
|
||
output logic z_rd_en,
|
||
output logic [31:0] z_rd_addr,
|
||
input logic [31:0] z_rd_data,
|
||
|
||
// ------------------------------------------------------------------
|
||
// COMBINED probe — schedule observability. These expose the
|
||
// walker-stalling 5-beat FSM state so a TB can assert the exact
|
||
// per-beat read/write sequence. Driven only when COMBINED_TAZ=1 and
|
||
// the in-flight primitive is the combined T+A+Z triangle; held 0
|
||
// otherwise (so param=0 / non-combined builds see constant 0).
|
||
// comb_active : a combined-mode scan is running this primitive
|
||
// comb_beat : current beat 0..4 of the per-pixel FSM
|
||
// comb_pix_inside : the held candidate pixel is inside the triangle
|
||
// comb_ztest_pass : depth test passed for the held pixel (valid
|
||
// from beat 1 onward)
|
||
// The per-beat read enables (tex_rd_en / fb_rd_en / z_rd_en) and the
|
||
// write strobe (raster_pixel_emit) are already observable on their
|
||
// own ports.
|
||
output logic comb_active,
|
||
output logic [2:0] comb_beat,
|
||
output logic comb_pix_inside,
|
||
output logic comb_ztest_pass,
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch304 TILE-LOCAL render mode — schedule observability. These expose
|
||
// the tile-phase FSM and the on-chip tile RAM ports so a TB can prove
|
||
// clear-init, hidden-pixel-skips-texture/color, visible-pixel reads
|
||
// Z+tex+color & writes color+Z, and flush emits FB. Held 0 when
|
||
// TILE_LOCAL=0 (constant tie-off) so non-tiled builds see 0.
|
||
// tile_active : the in-flight primitive is rendering tile-local
|
||
// tile_phase : 0=OFF 1=CLEAR 2=RENDER 3=FLUSH 4=BIN(BIN_BUFFER_ENABLE)
|
||
// tile_color_we : color tile write strobe this cycle
|
||
// tile_color_waddr: color tile write index
|
||
// tile_color_raddr: color tile read index (registered read)
|
||
// tile_z_we : Z tile write strobe this cycle
|
||
// tile_z_waddr : Z tile write index
|
||
// tile_z_raddr : Z tile read index (registered read)
|
||
output logic tile_active,
|
||
output logic [2:0] tile_phase,
|
||
output logic tile_color_we,
|
||
output logic [7:0] tile_color_waddr,
|
||
output logic [7:0] tile_color_raddr,
|
||
output logic tile_z_we,
|
||
output logic [7:0] tile_z_waddr,
|
||
output logic [7:0] tile_z_raddr,
|
||
|
||
// Trace
|
||
output logic ev_valid,
|
||
output subsys_e ev_subsys,
|
||
output event_e ev_event,
|
||
output logic [63:0] ev_arg0,
|
||
output logic [63:0] ev_arg1,
|
||
output logic [63:0] ev_arg2,
|
||
output logic [63:0] ev_arg3,
|
||
output logic [31:0] ev_flags
|
||
);
|
||
|
||
// Default at reset: mid-grey so a missing BGCOLOR write is visually
|
||
// distinct from a black screen caused by disabled video.
|
||
// Reset-time background colour, one byte per channel (0x40 each = mid-grey).
localparam logic [7:0] DEFAULT_R = 8'h40,
                       DEFAULT_G = 8'h40,
                       DEFAULT_B = 8'h40;
|
||
|
||
// ------------------------------------------------------------------
|
||
// BGCOLOR latch
|
||
// Real BGCOLOR is a 24-bit RGB field in a 64-bit register. We take the
|
||
// low 24 bits of reg_wr_data as {R[23:16], G[15:8], B[7:0]} — matches
|
||
// Sony GS layout (see docs/supplemental/sony_official_docs.md).
|
||
// ------------------------------------------------------------------
|
||
|
||
// Write-decode strobes on the reg_wr_* port. Each strobe is asserted in a
// cycle where reg_wr_en is high and reg_wr_addr equals the matching
// *_OFFSET constant.
logic is_bgcolor_wr, is_pmode_wr, is_dispfb1_wr, is_display1_wr;

always_comb begin
    is_bgcolor_wr  = (reg_wr_addr == BGCOLOR_OFFSET)  && reg_wr_en;
    is_pmode_wr    = (reg_wr_addr == PMODE_OFFSET)    && reg_wr_en;
    is_dispfb1_wr  = (reg_wr_addr == DISPFB1_OFFSET)  && reg_wr_en;
    is_display1_wr = (reg_wr_addr == DISPLAY1_OFFSET) && reg_wr_en;
end
|
||
|
||
// BGCOLOR channel registers. Synchronous reset loads the DEFAULT_* grey;
// afterwards an is_bgcolor_wr strobe copies the low 24 bits of reg_wr_data
// into the three channels. With no strobe the channels hold.
// NOTE(review): R is taken from [23:16] and B from [7:0]. PCSX2's
// GSRegBGCOLOR places R in the low byte -- confirm this order is intended.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        bg_b <= DEFAULT_B;
        bg_g <= DEFAULT_G;
        bg_r <= DEFAULT_R;
    end
    else if (is_bgcolor_wr) begin
        bg_b <= reg_wr_data[7:0];
        bg_g <= reg_wr_data[15:8];
        bg_r <= reg_wr_data[23:16];
    end
end
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch91 / Ch92 / Ch93 / Ch94 — PMODE / DISPFB1 / DISPLAY1 latches.
|
||
//
|
||
// PMODE bit 0 = EN1 (display 1 enable). DISPFB1 fields are
|
||
// extracted by gs_pcrtc_stub:
|
||
// FBP : [8:0] base in 2048-byte units
|
||
// FBW : [14:9] width in 64-pixel units
|
||
// PSM : [19:15] pixel storage mode (PSMCT32 + PSMCT16 honored, Ch94)
|
||
// DBX : [42:32] display X origin (consumed by pcrtc, Ch91-audit)
|
||
// DBY : [53:43] display Y origin (consumed by pcrtc, Ch91-audit)
|
||
// DISPLAY1 fields (Ch92/Ch93) — DX/DY/DW/DH (window inside
|
||
// active area), MAGH/MAGV (per-axis magnification, Ch93).
|
||
// All three regs reset to 0 — scanout starts disabled,
|
||
// exactly the pre-Ch91 default behavior, so legacy TBs that
|
||
// don't write PMODE see no scanout activity.
|
||
// ------------------------------------------------------------------
|
||
|
||
// Raw 64-bit latches for PMODE, DISPFB1 and DISPLAY1. All three clear to
// zero on reset; each then captures reg_wr_data on its own write strobe
// and holds otherwise. The three updates are independent of one another.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        display1_q <= 64'd0;
        dispfb1_q  <= 64'd0;
        pmode_q    <= 64'd0;
    end
    else begin
        if (is_display1_wr) display1_q <= reg_wr_data;
        if (is_dispfb1_wr)  dispfb1_q  <= reg_wr_data;
        if (is_pmode_wr)    pmode_q    <= reg_wr_data;
    end
end
|
||
|
||
// Ch75/Ch98: GIF-context register latches keyed on real PS2
|
||
// reg# per PCSX2 GSRegs.h. Only a small primitive-oriented
|
||
// subset is recognized today — anything else lands as
|
||
// EV_MODE in the trace.
|
||
// 0x00 PRIM
|
||
// 0x01 RGBAQ
|
||
// 0x04 XYZF2
|
||
// 0x05 XYZ2
|
||
// 0x06 TEX0_1 (Ch98)
|
||
// 0x4C FRAME_1
|
||
// 0x4E ZBUF_1 <- not 0x4F (that's ZBUF_2)
|
||
// GIF register numbers: primitive / colour.
localparam logic [7:0] GIF_REG_PRIM  = 8'h00,
                       GIF_REG_RGBAQ = 8'h01;

// Brick 1 -- texture-coordinate registers.
//   ST (0x02): float S/T coordinates (TRI / perspective use).
//   UV (0x03): integer U/V, used for SPRITEs and non-perspective prims.
// v1 SPRITE texturing consumes UV only. ST is decoded and latched as well
// so that a later TRI-texturing chapter already has the latch available,
// but ST is NOT yet wired into any sampler path.
localparam logic [7:0] GIF_REG_ST     = 8'h02,
                       GIF_REG_UV     = 8'h03;

// Vertex kick registers and the context-1 texture descriptor.
localparam logic [7:0] GIF_REG_XYZF2  = 8'h04,
                       GIF_REG_XYZ2   = 8'h05,
                       GIF_REG_TEX0_1 = 8'h06;
|
||
// Brick 2a — ALPHA_1 (GIF reg# 0x42). Blend-function selector:
|
||
// A = [1:0] (0=Cs, 1=Cd, 2=0) first color operand
|
||
// B = [3:2] (0=Cs, 1=Cd, 2=0) second color operand
|
||
// C = [5:4] (0=As, 1=Ad, 2=FIX) blend coefficient
|
||
// D = [7:6] (0=Cs, 1=Cd, 2=0) additive operand
|
||
// FIX = [39:32] fixed alpha (used when C=2)
|
||
// GS formula: Cv = (((A - B) * C) >> 7) + D. The brick-2a source-over
// config is A=0 B=1 C=0 D=1 -> (((Cs - Cd) * As) >> 7) + Cd.
|
||
localparam logic [7:0] GIF_REG_ALPHA_1   = 8'h42,
                       GIF_REG_SCISSOR_1 = 8'h40;

// Brick 2b -- TEST_1 (GIF reg# 0x47): per-context pixel-test selector.
// Only the Z-test sub-fields are consumed by brick 2b:
//   ZTE  = bit 16        Z-test enable
//   ZTST = bits [18:17]  0=NEVER, 1=ALWAYS, 2=GEQUAL, 3=GREATER
// The alpha-test / destination-alpha-test fields (ATE/ATST/AREF/AFAIL,
// DATE/DATM) are stored in test_1_q but are NOT decoded or acted on by
// brick 2b (out of scope).
// ZTST semantics (PS2): a LARGER Z is nearer the viewer; GEQUAL passes
// when fragment_Z >= stored_Z.
localparam logic [7:0] GIF_REG_TEST_1 = 8'h47;
|
||
// Ch294 — CLAMP_1 (GIF reg# 0x08). Texture wrap-mode selector.
//   WMS = [1:0] (0=REPEAT 1=CLAMP 2/3=REGION_*)  S/u wrap
//   WMT = [3:2] (same)                           T/v wrap
// Higher REGION_* fields (MINU/MAXU/MINV/MAXV) latched but not decoded
// (REGION_* not modelled this rung). Consumed only when TEX_WRAP_ENABLE=1.
//
// FIX: this constant was 8'h48. Per the GS register map (PCSX2 GSRegs.h,
// GIF_A_D_REG enum) CLAMP_1 is 0x08 and 0x48 is TEST_2. With 0x48, a real
// CLAMP_1 write was never latched, and a TEST_2 write would overwrite
// clamp_1_q. Any stimulus that used 0x48 to program CLAMP_1 must be
// updated to 0x08.
localparam logic [7:0] GIF_REG_CLAMP_1 = 8'h08;
|
||
// Ch310 -- TEX1_1 (GIF reg# 0x14): texture sampling-mode selector.
// Only the magnification-filter bit is consumed:
//   MMAG = bit 5   0=NEAREST, 1=LINEAR
// LCM/MXL/MMIN/MTBA/L/K are latched but not decoded; only magnification
// bilinear is modelled this rung. Consumed only when BILINEAR_ENABLE=1.
// NOTE: GIF reg# 0x14 has nothing to do with the PSMT4 PSM value 6'h14
// that appears in the bpp tables -- same number, unrelated meaning.
localparam logic [7:0] GIF_REG_TEX1_1 = 8'h14;

// Context-1 frame / Z buffer descriptors.
localparam logic [7:0] GIF_REG_FRAME_1 = 8'h4C,
                       GIF_REG_ZBUF_1  = 8'h4E;

// Image-transfer register group.
localparam logic [7:0] GIF_REG_BITBLTBUF = 8'h50,
                       GIF_REG_TRXPOS    = 8'h51,
                       GIF_REG_TRXREG    = 8'h52,
                       GIF_REG_TRXDIR    = 8'h53;
|
||
|
||
// Brick 1 — texture-coordinate latches (internal; not yet ported
|
||
// out). uv_q holds the most-recent UV write (U=[13:0], V=[27:14],
|
||
// both 10.4 fixed-point — 4 fractional bits). st_q holds ST.
|
||
// ST and UV raw latches (most recent write to each register).
logic [63:0] st_q, uv_q;

// Brick 2a -- ALPHA_1 raw latch (internal; not ported out).
logic [63:0] alpha_1_q;

// Brick 2b -- TEST_1 raw latch (internal; not ported out). Holds the whole
// pixel-test word; only ZTE/ZTST are decoded from it further down.
logic [63:0] test_1_q;

// SCISSOR_1 raw latch (internal). Inclusive screen rectangle, consumed only
// when SCISSOR_ENABLE=1 (see eff_sc*). It resets to a FULL-RANGE rectangle
// so that an enabled build which has not yet written SCISSOR_1 clips
// nothing.
logic [63:0] scissor_1_q;

// CLAMP_1 raw latch (internal). Reset value 0 means WMS=WMT=0 (REPEAT).
logic [63:0] clamp_1_q;

// Ch310 -- TEX1_1 raw latch (internal). Reset value 0 means MMAG=0
// (NEAREST), so an enabled build with no TEX1_1 write samples nearest.
logic [63:0] tex1_1_q;
|
||
|
||
// High when gif_reg_num matches any register number this module latches.
// It is a pure function of gif_reg_num (gif_reg_wr_en is not included).
logic gif_is_known_reg;

always_comb begin
    gif_is_known_reg =
        // primitive / colour / texcoord / vertex
        (gif_reg_num == GIF_REG_PRIM)      ||
        (gif_reg_num == GIF_REG_RGBAQ)     ||
        (gif_reg_num == GIF_REG_ST)        ||
        (gif_reg_num == GIF_REG_UV)        ||
        (gif_reg_num == GIF_REG_XYZF2)     ||
        (gif_reg_num == GIF_REG_XYZ2)      ||
        // texture setup
        (gif_reg_num == GIF_REG_TEX0_1)    ||
        (gif_reg_num == GIF_REG_TEX1_1)    ||
        (gif_reg_num == GIF_REG_CLAMP_1)   ||
        // per-context pixel pipeline
        (gif_reg_num == GIF_REG_SCISSOR_1) ||
        (gif_reg_num == GIF_REG_ALPHA_1)   ||
        (gif_reg_num == GIF_REG_TEST_1)    ||
        (gif_reg_num == GIF_REG_FRAME_1)   ||
        (gif_reg_num == GIF_REG_ZBUF_1)    ||
        // image transfer
        (gif_reg_num == GIF_REG_BITBLTBUF) ||
        (gif_reg_num == GIF_REG_TRXPOS)    ||
        (gif_reg_num == GIF_REG_TRXREG)    ||
        (gif_reg_num == GIF_REG_TRXDIR);
end
|
||
|
||
// GIF-context register file. On reset every latch clears to zero except
// scissor_1_q, which loads a full-range rectangle. While gif_reg_wr_en is
// high, the latch selected by gif_reg_num captures gif_reg_data; register
// numbers with no latch fall to the default arm and change nothing.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        // primitive / colour / texcoord / vertex
        prim_q      <= 64'd0;
        rgbaq_q     <= 64'd0;
        st_q        <= 64'd0;
        uv_q        <= 64'd0;
        xyzf2_q     <= 64'd0;
        xyz2_q      <= 64'd0;
        // texture setup
        tex0_1_q    <= 64'd0;
        tex1_1_q    <= 64'd0;   // MMAG=0 -> NEAREST
        clamp_1_q   <= 64'd0;   // WMS=WMT=0 -> REPEAT
        // per-context pixel pipeline
        // SCAX0=0, SCAX1=0x7FF, SCAY0=0, SCAY1=0x7FF -> full-range no-op.
        scissor_1_q <= 64'h07FF_0000_07FF_0000;
        alpha_1_q   <= 64'd0;
        test_1_q    <= 64'd0;
        frame_1_q   <= 64'd0;
        zbuf_1_q    <= 64'd0;
        // image transfer
        bitbltbuf_q <= 64'd0;
        trxpos_q    <= 64'd0;
        trxreg_q    <= 64'd0;
        trxdir_q    <= 64'd0;
    end
    else if (gif_reg_wr_en) begin
        unique case (gif_reg_num)
            // primitive / colour / texcoord / vertex
            GIF_REG_PRIM:      prim_q      <= gif_reg_data;
            GIF_REG_RGBAQ:     rgbaq_q     <= gif_reg_data;
            GIF_REG_ST:        st_q        <= gif_reg_data;
            GIF_REG_UV:        uv_q        <= gif_reg_data;
            GIF_REG_XYZF2:     xyzf2_q     <= gif_reg_data;
            GIF_REG_XYZ2:      xyz2_q      <= gif_reg_data;
            // texture setup
            GIF_REG_TEX0_1:    tex0_1_q    <= gif_reg_data;
            GIF_REG_TEX1_1:    tex1_1_q    <= gif_reg_data;
            GIF_REG_CLAMP_1:   clamp_1_q   <= gif_reg_data;
            // per-context pixel pipeline
            GIF_REG_SCISSOR_1: scissor_1_q <= gif_reg_data;
            GIF_REG_ALPHA_1:   alpha_1_q   <= gif_reg_data;
            GIF_REG_TEST_1:    test_1_q    <= gif_reg_data;
            GIF_REG_FRAME_1:   frame_1_q   <= gif_reg_data;
            GIF_REG_ZBUF_1:    zbuf_1_q    <= gif_reg_data;
            // image transfer
            GIF_REG_BITBLTBUF: bitbltbuf_q <= gif_reg_data;
            GIF_REG_TRXPOS:    trxpos_q    <= gif_reg_data;
            GIF_REG_TRXREG:    trxreg_q    <= gif_reg_data;
            GIF_REG_TRXDIR:    trxdir_q    <= gif_reg_data;
            default: ;  // unknown reg -- trace only
        endcase
    end
end
|
||
|
||
// Ch301 perspective — "saw an ST (0x02) write since the last PRIM"
|
||
// per-primitive flag. A triangle takes the PERSPECTIVE path iff its
|
||
// texcoords were supplied via ST (float S/T/Q), NOT via UV (0x03).
|
||
// Mirror of vert_count / pivot_seen_q lifecycle: cleared on PRIM
|
||
// write, latched on any ST write. When PERSPECTIVE_CORRECT=0 this is
|
||
// a constant 0 (the generate else-branch ties it low), so every
|
||
// downstream use (fifo_persp / ras_persp) is 0 and the param=0 build
|
||
// is byte-identical to the affine-only design.
|
||
// Per-primitive "an ST write has been seen" flag.
//   PERSPECTIVE_CORRECT != 0 : registered; cleared by reset or a PRIM write,
//                              set by a write to GIF_REG_ST, held otherwise.
//   PERSPECTIVE_CORRECT == 0 : tied to constant 0.
// Reset and the PRIM-write clear take priority over the set, so a PRIM
// write always leaves the flag low. gif_is_prim_wr is declared further
// down in this module.
logic saw_st_q;

generate
    if (PERSPECTIVE_CORRECT) begin : g_saw_st
        always_ff @(posedge clk) begin
            if (!rst_n || gif_is_prim_wr)
                saw_st_q <= 1'b0;
            else if (gif_reg_wr_en && (gif_reg_num == GIF_REG_ST))
                saw_st_q <= 1'b1;
        end
    end
    else begin : g_no_saw_st
        assign saw_st_q = 1'b0;
    end
endgenerate
|
||
|
||
// Ch98 — TEX0_1 sub-field decoders (CLUT plumbing). All
|
||
// combinational from tex0_1_q so they always reflect the
|
||
// most recent TEX0_1 write.
|
||
// CLUT-side TEX0_1 fields, listed MSB-first. Plain part-selects of the
// tex0_1_q latch, so they track it with no extra delay.
assign tex0_1_cld_q  = tex0_1_q[63:61];   // CLD
assign tex0_1_csa_q  = tex0_1_q[60:56];   // CSA
assign tex0_1_csm_q  = tex0_1_q[55];      // CSM
assign tex0_1_cpsm_q = tex0_1_q[54:51];   // CPSM
assign tex0_1_cbp_q  = tex0_1_q[50:37];   // CBP
|
||
|
||
// Ch310 — TEX1_1.MMAG (magnification filter). bit 5: 0=NEAREST, 1=LINEAR.
|
||
// Combinational from tex1_1_q; snapshotted per-primitive as fifo_filter_lin.
|
||
// TEX1_1.MMAG: bit 5 of the tex1_1_q latch.
logic tex1_mmag;

always_comb tex1_mmag = tex1_1_q[5];
|
||
|
||
// Brick 1 — TEX0_1 TEXTURE-side sub-field decoders (per PS2 GS
|
||
// TEX0 layout, PCSX2 GSRegs.h; the CLUT-side fields above start at
|
||
// CBP=[50:37], which fixes the texture-side field positions):
|
||
// TBP0 [13:0] texture base pointer (in 64-byte/256-bit words)
|
||
// TBW [19:14] texture buffer width (texels-per-row / 64)
|
||
// PSM [25:20] pixel storage mode (6 bits)
|
||
// TW [29:26] log2(texture width)
|
||
// TH [33:30] log2(texture height)
|
||
// TCC [34] / TFX [36:35] — not consumed by v1 DECAL
|
||
// Texture-side TEX0_1 fields. They occupy the contiguous bit range [33:0]
// of tex0_1_q, so a single unpacking assignment splits them:
//   TH [33:30] | TW [29:26] | PSM [25:20] | TBW [19:14] | TBP0 [13:0]
logic [3:0]  tex0_th;
logic [3:0]  tex0_tw;
logic [5:0]  tex0_psm;
logic [5:0]  tex0_tbw;
logic [13:0] tex0_tbp0;

assign {tex0_th, tex0_tw, tex0_psm, tex0_tbw, tex0_tbp0} = tex0_1_q[33:0];
|
||
|
||
// Ch294 — CLAMP_1 decoded wrap-mode fields (WMS[1:0], WMT[3:2]).
|
||
// CLAMP_1 wrap modes: WMT is bits [3:2], WMS is bits [1:0] of clamp_1_q.
logic [1:0] clamp_wmt;
logic [1:0] clamp_wms;

assign {clamp_wmt, clamp_wms} = clamp_1_q[3:0];
|
||
|
||
// PRIM.TME — texture-mapping enable (bit 4 of the PRIM register,
|
||
// per PS2 GS PRIM layout: [2:0] PRIM type, [3] IIP, [4] TME,
|
||
// [5] FGE, [6] ABE, ...). When 0, the textured emit path is
|
||
// bypassed entirely.
|
||
// PRIM.TME: bit 4 of the prim_q latch.
logic prim_tme;

always_comb prim_tme = prim_q[4];
|
||
|
||
// Brick 2a — PRIM.ABE (alpha-blend enable, bit 6 of PRIM). When 0,
|
||
// the blend path is bypassed and the emit is opaque (byte-identical
|
||
// to pre-Brick-2a). ALPHA_1 sub-field decoders are combinational
|
||
// from alpha_1_q so they always reflect the latest ALPHA_1 write.
|
||
// PRIM.ABE and the ALPHA_1 selector fields, all combinational views of
// their latches.
logic       prim_abe;              // prim_q[6]
logic [1:0] alpha_d;               // alpha_1_q[7:6]
logic [1:0] alpha_c;               // alpha_1_q[5:4]
logic [1:0] alpha_b;               // alpha_1_q[3:2]
logic [1:0] alpha_a;               // alpha_1_q[1:0]
logic [7:0] alpha_fix;             // alpha_1_q[39:32]
logic       alpha_is_source_over;

assign prim_abe = prim_q[6];

// The four 2-bit selectors are contiguous in the low byte.
assign {alpha_d, alpha_c, alpha_b, alpha_a} = alpha_1_q[7:0];

// Brick-2c -- ALPHA_1.FIX (fixed alpha coefficient, used when C==2).
assign alpha_fix = alpha_1_q[39:32];

// Source-over is Cv = (((Cs - Cd) * As) >> 7) + Cd, i.e. A=0 B=1 C=0 D=1.
// brick-2a supports ONLY this configuration; any other selector
// combination falls back to an opaque write (documented limitation).
assign alpha_is_source_over = (alpha_d == 2'd1) && (alpha_c == 2'd0)
                           && (alpha_b == 2'd1) && (alpha_a == 2'd0);
|
||
|
||
// Brick 2b — TEST_1 Z-test sub-field decoders + ZBUF_1 decoders.
// Combinational from test_1_q / zbuf_1_q so they always reflect the
// most-recent write.
//   TEST_1.ZTE  = bit 16        (Z-test enable)
//   TEST_1.ZTST = bits [18:17]  (0=NEVER 1=ALWAYS 2=GEQUAL 3=GREATER)
//   ZBUF_1.ZBP  = bits [8:0]    (Z buffer base, *2048-word page units)
//   ZBUF_1.PSM  = bits [27:24]  (0=PSMZ32, 1=PSMZ24, 2=PSMZ16)
//   ZBUF_1.ZMSK = bit 32        (1 => do NOT update Z)
//
// FIX: ZMSK was decoded from bit 0, which is also ZBP[0]. Every odd Z
// buffer base therefore read back as "Z writes masked", and the real mask
// bit was ignored. The GS ZBUF layout (PCSX2 GSRegs.h, GIFRegZBUF) places
// ZMSK at bit 32, above the ZBP and PSM fields.
logic       test_zte;
logic [1:0] test_ztst;
logic [8:0] zbuf_zbp;
logic [3:0] zbuf_psm;
logic       zbuf_zmsk;

assign test_zte  = test_1_q[16];
assign test_ztst = test_1_q[18:17];
assign zbuf_zbp  = zbuf_1_q[8:0];
assign zbuf_psm  = zbuf_1_q[27:24];
assign zbuf_zmsk = zbuf_1_q[32];
|
||
|
||
// SCISSOR_1 inclusive screen rect, decoded from the SIMPLE reg
|
||
// (part-selects on scissor_1_q — no parenthesized-expr bit-select).
|
||
// SCAX0=[10:0] SCAX1=[26:16] SCAY0=[42:32] SCAY1=[58:48]
|
||
// SCISSOR_1 rectangle corners: four 11-bit part-selects of scissor_1_q.
logic [10:0] scissor_x0;   // SCAX0 = [10:0]
logic [10:0] scissor_x1;   // SCAX1 = [26:16]
logic [10:0] scissor_y0;   // SCAY0 = [42:32]
logic [10:0] scissor_y1;   // SCAY1 = [58:48]

always_comb begin
    scissor_y1 = scissor_1_q[58:48];
    scissor_y0 = scissor_1_q[42:32];
    scissor_x1 = scissor_1_q[26:16];
    scissor_x0 = scissor_1_q[10:0];
end
|
||
|
||
// Effective scissor bounds folded into the 12-bit walker clip. When
|
||
// SCISSOR_ENABLE=0 (compile-time constant) these are 0 / 0xFFF, so
|
||
// max(x,0)=x and min(x,0xFFF)=x → intersecting them is a no-op and the
|
||
// synthesizer folds the whole term away. BYTE-IDENTICAL at the default.
|
||
// 12-bit effective scissor bounds. With SCISSOR_ENABLE set they are the
// decoded SCISSOR_1 corners zero-extended by one bit; otherwise they are
// the widest possible window (0 .. 0xFFF on both axes).
logic [11:0] eff_scx0;
logic [11:0] eff_scx1;
logic [11:0] eff_scy0;
logic [11:0] eff_scy1;

always_comb begin
    if (SCISSOR_ENABLE) begin
        eff_scx0 = {1'b0, scissor_x0};
        eff_scx1 = {1'b0, scissor_x1};
        eff_scy0 = {1'b0, scissor_y0};
        eff_scy1 = {1'b0, scissor_y1};
    end
    else begin
        eff_scx0 = 12'd0;
        eff_scx1 = 12'hFFF;
        eff_scy0 = 12'd0;
        eff_scy1 = 12'hFFF;
    end
end
|
||
|
||
// Brick 2b — Z-test ENABLED for a primitive only when ALL hold
|
||
// (otherwise byte-identical to the pre-2b opaque write):
|
||
// - TEST_1.ZTE = 1
|
||
// - ZBUF_1.PSM = PSMZ32 (0x0) — 32-bit Z only in v1
|
||
// - FRAME PSM = PSMCT32 (0x00) — flat color sprite only
|
||
// The mutual-exclusivity-by-feature argument (vs texturing /
|
||
// alpha blend, which also use read2) is enforced at close
|
||
// (new_zte_active below): a Z-tested sprite is FLAT, non-blended.
|
||
// True when the Z test is enabled (TEST_1.ZTE) and the Z buffer format is
// PSMZ32 (ZBUF_1.PSM == 0).
// NOTE(review): this expression does not look at the FRAME pixel format;
// presumably the PSMCT32 requirement is applied where z_format_ok is
// consumed -- confirm against that logic.
logic z_format_ok;

always_comb z_format_ok = (zbuf_psm == 4'h0) && test_zte;
|
||
|
||
// Ch99 — 1-cycle TEX0_1 write pulse. The pulse is REGISTERED
|
||
// (latched at the same posedge that commits tex0_1_q), so it
|
||
// becomes visible to consumers ONE CYCLE AFTER the TEX0_1
|
||
// commit. By that time tex0_1_q has already updated, and the
|
||
// combinational tex0_1_*_q sub-field decoders reflect the
|
||
// new payload — so a downstream loader sampling on the pulse
|
||
// edge sees the just-written values without a race.
|
||
// Registered TEX0_1 write strobe: low in reset, otherwise a one-cycle
// delayed copy of "gif_reg_wr_en with gif_reg_num == GIF_REG_TEX0_1".
always_ff @(posedge clk) begin
    if (!rst_n)
        tex0_1_wr_q <= 1'b0;
    else
        tex0_1_wr_q <= (gif_reg_num == GIF_REG_TEX0_1) && gif_reg_wr_en;
end
|
||
|
||
// Ch110 — 1-cycle TRXDIR write pulse. Triggers
|
||
// gif_image_xfer_stub: when the just-committed trxdir_q[1:0]
|
||
// is 2'd0 (host→local), the xfer engine snapshots
|
||
// BITBLTBUF/TRXPOS/TRXREG and starts consuming IMAGE-mode
|
||
// qwords coming out of gif_packed_stub.
|
||
// Registered TRXDIR write strobe: low in reset, otherwise a one-cycle
// delayed copy of "gif_reg_wr_en with gif_reg_num == GIF_REG_TRXDIR".
always_ff @(posedge clk) begin
    if (!rst_n)
        trxdir_wr_q <= 1'b0;
    else
        trxdir_wr_q <= (gif_reg_num == GIF_REG_TRXDIR) && gif_reg_wr_en;
end
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch76/Ch77 — primitive observer
|
||
//
|
||
// PRIM[2:0] selects primitive type per real PS2 GS:
|
||
// 0 POINT → discrete, 1 vertex per draw
|
||
// 1 LINE → discrete, 2 vertices per draw
|
||
// 2 LINE_STRIP → strip, anchor=2; then 1 draw per additional vertex
|
||
// 3 TRIANGLE → discrete, 3 vertices per draw
|
||
// 4 TRI_STRIP → strip, anchor=3; then 1 draw per additional vertex
|
||
// 5 TRI_FAN → strip, anchor=3; then 1 draw per additional vertex,
|
||
// with v0 of every saturated draw PINNED to
|
||
// the pivot vertex (Ch78 / Ch80 / Ch81 —
|
||
// raw, color, and decoded fields all carry
|
||
// pivot identity). Cadence count matches
|
||
// TRI_STRIP; identity does NOT.
|
||
// 6 SPRITE → discrete, 2 vertices per draw
|
||
// 7 reserved → no draw, no vertex accounting
|
||
//
|
||
// A vertex commit is any XYZ2 or XYZF2 write on the gif_reg_* port.
|
||
// A PRIM write resets the vertex counter so a fresh primitive type
|
||
// starts cleanly. Discrete-primitive draws reset the counter to 0;
|
||
// strip/fan draws SATURATE the counter at the primary threshold so
|
||
// every subsequent vertex also closes a primitive (Ch77 anchor
|
||
// logic). prim_complete pulses one cycle on each draw;
|
||
// EV_PRIM_DRAW preempts that cycle's EV_WRITE in the trace. The
|
||
// xyz2_q / xyzf2_q latches always update on a vertex commit,
|
||
// independent of whether a primitive closes.
|
||
// ------------------------------------------------------------------
|
||
|
||
// Primitive-observer state and decode.
logic [2:0] vert_count;       // vertices accumulated toward the next close
logic [2:0] prim_type;        // PRIM[2:0]
logic [2:0] vert_threshold;   // vertices needed for the first close; 0 = none
logic       prim_is_strip;    // LINE_STRIP / TRI_STRIP / TRI_FAN
logic       gif_is_vertex;    // this cycle writes XYZ2 or XYZF2
logic       gif_is_prim_wr;   // this cycle writes PRIM

assign prim_type = prim_q[2:0];

// Vertices required per primitive type. Types sharing a count are grouped;
// the reserved type (7) yields 0, which disables vertex accounting.
always_comb begin
    unique case (prim_type)
        3'd0:             vert_threshold = 3'd1;  // POINT
        3'd1, 3'd2, 3'd6: vert_threshold = 3'd2;  // LINE, LINE_STRIP anchor, SPRITE
        3'd3, 3'd4, 3'd5: vert_threshold = 3'd3;  // TRIANGLE, TRI_STRIP / TRI_FAN anchor
        default:          vert_threshold = 3'd0;  // reserved
    endcase
end

assign prim_is_strip = (prim_type == 3'd5) ||   // TRI_FAN
                       (prim_type == 3'd4) ||   // TRI_STRIP
                       (prim_type == 3'd2);     // LINE_STRIP

assign gif_is_vertex  = gif_reg_wr_en &&
                        ((gif_reg_num == GIF_REG_XYZF2) ||
                         (gif_reg_num == GIF_REG_XYZ2));

assign gif_is_prim_wr = (gif_reg_num == GIF_REG_PRIM) && gif_reg_wr_en;
|
||
|
||
// A strip is "saturated" once vert_count has reached the primary
|
||
// threshold — every subsequent vertex closes another primitive.
|
||
// strip_saturated   : strip/fan type whose count already sits at the
//                     threshold, so each further vertex closes a primitive.
// prim_complete_now : the vertex written this cycle closes a primitive --
//                     either the strip is saturated, or this vertex brings
//                     the count up to the threshold. Never true for the
//                     reserved type (threshold 0).
logic strip_saturated;
logic prim_complete_now;

always_comb begin
    strip_saturated   = prim_is_strip && (vert_count == vert_threshold);

    prim_complete_now = gif_is_vertex && (vert_threshold != 3'd0) &&
                        (strip_saturated ||
                         ((vert_count + 3'd1) == vert_threshold));
end
|
||
|
||
// Vertex counter, close pulse and close tally.
//   * prim_complete defaults low every cycle and is raised only on a
//     closing vertex, giving a one-cycle pulse.
//   * A PRIM write restarts the count and takes priority over a vertex.
//   * A closing vertex parks the count at the threshold for strip/fan
//     types (so the strip stays saturated) and returns it to 0 for
//     discrete types; it also increments prim_complete_count.
//   * A non-closing vertex advances the count by one.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        prim_complete_count <= 32'd0;
        prim_complete       <= 1'b0;
        vert_count          <= 3'd0;
    end
    else begin
        prim_complete <= 1'b0;

        if (gif_is_prim_wr) begin
            vert_count <= 3'd0;
        end
        else if (gif_is_vertex && (vert_threshold != 3'd0)) begin
            if (prim_complete_now) begin
                prim_complete_count <= prim_complete_count + 32'd1;
                prim_complete       <= 1'b1;
                vert_count          <= prim_is_strip ? vert_threshold : 3'd0;
            end
            else begin
                vert_count <= vert_count + 3'd1;
            end
        end
    end
end
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch78 — vertex-identity rolling window
|
||
//
|
||
// v_curr, v_prev, v_prev_prev: rolling window of the most recent
|
||
// three vertex commits under the current PRIM context. Updated on
|
||
// every vertex commit (whether closing or not). On PRIM write,
|
||
// the window clears so a fresh primitive type starts with no
|
||
// residual vertex bleed.
|
||
//
|
||
// v_pivot: the first vertex committed since the most recent PRIM
|
||
// write. Used for TRI_FAN's pivot-vertex semantics so a saturated
|
||
// fan extension can identify {pivot, prev, curr} instead of the
|
||
// strip-style rolling {v_n-2, v_n-1, v_n}. pivot_seen latches the
|
||
// first vertex once and ignores subsequent commits.
|
||
// ------------------------------------------------------------------
|
||
// Rolling window of raw vertex words (newest to oldest), plus the pivot:
// the first vertex accepted since the last reset or PRIM write.
logic [63:0] v_curr_q;
logic [63:0] v_prev_q;
logic [63:0] v_prev_prev_q;
logic [63:0] v_pivot_q;
logic        pivot_seen_q;   // set once the pivot has been captured

// Ch81: format flags that travel in lockstep with the vertex words above.
// 0 = the vertex arrived via XYZ2, 1 = via XYZF2. They are cleared
// together with the window on reset and on a PRIM write.
logic xyzf2_curr_q;
logic xyzf2_prev_q;
logic xyzf2_prev_prev_q;
logic xyzf2_pivot_q;

// Format of the vertex being written in the current cycle.
logic gif_vertex_is_xyzf2;
assign gif_vertex_is_xyzf2 = (gif_reg_num == GIF_REG_XYZF2) && gif_is_vertex;

always_ff @(posedge clk) begin
    if (!rst_n || gif_is_prim_wr) begin
        // Reset and a PRIM write clear exactly the same state.
        pivot_seen_q      <= 1'b0;
        v_pivot_q         <= 64'd0;
        v_prev_prev_q     <= 64'd0;
        v_prev_q          <= 64'd0;
        v_curr_q          <= 64'd0;
        xyzf2_pivot_q     <= 1'b0;
        xyzf2_prev_prev_q <= 1'b0;
        xyzf2_prev_q      <= 1'b0;
        xyzf2_curr_q      <= 1'b0;
    end
    else if (gif_is_vertex && (vert_threshold != 3'd0)) begin
        // Shift the window by one and take in the new vertex.
        v_curr_q          <= gif_reg_data;
        v_prev_q          <= v_curr_q;
        v_prev_prev_q     <= v_prev_q;
        xyzf2_curr_q      <= gif_vertex_is_xyzf2;
        xyzf2_prev_q      <= xyzf2_curr_q;
        xyzf2_prev_prev_q <= xyzf2_prev_q;

        // Only the first accepted vertex becomes the pivot.
        if (!pivot_seen_q) begin
            pivot_seen_q  <= 1'b1;
            v_pivot_q     <= gif_reg_data;
            xyzf2_pivot_q <= gif_vertex_is_xyzf2;
        end
    end
end
|
||
|
||
// Ch80 — per-vertex Gouraud color rolling window. Mirrors the
|
||
// vertex window above but samples rgbaq_q (the current GS
|
||
// color register, settled to its draw-time value because
|
||
// gif_packed_stub serializes A+D entries one per cycle, so any
|
||
// RGBAQ write that "belongs to" this vertex landed on the
|
||
// previous cycle and rgbaq_q has already updated). Cleared on
|
||
// rst_n AND on PRIM write so per-vertex color identity stays
|
||
// tied to the current primitive context.
|
||
// Rolling window of per-vertex colour words. Each accepted vertex samples
// the current rgbaq_q latch; the pivot entry is captured alongside the
// vertex pivot.
logic [63:0] c_curr_q;
logic [63:0] c_prev_q;
logic [63:0] c_prev_prev_q;
logic [63:0] c_pivot_q;

always_ff @(posedge clk) begin
    if (!rst_n || gif_is_prim_wr) begin
        // Reset and a PRIM write clear exactly the same state.
        c_pivot_q     <= 64'd0;
        c_prev_prev_q <= 64'd0;
        c_prev_q      <= 64'd0;
        c_curr_q      <= 64'd0;
    end
    else if (gif_is_vertex && (vert_threshold != 3'd0)) begin
        c_curr_q      <= rgbaq_q;
        c_prev_q      <= c_curr_q;
        c_prev_prev_q <= c_prev_q;

        // Gated by the same pivot_seen_q that the vertex window reads, so
        // the colour pivot is captured on the first-vertex cycle too.
        if (!pivot_seen_q)
            c_pivot_q <= rgbaq_q;
    end
end
|
||
|
||
// Brick 1 — per-vertex texture-coordinate (UV) rolling window.
|
||
// Exact mirror of the c_*_q Gouraud-color window above, but
|
||
// samples uv_q (the current GS UV register). For a textured
|
||
// SPRITE the two endpoints' UVs define the linear texel ramp:
|
||
// uv0 = uvc_curr_q (first endpoint), uv1 = uv_q (closing).
|
||
// UV writes are A+D entries serialized one-per-cycle by
|
||
// gif_packed_stub, so uv_q has already settled to the value that
|
||
// "belongs to" this vertex by the time the vertex commits — same
|
||
// timing argument as c_curr_q. Cleared on rst_n and PRIM write.
|
||
// Rolling window of per-vertex UV words. Each accepted vertex samples the
// current uv_q latch; the pivot entry is captured on the first accepted
// vertex (pivot_seen_q still low).
logic [63:0] uvc_curr_q;
logic [63:0] uvc_prev_q;
logic [63:0] uvc_prev_prev_q;
logic [63:0] uvc_pivot_q;

always_ff @(posedge clk) begin
    if (!rst_n || gif_is_prim_wr) begin
        // Reset and a PRIM write clear exactly the same state.
        uvc_pivot_q     <= 64'd0;
        uvc_prev_prev_q <= 64'd0;
        uvc_prev_q      <= 64'd0;
        uvc_curr_q      <= 64'd0;
    end
    else if (gif_is_vertex && (vert_threshold != 3'd0)) begin
        uvc_curr_q      <= uv_q;
        uvc_prev_q      <= uvc_curr_q;
        uvc_prev_prev_q <= uvc_prev_q;

        if (!pivot_seen_q)
            uvc_pivot_q <= uv_q;
    end
end
|
||
|
||
// Ch301 perspective — per-vertex S/T (ST register, 0x02) rolling
|
||
// window. EXACT parallel of the uvc_*_q UV window above, but samples
|
||
// st_q (the GS ST register) so the perspective triangle path has the
|
||
// float S_fp/T_fp at each of {v0,v1,v2}. The matching per-vertex Q
|
||
// (1/w) is NOT a new window: it is the existing RGBAQ.Q field already
|
||
// captured in the c_*_q colour window (extract bits [55:32]). Same
|
||
// reset / PRIM-clear / vertex-shift timing argument as uvc_*_q.
|
||
// GUARDED: when PERSPECTIVE_CORRECT=0 these are constant 0.
|
||
// Rolling window of per-vertex ST words.
//   PERSPECTIVE_CORRECT != 0 : registered window that samples st_q on each
//                              accepted vertex, with the same clear and
//                              pivot rules as the UV window.
//   PERSPECTIVE_CORRECT == 0 : all four entries are tied to constant 0.
logic [63:0] stc_curr_q;
logic [63:0] stc_prev_q;
logic [63:0] stc_prev_prev_q;
logic [63:0] stc_pivot_q;

generate
    if (PERSPECTIVE_CORRECT) begin : g_stc_window
        always_ff @(posedge clk) begin
            if (!rst_n || gif_is_prim_wr) begin
                // Reset and a PRIM write clear exactly the same state.
                stc_pivot_q     <= 64'd0;
                stc_prev_prev_q <= 64'd0;
                stc_prev_q      <= 64'd0;
                stc_curr_q      <= 64'd0;
            end
            else if (gif_is_vertex && (vert_threshold != 3'd0)) begin
                stc_curr_q      <= st_q;
                stc_prev_q      <= stc_curr_q;
                stc_prev_prev_q <= stc_prev_q;

                if (!pivot_seen_q)
                    stc_pivot_q <= st_q;
            end
        end
    end
    else begin : g_no_stc_window
        assign stc_pivot_q     = 64'd0;
        assign stc_prev_prev_q = 64'd0;
        assign stc_prev_q      = 64'd0;
        assign stc_curr_q      = 64'd0;
    end
endgenerate
|
||
|
||
// Snapshot: latched on the same edge prim_complete pulses, so it
|
||
// aligns one-for-one with EV_PRIM_DRAW in the trace block (which
|
||
// also fires its registered ev_valid pulse on that same edge).
|
||
// Held until the next prim_complete; idle cycles do not disturb
|
||
// the snapshot. Ch79: prim_color_q snapshots rgbaq_q on the same
|
||
// edge — RGBAQ writes are A+D entries on a different cycle from
|
||
// any XYZ2/XYZF2 commit (gif_packed_stub serializes A+D to one
|
||
// accept per cycle), so rgbaq_q is already settled to its
|
||
// "draw-time" value when prim_complete_now fires.
|
||
// Ch81: closing-vertex format flag is "the reg# being committed
|
||
// RIGHT NOW", not a registered value — the rolling-flag window
|
||
// hasn't shifted yet on this cycle. (Same parity as how the
|
||
// vertex slot uses gif_reg_data directly for the closing slot.)
|
||
// High when the closing primitive is a strip-class prim of type 5 (TRI_FAN)
// whose strip window is saturated; v0 then comes from the pivot slot.
logic fan_sat_path;
assign fan_sat_path = strip_saturated && prim_is_strip && (prim_type == 3'd5);

// Per-primitive snapshot, taken on the prim_complete_now edge and held until
// the next one. The closing vertex / colour are read live (gif_reg_data,
// gif_vertex_is_xyzf2, rgbaq_q); earlier vertices come from the rolling windows.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        prim_color_q            <= '0;
        // slot 0
        prim_v0_q               <= '0;
        prim_v0_decoded_q       <= '0;
        prim_color_v0_q         <= '0;
        prim_v0_color_decoded_q <= '0;
        // slot 1
        prim_v1_q               <= '0;
        prim_v1_decoded_q       <= '0;
        prim_color_v1_q         <= '0;
        prim_v1_color_decoded_q <= '0;
        // slot 2
        prim_v2_q               <= '0;
        prim_v2_decoded_q       <= '0;
        prim_color_v2_q         <= '0;
        prim_v2_color_decoded_q <= '0;
    end else if (prim_complete_now) begin
        // Flat "draw-time" colour is always the live RGBAQ register.
        prim_color_q <= rgbaq_q;

        unique case (vert_threshold)
            3'd1: begin // POINT: slot 0 = closing vertex, slots 1/2 cleared
                prim_v0_q               <= gif_reg_data;
                prim_v0_decoded_q       <= trace_pkg::decode_vertex(gif_reg_data, gif_vertex_is_xyzf2);
                prim_color_v0_q         <= rgbaq_q;
                prim_v0_color_decoded_q <= trace_pkg::decode_color(rgbaq_q);

                prim_v1_q               <= '0;
                prim_v1_decoded_q       <= '0;
                prim_color_v1_q         <= '0;
                prim_v1_color_decoded_q <= '0;

                prim_v2_q               <= '0;
                prim_v2_decoded_q       <= '0;
                prim_color_v2_q         <= '0;
                prim_v2_color_decoded_q <= '0;
            end

            3'd2: begin // LINE / LINE_STRIP / SPRITE: {curr, closing}, slot 2 cleared
                prim_v0_q               <= v_curr_q;
                prim_v0_decoded_q       <= trace_pkg::decode_vertex(v_curr_q, xyzf2_curr_q);
                prim_color_v0_q         <= c_curr_q;
                prim_v0_color_decoded_q <= trace_pkg::decode_color(c_curr_q);

                prim_v1_q               <= gif_reg_data;
                prim_v1_decoded_q       <= trace_pkg::decode_vertex(gif_reg_data, gif_vertex_is_xyzf2);
                prim_color_v1_q         <= rgbaq_q;
                prim_v1_color_decoded_q <= trace_pkg::decode_color(rgbaq_q);

                prim_v2_q               <= '0;
                prim_v2_decoded_q       <= '0;
                prim_color_v2_q         <= '0;
                prim_v2_color_decoded_q <= '0;
            end

            3'd3: begin // TRI / TRI_STRIP / TRI_FAN: {prev-or-pivot, curr, closing}
                // Slot 0 switches to the pivot only on the saturated-fan path;
                // every other 3-vertex close uses the rolling prev slot.
                prim_v0_q               <= fan_sat_path ? v_pivot_q : v_prev_q;
                prim_v0_decoded_q       <= trace_pkg::decode_vertex(
                                               fan_sat_path ? v_pivot_q     : v_prev_q,
                                               fan_sat_path ? xyzf2_pivot_q : xyzf2_prev_q);
                prim_color_v0_q         <= fan_sat_path ? c_pivot_q : c_prev_q;
                prim_v0_color_decoded_q <= trace_pkg::decode_color(fan_sat_path ? c_pivot_q : c_prev_q);

                prim_v1_q               <= v_curr_q;
                prim_v1_decoded_q       <= trace_pkg::decode_vertex(v_curr_q, xyzf2_curr_q);
                prim_color_v1_q         <= c_curr_q;
                prim_v1_color_decoded_q <= trace_pkg::decode_color(c_curr_q);

                prim_v2_q               <= gif_reg_data;
                prim_v2_decoded_q       <= trace_pkg::decode_vertex(gif_reg_data, gif_vertex_is_xyzf2);
                prim_color_v2_q         <= rgbaq_q;
                prim_v2_color_decoded_q <= trace_pkg::decode_color(rgbaq_q);
            end

            // Other thresholds leave the vertex slots untouched; the original
            // notes vert_threshold=0 gates prim_complete_now.
            default: ;
        endcase
    end
end
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch82 / Ch83 — minimal pixel emit
|
||
//
|
||
// One pixel per closed primitive, addressed from FRAME_1's FBP /
|
||
// FBW plus the closing vertex's integer screen coordinates.
|
||
// Ch83 added a PSM-aware bytes-per-pixel shift (bpp_shift below)
|
||
// so PSMCT16, PSMT8, PSMZ16 etc. compute correct fb_addr;
|
||
// PSMT4's true 0.5 bytes/pixel stays out of scope (host-word
|
||
// fallback). Multiplication is sim-only; synthesis target is
|
||
// unaffected because gs_stub is a stub.
|
||
// ------------------------------------------------------------------
|
||
// Combinational intermediates for the single-pixel emit address.
logic [11:0] pixel_x_next, pixel_y_next;              // integer screen X / Y
logic [31:0] fbp_bytes, pixels_per_row;               // frame base, row pitch in pixels
logic [31:0] pixel_index, pixel_byte_offset;          // linear pixel index, byte offset
logic [31:0] pixel_fb_addr_next;                      // final byte address
|
||
// Ch83: PSM-aware bytes-per-pixel shift. Drives the byte-offset
|
||
// multiplier for fb_addr. Recognized formats below; unrecognized
|
||
// PSMs fall back to 4-byte addressing (host-word, matches what
|
||
// PSMT4/T8H/T4HL/T4HH would produce for their host-slot
|
||
// addresses). PSMT4's 4-bit-per-pixel sub-byte addressing is
|
||
// out of scope — treated as host-word here.
|
||
//
|
||
// PSM | bytes/pixel | shift
|
||
// ----------+-------------+------
|
||
// PSMCT32 | 4 | 2 (host-word)
|
||
// PSMCT24 | 4 | 2 (24-bit packed in 32-bit slot)
|
||
// PSMCT16 | 2 | 1
|
||
// PSMCT16S | 2 | 1
|
||
// PSMT8 | 1 | 0
|
||
// PSMT4 | 0.5 | 2 * (* host-word fallback)
|
||
// PSMT8H | 4 | 2 (host-word)
|
||
// PSMT4HL | 4 | 2 (host-word)
|
||
// PSMT4HH | 4 | 2 (host-word)
|
||
// PSMZ32 | 4 | 2
|
||
// PSMZ24 | 4 | 2
|
||
// PSMZ16 | 2 | 1
|
||
// PSMZ16S | 2 | 1
|
||
// log2(bytes per pixel) for the FRAME_1 PSM field (frame_1_q[29:24]).
logic [1:0] bpp_shift;

always_comb begin
    // 4 bytes/pixel is both the common case and the fallback. It covers
    // PSMCT32 (00), PSMCT24 (01), PSMT8H (1B), PSMT4HL (24), PSMT4HH (2C),
    // PSMZ32 (30), PSMZ24 (31), PSMT4 (14, sub-byte unsupported) and any
    // unrecognised PSM.
    bpp_shift = 2'd2;
    unique case (frame_1_q[29:24])
        // PSMCT16 / PSMCT16S / PSMZ16 / PSMZ16S: 2 bytes/pixel
        6'h02, 6'h0A, 6'h32, 6'h3A: bpp_shift = 2'd1;
        // PSMT8: 1 byte/pixel
        6'h13:                      bpp_shift = 2'd0;
        default: ;                  // keep the host-word default
    endcase
end
|
||
|
||
// Integer part of the closing vertex's X / Y (the upper 12 bits of each
// 16-bit coordinate field, dropping 4 fraction bits).
assign pixel_x_next       = gif_reg_data[4  +: 12];
assign pixel_y_next       = gif_reg_data[20 +: 12];
// FBP * 2048 and FBW * 64, written as zero-padded concatenations.
assign fbp_bytes          = {12'd0, frame_1_q[8:0],   11'd0};
assign pixels_per_row     = {20'd0, frame_1_q[21:16],  6'd0};
// Linear pixel index = y * pitch + x, then scaled by bytes/pixel (Ch83).
assign pixel_index        = {20'd0, pixel_x_next}
                          + (pixels_per_row * {20'd0, pixel_y_next});
assign pixel_byte_offset  = pixel_index << bpp_shift;
assign pixel_fb_addr_next = pixel_byte_offset + fbp_bytes;
|
||
|
||
// Single-pixel emit register stage: one strobe plus a context snapshot per
// completed primitive.
always_ff @(posedge clk) begin
    // pixel_emit is a one-cycle strobe: low by default (and in reset),
    // overridden to 1 below on a primitive close.
    pixel_emit <= 1'b0;

    if (!rst_n) begin
        pixel_emit_count <= '0;
        pixel_fb_addr_q  <= '0;
        pixel_psm_q      <= '0;
        pixel_fbw_q      <= '0;
        pixel_fbp_q      <= '0;
        pixel_color_q    <= '0;
        pixel_y_q        <= '0;
        pixel_x_q        <= '0;
    end else if (prim_complete_now) begin
        pixel_emit       <= 1'b1;
        pixel_emit_count <= pixel_emit_count + 32'd1;
        // Closing-vertex position and the live draw colour.
        pixel_x_q        <= pixel_x_next;
        pixel_y_q        <= pixel_y_next;
        pixel_color_q    <= rgbaq_q;
        // FRAME_1 fields in effect at the close, plus the derived address.
        pixel_fbp_q      <= frame_1_q[8:0];
        pixel_fbw_q      <= frame_1_q[21:16];
        pixel_psm_q      <= frame_1_q[29:24];
        pixel_fb_addr_q  <= pixel_fb_addr_next;
    end
end
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch84 / Ch85 — minimal interior rasterizer
|
||
//
|
||
// Brutally simple but real:
|
||
// - SPRITE (PRIM=6): 2-vertex axis-aligned rectangle, fill all
|
||
// pixels in the bounding box.
|
||
// - TRI / TRI_STRIP / TRI_FAN: 3-vertex triangle, edge-function
|
||
// test at each pixel in the bounding box. Ch85 applies the
|
||
// D3D-style top-left fill rule: at FSM init, if signed area
|
||
// is < 0 the FSM swaps v1 and v2 to canonicalize to CCW;
|
||
// each post-swap edge is classified as top-or-left
|
||
// (inclusive) or right/bottom (exclusive); the inside test
|
||
// is `(e[i] + bias[i]) <= 0` for all i, where bias[i]=0 for
|
||
// inclusive edges and 1 for exclusive. Adjacent triangles
|
||
// sharing an edge no longer double-paint; degenerate
|
||
// triangles (signed area == 0) skip SCAN and latch
|
||
// raster_degenerate.
|
||
// - POINT / LINE / LINE_STRIP: do NOT raster — they keep the
|
||
// Ch82 single-closing-pixel emit on the pixel_emit channel
|
||
// and the raster channel stays quiet for them.
|
||
//
|
||
// FSM (Ch87 + Ch88): R_IDLE → R_SCAN → R_DRAIN. Each closed
|
||
// TRI/SPRITE that's not degenerate latches its full per-prim
|
||
// context (vertices, bias, signed area, per-vertex colors,
|
||
// FRAME_1 fields, bounding box) into a 2-entry FIFO at the
|
||
// close cycle. The FSM dequeues the oldest entry on
|
||
// R_IDLE→R_SCAN. In R_SCAN, S0 walks the bounding box one
|
||
// pixel per cycle into the S1/S2 pipeline; S2 emits inside
|
||
// pixels via raster_pixel_emit. When S0 reaches the bbox
|
||
// corner, the FSM transitions R_SCAN→R_DRAIN to let S1 and S2
|
||
// finish their tail without S0 producing more valids. The
|
||
// next pop fires on drain_done = (R_DRAIN) & !s1_valid_q &
|
||
// !s2_valid_q if the FIFO has more work, otherwise the FSM
|
||
// returns to R_IDLE. raster_active stays high across the
|
||
// R_DRAIN→R_SCAN seam for back-to-back primitives.
|
||
// raster_overflow latches only when a push arrives while the
|
||
// FIFO is full AND no concurrent pop frees a slot — Ch87's
|
||
// audit-medium fix preserves the new primitive on a
|
||
// simultaneous full+pop+push race.
|
||
//
|
||
// Color: TRI/STRIP/FAN are per-pixel Gouraud (Ch86, computed
|
||
// in S2 from latched per-vertex RGBAQ snapshots and the
|
||
// barycentric weights produced from the unbiased edge
|
||
// functions); SPRITE stays flat (Ch86 keeps SPRITE flat since
|
||
// it has only 2 vertices). Coordinates are integer (12.4
|
||
// top-12); sub-pixel snap-to-integer.
|
||
// ------------------------------------------------------------------
|
||
|
||
// Ch88 — added R_DRAIN to let the 3-stage pipeline (S0 coord
|
||
// generation / S1 edge test + barycentric weights / S2 color
|
||
// interp + fb_addr + emit) flush in-flight pixels after the
|
||
// last bbox coord is generated. The FSM stays in R_DRAIN
|
||
// until both S1 and S2 valid bits go low, then either pops
|
||
// the next FIFO entry or returns to R_IDLE.
|
||
// Raster FSM states (encodings spelled out; they match the implicit 0,1,2).
typedef enum logic [1:0] {
    R_IDLE  = 2'd0,
    R_SCAN  = 2'd1,
    R_DRAIN = 2'd2
} raster_state_e;
raster_state_e raster_state;

// Scan mode for the latched primitive.
typedef enum logic [1:0] {
    RM_NONE   = 2'd0,
    RM_SPRITE = 2'd1,
    RM_TRI    = 2'd2
} raster_mode_e;
raster_mode_e ras_mode;
|
||
|
||
// ---- Geometry / framebuffer context for the active scan ------------------
logic [11:0] ras_v0_x, ras_v1_x, ras_v2_x;   // vertex X (integer screen coords)
logic [11:0] ras_v0_y, ras_v1_y, ras_v2_y;   // vertex Y
logic [11:0] ras_x_min, ras_y_min;           // bounding-box low corner
logic [11:0] ras_x_max, ras_y_max;           // bounding-box high corner
logic [11:0] ras_cur_x, ras_cur_y;           // current scan coordinate
logic [63:0] ras_color;
logic [8:0]  ras_fbp;
logic [5:0]  ras_fbw;
logic [5:0]  ras_psm;
logic [1:0]  ras_bpp_shift;

// ---- Brick 1: texture context for the active scan ------------------------
logic        ras_tme;
logic [10:0] ras_u0, ras_u1;                 // endpoint U texel coords
logic [10:0] ras_v0t, ras_v1t;               // endpoint V texel coords
logic [31:0] ras_tex_base;
logic [13:0] ras_tbw;
logic [5:0]  ras_tpsm;

// Ch294: per-primitive wrap-mode context (CLAMP_1 + TEX0 dims).
logic [1:0]  ras_wms, ras_wmt;
logic [3:0]  ras_tw, ras_th;

// Brick 1 (DDA): popped Q16.16 texel-coordinate increments.
logic signed [31:0] ras_du_dx_q;
logic signed [31:0] ras_dv_dy_q;

// ---- Brick 2a: alpha-blend flag, constant for the whole primitive --------
// Per the design notes: set for a PSMCT32 SPRITE closed with PRIM.ABE=1 and a
// source-over ALPHA_1 config.
logic        ras_abe;

// Ch344: textured + source-over alpha sprite. Derived rather than stored:
// ras_abe covers flat and textured alpha sprites, ras_tme tells them apart.
logic        ras_tex_abe;
assign ras_tex_abe = ras_tme && ras_abe;

// Ch310: per-primitive bilinear-filter flag (TEX1_1.MMAG). Always loaded; per
// the design notes it is only consumed when BILINEAR_ENABLE=1.
logic        ras_filter_lin;

// Brick-2c: generic GS ALPHA selectors + FIX value. Always loaded; per the
// design notes they are only consumed when ALPHA_MODES_ENABLE=1.
logic [1:0]  ras_alpha_a;
logic [1:0]  ras_alpha_b;
logic [1:0]  ras_alpha_c;
logic [1:0]  ras_alpha_d;
logic [7:0]  ras_alpha_fix;

// ---- Brick 2b: Z-test context, constant across the scan ------------------
logic        ras_zte;       // depth test active for this scan
logic        ras_zmsk;      // 1 => do NOT update Z on pass
logic [1:0]  ras_ztst;      // 0=NEVER 1=ALWAYS 2=GEQUAL 3=GREATER
logic [8:0]  ras_zbp;       // ZBUF base in 2048-word page units
logic [31:0] ras_z_value;   // flat fragment Z for the sprite
||
|
||
// Choose primitive type for the scan based on PRIM[2:0] at
|
||
// close. Only TRI/TRI_STRIP/TRI_FAN drive RM_TRI; SPRITE
|
||
// drives RM_SPRITE; everything else is RM_NONE (no scan).
|
||
// Map the 3-bit primitive type to a raster scan mode:
//   3 / 4 / 5 (TRI / TRI_STRIP / TRI_FAN) -> RM_TRI
//   6 (SPRITE)                            -> RM_SPRITE
//   anything else                         -> RM_NONE (no scan)
function automatic raster_mode_e classify_prim_for_raster(input logic [2:0] pt);
    if ((pt == 3'd3) || (pt == 3'd4) || (pt == 3'd5)) begin
        return RM_TRI;
    end
    if (pt == 3'd6) begin
        return RM_SPRITE;
    end
    return RM_NONE;
endfunction
|
||
|
||
// Min/max helpers (unsigned 12-bit)
|
||
// Unsigned 12-bit minimum. Returns b when the operands are equal.
function automatic logic [11:0] umin12(input logic [11:0] a,
                                       input logic [11:0] b);
    return (a >= b) ? b : a;
endfunction
|
||
// Unsigned 12-bit maximum. Returns b when the operands are equal.
function automatic logic [11:0] umax12(input logic [11:0] a,
                                       input logic [11:0] b);
    return (a <= b) ? b : a;
endfunction
|
||
|
||
// Closing vertex's screen X/Y (live from gif_reg_data on the
|
||
// close cycle, same source the pixel-emit block uses).
|
||
// Closing-vertex fields, sliced live from gif_reg_data on the close cycle.
logic [11:0] close_x;   // integer X: bits [15:4]
logic [11:0] close_y;   // integer Y: bits [31:20]
logic [31:0] close_z;   // Brick 2b fragment Z: bits [63:32]

assign close_x = gif_reg_data[4  +: 12];
assign close_y = gif_reg_data[20 +: 12];
// For a SPRITE this Z is used for every pixel (flat Z).
assign close_z = gif_reg_data[32 +: 32];
|
||
|
||
// For 3-vertex prims the slot mapping mirrors the prim_v*_q
|
||
// snapshot logic: TRI/TRI_STRIP use {v_prev, v_curr, closing};
|
||
// TRI_FAN saturated uses {v_pivot, v_curr, closing}; FAN
|
||
// anchor draws coincide because v_pivot==v_prev there.
|
||
// Pre-swap triangle vertex coordinates for the closing primitive.
logic [11:0] tri_v0_x_next, tri_v1_x_next, tri_v2_x_next;
logic [11:0] tri_v0_y_next, tri_v1_y_next, tri_v2_y_next;

// v0: pivot slot on the saturated-fan path, otherwise the rolling prev slot.
assign {tri_v0_y_next, tri_v0_x_next} =
    fan_sat_path ? {v_pivot_q[20 +: 12], v_pivot_q[4 +: 12]}
                 : {v_prev_q[20 +: 12],  v_prev_q[4 +: 12]};
// v1: rolling curr slot.
assign tri_v1_x_next = v_curr_q[4  +: 12];
assign tri_v1_y_next = v_curr_q[20 +: 12];
// v2: the closing vertex, read live.
assign tri_v2_x_next = close_x;
assign tri_v2_y_next = close_y;
|
||
|
||
// For SPRITE the slot mapping is {v_curr=first endpoint,
|
||
// closing=second endpoint}. v0 / v1 form the diagonal corners.
|
||
// SPRITE diagonal corners: v0 = rolling curr slot, v1 = closing vertex.
logic [11:0] sp_v0_x_next, sp_v1_x_next;
logic [11:0] sp_v0_y_next, sp_v1_y_next;

assign sp_v0_x_next = v_curr_q[4  +: 12];
assign sp_v0_y_next = v_curr_q[20 +: 12];
assign sp_v1_x_next = close_x;
assign sp_v1_y_next = close_y;
|
||
|
||
// Brick 1 — SPRITE texel coords at the two endpoints. v0 (first
|
||
// endpoint) uses the windowed UV (uvc_curr_q); v1 (closing) uses
|
||
// the live UV register (uv_q) — same {prev, closing} mapping the
|
||
// geometry uses (sp_v0 = v_curr, sp_v1 = closing). UV fields are
|
||
// U=[13:0], V=[27:14] in 10.4 fixed-point; the integer texel
|
||
// coordinate is the top 11 bits (>>4) so the 0..2047 texel range
|
||
// fits gs_texel_addr's u/v[10:0] inputs.
|
||
// Integer texel coordinates for the two SPRITE endpoints: *0 from the
// windowed UV word (uvc_curr_q), *1 from the live UV register (uv_q).
// NOTE(review): both slices are 11 bits wide, so the U slice [14:4] includes
// bit 14. If V really starts at bit 14 (as the inline comment below says),
// V's least-significant fraction bit lands in U's integer MSB. Confirm the
// packing of uv_q / uvc_curr_q against where they are written.
logic [10:0] sp_u0_next, sp_v0t_next, sp_u1_next, sp_v1t_next;
|
||
assign sp_u0_next = uvc_curr_q[14:4]; // U integer part (10.4 → 11-bit)
|
||
assign sp_v0t_next = uvc_curr_q[28:18]; // V integer part (V starts at bit 14)
|
||
// Same two slices, applied to the live register for the closing endpoint.
assign sp_u1_next = uv_q[14:4];
|
||
assign sp_v1t_next = uv_q[28:18];
|
||
|
||
// Brick 1 — texture base byte address from TEX0.TBP0.
|
||
//
|
||
// TBP0 scaling convention: base_bytes = TBP0 * 256 (i.e. TBP0<<8).
|
||
// This MATCHES the VRAM upload path (gif_image_xfer_stub scales
|
||
// BITBLTBUF.DBP the same way: dest_base = DBP*256, see that file's
|
||
// `dest_base_q <= {18'd0, dbp} << 8`). It deliberately does NOT
|
||
// match the framebuffer/pcrtc FBP*2048 convention — TBP0 and DBP
|
||
// are the texture/transfer base namespace, FBP is the framebuffer
|
||
// namespace, and a real flow uploads a texture via BITBLT to some
|
||
// DBP and then points TEX0.TBP0 at the same value. Choosing the
|
||
// DBP scaling here is what makes "BITBLT a texture, then TEX0 it"
|
||
// read back from exactly where it was written.
|
||
//
|
||
// TODO(upload-path reconciliation): v1's TB preloads the texture
|
||
// DIRECTLY into vram_stub at base = TBP0*256 (bypassing BITBLT/TRX)
|
||
// so this scaling is the only contract that must hold. A follow-on
|
||
// chapter must verify the round trip "BITBLT upload at DBP → TEX0
|
||
// TBP0=DBP → textured sprite samples the uploaded texels" end to
|
||
// end, including any PSM-specific block/page swizzle (this linear
|
||
// path assumes the non-swizzled layout, matching gs_texel_addr).
|
||
// Texture base byte address for the primitive being pushed.
logic [31:0] tex_base_next;
|
||
// Pad tex0_tbp0 with 18 zero bits above it, then shift left by 8 (x256).
assign tex_base_next = {18'd0, tex0_tbp0} << 8;
|
||
|
||
// Brick 1 (DDA) — per-pixel Q16.16 texel-coord step for the SPRITE
|
||
// being pushed. ONE divide each (per primitive, at the close cycle):
|
||
// du_dx = ((u1 - u0) << 16) / (x1 - x0)
|
||
// dv_dy = ((v1 - v0) << 16) / (y1 - y0)
|
||
// A zero span (single-column/row sprite) yields step 0 so the
|
||
// coord holds the first endpoint's texel. These feed fifo_du_dx /
|
||
// fifo_dv_dy at push; the per-pixel datapath uses only the popped
|
||
// step with a multiply (dda_uv above) — no per-pixel divide.
|
||
// Sprite DDA setup: signed spans, texel deltas, and Q16.16 per-pixel steps.
logic signed [31:0] sp_span_x, sp_span_y;     // screen-space extent (v1 - v0)
logic signed [31:0] sp_du, sp_dv;             // texel-space extent (end - start)
logic signed [31:0] du_dx_next, dv_dy_next;   // Q16.16 step per pixel

// Each unsigned coordinate gets one zero guard bit before $signed, so the
// subtraction is exact once widened to 32 bits.
assign sp_span_x = $signed({1'b0, sp_v1_x_next}) - $signed({1'b0, sp_v0_x_next});
assign sp_span_y = $signed({1'b0, sp_v1_y_next}) - $signed({1'b0, sp_v0_y_next});
assign sp_du     = $signed({1'b0, sp_u1_next})   - $signed({1'b0, sp_u0_next});
assign sp_dv     = $signed({1'b0, sp_v1t_next})  - $signed({1'b0, sp_v0t_next});

// One divide per axis; a zero span gives a zero step so the coordinate holds
// the first endpoint's texel.
assign du_dx_next = (sp_span_x != 32'sd0) ? ((sp_du <<< 16) / sp_span_x) : 32'sd0;
assign dv_dy_next = (sp_span_y != 32'sd0) ? ((sp_dv <<< 16) / sp_span_y) : 32'sd0;
|
||
|
||
// Edge-function evaluation for the current scan pixel inside a
|
||
// triangle. Use 32-bit signed math: e(p) = (p.x - vA.x) *
|
||
// (vB.y - vA.y) - (p.y - vA.y) * (vB.x - vA.x) for edge AB.
|
||
// Inside test (Ch85 top-left fill rule): for the post-swap CCW
|
||
// triangle, e[i] is interior-negative; an edge i is "top-or-
|
||
// left" iff (dy > 0) OR (dy == 0 AND dx > 0) — those edges are
|
||
// INCLUSIVE (e == 0 counts as inside); other edges are
|
||
// EXCLUSIVE (strict e < 0). Implemented by a per-edge bias: 0
|
||
// for top-left, 1 for right/bottom; the comparison then
|
||
// becomes (e + bias <= 0).
|
||
// Edge function for directed edge A->B evaluated at point P:
//   (P.x - A.x) * (B.y - A.y) - (P.y - A.y) * (B.x - A.x)
// All arithmetic is 32-bit signed; the result is returned as a raw 32-bit
// vector (two's complement).
function automatic logic [31:0] edge_fn(input logic [11:0] px,
                                        input logic [11:0] py,
                                        input logic [11:0] ax,
                                        input logic [11:0] ay,
                                        input logic [11:0] bx,
                                        input logic [11:0] by);
    logic signed [31:0] rel_x, rel_y;     // P relative to A
    logic signed [31:0] run_x, run_y;     // B relative to A
    // A zero guard bit keeps each 12-bit coordinate non-negative once signed.
    rel_x = $signed({1'b0, px}) - $signed({1'b0, ax});
    rel_y = $signed({1'b0, py}) - $signed({1'b0, ay});
    run_x = $signed({1'b0, bx}) - $signed({1'b0, ax});
    run_y = $signed({1'b0, by}) - $signed({1'b0, ay});
    return (rel_x * run_y) - (rel_y * run_x);
endfunction
|
||
|
||
// Top-or-left predicate for the directed edge (vA -> vB), taken in CCW math
// traversal order with Y-down screen coordinates.
//
// Returns 1 when dy > 0, or when dy == 0 and dx > 0, where
// (dx, dy) = vB - vA. Returns 0 otherwise.
//
// All three comparisons use 13-bit signed literals to match dx / dy. The
// previous `dy == 32'sd0` compared a 13-bit value against a 32-bit literal;
// the result is the same, but the operand widths now agree.
function automatic logic top_or_left(input logic [11:0] ax,
                                     input logic [11:0] ay,
                                     input logic [11:0] bx,
                                     input logic [11:0] by);
    // One guard bit above the 12-bit unsigned coordinates keeps the signed
    // difference exact over the full -4095..+4095 range.
    logic signed [12:0] dx, dy;
    dx = $signed({1'b0, bx}) - $signed({1'b0, ax});
    dy = $signed({1'b0, by}) - $signed({1'b0, ay});
    return (dy > 13'sd0) || ((dy == 13'sd0) && (dx > 13'sd0));
endfunction
|
||
|
||
// Signed area of the original (pre-swap) triangle, computed
|
||
// combinationally from the slot-mapped vertices. >0 = CCW,
|
||
// <0 = CW, 0 = degenerate. Drives the FSM init's swap decision
|
||
// and the degenerate skip.
|
||
// Signed area (cross product of v0->v1 and v0->v2) of the pre-swap triangle,
// evaluated in 32-bit signed arithmetic.
logic signed [31:0] tri_sa_calc;
assign tri_sa_calc =
      (($signed({1'b0, tri_v1_x_next}) - $signed({1'b0, tri_v0_x_next})) *
       ($signed({1'b0, tri_v2_y_next}) - $signed({1'b0, tri_v0_y_next})))
    - (($signed({1'b0, tri_v1_y_next}) - $signed({1'b0, tri_v0_y_next})) *
       ($signed({1'b0, tri_v2_x_next}) - $signed({1'b0, tri_v0_x_next})));

// Negative area -> swap v1/v2 downstream; zero area -> degenerate.
logic tri_swap_ccw;
logic tri_degenerate;
assign tri_swap_ccw   = (32'sd0 >  tri_sa_calc);
assign tri_degenerate = (32'sd0 == tri_sa_calc);
|
||
|
||
// Post-swap CCW vertices. If sa < 0 we swap v1 and v2 so the
|
||
// top-left rule below applies uniformly.
|
||
// Post-swap vertex coordinates: v0 passes through, v1 and v2 exchange when
// tri_swap_ccw is set.
logic [11:0] tri_v0x, tri_v1x, tri_v2x;
logic [11:0] tri_v0y, tri_v1y, tri_v2y;

assign tri_v0x = tri_v0_x_next;
assign tri_v0y = tri_v0_y_next;
assign {tri_v1x, tri_v2x} = tri_swap_ccw ? {tri_v2_x_next, tri_v1_x_next}
                                         : {tri_v1_x_next, tri_v2_x_next};
assign {tri_v1y, tri_v2y} = tri_swap_ccw ? {tri_v2_y_next, tri_v1_y_next}
                                         : {tri_v1_y_next, tri_v2_y_next};
|
||
|
||
// Brick 3 — per-vertex fragment Z for the 3 triangle vertices.
|
||
// Source Z follows the SAME {prev, curr, closing} (or {pivot, curr,
|
||
// closing} for a saturated fan) slot mapping the X/Y use, and the
|
||
// SAME v1<->v2 swap as the geometry / colors so vertex-attribute
|
||
// alignment is preserved into the post-swap CCW slots. Z lives in
|
||
// bits [63:32] of the vertex word (XYZ2 layout, matching close_z and
|
||
// the Brick-2b sprite Z path). Affine (screen-linear) Z is correct
|
||
// for this chapter; perspective-correct Z is a future item.
|
||
// Pre-swap per-vertex Z (bits [63:32] of each vertex word), using the same
// slot mapping as X/Y.
logic [31:0] tri_v0_z_next;
logic [31:0] tri_v1_z_next;
logic [31:0] tri_v2_z_next;
assign tri_v0_z_next = fan_sat_path ? v_pivot_q[32 +: 32] : v_prev_q[32 +: 32];
assign tri_v1_z_next = v_curr_q[32 +: 32];
assign tri_v2_z_next = close_z;             // closing vertex, gif_reg_data[63:32]

// Post-swap Z: v1/v2 exchange together with the geometry.
logic [31:0] tri_v0z;
logic [31:0] tri_v1z;
logic [31:0] tri_v2z;
assign tri_v0z = tri_v0_z_next;
assign {tri_v1z, tri_v2z} = tri_swap_ccw ? {tri_v2_z_next, tri_v1_z_next}
                                         : {tri_v1_z_next, tri_v2_z_next};
|
||
|
||
// Post-swap per-vertex Gouraud colors (same mapping the FIFO color
|
||
// fields use), exposed as wires so the Brick-3 gradient setup can
|
||
// read per-channel vertex attributes at the push cycle.
|
||
// Post-swap per-vertex colour words. The closing vertex's colour is the live
// rgbaq_q; v1/v2 exchange under tri_swap_ccw.
logic [63:0] tri_c0;
logic [63:0] tri_c1;
logic [63:0] tri_c2;
assign tri_c0 = !fan_sat_path ? c_prev_q : c_pivot_q;
assign tri_c1 = !tri_swap_ccw ? c_curr_q : rgbaq_q;
assign tri_c2 = !tri_swap_ccw ? rgbaq_q  : c_curr_q;
|
||
|
||
// Textured-triangle rung — post-swap per-vertex texture coordinate
|
||
// (UV). EXACT mirror of the tri_c* colour mapping above but on the
|
||
// uvc_*_q rolling window (the closing vertex's UV is the live uv_q,
|
||
// same role rgbaq_q plays for colour). U=[14:4], V=[28:18] in the
|
||
// 10.4 UV register → integer texel = >>4 (the [14:4]/[28:18] slice),
|
||
// matching the SPRITE sp_u0_next/sp_v0t_next decode. The closing UV
|
||
// is taken from uv_q (the just-written closing vertex's UV).
|
||
// Post-swap per-vertex UV. The tri_uvc* words are the raw window / live
// register values; the tri_uv*_u / _v signals are 11-bit integer texel
// coordinates sliced from them (U = bits [14:4], V = bits [28:18]).
logic [63:0] tri_uvc0, tri_uvc1, tri_uvc2;
logic [10:0] tri_uv0_u, tri_uv1_u, tri_uv2_u;
logic [10:0] tri_uv0_v, tri_uv1_v, tri_uv2_v;

// Slot selection: same pivot/prev choice and v1<->v2 swap as the colours,
// with uv_q standing in for the closing vertex.
assign tri_uvc0 = !fan_sat_path ? uvc_prev_q : uvc_pivot_q;
assign tri_uvc1 = !tri_swap_ccw ? uvc_curr_q : uv_q;
assign tri_uvc2 = !tri_swap_ccw ? uv_q       : uvc_curr_q;

assign tri_uv0_u = tri_uvc0[4  +: 11];
assign tri_uv0_v = tri_uvc0[18 +: 11];
assign tri_uv1_u = tri_uvc1[4  +: 11];
assign tri_uv1_v = tri_uvc1[18 +: 11];
assign tri_uv2_u = tri_uvc2[4  +: 11];
assign tri_uv2_v = tri_uvc2[18 +: 11];
|
||
|
||
// Ch301 perspective — post-swap per-vertex S/T/Q (24-bit fixed-point,
|
||
// FRAC=12). EXACT parallel of the tri_uv*/tri_c* post-swap mapping:
|
||
// S_fp = st_q[23:0] (carried in stc_*_q[23:0])
|
||
// T_fp = st_q[55:32] (carried in stc_*_q[55:32])
|
||
// Q_fp = rgbaq[55:32] (the RGBAQ.Q field already in the c_*_q
|
||
// colour window — extract [55:32], NOT a new
|
||
// window). For the closing vertex Q comes from
|
||
// the live rgbaq_q, exactly as the colour and
|
||
// UV closing values use rgbaq_q / uv_q.
|
||
// The stc window words pick {pivot|prev, curr|live, live|curr} with
|
||
// the same tri_swap_ccw v1<->v2 swap as geometry/colour/UV. Q rides
|
||
// the SAME tri_c0/c1/c2 post-swap colour words so S/T/Q stay aligned
|
||
// to the same physical vertex after the swap.
|
||
// GUARDED: when PERSPECTIVE_CORRECT=0 the stc window + tri_c* are 0,
|
||
// so all of these are 0 and never reach the (param-gated) FIFO use.
|
||
// Ch301: post-swap per-vertex S / T / Q, 24 bits each.
logic [63:0] tri_stc0, tri_stc1, tri_stc2;    // raw ST window / live words
logic [23:0] tri_s0, tri_s1, tri_s2;          // S = ST word bits [23:0]
logic [23:0] tri_t0, tri_t1, tri_t2;          // T = ST word bits [55:32]
logic [23:0] tri_q0, tri_q1, tri_q2;          // Q = colour word bits [55:32]

// Same slot selection and v1<->v2 swap as geometry / colour / UV, with st_q
// standing in for the closing vertex.
assign tri_stc0 = !fan_sat_path ? stc_prev_q : stc_pivot_q;
assign tri_stc1 = !tri_swap_ccw ? stc_curr_q : st_q;
assign tri_stc2 = !tri_swap_ccw ? st_q       : stc_curr_q;

assign tri_s0 = tri_stc0[0  +: 24];
assign tri_t0 = tri_stc0[32 +: 24];
assign tri_s1 = tri_stc1[0  +: 24];
assign tri_t1 = tri_stc1[32 +: 24];
assign tri_s2 = tri_stc2[0  +: 24];
assign tri_t2 = tri_stc2[32 +: 24];

// Q comes from the post-swap colour words, so it follows the same vertex as
// S and T after the swap.
assign tri_q0 = tri_c0[32 +: 24];
assign tri_q1 = tri_c1[32 +: 24];
assign tri_q2 = tri_c2[32 +: 24];
|
||
|
||
// Per-edge top-left flag (1 = top-or-left, 0 = right/bottom).
|
||
// Computed against the post-swap CCW order so bias[i] = ~tl[i].
|
||
// Per-edge top-left flags on the post-swap vertices, in edge order
// v0->v1, v1->v2, v2->v0 (1 = top-or-left).
logic tl0_init;
logic tl1_init;
logic tl2_init;
assign tl0_init = top_or_left(tri_v0x, tri_v0y, tri_v1x, tri_v1y);
assign tl1_init = top_or_left(tri_v1x, tri_v1y, tri_v2x, tri_v2y);
assign tl2_init = top_or_left(tri_v2x, tri_v2y, tri_v0x, tri_v0y);

// Latched per-edge bias for the scan: 1 = right/bottom (exclusive) edge,
// 0 = top-left (inclusive) edge.
logic [2:0] ras_bias;

// Ch86: latched post-swap per-vertex colours and the signed area that the
// design notes describe as the positive barycentric divisor.
logic        [63:0] ras_c0_q, ras_c1_q, ras_c2_q;
logic signed [31:0] ras_sa_q;

// Brick 3: in-flight affine-interpolation flag for the active triangle.
logic ras_tri_active;

// COMBINED probe: ras_combined_r is the stored flag (per the design notes,
// only written when COMBINED_TAZ=1). ras_combined is the gated view; the
// g_ras_combined / g_no_ras_combined generate drives it from the register or
// ties it to constant 0.
logic ras_combined_r;
logic ras_combined;

// Brick 3: Q16.16 attribute gradients (R, G, B, A, Z) along X and Y, plus the
// Z base value.
logic signed [31:0] ras_dr_dx, ras_dg_dx, ras_db_dx, ras_da_dx, ras_dz_dx;
logic signed [31:0] ras_dr_dy, ras_dg_dy, ras_db_dy, ras_da_dy, ras_dz_dy;
logic        [31:0] ras_z0;

// Textured-triangle rung: base texel coordinate at post-swap v0 and the four
// Q16.16 U/V gradients.
logic        [10:0] ras_u0_base, ras_v0_base;
logic signed [31:0] ras_du_dx_t, ras_dv_dx_t;
logic signed [31:0] ras_du_dy_t, ras_dv_dy_t;

// Ch301 perspective: enable flag, S/T/Q base at post-swap v0, and the six
// Q16.16 gradients. Per the design notes these stay 0 when
// PERSPECTIVE_CORRECT=0.
logic               ras_persp;
logic        [23:0] ras_s0_base, ras_t0_base, ras_q0_base;
logic signed [31:0] ras_ds_dx, ras_dt_dx, ras_dq_dx;
logic signed [31:0] ras_ds_dy, ras_dt_dy, ras_dq_dy;
|
||
|
||
// COMBINED probe — gate ras_combined to the register when the param
|
||
// is enabled, else tie it to a constant 0 net (continuous assign, per
|
||
// iverilog-12 tie-off rule). At COMBINED_TAZ=0 every `ras_combined`
|
||
// reference below collapses to 0 and the legacy paths are restored
|
||
// bit-for-bit.
|
||
// Elaboration-time select for the gated ras_combined view: exactly one of the
// two continuous assigns below exists in any given build.
generate
|
||
if (COMBINED_TAZ) begin : g_ras_combined
|
||
// Feature enabled: the gated view follows the stored flag.
assign ras_combined = ras_combined_r;
|
||
end else begin : g_no_ras_combined
|
||
// Feature disabled: constant 0, ras_combined_r is not read.
assign ras_combined = 1'b0;
|
||
end
|
||
endgenerate
|
||
|
||
// ------------------------------------------------------------------
|
||
// Brick 3 — SYNTHESIZABLE affine attribute interpolation setup.
|
||
//
|
||
// The Ch86 per-pixel barycentric divide (interp_byte) is NOT
|
||
// synthesizable (Quartus error 272006 on the 64-bit signed divide,
|
||
// hence the `// synthesis translate_off` guard). Replace it with
|
||
// screen-space-affine incremental interpolation, exactly mirroring
|
||
// how the SPRITE UV path was made synthesizable (dda_uv):
|
||
//
|
||
// * AT SETUP (per primitive, latency-tolerant — happens at the
|
||
// FIFO-push close cycle): solve, for each attribute A, the plane
|
||
// A(x,y) = A0 + (x-x0)*dAdx + (y-y0)*dAdy
|
||
// from the 3 post-swap CCW vertices. dAdx / dAdy are computed in
|
||
// Q16.16 fixed-point with ONE divide each by the post-swap signed
|
||
// area det (= ras_sa_q, positive). A few divides per triangle at
|
||
// setup are fine — same budget as the sprite-UV DDA.
|
||
//
|
||
// * PER PIXEL (S2 stage): accumulate A0 + ((dx*dAdx + dy*dAdy)>>16)
|
||
// with pure multiply + add + arithmetic shift — NO divide, NO
|
||
// translate_off. Color channels clamp to [0,255]; Z is a full
|
||
// 32-bit value fed to the Brick-2b Z path.
|
||
//
|
||
// det (post-swap) = (x1-x0)(y2-y0) - (x2-x0)(y1-y0) evaluated on the
|
||
// post-swap CCW vertices; this equals ras_sa_q by construction (the
|
||
// pre-swap tri_sa_calc negated when tri_swap_ccw). Computed here
|
||
// directly on the post-swap vertices so the gradient solve is
|
||
// self-consistent regardless of swap.
|
||
// Determinant of the post-swap triangle in 32-bit signed arithmetic:
//   (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0)
// This is the shared denominator for the gradient solve.
logic signed [31:0] tri_det_post;
assign tri_det_post =
      (($signed({1'b0, tri_v1x}) - $signed({1'b0, tri_v0x})) *
       ($signed({1'b0, tri_v2y}) - $signed({1'b0, tri_v0y})))
    - (($signed({1'b0, tri_v2x}) - $signed({1'b0, tri_v0x})) *
       ($signed({1'b0, tri_v1y}) - $signed({1'b0, tri_v0y})));
|
||
|
||
// Gradient solve, Q16.16 result. a0/a1/a2 are the post-swap
|
||
// per-vertex attribute values; the (x,y) deltas are taken from the
|
||
// post-swap vertices. A zero det (degenerate) yields 0 step (the
|
||
// primitive is filtered as degenerate before push anyway, so this is
|
||
// just defensive). Signed 56-bit numerator: for 8-bit color the numerator
// is tiny; for Z the demo values are small (<~0x400) so num<<16 stays
// well within signed 56-bit. Wide-range Z differences would overflow the
// <<16 intermediate — documented future item (perspective-correct +
// wide-Z interpolation).
|
||
// SYNTHESIS-WIDTH NOTE. The Ch86 divide errored on Quartus (272006)
// because a 64-bit SIGNED divide sign-extends the LPM operand to 65
// bits, exceeding the 64-bit lpm_divide limit. To stay safely under
// that cap the setup divide here is bounded to a signed 56-bit
// dividend / signed 32-bit divisor (LPM_WIDTHN ~= 56 < 64). The
// attribute inputs are 32-bit and the coordinate deltas are signed
// 14-bit (12-bit screen coords, so |delta| <= 4095). Because the <<16
// is applied inside the 56-bit numerator domain, the PRE-shift
// numerator (da1*d2 - da2*d1) must fit in 40 signed bits, i.e.
// |num| < 2^39. With two ~2^12 delta terms that holds while the
// per-vertex attribute differences stay below roughly 2^26: 8-bit
// color trivially fits and demo-range Z fits. A full 32-bit attribute
// difference (~33 bits) times a 14-bit delta is ~47 bits, minus a like
// term is ~48 bits, and <<16 would need ~64 bits — it does NOT fit the
// 56-bit dividend and wraps. Wide-range Z is a documented FUTURE item
// alongside perspective-correct Z.
|
||
// -----------------------------------------------------------------
|
||
// Ch295 — divider-collapse for Quartus fit. The Brick-3 setup
|
||
// previously inlined a SEPARATE hardware divide at each of the 10
|
||
// (attribute × axis) gradient call sites — ~10 dividers, the
|
||
// dominant new LAB cost that pushed the fitter ~24% over. All 10
|
||
// divides share the SAME denominator (tri_det_post). They are now
|
||
// computed by a SINGLE divider time-shared across the 10 numerators
|
||
// by a small per-triangle setup engine (`grad_setup_*` below).
|
||
//
|
||
// These functions return ONLY the pre-shifted signed numerator
|
||
// (num <<< 16) — pure multiply / subtract / shift, NO divide. The
|
||
// shared engine performs `quo = num_q / det_q` and truncates to
|
||
// [31:0], so the arithmetic is BIT-IDENTICAL to the previous
|
||
// per-call divide+truncate (same num, same det, same
|
||
// truncation). Numerator width unchanged (signed 56-bit: see the
|
||
// SYNTHESIS-WIDTH NOTE above for the bound derivation; the <<<16 is
|
||
// applied here in the 56-bit numerator domain exactly as before).
|
||
// grad_num_dadx — pre-shifted numerator of the affine x-gradient dA/dx.
//
// Returns ((a1 - a0) * (y2 - y0) - (a2 - a0) * (y1 - y0)) <<< 16 as a
// signed 56-bit value. Pure multiply / subtract / shift; the divide by the
// triangle determinant happens outside this function.
//
//   a0, a1, a2 : attribute value at vertices 0, 1, 2 (signed 32-bit).
//   y0, y1, y2 : vertex y coordinates. They are zero-extended before the
//                subtraction, so they are treated as unsigned 12-bit
//                values even though the ports are declared signed.
//   x0, x1, x2 : accepted for a uniform call shape; not read here.
function automatic logic signed [55:0] grad_num_dadx(
    input logic signed [31:0] a0,
    input logic signed [31:0] a1,
    input logic signed [31:0] a2,
    input logic signed [11:0] x0, input logic signed [11:0] y0,
    input logic signed [11:0] x1, input logic signed [11:0] y1,
    input logic signed [11:0] x2, input logic signed [11:0] y2);
    logic signed [55:0] attr_d1, attr_d2;   // a1 - a0, a2 - a0 (sign-extended to 56 bits)
    logic signed [55:0] rise_1,  rise_2;    // y1 - y0, y2 - y0 (range -4095..4095)
    attr_d1 = a1 - a0;
    attr_d2 = a2 - a0;
    rise_1  = $signed({2'd0, y1}) - $signed({2'd0, y0});
    rise_2  = $signed({2'd0, y2}) - $signed({2'd0, y0});
    // Cross term in the 56-bit domain, then the Q16.16 pre-shift.
    grad_num_dadx = (attr_d1 * rise_2 - attr_d2 * rise_1) <<< 16;
endfunction
|
||
|
||
// grad_num_dady — pre-shifted numerator of the affine y-gradient dA/dy.
//
// Returns ((a2 - a0) * (x1 - x0) - (a1 - a0) * (x2 - x0)) <<< 16 as a
// signed 56-bit value. Pure multiply / subtract / shift; the divide by the
// triangle determinant happens outside this function.
//
//   a0, a1, a2 : attribute value at vertices 0, 1, 2 (signed 32-bit).
//   x0, x1, x2 : vertex x coordinates. They are zero-extended before the
//                subtraction, so they are treated as unsigned 12-bit
//                values even though the ports are declared signed.
//   y0, y1, y2 : accepted for a uniform call shape; not read here.
function automatic logic signed [55:0] grad_num_dady(
    input logic signed [31:0] a0,
    input logic signed [31:0] a1,
    input logic signed [31:0] a2,
    input logic signed [11:0] x0, input logic signed [11:0] y0,
    input logic signed [11:0] x1, input logic signed [11:0] y1,
    input logic signed [11:0] x2, input logic signed [11:0] y2);
    logic signed [55:0] attr_d1, attr_d2;   // a1 - a0, a2 - a0 (sign-extended to 56 bits)
    logic signed [55:0] run_1,   run_2;     // x1 - x0, x2 - x0 (range -4095..4095)
    attr_d1 = a1 - a0;
    attr_d2 = a2 - a0;
    run_1   = $signed({2'd0, x1}) - $signed({2'd0, x0});
    run_2   = $signed({2'd0, x2}) - $signed({2'd0, x0});
    // Cross term in the 56-bit domain, then the Q16.16 pre-shift.
    grad_num_dady = (attr_d2 * run_1 - attr_d1 * run_2) <<< 16;
endfunction
|
||
|
||
// Ch87 — raster command FIFO. Holds primitive contexts captured
|
||
// at close cycles while the FSM is busy. Effective concurrency =
|
||
// 1 in-flight (in ras_*_q) + FIFO_DEPTH queued.
|
||
//
|
||
// Ch171 bumped FIFO_DEPTH 2 → 4 as a tactical fix for the Ch171
|
||
// 320×240 test card (4 large SPRITEs at 160×120 = 19,200 raster
|
||
// cycles each), where DMA was pushing all 4 sprites before
|
||
// raster popped any, overflowing the 2-entry FIFO and silently
|
||
// dropping the 4th sprite.
|
||
//
|
||
// Ch172 — proper backpressure landed (`raster_fifo_full` output
|
||
// → `gif_packed_stub.raster_fifo_full` → GIF deasserts in_ready
|
||
// → DMAC pauses BEFORE pushing a qword that could push_drop).
|
||
// With backpressure, FIFO_DEPTH is independent of "how many
|
||
// sprites we might draw" — it only sets the work-ahead window
|
||
// while raster is busy.
|
||
//
|
||
// Brick-3 fix — FIFO_DEPTH 2 → 4. The Ch295 divider-collapse made
|
||
// a TRI's affine gradients take ~10 cycles to solve (one shared
|
||
// divider) AFTER push, and a slot is held (grad_pending) until its
|
||
// own gradients are ready, so a TRI now occupies its FIFO slot for
|
||
// ~10 extra cycles before it can pop. The pre-Ch295 combinational
|
||
// solve let the head pop immediately, so 1-in-flight + 2-queued
|
||
// (DEPTH=2) absorbed 3 back-to-back closes; the solve latency
|
||
// consumes one of those slots' worth of slack. DEPTH=4 restores the
|
||
// original work-ahead window (the grad-solving slot is effectively
|
||
// "in flight"), so back-to-back TRI closes are queued, never
|
||
// dropped — the gate is the per-slot grad_pending on POP, not a
|
||
// global push stall. (tb_ee_core_gif_raster_queue pushes 3 closes
|
||
// with raster_fifo_full tied 0, i.e. no backpressure relief.)
|
||
// DEPTH=4 (not 3) keeps FIFO_PTR_W a power-of-two so the plain
|
||
// `+1` ring-pointer wrap stays correct (Ch171's value; DEPTH=3
|
||
// would need explicit mod-3 wrap). The added slots are plain
|
||
// registers — cheap next to the ~9 dividers Ch295 removed; no new
|
||
// wide datapath.
|
||
//
|
||
// Parallel arrays (one per latched field) avoid iverilog 12's
|
||
// packed-struct-array field-access quirk seen in Ch81.
|
||
// Number of slots in the raster command FIFO, taken from TILE_FIFO_DEPTH.
// Sizes every fifo_* array and attr_ram, and (via $clog2) the ring pointers.
localparam int FIFO_DEPTH = TILE_FIFO_DEPTH; // Ch315 — was hardcoded 4; now a param (power-of-2)
|
||
// ----------------------------------------------------------------------
|
||
// Ch328 (1a) — BRAM-backed per-prim ATTRIBUTE store. The ~76 HEAVY render
|
||
// attributes below cost ~1,033 regs + ~600 ALMs PER FIFO slot (Ch315), the
|
||
// capacity wall. They move to a packed-struct M20K `attr_ram` (the HOT
|
||
// bin/classification fields — mode, bbox, combined, tri_active, grad_pending —
|
||
// stay in registers). 1a = DARK dual-write only: a delayed-copy pack stage
|
||
// mirrors the register fifo_* into attr_ram on push; all reads stay on the
|
||
// registers, so the build is byte-identical. 1b switches the latch sites to
|
||
// prefetch from attr_ram and drops the heavy register arrays (the payoff).
|
||
// prim_attr_t — layout / width specification of one attr_ram word.
// Field ORDER is the pack order: the first field listed occupies the most
// significant bits. The push-side concatenations and every LHS-concat
// unpack must list their elements in exactly this order.
// INPUTS only: the 20 ENGINE-WRITTEN gradient outputs (dr_dx..dz_dy,
// du_dx_t..dv_dy_t, ds_dx..dq_dy) are deliberately NOT part of this word —
// they stay in fifo_* sideband registers (the gradient engine writes them
// per step; M20K-resident would force a per-step wide read-modify-write).
// Width shrinks 1693 -> 1053 (640 = 20 output fields removed).
typedef struct packed {
    // --- geometry -------------------------------------------------------
    logic        [11:0] v0x, v0y;
    logic        [11:0] v1x, v1y;
    logic        [11:0] v2x, v2y;
    logic        [2:0]  bias;
    logic signed [31:0] sa;
    // --- colors ---------------------------------------------------------
    logic        [63:0] color;
    logic        [63:0] c0, c1, c2;
    // --- frame buffer context -------------------------------------------
    logic        [8:0]  fbp;
    logic        [5:0]  fbw;
    logic        [5:0]  psm;
    logic        [1:0]  bpp_shift;
    // --- texture context ------------------------------------------------
    logic               tme;
    logic        [10:0] u0, v0;
    logic        [10:0] u1, v1;
    logic        [31:0] tex_base;
    logic        [13:0] tbw;
    logic        [5:0]  tpsm;
    logic        [1:0]  wms, wmt;
    logic        [3:0]  tw, th;
    logic signed [31:0] du_dx, dv_dy;
    // --- blend / filter -------------------------------------------------
    logic               abe;
    logic               filter_lin;
    logic        [1:0]  alpha_a, alpha_b, alpha_c, alpha_d;
    logic        [7:0]  alpha_fix;
    // --- depth test -----------------------------------------------------
    logic               zte;
    logic        [1:0]  ztst;
    logic               zmsk;
    logic        [8:0]  zbp;
    logic        [31:0] zval;
    // --- triangle affine bases + per-vertex data for the gradient solve --
    logic        [31:0] z0;
    logic        [10:0] u0_base, v0_base;
    logic        [31:0] u1v, u2v;
    // --- perspective (S/T/Q) context ------------------------------------
    logic               persp;
    logic        [23:0] s0_base, t0_base, q0_base;
    logic        [63:0] stq1, stq2;
    logic        [23:0] q1, q2;
    // --- per-vertex Z for vertices 1 and 2 ------------------------------
    logic        [31:0] v1z, v2z;
} prim_attr_t;

// Width of one packed attribute word; expected 1053 (inputs only; 1b).
localparam int PRIM_ATTR_W = $bits(prim_attr_t);
|
||
// Ch328 1c — the actual RAM is a FLAT packed vector (NOT a prim_attr_t struct array). Quartus
|
||
// elaborates a struct array PER FIELD (attr_ram[i].tme, attr_ram[i].abe, ...) → ~76 tiny per-
|
||
// field arrays that never become M20K; a flat [W-1:0] word is ONE wide memory it can infer.
|
||
// prim_attr_t is retained ONLY as the layout/width spec ($bits + the documented pack order).
|
||
// One flat PRIM_ATTR_W-bit word per FIFO slot; the ramstyle attribute asks the tool for M20K block RAM.
(* ramstyle = "M20K" *) logic [PRIM_ATTR_W-1:0] attr_ram [0:FIFO_DEPTH-1]; // heavy-attr table
|
||
// Ch328 1c — SINGLE registered read port so attr_ram infers as 1R1W (DUAL_PORT) M20K instead of a
|
||
// multi-read register file. TWO consumers use it: the multiprim grid (mp_load_prim) and the gradient
|
||
// prefetch. They are TIME-EXCLUSIVE — the grid waits for all_grad_done before rendering — so one
|
||
// address mux + one registered data reg suffices. Each drives its issue strobe + address for ONE
|
||
// cycle, then reads attr_rd_q the NEXT (the inherent +1 latency of a registered RAM read). attr_rd_q
|
||
// is the ONLY synthesis read. NOTE: the streaming pop is DELIBERATELY NOT a port consumer — its read
|
||
// stays a single-cycle attr_ram[fifo_rptr] register read to preserve the full-FIFO push+pop
|
||
// same-cycle slot-free concurrency (a multi-cycle pop drops prims); streaming is shallow and gains
|
||
// no M20K benefit. pop_rd_issue/pop_rd_addr are therefore tied off (kept for the uniform mux shape).
|
||
// Shared registered read port of attr_ram.
//
// Each requester raises its *_rd_issue strobe together with its *_rd_addr;
// the selected address is presented to attr_ram combinationally and the
// word appears in attr_rd_q on the following clock edge.
logic mp_rd_issue, pop_rd_issue, grad_rd_issue;                  // 1-cycle read requests

// Address widths are spelled $clog2(FIFO_DEPTH) (the same value as
// FIFO_PTR_W, and the same spelling pack_slot uses) because FIFO_PTR_W is
// only declared further down in this module; referencing a localparam
// ahead of its declaration is rejected by stricter tools.
logic [$clog2(FIFO_DEPTH)-1:0] mp_rd_addr, pop_rd_addr, grad_rd_addr; // their slots
logic [$clog2(FIFO_DEPTH)-1:0] attr_rd_addr;                     // muxed read address (comb)
reg   [PRIM_ATTR_W-1:0]        attr_rd_q;                        // registered read data (sole synth read)

always_comb begin
    // mp + grad are time-exclusive; pop is tied off. The fixed priority
    // (grad, then mp, then pop) is belt-and-suspenders; with no request
    // the port parks on slot 0.
    if (grad_rd_issue)     attr_rd_addr = grad_rd_addr;
    else if (mp_rd_issue)  attr_rd_addr = mp_rd_addr;
    else if (pop_rd_issue) attr_rd_addr = pop_rd_addr;
    else                   attr_rd_addr = '0;
end

// Registered read: data for the address issued this cycle is valid next cycle.
always_ff @(posedge clk) attr_rd_q <= attr_ram[attr_rd_addr];
|
||
`ifndef SYNTHESIS
// Simulation-only guard on the shared attr_ram read port: the multiprim
// grid (mp) and the gradient prefetch (grad) do not arbitrate, so outside
// reset they must never request the port in the same cycle.
always_ff @(posedge clk) begin
    if (rst_n && mp_rd_issue && grad_rd_issue) begin
        $error("ch328 1c: mp+grad concurrent attr read — expected time-exclusive via all_grad_done");
    end
end
`endif
|
||
`ifndef SYNTHESIS
// Simulation-only elaboration sanity checks.
initial begin
    // The pack / unpack concatenations are written against a 1053-bit word.
    if (PRIM_ATTR_W != 1053)
        $error("ch328: prim_attr_t width drifted to %0d (expected 1053) — pack/unpack/checker out of sync", PRIM_ATTR_W);
    // FIFO_DEPTH is parameter-driven; the ring pointers are $clog2(FIFO_DEPTH)
    // bits wide and wrap with a plain +1, which is only correct for a
    // power-of-two depth of at least 2.
    if ((FIFO_DEPTH < 2) || ((FIFO_DEPTH & (FIFO_DEPTH - 1)) != 0))
        $error("gs_stub: FIFO_DEPTH=%0d must be a power of two >= 2 (ring pointers wrap with a plain +1)", FIFO_DEPTH);
end
`endif
|
||
// delayed-copy pack-stage state: 1 cyc after push_ok, fifo_*[pack_slot] are settled
|
||
// Pack-stage bookkeeping: a valid flag and the slot it refers to.
logic                          pack_v;
logic [$clog2(FIFO_DEPTH)-1:0] pack_slot;

// Ch328 (1b) — sinks for the grad-engine-ONLY input fields. A rasterizer
// reader (mp_load_prim = mp_dump_*, pop = pp_dump_*) unpacks the full
// attr_ram word but does not consume these fields, so they land here.
logic [31:0] mp_dump_u1v,  mp_dump_u2v,  mp_dump_v1z, mp_dump_v2z,
             pp_dump_u1v,  pp_dump_u2v,  pp_dump_v1z, pp_dump_v2z;
logic [63:0] mp_dump_stq1, mp_dump_stq2,
             pp_dump_stq1, pp_dump_stq2;
logic [23:0] mp_dump_q1,   mp_dump_q2,
             pp_dump_q1,   pp_dump_q2;
|
||
// Ch328 (1b) — PUSH-ASSEMBLED input word (same-cycle write so attr_ram is valid at push+1,
|
||
// matching the register FIFO; the delayed pack stage caused a streaming pop read-after-write
|
||
// hazard). Two full concats faithfully MIRROR the push's TRI/SPRITE branches in EXACT pack
|
||
// order; the equivalence checker validates them against the live fifo_* writes (dark). Common
|
||
// fields (color/frame/tex/wrap/filter/alpha/ztst/zmsk/zbp) are identical in both branches.
|
||
// Push-assembled attr_ram words. Each concatenation lists one element per
// prim_attr_t field, in pack order (first element = most significant
// field); the trailing comment on every row names the fields it fills.

// TRI push word.
wire [PRIM_ATTR_W-1:0] tri_attr_word;
assign tri_attr_word = {
    tri_v0x, tri_v0y, tri_v1x, tri_v1y, tri_v2x, tri_v2y,          // v0x v0y v1x v1y v2x v2y
    {~tl2_init, ~tl1_init, ~tl0_init},                             // bias
    (tri_swap_ccw ? -tri_sa_calc : tri_sa_calc),                   // sa (negated when swapped)
    rgbaq_q,                                                       // color
    (fan_sat_path ? c_pivot_q : c_prev_q),                         // c0
    (tri_swap_ccw ? rgbaq_q : c_curr_q),                           // c1
    (tri_swap_ccw ? c_curr_q : rgbaq_q),                           // c2
    frame_1_q[8:0], frame_1_q[21:16], frame_1_q[29:24], bpp_shift, // fbp fbw psm bpp_shift
    close_tme_effective,                                           // tme
    11'd0, 11'd0, 11'd0, 11'd0,                                    // u0 v0 u1 v1 (zero for a TRI)
    tex_base_next, {8'd0, tex0_tbw}, tex0_psm,                     // tex_base tbw tpsm
    clamp_wms, clamp_wmt, tex0_tw, tex0_th,                        // wms wmt tw th
    32'sd0, 32'sd0,                                                // du_dx dv_dy (zero for a TRI)
    1'b0, tex1_mmag,                                               // abe filter_lin
    alpha_a, alpha_b, alpha_c, alpha_d, alpha_fix,                 // alpha_a..alpha_d alpha_fix
    new_tri_zte_active, test_ztst, zbuf_zmsk, zbuf_zbp,            // zte ztst zmsk zbp
    32'd0,                                                         // zval (zero for a TRI)
    tri_v0z,                                                       // z0
    tri_uv0_u, tri_uv0_v,                                          // u0_base v0_base
    {5'd0, tri_uv1_v, 5'd0, tri_uv1_u},                            // u1v = {v, u} of vertex 1
    {5'd0, tri_uv2_v, 5'd0, tri_uv2_u},                            // u2v = {v, u} of vertex 2
    (saw_st_q && close_tme_effective),                             // persp
    tri_s0, tri_t0, tri_q0,                                        // s0_base t0_base q0_base
    {16'd0, tri_t1, tri_s1}, {16'd0, tri_t2, tri_s2},              // stq1 stq2
    tri_q1, tri_q2,                                                // q1 q2
    tri_v1z, tri_v2z                                               // v1z v2z
};

// SPRITE push word. Triangle-only fields are filled with zeros.
wire [PRIM_ATTR_W-1:0] sp_attr_word;
assign sp_attr_word = {
    sp_v0_x_next, sp_v0_y_next, sp_v1_x_next, sp_v1_y_next,        // v0x v0y v1x v1y
    12'd0, 12'd0,                                                  // v2x v2y
    3'b000, 32'sd0,                                                // bias sa
    rgbaq_q, 64'd0, 64'd0, 64'd0,                                  // color c0 c1 c2
    frame_1_q[8:0], frame_1_q[21:16], frame_1_q[29:24], bpp_shift, // fbp fbw psm bpp_shift
    prim_tme,                                                      // tme
    sp_u0_next, sp_v0t_next, sp_u1_next, sp_v1t_next,              // u0 v0 u1 v1
    tex_base_next, {8'd0, tex0_tbw}, tex0_psm,                     // tex_base tbw tpsm
    clamp_wms, clamp_wmt, tex0_tw, tex0_th,                        // wms wmt tw th
    du_dx_next, dv_dy_next,                                        // du_dx dv_dy
    new_abe_active, tex1_mmag,                                     // abe filter_lin
    alpha_a, alpha_b, alpha_c, alpha_d, alpha_fix,                 // alpha_a..alpha_d alpha_fix
    new_zte_active, test_ztst, zbuf_zmsk, zbuf_zbp,                // zte ztst zmsk zbp
    close_z,                                                       // zval
    32'd0,                                                         // z0
    11'd0, 11'd0, 32'd0, 32'd0,                                    // u0_base v0_base u1v u2v
    1'b0, 24'd0, 24'd0, 24'd0,                                     // persp s0_base t0_base q0_base
    64'd0, 64'd0, 24'd0, 24'd0,                                    // stq1 stq2 q1 q2
    32'd0, 32'd0                                                   // v1z v2z
};

// Word written on push: the TRI layout for RM_TRI, the SPRITE layout otherwise.
wire [PRIM_ATTR_W-1:0] attr_word_next;
assign attr_word_next = (new_mode == RM_TRI) ? tri_attr_word : sp_attr_word;
|
||
// ---- Raster command FIFO storage: one parallel array per latched field,
// ---- each with FIFO_DEPTH entries.
raster_mode_e fifo_mode [0:FIFO_DEPTH-1];

// Vertex coordinates and bounding box.
logic [11:0] fifo_v0x   [0:FIFO_DEPTH-1], fifo_v0y   [0:FIFO_DEPTH-1],
             fifo_v1x   [0:FIFO_DEPTH-1], fifo_v1y   [0:FIFO_DEPTH-1],
             fifo_v2x   [0:FIFO_DEPTH-1], fifo_v2y   [0:FIFO_DEPTH-1],
             fifo_x_min [0:FIFO_DEPTH-1], fifo_x_max [0:FIFO_DEPTH-1],
             fifo_y_min [0:FIFO_DEPTH-1], fifo_y_max [0:FIFO_DEPTH-1];
logic        [2:0]  fifo_bias [0:FIFO_DEPTH-1];
logic signed [31:0] fifo_sa   [0:FIFO_DEPTH-1];

// Colors.
logic [63:0] fifo_color [0:FIFO_DEPTH-1], fifo_c0 [0:FIFO_DEPTH-1],
             fifo_c1    [0:FIFO_DEPTH-1], fifo_c2 [0:FIFO_DEPTH-1];

// Frame-buffer context.
logic [8:0] fifo_fbp       [0:FIFO_DEPTH-1];
logic [5:0] fifo_fbw       [0:FIFO_DEPTH-1], fifo_psm [0:FIFO_DEPTH-1];
logic [1:0] fifo_bpp_shift [0:FIFO_DEPTH-1];

// Brick 1 — per-primitive texture context (SPRITE DECAL v1). fifo_tme
// gates the textured path; fifo_u0/v0/u1/v1 are the integer texel coords
// at the two SPRITE endpoints (already >>4 from the UV 10.4 fixed-point);
// fifo_tex_base/tbw/tpsm describe the texture in VRAM. Carried alongside
// the geometry so the in-flight scan samples the right texture even if a
// new TEX0/PRIM arrives.
logic        fifo_tme      [0:FIFO_DEPTH-1];
logic [10:0] fifo_u0       [0:FIFO_DEPTH-1], fifo_v0 [0:FIFO_DEPTH-1],
             fifo_u1       [0:FIFO_DEPTH-1], fifo_v1 [0:FIFO_DEPTH-1];
logic [31:0] fifo_tex_base [0:FIFO_DEPTH-1];
logic [13:0] fifo_tbw      [0:FIFO_DEPTH-1];
logic [5:0]  fifo_tpsm     [0:FIFO_DEPTH-1];

// Ch294 — per-primitive wrap-mode snapshot (CLAMP_1 WMS/WMT + TEX0 TW/TH).
logic [1:0] fifo_wms [0:FIFO_DEPTH-1], fifo_wmt [0:FIFO_DEPTH-1];
logic [3:0] fifo_tw  [0:FIFO_DEPTH-1], fifo_th  [0:FIFO_DEPTH-1];

// Brick 1 (DDA) — Q16.16 per-pixel texel-coord increments, computed once
// per primitive at push (single divide each).
logic signed [31:0] fifo_du_dx [0:FIFO_DEPTH-1], fifo_dv_dy [0:FIFO_DEPTH-1];

// Brick 2a — per-primitive alpha-blend-active flag.
logic fifo_abe [0:FIFO_DEPTH-1];

// Ch310 — per-primitive bilinear-filter flag (TEX1_1.MMAG). Loaded always
// (mirrors fifo_abe); consumed only when BILINEAR_ENABLE=1, so the default
// build is byte-identical.
logic fifo_filter_lin [0:FIFO_DEPTH-1];

// Brick 2c — per-primitive generic GS ALPHA selector snapshot (A/B/C/D +
// FIX). Loaded always; consumed by u_comb_blend only when
// ALPHA_MODES_ENABLE=1, so the default build is byte-identical.
logic [1:0] fifo_alpha_a   [0:FIFO_DEPTH-1], fifo_alpha_b [0:FIFO_DEPTH-1],
            fifo_alpha_c   [0:FIFO_DEPTH-1], fifo_alpha_d [0:FIFO_DEPTH-1];
logic [7:0] fifo_alpha_fix [0:FIFO_DEPTH-1];

// Brick 2b — per-primitive Z-test context.
logic        fifo_zte  [0:FIFO_DEPTH-1];
logic [1:0]  fifo_ztst [0:FIFO_DEPTH-1];
logic        fifo_zmsk [0:FIFO_DEPTH-1];
logic [8:0]  fifo_zbp  [0:FIFO_DEPTH-1];
logic [31:0] fifo_zval [0:FIFO_DEPTH-1];

// Brick 3 — per-primitive TRI affine interpolation context: Q16.16
// per-pixel gradients for R/G/B/A/Z plus the Z base value at post-swap
// vertex 0 (fifo_z0). The color bases come from ras_c0_q[..] (already
// captured); x0/y0 are ras_v0_x/ras_v0_y. fifo_tri_active gates the whole
// affine path.
logic fifo_tri_active [0:FIFO_DEPTH-1];

// COMBINED probe — per-primitive flag: this slot is the textured + alpha +
// depth-tested triangle that the combined FSM owns. Set only when
// COMBINED_TAZ=1 (close_combined). Always 0 at param=0.
logic fifo_combined [0:FIFO_DEPTH-1];

logic signed [31:0] fifo_dr_dx [0:FIFO_DEPTH-1], fifo_dr_dy [0:FIFO_DEPTH-1],
                    fifo_dg_dx [0:FIFO_DEPTH-1], fifo_dg_dy [0:FIFO_DEPTH-1],
                    fifo_db_dx [0:FIFO_DEPTH-1], fifo_db_dy [0:FIFO_DEPTH-1],
                    fifo_da_dx [0:FIFO_DEPTH-1], fifo_da_dy [0:FIFO_DEPTH-1],
                    fifo_dz_dx [0:FIFO_DEPTH-1], fifo_dz_dy [0:FIFO_DEPTH-1];
logic        [31:0] fifo_z0    [0:FIFO_DEPTH-1];

// Textured-triangle rung — per-slot U/V affine context. U and V ride the
// SAME shared-divider setup engine as R/G/B/A/Z; the per-pixel evaluator
// reuses the existing right-sized 32x16 interp_affine path.
// fifo_u0_base/v0_base are the integer texel coords at post-swap vertex 0
// (the affine base, like fifo_z0 for Z); fifo_u1v/u2v carry the post-swap
// v1 / v2 {v,u} pair packed for the engine recompute; the four *_t
// gradient fields are written by the engine. The texture descriptor reuses
// fifo_tme/fifo_tex_base/fifo_tbw/fifo_tpsm (0 for an untextured TRI, so
// the legacy Gouraud path is unchanged).
logic        [10:0] fifo_u0_base [0:FIFO_DEPTH-1], fifo_v0_base [0:FIFO_DEPTH-1];
logic        [31:0] fifo_u1v     [0:FIFO_DEPTH-1], fifo_u2v     [0:FIFO_DEPTH-1];
logic signed [31:0] fifo_du_dx_t [0:FIFO_DEPTH-1], fifo_du_dy_t [0:FIFO_DEPTH-1],
                    fifo_dv_dx_t [0:FIFO_DEPTH-1], fifo_dv_dy_t [0:FIFO_DEPTH-1];

// Ch301 perspective — per-slot S/T/Q context. A TRI uses the perspective
// path iff fifo_persp[slot]=1 (PERSPECTIVE_CORRECT=1 AND TME AND
// ST-sourced). S/T/Q are 24-bit fixed-point (FRAC=12). The *_base values
// are post-swap vertex 0; fifo_stq1/2 pack {16'd0, T(24), S(24)} of
// post-swap v1 / v2 and fifo_q1/q2 carry their Q(24), so the decoupled
// gradient engine can recompute the numerators at schedule time. The six
// gradient fields (dS/dx,dS/dy,dT/dx,dT/dy,dQ/dx,dQ/dy) are signed Q16.16,
// solved by the shared divider engine on steps 14..19. Declared
// unconditionally; the param-gated USE is the GRAD_STEPS / fifo_persp /
// ras_persp plumbing, so the param=0 build is byte-identical.
logic               fifo_persp   [0:FIFO_DEPTH-1];
logic        [23:0] fifo_s0_base [0:FIFO_DEPTH-1], fifo_t0_base [0:FIFO_DEPTH-1],
                    fifo_q0_base [0:FIFO_DEPTH-1];
logic        [63:0] fifo_stq1    [0:FIFO_DEPTH-1], fifo_stq2    [0:FIFO_DEPTH-1];
logic        [23:0] fifo_q1      [0:FIFO_DEPTH-1], fifo_q2      [0:FIFO_DEPTH-1];
logic signed [31:0] fifo_ds_dx   [0:FIFO_DEPTH-1], fifo_ds_dy   [0:FIFO_DEPTH-1],
                    fifo_dt_dx   [0:FIFO_DEPTH-1], fifo_dt_dy   [0:FIFO_DEPTH-1],
                    fifo_dq_dx   [0:FIFO_DEPTH-1], fifo_dq_dy   [0:FIFO_DEPTH-1];

// Brick-3 fix — per-slot post-swap v1.z / v2.z, stored so the gradient
// setup engine can recompute the Z numerators for a slot at engine-start
// time (v0.z is already in fifo_z0).
logic [31:0] fifo_v1z [0:FIFO_DEPTH-1], fifo_v2z [0:FIFO_DEPTH-1];

// Ch295 — per-slot "gradients not yet solved" flag. Set when a TRI is
// pushed; cleared by the shared single-divider setup engine once it has
// written all of that slot's gradient fields. pop_ok is blocked for a slot
// while this is set, so the rasterizer never pops a TRI whose affine
// gradients are still being computed.
logic fifo_grad_pending [0:FIFO_DEPTH-1];
|
||
|
||
// Ch171 — widen pointers / counter for FIFO_DEPTH=4. The
|
||
// pre-Ch171 declarations were 1-bit pointer + 2-bit counter
|
||
// tuned for FIFO_DEPTH=2; bumping depth without widening these
|
||
// truncated indices to 1 bit so the FIFO addressed only 2 of
|
||
// its 4 slots (re-introduced the "drop sprite 4" bug under a
|
||
// different shape). PTR_W / CNT_W follow FIFO_DEPTH.
|
||
// Ring-pointer / occupancy widths, both derived from FIFO_DEPTH.
localparam int FIFO_PTR_W = $clog2(FIFO_DEPTH);       // 2 for DEPTH=4
localparam int FIFO_CNT_W = $clog2(FIFO_DEPTH + 1);   // 3 for DEPTH=4

logic [FIFO_PTR_W-1:0] fifo_wptr, fifo_rptr;          // write / read slot index
logic [FIFO_CNT_W-1:0] fifo_count;                    // occupied slots, 0..FIFO_DEPTH
logic                  fifo_empty, fifo_full;

// Occupancy flags decoded straight from the counter.
assign fifo_full  = (fifo_count == FIFO_CNT_W'(FIFO_DEPTH));
assign fifo_empty = (fifo_count == '0);
|
||
|
||
// Ch295 / Brick-3 fix — single time-shared gradient divider + setup
|
||
// engine, DECOUPLED from the push.
|
||
//
|
||
// The engine self-schedules: whenever it is idle and some FIFO slot
|
||
// is flagged grad_pending, it picks the lowest such slot, recomputes
|
||
// that slot's 10 pre-shifted numerators (grad_num_q) and the common
|
||
// divisor (grad_det_q) from the slot's STORED post-swap vertex /
|
||
// colour / Z data, then iterates ONE divide per cycle over the 10
|
||
// numerators (grad_step 0..9), writing quo[31:0] into the matching
|
||
// fifo_d*_* field of grad_slot. On the last step it clears that
|
||
// slot's grad_pending and goes idle, then picks the next pending
|
||
// slot (if any). Setup is ~10 cycles per triangle — negligible
|
||
// against the thousands of cycles a TRI scan takes — and pop_ok
|
||
// holds each slot until its own gradients are ready, so the engine
|
||
// can never be clobbered by a back-to-back push and no primitive is
|
||
// dropped. The numerators are recomputed from FIFO storage (the same
|
||
// grad_num_dadx/dady functions as before), so the arithmetic is
|
||
// BIT-IDENTICAL to the pre-fix per-push solve.
|
||
// GRAD_STEPS = 14: R/G/B/A/Z (5 attrs × dx,dy = 10 steps) PLUS the
|
||
// textured-triangle rung's U,V (2 attrs × dx,dy = 4 more steps:
|
||
// 10=dU/dx 11=dU/dy 12=dV/dx 13=dV/dy). The shared divider simply
|
||
// iterates 4 more cycles per triangle — NO new divider, NO new
|
||
// per-pixel division. grad_step is 5-bit (0..31) so it covers 0..13
// (and 0..19 with perspective); only the localparam + the step decode grow.
|
||
// Ch301 perspective — GRAD_STEPS grows to 20 ONLY when the
|
||
// perspective path is compiled in (steps 14..19 = dS/dx,dS/dy,
|
||
// dT/dx,dT/dy,dQ/dx,dQ/dy). When PERSPECTIVE_CORRECT=0 it stays 14 so
|
||
// the engine iterates the same 14 steps and the grad_num_q /
|
||
// grad_load_num arrays are the same size as before — the param=0
|
||
// build is unchanged in size-effect and behavior.
|
||
// Number of divides per triangle: 14 normally, 20 when the perspective
// path is compiled in.
localparam int GRAD_STEPS = PERSPECTIVE_CORRECT ? 20 : 14;

// --- gradient setup engine state ---------------------------------------
logic                  grad_busy;
logic                  grad_writing;   // Ch352 — write phase: grad_result_q -> fifo (single-cycle)
logic [FIFO_PTR_W-1:0] grad_slot;      // FIFO slot currently being solved
logic [4:0]            grad_step,      // 0..13 (or 0..19 w/ perspective)
                       grad_settle;    // Ch352 — divide settle counter (0..GRAD_DIV_CYCLES-1)

// Divisor and per-step pre-shifted numerators latched for grad_slot.
logic signed [31:0] grad_det_q;
logic signed [55:0] grad_num_q [0:GRAD_STEPS-1];

// (* preserve *) — Ch352 (Codex): keep grad_result_q as a named 32-reg
// endpoint. Without it, synthesis optimized/renamed it away, the multicycle
// -to matched 0 registers, the exception applied to NOTHING, and Place
// churned timing the -66ns divider single-cycle. The SDC asserts the dest
// count is exactly 32.
(* preserve *) logic signed [31:0] grad_result_q;   // settled divide captured here (the ONE multicycle endpoint)
|
||
|
||
// Combinational "is there a slot waiting for gradients?" scan and the
|
||
// index of the lowest such slot. Used only when the engine is idle to
|
||
// pick the next triangle to solve.
|
||
// Pending-slot scan: grad_pending_any is set when at least one slot has
// fifo_grad_pending raised, and grad_pending_slot is the LOWEST-numbered
// such slot (0 when none is pending).
logic                  grad_pending_any;
logic [FIFO_PTR_W-1:0] grad_pending_slot;
always_comb begin
    grad_pending_any  = 1'b0;
    grad_pending_slot = '0;
    // Ascending walk that only accepts the first hit, so the lowest
    // pending index wins.
    for (int idx = 0; idx < FIFO_DEPTH; idx = idx + 1) begin
        if (!grad_pending_any && fifo_grad_pending[idx]) begin
            grad_pending_any  = 1'b1;
            grad_pending_slot = FIFO_PTR_W'(idx);
        end
    end
end
|
||
|
||
// Combinational numerators + divisor for the slot the engine is about
|
||
// to load (grad_pending_slot). These reuse the grad_num_dadx/dady
|
||
// functions on the slot's STORED post-swap data — the same pure
|
||
// multiply/subtract/shift expressions evaluated at push time before
|
||
// the fix. fifo_sa[slot] is the post-swap determinant (== tri_det_post
|
||
// by construction). Only ONE slot's worth is computed at a time (mux
|
||
// by grad_pending_slot), so this is one set of the existing functions,
|
||
// not FIFO_DEPTH copies — no new wide datapath.
|
||
// Combinational load values for the gradient engine: one pre-shifted
// numerator per engine step plus the shared divisor.
logic signed [55:0] grad_load_num [0:GRAD_STEPS-1];
logic signed [31:0] grad_load_det;

// Per-vertex inputs of the slot being loaded.
logic [63:0] grad_ld_c0,  grad_ld_c1,  grad_ld_c2;          // packed vertex colors
logic [11:0] grad_ld_v0x, grad_ld_v0y,
             grad_ld_v1x, grad_ld_v1y,
             grad_ld_v2x, grad_ld_v2y;                      // vertex coordinates
logic [31:0] grad_ld_z0,  grad_ld_z1,  grad_ld_z2;          // vertex Z

// Textured-triangle rung — per-vertex U/V for the slot being loaded.
logic [10:0] grad_ld_u0,  grad_ld_v0u,
             grad_ld_u1,  grad_ld_v1u,
             grad_ld_u2,  grad_ld_v2u;

// Ch301 perspective — per-vertex S/T/Q for the slot being loaded.
// Declared unconditionally (cheap); only ASSIGNED + used by the
// param-gated grad_load_num[14..19] block.
logic [23:0] grad_ld_s0, grad_ld_s1, grad_ld_s2,
             grad_ld_t0, grad_ld_t1, grad_ld_t2,
             grad_ld_q0, grad_ld_q1, grad_ld_q2;
|
||
// Ch328 (1b) step C — registered M20K prefetch for the gradient engine. grad_word_q holds
|
||
// attr_ram[grad_slot] (1-cyc sync read, loaded in the prefetch cycle); the grad_ld_* unpack it
|
||
// (LHS-concat, pack order) instead of reading the input fifo_* registers. gw_d_* sink the
|
||
// fields the gradient solve doesn't use. (pop_ok holds the slot until grad_pending clears, so
|
||
// the extra prefetch cycle leaves the rendered framebuffer byte-identical.)
|
||
// Gradient-engine prefetch state: the attribute word being solved and the
// two prefetch-stage flags (grad_prefetch2 is the Ch328 1c second stage,
// i.e. the +1 registered-read-latency cycle).
reg [PRIM_ATTR_W-1:0] grad_word_q;
reg                   grad_prefetching;
reg                   grad_prefetch2;

// Gradient read request: raised while the engine is neither busy nor in
// either prefetch stage and some slot is pending; it targets the slot
// chosen by the pending scan.
assign grad_rd_addr  = grad_pending_slot;
assign grad_rd_issue = grad_pending_any &&
                       !(grad_busy || grad_prefetching || grad_prefetch2);

// Ch328 1c — mp_load_prim read-stall FSM. A load site latches its
// slot/bbox + sets mp_rd_state to RDS_ISSUE and HOLDS the tile phase;
// RDS_ISSUE drives the shared read address so attr_rd_q is valid in
// RDS_CONSUME, where mp_load_prim consumes it and the phase advances.
localparam logic [1:0] RDS_IDLE    = 2'd0;
localparam logic [1:0] RDS_ISSUE   = 2'd1;
localparam logic [1:0] RDS_CONSUME = 2'd2;
reg [1:0]            mp_rd_state;
reg [FIFO_PTR_W-1:0] mp_pend_slot;
reg [11:0]           mp_pend_cxlo, mp_pend_cxhi, mp_pend_cylo, mp_pend_cyhi;

// The multiprim request is live exactly while the FSM sits in RDS_ISSUE.
assign mp_rd_addr  = mp_pend_slot;
assign mp_rd_issue = (mp_rd_state == RDS_ISSUE);

// Ch328 1c — streaming pop is NOT on the shared M20K port (it stays a
// single-cycle register read): a multi-cycle M20K read would break the
// full-FIFO push+pop same-cycle slot-free concurrency, and the shallow
// streaming path gains no capacity benefit from M20K. Its port request is
// therefore tied off.
assign pop_rd_addr  = '0;
assign pop_rd_issue = 1'b0;
|
||
// Unpacked view of grad_word_q, one signal per prim_attr_t field. Names
// carrying the gw_d_ prefix are sinks for fields the gradient solve does
// not read.
logic        [11:0] gw_v0x, gw_v0y, gw_v1x, gw_v1y, gw_v2x, gw_v2y;
logic        [2:0]  gw_d_bias;
logic signed [31:0] gw_sa;
logic        [63:0] gw_d_color, gw_c0, gw_c1, gw_c2;
logic        [8:0]  gw_d_fbp;
logic        [5:0]  gw_d_fbw, gw_d_psm;
logic        [1:0]  gw_d_bpp;
logic               gw_d_tme;
logic        [10:0] gw_d_u0, gw_d_v0, gw_d_u1, gw_d_v1;
logic        [31:0] gw_d_texb;
logic        [13:0] gw_d_tbw;
logic        [5:0]  gw_d_tpsm;
logic        [1:0]  gw_d_wms, gw_d_wmt;
logic        [3:0]  gw_d_tw, gw_d_th;
logic signed [31:0] gw_d_dudx, gw_d_dvdy;
logic               gw_d_abe, gw_d_flin;
logic        [1:0]  gw_d_aa, gw_d_ab, gw_d_ac, gw_d_ad;
logic        [7:0]  gw_d_afix;
logic               gw_d_zte;
logic        [1:0]  gw_d_ztst;
logic               gw_d_zmsk;
logic        [8:0]  gw_d_zbp;
logic        [31:0] gw_d_zval;
logic        [31:0] gw_z0;
logic        [10:0] gw_u0b, gw_v0b;
logic        [31:0] gw_u1v, gw_u2v;
logic               gw_d_persp;
logic        [23:0] gw_s0b, gw_t0b, gw_q0b;
logic        [63:0] gw_stq1, gw_stq2;
logic        [23:0] gw_q1, gw_q2;
logic        [31:0] gw_v1z, gw_v2z;

// Splits grad_word_q into its fields, derives the per-vertex grad_ld_*
// inputs, and evaluates the 14 pre-shifted numerators of steps 0..13
// (even step = d/dx, odd step = d/dy) plus the shared divisor.
// grad_load_num is indexed with constants only, one element per statement.
always_comb begin
    // ---- field split, in pack order (MSB field first) ----
    { gw_v0x, gw_v0y, gw_v1x, gw_v1y, gw_v2x, gw_v2y,
      gw_d_bias, gw_sa,
      gw_d_color, gw_c0, gw_c1, gw_c2,
      gw_d_fbp, gw_d_fbw, gw_d_psm, gw_d_bpp,
      gw_d_tme, gw_d_u0, gw_d_v0, gw_d_u1, gw_d_v1,
      gw_d_texb, gw_d_tbw, gw_d_tpsm,
      gw_d_wms, gw_d_wmt, gw_d_tw, gw_d_th,
      gw_d_dudx, gw_d_dvdy,
      gw_d_abe, gw_d_flin,
      gw_d_aa, gw_d_ab, gw_d_ac, gw_d_ad, gw_d_afix,
      gw_d_zte, gw_d_ztst, gw_d_zmsk, gw_d_zbp, gw_d_zval,
      gw_z0, gw_u0b, gw_v0b, gw_u1v, gw_u2v,
      gw_d_persp, gw_s0b, gw_t0b, gw_q0b,
      gw_stq1, gw_stq2, gw_q1, gw_q2,
      gw_v1z, gw_v2z } = grad_word_q;

    // ---- per-vertex inputs of the solve ----
    grad_ld_v0x = gw_v0x;    grad_ld_v0y = gw_v0y;
    grad_ld_v1x = gw_v1x;    grad_ld_v1y = gw_v1y;
    grad_ld_v2x = gw_v2x;    grad_ld_v2y = gw_v2y;

    grad_ld_c0  = gw_c0;
    grad_ld_c1  = gw_c1;
    grad_ld_c2  = gw_c2;

    grad_ld_z0  = gw_z0;
    grad_ld_z1  = gw_v1z;
    grad_ld_z2  = gw_v2z;

    // u1v / u2v hold u in bits [10:0] and v in bits [26:16].
    grad_ld_u0  = gw_u0b;
    grad_ld_u1  = gw_u1v[10:0];
    grad_ld_u2  = gw_u2v[10:0];
    grad_ld_v0u = gw_v0b;
    grad_ld_v1u = gw_u1v[26:16];
    grad_ld_v2u = gw_u2v[26:16];

    // Shared divisor: the stored signed area.
    grad_load_det = gw_sa;

    // ---- color channels: byte lanes [7:0], [15:8], [23:16], [31:24] of
    // ---- each vertex color, zero-extended to 32 bits
    grad_load_num[0] = grad_num_dadx(
        32'(grad_ld_c0[7:0]), 32'(grad_ld_c1[7:0]), 32'(grad_ld_c2[7:0]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[1] = grad_num_dady(
        32'(grad_ld_c0[7:0]), 32'(grad_ld_c1[7:0]), 32'(grad_ld_c2[7:0]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[2] = grad_num_dadx(
        32'(grad_ld_c0[15:8]), 32'(grad_ld_c1[15:8]), 32'(grad_ld_c2[15:8]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[3] = grad_num_dady(
        32'(grad_ld_c0[15:8]), 32'(grad_ld_c1[15:8]), 32'(grad_ld_c2[15:8]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[4] = grad_num_dadx(
        32'(grad_ld_c0[23:16]), 32'(grad_ld_c1[23:16]), 32'(grad_ld_c2[23:16]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[5] = grad_num_dady(
        32'(grad_ld_c0[23:16]), 32'(grad_ld_c1[23:16]), 32'(grad_ld_c2[23:16]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[6] = grad_num_dadx(
        32'(grad_ld_c0[31:24]), 32'(grad_ld_c1[31:24]), 32'(grad_ld_c2[31:24]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[7] = grad_num_dady(
        32'(grad_ld_c0[31:24]), 32'(grad_ld_c1[31:24]), 32'(grad_ld_c2[31:24]),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);

    // ---- Z (steps 8/9): the 32-bit values are passed through unchanged
    grad_load_num[8] = grad_num_dadx(
        grad_ld_z0, grad_ld_z1, grad_ld_z2,
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[9] = grad_num_dady(
        grad_ld_z0, grad_ld_z1, grad_ld_z2,
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);

    // ---- Textured-triangle rung — U (steps 10/11) and V (steps 12/13).
    // U/V are 11-bit unsigned integer texel coords, zero-extended to the
    // 32-bit attribute input; same shared divisor as colour / Z.
    grad_load_num[10] = grad_num_dadx(
        32'(grad_ld_u0), 32'(grad_ld_u1), 32'(grad_ld_u2),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[11] = grad_num_dady(
        32'(grad_ld_u0), 32'(grad_ld_u1), 32'(grad_ld_u2),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[12] = grad_num_dadx(
        32'(grad_ld_v0u), 32'(grad_ld_v1u), 32'(grad_ld_v2u),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
    grad_load_num[13] = grad_num_dady(
        32'(grad_ld_v0u), 32'(grad_ld_v1u), 32'(grad_ld_v2u),
        grad_ld_v0x, grad_ld_v0y, grad_ld_v1x, grad_ld_v1y, grad_ld_v2x, grad_ld_v2y);
end
|
||
|
||
// Ch301 perspective — per-vertex S/T/Q loads plus the six extra gradient
// numerators (steps 14..19). Everything sits inside a generate so that
// grad_load_num[14..19], which only exist when GRAD_STEPS==20, are never
// referenced in the param=0 build.
//
// S/T/Q are 24-bit fixed-point values. Each is zero-extended to the
// signed[31:0] attribute input of the SAME grad_num_dadx/grad_num_dady
// functions the U/V steps use, and shares their divisor (grad_load_det),
// i.e. the same affine plane solve over S_fp/T_fp/Q_fp. Only the linear
// screen-space interpolation of the three attributes is set up here; the
// per-pixel perspective divide (S/Q, T/Q) is done LATER by gs_persp_uv on
// the emit side (human-wired).
generate
    if (!PERSPECTIVE_CORRECT) begin : g_no_grad_stq
        // Perspective numerators are not compiled in: tie every S/T/Q load
        // wire low so each one still has exactly one driver.
        assign grad_ld_s0 = 24'd0;
        assign grad_ld_s1 = 24'd0;
        assign grad_ld_s2 = 24'd0;
        assign grad_ld_t0 = 24'd0;
        assign grad_ld_t1 = 24'd0;
        assign grad_ld_t2 = 24'd0;
        assign grad_ld_q0 = 24'd0;
        assign grad_ld_q1 = 24'd0;
        assign grad_ld_q2 = 24'd0;
    end else begin : g_grad_stq
        always_comb begin
            // ---- attribute loads -------------------------------------
            // Vertices 1/2 carry S and T packed as {16'd0, T[47:24], S[23:0]}.
            grad_ld_s0 = gw_s0b;
            grad_ld_s1 = gw_stq1[23:0];
            grad_ld_s2 = gw_stq2[23:0];

            grad_ld_t0 = gw_t0b;
            grad_ld_t1 = gw_stq1[47:24];
            grad_ld_t2 = gw_stq2[47:24];

            grad_ld_q0 = gw_q0b;
            grad_ld_q1 = gw_q1;
            grad_ld_q2 = gw_q2;

            // ---- S: d/dx = step 14, d/dy = step 15 -------------------
            grad_load_num[14] = grad_num_dadx(
                {8'd0, grad_ld_s0}, {8'd0, grad_ld_s1}, {8'd0, grad_ld_s2},
                grad_ld_v0x, grad_ld_v0y,
                grad_ld_v1x, grad_ld_v1y,
                grad_ld_v2x, grad_ld_v2y);
            grad_load_num[15] = grad_num_dady(
                {8'd0, grad_ld_s0}, {8'd0, grad_ld_s1}, {8'd0, grad_ld_s2},
                grad_ld_v0x, grad_ld_v0y,
                grad_ld_v1x, grad_ld_v1y,
                grad_ld_v2x, grad_ld_v2y);

            // ---- T: d/dx = step 16, d/dy = step 17 -------------------
            grad_load_num[16] = grad_num_dadx(
                {8'd0, grad_ld_t0}, {8'd0, grad_ld_t1}, {8'd0, grad_ld_t2},
                grad_ld_v0x, grad_ld_v0y,
                grad_ld_v1x, grad_ld_v1y,
                grad_ld_v2x, grad_ld_v2y);
            grad_load_num[17] = grad_num_dady(
                {8'd0, grad_ld_t0}, {8'd0, grad_ld_t1}, {8'd0, grad_ld_t2},
                grad_ld_v0x, grad_ld_v0y,
                grad_ld_v1x, grad_ld_v1y,
                grad_ld_v2x, grad_ld_v2y);

            // ---- Q: d/dx = step 18, d/dy = step 19 -------------------
            grad_load_num[18] = grad_num_dadx(
                {8'd0, grad_ld_q0}, {8'd0, grad_ld_q1}, {8'd0, grad_ld_q2},
                grad_ld_v0x, grad_ld_v0y,
                grad_ld_v1x, grad_ld_v1y,
                grad_ld_v2x, grad_ld_v2y);
            grad_load_num[19] = grad_num_dady(
                {8'd0, grad_ld_q0}, {8'd0, grad_ld_q1}, {8'd0, grad_ld_q2},
                grad_ld_v0x, grad_ld_v0y,
                grad_ld_v1x, grad_ld_v1y,
                grad_ld_v2x, grad_ld_v2y);
        end
    end
endgenerate
|
||
|
||
// The SINGLE divider of the triangle-setup path: a combinational divide of
// the currently selected numerator by the captured determinant. It mirrors
// the previous per-call expression exactly — (num <<< 16) / det, truncated
// to [31:0] — where the <<<16 has already been folded into grad_num_q by
// grad_num_dadx/dady. A zero determinant produces a zero quotient.
logic signed [55:0] grad_quo;
always_comb begin
    // Divide by the determinant, sign-extended from 32 to 56 bits.
    grad_quo = grad_num_q[grad_step]
             / $signed({{24{grad_det_q[31]}}, grad_det_q});
    // Zero determinant: discard the divide result and report zero instead.
    if (grad_det_q == 32'sd0)
        grad_quo = 56'sd0;
end

// The gradient value is the low 32 bits of the quotient.
logic signed [31:0] grad_result;
assign grad_result = grad_quo[31:0];
|
||
|
||
// Ch352 — SEQUENTIAL divider (GRAD_SEQ_DIVIDER=1): registered iterations,
// no combinational cone. The FSM pulses div_start with
// grad_num_q[grad_step] / sign-extended grad_det_q, waits for div_done and
// captures div_quo[31:0] into grad_result_q. Bit-exact to the
// combinational `/` (tb_gs_grad_divider). With GRAD_SEQ_DIVIDER=0 the
// instance is generate-stripped and the result/status nets tie inert, so
// the combinational path is the one in use.
logic               div_start;
logic signed [55:0] div_quo;
logic               div_busy_w, div_done_w;
generate
    if (!GRAD_SEQ_DIVIDER) begin : g_no_seqdiv
        // No sequential divider: quotient and handshake held at zero.
        assign div_quo    = 56'sd0;
        assign div_busy_w = 1'b0;
        assign div_done_w = 1'b0;
    end else begin : g_seqdiv
        gs_grad_divider #(
            .W (56)
        ) u_grad_div (
            .clk   (clk),
            .rst_n (rst_n),
            .start (div_start),
            .num   (grad_num_q[grad_step]),
            .den   ($signed({{24{grad_det_q[31]}}, grad_det_q})),
            .quo   (div_quo),
            .busy  (div_busy_w),
            .done  (div_done_w)
        );
    end
endgenerate
|
||
|
||
// Ch352 — gradient FIFO write data.
//   * GRAD_SEQ_DIVIDER set      : the captured div_quo lives in grad_result_q.
//   * else, GRAD_DIV_CYCLES > 1 : the registered grad_result_q.
//   * else (GRAD_DIV_CYCLES==1) : the combinational divide is written
//                                 directly (byte-identical original path).
wire signed [31:0] grad_wr_data =
    (GRAD_SEQ_DIVIDER || (GRAD_DIV_CYCLES != 1)) ? grad_result_q
                                                 : grad_result;

// Ch172 — FIFO-full is exposed so the wrapper can route it back to the GIF
// parser's in_ready gate.
// Brick-3 fix — backpressure is the REAL fifo_full and nothing else. The
// gradient setup engine no longer gates the PUSH side: gating push on
// grad_busy dropped back-to-back triangles that arrived during a prior
// triangle's ~10-cycle gradient solve (tb_ee_core_gif_raster_queue). The
// engine is decoupled instead — it self-schedules across FIFO slots
// flagged grad_pending and solves each slot's gradients in turn, while
// pop_ok holds a slot until its own gradients are ready. The queue
// therefore accepts every incoming primitive.
assign raster_fifo_full = fifo_full;
|
||
|
||
// ==================================================================
|
||
// COMBINED PROBE — per-pixel 5-beat textured+alpha+depth FSM.
|
||
//
|
||
// When ras_combined is set (only at COMBINED_TAZ=1, for a TRI that is
|
||
// TME PSMCT32 DECAL + ABE source-over + ZTE PSMZ32), the legacy S0
|
||
// walker is HELD at the current candidate pixel (ras_cur_x/y) and a
|
||
// dedicated 5-beat FSM runs ONE pixel at a time. The single read2
|
||
// port (arbitrated in the BRAM top) is used by at most ONE consumer
|
||
// per cycle, and the single write port emits at most one beat per
|
||
// cycle, so the existing top-level mux + mutual-exclusion $error
|
||
// asserts keep holding without any top change.
|
||
//
|
||
// Beat schedule (read2 is a 1-cycle REGISTERED read in the BRAM top:
|
||
// an enable+addr presented on beat N returns data on beat N+1):
|
||
// CB_Z (0): if pixel inside -> assert z_rd_en + z_rd_addr (held Z
|
||
// addr). No texel/dest read, no write.
|
||
// If pixel NOT inside -> advance walker immediately.
|
||
// CB_ZW (1): z_rd_data ready -> latch stored Z, compute ztest_pass.
|
||
// PASS -> assert texel request (tex in_valid) with held
|
||
// (u,v); texel returns at beat 2.
|
||
// FAIL -> NO texel/dest read, NO write; advance walker.
|
||
// CB_T (2): s1_tex_color ready -> latch as source color Cs/As.
|
||
// Assert fb_rd_en + fb_rd_addr (held FB addr); dest
|
||
// color returns at beat 3.
|
||
// CB_FB (3): dest color ready -> blend (Cs=texel, Cd=dest,
|
||
// As=texel[31:24]). WRITE #1: raster_pixel_emit color
|
||
// (PSMCT32, be=1111) to the FB addr.
|
||
// CB_ZWR (4): WRITE #2: raster_pixel_emit frag_z to the Z addr
|
||
// (be=1111, psm=0). If ZMSK, skip the write but still
|
||
// consume the beat. Then advance walker.
|
||
//
|
||
// The walker advances to the next candidate pixel only after BOTH the
|
||
// color write (beat 3) AND the Z write (beat 4) have completed (or
|
||
// after a not-inside / depth-FAIL early-out, where nothing is read or
|
||
// written for that pixel).
|
||
// ==================================================================
|
||
// Combined-probe beat encoding. Values are spelled out; they match the
// declaration order CB_Z, CB_ZW, CB_T, CB_FB, CB_ZWR, CB_TWAIT.
//
// Ch310 — CB_TWAIT is an extra beat between CB_ZW and CB_T, entered ONLY
// on the bilinear path (bili_now). It HOLDS the FSM while the 4-tap
// gs_texture_unit sampler runs its multi-beat read. With BILINEAR_ENABLE=0
// bili_now is a compile-time-constant 0, so CB_TWAIT is never entered and
// the encoding/behavior of the other beats is unchanged (byte-identical).
typedef enum logic [2:0] {
    CB_Z     = 3'd0,
    CB_ZW    = 3'd1,
    CB_T     = 3'd2,
    CB_FB    = 3'd3,
    CB_ZWR   = 3'd4,
    CB_TWAIT = 3'd5
} comb_beat_e;

comb_beat_e comb_state_r;   // current beat (register)
|
||
|
||
// Inside test for the HELD candidate pixel, purely combinational on the
// frozen walker coords (ras_cur_x / ras_cur_y). A combined primitive is
// always a TRIANGLE, so this is the biased edge-function test — the same
// math as the S1 stage, evaluated at ras_cur_* instead.
logic [31:0] comb_e0, comb_e1, comb_e2;
logic        comb_pix_inside_w;
always_comb begin
    // One edge function per directed edge: v0->v1, v1->v2, v2->v0.
    comb_e0 = edge_fn(ras_cur_x, ras_cur_y,
                      ras_v0_x, ras_v0_y, ras_v1_x, ras_v1_y);
    comb_e1 = edge_fn(ras_cur_x, ras_cur_y,
                      ras_v1_x, ras_v1_y, ras_v2_x, ras_v2_y);
    comb_e2 = edge_fn(ras_cur_x, ras_cur_y,
                      ras_v2_x, ras_v2_y, ras_v0_x, ras_v0_y);

    // Inside when every biased edge value is <= 0 as a signed number.
    comb_pix_inside_w =
           ($signed(comb_e0 + {31'd0, ras_bias[0]}) <= 32'sd0)
        && ($signed(comb_e1 + {31'd0, ras_bias[1]}) <= 32'sd0)
        && ($signed(comb_e2 + {31'd0, ras_bias[2]}) <= 32'sd0);
end
|
||
|
||
// Held-pixel Z interpolation.
//   comb_dx / comb_dy : offset of the held walker pixel from post-swap v0.
//                       Both coordinates are zero-extended to 32 bits
//                       before the signed subtract.
//   comb_frag_z       : interpolated fragment Z (affine, same evaluator as
//                       the legacy TRI Z path).
logic signed [31:0] comb_dx, comb_dy;
logic        [31:0] comb_frag_z;
always_comb begin
    comb_dx     = $signed({20'd0, ras_cur_x}) - $signed({20'd0, ras_v0_x});
    comb_dy     = $signed({20'd0, ras_cur_y}) - $signed({20'd0, ras_v0_y});
    comb_frag_z = interp_affine_z(ras_z0, ras_dz_dx, ras_dz_dy,
                                  comb_dx, comb_dy);
end
|
||
|
||
// Byte addresses of the held pixel in the framebuffer and the Z buffer —
// linear PSMCT32 layout (the combined dest is PSMCT32, non-swizzled in
// this probe). Mirrors s2_fb_addr_linear / s2_z_addr, but on the held
// walker coords.
//   pixel index = y * (FBW * 64) + x
//   FB address  = FBP * 2048 + (index << ras_bpp_shift)
//   Z  address  = ZBP * 2048 + index * 4
logic [31:0] comb_pixel_index;
logic [31:0] comb_fb_addr;
logic [31:0] comb_z_addr;
always_comb begin
    comb_pixel_index = {20'd0, ras_cur_x}
                     + ({20'd0, ras_cur_y} * ({26'd0, ras_fbw} << 6));
    comb_fb_addr     = (comb_pixel_index << ras_bpp_shift)
                     + ({23'd0, ras_fbp} << 11);
    comb_z_addr      = {comb_pixel_index[29:0], 2'b00}
                     + ({23'd0, ras_zbp} << 11);
end
|
||
|
||
// Read-strobe contributions of the combined FSM (driven only in combined
// mode), one per beat:
//   comb_z_req   — beat 0, inside pixel only.
//   comb_tex_req — beat 1, depth PASS only (feeds the texture unit
//                  in_valid; the texel returns on beat 2).
//   comb_fb_req  — beat 2, depth PASS only.
logic comb_z_req, comb_tex_req, comb_fb_req;

// Per-pixel state latched by the combined FSM.
logic        comb_ztest_pass_r;   // depth-test result, latched at beat 1
logic [31:0] comb_zstored_r;      // stored Z captured at beat 1
logic [31:0] comb_cs_r;           // source color (texel ABGR)
logic [7:0]  comb_as_r;           // source alpha (texel[31:24])
|
||
|
||
// Ch333 — TEX0.TFX (list-level) texture function:
//   0 = MODULATE (texel * color), 1 = DECAL (texel replaces).
// The combined-TAZ source color comb_cs_r honors it. DECAL is the
// effective default for the Ch330-332 anchors, whose TEX0 carries TFX=1.
// MODULATE feeds the staging RGBAQ (provoking-vertex color ras_c2_q)
// through the texel; with a unity (0x80) texture the output IS the RGBAQ
// color, so the host controls per-primitive color at runtime. A per-vertex
// gouraud gradient is a later rung (it needs the interpolated color
// aligned to s1_tex_color at CB_T).
wire [1:0] comb_tfx;
wire       comb_modulate;
assign comb_tfx      = tex0_1_q[36:35];
assign comb_modulate = (comb_tfx == 2'b00);
|
||
// gs_mod8 — modulate one 8-bit channel: (t * c) >> 7, saturated to 0xFF.
// A factor of 0x80 on either side is unity (the other operand passes
// through unchanged).
function automatic logic [7:0] gs_mod8(input logic [7:0] t, input logic [7:0] c);
    logic [15:0] scaled;
    begin
        scaled = ({8'd0, t} * {8'd0, c}) >> 7;
        if (scaled > 16'd255)
            gs_mod8 = 8'hFF;
        else
            gs_mod8 = scaled[7:0];
    end
endfunction

// gs_modulate_abgr — MODULATE a packed ABGR texel by a packed ABGR color.
// R, G and B each go through gs_mod8; alpha is taken from the texel
// unchanged (opaque blend via a unity texture).
function automatic logic [31:0] gs_modulate_abgr(input logic [31:0] tex, input logic [31:0] col);
    logic [7:0] mod_r, mod_g, mod_b;
    begin
        mod_r = gs_mod8(tex[7:0],   col[7:0]);
        mod_g = gs_mod8(tex[15:8],  col[15:8]);
        mod_b = gs_mod8(tex[23:16], col[23:16]);
        gs_modulate_abgr = {tex[31:24], mod_b, mod_g, mod_r};
    end
endfunction
|
||
|
||
// Ch305+ — width of the tile COLOR RAM: 16 bits (PSMCT16) when
// TILE_COLOR_PSMCT16 is set, otherwise 32 bits (PSMCT32). At the default
// (OFF) it is 32, so the COLOR RAM and its data nets keep their legacy
// 32-bit width (byte-identical).
localparam int TILE_COLOR_W = !TILE_COLOR_PSMCT16 ? 32 : 16;

// Tile RAM data nets (module scope; assigned near the tile RAM).
logic [TILE_COLOR_W-1:0] tile_color_rdata;   // registered read data, color RAM
logic [TILE_COLOR_W-1:0] tile_color_wdata;
logic [31:0]             tile_z_rdata;       // registered read data, Z RAM
logic [31:0]             tile_z_wdata;

// Ch304 — the Z value the combined depth test compares against.
//   legacy VRAM combined path (TILE_LOCAL=0): the VRAM read-back z_rd_data.
//   tile-local mode: the on-chip tile Z RAM read tile_z_rdata (registered,
//   valid at beat 1).
// tile_active is constant 0 at TILE_LOCAL=0, so this collapses to
// z_rd_data (byte-identical).
logic [31:0] comb_z_compare_w;
assign comb_z_compare_w = !tile_active ? z_rd_data : tile_z_rdata;
|
||
|
||
// Depth test for the held pixel, evaluated combinationally on beat 1
// (CB_ZW). The stored Z is the 1-cycle-registered read of the address
// presented on beat 0 (CB_Z), i.e. THIS pixel's stored Z. Ch304 — it
// arrives through comb_z_compare_w: z_rd_data (VRAM) at TILE_LOCAL=0,
// tile_z_rdata (on-chip) in tile mode.
//
// ras_ztst selects the compare; GEQUAL/GREATER test frag_z against the
// stored Z (PS2: larger Z = nearer).
logic comb_ztest_pass_w;
always_comb begin
    // Default covers ZTST=0 (NEVER) and any selector that matches no arm.
    comb_ztest_pass_w = 1'b0;
    case (ras_ztst)
        2'd1:    comb_ztest_pass_w = 1'b1;                               // ALWAYS
        2'd2:    comb_ztest_pass_w = (comb_frag_z >= comb_z_compare_w);  // GEQUAL
        2'd3:    comb_ztest_pass_w = (comb_frag_z >  comb_z_compare_w);  // GREATER
        default: ;                                                       // NEVER
    endcase
end
|
||
|
||
// Ch304 — destination color fed to the combined blend (forward net; it is
// driven near the tile RAM).
//   legacy VRAM combined path (TILE_LOCAL=0): the VRAM read-back fb_rd_data.
//   tile-local mode (tile_active): the ON-CHIP tile color RAM read
//   tile_color_rdata, valid one cycle after the beat-2 raddr is presented.
// With tile_active constant 0 at TILE_LOCAL=0 this is always fb_rd_data,
// so the combined VRAM path is byte-identical.
logic [31:0] comb_blend_cd;

// Combined blend — a DEDICATED gs_alpha_blend instance, so the legacy
// u_blend datapath is untouched (byte-identical at param=0).
//   Cs = sampled texel, Cd = dest pixel read back, As = texel alpha.
// Same source-over math as the legacy blend. Without COMBINED_TAZ the
// instance is dropped and the outputs are tied to zero.
logic [7:0] comb_blend_r, comb_blend_g, comb_blend_b, comb_blend_a;
generate
    if (!COMBINED_TAZ) begin : g_no_comb_blend
        assign comb_blend_r = 8'd0;
        assign comb_blend_g = 8'd0;
        assign comb_blend_b = 8'd0;
        assign comb_blend_a = 8'd0;
    end else begin : g_comb_blend
        gs_alpha_blend #(
            .ALPHA_MODES (ALPHA_MODES_ENABLE)
        ) u_comb_blend (
            // Source color / alpha: the latched texel.
            .cs_r  (comb_cs_r[7:0]),
            .cs_g  (comb_cs_r[15:8]),
            .cs_b  (comb_cs_r[23:16]),
            .as    (comb_as_r),
            // Destination color / alpha: the read-back pixel.
            .cd_r  (comb_blend_cd[7:0]),
            .cd_g  (comb_blend_cd[15:8]),
            .cd_b  (comb_blend_cd[23:16]),
            .ad    (comb_blend_cd[31:24]),
            // Brick 2c — generic GS ALPHA selectors (read only when
            // ALPHA_MODES_ENABLE=1; default OFF -> source-over).
            .a_sel (ras_alpha_a),
            .b_sel (ras_alpha_b),
            .c_sel (ras_alpha_c),
            .d_sel (ras_alpha_d),
            .fix   (ras_alpha_fix),
            // Blended result.
            .cv_r  (comb_blend_r),
            .cv_g  (comb_blend_g),
            .cv_b  (comb_blend_b),
            .a_out (comb_blend_a)
        );
    end
endgenerate
|
||
|
||
// ==================================================================
// Ch304 TILE-LOCAL render mode — phase encodings, declared EARLY so the
// combined-active gate further down can reference them.
//
// tile_active = TILE_LOCAL && ras_combined : the in-flight primitive is
//   the combined T+A+Z triangle AND it is rendered into the on-chip tile
//   scratchpad. Constant 0 at TILE_LOCAL=0 (generate tie-off), so every
//   signal derived from it collapses to its legacy value and the combined
//   VRAM path is byte-identical.
//
// tile_phase_r overlays the existing raster_state FSM (whose 2-bit
// {R_IDLE,R_SCAN,R_DRAIN} encoding stays untouched):
//   TP_OFF    : not in a tile render (or a non-tile primitive)
//   TP_CLEAR  : sweeping the tile RAMs to the clear color/Z
//   TP_RENDER : the combined FSM runs, retargeted to the tile RAMs
//   TP_FLUSH  : sweeping the tile color RAM out to the framebuffer
//   TP_BIN    : (BIN_BUFFER_ENABLE only) pre-render binning sweep —
//               classify every (prim,tile) pair into the per-tile bins.
//               Touches NO tile RAM, so the tile-RAM driver's case-default
//               keeps it inert.
//   TP_RELOAD : (Ch323, TILE_SPILL_ENABLE only) load tile color+Z from the
//               staging RAM before RENDER.
//   TP_ZFLUSH : (Ch323, TILE_SPILL_ENABLE only) emit the tile Z RAM to
//               LPDDR after the color FLUSH.
//
// The phase is 3 bits wide to hold the encodings beyond TP_FLUSH. TP_BIN
// is never assigned at the default BIN_BUFFER_ENABLE=0, and TP_RELOAD /
// TP_ZFLUSH are never assigned at the default TILE_SPILL_ENABLE=0, so the
// synthesised state set stays TP_OFF..TP_FLUSH — byte-identical.
// ==================================================================
localparam logic [2:0] TP_OFF    = 3'd0,
                       TP_CLEAR  = 3'd1,
                       TP_RENDER = 3'd2,
                       TP_FLUSH  = 3'd3,
                       TP_BIN    = 3'd4,
                       TP_RELOAD = 3'd5,
                       TP_ZFLUSH = 3'd6;
|
||
|
||
// tile_active — follows ras_combined when TILE_LOCAL is set; otherwise it
// is tied to constant 0.
generate
    if (!TILE_LOCAL) begin : g_no_tile_active
        assign tile_active = 1'b0;
    end else begin : g_tile_active
        assign tile_active = ras_combined;
    end
endgenerate
|
||
|
||
// Tile-phase register and sweep counter. Both are dead (held 0) at
// TILE_LOCAL=0: tile_active is constant 0 there, and the always_ff arms
// that write them are all `if (tile_active ...)`-gated.
logic [2:0] tile_phase_r;   // current tile phase, one of the TP_* encodings
logic [8:0] tile_sweep_r;   // 0..256 sweep counter (CLEAR + FLUSH)

// The phase is exposed unchanged on tile_phase.
assign tile_phase = tile_phase_r;
|
||
|
||
// ---- Ch323 tile COLOR+Z spill/reload signals (TILE_SPILL_ENABLE only) ----
//
// Z-flush emit (TP_ZFLUSH): one cycle after a Z read address is presented,
// the registered tile_z_rdata is emitted to the LPDDR Z-backing through
// the exposed z_flush_* stream.
logic        z_flush_emit_q;                 // pipeline strobe (mirrors flush_emit_q for color)
logic [7:0]  z_flush_idx_q;                  // tile index latched alongside the emit
logic [11:0] z_flush_ox_q, z_flush_oy_q;

// Tile RELOAD (TP_RELOAD): on entry, pulse reload_start_o (arms the de25
// staging fill) and wait for tile_reload_ready_i; then sweep 256 indices
// presenting tile_reload_raddr_o. The staging engine returns
// tile_reload_color_i/z_i one cycle later and they are written into the
// tile RAMs. All 256 entries are written before TP_RENDER (all-or-nothing —
// no primitive is evaluated on a partial tile).
logic       reload_wait;                     // waiting for staging fill (tile_reload_ready_i)
logic       reload_wr_we;                    // tile-RAM write window (1 cyc behind the raddr)
logic [7:0] reload_wr_addr;                  // tile index being written this beat
|
||
|
||
// ==================================================================
// Ch305 TILE GRID — outer loop over TILE_COLS x TILE_ROWS tiles.
//
// There are N_TILES tiles, one per (col,row). The grid loop is gated
// entirely by tile_active (dead at TILE_LOCAL=0). At TILE_COLS=TILE_ROWS=1
// the single tile sits at the origin, so tile_ox=tile_oy=0 and N_TILES=1:
// exactly one iteration, byte-identical to Ch303.
//
//   prim_x_min/max, prim_y_min/max : a COPY of the popped primitive's
//       bbox, kept for the whole grid render because ras_x_min/max are
//       OVERWRITTEN by the per-tile clipped bbox on every tile.
//   tile_col_r / tile_row_r : current tile coordinate in the grid.
//   tile_ox / tile_oy       : current tile origin in screen pixels
//       (= col*16 / row*16), 12-bit to match the coord datapath.
// ==================================================================
localparam int N_TILES = TILE_ROWS * TILE_COLS;

logic [11:0] prim_x_min, prim_x_max, prim_y_min, prim_y_max;

// The grid counters are a fixed 12 bits wide (the coordinate width), so
// they are valid registers for any grid size, including 1x1.
logic [11:0] tile_col_r;    // 0..TILE_COLS-1
logic [11:0] tile_row_r;    // 0..TILE_ROWS-1

// Tile origin = counter * 16, truncated to 12 bits.
logic [11:0] tile_ox, tile_oy;
assign tile_ox = {tile_col_r[7:0], 4'd0};   // col*16
assign tile_oy = {tile_row_r[7:0], 4'd0};   // row*16
|
||
// Ch323 — clean-Z BOOTSTRAP: one "this tile has been spilled to LPDDR"
// valid bit per tile.
//   First render of a tile (valid=0): do NOT reload — LPDDR holds garbage
//   or a stale tile — render from the local CLEAR instead.
//   After the tile's color+Z FLUSH completes its bit is set, and later
//   passes RELOAD.
// Without this the first batch would reload garbage Z and the GEQUAL depth
// test would reject every fragment (nothing renders). Reset clears all bits.
logic [N_TILES-1:0] spill_valid;

// Linear id of the current tile: row * TILE_COLS + col.
wire [31:0] tile_id_w;
assign tile_id_w = (tile_row_r * TILE_COLS) + tile_col_r;
|
||
|
||
// Per-tile clip of the primitive bbox.
//
// Three rectangles are intersected: (prim bbox) n (tile 16x16 span) n
// (scissor rect). Mins are combined with an unsigned max, maxes with an
// unsigned min, each written as compare-then-select (width-safe, no
// $signed; iverilog-12: the compares are named, no parenthesized
// bit-select). At the default SCISSOR_ENABLE=0 the eff_sc* terms are
// 0/0xFFF, so the scissor stage is a no-op.
//
//   tile_*_lo/hi  : the current tile's span in screen pixels.
//   clip_*_pt     : prim bbox n tile span.
//   clip_*        : clip_*_pt n scissor rect.
//   tile_clip_empty : the primitive does not touch this tile. When set the
//       RENDER is SKIPPED (CLEAR -> FLUSH only) so the tile shows the
//       clear color.
logic [11:0] tile_x_lo, tile_x_hi, tile_y_lo, tile_y_hi;
logic [11:0] clip_x_min, clip_x_max, clip_y_min, clip_y_max;
logic [11:0] clip_x_min_pt, clip_x_max_pt, clip_y_min_pt, clip_y_max_pt;
logic        tile_clip_empty;
always_comb begin
    // Tile span: origin .. origin + 15 on each axis.
    tile_x_lo = tile_ox;
    tile_x_hi = 12'd15 + tile_ox;
    tile_y_lo = tile_oy;
    tile_y_hi = 12'd15 + tile_oy;

    // X axis: prim n tile, then n scissor.
    clip_x_min_pt = (tile_x_lo < prim_x_min)    ? prim_x_min    : tile_x_lo;
    clip_x_max_pt = (tile_x_hi > prim_x_max)    ? prim_x_max    : tile_x_hi;
    clip_x_min    = (eff_scx0 < clip_x_min_pt)  ? clip_x_min_pt : eff_scx0;
    clip_x_max    = (eff_scx1 > clip_x_max_pt)  ? clip_x_max_pt : eff_scx1;

    // Y axis: prim n tile, then n scissor.
    clip_y_min_pt = (tile_y_lo < prim_y_min)    ? prim_y_min    : tile_y_lo;
    clip_y_max_pt = (tile_y_hi > prim_y_max)    ? prim_y_max    : tile_y_hi;
    clip_y_min    = (eff_scy0 < clip_y_min_pt)  ? clip_y_min_pt : eff_scy0;
    clip_y_max    = (eff_scy1 > clip_y_max_pt)  ? clip_y_max_pt : eff_scy1;

    // Empty unless both axes still have min <= max.
    tile_clip_empty = !((clip_x_min <= clip_x_max)
                     && (clip_y_min <= clip_y_max));
end
|
||
|
||
// High on the last tile of the grid, i.e. col == TILE_COLS-1 AND
// row == TILE_ROWS-1. Both limits are the parameter minus one cast to
// 12 bits, so for a 1x1 grid the last tile is simply (0,0).
logic tile_is_last;
assign tile_is_last = (12'(TILE_COLS - 1) == tile_col_r)
                   && (12'(TILE_ROWS - 1) == tile_row_r);
|
||
|
||
// ==================================================================
// Ch305+ MULTI-PRIMITIVE TILED SCENE — per-tile primitive batch.
//
// Multiprim mode disables the streaming pop. A whole batch of
// TILE_PRIM_COUNT primitives (already in the FIFO, gradients solved) is
// rendered into each tile in FIFO order instead. Per tile, the on-chip
// color/Z RAMs are CLEARED once (TP_CLEAR) and then every primitive is
// rendered with no clear in between, so primitive p+1 composites over p.
// All regs/wires below are dead at the default TILE_MULTIPRIM=0 (no branch
// writes them).
// ==================================================================
logic [FIFO_CNT_W-1:0] prim_idx_r;       // current primitive within tile (0..prim_count_r-1)
logic [FIFO_CNT_W-1:0] prim_count_r;     // # primitives in the batch (latched at grid start)
logic [FIFO_PTR_W-1:0] grid_base_rptr;   // fifo_rptr at grid start = slot of primitive 0
|
||
|
||
// ==================================================================
// BIN BUFFER (BIN_BUFFER_ENABLE only) — per-tile ordered primitive lists.
//
// Before any tile renders, the TP_BIN sweep classifies every (prim,tile)
// pair: when primitive p's bbox n tile-t's 16x16 span n scissor is
// non-empty, p is APPENDED to tile t's bin
// (bin_prim[t][bin_n[t]] = p; bin_n[t]++). The overlap test is the SAME
// max/min clip-empty math used by mp_next_nonempty, so walking a bin in
// slot order yields the IDENTICAL primitive sequence the re-test path
// would have produced. At BIN_BUFFER_ENABLE=0 these registers are never
// written (TP_BIN unreached) and the bin-walk render arms are folded out
// -> synth-pruned dead state.
//
// NTILES = TILE_COLS*TILE_ROWS (=4 for the 2x2 grid). t = row*COLS + col,
// hence (col,row) = (t % TILE_COLS, t / TILE_COLS). A bin holds at most
// FIFO_DEPTH prims; prim_count_r <= FIFO_DEPTH, so the append never
// overflows.
// ==================================================================
localparam int NTILES = TILE_ROWS * TILE_COLS;

// Index/counter widths. $clog2(1)=0 (a 1x1 grid at the default), so both
// derivations are clamped to at least 1 bit — a zero-width register/cast
// is a Quartus FATAL ("zero width constant") AND an iverilog synth error.
//   CUR_T_W indexes 0..NTILES-1.
//   BIN_T_W holds 0..NTILES (the sweep counter can equal NTILES at done).
localparam int CUR_T_W = (NTILES <= 1) ? 1 : $clog2(NTILES);
localparam int BIN_T_W = (NTILES <= 1) ? 1 : $clog2(NTILES + 1);

logic [FIFO_CNT_W-1:0] bin_prim [0:NTILES-1][0:FIFO_DEPTH-1];   // prim INDEX per slot
logic [FIFO_CNT_W-1:0] bin_n    [0:NTILES-1];                   // # prims in each tile's bin
|
||
|
||
// Ch315 — CAPACITY DIAGNOSTICS (sim-visible; the TB reads them
// hierarchically, so no new ports). raster_overflow, the existing latch,
// flags a FIFO-full push; the registers below COUNT/peak those events so a
// stress run can quantify capacity pressure.
//   raster_overflow_count_r : # of dropped (FIFO-full) primitive pushes.
//   bin_occ_max_r  : peak per-tile bin occupancy seen this batch (the
//                    "full-ish bin" metric — how deep the busiest tile's
//                    bin got vs the FIFO_DEPTH ceiling).
//   bin_overflow_r : DEFENSIVE bin-capacity flag — set if a bin push would
//                    exceed FIFO_DEPTH. With the bin sized to the FIFO
//                    (M=N) it can never fire (the FIFO drops the prim
//                    first), but it makes the bin's own ceiling explicit
//                    and would catch a future M<N decoupling.
logic [15:0]           raster_overflow_count_r;
logic [FIFO_CNT_W-1:0] bin_occ_max_r;
logic                  bin_overflow_r;

// Ch329 — non-combined primitives have NO tile-local color/Z path (the
// tile RAMs + combined FSM are gated by tile_active == ras_combined), so
// the multiprim grid is COMBINED-TAZ-ONLY. A non-combined prim is REFUSED
// at binning (never added to a bin -> never loaded -> cannot stall the
// bin-walk) and counted here for visibility. (Full tile-local sprite
// support is future work.)
logic [15:0] tile_refused_count;
|
||
|
||
// Binning sweep counters — one (prim p, tile t) pair is classified per
// cycle. t is the OUTER counter and p the INNER one: p advances every
// cycle and t advances when p wraps past prim_count_r-1. For a FIXED tile
// t the prims are therefore visited in INCREASING p order, appends to a
// bin happen in p order, and draw order is preserved. The NTILES and
// prim_count_r bounds are register/param compares (not loop bounds), so no
// non-constant FOR-loop is introduced (Quartus-safe).
logic [BIN_T_W-1:0]    bin_t;        // current tile being classified (0..NTILES)
logic [FIFO_CNT_W-1:0] bin_p;        // current prim being classified (0..prim_count_r)
logic [FIFO_CNT_W-1:0] bin_slot_r;   // current slot within cur-tile's bin (render walk)

// Screen bounds of the tile being binned, derived combinationally from
// bin_t so the overlap test sees the tile currently under classification.
//   col = t % TILE_COLS, row = t / TILE_COLS (both 0/1 for the 2x2 grid);
//   span = coord*16 .. coord*16 + 15 on each axis.
logic [11:0] bin_t_col, bin_t_row, bin_t_x_lo, bin_t_x_hi, bin_t_y_lo, bin_t_y_hi;
assign bin_t_col  = 12'(bin_t % TILE_COLS);
assign bin_t_row  = 12'(bin_t / TILE_COLS);
assign bin_t_x_lo = {bin_t_col[7:0], 4'd0};
assign bin_t_x_hi = bin_t_x_lo + 12'd15;
assign bin_t_y_lo = {bin_t_row[7:0], 4'd0};
assign bin_t_y_hi = bin_t_y_lo + 12'd15;
|
||
|
||
// Overlap test for the pair (prim bin_p, tile bin_t): true when the prim's
// FIFO bbox n tile-bin_t bbox n scissor is non-empty. The max/min/empty
// math is IDENTICAL to mp_next_nonempty, only indexed by bin_p and the
// bin_t bounds. The indexed array reads are pulled into locals first
// (iverilog-12 variable-index-in-always_comb).
logic [FIFO_PTR_W-1:0] bin_slot_p;   // fifo slot of prim bin_p
logic [11:0]           binf_fx_lo, binf_fx_hi, binf_fy_lo, binf_fy_hi;
logic [11:0]           bin_cx_lo, bin_cx_hi, bin_cy_lo, bin_cy_hi;
logic                  bin_overlap;
always_comb begin
    // FIFO slot of the prim, then its stored bbox.
    bin_slot_p = grid_base_rptr + FIFO_PTR_W'(bin_p);
    binf_fx_lo = fifo_x_min[bin_slot_p];
    binf_fx_hi = fifo_x_max[bin_slot_p];
    binf_fy_lo = fifo_y_min[bin_slot_p];
    binf_fy_hi = fifo_y_max[bin_slot_p];

    // X axis: bbox n tile span, then fold the scissor rect (a no-op at
    // the default SCISSOR_ENABLE=0).
    bin_cx_lo = (bin_t_x_lo < binf_fx_lo) ? binf_fx_lo : bin_t_x_lo;
    bin_cx_hi = (bin_t_x_hi > binf_fx_hi) ? binf_fx_hi : bin_t_x_hi;
    bin_cx_lo = (eff_scx0 < bin_cx_lo)    ? bin_cx_lo  : eff_scx0;
    bin_cx_hi = (eff_scx1 > bin_cx_hi)    ? bin_cx_hi  : eff_scx1;

    // Y axis: same two steps.
    bin_cy_lo = (bin_t_y_lo < binf_fy_lo) ? binf_fy_lo : bin_t_y_lo;
    bin_cy_hi = (bin_t_y_hi > binf_fy_hi) ? binf_fy_hi : bin_t_y_hi;
    bin_cy_lo = (eff_scy0 < bin_cy_lo)    ? bin_cy_lo  : eff_scy0;
    bin_cy_hi = (eff_scy1 > bin_cy_hi)    ? bin_cy_hi  : eff_scy1;

    // Non-empty when both axes still have lo <= hi.
    bin_overlap = (bin_cx_lo <= bin_cx_hi) && (bin_cy_lo <= bin_cy_hi);
end
|
||
|
||
// all_grad_done — high when no FIFO slot still has grad_pending set.
// fifo_grad_pending is an UNPACKED array, so a plain |-reduce is not
// available; the slots are scanned in a loop instead (iverilog-12 OK).
logic all_grad_done;
always_comb begin
    all_grad_done = 1'b1;
    for (int gi = FIFO_DEPTH - 1; gi >= 0; gi--) begin
        if (fifo_grad_pending[gi]) begin
            all_grad_done = 1'b0;
        end
    end
end
|
||
|
||
// Ch331 — latched "end-of-list" flush request (feeder mode). The 1-cycle
// prim_list_flush_i pulse sets mp_flush_pending and it is held until the grid
// actually starts (mp_grid_start), so the request cannot be missed when the
// feeder's done arrives before the renderer is ready (gradients still
// solving, a prior render draining, etc.). Inert when MP_FLUSH_ONLY=0.
logic mp_grid_start;

// Ch336 — scene/batch accumulation state.
//   scene_first_batch_r : this is the first FIFO-batch of a scene (it CLEARS +
//                         full-flushes; later batches sparse-flush onto the
//                         accumulated FB).
//   batch_full_r        : flush mode latched for the in-flight grid.
//   scene_done_w        : the end-of-list flush is pending and nothing is left
//                         to render -> the scene is fully drained, so the next
//                         batch starts fresh.
logic scene_first_batch_r;
logic batch_full_r;
logic mp_flush_pending;
wire scene_done_w = TILE_ACCUM_ENABLE && mp_flush_pending
                    && (fifo_count == FIFO_CNT_W'(0))
                    && (raster_state == R_IDLE)
                    && !tile_render_busy;
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        scene_first_batch_r <= 1'b1;
        batch_full_r        <= 1'b1;
        mp_flush_pending    <= 1'b0;
    end else if (TILE_ACCUM_ENABLE) begin
        // Latch this grid's flush mode at the moment it starts.
        if (mp_grid_start) batch_full_r <= scene_first_batch_r;
        if (scene_done_w) begin
            // Scene fully drained: drop the flush request and re-arm the
            // "first batch clears" marker.
            mp_flush_pending    <= 1'b0;
            scene_first_batch_r <= 1'b1;
        end else begin
            // The end-of-list flush stays held across mid-list batch drains.
            if (prim_list_flush_i) mp_flush_pending <= 1'b1;
            // Once a grid has started, later batches are no longer "first".
            if (mp_grid_start) scene_first_batch_r <= 1'b0;
        end
    end else begin
        // Ch331 legacy: every grid full-flushes; the flush request is
        // consumed at grid start.
        batch_full_r <= 1'b1;
        if (mp_grid_start) mp_flush_pending <= 1'b0;
        else if (prim_list_flush_i) mp_flush_pending <= 1'b1;
    end
end
|
||
|
||
// Grid-start trigger. Common gates: tile-local multi-prim build, raster FSM
// idle, no tile render in flight, every gradient solved. The start condition
// itself is selected by parameters:
//   TILE_ACCUM_ENABLE (Ch336) : FIFO non-empty AND (flush pending OR FIFO
//                               FULL). The FULL case is a mid-list batch, so a
//                               >FIFO_DEPTH scene renders in several batches
//                               that accumulate (first clears, rest
//                               sparse-flush) instead of stalling.
//   MP_FLUSH_ONLY (legacy)    : flush pending AND FIFO non-empty.
//   otherwise (legacy)        : fixed threshold, fifo_count >= TILE_PRIM_COUNT.
assign mp_grid_start = TILE_LOCAL && TILE_MULTIPRIM
                       && (raster_state == R_IDLE)
                       && !tile_render_busy
                       && all_grad_done
                       && (TILE_ACCUM_ENABLE
                             ? ((fifo_count != FIFO_CNT_W'(0)) && (mp_flush_pending || fifo_full))
                           : MP_FLUSH_ONLY
                             ? ((fifo_count != FIFO_CNT_W'(0)) && mp_flush_pending)
                             : (fifo_count >= FIFO_CNT_W'(TILE_PRIM_COUNT)));
|
||
|
||
// Ch336 — per-pixel "written" mask for the CURRENT tile (sparse flush); one
// bit per pixel of a 16x16 tile, hence 256 bits. The whole mask is zeroed
// while the tile is in TP_CLEAR; during TP_RENDER each color write
// (tile_color_we, i.e. the combined CB_FB write) sets the bit at
// tile_color_waddr. On a non-first batch the flush emits ONLY written pixels,
// overlaying the batch's prims onto the accumulated framebuffer rather than
// overwriting it with the green clear. Pruned when TILE_ACCUM_ENABLE=0 (the
// flush gate then stops reading it).
logic [255:0] tile_written_r;
always_ff @(posedge clk) begin
    if (tile_phase_r == TP_CLEAR) begin
        tile_written_r <= 256'd0;
    end else if ((tile_phase_r == TP_RENDER) && tile_color_we) begin
        tile_written_r[tile_color_waddr] <= 1'b1;
    end
end
|
||
|
||
// Per-primitive walker-bbox clip against the CURRENT tile is computed
|
||
// DIRECTLY from a given slot's fifo bbox (not via the prim_* regs) so a
|
||
// reload can clip in the same cycle it picks the slot — see the
|
||
// mp_first_*/mp_after_* wires defined just after the helper function below.
|
||
|
||
// mp_next_nonempty — "skip empty primitives" helper.
//   start  : first primitive index to consider.
//   return : the lowest index in [start, prim_count_r) whose fifo bbox,
//            clipped to the CURRENT tile and the scissor, is non-empty;
//            prim_count_r when no such primitive remains.
// Written as a function with a single result variable (iverilog-12: no
// `return` from a task). Reads grid_base_rptr / prim_count_r / tile bbox /
// fifo bbox, all stable within the evaluating cycle.
function automatic [FIFO_CNT_W-1:0] mp_next_nonempty (input [FIFO_CNT_W-1:0] start);
    logic [FIFO_CNT_W-1:0] scan_i;      // candidate primitive index
    logic [FIFO_CNT_W-1:0] hit_idx;     // result so far
    logic [FIFO_PTR_W-1:0] fslot;       // fifo slot of the candidate
    logic [11:0] box_x_lo, box_x_hi, box_y_lo, box_y_hi;      // fifo bbox
    logic [11:0] clip_x_lo, clip_x_hi, clip_y_lo, clip_y_hi;  // clipped bbox
    logic        hit_seen;              // first match already taken
    begin
        hit_seen = 1'b0;
        hit_idx  = prim_count_r;
        // The loop bound is the CONSTANT FIFO_DEPTH so Quartus can unroll it:
        // iverilog tolerates a non-constant bound (prim_count_r) but Quartus
        // rejects it ("non-constant loop count limit exceeded"). The original
        // [start, prim_count_r) range is enforced by the guard in the loop.
        for (scan_i = '0; scan_i < FIFO_CNT_W'(FIFO_DEPTH); scan_i = scan_i + FIFO_CNT_W'(1)) begin
            fslot    = grid_base_rptr + FIFO_PTR_W'(scan_i);
            box_x_lo = fifo_x_min[fslot];
            box_x_hi = fifo_x_max[fslot];
            box_y_lo = fifo_y_min[fslot];
            box_y_hi = fifo_y_max[fslot];

            // Clip to the current tile ...
            clip_x_lo = (box_x_lo > tile_x_lo) ? box_x_lo : tile_x_lo;
            clip_x_hi = (box_x_hi < tile_x_hi) ? box_x_hi : tile_x_hi;
            clip_y_lo = (box_y_lo > tile_y_lo) ? box_y_lo : tile_y_lo;
            clip_y_hi = (box_y_hi < tile_y_hi) ? box_y_hi : tile_y_hi;
            // ... then fold the scissor rect into the empty-test (no-op at
            // the default).
            clip_x_lo = (clip_x_lo > eff_scx0) ? clip_x_lo : eff_scx0;
            clip_x_hi = (clip_x_hi < eff_scx1) ? clip_x_hi : eff_scx1;
            clip_y_lo = (clip_y_lo > eff_scy0) ? clip_y_lo : eff_scy0;
            clip_y_hi = (clip_y_hi < eff_scy1) ? clip_y_hi : eff_scy1;

            // Take the first in-range candidate whose clipped box is
            // non-empty on both axes.
            if ((scan_i >= start) && (scan_i < prim_count_r)
                && (clip_x_lo <= clip_x_hi) && (clip_y_lo <= clip_y_hi)
                && !hit_seen) begin
                hit_idx  = scan_i;
                hit_seen = 1'b1;
            end
        end
        mp_next_nonempty = hit_idx;
    end
endfunction
|
||
|
||
// mp_first_* : FIRST non-empty primitive of the batch for THIS tile (used at
//              CLEAR-done), with its source slot and tile-clipped walker bbox.
logic [FIFO_CNT_W-1:0] mp_first_idx;
logic [FIFO_PTR_W-1:0] mp_first_slot;
logic [11:0] mp_first_cx_lo, mp_first_cx_hi, mp_first_cy_lo, mp_first_cy_hi;
// mp_after_* : NEXT non-empty primitive strictly after prim_idx_r (used at
//              RENDER-drain), with its source slot and tile-clipped walker
//              bbox. prim_idx_r+1 can overflow FIFO_CNT_W only past the batch,
//              which mp_next_nonempty's `< prim_count_r` bound already rejects
//              (it returns prim_count_r), so the add is width-safe.
logic [FIFO_CNT_W-1:0] mp_after_arg;
logic [FIFO_CNT_W-1:0] mp_after_idx;
logic [FIFO_PTR_W-1:0] mp_after_slot;
logic [11:0] mp_after_cx_lo, mp_after_cx_hi, mp_after_cy_lo, mp_after_cy_hi;
// These are evaluated PROCEDURALLY (always_comb) and NOT through chained
// continuous assigns. iverilog-12 mis-evaluates a variable-indexed array read
// (fifo_x_max[slot]) inside a continuous assign when the index net is itself
// driven by an assign through a function: the indexed read returns 0
// (observed as a hang with the loaded walker bbox stuck at [0..0]). In a
// procedural always_comb the index reads evaluate correctly. The assign form
// is legal SV and would synthesize fine in Quartus; this is purely a
// sim-modelling workaround.
//
// *_pt_* : tile-only intersection (prim bbox n tile bbox) before the scissor
//          is folded in.
// *_f*   : indexed fifo bbox reads, pulled into locals first.
logic [11:0] mpf_pt_cx_lo, mpf_pt_cx_hi, mpf_pt_cy_lo, mpf_pt_cy_hi;
logic [11:0] mpa_pt_cx_lo, mpa_pt_cx_hi, mpa_pt_cy_lo, mpa_pt_cy_hi;
logic [11:0] mpf_fx_lo, mpf_fx_hi, mpf_fy_lo, mpf_fy_hi;
logic [11:0] mpa_fx_lo, mpa_fx_hi, mpa_fy_lo, mpa_fy_hi;
always_comb begin
    // ---- first non-empty primitive (search from index 0) ----
    mp_first_idx  = mp_next_nonempty(FIFO_CNT_W'(0));
    mp_first_slot = grid_base_rptr + FIFO_PTR_W'(mp_first_idx);
    mpf_fx_lo = fifo_x_min[mp_first_slot];
    mpf_fx_hi = fifo_x_max[mp_first_slot];
    mpf_fy_lo = fifo_y_min[mp_first_slot];
    mpf_fy_hi = fifo_y_max[mp_first_slot];
    // X: clip to the tile, then fold the scissor (no-op at the default
    // SCISSOR_ENABLE=0).
    mpf_pt_cx_lo   = (mpf_fx_lo > tile_x_lo) ? mpf_fx_lo : tile_x_lo;
    mpf_pt_cx_hi   = (mpf_fx_hi < tile_x_hi) ? mpf_fx_hi : tile_x_hi;
    mp_first_cx_lo = (mpf_pt_cx_lo > eff_scx0) ? mpf_pt_cx_lo : eff_scx0;
    mp_first_cx_hi = (mpf_pt_cx_hi < eff_scx1) ? mpf_pt_cx_hi : eff_scx1;
    // Y: same.
    mpf_pt_cy_lo   = (mpf_fy_lo > tile_y_lo) ? mpf_fy_lo : tile_y_lo;
    mpf_pt_cy_hi   = (mpf_fy_hi < tile_y_hi) ? mpf_fy_hi : tile_y_hi;
    mp_first_cy_lo = (mpf_pt_cy_lo > eff_scy0) ? mpf_pt_cy_lo : eff_scy0;
    mp_first_cy_hi = (mpf_pt_cy_hi < eff_scy1) ? mpf_pt_cy_hi : eff_scy1;

    // ---- next non-empty primitive (search from prim_idx_r + 1) ----
    mp_after_arg  = prim_idx_r + FIFO_CNT_W'(1);
    mp_after_idx  = mp_next_nonempty(mp_after_arg);
    mp_after_slot = grid_base_rptr + FIFO_PTR_W'(mp_after_idx);
    mpa_fx_lo = fifo_x_min[mp_after_slot];
    mpa_fx_hi = fifo_x_max[mp_after_slot];
    mpa_fy_lo = fifo_y_min[mp_after_slot];
    mpa_fy_hi = fifo_y_max[mp_after_slot];
    // X: clip to the tile, then fold the scissor.
    mpa_pt_cx_lo   = (mpa_fx_lo > tile_x_lo) ? mpa_fx_lo : tile_x_lo;
    mpa_pt_cx_hi   = (mpa_fx_hi < tile_x_hi) ? mpa_fx_hi : tile_x_hi;
    mp_after_cx_lo = (mpa_pt_cx_lo > eff_scx0) ? mpa_pt_cx_lo : eff_scx0;
    mp_after_cx_hi = (mpa_pt_cx_hi < eff_scx1) ? mpa_pt_cx_hi : eff_scx1;
    // Y: same.
    mpa_pt_cy_lo   = (mpa_fy_lo > tile_y_lo) ? mpa_fy_lo : tile_y_lo;
    mpa_pt_cy_hi   = (mpa_fy_hi < tile_y_hi) ? mpa_fy_hi : tile_y_hi;
    mp_after_cy_lo = (mpa_pt_cy_lo > eff_scy0) ? mpa_pt_cy_lo : eff_scy0;
    mp_after_cy_hi = (mpa_pt_cy_hi < eff_scy1) ? mpa_pt_cy_hi : eff_scy1;
end
|
||
|
||
// ------------------------------------------------------------------
// BIN-WALK clip wires (BIN_BUFFER_ENABLE render path).
//   cur_t       : tile currently being rendered
//                 (= tile_row_r*TILE_COLS + tile_col_r).
//   binw_first_*: bin slot 0 of cur_t, wanted for the CLEAR->RENDER load.
//   binw_next_* : bin slot bin_slot_r+1, wanted for a RENDER->RENDER advance.
// Each case yields a prim INDEX (bin_prim[cur_t][slot]); the tile-clipped
// walker bbox uses the SAME max/min/scissor math as mp_first_* (clipped
// against the CURRENT tile via tile_x_lo/hi). At BIN_BUFFER_ENABLE=0 these
// are unread dead wires (the bin-walk render arms are folded out).
// ------------------------------------------------------------------
logic [CUR_T_W-1:0] cur_t;
assign cur_t = CUR_T_W'(tile_row_r * TILE_COLS + tile_col_r);

// Selected prim INDEX for the two walk cases, and the fifo slot of each.
logic [FIFO_CNT_W-1:0] binw_first_idx, binw_next_idx;
logic [FIFO_CNT_W-1:0] binw_next_slot_sel;   // bin slot for the advance
logic [FIFO_PTR_W-1:0] binw_first_slot, binw_next_fslot;
// Final (tile + scissor) clipped walker bboxes.
logic [11:0] binw_first_cx_lo, binw_first_cx_hi, binw_first_cy_lo, binw_first_cy_hi;
logic [11:0] binw_next_cx_lo, binw_next_cx_hi, binw_next_cy_lo, binw_next_cy_hi;
// Intermediates: indexed fifo reads pulled into locals, then the tile-only
// clip.
logic [11:0] bwf_fx_lo, bwf_fx_hi, bwf_fy_lo, bwf_fy_hi;
logic [11:0] bwn_fx_lo, bwn_fx_hi, bwn_fy_lo, bwn_fy_hi;
logic [11:0] bwf_pt_cx_lo, bwf_pt_cx_hi, bwf_pt_cy_lo, bwf_pt_cy_hi;
logic [11:0] bwn_pt_cx_lo, bwn_pt_cx_hi, bwn_pt_cy_lo, bwn_pt_cy_hi;
always_comb begin
    // ---- bin slot 0: first prim of cur_t's bin ----
    binw_first_idx  = bin_prim[cur_t][0];
    binw_first_slot = grid_base_rptr + FIFO_PTR_W'(binw_first_idx);
    bwf_fx_lo = fifo_x_min[binw_first_slot];
    bwf_fx_hi = fifo_x_max[binw_first_slot];
    bwf_fy_lo = fifo_y_min[binw_first_slot];
    bwf_fy_hi = fifo_y_max[binw_first_slot];
    // X: tile clip, then scissor.
    bwf_pt_cx_lo     = (bwf_fx_lo > tile_x_lo) ? bwf_fx_lo : tile_x_lo;
    bwf_pt_cx_hi     = (bwf_fx_hi < tile_x_hi) ? bwf_fx_hi : tile_x_hi;
    binw_first_cx_lo = (bwf_pt_cx_lo > eff_scx0) ? bwf_pt_cx_lo : eff_scx0;
    binw_first_cx_hi = (bwf_pt_cx_hi < eff_scx1) ? bwf_pt_cx_hi : eff_scx1;
    // Y: tile clip, then scissor.
    bwf_pt_cy_lo     = (bwf_fy_lo > tile_y_lo) ? bwf_fy_lo : tile_y_lo;
    bwf_pt_cy_hi     = (bwf_fy_hi < tile_y_hi) ? bwf_fy_hi : tile_y_hi;
    binw_first_cy_lo = (bwf_pt_cy_lo > eff_scy0) ? bwf_pt_cy_lo : eff_scy0;
    binw_first_cy_hi = (bwf_pt_cy_hi < eff_scy1) ? bwf_pt_cy_hi : eff_scy1;

    // ---- next bin slot (bin_slot_r+1) ----
    // The bin_prim index is taken modulo FIFO_DEPTH (low FIFO_PTR_W bits) so
    // it is ALWAYS in [0..FIFO_DEPTH-1]. The advance only CONSUMES this when
    // bin_slot_r+1 < bin_n[cur_t] <= FIFO_DEPTH, so the masked value equals
    // the true slot whenever it is used; the mask only tames the unused wrap
    // case (Quartus const-range safety).
    binw_next_slot_sel = bin_slot_r + FIFO_CNT_W'(1);
    binw_next_idx      = bin_prim[cur_t][binw_next_slot_sel[FIFO_PTR_W-1:0]];
    binw_next_fslot    = grid_base_rptr + FIFO_PTR_W'(binw_next_idx);
    bwn_fx_lo = fifo_x_min[binw_next_fslot];
    bwn_fx_hi = fifo_x_max[binw_next_fslot];
    bwn_fy_lo = fifo_y_min[binw_next_fslot];
    bwn_fy_hi = fifo_y_max[binw_next_fslot];
    // X: tile clip, then scissor.
    bwn_pt_cx_lo    = (bwn_fx_lo > tile_x_lo) ? bwn_fx_lo : tile_x_lo;
    bwn_pt_cx_hi    = (bwn_fx_hi < tile_x_hi) ? bwn_fx_hi : tile_x_hi;
    binw_next_cx_lo = (bwn_pt_cx_lo > eff_scx0) ? bwn_pt_cx_lo : eff_scx0;
    binw_next_cx_hi = (bwn_pt_cx_hi < eff_scx1) ? bwn_pt_cx_hi : eff_scx1;
    // Y: tile clip, then scissor.
    bwn_pt_cy_lo    = (bwn_fy_lo > tile_y_lo) ? bwn_fy_lo : tile_y_lo;
    bwn_pt_cy_hi    = (bwn_fy_hi < tile_y_hi) ? bwn_fy_hi : tile_y_hi;
    binw_next_cy_lo = (bwn_pt_cy_lo > eff_scy0) ? bwn_pt_cy_lo : eff_scy0;
    binw_next_cy_hi = (bwn_pt_cy_hi < eff_scy1) ? bwn_pt_cy_hi : eff_scy1;
end
|
||
|
||
// Tile-local index of the HELD combined walker pixel: {y[3:0], x[3:0]}.
// Because tiles are 16-aligned, the low 4 bits of each screen coordinate
// ARE the tile-local RAM address, so this is offset-agnostic by construction.
logic [7:0] tile_idx_w;
assign tile_idx_w = {ras_cur_y[3:0],
                     ras_cur_x[3:0]};
|
||
|
||
// Combined-active gate: a combined-mode scan is in flight.
// Ch304 — in tile-local mode the combined per-pixel FSM may run ONLY in the
// RENDER phase; it must NOT run while the tile is being CLEARed, even though
// raster_state is already R_SCAN by then. tile_active is constant 0 at
// TILE_LOCAL=0, so the extra term collapses away (byte-identical).
logic comb_scan_active;
assign comb_scan_active = ras_combined
                          && (raster_state == R_SCAN)
                          && ((tile_phase_r == TP_RENDER) || !tile_active);
|
||
|
||
// Per-beat read strobes; exactly one consumer per beat:
//   beat 0 (CB_Z)  -> Z read, only for a pixel inside the primitive.
//   beat 1 (CB_ZW) -> texel read, only on depth PASS (combinational result).
//   beat 2 (CB_T)  -> dest-fb read, only on depth PASS (latched result).
assign comb_z_req   = comb_scan_active
                      && (comb_state_r == CB_Z)
                      && comb_pix_inside_w;
assign comb_tex_req = comb_scan_active
                      && (comb_state_r == CB_ZW)
                      && comb_ztest_pass_w;
assign comb_fb_req  = comb_scan_active
                      && (comb_state_r == CB_T)
                      && comb_ztest_pass_r;
|
||
|
||
// Ch310 — bili_now: THIS combined primitive is doing a bilinear PSMCT32
// sample. Only in that case does the texel read take the multi-beat 4-tap
// path that needs the CB_TWAIT stall. With BILINEAR_ENABLE=0 this is a
// compile-time constant 0, so the CB_TWAIT branch and the bilinear advance
// are pruned and the FSM is exactly the legacy CB_ZW->CB_T->CB_FB->CB_ZWR
// sequence (byte-identical).
//   ras_filter_lin    : per-primitive TEX1_1.MMAG snapshot.
//   ras_tpsm == 6'h00 : PSMCT32 (bilinear is PSMCT32-only, matching u_tex's
//                       is_ct32 gate).
logic bili_now;
assign bili_now = BILINEAR_ENABLE
                  && comb_scan_active
                  && ras_filter_lin
                  && (ras_tpsm == 6'h00);
|
||
|
||
// Observable schedule taps: the scan-active gate, the current beat, and the
// inside / depth-pass flags (both forced to 0 while no combined scan is
// active).
assign comb_active     = comb_scan_active;
assign comb_beat       = comb_state_r;
assign comb_pix_inside = comb_scan_active ? comb_pix_inside_w
                                          : 1'b0;
assign comb_ztest_pass = comb_scan_active ? comb_ztest_pass_r
                                          : 1'b0;
|
||
|
||
// ------------------------------------------------------------------
// FLUSH read/emit pipeline. The tile color RAM read is REGISTERED: a raddr
// presented on cycle N returns rdata on cycle N+1. In TP_FLUSH we present
// raddr=tile_sweep_r every cycle and, ONE cycle later, emit the framebuffer
// write for the index presented the cycle before. flush_emit_q / flush_idx_q
// carry that 1-cycle delay.
// ------------------------------------------------------------------
logic       flush_emit_q;   // emit a FB write THIS cycle
logic [7:0] flush_idx_q;    // tile index whose color is emitted now

// Linear PSMCT32 FB byte address for tile-local index idx (x=idx[3:0],
// y=idx[7:4]), matching the combined comb_fb_addr formula:
//   pixel_index = y*(fbw*64) + x ;  addr = (fbp<<11) + (pixel_index<<2).
// Ch305 — the FLUSH FB address is OFFSET by the tile origin: the tile-LOCAL
// 0..15 indices are added to the latched origin to form the screen coords.
// flush_ox_q / flush_oy_q latch the tile origin alongside the emit pipeline
// so the +1-cycle registered read stays aligned even if the tile counter
// advances (it does not until FLUSH ends, but latching keeps the address
// self-consistent and TB-observable). At COLS=ROWS=1 the offset is 0, which
// is byte-identical to Ch303.
logic [11:0] flush_x_w, flush_y_w;     // screen coords
logic [11:0] flush_ox_q, flush_oy_q;   // latched tile origin for the emit
logic [31:0] flush_pixel_index_w, flush_fb_addr_w;
assign flush_x_w = {8'd0, flush_idx_q[3:0]} + flush_ox_q;
assign flush_y_w = {8'd0, flush_idx_q[7:4]} + flush_oy_q;
assign flush_pixel_index_w = {20'd0, flush_x_w}
                             + ({20'd0, flush_y_w} * ({26'd0, ras_fbw} << 6));
assign flush_fb_addr_w = (flush_pixel_index_w << 2)
                         + ({23'd0, ras_fbp} << 11);
|
||
// Ch324 — PSMCT32 byte offset of the current tile inside the raster
// framebuffer: the SAME pixel-index formula as the flush, evaluated at the
// tile ORIGIN (tile_ox/tile_oy) and scaled by 4 bytes/pixel. gs_tile_reload
// adds it to its COLOR/Z region base to gather exactly the tile the flush
// wrote. On a 1x1 grid tile_ox=tile_oy=0, so the offset is 0.
assign reload_base_o = 30'(
    (({20'd0, tile_oy} * ({26'd0, ras_fbw} << 6)) + {20'd0, tile_ox}) << 2
);
|
||
// Ch305+ — PSMCT16 flavour of the flush FB byte address: 2 bytes per pixel
// (<<1) where PSMCT32 uses 4 (<<2). The low bit of (pixel_index<<1) becomes
// byte_addr[1], which vram_normalize_pkg uses to pick the low/high halfword.
// Dead (unread) at TILE_COLOR_PSMCT16=0.
wire [31:0] flush_fb_addr16_w = (flush_pixel_index_w << 1)
                                + ({23'd0, ras_fbp} << 11);
|
||
|
||
// Ch323 — Z-FLUSH byte address (TILE_SPILL_ENABLE). Pixel-index math is the
// same as for the color flush, with two differences:
//   * the address is Z-BACKING-relative: NO fbp base is added here (the de25
//     Z-writer adds the LPDDR z_base);
//   * it is ALWAYS 4 bytes/pixel (<<2), because Z is 32-bit regardless of
//     TILE_COLOR_PSMCT16.
// Built from the z_flush_* latches (presented during TP_ZFLUSH). Dead at the
// default.
wire [11:0] z_flush_x_w = {8'd0, z_flush_idx_q[3:0]} + z_flush_ox_q;
wire [11:0] z_flush_y_w = {8'd0, z_flush_idx_q[7:4]} + z_flush_oy_q;
wire [31:0] z_flush_pixel_index_w = {20'd0, z_flush_x_w}
                                    + ({20'd0, z_flush_y_w} * ({26'd0, ras_fbw} << 6));
wire [31:0] z_flush_addr_w = z_flush_pixel_index_w << 2;
|
||
|
||
// Ch323 — tile RELOAD staging read address. During the TP_RELOAD sweep, once
// the staging fill is ready (!reload_wait), tile index 0..255 is presented to
// the de25 staging engine; in every other case the output is 0.
assign tile_reload_raddr_o =
    ((tile_phase_r == TP_RELOAD) && !reload_wait && (tile_sweep_r < 9'd256))
        ? tile_sweep_r[7:0]
        : 8'd0;
// Ch323 diag — current tile phase, exported (de25 counts entries).
assign tile_phase_o = tile_phase_r;
|
||
|
||
// ------------------------------------------------------------------
|
||
// Tile RAM write/read port drivers (combinational). One write port +
|
||
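// one read port each; the phases never collide (CLEAR writes only;
// RENDER presents the Z read address at beat0 (CB_Z) and the color read
// address at beat2 (CB_T), then writes color at beat3 (CB_FB) and Z at
// beat4 (CB_ZWR) — a RAM's read-address beat and its write beat are
// always different; FLUSH reads only).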
|
||
// (tile_color_rdata / tile_z_rdata declared up near comb_z_compare_w
|
||
// so the depth-test + blend-dest muxes can reference them.)
|
||
// ------------------------------------------------------------------
|
||
|
||
// RENDER write data:
//   color, beat3 (CB_FB)  : the blended PSMCT32 word {A, B, G, R}.
//   Z,     beat4 (CB_ZWR) : the fragment Z (comb_frag_z), skipped if ZMSK.
logic [31:0] tile_render_color_w;
assign tile_render_color_w = {comb_blend_a,
                              comb_blend_b,
                              comb_blend_g,
                              comb_blend_r};

// Ch305+ — PSMCT16 pack of the CLEAR and RENDER ABGR8888 colors into
// pix16 = {a1, b5, g5, r5}. This is the EXACT convention of the S2 raster
// pack (s2_pack_psm16) and of the gs_pcrtc unpack: r5=[7:3], g5=[15:11],
// b5=[23:19], a1=[31]. Dead (unread) at TILE_COLOR_PSMCT16=0.
wire [15:0] tile_clear_pix16 = {
    TILE_CLEAR_COLOR[31],      // a1
    TILE_CLEAR_COLOR[23:19],   // b5
    TILE_CLEAR_COLOR[15:11],   // g5
    TILE_CLEAR_COLOR[7:3]      // r5
};
wire [15:0] tile_render_pix16 = {
    tile_render_color_w[31],      // a1
    tile_render_color_w[23:19],   // b5
    tile_render_color_w[15:11],   // g5
    tile_render_color_w[7:3]      // r5
};
|
||
|
||
// Combinational drivers for both tile RAMs (color and Z): write strobe,
// write address, write data and read address. Everything defaults to idle
// (strobes 0, addresses/data 0); a phase arm overrides only what it needs,
// and nothing is driven unless tile_active is set.
always_comb begin
    // Idle defaults — read side.
    tile_color_raddr = 8'd0;
    tile_z_raddr     = 8'd0;
    // Idle defaults — write side.
    tile_color_we    = 1'b0;
    tile_color_waddr = 8'd0;
    tile_color_wdata = '0;
    tile_z_we        = 1'b0;
    tile_z_waddr     = 8'd0;
    tile_z_wdata     = 32'd0;
    if (tile_active) begin
        unique case (tile_phase_r)
            TP_CLEAR: begin
                // Sweep all 256 entries to the clear values.
                if (tile_sweep_r < 9'd256) begin
                    // Ch338 — with persistent cross-batch Z, Z is cleared
                    // ONLY on the first (full-flush) batch of a scene; later
                    // (sparse) batches keep the resident Z so a tile
                    // re-rendered in a later batch depth-tests against the
                    // earlier ones. batch_full_r is the latched flush mode
                    // (1 = first batch). At TILE_Z_PERSIST=0 the enable is
                    // constant 1 -> byte-identical (always clears).
                    tile_z_we    = batch_full_r || (!TILE_Z_PERSIST);
                    tile_z_waddr = tile_sweep_r[7:0];
                    tile_z_wdata = TILE_CLEAR_Z;
                    // Color: packed pix16 for PSMCT16, full ABGR for
                    // PSMCT32. Both are width-correct for TILE_COLOR_W
                    // (16 vs 32).
                    tile_color_we    = 1'b1;
                    tile_color_waddr = tile_sweep_r[7:0];
                    tile_color_wdata = TILE_COLOR_PSMCT16
                                         ? TILE_COLOR_W'(tile_clear_pix16)
                                         : TILE_COLOR_W'(TILE_CLEAR_COLOR);
                end
            end
            TP_RENDER: begin
                if (comb_scan_active) begin
                    unique case (comb_state_r)
                        // beat0 (CB_Z): present the Z read for the held
                        // pixel (inside pixels only).
                        CB_Z: if (comb_pix_inside_w)
                            tile_z_raddr = tile_idx_w;
                        // beat2 (CB_T): present the color (dest) read.
                        CB_T: tile_color_raddr = tile_idx_w;
                        // beat3 (CB_FB): WRITE the blended color to the tile.
                        // PSMCT16 packs blended ABGR -> pix16; PSMCT32
                        // stores the full blended ABGR word.
                        CB_FB: begin
                            tile_color_we    = 1'b1;
                            tile_color_waddr = tile_idx_w;
                            tile_color_wdata = TILE_COLOR_PSMCT16
                                                 ? TILE_COLOR_W'(tile_render_pix16)
                                                 : TILE_COLOR_W'(tile_render_color_w);
                        end
                        // beat4 (CB_ZWR): WRITE the fragment Z unless ZMSK.
                        CB_ZWR: if (!ras_zmsk) begin
                            tile_z_we    = 1'b1;
                            tile_z_waddr = tile_idx_w;
                            tile_z_wdata = comb_frag_z;
                        end
                        default: ;
                    endcase
                end
            end
            TP_FLUSH: begin
                // Present a color read for each index 0..255; the emit
                // happens one cycle later (registered read).
                if (tile_sweep_r < 9'd256)
                    tile_color_raddr = tile_sweep_r[7:0];
            end
            // Ch323 (TILE_SPILL_ENABLE) — the two states below are entered
            // only when spill is enabled, so at the default they are
            // unreachable and the sim is byte-identical.
            TP_ZFLUSH: begin
                // Present a Z read for each index 0..255; the Z emit happens
                // one cycle later (registered read), mirroring TP_FLUSH.
                if (tile_sweep_r < 9'd256)
                    tile_z_raddr = tile_sweep_r[7:0];
            end
            TP_RELOAD: begin
                // Write the reloaded color + Z from the staging engine into
                // the tile RAMs. The staging read address was presented LAST
                // cycle (1-cycle latency), so tile_reload_color_i /
                // tile_reload_z_i are valid this cycle. reload_wr_we gates
                // the write window and reload_wr_addr is the tile index for
                // this beat. Color and Z have SEPARATE write enables (Codex):
                // asserted together so the reload is all-or-nothing, but
                // independently wired.
                if (reload_wr_we) begin
                    tile_z_we        = 1'b1;
                    tile_z_waddr     = reload_wr_addr;
                    tile_z_wdata     = tile_reload_z_i;
                    tile_color_we    = 1'b1;
                    tile_color_waddr = reload_wr_addr;
                    tile_color_wdata = TILE_COLOR_W'(tile_reload_color_i);
                end
            end
            default: ;
        endcase
    end
end
|
||
|
||
// Ch305+ — PSMCT16 dest-read unpack. tile_color_rdata carries pix16 in
// [15:0]: r5=[4:0], g5=[9:5], b5=[14:10], a1=[15]. Each 5-bit channel is
// bit-replicated to 8 bits (c8 = {c5, c5[4:2]}), the same replication as
// gs_pcrtc_stub's CT16 unpack, and alpha becomes {8{a1}}. gs_alpha_blend
// uses dest R/G/B + source alpha, so that a8 only feeds trace consistency.
// At TILE_COLOR_PSMCT16=0 tile_color_rdata is the legacy 32-bit ABGR word
// and this unpacked net is unread (the dest mux selects rdata directly).
logic [4:0]  tile_rd_r5, tile_rd_g5, tile_rd_b5;
logic        tile_rd_a1;
logic [31:0] tile_color_rdata_abgr;
assign tile_rd_a1 = tile_color_rdata[15];
assign tile_rd_b5 = tile_color_rdata[14:10];
assign tile_rd_g5 = tile_color_rdata[9:5];
assign tile_rd_r5 = tile_color_rdata[4:0];
assign tile_color_rdata_abgr = {
    {8{tile_rd_a1}},                // A
    {tile_rd_b5, tile_rd_b5[4:2]},  // B
    {tile_rd_g5, tile_rd_g5[4:2]},  // G
    {tile_rd_r5, tile_rd_r5[4:2]}   // R
};
|
||
|
||
// 32-bit view of tile_color_rdata via a width cast. At TILE_COLOR_W=32 the
// cast is a no-op (byte-identical). At TILE_COLOR_W=16 the upper 16 bits are
// 0, but the blend-dest mux never selects this arm in PSMCT16 mode.
logic [31:0] tile_color_rdata32;
assign tile_color_rdata32 = 32'(tile_color_rdata);
|
||
|
||
// Combined blend destination mux: the tile color read when in tile mode,
// VRAM (fb_rd_data) otherwise. A PSMCT16 tile feeds the unpacked ABGR; a
// PSMCT32 tile feeds rdata as-is. Both arms are 32-bit into comb_blend_cd.
// At TILE_COLOR_PSMCT16=0 this collapses to the legacy expression
// (tile_color_rdata32 === tile_color_rdata, a 32-bit net).
assign comb_blend_cd =
    tile_active ? (TILE_COLOR_PSMCT16 ? tile_color_rdata_abgr
                                      : tile_color_rdata32)
                : fb_rd_data;
|
||
|
||
// ------------------------------------------------------------------
|
||
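// Tile RAM instances — ONE 16x16 color tile (256 entries, TILE_COLOR_W bits
// wide) + ONE 32-bit Z tile RAM (2**TZ_ADDR_W entries: 256 for a single
// tile, or the whole grid under TILE_Z_PERSIST). Instantiated only at
// TILE_LOCAL=1; the read-data nets are tied to 0 otherwise so non-tiled
// builds infer no BRAM.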
|
||
// ------------------------------------------------------------------
|
||
// Ch338 — persistent cross-batch Z. When TILE_Z_PERSIST is set (and the grid
// has more than one tile) the Z tile RAM covers the WHOLE grid (NTILES*256
// entries) and is addressed by {tile_id, local}. The COLOR tile stays a
// single 16x16 tile: color rides the sparse flush into VRAM, only Z has to
// persist on-chip. tile_id_w is the current tile and is stable across its
// render, so the per-pixel Z read/write land in that tile's region.
//   TZ_ID_BITS : tile-id bits prepended to the 8-bit local address
//                (0 = single-tile Z RAM).
//   TZ_ADDR_W  : resulting Z RAM address width.
localparam int TZ_ID_BITS = (TILE_Z_PERSIST && N_TILES > 1) ? $clog2(N_TILES) : 0;
localparam int TZ_ADDR_W  = 8 + TZ_ID_BITS;
logic [TZ_ADDR_W-1:0] tile_z_gwaddr, tile_z_graddr;
generate
    if (TZ_ID_BITS == 0) begin : g_ztile_local
        // Single-tile Z RAM: the local address is used unchanged.
        assign tile_z_graddr = tile_z_raddr;
        assign tile_z_gwaddr = tile_z_waddr;
    end else begin : g_ztile_global
        // Whole-grid Z RAM: prefix the current tile id.
        assign tile_z_graddr = {tile_id_w[TZ_ID_BITS-1:0], tile_z_raddr};
        assign tile_z_gwaddr = {tile_id_w[TZ_ID_BITS-1:0], tile_z_waddr};
    end
endgenerate
|
||
// Tile RAM instantiation. At TILE_LOCAL=1 two gs_tile_ram instances are
// built: u_tile_color (8-bit address, TILE_COLOR_W data) and u_tile_z
// (TZ_ADDR_W-bit address, 32-bit data). Otherwise both read-data nets are
// tied to 0.
generate
    if (TILE_LOCAL) begin : g_tile_ram
        // Ch305+ — COLOR tile RAM width = TILE_COLOR_W (16 for PSMCT16, 32
        // for PSMCT32); gs_tile_ram already exposes a DATA_W parameter.
        gs_tile_ram #(
            .ADDR_W (8),
            .DATA_W (TILE_COLOR_W)
        ) u_tile_color (
            .clk   (clk),
            .rst_n (rst_n),
            .raddr (tile_color_raddr),
            .rdata (tile_color_rdata),
            .we    (tile_color_we),
            .waddr (tile_color_waddr),
            .wdata (tile_color_wdata)
        );
        // Z tile RAM, addressed through the (possibly tile-id-prefixed)
        // tile_z_g*addr nets.
        gs_tile_ram #(
            .ADDR_W (TZ_ADDR_W),
            .DATA_W (32)
        ) u_tile_z (
            .clk   (clk),
            .rst_n (rst_n),
            .raddr (tile_z_graddr),
            .rdata (tile_z_rdata),
            .we    (tile_z_we),
            .waddr (tile_z_gwaddr),
            .wdata (tile_z_wdata)
        );
    end else begin : g_no_tile_ram
        assign tile_z_rdata     = 32'd0;
        assign tile_color_rdata = '0;   // TILE_COLOR_W-wide tie-off
    end
endgenerate
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch88 — 3-stage SCAN pipeline
|
||
//
|
||
// S0 (this cycle, pre-pipe) : ras_cur_x / ras_cur_y advance.
|
||
// Registered. Drives s1 latches at
|
||
// next edge.
|
||
// S1 (one cycle after S0) : edge functions + top-left bias
|
||
// + inside test. Combinational from
|
||
// s1_x_q / s1_y_q. Drives s2 latches.
|
||
// S2 (two cycles after S0) : color interp + fb_addr + emit.
|
||
// Combinational from s2_x_q / s2_y_q
|
||
// / s2_L*_q. Drives raster_pixel_*
|
||
// outputs.
|
||
//
|
||
// Throughput stays at 1 candidate pixel per cycle once the pipe
|
||
// is full. Latency is 2-3 cycles per pixel (S0 → emit). The FSM
|
||
// adds an R_DRAIN state after S0 finishes the bbox so S1 and S2
|
||
// can flush before the next FIFO entry pops; raster_active stays
|
||
// high through R_SCAN AND R_DRAIN so external waits don't see
|
||
// gaps mid-primitive.
|
||
// ------------------------------------------------------------------
|
||
|
||
// S1 stage registers: the latched pixel coordinate and its valid flag.
logic [11:0] s1_x_q;
logic [11:0] s1_y_q;
logic        s1_valid_q;

// S2 stage registers: pixel coordinate, valid flag, the inside-test result,
// and three signed 32-bit s2_L*_q values.
logic [11:0] s2_x_q;
logic [11:0] s2_y_q;
logic        s2_valid_q;
logic        s2_inside_q;
logic signed [31:0] s2_L0_q, s2_L1_q, s2_L2_q;
// Raster mode mirrored into S2, because ras_mode could change between scans.
raster_mode_e s2_mode_q;
|
||
|
||
// S1 combinational inside test on the coords latched in s1_x_q / s1_y_q.
//   s1_e*          : edge function of the pixel against v0->v1, v1->v2 and
//                    v2->v0.
//   s1_e*_biased   : edge value plus the matching ras_bias bit.
//   s1_in_triangle : all three biased edges are <= 0 (signed compare).
//   s1_pixel_inside: a SPRITE accepts every pixel, a TRI uses the edge test,
//                    every other mode rejects.
logic [31:0] s1_e0, s1_e1, s1_e2;
logic [31:0] s1_e0_biased, s1_e1_biased, s1_e2_biased;
logic        s1_in_triangle;
logic        s1_pixel_inside;

assign s1_e0 = edge_fn(s1_x_q, s1_y_q, ras_v0_x, ras_v0_y, ras_v1_x, ras_v1_y);
assign s1_e1 = edge_fn(s1_x_q, s1_y_q, ras_v1_x, ras_v1_y, ras_v2_x, ras_v2_y);
assign s1_e2 = edge_fn(s1_x_q, s1_y_q, ras_v2_x, ras_v2_y, ras_v0_x, ras_v0_y);

assign s1_e0_biased = {31'd0, ras_bias[0]} + s1_e0;
assign s1_e1_biased = {31'd0, ras_bias[1]} + s1_e1;
assign s1_e2_biased = {31'd0, ras_bias[2]} + s1_e2;

assign s1_in_triangle = ($signed(s1_e0_biased) <= 32'sd0)
                        & ($signed(s1_e1_biased) <= 32'sd0)
                        & ($signed(s1_e2_biased) <= 32'sd0);

assign s1_pixel_inside =
    (ras_mode == RM_SPRITE) ? 1'b1
  : (ras_mode == RM_TRI)    ? s1_in_triangle
  :                           1'b0;
|
||
|
||
// ------------------------------------------------------------------
|
||
// Brick 1 — SPRITE texture sampling (PSMCT32 DECAL).
|
||
//
|
||
// Per-pixel texel coordinate by LINEAR interpolation across the
|
||
// sprite, evaluated on the S1-stage pixel (s1_x_q, s1_y_q):
|
||
// u = u0 + du_dx * (px - x0)
|
||
// v = v0 + dv_dy * (py - y0)
|
||
// where (x0,y0)/(x1,y1) are the two SPRITE endpoints
|
||
// (ras_v0_*, ras_v1_*) and (u0,v0)/(u1,v1) the integer texel
|
||
// coords latched at those endpoints.
|
||
//
|
||
// SYNTHESIZABLE DDA (no per-pixel divide). The per-texel step
|
||
// du_dx = (u1-u0)/(x1-x0) and dv_dy = (v1-v0)/(y1-y0) are
|
||
// computed ONCE per primitive in Q16.16 fixed-point at FIFO push
|
||
// (a single divide each, latency-tolerant — it happens at the
|
||
// close cycle, long before SCAN). The popped ras_du_dx_q /
|
||
// ras_dv_dy_q are then applied per pixel with a MULTIPLY + shift
|
||
// only — fully synthesizable. For the common 1:1 texel:pixel
|
||
// SPRITE (step == 1.0 exactly), the Q16 step is exactly 0x10000
|
||
// so u = px-x0+u0 / v = py-y0+v0 to the bit, byte-identical to
|
||
// the old per-pixel divide. Degenerate spans (x1==x0) latch a
|
||
// zero step so the coord holds u0/v0. The whole path is now LIVE
|
||
// for synthesis — no `// synthesis translate_off`.
|
||
//
|
||
// The sampler is driven from the S1 stage and its returned color
|
||
// is registered into s2_tex_color_q at the S1->S2 edge, so the
|
||
// texel lands in the SAME pipeline slot as the S2 pixel it
|
||
// belongs to (tex_rd_data from the combinational read2 port is
|
||
// valid in the S1 cycle; the register absorbs the path into S2).
|
||
// ------------------------------------------------------------------
|
||
// Texturing-active gate. SPRITE feeds the linear DDA UV; a TME
|
||
// TRIANGLE (textured-triangle rung) feeds the AFFINE U/V computed
|
||
// from the shared-divider gradients (or the perspective U/V on the
|
||
// Ch301 path). Ch348 — DECAL texturing admits PSMCT32 (0x00) AND the
|
||
// indexed CLUT formats PSMT8 (0x13) / PSMT4 (0x14) for BOTH sprites
|
||
// and triangles: the index->CLUT->ABGR decode is in the shared
|
||
// gs_texture_unit, so no per-primitive-type restriction. (Affine
|
||
// PSMT8+CLUT triangles are proven by tb_gs_psmt8_clut_triangle.)
|
||
// Texturing this pixel? Requires all three of:
//   * TME set for the in-flight primitive,
//   * a supported texture format, and
//   * a SPRITE, or a TRI with ras_tri_active set.
logic s1_tex_active;
assign s1_tex_active =
       ras_tme
    && (   (ras_tpsm == 6'h00)      // PSMCT32 direct DECAL
        || (ras_tpsm == 6'h13)      // Ch296 PSMT8 indexed DECAL
        || (ras_tpsm == 6'h14))     // Ch297 PSMT4 indexed DECAL
    && (   (ras_mode == RM_SPRITE)
        || (ras_tri_active && (ras_mode == RM_TRI)));
|
||
|
||
// DDA coordinate evaluator: coord = c0 + (((p - a) * step_q16) >>> 16).
//
//   p        current screen coordinate (12-bit unsigned)
//   a        first endpoint's screen coordinate (12-bit unsigned)
//   c0       first endpoint's texel coordinate (11-bit unsigned)
//   step_q16 popped signed Q16.16 per-pixel texel increment
//
// Returns the low 11 bits of the result. The value is TRUNCATED, not
// saturated: a coordinate outside 0..2047 wraps modulo 2048 (this differs
// from interp_affine_uv, which clamps).
//
// Multiplier sizing matches the Ch296 right-sizing of the interp_affine_*
// evaluators: (p - a) is the difference of two 12-bit unsigned values, so
// it lies in [-4095, +4095] and fits a signed 16-bit operand. The multiply
// is therefore 32x16 signed with a 48-bit product; no bits are lost, so the
// result is bit-identical to a full 64x64 product. Pure multiply +
// arithmetic shift — synthesizable, no divide.
function automatic logic [10:0] dda_uv(input logic [11:0] p,
                                       input logic [11:0] a,
                                       input logic [10:0] c0,
                                       input logic signed [31:0] step_q16);
    logic signed [15:0] dpw;        // pixel offset p - a, true width
    logic signed [47:0] prod;       // 32x16 signed product (Q16.16 scaled)
    logic signed [63:0] prod_sra;   // product after the arithmetic >> 16
    logic signed [31:0] coord;

    dpw      = $signed({4'd0, p}) - $signed({4'd0, a});
    prod     = step_q16 * dpw;                              // 32x16 signed
    prod_sra = $signed({{16{prod[47]}}, prod}) >>> 16;      // floor toward -inf
    coord    = $signed({21'd0, c0}) + prod_sra[31:0];
    return coord[10:0];
endfunction
|
||
|
||
// Texel coordinate source. TEX_RD_REGISTERED selects whether the
|
||
// address is generated on the S1 stage (combinational read2) or the
|
||
// S0 stage (registered/BRAM read2 — one cycle earlier so the
|
||
// registered data still lands at S1). ras_cur_x/ras_cur_y are the S0
|
||
// walker coords that latch into s1_x_q/s1_y_q at the next edge, so
|
||
// the same coordinate is sampled either way — only the cycle it is
|
||
// presented to VRAM differs. ras_v0_*/ras_u0/ras_v0t/ras_d*_q are
|
||
// per-primitive constants (held for the whole scan) so the DDA is
|
||
// identical regardless of which stage's coords feed it.
|
||
// Coordinate (and its valid) presented to the texel-address path.
//   TEX_RD_REGISTERED != 0 : the S0 walker position, valid while the
//                            raster FSM is in R_SCAN.
//   TEX_RD_REGISTERED == 0 : the S1 latched position and its valid flag.
logic [11:0] tex_coord_x;
logic [11:0] tex_coord_y;
logic        tex_coord_valid;
always_comb begin
    if (TEX_RD_REGISTERED) begin
        tex_coord_x     = ras_cur_x;
        tex_coord_y     = ras_cur_y;
        tex_coord_valid = (raster_state == R_SCAN);
    end else begin
        tex_coord_x     = s1_x_q;
        tex_coord_y     = s1_y_q;
        tex_coord_valid = s1_valid_q;
    end
end
|
||
|
||
// Texture-stage pixel offset from post-swap vertex 0, for the TRI
|
||
// affine U/V. Same offset basis (relative to ras_v0_x/y) and the
|
||
// same right-sized signed-16 narrowing the S2 colour/Z interp uses;
|
||
// here it is evaluated on the texture-stage coords so the sampled
|
||
// texel lines up with the pixel emitted at that location.
|
||
// Signed pixel offset of the texture-stage coordinate from vertex 0.
logic signed [31:0] tex_dx;
logic signed [31:0] tex_dy;
assign tex_dx = $signed({20'd0, tex_coord_x}) - $signed({20'd0, ras_v0_x});
assign tex_dy = $signed({20'd0, tex_coord_y}) - $signed({20'd0, ras_v0_y});

// Ch301 perspective — same offset, but always taken from the S1-stage
// coordinate (s1_x_q / s1_y_q). The perspective divide runs its own longer
// pipeline off S1 (gs_persp_uv 4 cycles + texel 1 cycle): it is gated by the
// S1-timed persp_in_valid and emitted against persp_x5 / persp_y5, which are
// s1_x_q / s1_y_q delayed by 5. The S/T/Q interpolation therefore MUST NOT
// use the S0-based tex_coord that the affine registered-read path uses;
// doing so puts the numerators one pixel ahead of their valid / emit
// coordinate and shifts the texture by one texel.
logic signed [31:0] tex_dx_s1;
logic signed [31:0] tex_dy_s1;
assign tex_dx_s1 = $signed({20'd0, s1_x_q}) - $signed({20'd0, ras_v0_x});
assign tex_dy_s1 = $signed({20'd0, s1_y_q}) - $signed({20'd0, ras_v0_y});

// Triangle affine U/V: interp_affine_uv on the texture-stage offsets.
logic [10:0] s1_u_tri;
logic [10:0] s1_v_tri;
assign s1_u_tri = interp_affine_uv(ras_u0_base, ras_du_dx_t, ras_du_dy_t,
                                   tex_dx, tex_dy);
assign s1_v_tri = interp_affine_uv(ras_v0_base, ras_dv_dx_t, ras_dv_dy_t,
                                   tex_dx, tex_dy);
|
||
|
||
// Ch301 perspective — per-pixel SCREEN-LINEAR interpolated S/T/Q (24-bit),
// produced by the no-clamp interp_affine_wide evaluator.
//
// These are evaluated on the S1-stage offsets tex_dx_s1 / tex_dy_s1, NOT on
// the tex_dx / tex_dy offsets that feed the affine U/V.
//
// They are the perspective NUMERATORS (S, T) and DENOMINATOR (Q). The
// per-pixel divide u = S/Q, v = T/Q is performed by the gs_persp_uv
// instance, which takes them on its uq / vq / q inputs.
//
// GATED: with PERSPECTIVE_CORRECT=0 all three are tied to 0 so no
// perspective datapath is synthesized in the affine-only build.
logic [23:0] s1_s_persp;
logic [23:0] s1_t_persp;
logic [23:0] s1_q_persp;
generate
    if (PERSPECTIVE_CORRECT) begin : g_persp_interp
        assign s1_s_persp = interp_affine_wide(ras_s0_base, ras_ds_dx, ras_ds_dy,
                                               tex_dx_s1, tex_dy_s1);
        assign s1_t_persp = interp_affine_wide(ras_t0_base, ras_dt_dx, ras_dt_dy,
                                               tex_dx_s1, tex_dy_s1);
        assign s1_q_persp = interp_affine_wide(ras_q0_base, ras_dq_dx, ras_dq_dy,
                                               tex_dx_s1, tex_dy_s1);
    end else begin : g_no_persp_interp
        assign {s1_s_persp, s1_t_persp, s1_q_persp} = 72'd0;
    end
endgenerate
|
||
|
||
// U/V presented to the texel-fetch path: SPRITE → linear DDA;
|
||
// TRIANGLE → affine. Selected by ras_tri_active (set only for a TRI).
|
||
// Non-perspective texel coordinate: the linear sprite DDA unless
// ras_tri_active is set, in which case the triangle affine value is used.
logic [10:0] s1_u;
logic [10:0] s1_v;
assign s1_u = !ras_tri_active
            ? dda_uv(tex_coord_x, ras_v0_x, ras_u0,  ras_du_dx_q)
            : s1_u_tri;
assign s1_v = !ras_tri_active
            ? dda_uv(tex_coord_y, ras_v0_y, ras_v0t, ras_dv_dy_q)
            : s1_v_tri;
|
||
|
||
// Ch310 — sub-texel FRACTION paralleling s1_u/s1_v (BILINEAR). On the
|
||
// affine TRI path it is interp_affine_uv_frac over the SAME args as
|
||
// s1_u_tri/s1_v_tri (so the 4-bit frac re-pairs with the integer texel
|
||
// exactly). For ras_persp or a non-TRI (SPRITE) primitive we pass frac=0
|
||
// — bilinear is exercised only on the affine combined TRI, and a 0 frac is
|
||
// the nearest-tap value there, which is safe. These wires are UNREAD by the
|
||
// nearest texel path, so at BILINEAR_ENABLE=0 the default build is unchanged.
|
||
// 4-bit sub-texel fractions paired with s1_u / s1_v. They use the same
// arguments as s1_u_tri / s1_v_tri when ras_tri_active is set, and are 0
// otherwise.
logic [3:0] s1_u_frac;
logic [3:0] s1_v_frac;
assign s1_u_frac = !ras_tri_active
                 ? 4'd0
                 : interp_affine_uv_frac(ras_u0_base, ras_du_dx_t, ras_du_dy_t,
                                         tex_dx, tex_dy);
assign s1_v_frac = !ras_tri_active
                 ? 4'd0
                 : interp_affine_uv_frac(ras_v0_base, ras_dv_dx_t, ras_dv_dy_t,
                                         tex_dx, tex_dy);
|
||
|
||
// ------------------------------------------------------------------
|
||
// Ch302 — PERSPECTIVE-CORRECT emit-side wiring.
|
||
//
|
||
// The per-pixel divide (u = S/Q, v = T/Q) is done by gs_persp_uv
|
||
// (LATENCY=4: in_valid → out_valid). It consumes the screen-linear
|
||
// interpolated S_fp/T_fp/Q_fp (s1_s_persp/s1_t_persp/s1_q_persp),
|
||
// which are valid at the SAME S1 texture stage as the affine
|
||
// s1_u/s1_v. Its integer texel output (persp_u, persp_v) is muxed
|
||
// into the SHARED gs_texture_unit below (a perspective primitive
|
||
// never does affine simultaneously — only one read2 port exists).
|
||
//
|
||
// Pipeline alignment (combinational read2: SEL_DELAY=0, RD_LATENCY=1):
|
||
// S1+0 : gs_persp_uv.in_valid presented (S/T/Q live at S1)
|
||
// S1+4 : persp_outvalid + persp_u/persp_v valid
|
||
// S1+5 : s1_tex_color (perspective texel) valid (1 cyc read latency)
|
||
// AND persp_emit5 (= persp_outvalid delayed 1) high
|
||
// AND persp_x5/persp_y5 (= s1_x_q/s1_y_q delayed 5) aligned
|
||
//
|
||
// GATED: outputs declared here, driven only inside the
|
||
// PERSPECTIVE_CORRECT generate; tied 0 in the else so the param=0
|
||
// build is byte-identical (ras_persp is constant 0 there too).
|
||
// Perspective emit-side signals. They are driven inside the
// PERSPECTIVE_CORRECT generate and tied to 0 in its else branch.
logic [10:0] persp_u;            // integer texel U from the per-pixel divide
logic [10:0] persp_v;            // integer texel V from the per-pixel divide
logic        persp_outvalid;     // gs_persp_uv out_valid
logic [11:0] persp_x5;           // s1_x_q delayed by 5 cycles
logic [11:0] persp_y5;           // s1_y_q delayed by 5 cycles
logic        persp_emit5;        // persp_outvalid delayed by 1 cycle
logic [5:0]  persp_inflight;     // in-flight count for the drain gate
logic        persp_pipe_busy;    // any perspective work pending
generate
    if (PERSPECTIVE_CORRECT) begin : g_persp_emit
        // A divide is launched for each valid, covered S1 pixel of a
        // perspective primitive.
        logic persp_in_valid;
        assign persp_in_valid = ras_persp && s1_valid_q && s1_pixel_inside;

`ifdef CH351_TRACE
        // Ch351 trace: print the interpolated perspective attributes for the
        // scanline selected by the CH351_TRACE macro value. A constant
        // s1_q_persp across x means the slopes are 0/unsolved
        // (under-interpolation); a smooth step points at the divide instead.
        always_ff @(posedge clk) if (persp_in_valid && s1_y_q == `CH351_TRACE)
            $display("CH351 x=%0d y=%0d s=%0d t=%0d q=%0d (ds_dx=%0d dq_dx=%0d q0=%0d)",
                     s1_x_q, s1_y_q, $signed(s1_s_persp), $signed(s1_t_persp), $signed(s1_q_persp),
                     ras_ds_dx, ras_dq_dx, ras_q0_base);
`endif

        gs_persp_uv #(
            .ATTR_W         (24),
            .Q_W            (24),
            .FRAC           (12),
            .SCALE          (24),
            .RECIP_W        (25),
            .RECIP_IDX_BITS (PERSP_RECIP_IDX_BITS),
            .TEXEL_W        (11),
            .TEXEL_MAX      (2047)
        ) u_persp_uv (
            .clk       (clk),
            .rst_n     (rst_n),
            .in_valid  (persp_in_valid),
            .uq        (s1_s_persp),
            .vq        (s1_t_persp),
            .q         (s1_q_persp),
            .out_valid (persp_outvalid),
            .u         (persp_u),
            .v         (persp_v)
        );

        // Five-entry {x,y} delay line fed from the S1 coordinate every
        // cycle; entry [4] is the +5 tap exported as persp_x5 / persp_y5.
        logic [11:0] persp_x_sr [0:4];
        logic [11:0] persp_y_sr [0:4];
        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                for (int k = 0; k < 5; k++) begin
                    persp_x_sr[k] <= 12'd0;
                    persp_y_sr[k] <= 12'd0;
                end
            end else begin
                for (int k = 4; k > 0; k--) begin
                    persp_x_sr[k] <= persp_x_sr[k-1];
                    persp_y_sr[k] <= persp_y_sr[k-1];
                end
                persp_x_sr[0] <= s1_x_q;
                persp_y_sr[0] <= s1_y_q;
            end
        end
        assign persp_x5 = persp_x_sr[4];
        assign persp_y5 = persp_y_sr[4];

        // Emit strobe: persp_outvalid registered once.
        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) persp_emit5 <= 1'b0;
            else        persp_emit5 <= persp_outvalid;
        end

        // In-flight counter: +1 when a divide enters without a simultaneous
        // emit, -1 when an emit fires without a simultaneous entry, and
        // unchanged when both or neither occur.
        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                persp_inflight <= 6'd0;
            end else if (persp_in_valid && !persp_emit5) begin
                persp_inflight <= persp_inflight + 6'd1;
            end else if (!persp_in_valid && persp_emit5) begin
                persp_inflight <= persp_inflight - 6'd1;
            end
        end

        // Busy while the counter is non-zero or either strobe is high.
        assign persp_pipe_busy = persp_emit5 || persp_outvalid
                              || (persp_inflight != 6'd0);
    end else begin : g_no_persp_emit
        assign persp_u         = 11'd0;
        assign persp_v         = 11'd0;
        assign persp_outvalid  = 1'b0;
        assign persp_emit5     = 1'b0;
        assign persp_x5        = 12'd0;
        assign persp_y5        = 12'd0;
        assign persp_inflight  = 6'd0;
        assign persp_pipe_busy = 1'b0;
    end
endgenerate
|
||
|
||
// Shared texture unit: outputs.
//   s1_tex_color    sampled texel colour
//   s1_tex_outvalid Ch310 — pulses when the 4-tap bilinear sample completes
//   tex_busy        Ch310 — high across the multi-beat bilinear run
// On the nearest path (BILINEAR_ENABLE=0, or non-PSMCT32 / MMAG=0) busy
// stays 0 and out_valid follows the 1-cycle read, so the combined FSM never
// enters CB_TWAIT and the build is byte-identical.
logic [31:0] s1_tex_color;
logic        s1_tex_outvalid;
logic        tex_busy;

gs_texture_unit #(
    // Read-port timing. Ch296 — SEL_DELAY re-pairs the PSMT8 byte selector
    // with the in-flight registered read when the texel address advances
    // each cycle (TEX_RD_REGISTERED=1). Combinational-read mode holds the
    // address stable, so SEL_DELAY=0 there.
    .RD_LATENCY       (TEX_RD_LATENCY),
    .SEL_DELAY        (TEX_RD_REGISTERED ? TEX_RD_LATENCY : 0),
    // Layout and feature switches, passed straight through.
    .PSMCT32_SWIZZLE  (PSMCT32_SWIZZLE),
    .PSMT8_SWIZZLE    (PSMT8_SWIZZLE),
    .PSMT4_SWIZZLE    (PSMT4_SWIZZLE),
    .TEX_WRAP_ENABLE  (TEX_WRAP_ENABLE),
    .BILINEAR_ENABLE  (BILINEAR_ENABLE),
    .PALETTE_BILINEAR (PALETTE_BILINEAR)
) u_tex (
    .clk             (clk),
    .rst_n           (rst_n),

    // Request strobe, by priority:
    //   combined mode  -> comb_tex_req. The texel fetch is issued only on
    //                     beat 1, so tex_rd_en fires for exactly one cycle
    //                     per passing pixel. ras_combined is constant 0 at
    //                     COMBINED_TAZ=0, collapsing this to the legacy
    //                     expression.
    //   perspective    -> persp_outvalid (Ch302; persp_u/v valid at S1+4).
    //   otherwise      -> a valid texture-stage coordinate on a texturing
    //                     pixel, suppressed on the frozen beat of a
    //                     textured-alpha sprite (Ch344: texel on the
    //                     primary beat only).
    .in_valid        (ras_combined ? comb_tex_req
                      : (ras_persp ? persp_outvalid
                         : (tex_coord_valid && s1_tex_active
                            && (!ras_tex_abe || !ta_beat_q)))),

    // Texel coordinate. A perspective primitive never does affine at the
    // same time, so muxing on ras_persp is exclusive; with ras_persp=0
    // (always so at PERSPECTIVE_CORRECT=0) these are the affine inputs. In
    // combined mode s1_u / s1_v already track the HELD walker pixel
    // (tex_coord_x/y == ras_cur_x/y under TEX_RD_REGISTERED=1 while the
    // walker is stalled), so they address the correct affine texel.
    .u               (ras_persp ? persp_u : s1_u),
    .v               (ras_persp ? persp_v : s1_v),

    // Ch310 — fractional UV for BILINEAR; perspective passes 0 (the
    // bilinear demo uses the affine TRI path). Unused at BILINEAR_ENABLE=0.
    .u_frac          (ras_persp ? 4'd0 : s1_u_frac),
    .v_frac          (ras_persp ? 4'd0 : s1_v_frac),

    // Per-primitive texture state.
    .filter_lin      (ras_filter_lin),   // Ch310 — TEX1_1.MMAG snapshot
    .psm             (ras_tpsm),
    .tbp0_base_bytes (ras_tex_base),
    .tbw             (ras_tbw),
    .tw              (ras_tw),
    .th              (ras_th),
    .wms             (ras_wms),
    .wmt             (ras_wmt),

    // VRAM texel read port.
    .tex_rd_en       (tex_rd_en),
    .tex_rd_addr     (tex_rd_addr),
    .tex_rd_data     (tex_rd_data),

    // CLUT lookup port.
    .clut_rd_idx     (clut_rd_idx),
    .clut_rd_data    (clut_rd_data),

    // Results.
    .tex_color       (s1_tex_color),
    .out_valid       (s1_tex_outvalid),  // Ch310 — consumed by CB_TWAIT; nearest path ignores it
    .busy            (tex_busy)          // Ch310 — high across the 4-tap run; 0 on nearest
);
|
||
|
||
// S1 -> S2 texel register. s2_tex_active_q gates the substitution
|
||
// at emit; s2_tex_color_q is the sampled ABGR for the S2 pixel.
|
||
logic [31:0] s2_tex_color_q;    // texel colour registered alongside the S2 pixel
logic        s2_tex_active_q;   // S2 pixel is textured (gates the substitution at emit)
|
||
|
||
// ------------------------------------------------------------------
|
||
// Brick 2a — S3 alpha-blend stage.
|
||
//
|
||
// When ras_abe is active, an S2 inside-pixel does NOT drive the
|
||
// emit outputs directly. Instead it presents the dest-read address
|
||
// (= the pixel's write address) on fb_rd_addr/fb_rd_en during the
|
||
// S2 cycle and latches a "pending blend" record into the S3 stage.
|
||
// One cycle later, S3 blends the source color (s3_cs_*) against the
|
||
// dest pixel read back from VRAM and drives the emit outputs. This
|
||
// adds exactly ONE pipeline cycle for blended primitives; opaque
|
||
// primitives (ras_abe=0) keep the original direct S2->output emit
|
||
// and are byte-identical to pre-Brick-2a.
|
||
// ------------------------------------------------------------------
|
||
// S3 pending-blend record.
logic        s3_valid_q;      // a blended pixel is in the S3 stage
logic [11:0] s3_x_q;
logic [11:0] s3_y_q;
logic [31:0] s3_fb_addr_q;
logic [5:0]  s3_psm_q;
logic [3:0]  s3_be_q;
logic [31:0] s3_mask_q;
logic [63:0] s3_cs_q;         // source color (ABGR + Q), pre-blend
logic [7:0]  s3_as_q;         // source alpha (RGBAQ.A, 0..128 scale)
logic [31:0] s3_dest_q;       // dest pixel (PSMCT32), comb-read latch

// Destination pixel handed to the blender: live fb_rd_data when
// FB_RD_REGISTERED is set, otherwise the latched s3_dest_q copy.
logic [31:0] s3_dest_pixel;
assign s3_dest_pixel = FB_RD_REGISTERED ? fb_rd_data
                                        : s3_dest_q;
|
||
|
||
// ------------------------------------------------------------------
|
||
// Brick 2b — S3 Z-test stage + half-rate 2-beat write sequencing.
|
||
//
|
||
// A Z-tested inside pixel presents its Z-buffer address on z_rd_*
|
||
// during S2 and latches a "pending Z-test" record into the S3 stage
|
||
// (mirrors the alpha S3 mechanism). One cycle later, S3 compares the
|
||
// fragment Z (constant flat ras_z_value) against the stored Z read
|
||
// back from VRAM, per ZTST. On PASS the pixel needs TWO writes
|
||
// through the single VRAM write port: the fb COLOR and (if ZMSK=0)
|
||
// the Z VALUE. We serialize them across two beats and freeze the
|
||
// pixel pipeline for the second beat so the next pixel's color does
|
||
// not contend for the port.
|
||
//
|
||
// z_beat: 0 = "primary" beat (pipeline advances; color write fires
|
||
// on a passing pixel and a pending Z write is queued);
|
||
// 1 = "z" beat (pipeline frozen; the queued Z write fires).
|
||
// z_beat toggles every cycle ONLY while ras_zte is active and a scan
|
||
// is in flight. When ras_zte=0 it is held at 0 and the pipeline runs
|
||
// full rate — byte-identical to pre-Brick-2b.
|
||
// ------------------------------------------------------------------
|
||
// S3 pending-Z-test record.
logic        s3_zvalid_q;     // a Z-tested pixel is in the S3 stage
logic [11:0] s3_zx_q;
logic [11:0] s3_zy_q;
logic [31:0] s3_zfb_addr_q;   // framebuffer (color) byte address
logic [31:0] s3_z_addr_q;     // Z-buffer byte address
logic [63:0] s3_zcolor_q;     // fb color to write on pass (PSMCT32)
logic [31:0] s3_zval_q;       // fragment Z to write on pass
logic [31:0] s3_zstored_q;    // stored Z, captured for the compare
logic [1:0]  s3_ztst_q;       // compare op
logic        s3_zmsk_q;       // 1 => do not update Z

// Stored Z feeding the compare: ALWAYS the latched copy. Under the
// half-rate 2-beat sequencing the S2->S3 distance is 2 cycles, so a live
// registered z_rd_data at S3 (the alpha-style trick) would be off by one
// pixel. s3_zstored_q is instead captured on the FROZEN z beat that
// immediately follows the S2 primary beat (see the pipeline body).
// z_rd_addr has been held across that primary->frozen edge, so both the
// 1-cycle registered read and the combinational read present this pixel's
// stored Z there — one capture point serves both Z_RD_REGISTERED settings.
logic [31:0] s3_zstored;
assign s3_zstored = s3_zstored_q;
|
||
|
||
// Depth comparison (PS2: larger Z = nearer the viewer).
|
||
// NEVER : never pass
|
||
// ALWAYS : always pass
|
||
// GEQUAL : frag_Z >= stored_Z
|
||
// GREATER : frag_Z > stored_Z
|
||
// Depth-test verdict for the pixel in the S3 Z stage. Both operands are
// unsigned 32-bit values.
logic z_test_pass;
always_comb begin
    unique case (s3_ztst_q)
        2'b11:   z_test_pass = (s3_zval_q >  s3_zstored);   // GREATER
        2'b10:   z_test_pass = (s3_zval_q >= s3_zstored);   // GEQUAL
        2'b01:   z_test_pass = 1'b1;                        // ALWAYS
        2'b00:   z_test_pass = 1'b0;                        // NEVER
        default: z_test_pass = 1'b0;                        // unknown selector -> fail
    endcase
end
|
||
|
||
// Half-rate beat toggle. Combinational "is this a Z scan" gate.
|
||
// A Z scan is active when ZTE is set and the raster FSM is not idle.
logic z_scan_active;
assign z_scan_active = (raster_state != R_IDLE) && ras_zte;

// Beat phase: 0 = primary, 1 = z beat.
logic z_beat_q;

// The pipeline advances on every cycle except the z beat of an active
// Z scan.
logic z_advance;
assign z_advance = !z_scan_active || !z_beat_q;

// Z write queued on the primary beat by a passing pixel that must update
// Z; consumed on the z beat.
logic        zw_pending_q;
logic [31:0] zw_addr_q;
logic [31:0] zw_val_q;
|
||
|
||
// S3 blender: source colour / alpha from the S3 record against the
// destination pixel selected by s3_dest_pixel. Byte lanes are
// R = [7:0], G = [15:8], B = [23:16].
logic [7:0] blend_r;
logic [7:0] blend_g;
logic [7:0] blend_b;
logic [7:0] blend_a;
gs_alpha_blend u_blend (
    // source
    .as    (s3_as_q),
    .cs_r  (s3_cs_q[7:0]),
    .cs_g  (s3_cs_q[15:8]),
    .cs_b  (s3_cs_q[23:16]),
    // destination
    .cd_r  (s3_dest_pixel[7:0]),
    .cd_g  (s3_dest_pixel[15:8]),
    .cd_b  (s3_dest_pixel[23:16]),
    // result
    .cv_r  (blend_r),
    .cv_g  (blend_g),
    .cv_b  (blend_b),
    .a_out (blend_a)
);

// 64-bit emit word: upper 32 bits carried through from the source record
// (Q), lower 32 bits the blender outputs packed as {A, B, G, R}.
logic [63:0] s3_blended_color64;
assign s3_blended_color64 = {s3_cs_q[63:32], blend_a, blend_b, blend_g, blend_r};
|
||
|
||
// ==================================================================
|
||
// Ch344 — TEXTURED + source-over ALPHA SPRITE path (isolated; NOT the
|
||
// combined-TAZ compositor). A ras_tex_abe SPRITE runs HALF-RATE so the
|
||
// texel read (S1, read2) and the dest read (frozen beat, read2) never
|
||
// co-fire — same mutual-exclusion guarantee as Brick 2b's Z sprite:
|
||
// primary beat (ta_beat=0): texel fetch (tex_rd); latched to s2_tex_color_q.
|
||
// frozen beat (ta_beat=1): dest fetch (fb_rd) -> ta_dest_q.
|
||
// next primary beat : blend texel*vtx-color over ta_dest_q, emit.
|
||
// Source alpha = TEXEL alpha (TCC=1). Source color = texel MODULATEd by the
|
||
// per-vertex RGBAQ tint (ras_color); a unity 0x80 vertex tint = pass-through.
|
||
// ras_tex_abe is constant 0 at SPRITE_TEX_ALPHA=0, so all of this prunes.
|
||
// Textured-alpha scan is active when ras_tex_abe is set and the raster FSM
// is not idle.
logic ta_scan_active;
assign ta_scan_active = (raster_state != R_IDLE) && ras_tex_abe;

logic ta_beat_q;                // 0 = primary (texel), 1 = frozen (dest)

// The pipeline advances on every cycle except the frozen beat of an active
// textured-alpha scan.
logic ta_advance;
assign ta_advance = !ta_scan_active || !ta_beat_q;
|
||
// This path targets the BOARD wrapper's read model: TEX_RD_REGISTERED=1 + FB_RD_REGISTERED=1 (1-cycle
|
||
// registered read2). Under that model the streaming s2_tex_color_q is captured on the advance beat —
|
||
// one cycle too early for the texel (which lands on the frozen beat) — so we capture our OWN copies:
|
||
// ta_tex_q <= s1_tex_color on the FROZEN beat (the texel for THIS S2 pixel is the registered result
|
||
// of the primary-beat texel request — exactly how the combined FSM latches it at CB_T), and
|
||
// the dest = fb_rd_data live on the primary/emit beat (registered result of the frozen-beat fb_rd).
|
||
// Texel alignment to S2 (registered read2): the texel for a pixel is the registered result of the
|
||
// texel request issued when that pixel was at S1 (primary beat) — it lands on the NEXT (frozen) beat.
|
||
// But the half-rate stall delays the pixel reaching S2 by an extra beat, so we capture s1_tex_color
|
||
// UNCONDITIONALLY on every frozen beat into ta_tex_q and keep a 1-deep delayed copy ta_tex_q1; the
|
||
// delayed copy is exactly aligned with the S2 pixel at its emit beat (verified cycle-by-cycle).
|
||
logic [31:0] ta_tex_q;          // newest sampled texel (frozen-beat capture)
logic [31:0] ta_tex_q1;         // 1-deep delay of ta_tex_q -> aligned with the S2 pixel being emitted
logic [31:0] ta_dest_q;         // dest captured on the frozen beat (combinational fallback only)
|
||
function automatic logic [7:0] ta_mod8(input logic [7:0] t, input logic [7:0] c);
    // MODULATE one 8-bit channel: (t * c) >> 7, saturated to 0xFF.
    // c = 0x80 is unity: (t * 128) >> 7 == t.
    logic [15:0] t_wide;
    logic [15:0] c_wide;
    logic [15:0] scaled;
    t_wide = t;                          // zero-extend to the product width
    c_wide = c;
    scaled = (t_wide * c_wide) >> 7;     // 255*255 = 65025 fits in 16 bits
    return (scaled > 16'd255) ? 8'hFF : scaled[7:0];
endfunction
|
||
// Source colour: each RGB lane of the aligned texel (ta_tex_q1) MODULATEd
// by the matching ras_color lane; the texel's alpha byte is kept as-is.
logic [31:0] ta_cs;
assign ta_cs = {
    ta_tex_q1[31:24],
    ta_mod8(ta_tex_q1[23:16], ras_color[23:16]),
    ta_mod8(ta_tex_q1[15:8],  ras_color[15:8]),
    ta_mod8(ta_tex_q1[7:0],   ras_color[7:0])
};

// Destination colour: live fb_rd_data when FB_RD_REGISTERED is set,
// otherwise the captured ta_dest_q.
logic [31:0] ta_cd;
assign ta_cd = FB_RD_REGISTERED ? fb_rd_data
                                : ta_dest_q;

// Blend using the texel alpha as source alpha.
logic [7:0] ta_br;
logic [7:0] ta_bg;
logic [7:0] ta_bb;
logic [7:0] ta_ba;
gs_alpha_blend u_ta_blend (
    .as    (ta_tex_q1[31:24]),
    .cs_r  (ta_cs[7:0]),
    .cs_g  (ta_cs[15:8]),
    .cs_b  (ta_cs[23:16]),
    .cd_r  (ta_cd[7:0]),
    .cd_g  (ta_cd[15:8]),
    .cd_b  (ta_cd[23:16]),
    .cv_r  (ta_br),
    .cv_g  (ta_bg),
    .cv_b  (ta_bb),
    .a_out (ta_ba)
);

// 64-bit emit word: upper half from ras_color, lower half {A, B, G, R}.
logic [63:0] ta_blended_color64;
assign ta_blended_color64 = {ras_color[63:32], ta_ba, ta_bb, ta_bg, ta_br};

// Dest-read enable: an inside, valid S2 pixel on the frozen beat of a
// textured-alpha primitive. The texel request is gated to the primary
// beat, so the two read2 consumers are time-exclusive.
logic s2_tex_abe_dest;
assign s2_tex_abe_dest = ta_beat_q && ras_tex_abe && s2_valid_q && s2_inside_q;
|
||
|
||
// Brick 3 — SYNTHESIZABLE per-pixel affine attribute interpolation,
|
||
// evaluated in the S2 stage. The popped Q16.16 gradients (ras_d*_dx /
|
||
// ras_d*_dy) and per-vertex-0 base values are constant for the
|
||
// in-flight primitive; S2 only needs s2_x_q/s2_y_q (relative to v0)
|
||
// to accumulate. Pure multiply + add + arithmetic shift — NO divide,
|
||
// NO `// synthesis translate_off`. This REPLACES the Ch86 non-
|
||
// synthesizable barycentric divide (interp_byte). The s2_L*_q
|
||
// barycentric weights are no longer used for interpolation (kept only
|
||
// as legacy white-box probes; the coverage test still uses the edge
|
||
// functions directly).
|
||
//
|
||
// A(x,y) = base0 + ( dx*dAdx + dy*dAdy ) >> 16
|
||
// dx = s2_x - v0_x , dy = s2_y - v0_y
|
||
//
|
||
// Color channels clamp to [0,255]; Z keeps the full 32-bit value.
|
||
// Signed offset of the S2 pixel from vertex 0 (both coordinates
// zero-extended before the subtract).
logic signed [31:0] s2_dx;
logic signed [31:0] s2_dy;
assign s2_dx = $signed({20'd0, s2_x_q}) - $signed({20'd0, ras_v0_x});
assign s2_dy = $signed({20'd0, s2_y_q}) - $signed({20'd0, ras_v0_y});
|
||
|
||
// Ch296 — per-pixel multiplier resize (Quartus area fit). The
|
||
// affine step is `dadx*dx + dady*dy`. `dadx`/`dady` are signed
|
||
// Q16.16 gradients (full 32-bit). `dx`/`dy` are PIXEL offsets from
|
||
// the reference vertex: dx = s2_x_q - ras_v0_x with both operands
|
||
// 12-bit UNSIGNED screen coords ([0,4095]), so dx,dy ∈ [-4095,+4095]
|
||
// — strictly inside signed 16-bit ([-32768,+32767]) with ample
|
||
// margin (a signed 13-bit range would already suffice). The prior
|
||
// code sign-extended BOTH operands to 64 bits, synthesizing a
|
||
// 64×64 multiplier per attribute (~8 instances). Sizing dx/dy to
|
||
// signed 16 bits makes each a 32×16 signed multiply whose product
|
||
// (≤48 bits) and the 2-term sum (≤49 bits) are LOSSLESSLY captured
|
||
// by the 64-bit `step`/`acc`/`val` accumulators — so the accumulate,
|
||
// `>>> 16`, base add, and clamp are BIT-IDENTICAL to before. The
|
||
// dxw/dyw narrowing is a guaranteed value-preserving truncation
|
||
// because the high bits are pure sign extension of the 16-bit value
|
||
// (see the bound above). tb_gs_tri_interp asserts dx/dy in range.
|
||
//
|
||
// Affine evaluator for one 8-bit colour channel:
//     result = clamp( base0 + ((dadx*dx + dady*dy) >>> 16), 0, 255 )
//
//   base0      channel value at the reference vertex
//   dadx, dady signed Q16.16 gradients
//   dx, dy     signed pixel offsets; only the low 16 bits are used
function automatic logic [7:0] interp_affine8(
    input logic        [7:0]  base0,
    input logic signed [31:0] dadx,
    input logic signed [31:0] dady,
    input logic signed [31:0] dx,
    input logic signed [31:0] dy);
    logic signed [15:0] off_x, off_y;     // pixel offsets at their true width
    logic signed [47:0] term_x, term_y;   // 32x16 signed products
    logic signed [63:0] sum_q16;          // Q16.16 gradient contribution
    logic signed [63:0] level;            // unclamped channel value

    off_x   = dx[15:0];
    off_y   = dy[15:0];
    term_x  = dadx * off_x;
    term_y  = dady * off_y;
    sum_q16 = term_x + term_y;            // signed operands sign-extend to 64 bits
    level   = (sum_q16 >>> 16) + $signed({1'b0, base0});

    if (level > 64'sd255)    return 8'd255;
    else if (level < 64'sd0) return 8'd0;
    else                     return level[7:0];
endfunction
|
||
|
||
// Affine evaluator for the 32-bit fragment Z:
//     result = low 32 bits of ( base0 + ((dadx*dx + dady*dy) >>> 16) )
// No clamp is applied; the sum is simply truncated to 32 bits.
//
//   base0      Z at the reference vertex (treated as unsigned)
//   dadx, dady signed Q16.16 gradients
//   dx, dy     signed pixel offsets; only the low 16 bits are used
function automatic logic [31:0] interp_affine_z(
    input logic        [31:0] base0,
    input logic signed [31:0] dadx,
    input logic signed [31:0] dady,
    input logic signed [31:0] dx,
    input logic signed [31:0] dy);
    logic signed [15:0] off_x, off_y;     // pixel offsets at their true width
    logic signed [47:0] term_x, term_y;   // 32x16 signed products
    logic signed [63:0] sum_q16;          // Q16.16 gradient contribution
    logic signed [63:0] depth;

    off_x   = dx[15:0];
    off_y   = dy[15:0];
    term_x  = dadx * off_x;
    term_y  = dady * off_y;
    sum_q16 = term_x + term_y;            // signed operands sign-extend to 64 bits
    depth   = (sum_q16 >>> 16) + $signed({1'b0, base0});
    return depth[31:0];
endfunction
|
||
|
||
// Textured-triangle rung — affine evaluator for an 11-bit texel
|
||
// coordinate (U or V). IDENTICAL right-sized 32x16 multiply
|
||
// structure as interp_affine8/_z (dadx/dady are signed Q16.16
|
||
// gradients; dx/dy are the same signed-16 pixel offsets). Clamp to
|
||
// the 0..2047 texel range so an out-of-range affine extrapolation
|
||
// (off-triangle rounding) saturates instead of wrapping. NO new
|
||
// multiplier width: prod_x/prod_y are the same 32x16 signed products
|
||
// the colour/Z evaluators use.
|
||
function automatic logic [10:0] interp_affine_uv(
    input logic        [10:0] base0,
    input logic signed [31:0] dadx,
    input logic signed [31:0] dady,
    input logic signed [31:0] dx,
    input logic signed [31:0] dy);
    // result = clamp( base0 + ((dadx*dx + dady*dy) >>> 16), 0, 2047 )
    // dadx / dady are signed Q16.16 gradients; only the low 16 bits of
    // dx / dy are used as the signed pixel offsets.
    logic signed [15:0] off_x, off_y;     // pixel offsets at their true width
    logic signed [47:0] term_x, term_y;   // 32x16 signed products
    logic signed [63:0] sum_q16;          // Q16.16 gradient contribution
    logic signed [63:0] texel;            // unclamped texel coordinate

    off_x   = dx[15:0];
    off_y   = dy[15:0];
    term_x  = dadx * off_x;
    term_y  = dady * off_y;
    sum_q16 = term_x + term_y;            // signed operands sign-extend to 64 bits
    texel   = (sum_q16 >>> 16) + $signed({1'b0, base0});

    if (texel > 64'sd2047)   return 11'd2047;
    else if (texel < 64'sd0) return 11'd0;
    else                     return texel[10:0];
endfunction
|
||
|
||
// Ch310 — sub-texel FRACTION companion to interp_affine_uv (used for BILINEAR).
// Same 32x16 multiply/accumulate; where interp_affine_uv returns the integer
// texel left after the >>>16, this returns the top 4 of the 16 bits that shift
// discards. base0 is an integer texel coordinate, so the whole fraction lives
// in the low 16 bits of the accumulated step, and bits [15:12] are the /16
// weight the 4-tap lerp consumes.
// When the integer part would clamp (below 0 or above 2047, the same limits
// interp_affine_uv applies) the fraction is forced to 0, so a clamped edge tap
// blends to exactly the boundary texel.
// iverilog-12 note: bit-selecting a parenthesized expression is not accepted,
// so the sum is held in a named temporary before the [15:12] select.
function automatic logic [3:0] interp_affine_uv_frac(
        input logic        [10:0] base0,
        input logic signed [31:0] dadx,
        input logic signed [31:0] dady,
        input logic signed [31:0] dx,
        input logic signed [31:0] dy);
    logic signed [15:0] off_x, off_y;     // pixel offsets narrowed to 16 bits
    logic signed [47:0] term_x, term_y;   // 32x16 signed products
    logic signed [63:0] delta;            // accumulated step, 16 fractional bits
    logic signed [63:0] texel;            // unclamped integer texel
    logic               saturates;        // integer part is outside 0..2047

    off_x  = dx[15:0];
    off_y  = dy[15:0];
    term_x = dadx * off_x;
    term_y = dady * off_y;
    delta  = $signed({{16{term_x[47]}}, term_x})
           + $signed({{16{term_y[47]}}, term_y});
    texel  = $signed({53'd0, base0}) + (delta >>> 16);

    saturates = (texel < 64'sd0) || (texel > 64'sd2047);
    interp_affine_uv_frac = saturates ? 4'd0 : delta[15:12];
endfunction
|
||
|
||
// Ch301 perspective — WIDE affine evaluator for a 24-bit perspective
// attribute (S_fp / T_fp / Q_fp, FRAC=12). The multiply structure matches
// interp_affine_uv, but the raw 24-bit linear interpolant is returned with
// NO [0,2047] texel clamp: these values are the numerator/denominator fed to
// the per-pixel divide (gs_persp_uv, emit side), not texel coordinates.
//   base0     : 24-bit attribute value at post-swap v0
//   dadx/dady : signed Q16.16 gradients
//   dx/dy     : the same signed-16 pixel offsets the affine U/V use
// Negative or overflowing results are simply truncated to 24 bits; the
// gradient solve keeps in-triangle pixels in range, and off-triangle
// extrapolation is bounded later by the emit-side divide / sampler.
function automatic logic [23:0] interp_affine_wide(
        input logic        [23:0] base0,
        input logic signed [31:0] dadx,
        input logic signed [31:0] dady,
        input logic signed [31:0] dx,
        input logic signed [31:0] dy);
    logic signed [15:0] off_x, off_y;     // pixel offsets narrowed to 16 bits
    logic signed [47:0] term_x, term_y;   // 32x16 signed products
    logic signed [63:0] delta;            // accumulated step, 16 fractional bits
    logic signed [63:0] whole;            // delta with the fraction shifted out
    logic signed [63:0] sum;              // base0 + whole, before truncation

    off_x  = dx[15:0];
    off_y  = dy[15:0];
    term_x = dadx * off_x;
    term_y = dady * off_y;
    delta  = $signed({{16{term_x[47]}}, term_x})
           + $signed({{16{term_y[47]}}, term_y});
    whole  = delta >>> 16;
    sum    = $signed({40'd0, base0}) + whole;

    interp_affine_wide = sum[23:0];
endfunction
|
||
|
||
// Per-channel colour at the S2-stage pixel: each 8-bit lane of ras_c0_q is
// run through interp_affine8 with that channel's gradients and the S2 pixel
// offset (s2_dx, s2_dy).
logic [7:0] s2_interp_r;
logic [7:0] s2_interp_g;
logic [7:0] s2_interp_b;
logic [7:0] s2_interp_a;
assign s2_interp_r = interp_affine8(ras_c0_q[ 7: 0], ras_dr_dx, ras_dr_dy, s2_dx, s2_dy);
assign s2_interp_g = interp_affine8(ras_c0_q[15: 8], ras_dg_dx, ras_dg_dy, s2_dx, s2_dy);
assign s2_interp_b = interp_affine8(ras_c0_q[23:16], ras_db_dx, ras_db_dy, s2_dx, s2_dy);
assign s2_interp_a = interp_affine8(ras_c0_q[31:24], ras_da_dx, ras_da_dy, s2_dx, s2_dy);

// Ch335 — combined-path interpolated (Gouraud) colour. Evaluated at the HELD
// candidate pixel's offset (comb_dx/comb_dy) so it lines up with s1_tex_color
// at CB_T. A FLAT primitive has zero gradients, so this reduces to ras_c0_q
// (Ch333/334 flat-colour scenes stay byte-identical); non-zero gradients give
// a smooth ramp. The top byte of comb_interp_color is tied to zero.
wire [7:0]  comb_interp_r;
wire [7:0]  comb_interp_g;
wire [7:0]  comb_interp_b;
wire [31:0] comb_interp_color;
assign comb_interp_r     = interp_affine8(ras_c0_q[ 7: 0], ras_dr_dx, ras_dr_dy, comb_dx, comb_dy);
assign comb_interp_g     = interp_affine8(ras_c0_q[15: 8], ras_dg_dx, ras_dg_dy, comb_dx, comb_dy);
assign comb_interp_b     = interp_affine8(ras_c0_q[23:16], ras_db_dx, ras_db_dy, comb_dx, comb_dy);
assign comb_interp_color = {8'd0, comb_interp_b, comb_interp_g, comb_interp_r};

// Brick 3 — per-pixel interpolated fragment Z for a TRI. A SPRITE uses the
// flat ras_z_value instead; that selection is made at the Z-test latch.
logic [31:0] s2_interp_z;
assign s2_interp_z = interp_affine_z(ras_z0, ras_dz_dx, ras_dz_dy, s2_dx, s2_dy);

// 64-bit S2 colour word: upper half is ras_c2_q[63:32] passed through (Q),
// lower half is the interpolated channels packed as {A, B, G, R}.
logic [63:0] s2_interp_color;
assign s2_interp_color = {ras_c2_q[63:32],
                          s2_interp_a, s2_interp_b, s2_interp_g, s2_interp_r};
|
||
|
||
// ----------------------------------------------------------------
// Ch95 — PSM-aware emit packing.
//
// The natural PSMCT32 lane is the 32-bit ABGR coming from Gouraud (TRI)
// or `ras_color` (SPRITE). 16-bit PSMs (PSMCT16/PSMCT16S/PSMZ16/PSMZ16S,
// all with ras_bpp_shift==1) are packed ABGR -> RGB5A1 from the top five
// bits of each colour channel plus the alpha MSB:
//     r5 = R[7:3], g5 = G[7:3], b5 = B[7:3], a1 = A[7]
//     pix16 = {a1, b5, g5, r5}
// The halfword sits in the LOW 16 bits of the emit lane with
// `raster_pixel_be_q = 4'b0011`, so vram_stub commits only the two bytes
// at write_addr. write_addr is the byte address from the PSM-aware
// fb_addr math (Ch83): 2-byte aligned for PSMCT16 but not necessarily
// 4-byte aligned, which the per-byte enables make safe.
// ----------------------------------------------------------------
logic [31:0] s2_natural_abgr;
logic [4:0]  s2_pack_r5, s2_pack_g5, s2_pack_b5;
logic        s2_pack_a1;
logic [15:0] s2_pack_psm16;
logic [7:0]  s2_pack_t8_index;
logic [3:0]  s2_pack_t4_nibble;
logic        s2_psm_is_16bit, s2_psm_is_t8, s2_psm_is_t4;
logic [63:0] s2_emit_color64;
logic [3:0]  s2_emit_be;
logic [31:0] s2_emit_mask;

// Brick 1 — SPRITE colour. When s2_tex_active_q is high (textured SPRITE:
// PSMCT32 direct or, Ch296, PSMT8 indexed DECAL) the sampled texel replaces
// the low 32 bits of the flat colour; otherwise the latched ras_color is
// used unchanged, so the untextured SPRITE path matches pre-Brick-1.
logic [63:0] s2_sprite_color64;
assign s2_sprite_color64 = s2_tex_active_q ? {ras_color[63:32], s2_tex_color_q}
                                           : ras_color;

// Textured-triangle rung — same substitution for a TRIANGLE (DECAL): with
// s2_tex_active_q high the texel replaces the interpolated Gouraud colour,
// otherwise s2_interp_color passes through untouched and the legacy Gouraud
// emit is byte-identical. ZTE/ABE are OFF for this rung (architect scope),
// so the texel reaches the emit colour with no blend/Z interaction.
logic [63:0] s2_tri_color64;
assign s2_tri_color64 = s2_tex_active_q ? {s2_interp_color[63:32], s2_tex_color_q}
                                        : s2_interp_color;

// 32-bit ABGR candidate for the current primitive mode.
assign s2_natural_abgr = (s2_mode_q == RM_TRI) ? s2_tri_color64[31:0]
                                               : s2_sprite_color64[31:0];

// RGB5A1 fields and the assembled 16-bit pixel.
assign s2_pack_a1    = s2_natural_abgr[31];
assign s2_pack_b5    = s2_natural_abgr[23:19];
assign s2_pack_g5    = s2_natural_abgr[15:11];
assign s2_pack_r5    = s2_natural_abgr[7:3];
assign s2_pack_psm16 = {s2_pack_a1, s2_pack_b5, s2_pack_g5, s2_pack_r5};

// Indexed formats take their index from the R channel.
assign s2_pack_t8_index  = s2_natural_abgr[7:0];   // Ch105: real-GS uses R as PSMT8 index
assign s2_pack_t4_nibble = s2_natural_abgr[3:0];   // Ch106: low 4 bits of R as PSMT4 index

// Format classification used by the emit mux.
assign s2_psm_is_t4    = (ras_psm == 6'h14);
assign s2_psm_is_t8    = (ras_bpp_shift == 2'd0);
assign s2_psm_is_16bit = (ras_bpp_shift == 2'd1);
|
||
// Ch105 — PSMT8: fb_addr is one byte per pixel, so write_addr is the exact
// byte address. vram_stub commits mem[write_addr] from data[7:0] when
// write_be[0] is set; a 1-byte commit at any alignment is therefore just
// be=4'b0001 with the index in the low byte of write_data (no lane shift).
//
// Ch106 — PSMT4: two pixels share a byte. Byte address = pixel_index >> 1,
// and pixel_index[0] chooses the low (0) or high (1) nibble. The emit uses
// be=4'b0001 with write_mask 0x0000_000F (low) or 0x0000_00F0 (high) so
// vram_stub merges only the targeted nibble and keeps the other one.
// Back-to-back writes to the same byte (e.g. PSMT4 pixels x=0 and x=1)
// chain through NBA semantics: the second write samples mem[addr] AFTER the
// first commit, so both nibbles land in the byte.
//
// Ch140 — PSMT4 nibble selector. The linear path keys on s2_pixel_index[0]
// (Ch106). With PSMT4_SWIZZLE=1 the swizzle module's nibble_hi output is
// used instead, because the canonical PCSX2 columnTable4 reorders nibbles
// inside a block and s2_pixel_index[0] would be wrong.
// NOTE: swizzle4_raster_nibble_hi (and s2_pixel_index) are declared further
// down in the module; this is a forward reference at elaboration time, which
// iverilog accepts for net assignments.
logic psmt4_raster_nibble_select;
assign psmt4_raster_nibble_select =
    PSMT4_SWIZZLE ? swizzle4_raster_nibble_hi : s2_pixel_index[0];
|
||
|
||
// Emit-format mux: chooses the 64-bit emit word, byte enables and write mask
// for the S2 pixel. Priority is PSMT4, then PSMT8, then 16-bit, then PSMCT32.
always_comb begin : s2_emit_pack
    // Q word (upper 32 bits) carried by the three narrow formats, kept for
    // trace-side consistency: from the interpolated colour for a TRI, from
    // the latched ras_color otherwise.
    logic [31:0] q_upper;
    q_upper = (s2_mode_q == RM_TRI) ? s2_interp_color[63:32]
                                    : ras_color[63:32];

    // Every format except PSMT4 writes through an all-ones mask.
    s2_emit_mask = 32'hFFFF_FFFF;

    if (s2_psm_is_t4) begin
        // PSMT4 (4 bits/pixel): one byte lane is enabled and the mask picks
        // which nibble of that byte is merged. The index is positioned at
        // [7:4] for the high nibble or [3:0] for the low nibble to match.
        // The selector is linear x[0] with PSMT4_SWIZZLE=0, or the swizzle's
        // nibble_hi when the gate is on.
        s2_emit_color64 = {q_upper,
                           24'd0,
                           psmt4_raster_nibble_select ? {s2_pack_t4_nibble, 4'd0}
                                                      : {4'd0, s2_pack_t4_nibble}};
        s2_emit_be      = 4'b0001;
        s2_emit_mask    = psmt4_raster_nibble_select ? 32'h0000_00F0
                                                     : 32'h0000_000F;
    end else if (s2_psm_is_t8) begin
        // PSMT8 (1 byte/pixel): index in the LOW byte; vram_stub commits
        // exactly that byte at write_addr.
        s2_emit_color64 = {q_upper, 24'd0, s2_pack_t8_index};
        s2_emit_be      = 4'b0001;
    end else if (s2_psm_is_16bit) begin
        // 16-bit pixel in the LOW halfword; the upper halfword is gated off
        // by the byte enables.
        s2_emit_color64 = {q_upper, 16'd0, s2_pack_psm16};
        s2_emit_be      = 4'b0011;
    end else begin
        // PSMCT32 (4 bytes/pixel). s2_tri_color64 / s2_sprite_color64 already
        // carry the texel substitution and fall back to the untextured colour
        // when texturing is inactive, so the TME=0 paths are byte-identical.
        s2_emit_color64 = (s2_mode_q == RM_TRI) ? s2_tri_color64
                                                : s2_sprite_color64;
        s2_emit_be      = 4'b1111;
    end
end
|
||
|
||
// Linear framebuffer byte address of the S2-stage pixel.
//   s2_fbp_bytes         = ras_fbp << 11
//   s2_pixels_per_row    = ras_fbw << 6
//   s2_pixel_index       = y * pixels_per_row + x
//   s2_pixel_byte_offset = index >> 1 for PSMT4 (Ch106: two pixels per byte),
//                          index << ras_bpp_shift for every other PSM
logic [31:0] s2_fbp_bytes;
logic [31:0] s2_pixels_per_row;
logic [31:0] s2_pixel_index;
logic [31:0] s2_pixel_byte_offset;
logic [31:0] s2_fb_addr;
logic [31:0] s2_fb_addr_linear;

assign s2_fbp_bytes      = {23'd0, ras_fbp} << 11;
assign s2_pixels_per_row = {26'd0, ras_fbw} << 6;
assign s2_pixel_index    = ({20'd0, s2_y_q} * s2_pixels_per_row)
                         + {20'd0, s2_x_q};
assign s2_pixel_byte_offset = s2_psm_is_t4 ? (s2_pixel_index >> 1)
                                           : (s2_pixel_index << ras_bpp_shift);
assign s2_fb_addr_linear = s2_fbp_bytes + s2_pixel_byte_offset;

// Ch302 — perspective framebuffer address. The perspective DECAL primitive
// is PSMCT32 and non-swizzled (PSMCT32_SWIZZLE=0 for the demo), so it reuses
// the linear formula above, evaluated on the +5-aligned perspective coords
// (persp_x5/persp_y5). ras_bpp_shift is expected to be 2 here (4 bytes/pixel):
//     addr = FBP_bytes + ((y * pixels_per_row + x) << ras_bpp_shift)
logic [31:0] persp_pixel_index;
logic [31:0] persp_fb_addr;

assign persp_pixel_index = ({20'd0, persp_y5} * s2_pixels_per_row)
                         + {20'd0, persp_x5};
assign persp_fb_addr     = s2_fbp_bytes + (persp_pixel_index << ras_bpp_shift);
|
||
|
||
// Optional swizzled raster addressing. Each swizzle module is purely
// combinational and is fed the raster's per-cycle (s2_x_q, s2_y_q) plus the
// active framebuffer snapshot (ras_fbp, ras_fbw). A module's absolute byte
// address is used only when its *_SWIZZLE parameter is set AND ras_psm
// selects that format; otherwise the legacy linear address is kept.

// Ch122 — PSMCT32 (ras_psm == 0x00).
logic        s2_psm_is_ct32;
logic [31:0] s2_fb_addr_swizzled;
assign s2_psm_is_ct32 = (ras_psm == 6'h00);
gs_swizzle_psmct32_stub u_raster_swizzle (
    .x   (s2_x_q[11:0]),
    .y   (s2_y_q[11:0]),
    .fbp (ras_fbp),
    .fbw (ras_fbw),
    .addr(s2_fb_addr_swizzled)
);

// Ch128 — PSMCT16 (ras_psm == 0x02). gs_swizzle_psmct16_stub bakes in its
// own page (64x64), block grid (4 cols x 8 rows of 16x8 blocks) and the
// within-block columnTable16. PSMCT16_SWIZZLE=0 (default) keeps the legacy
// linear PSMCT16 raster path used by Ch95 and the other PSMCT16 raster TBs.
logic        s2_psm_is_ct16;
logic [31:0] s2_fb_addr_swizzled16;
assign s2_psm_is_ct16 = (ras_psm == 6'h02);
gs_swizzle_psmct16_stub u_raster_swizzle16 (
    .x   (s2_x_q[11:0]),
    .y   (s2_y_q[11:0]),
    .fbp (ras_fbp),
    .fbw (ras_fbw),
    .addr(s2_fb_addr_swizzled16)
);

// Ch134 — PSMT8 (ras_psm == 0x13). PSMT8 pages are 128 px wide (vs 64 px
// for the direct-colour PSMs), so the swizzle internally uses
// bw_pg = ras_fbw>>1; PCSX2 asserts FBW must be even for PSMT8.
// PSMT8_SWIZZLE=0 (default) keeps the legacy linear PSMT8 raster path used
// by Ch105 and every TB that emits PSMT8 raster (Ch107 etc.).
logic        s2_psm_is_psmt8;
logic [31:0] s2_fb_addr_swizzled8;
assign s2_psm_is_psmt8 = (ras_psm == 6'h13);
gs_swizzle_psmt8_stub u_raster_swizzle8 (
    .x   (s2_x_q[11:0]),
    .y   (s2_y_q[11:0]),
    .fbp (ras_fbp),
    .fbw (ras_fbw),
    .addr(s2_fb_addr_swizzled8)
);

// Ch140 — PSMT4 (ras_psm == 0x14). gs_swizzle_psmt4_stub returns both an
// absolute byte address and a nibble_hi selector. PSMT4 pages are 128x128 px
// and the swizzle internally uses bw_pg = ras_fbw>>1; PCSX2 asserts FBW must
// be even for PSMT4. With the gate on, nibble_hi replaces s2_pixel_index[0]
// as the key for the Ch106 nibble read-modify-write (write_be=4'b0001,
// write_mask 0x0F or 0xF0) in the PSMT4 branch of the emit mux.
// PSMT4_SWIZZLE=0 (default) keeps the legacy linear PSMT4 raster path.
logic        s2_psm_is_t4_swizzle;
logic [31:0] s2_fb_addr_swizzled4;
logic        swizzle4_raster_nibble_hi;
assign s2_psm_is_t4_swizzle = (ras_psm == 6'h14);
gs_swizzle_psmt4_stub u_raster_swizzle4 (
    .x        (s2_x_q[11:0]),
    .y        (s2_y_q[11:0]),
    .fbp      (ras_fbp),
    .fbw      (ras_fbw),
    .addr     (s2_fb_addr_swizzled4),
    .nibble_hi(swizzle4_raster_nibble_hi)
);

// Per-PSM dispatch. The four parameters are independent; with all of them
// at 0 every PSM stays on the linear path.
assign s2_fb_addr =
    (PSMCT32_SWIZZLE && s2_psm_is_ct32)       ? s2_fb_addr_swizzled   :
    (PSMCT16_SWIZZLE && s2_psm_is_ct16)       ? s2_fb_addr_swizzled16 :
    (PSMT8_SWIZZLE   && s2_psm_is_psmt8)      ? s2_fb_addr_swizzled8  :
    (PSMT4_SWIZZLE   && s2_psm_is_t4_swizzle) ? s2_fb_addr_swizzled4  :
                                                s2_fb_addr_linear;
|
||
|
||
// Brick 2b — Z-buffer byte address of the S2-stage pixel.
//
// PSMZ32 is 4 bytes/pixel and shares the colour framebuffer's row stride
// (FBW), so the address has the same linear shape as s2_fb_addr_linear with
// the Z base (ZBP * 2048) in place of FBP * 2048:
//     z_addr = ZBP*2048 + (y * FBW*64 + x) * 4
// The Z buffer is stored as plain 32-bit words in the legacy LINEAR layout
// (no Z swizzle in v1; the swizzled raster gates apply to the COLOR fb only).
// s2_pixel_index (= y*FBW*64 + x) is reused from the colour path.
logic [31:0] s2_zbp_bytes;
logic [31:0] s2_z_addr;

assign s2_zbp_bytes = {23'd0, ras_zbp} << 11;                  // ZBP * 2048
assign s2_z_addr    = s2_zbp_bytes + (s2_pixel_index << 2);    // 4 bytes/pixel
|
||
|
||
// Brick 2a — destination-framebuffer read port drive (combinational, from
// the S2 stage). For an inside pixel of an alpha-blended primitive the
// pixel's write address is presented as the dest-read address. The wrapper
// routes fb_rd_addr onto vram read2 and returns the dest pixel, either
// combinationally or one cycle later per FB_RD_REGISTERED. For opaque
// primitives fb_rd_en stays low, so ABE=0 reads nothing (byte-identical).
//
// Ch344 — s2_blend_pixel covers FLAT alpha only (!ras_tme). ras_abe is also
// set for textured-alpha sprites, but those use their own half-rate,
// beat-gated dest read (s2_tex_abe_dest); letting this streaming path fire
// for them would collide with the texel read on read2.
logic s2_blend_pixel;
assign s2_blend_pixel = s2_valid_q && s2_inside_q && ras_abe && !ras_tme;

// Enable/address selection:
//  - tile_active   : Ch304 tile-local mode reads the dest colour ON-CHIP
//                    (tile_color RAM), so the VRAM read enable is held LOW.
//                    tile_active is constant 0 at TILE_LOCAL=0.
//  - ras_combined  : COMBINED probe; the read fires only on beat 2
//                    (comb_fb_req) at the held-pixel address comb_fb_addr.
//                    s2_blend_pixel is 0 during a combined scan (ras_abe is 0
//                    for the combined prim); ras_combined is constant 0 at
//                    param=0.
//  - otherwise     : flat-alpha streaming read, or the Ch344 textured-alpha
//                    dest read (frozen beat).
assign fb_rd_en =
    tile_active  ? 1'b0        :
    ras_combined ? comb_fb_req :
                   (s2_blend_pixel || s2_tex_abe_dest);

assign fb_rd_addr = ras_combined ? comb_fb_addr : s2_fb_addr;
|
||
|
||
// Brick 2b — stored-Z read port drive (combinational, from the S2 stage).
// For an inside pixel of a Z-tested primitive the Z-buffer address is
// presented on read2. The wrapper routes z_rd_addr onto read2 (mutually
// exclusive with texel / dest-fb reads by feature) and returns the stored Z,
// combinationally or one cycle later per Z_RD_REGISTERED. For non-Z
// primitives z_rd_en stays low (byte-identical).
logic s2_ztest_pixel;
assign s2_ztest_pixel = s2_valid_q && s2_inside_q && ras_zte;

// Enable/address selection:
//  - tile_active   : Ch304 tile-local mode reads stored Z ON-CHIP (tile_z
//                    RAM), so the VRAM Z-read enable is held LOW.
//                    tile_active is constant 0 at TILE_LOCAL=0.
//  - ras_combined  : COMBINED probe; the read fires only on beat 0
//                    (comb_z_req) at the held-pixel address comb_z_addr.
//                    s2_ztest_pixel is 0 during a combined scan (ras_zte is 0
//                    for the combined prim); ras_combined is constant 0 at
//                    param=0.
//  - otherwise     : legacy per-pixel Z-test read.
assign z_rd_en =
    tile_active  ? 1'b0       :
    ras_combined ? comb_z_req :
                   s2_ztest_pixel;

assign z_rd_addr = ras_combined ? comb_z_addr : s2_z_addr;
|
||
|
||
// Scan advance is row-major inside the bounding box. The scan position is at
// the last pixel when both flags are high, i.e. at (x_max, y_max).
logic ras_at_x_end, ras_at_y_end;
assign ras_at_x_end = (ras_cur_x == ras_x_max);
assign ras_at_y_end = (ras_cur_y == ras_y_max);

// Raster mode of the primitive type currently being closed.
raster_mode_e new_mode;
assign new_mode = classify_prim_for_raster(prim_type);
|
||
|
||
// Brick 2a / Ch344 — alpha-blend eligibility of the CLOSING primitive.
//
// All three signals are declared, and new_tex_abe_active is assigned, BEFORE
// new_abe_active consumes it, so no identifier is referenced ahead of its
// declaration (tools stricter than iverilog may reject that).
//
// close_tme_effective : the primitive textures "effectively" — prim_tme=1
//     and its TEX0 PSM is PSMCT32 (0x00) or, Ch296, PSMT8 (0x13), matching
//     s1_tex_active. Keeping this PSM set in lockstep with s1_tex_active
//     preserves the read2 mutual-exclusion invariant (a texel fetch never
//     collides with the flat alpha dest read).
//
// new_tex_abe_active (Ch344) : TEXTURED source-over alpha SPRITE. Requires
//     SPRITE_TEX_ALPHA, PRIM.ABE, RM_SPRITE, a PSMCT32 destination FRAME,
//     prim_tme, a PSMCT32 TEX0 (or, Ch347, a PSMT8 CLUT TEX0 when
//     SPRITE_TEX_ALPHA_CLUT is set) and the source-over ALPHA_1 config.
//     With SPRITE_TEX_ALPHA=0 it is constant 0 and a textured ABE sprite
//     falls back to opaque DECAL. The legacy flat dest read (s2_blend_pixel)
//     is gated to !ras_tme, so it never fires for these primitives; the
//     textured path uses its own half-rate, beat-gated dest read.
//
// new_abe_active : value that sets ras_abe. It is the FLAT alpha sprite
//     term (PRIM.ABE, RM_SPRITE, PSMCT32 destination FRAME, NOT
//     textured-effective, source-over ALPHA_1) OR new_tex_abe_active.
//     Reusing the already-packed ras_abe means no FIFO attr-word change;
//     ras_tme distinguishes the two cases at the consumer
//     (ras_tex_abe = ras_abe && ras_tme). With SPRITE_TEX_ALPHA=0 this
//     collapses to the pre-Ch344 flat-only term.
logic close_tme_effective;
logic new_tex_abe_active;
logic new_abe_active;

assign close_tme_effective = prim_tme && ((tex0_psm == 6'h00) ||
                                          (tex0_psm == 6'h13));

assign new_tex_abe_active = SPRITE_TEX_ALPHA
                         && prim_abe
                         && (new_mode == RM_SPRITE)
                         && (frame_1_q[29:24] == 6'h00)      // PSMCT32 dest
                         && prim_tme
                         && ((tex0_psm == 6'h00)             // PSMCT32 texel (Ch344), or
                             || (SPRITE_TEX_ALPHA_CLUT
                                 && (tex0_psm == 6'h13)))    // PSMT8 CLUT texel (Ch347)
                         && alpha_is_source_over;

assign new_abe_active = (prim_abe
                         && (new_mode == RM_SPRITE)
                         && (frame_1_q[29:24] == 6'h00)      // PSMCT32 dest
                         && !close_tme_effective
                         && alpha_is_source_over)
                     || new_tex_abe_active;
|
||
|
||
// Brick 2b — Z-test eligibility of the CLOSING primitive as a FLAT PSMCT32
// SPRITE. Every condition must hold; otherwise there is no depth test and
// the path is byte-identical to pre-2b:
//   * z_format_ok            — TEST_1.ZTE = 1 and ZBUF PSM = PSMZ32
//   * RM_SPRITE              — SPRITE primitive
//   * frame PSM == 0x00      — PSMCT32 destination FRAME
//   * !close_tme_effective   — not textured-effective; a textured Z-tested
//                              sprite would need TWO read2 reads per pixel
//                              (out of scope: falls back to no-Z opaque DECAL)
//   * !new_abe_active        — not alpha-blended, which would likewise be a
//                              second read2 consumer (out of scope: falls
//                              back to plain alpha blend without depth test)
// The exclusions leave the stored-Z read as the SOLE read2 consumer for the
// primitive, so read2 arbitration stays collision-free.
logic new_zte_active;
assign new_zte_active = z_format_ok
                     && (new_mode == RM_SPRITE)
                     && (frame_1_q[29:24] == 6'h00)   // PSMCT32 dest
                     && !close_tme_effective
                     && !new_abe_active;
|
||
|
||
// Brick 3 — Z-test eligibility of the CLOSING primitive as a PSMCT32
// TRIANGLE. Same requirements as the sprite Z path (ZTE=1, PSMZ32, PSMCT32
// destination) but for RM_TRI. In v1 a TRI is never blended (those FIFO
// fields are forced 0 on the TRI push), and the fragment Z is the per-pixel
// INTERPOLATED s2_interp_z rather than the sprite path's flat ras_z_value.
//
// Textured-triangle rung — a TME TRIANGLE drives the texel-fetch read2, so
// it is excluded here with the same !close_tme_effective guard the sprite Z
// path uses. A textured triangle therefore has ZTE forced off (architect
// scope: ZTE=0/ABE=0 for this rung), which keeps the stored-Z read the SOLE
// read2 consumer and arbitration collision-free.
logic new_tri_zte_active;
assign new_tri_zte_active = z_format_ok
                         && (new_mode == RM_TRI)
                         && (frame_1_q[29:24] == 6'h00)   // PSMCT32 dest
                         && !close_tme_effective;
|
||
|
||
// COMBINED probe — is the CLOSING primitive the textured + alpha +
// depth-tested TRIANGLE owned by the combined FSM? All of these must hold:
//   1. COMBINED_TAZ is enabled (the signal is constant 0 otherwise).
//   2. The primitive is a TRIANGLE (RM_TRI).
//   3. Destination FRAME is PSMCT32 (0x00), or — Ch313 — PSMCT16 (0x02) when
//      TILE_COLOR_PSMCT16 is set. In that mode the tile colour RAM, the
//      dest-colour reads and the flush emit are already PSMCT16 elsewhere in
//      this file, so this gate was the only thing still forcing a PSMCT32
//      FRAME; accepting PSMCT16 here makes render/flush/scanout consistently
//      PSMCT16 with a half-size framebuffer. Keying on TILE_COLOR_PSMCT16
//      also rules out mixed combinations. At TILE_COLOR_PSMCT16=0 the extra
//      term is constant 0 (byte-identical).
//   4. TEXTURED: prim_tme with a PSMCT32 (0x00) DECAL texture, or — Ch314 —
//      with PALETTE_BILINEAR a PSMT8 (0x13) / PSMT4 (0x14) indexed DECAL (the
//      shared sampler CLUTs each tap). Constant 0 at default.
//   5. Alpha-blended: prim_abe with source-over ALPHA_1, or — Ch309 — any
//      ALPHA_1 (A/B/C/D/FIX) config when ALPHA_MODES_ENABLE is set, since the
//      generic blender handles it. At default (0) only source-over qualifies.
//   6. Depth-tested with PSMZ32: z_format_ok already folds in TEST_1.ZTE.
// When set, the slot is pushed with fifo_combined=1 and the legacy
// per-feature FIFO fields (fifo_abe/fifo_zte) stay 0, so ONLY the combined
// FSM acts on the primitive.
logic close_combined;
assign close_combined =
       COMBINED_TAZ
    && (new_mode == RM_TRI)
    && (   (frame_1_q[29:24] == 6'h00)                             // PSMCT32 dest
        || (TILE_COLOR_PSMCT16 && (frame_1_q[29:24] == 6'h02)))    // Ch313: PSMCT16 dest
    && prim_tme
    && (   (tex0_psm == 6'h00)                                     // PSMCT32 DECAL
        || (PALETTE_BILINEAR && (   (tex0_psm == 6'h13)            // Ch314: PSMT8
                                 || (tex0_psm == 6'h14))))         // Ch314: PSMT4
    && prim_abe
    && (alpha_is_source_over || ALPHA_MODES_ENABLE)                // Ch309
    && z_format_ok;                                                // ZTE + PSMZ32
|
||
|
||
// Ch87 — bounding box of the closing primitive, computed combinationally at
// the close cycle for either primitive mode. The same values serve the
// direct-load case (FSM IDLE, FIFO empty) and the enqueue case (something
// already in flight).
logic [11:0] enq_x_min, enq_x_max;
logic [11:0] enq_y_min, enq_y_max;

always_comb begin
    if (new_mode == RM_TRI) begin
        // Triangle: min/max over the three vertices, per axis.
        enq_x_min = umin12(umin12(tri_v0x, tri_v1x), tri_v2x);
        enq_y_min = umin12(umin12(tri_v0y, tri_v1y), tri_v2y);
        enq_x_max = umax12(umax12(tri_v0x, tri_v1x), tri_v2x);
        enq_y_max = umax12(umax12(tri_v0y, tri_v1y), tri_v2y);
    end else begin
        // Any other mode: min/max over the two sprite corner values.
        enq_x_min = umin12(sp_v0_x_next, sp_v1_x_next);
        enq_y_min = umin12(sp_v0_y_next, sp_v1_y_next);
        enq_x_max = umax12(sp_v0_x_next, sp_v1_x_next);
        enq_y_max = umax12(sp_v0_y_next, sp_v1_y_next);
    end
end
|
||
|
||
// Push / pop event predicates (evaluated combinationally).
//   Push : fires on prim_complete_now for an eligible, non-degenerate
//          primitive. A degenerate TRI sets raster_degenerate and is not
//          enqueued. A push attempted while the FIFO is full sets
//          raster_overflow and drops the primitive — UNLESS a pop fires in
//          the same cycle, in which case the freed slot takes the new entry
//          and the count stays at full (Ch87 Codex audit-medium fix).
//   Pop  : fires when the FSM needs the next entry, either at IDLE or at
//          end-of-scan, while the FIFO has work.
logic prim_eligible, prim_filtered_degen;
logic push_attempt, push_drop, push_ok;
|
||
|
||
// Ch88 — pipeline-aware end-of-scan and drain-done.
//   ras_at_end_of_s0 : S0 is producing the last bounding-box coordinate (the
//                      corner pixel); after this cycle the FSM moves to
//                      R_DRAIN.
//   ras_drain_done   : in R_DRAIN with every downstream stage empty. Only
//                      then may the next FIFO entry pop into ras_*_q;
//                      otherwise in-flight pixels would see clobbered
//                      context.
logic ras_at_end_of_s0;
logic ras_drain_done;
assign ras_at_end_of_s0 = (raster_state == R_SCAN)
                        & ras_at_x_end & ras_at_y_end;

// Ch304 — in tile-local mode a primitive is NOT finished when the combined
// walker reaches R_DRAIN: the on-chip tile still has to be FLUSHed to the
// framebuffer. tile_render_busy holds the drain (and with it the pop, the
// R_IDLE transition and raster_active) until the tile phase is back at
// TP_OFF. tile_active is constant 0 at TILE_LOCAL=0, making this a no-op.
logic tile_render_busy;
assign tile_render_busy = tile_active && (tile_phase_r != TP_OFF);

// Ch337 — scene-level busy. Stays high through inter-batch gaps: the next
// batch's primitives may be waiting in the FIFO (fifo_count != 0) and/or an
// end-of-list flush may be pending (mp_flush_pending) while raster_active
// momentarily drops between grid passes.
assign raster_scene_busy = raster_active
                        || tile_render_busy
                        || (fifo_count != FIFO_CNT_W'(0))
                        || mp_flush_pending;

// Drain-done terms beyond the Ch88 S1/S2 valid bits:
//   Brick 2a : the S3 blend stage must be empty, or a popped primitive could
//              clobber ras_abe / ras_color under a blended pixel still in S3.
//   Brick 2b : the Z S3 stage must be empty, any queued Z write must have
//              fired, and the half-rate beat must be back on the primary
//              beat (z_beat_q low, so the pop lands with z_advance high);
//              otherwise ras_zte / ras_z_value could be clobbered under an
//              in-flight Z pixel or the queued Z write dropped.
//   Ch302    : a perspective primitive emits from a +5-deep pipe BEHIND
//              S1/S2/S3 (gs_persp_uv 4-cycle divide + 1-cycle texel read);
//              that pipe must be empty too, or ras_persp / ras_s0_base / ...
//              would be clobbered and raster_active could drop while
//              perspective pixels are still emitting. persp_pipe_busy is
//              constant 0 at PERSPECTIVE_CORRECT=0.
//   Ch304    : the tile render/flush must have completed.
assign ras_drain_done = (raster_state == R_DRAIN)
                      & !s1_valid_q
                      & !s2_valid_q
                      & !s3_valid_q
                      & !s3_zvalid_q
                      & !zw_pending_q
                      & !z_beat_q
                      & !persp_pipe_busy
                      & !tile_render_busy;
|
||
|
||
// Ch305 MULTIPRIM — "pipeline empty" qualifier for the INTER-PRIMITIVE advance
// inside a tile render.
//
// Same condition as ras_drain_done minus the !tile_render_busy term: mid-grid
// tile_render_busy is always 1, which would pin ras_drain_done at 0.
//
// Advancing primitive N -> N+1 on a bare (raster_state == R_DRAIN) would
// clobber ras_* and start the next primitive's tile_z reads while primitive
// N's last color/Z writes are still in the s1/s2/s3 + queued-Z + half-rate-Z
// pipeline, corrupting the composite (broken occlusion / lost writes). This
// signal holds the advance until that pipeline is fully flushed.
//
// Dead at TILE_MULTIPRIM=0 (never read there).
logic comb_pipe_empty;

assign comb_pipe_empty = (raster_state == R_DRAIN)
                      && !( s1_valid_q
                         || s2_valid_q
                         || s3_valid_q
                         || s3_zvalid_q
                         || zw_pending_q
                         || z_beat_q
                         || persp_pipe_busy );
|
||
|
||
// Streaming-pop qualifier for the primitive FIFO.
//
// pop_ok is high when the head slot (fifo_rptr) may be handed to the
// rasterizer. The rasterizer must be in R_IDLE or have finished draining
// (ras_drain_done), and NONE of the following blockers may be active:
//
//   fifo_empty
//       Nothing to pop.
//
//   fifo_grad_pending[fifo_rptr]                                     (Ch295)
//       The head slot still has its affine gradients in flight in the setup
//       engine. SPRITE slots clear grad_pending at push, so they are never
//       held; a TRI slot is released the cycle the engine writes its last
//       gradient. The engine finishes in ~10 cycles, far ahead of the
//       rasterizer's readiness, so this never stalls the steady-state
//       pipeline.
//
//   clut_stall_eff                                                   (Ch296)
//       A VRAM->CLUT load is in flight; blocking the pop keeps the textured
//       scan's texel fetch from colliding with the load on the shared read2
//       port. Gated by the CLUT_STALL parameter so only a top that actually
//       instantiates a CLUT loader (and drives clut_load_busy) pays for /
//       depends on this term; for every other gs_stub instance the parameter
//       is 0 and `clut_load_busy` is don't-care (may float), exactly as
//       before this chapter.
//
//   TILE_LOCAL && TILE_MULTIPRIM                             (Ch305+ MULTIPRIM)
//       In multiprim mode the batch grid path OWNS the FIFO, so the normal
//       streaming pop is permanently disabled. TILE_MULTIPRIM is a
//       compile-time 0 by default, so this term folds away (byte-identical).
logic clut_stall_eff;
logic pop_ok;

assign clut_stall_eff = CLUT_STALL ? clut_load_busy : 1'b0;

assign pop_ok = ((raster_state == R_IDLE) || ras_drain_done)
             && !( fifo_empty
                || fifo_grad_pending[fifo_rptr]
                || clut_stall_eff
                || (TILE_LOCAL && TILE_MULTIPRIM) );
|
||
|
||
// Push-side qualification of a primitive that completes this cycle.
//
//   prim_eligible       : a primitive completes now and maps to a raster mode
//                         other than RM_NONE.
//   prim_filtered_degen : the eligible primitive is a TRI flagged
//                         tri_degenerate; it is filtered out instead of
//                         being pushed.
//   push_attempt        : eligible and not filtered — wants a FIFO slot.
//   push_drop           : the attempt is lost. Only triggers when the FIFO is
//                         full AND no pop is happening — those primitives are
//                         genuinely beyond the rasterizer's depth.
//   push_ok             : the attempt succeeds. A simultaneous pop frees a
//                         slot, so a full-FIFO push still succeeds in that
//                         cycle.
assign prim_eligible       = (new_mode != RM_NONE) && prim_complete_now;

assign prim_filtered_degen = tri_degenerate && (new_mode == RM_TRI) && prim_eligible;

assign push_attempt        = !prim_filtered_degen && prim_eligible;

assign push_drop           = push_attempt && !pop_ok && fifo_full;

assign push_ok             = push_attempt && !(fifo_full && !pop_ok);
|
||
|
||
// COMBINED probe — step the stalled walker to the next candidate pixel and
// rewind the per-pixel FSM to beat 0 (CB_Z).
//
// Traversal is row-major inside the bounding box, identical to the legacy S0
// walker advance, with the R_SCAN -> R_DRAIN transition taken at the bbox
// corner (ras_at_end_of_s0):
//   * at the corner        : enter R_DRAIN, coordinates untouched;
//   * at the end of a row  : x returns to ras_x_min, y moves down one row;
//   * anywhere else        : x moves right by one.
//
// Called from the combined FSM. Everything is a non-blocking assignment to
// module registers, and there is no `return;` (iverilog-12 compatibility).
task automatic comb_advance_walker;
    // The per-pixel FSM restarts at beat 0 on every advance.
    comb_state_r <= CB_Z;

    if (ras_at_end_of_s0)
        raster_state <= R_DRAIN;
    else if (ras_at_x_end) begin
        ras_cur_y <= ras_cur_y + 12'd1;
        ras_cur_x <= ras_x_min;
    end else
        ras_cur_x <= ras_cur_x + 12'd1;
endtask
|
||
|
||
// Ch305+ MULTIPRIM — load every ras_* working field for FIFO slot `slot`,
// with the walker bounding box clipped to the corners supplied by the caller.
//
//   slot           FIFO slot whose register-held fields are copied.
//   cx_lo / cx_hi  x extent of the (already tile-clipped) walker bbox.
//   cy_lo / cy_hi  y extent of the (already tile-clipped) walker bbox.
//
// Mirrors the streaming pop block's field set EXACTLY, but reads `slot`
// instead of fifo_rptr, does NOT advance fifo_rptr, and does NOT touch the
// prim_* copy or the tile/grid counters. Meant to be called only from
// `if (TILE_MULTIPRIM)` branches, so the default build never invokes it.
// All assignments are non-blocking; there is no `return;`.
//
// Where each field comes from (Ch328 1b/1c):
//   * Packed INPUT attributes are unpacked from attr_rd_q with an LHS concat
//     in EXACT pack order. attr_rd_q is the registered output of the single
//     attr_ram read port, so the caller must have issued the read for this
//     slot on the previous cycle — `slot` does not index this data here.
//     attr_ram is fresh-at-push, so there is no read-after-write hazard.
//   * HOT fields (mode / tri_active / combined) and the 20 engine-written
//     gradient OUTPUTS stay in fifo_* registers and are indexed by `slot`.
//   * mp_dump_* sink the gradient-engine-only inputs that the rasterizer
//     does not consume.
task automatic mp_load_prim (input [FIFO_PTR_W-1:0] slot,
                             input [11:0] cx_lo, input [11:0] cx_hi,
                             input [11:0] cy_lo, input [11:0] cy_hi);
    // ---- HOT per-slot registers ------------------------------------------
    ras_mode       <= fifo_mode       [slot];
    ras_tri_active <= fifo_tri_active [slot];
    ras_combined_r <= fifo_combined   [slot];

    // ---- Walker bbox clipped against the current tile (from the caller);
    //      the walker starts at its top-left corner, per-pixel FSM at beat 0.
    ras_x_min    <= cx_lo;
    ras_y_min    <= cy_lo;
    ras_x_max    <= cx_hi;
    ras_y_max    <= cy_hi;
    ras_cur_x    <= cx_lo;
    ras_cur_y    <= cy_lo;
    comb_state_r <= CB_Z;

    // ---- Packed INPUT attributes: field order must match the pack concat --
    {
        // geometry + edge setup
        ras_v0_x, ras_v0_y, ras_v1_x, ras_v1_y, ras_v2_x, ras_v2_y,
        ras_bias, ras_sa_q,
        // colors
        ras_color, ras_c0_q, ras_c1_q, ras_c2_q,
        // framebuffer descriptor
        ras_fbp, ras_fbw, ras_psm, ras_bpp_shift,
        // texture descriptor + sprite DDA
        ras_tme, ras_u0, ras_v0t, ras_u1, ras_v1t,
        ras_tex_base, ras_tbw, ras_tpsm,
        ras_wms, ras_wmt, ras_tw, ras_th,
        ras_du_dx_q, ras_dv_dy_q,
        // blend / filter
        ras_abe, ras_filter_lin,
        ras_alpha_a, ras_alpha_b, ras_alpha_c, ras_alpha_d, ras_alpha_fix,
        // depth
        ras_zte, ras_ztst, ras_zmsk, ras_zbp, ras_z_value, ras_z0,
        // affine U/V bases (+ engine-only packed v1/v2 texcoords)
        ras_u0_base, ras_v0_base,
        mp_dump_u1v, mp_dump_u2v,
        // perspective S/T/Q bases (+ engine-only v1/v2 S/T/Q and Z)
        ras_persp, ras_s0_base, ras_t0_base, ras_q0_base,
        mp_dump_stq1, mp_dump_stq2, mp_dump_q1, mp_dump_q2,
        mp_dump_v1z, mp_dump_v2z
    } <= attr_rd_q;   // Ch328 1c: single read port (slot was issued last cycle)

    // ---- 20 ENGINE-WRITTEN gradient OUTPUTS — sideband registers
    //      (NOT in attr_ram). d/dx terms first, then d/dy.
    ras_dr_dx   <= fifo_dr_dx   [slot];
    ras_dg_dx   <= fifo_dg_dx   [slot];
    ras_db_dx   <= fifo_db_dx   [slot];
    ras_da_dx   <= fifo_da_dx   [slot];
    ras_dz_dx   <= fifo_dz_dx   [slot];
    ras_du_dx_t <= fifo_du_dx_t [slot];
    ras_dv_dx_t <= fifo_dv_dx_t [slot];
    ras_ds_dx   <= fifo_ds_dx   [slot];
    ras_dt_dx   <= fifo_dt_dx   [slot];
    ras_dq_dx   <= fifo_dq_dx   [slot];

    ras_dr_dy   <= fifo_dr_dy   [slot];
    ras_dg_dy   <= fifo_dg_dy   [slot];
    ras_db_dy   <= fifo_db_dy   [slot];
    ras_da_dy   <= fifo_da_dy   [slot];
    ras_dz_dy   <= fifo_dz_dy   [slot];
    ras_du_dy_t <= fifo_du_dy_t [slot];
    ras_dv_dy_t <= fifo_dv_dy_t [slot];
    ras_ds_dy   <= fifo_ds_dy   [slot];
    ras_dt_dy   <= fifo_dt_dy   [slot];
    ras_dq_dy   <= fifo_dq_dy   [slot];
endtask
|
||
|
||
// Ch328 (1b) — pushed-slot tracker for the attr_ram equivalence checker.
//
// The push itself writes attr_ram[fifo_wptr] DIRECTLY in the same cycle
// (attr_word_next), so nothing is packed here. This register pair only
// records, one cycle after the fact, that a push was accepted (pack_v) and
// which slot it targeted (pack_slot). The checker's chk_* registers lag
// pack_* by one more cycle, by which point both attr_ram[slot] and
// fifo_*[slot] are settled.
//
// Reset is synchronous and active-low; only pack_v is cleared — pack_slot
// carries no reset value.
always_ff @(posedge clk) begin : attr_push_tracker
    if (!rst_n)
        pack_v <= 1'b0;
    else begin
        pack_slot <= fifo_wptr;   // slot being written by this cycle's push
        pack_v    <= push_ok;     // qualifies pack_slot
    end
end
|
||
|
||
`ifndef SYNTHESIS
// ---------------------------------------------------------------------------
// Ch328 (1a) EQUIVALENCE CHECKER (Codex gate) — simulation only, no synth cost.
//
// INDEPENDENT decode of the packed attribute word: the stored attr_ram word
// is sliced back into width-typed k_* fields with an LHS concat (NOT the pack
// concat function) and every field is compared, with case equality, against
// the live fifo_* register of the same slot. Runs across every regression
// scene. Any width or ordering drift between the pack concat and this decode
// shows up as a mismatch.
//
// Timing: chk_v / chk_slot follow pack_v / pack_slot by one cycle, so the
// comparison happens once attr_ram[slot] is settled.
// ---------------------------------------------------------------------------
logic                          chk_v;
logic [$clog2(FIFO_DEPTH)-1:0] chk_slot;

// Decoded fields — widths and signedness define the unpack layout.
logic        [11:0] k_v0x, k_v0y, k_v1x, k_v1y, k_v2x, k_v2y;
logic         [2:0] k_bias;
logic signed [31:0] k_sa;
logic        [63:0] k_color, k_c0, k_c1, k_c2;
logic         [8:0] k_fbp;
logic         [5:0] k_fbw, k_psm;
logic         [1:0] k_bpp;
logic               k_tme;
logic        [10:0] k_u0, k_v0, k_u1, k_v1;
logic        [31:0] k_texb;
logic        [13:0] k_tbw;
logic         [5:0] k_tpsm;
logic         [1:0] k_wms, k_wmt;
logic         [3:0] k_tw, k_th;
logic signed [31:0] k_dudx, k_dvdy;
logic               k_abe, k_flin;
logic         [1:0] k_aa, k_ab, k_ac, k_ad;
logic         [7:0] k_afix;
logic               k_zte;
logic         [1:0] k_ztst;
logic               k_zmsk;
logic         [8:0] k_zbp;
logic        [31:0] k_zval;
logic        [31:0] k_z0;
logic        [10:0] k_u0b, k_v0b;
logic        [31:0] k_u1v, k_u2v;
logic               k_persp;
logic        [23:0] k_s0b, k_t0b, k_q0b;
logic        [63:0] k_stq1, k_stq2;
logic        [23:0] k_q1, k_q2;
logic        [31:0] k_v1z, k_v2z;

always_ff @(posedge clk) begin : attr_equiv_check
    if (!rst_n)
        chk_v <= 1'b0;
    else begin
        chk_slot <= pack_slot;
        chk_v    <= pack_v;

        if (chk_v) begin
            // Blocking decode: the k_* values are consumed immediately below.
            {
                k_v0x, k_v0y, k_v1x, k_v1y, k_v2x, k_v2y,
                k_bias, k_sa,
                k_color, k_c0, k_c1, k_c2,
                k_fbp, k_fbw, k_psm, k_bpp,
                k_tme, k_u0, k_v0, k_u1, k_v1,
                k_texb, k_tbw, k_tpsm,
                k_wms, k_wmt, k_tw, k_th,
                k_dudx, k_dvdy,
                k_abe, k_flin,
                k_aa, k_ab, k_ac, k_ad, k_afix,
                k_zte, k_ztst, k_zmsk, k_zbp, k_zval, k_z0,
                k_u0b, k_v0b, k_u1v, k_u2v,
                k_persp, k_s0b, k_t0b, k_q0b,
                k_stq1, k_stq2, k_q1, k_q2,
                k_v1z, k_v2z
            } = attr_ram[chk_slot];

            // Report unless every decoded field matches its register twin.
            if (!(   k_v0x   === fifo_v0x        [chk_slot]
                  && k_v0y   === fifo_v0y        [chk_slot]
                  && k_v1x   === fifo_v1x        [chk_slot]
                  && k_v1y   === fifo_v1y        [chk_slot]
                  && k_v2x   === fifo_v2x        [chk_slot]
                  && k_v2y   === fifo_v2y        [chk_slot]
                  && k_bias  === fifo_bias       [chk_slot]
                  && k_sa    === fifo_sa         [chk_slot]
                  && k_color === fifo_color      [chk_slot]
                  && k_c0    === fifo_c0         [chk_slot]
                  && k_c1    === fifo_c1         [chk_slot]
                  && k_c2    === fifo_c2         [chk_slot]
                  && k_fbp   === fifo_fbp        [chk_slot]
                  && k_fbw   === fifo_fbw        [chk_slot]
                  && k_psm   === fifo_psm        [chk_slot]
                  && k_bpp   === fifo_bpp_shift  [chk_slot]
                  && k_tme   === fifo_tme        [chk_slot]
                  && k_u0    === fifo_u0         [chk_slot]
                  && k_v0    === fifo_v0         [chk_slot]
                  && k_u1    === fifo_u1         [chk_slot]
                  && k_v1    === fifo_v1         [chk_slot]
                  && k_texb  === fifo_tex_base   [chk_slot]
                  && k_tbw   === fifo_tbw        [chk_slot]
                  && k_tpsm  === fifo_tpsm       [chk_slot]
                  && k_wms   === fifo_wms        [chk_slot]
                  && k_wmt   === fifo_wmt        [chk_slot]
                  && k_tw    === fifo_tw         [chk_slot]
                  && k_th    === fifo_th         [chk_slot]
                  && k_dudx  === fifo_du_dx      [chk_slot]
                  && k_dvdy  === fifo_dv_dy      [chk_slot]
                  && k_abe   === fifo_abe        [chk_slot]
                  && k_flin  === fifo_filter_lin [chk_slot]
                  && k_aa    === fifo_alpha_a    [chk_slot]
                  && k_ab    === fifo_alpha_b    [chk_slot]
                  && k_ac    === fifo_alpha_c    [chk_slot]
                  && k_ad    === fifo_alpha_d    [chk_slot]
                  && k_afix  === fifo_alpha_fix  [chk_slot]
                  && k_zte   === fifo_zte        [chk_slot]
                  && k_ztst  === fifo_ztst       [chk_slot]
                  && k_zmsk  === fifo_zmsk       [chk_slot]
                  && k_zbp   === fifo_zbp        [chk_slot]
                  && k_zval  === fifo_zval       [chk_slot]
                  && k_z0    === fifo_z0         [chk_slot]
                  && k_u0b   === fifo_u0_base    [chk_slot]
                  && k_v0b   === fifo_v0_base    [chk_slot]
                  && k_u1v   === fifo_u1v        [chk_slot]
                  && k_u2v   === fifo_u2v        [chk_slot]
                  && k_persp === fifo_persp      [chk_slot]
                  && k_s0b   === fifo_s0_base    [chk_slot]
                  && k_t0b   === fifo_t0_base    [chk_slot]
                  && k_q0b   === fifo_q0_base    [chk_slot]
                  && k_stq1  === fifo_stq1       [chk_slot]
                  && k_stq2  === fifo_stq2       [chk_slot]
                  && k_q1    === fifo_q1         [chk_slot]
                  && k_q2    === fifo_q2         [chk_slot]
                  && k_v1z   === fifo_v1z        [chk_slot]
                  && k_v2z   === fifo_v2z        [chk_slot]))
                $error("ch328 attr_ram pack/unpack MISMATCH at slot %0d (field order/width drift)", chk_slot);
        end
    end
end
`endif
|
||
|
||
always_ff @(posedge clk) begin
|
||
if (!rst_n) begin
|
||
raster_state <= R_IDLE;
|
||
ras_mode <= RM_NONE;
|
||
ras_v0_x <= 12'd0; ras_v0_y <= 12'd0;
|
||
ras_v1_x <= 12'd0; ras_v1_y <= 12'd0;
|
||
ras_v2_x <= 12'd0; ras_v2_y <= 12'd0;
|
||
ras_x_min <= 12'd0; ras_x_max <= 12'd0;
|
||
ras_y_min <= 12'd0; ras_y_max <= 12'd0;
|
||
ras_cur_x <= 12'd0; ras_cur_y <= 12'd0;
|
||
ras_color <= 64'd0;
|
||
ras_fbp <= 9'd0; ras_fbw <= 6'd0; ras_psm <= 6'd0;
|
||
ras_bpp_shift <= 2'd2;
|
||
ras_tme <= 1'b0;
|
||
ras_u0 <= 11'd0; ras_v0t <= 11'd0; ras_u1 <= 11'd0; ras_v1t <= 11'd0;
|
||
ras_tex_base <= 32'd0; ras_tbw <= 14'd0; ras_tpsm <= 6'd0;
|
||
ras_wms <= 2'd0; ras_wmt <= 2'd0; ras_tw <= 4'd0; ras_th <= 4'd0;
|
||
ras_du_dx_q <= 32'sd0; ras_dv_dy_q <= 32'sd0;
|
||
ras_abe <= 1'b0;
|
||
ras_filter_lin <= 1'b0;
|
||
ras_alpha_a <= 2'd0;
|
||
ras_alpha_b <= 2'd0;
|
||
ras_alpha_c <= 2'd0;
|
||
ras_alpha_d <= 2'd0;
|
||
ras_alpha_fix <= 8'd0;
|
||
ras_zte <= 1'b0;
|
||
ras_ztst <= 2'd0;
|
||
ras_zmsk <= 1'b0;
|
||
ras_zbp <= 9'd0;
|
||
ras_z_value <= 32'd0;
|
||
ras_bias <= 3'b000;
|
||
ras_c0_q <= 64'd0;
|
||
ras_c1_q <= 64'd0;
|
||
ras_c2_q <= 64'd0;
|
||
ras_sa_q <= 32'sd0;
|
||
// Brick 3 — affine interpolation context reset.
|
||
ras_tri_active <= 1'b0;
|
||
// COMBINED probe — reset combined-mode state.
|
||
ras_combined_r <= 1'b0;
|
||
comb_state_r <= CB_Z;
|
||
comb_ztest_pass_r <= 1'b0;
|
||
comb_zstored_r <= 32'd0;
|
||
comb_cs_r <= 32'd0;
|
||
comb_as_r <= 8'd0;
|
||
// Ch304 TILE-LOCAL — reset tile-phase FSM + sweep/flush state.
|
||
tile_phase_r <= TP_OFF;
|
||
tile_sweep_r <= 9'd0;
|
||
flush_emit_q <= 1'b0;
|
||
flush_idx_q <= 8'd0;
|
||
// Ch305 TILE GRID — reset grid counters + preserved prim bbox.
|
||
tile_col_r <= 12'd0;
|
||
tile_row_r <= 12'd0;
|
||
spill_valid <= SPILL_FORCE_VALID ? '1 : '0; // Ch323 — no valid backing until spilled (test hook forces valid)
|
||
// Ch305+ MULTIPRIM — reset batch state.
|
||
prim_idx_r <= '0;
|
||
mp_rd_state <= RDS_IDLE; // Ch328 1c
|
||
prim_count_r <= '0;
|
||
grid_base_rptr <= '0;
|
||
// BIN BUFFER — clear bin counters / sweep counters (dead at
|
||
// BIN_BUFFER_ENABLE=0). bin_prim contents need no reset (only read
|
||
// through bin_n, which is zeroed). bin_n is the live state.
|
||
bin_t <= '0;
|
||
bin_p <= '0;
|
||
bin_slot_r <= '0;
|
||
for (int bt = 0; bt < NTILES; bt = bt + 1)
|
||
bin_n[bt] <= '0;
|
||
prim_x_min <= 12'd0; prim_x_max <= 12'd0;
|
||
prim_y_min <= 12'd0; prim_y_max <= 12'd0;
|
||
flush_ox_q <= 12'd0; flush_oy_q <= 12'd0;
|
||
ras_z0 <= 32'd0;
|
||
ras_dr_dx <= 32'sd0; ras_dr_dy <= 32'sd0;
|
||
ras_dg_dx <= 32'sd0; ras_dg_dy <= 32'sd0;
|
||
ras_db_dx <= 32'sd0; ras_db_dy <= 32'sd0;
|
||
ras_da_dx <= 32'sd0; ras_da_dy <= 32'sd0;
|
||
ras_dz_dx <= 32'sd0; ras_dz_dy <= 32'sd0;
|
||
// Textured-triangle rung — affine U/V context reset.
|
||
ras_u0_base <= 11'd0; ras_v0_base <= 11'd0;
|
||
ras_du_dx_t <= 32'sd0; ras_du_dy_t <= 32'sd0;
|
||
ras_dv_dx_t <= 32'sd0; ras_dv_dy_t <= 32'sd0;
|
||
// Ch301 perspective — S/T/Q affine context reset.
|
||
ras_persp <= 1'b0;
|
||
ras_s0_base <= 24'd0; ras_t0_base <= 24'd0; ras_q0_base <= 24'd0;
|
||
ras_ds_dx <= 32'sd0; ras_ds_dy <= 32'sd0;
|
||
ras_dt_dx <= 32'sd0; ras_dt_dy <= 32'sd0;
|
||
ras_dq_dx <= 32'sd0; ras_dq_dy <= 32'sd0;
|
||
// Ch88 pipeline regs
|
||
s1_x_q <= 12'd0; s1_y_q <= 12'd0;
|
||
s1_valid_q <= 1'b0;
|
||
s2_x_q <= 12'd0; s2_y_q <= 12'd0;
|
||
s2_valid_q <= 1'b0;
|
||
s2_inside_q <= 1'b0;
|
||
s2_L0_q <= 32'sd0; s2_L1_q <= 32'sd0; s2_L2_q <= 32'sd0;
|
||
s2_mode_q <= RM_NONE;
|
||
s2_tex_active_q <= 1'b0;
|
||
s2_tex_color_q <= 32'd0;
|
||
// Brick 2a S3 blend stage
|
||
s3_valid_q <= 1'b0;
|
||
s3_x_q <= 12'd0; s3_y_q <= 12'd0;
|
||
s3_fb_addr_q <= 32'd0;
|
||
s3_be_q <= 4'b1111;
|
||
s3_mask_q <= 32'hFFFF_FFFF;
|
||
s3_psm_q <= 6'd0;
|
||
s3_cs_q <= 64'd0;
|
||
s3_dest_q <= 32'd0;
|
||
s3_as_q <= 8'd0;
|
||
// Brick 2b Z-test S3 stage + beat/queue
|
||
s3_zvalid_q <= 1'b0;
|
||
s3_zx_q <= 12'd0; s3_zy_q <= 12'd0;
|
||
s3_zfb_addr_q<= 32'd0;
|
||
s3_z_addr_q <= 32'd0;
|
||
s3_zcolor_q <= 64'd0;
|
||
s3_zval_q <= 32'd0;
|
||
s3_ztst_q <= 2'd0;
|
||
s3_zmsk_q <= 1'b0;
|
||
s3_zstored_q <= 32'd0;
|
||
z_beat_q <= 1'b0;
|
||
ta_beat_q <= 1'b0; // Ch344
|
||
ta_dest_q <= 32'd0; // Ch344
|
||
ta_tex_q <= 32'd0; // Ch344
|
||
ta_tex_q1 <= 32'd0; // Ch344
|
||
zw_pending_q <= 1'b0;
|
||
zw_addr_q <= 32'd0;
|
||
zw_val_q <= 32'd0;
|
||
raster_pixel_emit <= 1'b0;
|
||
// Ch323 — Z-flush stream + pipeline latches reset.
|
||
z_flush_emit_o <= 1'b0;
|
||
z_flush_addr_o <= 32'd0;
|
||
z_flush_data_o <= 32'd0;
|
||
tile_color_flush_emit_o <= 1'b0;
|
||
tile_color_flush_addr_o <= 32'd0;
|
||
tile_color_flush_data_o <= 32'd0;
|
||
z_flush_emit_q <= 1'b0;
|
||
z_flush_idx_q <= 8'd0;
|
||
z_flush_ox_q <= 12'd0;
|
||
z_flush_oy_q <= 12'd0;
|
||
// Ch323 — tile RELOAD control reset (tile_reload_raddr_o is a continuous assign).
|
||
reload_start_o <= 1'b0;
|
||
reload_wait <= 1'b0;
|
||
reload_wr_we <= 1'b0;
|
||
reload_wr_addr <= 8'd0;
|
||
raster_pixel_emit_count <= 32'd0;
|
||
raster_pixel_x_q <= 12'd0;
|
||
raster_pixel_y_q <= 12'd0;
|
||
raster_pixel_color_q <= 64'd0;
|
||
raster_pixel_fb_addr_q <= 32'd0;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'd0;
|
||
raster_active <= 1'b0;
|
||
raster_overflow <= 1'b0;
|
||
raster_overflow_count_r <= 16'd0; // Ch315
|
||
bin_occ_max_r <= '0; // Ch315
|
||
bin_overflow_r <= 1'b0; // Ch315
|
||
tile_refused_count <= 16'd0; // Ch329
|
||
raster_degenerate <= 1'b0;
|
||
fifo_wptr <= '0;
|
||
fifo_rptr <= '0;
|
||
fifo_count <= '0;
|
||
// Ch295 — gradient setup engine reset.
|
||
grad_busy <= 1'b0;
|
||
grad_prefetching <= 1'b0; // Ch328 (1b) step C
|
||
grad_prefetch2 <= 1'b0; // Ch328 1c
|
||
grad_word_q <= '0;
|
||
grad_step <= 5'd0;
|
||
grad_settle <= 5'd0;
|
||
grad_result_q <= 32'sd0;
|
||
grad_writing <= 1'b0;
|
||
div_start <= 1'b0;
|
||
grad_slot <= '0;
|
||
grad_det_q <= 32'sd0;
|
||
for (int g = 0; g < GRAD_STEPS; g = g + 1) grad_num_q[g] <= 56'sd0;
|
||
for (int i = 0; i < FIFO_DEPTH; i = i + 1) begin
|
||
fifo_v1z [i] <= 32'd0;
|
||
fifo_v2z [i] <= 32'd0;
|
||
end
|
||
for (int i = 0; i < FIFO_DEPTH; i = i + 1) begin
|
||
fifo_grad_pending [i] <= 1'b0;
|
||
fifo_mode [i] <= RM_NONE;
|
||
fifo_v0x [i] <= 12'd0; fifo_v0y[i] <= 12'd0;
|
||
fifo_v1x [i] <= 12'd0; fifo_v1y[i] <= 12'd0;
|
||
fifo_v2x [i] <= 12'd0; fifo_v2y[i] <= 12'd0;
|
||
fifo_x_min [i] <= 12'd0; fifo_x_max[i] <= 12'd0;
|
||
fifo_y_min [i] <= 12'd0; fifo_y_max[i] <= 12'd0;
|
||
fifo_bias [i] <= 3'd0;
|
||
fifo_sa [i] <= 32'sd0;
|
||
fifo_color [i] <= 64'd0;
|
||
fifo_c0 [i] <= 64'd0;
|
||
fifo_c1 [i] <= 64'd0;
|
||
fifo_c2 [i] <= 64'd0;
|
||
fifo_fbp [i] <= 9'd0;
|
||
fifo_fbw [i] <= 6'd0;
|
||
fifo_psm [i] <= 6'd0;
|
||
fifo_bpp_shift [i] <= 2'd2;
|
||
fifo_tme [i] <= 1'b0;
|
||
fifo_u0 [i] <= 11'd0;
|
||
fifo_v0 [i] <= 11'd0;
|
||
fifo_u1 [i] <= 11'd0;
|
||
fifo_v1 [i] <= 11'd0;
|
||
fifo_tex_base [i] <= 32'd0;
|
||
fifo_tbw [i] <= 14'd0;
|
||
fifo_tpsm [i] <= 6'd0;
|
||
fifo_wms [i] <= 2'd0;
|
||
fifo_wmt [i] <= 2'd0;
|
||
fifo_tw [i] <= 4'd0;
|
||
fifo_th [i] <= 4'd0;
|
||
fifo_abe [i] <= 1'b0;
|
||
fifo_filter_lin [i] <= 1'b0;
|
||
fifo_alpha_a [i] <= 2'd0;
|
||
fifo_alpha_b [i] <= 2'd0;
|
||
fifo_alpha_c [i] <= 2'd0;
|
||
fifo_alpha_d [i] <= 2'd0;
|
||
fifo_alpha_fix [i] <= 8'd0;
|
||
fifo_zte [i] <= 1'b0;
|
||
fifo_ztst [i] <= 2'd0;
|
||
fifo_zmsk [i] <= 1'b0;
|
||
fifo_zbp [i] <= 9'd0;
|
||
fifo_zval [i] <= 32'd0;
|
||
// Brick 3 — affine interpolation context init.
|
||
fifo_tri_active [i] <= 1'b0;
|
||
fifo_combined [i] <= 1'b0; // COMBINED probe
|
||
fifo_z0 [i] <= 32'd0;
|
||
fifo_dr_dx [i] <= 32'sd0; fifo_dr_dy [i] <= 32'sd0;
|
||
fifo_dg_dx [i] <= 32'sd0; fifo_dg_dy [i] <= 32'sd0;
|
||
fifo_db_dx [i] <= 32'sd0; fifo_db_dy [i] <= 32'sd0;
|
||
fifo_da_dx [i] <= 32'sd0; fifo_da_dy [i] <= 32'sd0;
|
||
fifo_dz_dx [i] <= 32'sd0; fifo_dz_dy [i] <= 32'sd0;
|
||
// Textured-triangle rung — affine U/V context init.
|
||
fifo_u0_base [i] <= 11'd0;
|
||
fifo_v0_base [i] <= 11'd0;
|
||
fifo_u1v [i] <= 32'd0;
|
||
fifo_u2v [i] <= 32'd0;
|
||
fifo_du_dx_t [i] <= 32'sd0; fifo_du_dy_t [i] <= 32'sd0;
|
||
fifo_dv_dx_t [i] <= 32'sd0; fifo_dv_dy_t [i] <= 32'sd0;
|
||
// Ch301 perspective — S/T/Q context init.
|
||
fifo_persp [i] <= 1'b0;
|
||
fifo_s0_base [i] <= 24'd0;
|
||
fifo_t0_base [i] <= 24'd0;
|
||
fifo_q0_base [i] <= 24'd0;
|
||
fifo_stq1 [i] <= 64'd0;
|
||
fifo_stq2 [i] <= 64'd0;
|
||
fifo_q1 [i] <= 24'd0;
|
||
fifo_q2 [i] <= 24'd0;
|
||
fifo_ds_dx [i] <= 32'sd0; fifo_ds_dy [i] <= 32'sd0;
|
||
fifo_dt_dx [i] <= 32'sd0; fifo_dt_dy [i] <= 32'sd0;
|
||
fifo_dq_dx [i] <= 32'sd0; fifo_dq_dy [i] <= 32'sd0;
|
||
end
|
||
end else begin
|
||
raster_pixel_emit <= 1'b0;
|
||
z_flush_emit_o <= 1'b0; // Ch323 — Z-flush output strobe (pulse), default each cycle
|
||
tile_color_flush_emit_o <= 1'b0; // Ch323 — dedicated color-flush spill strobe, default each cycle
|
||
reload_start_o <= 1'b0; // Ch323 — reload-arm pulse, default each cycle
|
||
reload_wr_we <= 1'b0; // Ch323 — reload tile-RAM write window, re-asserted in TP_RELOAD
|
||
|
||
// ------------------------------------------------------
|
||
// Push side: enqueue new primitive context (or drop /
|
||
// degenerate-flag).
|
||
// ------------------------------------------------------
|
||
if (prim_filtered_degen) begin
|
||
raster_degenerate <= 1'b1;
|
||
end
|
||
if (push_drop) begin
|
||
raster_overflow <= 1'b1;
|
||
raster_overflow_count_r <= raster_overflow_count_r + 16'd1; // Ch315 — count drops
|
||
end
|
||
if (push_ok) begin
|
||
fifo_mode [fifo_wptr] <= new_mode;
|
||
fifo_color [fifo_wptr] <= rgbaq_q;
|
||
fifo_fbp [fifo_wptr] <= frame_1_q[8:0];
|
||
fifo_fbw [fifo_wptr] <= frame_1_q[21:16];
|
||
fifo_psm [fifo_wptr] <= frame_1_q[29:24];
|
||
fifo_bpp_shift [fifo_wptr] <= bpp_shift;
|
||
fifo_x_min [fifo_wptr] <= enq_x_min;
|
||
fifo_x_max [fifo_wptr] <= enq_x_max;
|
||
fifo_y_min [fifo_wptr] <= enq_y_min;
|
||
fifo_y_max [fifo_wptr] <= enq_y_max;
|
||
// Ch328 (1b) — SAME-CYCLE attr_ram write (valid push+1, like the register FIFO).
|
||
attr_ram [fifo_wptr] <= attr_word_next;
|
||
if (new_mode == RM_TRI) begin
|
||
fifo_v0x [fifo_wptr] <= tri_v0x;
|
||
fifo_v0y [fifo_wptr] <= tri_v0y;
|
||
fifo_v1x [fifo_wptr] <= tri_v1x;
|
||
fifo_v1y [fifo_wptr] <= tri_v1y;
|
||
fifo_v2x [fifo_wptr] <= tri_v2x;
|
||
fifo_v2y [fifo_wptr] <= tri_v2y;
|
||
fifo_bias [fifo_wptr] <= {~tl2_init, ~tl1_init, ~tl0_init};
|
||
fifo_sa [fifo_wptr] <= tri_swap_ccw ? -tri_sa_calc : tri_sa_calc;
|
||
fifo_c0 [fifo_wptr] <= fan_sat_path ? c_pivot_q : c_prev_q;
|
||
fifo_c1 [fifo_wptr] <= tri_swap_ccw ? rgbaq_q : c_curr_q;
|
||
fifo_c2 [fifo_wptr] <= tri_swap_ccw ? c_curr_q : rgbaq_q;
|
||
// Textured-triangle rung — capture the texture
|
||
// descriptor for a TRI+TME primitive. fifo_tme gates
|
||
// the textured path; only PSMCT32 (0x00) DECAL is
|
||
// sampled (the per-pixel s1_tex_active gate enforces
|
||
// this, mirroring the SPRITE path). For an untextured
|
||
// TRI close_tme_effective is 0 → these collapse to the
|
||
// legacy all-zero values and the Gouraud path is
|
||
// byte-identical. The SPRITE-DDA fields (fifo_u0/v0/u1/
|
||
// v1, fifo_du_dx/dv_dy) stay 0 for a TRI: the triangle
|
||
// uses AFFINE U/V (fifo_u0_base + fifo_du_dx_t etc.),
|
||
// not the 2-endpoint linear DDA.
|
||
fifo_tme [fifo_wptr] <= close_tme_effective;
|
||
fifo_u0 [fifo_wptr] <= 11'd0;
|
||
fifo_v0 [fifo_wptr] <= 11'd0;
|
||
fifo_u1 [fifo_wptr] <= 11'd0;
|
||
fifo_v1 [fifo_wptr] <= 11'd0;
|
||
fifo_tex_base [fifo_wptr] <= tex_base_next;
|
||
fifo_tbw [fifo_wptr] <= {8'd0, tex0_tbw};
|
||
fifo_tpsm [fifo_wptr] <= tex0_psm;
|
||
// Ch294 — wrap-mode snapshot (CLAMP_1 + TEX0 dims).
|
||
fifo_wms [fifo_wptr] <= clamp_wms;
|
||
fifo_wmt [fifo_wptr] <= clamp_wmt;
|
||
fifo_tw [fifo_wptr] <= tex0_tw;
|
||
fifo_th [fifo_wptr] <= tex0_th;
|
||
fifo_du_dx [fifo_wptr] <= 32'sd0;
|
||
fifo_dv_dy [fifo_wptr] <= 32'sd0;
|
||
// Textured-triangle rung — affine U/V context. Base at
|
||
// post-swap v0; the 4 gradients are solved by the
|
||
// shared divider engine (steps 10..13) and pre-cleared
|
||
// here. The packed {V,U} for v1/v2 lets the decoupled
|
||
// engine recompute the U/V numerators at schedule time.
|
||
fifo_u0_base [fifo_wptr] <= tri_uv0_u;
|
||
fifo_v0_base [fifo_wptr] <= tri_uv0_v;
|
||
fifo_u1v [fifo_wptr] <= {5'd0, tri_uv1_v, 5'd0, tri_uv1_u};
|
||
fifo_u2v [fifo_wptr] <= {5'd0, tri_uv2_v, 5'd0, tri_uv2_u};
|
||
fifo_du_dx_t [fifo_wptr] <= 32'sd0;
|
||
fifo_du_dy_t [fifo_wptr] <= 32'sd0;
|
||
fifo_dv_dx_t [fifo_wptr] <= 32'sd0;
|
||
fifo_dv_dy_t [fifo_wptr] <= 32'sd0;
|
||
// Brick 2a — TRI alpha-blend not implemented in v1.
|
||
fifo_abe [fifo_wptr] <= 1'b0;
|
||
// Ch310 — snapshot TEX1_1.MMAG bilinear filter (mirrors fifo_abe).
|
||
fifo_filter_lin [fifo_wptr] <= tex1_mmag;
|
||
// Brick 2c — snapshot ALPHA_1 selectors (mirrors fifo_abe).
|
||
fifo_alpha_a [fifo_wptr] <= alpha_a;
|
||
fifo_alpha_b [fifo_wptr] <= alpha_b;
|
||
fifo_alpha_c [fifo_wptr] <= alpha_c;
|
||
fifo_alpha_d [fifo_wptr] <= alpha_d;
|
||
fifo_alpha_fix [fifo_wptr] <= alpha_fix;
|
||
// Brick 3 — TRI Z-test feeds the Brick-2b Z stage
|
||
// with the per-pixel INTERPOLATED fragment Z. The
|
||
// ztst/zmsk/zbp context is shared with the sprite Z
|
||
// path; only the fragment-Z source differs (interp vs
|
||
// flat). The fb_color path stays the affine Gouraud
|
||
// color (s2_emit_color64 from s2_interp_color).
|
||
fifo_zte [fifo_wptr] <= new_tri_zte_active;
|
||
fifo_ztst [fifo_wptr] <= test_ztst;
|
||
fifo_zmsk [fifo_wptr] <= zbuf_zmsk;
|
||
fifo_zbp [fifo_wptr] <= zbuf_zbp;
|
||
fifo_zval [fifo_wptr] <= 32'd0; // unused for TRI (interp)
|
||
// Brick 3 / Ch295 — affine interpolation context.
|
||
// The 10 Q16.16 gradients are NO LONGER solved here
|
||
// with 10 inlined dividers. Instead the pre-shifted
|
||
// numerators (pure mul/sub/shift) and the common
|
||
// divisor are captured into the time-shared setup
|
||
// engine (grad_* regs, handled below); a SINGLE
|
||
// divider sequences the 10 quotients into the
|
||
// fifo_d*_* fields over the following ~10 cycles.
|
||
// The gradient fields are pre-cleared and the slot is
|
||
// flagged grad_pending so pop_ok holds it until the
|
||
// engine finishes. Result is bit-identical to the
|
||
// old per-call (num<<<16)/det+truncate.
|
||
fifo_tri_active [fifo_wptr] <= 1'b1;
|
||
// COMBINED probe — flag the slot if this TRI is the
|
||
// textured+alpha+depth primitive. fifo_tme/fifo_zte
|
||
// already collapse correctly for it (fifo_zte=0 here
|
||
// because new_tri_zte_active requires !close_tme_
|
||
// effective, and the combined prim IS textured); the
|
||
// combined FSM owns the reads/writes. Constant 0 at
|
||
// COMBINED_TAZ=0.
|
||
fifo_combined [fifo_wptr] <= close_combined;
|
||
fifo_z0 [fifo_wptr] <= tri_v0z;
|
||
fifo_dr_dx [fifo_wptr] <= 32'sd0; fifo_dr_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dg_dx [fifo_wptr] <= 32'sd0; fifo_dg_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_db_dx [fifo_wptr] <= 32'sd0; fifo_db_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_da_dx [fifo_wptr] <= 32'sd0; fifo_da_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dz_dx [fifo_wptr] <= 32'sd0; fifo_dz_dy [fifo_wptr] <= 32'sd0;
|
||
// Brick-3 fix — store v1.z / v2.z so the decoupled
|
||
// gradient engine can recompute the Z numerators for
|
||
// this slot when it schedules it (v0.z is in fifo_z0).
|
||
fifo_v1z [fifo_wptr] <= tri_v1z;
|
||
fifo_v2z [fifo_wptr] <= tri_v2z;
|
||
// Flag the slot as awaiting gradients. The engine
|
||
// (below) picks it up on a later idle cycle — the push
|
||
// no longer touches the engine state, so a back-to-back
|
||
// TRI push cannot clobber an in-flight solve.
|
||
fifo_grad_pending [fifo_wptr] <= 1'b1;
|
||
// Ch301 perspective — capture S/T/Q context for this
|
||
// TRI. fifo_persp gates the perspective emit path: set
|
||
// only when a TME triangle's texcoords came via ST
|
||
// (saw_st_q) AND the texture is actually sampled
|
||
// (close_tme_effective). saw_st_q is constant 0 when
|
||
// PERSPECTIVE_CORRECT=0, so fifo_persp is 0 and the
|
||
// perspective gradient steps 14..19 are inert (GRAD_STEPS
|
||
// stays 14). S/T are post-swap v0 bases; v1/v2 packed
|
||
// {16'd0,T,S}; Q from the RGBAQ.Q field of each vertex.
|
||
fifo_persp [fifo_wptr] <= saw_st_q && close_tme_effective;
|
||
fifo_s0_base [fifo_wptr] <= tri_s0;
|
||
fifo_t0_base [fifo_wptr] <= tri_t0;
|
||
fifo_q0_base [fifo_wptr] <= tri_q0;
|
||
fifo_stq1 [fifo_wptr] <= {16'd0, tri_t1, tri_s1};
|
||
fifo_stq2 [fifo_wptr] <= {16'd0, tri_t2, tri_s2};
|
||
fifo_q1 [fifo_wptr] <= tri_q1;
|
||
fifo_q2 [fifo_wptr] <= tri_q2;
|
||
fifo_ds_dx [fifo_wptr] <= 32'sd0; fifo_ds_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dt_dx [fifo_wptr] <= 32'sd0; fifo_dt_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dq_dx [fifo_wptr] <= 32'sd0; fifo_dq_dy [fifo_wptr] <= 32'sd0;
|
||
end else begin // RM_SPRITE
|
||
fifo_v0x [fifo_wptr] <= sp_v0_x_next;
|
||
fifo_v0y [fifo_wptr] <= sp_v0_y_next;
|
||
fifo_v1x [fifo_wptr] <= sp_v1_x_next;
|
||
fifo_v1y [fifo_wptr] <= sp_v1_y_next;
|
||
fifo_v2x [fifo_wptr] <= 12'd0;
|
||
fifo_v2y [fifo_wptr] <= 12'd0;
|
||
fifo_bias[fifo_wptr] <= 3'b000;
|
||
fifo_sa [fifo_wptr] <= 32'sd0;
|
||
fifo_c0 [fifo_wptr] <= 64'd0;
|
||
fifo_c1 [fifo_wptr] <= 64'd0;
|
||
fifo_c2 [fifo_wptr] <= 64'd0;
|
||
// Brick 1 — capture SPRITE texture context. Only
|
||
// PSMCT32 (0x00) actually textures in v1; other
|
||
// PSMs fall back to flat below even with TME=1.
|
||
fifo_tme [fifo_wptr] <= prim_tme;
|
||
fifo_u0 [fifo_wptr] <= sp_u0_next;
|
||
fifo_v0 [fifo_wptr] <= sp_v0t_next;
|
||
fifo_u1 [fifo_wptr] <= sp_u1_next;
|
||
fifo_v1 [fifo_wptr] <= sp_v1t_next;
|
||
fifo_tex_base [fifo_wptr] <= tex_base_next;
|
||
fifo_tbw [fifo_wptr] <= {8'd0, tex0_tbw};
|
||
fifo_tpsm [fifo_wptr] <= tex0_psm;
|
||
// Ch294 — wrap-mode snapshot (CLAMP_1 + TEX0 dims).
|
||
fifo_wms [fifo_wptr] <= clamp_wms;
|
||
fifo_wmt [fifo_wptr] <= clamp_wmt;
|
||
fifo_tw [fifo_wptr] <= tex0_tw;
|
||
fifo_th [fifo_wptr] <= tex0_th;
|
||
fifo_du_dx [fifo_wptr] <= du_dx_next;
|
||
fifo_dv_dy [fifo_wptr] <= dv_dy_next;
|
||
// Brick 2a — capture alpha-blend-active for this SPRITE.
|
||
fifo_abe [fifo_wptr] <= new_abe_active;
|
||
// Ch310 — snapshot TEX1_1.MMAG bilinear filter (mirrors fifo_abe).
|
||
fifo_filter_lin [fifo_wptr] <= tex1_mmag;
|
||
// Brick 2c — snapshot ALPHA_1 selectors (mirrors fifo_abe).
|
||
fifo_alpha_a [fifo_wptr] <= alpha_a;
|
||
fifo_alpha_b [fifo_wptr] <= alpha_b;
|
||
fifo_alpha_c [fifo_wptr] <= alpha_c;
|
||
fifo_alpha_d [fifo_wptr] <= alpha_d;
|
||
fifo_alpha_fix [fifo_wptr] <= alpha_fix;
|
||
// Brick 2b — capture Z-test context for this SPRITE.
|
||
// close_z is the flat fragment Z (XYZ2 bits[63:32]).
|
||
fifo_zte [fifo_wptr] <= new_zte_active;
|
||
fifo_ztst [fifo_wptr] <= test_ztst;
|
||
fifo_zmsk [fifo_wptr] <= zbuf_zmsk;
|
||
fifo_zbp [fifo_wptr] <= zbuf_zbp;
|
||
fifo_zval [fifo_wptr] <= close_z;
|
||
// Brick 3 — SPRITE uses flat color/Z, no affine ctx.
|
||
fifo_tri_active [fifo_wptr] <= 1'b0;
|
||
fifo_combined [fifo_wptr] <= 1'b0; // COMBINED probe — SPRITE never combined
|
||
fifo_z0 [fifo_wptr] <= 32'd0;
|
||
fifo_dr_dx [fifo_wptr] <= 32'sd0; fifo_dr_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dg_dx [fifo_wptr] <= 32'sd0; fifo_dg_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_db_dx [fifo_wptr] <= 32'sd0; fifo_db_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_da_dx [fifo_wptr] <= 32'sd0; fifo_da_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dz_dx [fifo_wptr] <= 32'sd0; fifo_dz_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_v1z [fifo_wptr] <= 32'd0;
|
||
fifo_v2z [fifo_wptr] <= 32'd0;
|
||
// Textured-triangle rung — SPRITE uses the linear DDA
|
||
// UV path (fifo_u0/v0 + fifo_du_dx/dv_dy above), not
|
||
// the affine U/V context; clear it.
|
||
fifo_u0_base [fifo_wptr] <= 11'd0;
|
||
fifo_v0_base [fifo_wptr] <= 11'd0;
|
||
fifo_u1v [fifo_wptr] <= 32'd0;
|
||
fifo_u2v [fifo_wptr] <= 32'd0;
|
||
fifo_du_dx_t [fifo_wptr] <= 32'sd0;
|
||
fifo_du_dy_t [fifo_wptr] <= 32'sd0;
|
||
fifo_dv_dx_t [fifo_wptr] <= 32'sd0;
|
||
fifo_dv_dy_t [fifo_wptr] <= 32'sd0;
|
||
// Ch295 — SPRITE has no affine gradients, so its slot
|
||
// is never gradient-pending (pop can proceed at once).
|
||
fifo_grad_pending [fifo_wptr] <= 1'b0;
|
||
// Ch301 perspective — SPRITE never uses the perspective
|
||
// S/T/Q path; clear it.
|
||
fifo_persp [fifo_wptr] <= 1'b0;
|
||
fifo_s0_base [fifo_wptr] <= 24'd0;
|
||
fifo_t0_base [fifo_wptr] <= 24'd0;
|
||
fifo_q0_base [fifo_wptr] <= 24'd0;
|
||
fifo_stq1 [fifo_wptr] <= 64'd0;
|
||
fifo_stq2 [fifo_wptr] <= 64'd0;
|
||
fifo_q1 [fifo_wptr] <= 24'd0;
|
||
fifo_q2 [fifo_wptr] <= 24'd0;
|
||
fifo_ds_dx [fifo_wptr] <= 32'sd0; fifo_ds_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dt_dx [fifo_wptr] <= 32'sd0; fifo_dt_dy [fifo_wptr] <= 32'sd0;
|
||
fifo_dq_dx [fifo_wptr] <= 32'sd0; fifo_dq_dy [fifo_wptr] <= 32'sd0;
|
||
end
|
||
            // Advance the write pointer by one slot (originally a 1-bit toggle in a 2-entry ring).
|
||
// Ch171 — wrap via +1 mod FIFO_DEPTH (was `~fifo_wptr`
|
||
// which only flips a 1-bit pointer; broke when
|
||
// FIFO_DEPTH grew past 2).
|
||
fifo_wptr <= fifo_wptr + FIFO_PTR_W'(1);
|
||
end
|
||
|
||
// ------------------------------------------------------
|
||
// Brick-3 fix — DECOUPLED gradient setup engine. One divide
|
||
            // per step, sequenced across the GRAD_STEPS (attribute × axis) — 14, or 20 with perspective —
|
||
// numerators of the slot the engine is currently solving.
|
||
// grad_result = grad_quo[31:0] is BIT-IDENTICAL to the
|
||
// pre-fix per-call (num<<<16)/det → [31:0]: same numerators
|
||
// (grad_num_dadx/dady), same divisor (fifo_sa == tri_det_post),
|
||
// same truncation. Writes the result into the matching
|
||
// fifo_d*_* field of grad_slot; on the final step clears the
|
||
// slot's grad_pending flag and goes idle.
|
||
//
|
||
// When idle, the engine scans the FIFO for any slot flagged
|
||
// grad_pending (grad_pending_any) and starts on the lowest
|
||
// such slot, LATCHING that slot's numerators / divisor from
|
||
// the combinational grad_load_* (computed from FIFO storage).
|
||
            // It then runs its GRAD_STEPS steps undisturbed — the push side never
|
||
// touches grad_busy/grad_step/grad_slot/grad_num_q, so a
|
||
// back-to-back TRI push cannot clobber an in-flight solve, and
|
||
// the next pending slot is picked up the cycle after this one
|
||
// finishes. pop_ok holds each slot until its own grad_pending
|
||
// clears, so no slot is ever rasterized with stale gradients.
|
||
if (grad_busy) begin
|
||
// Ch352 (Codex) — at K>1 the SETTLE phase holds grad_step/grad_num_q/grad_det stable for
|
||
// GRAD_DIV_CYCLES while the combinational divide settles, then CAPTURES the settled quotient into
|
||
// grad_result_q (the SINGLE multicycle endpoint -> the SDC exception covers ~32 regs, not the 5120
|
||
// fifo_d* fields that made Place churn). The COMMIT cycle writes grad_wr_data into the selected fifo
|
||
// and advances. At GRAD_DIV_CYCLES==1 (sim) the settle phase folds away (param compare) and COMMIT
|
||
// runs every cycle writing the combinational divide directly — byte-identical to pre-Ch352.
|
||
if (!grad_writing && (GRAD_SEQ_DIVIDER || (GRAD_DIV_CYCLES != 1))) begin
|
||
if (GRAD_SEQ_DIVIDER) begin
|
||
// Ch352 sequential divider: kicked on entry; wait div_done, capture the bit-exact quotient.
|
||
div_start <= 1'b0; // deassert the kick pulse
|
||
if (div_done_w) begin
|
||
grad_result_q <= div_quo[31:0];
|
||
grad_writing <= 1'b1;
|
||
end
|
||
end else if (grad_settle != 5'(GRAD_DIV_CYCLES-1)) begin
|
||
grad_settle <= grad_settle + 5'd1;
|
||
end else begin
|
||
grad_result_q <= grad_quo[31:0]; // settled combinational divide
|
||
grad_settle <= 5'd0;
|
||
grad_writing <= 1'b1;
|
||
end
|
||
end else begin
|
||
grad_writing <= 1'b0; // commit (K=1: every cycle; K>1: the write phase)
|
||
case (grad_step)
|
||
5'd0: fifo_dr_dx [grad_slot] <= grad_wr_data;
|
||
5'd1: fifo_dr_dy [grad_slot] <= grad_wr_data;
|
||
5'd2: fifo_dg_dx [grad_slot] <= grad_wr_data;
|
||
5'd3: fifo_dg_dy [grad_slot] <= grad_wr_data;
|
||
5'd4: fifo_db_dx [grad_slot] <= grad_wr_data;
|
||
5'd5: fifo_db_dy [grad_slot] <= grad_wr_data;
|
||
5'd6: fifo_da_dx [grad_slot] <= grad_wr_data;
|
||
5'd7: fifo_da_dy [grad_slot] <= grad_wr_data;
|
||
5'd8: fifo_dz_dx [grad_slot] <= grad_wr_data;
|
||
5'd9: fifo_dz_dy [grad_slot] <= grad_wr_data;
|
||
5'd10: fifo_du_dx_t [grad_slot] <= grad_wr_data;
|
||
5'd11: fifo_du_dy_t [grad_slot] <= grad_wr_data;
|
||
5'd12: fifo_dv_dx_t [grad_slot] <= grad_wr_data;
|
||
5'd13: fifo_dv_dy_t [grad_slot] <= grad_wr_data;
|
||
// Ch301 perspective — steps 14..19 only reached when
|
||
// GRAD_STEPS==20 (PERSPECTIVE_CORRECT=1). At param=0 the
|
||
// engine terminates at step 13 (GRAD_STEPS-1) below, so
|
||
// these never fire and the fifo_d{s,t,q}_* fields hold
|
||
// their pre-cleared 0 — param=0 byte-identical.
|
||
5'd14: fifo_ds_dx [grad_slot] <= grad_wr_data;
|
||
5'd15: fifo_ds_dy [grad_slot] <= grad_wr_data;
|
||
5'd16: fifo_dt_dx [grad_slot] <= grad_wr_data;
|
||
5'd17: fifo_dt_dy [grad_slot] <= grad_wr_data;
|
||
5'd18: fifo_dq_dx [grad_slot] <= grad_wr_data;
|
||
5'd19: fifo_dq_dy [grad_slot] <= grad_wr_data;
|
||
default: ;
|
||
endcase
|
||
if (grad_step == 5'(GRAD_STEPS-1)) begin // last step
|
||
grad_busy <= 1'b0;
|
||
fifo_grad_pending [grad_slot] <= 1'b0;
|
||
end else begin
|
||
grad_step <= grad_step + 5'd1;
|
||
if (GRAD_SEQ_DIVIDER) div_start <= 1'b1; // kick the next step's sequential divide
|
||
end
|
||
end
|
||
end else if (grad_prefetch2) begin
|
||
// Ch328 1c — stage 3: grad_word_q is settled (latched last cycle from attr_rd_q), so
|
||
// grad_load_num/grad_load_det (combinational off grad_word_q) are valid. Latch the
|
||
// numerators and start the solve next cycle.
|
||
grad_prefetch2 <= 1'b0;
|
||
grad_busy <= 1'b1;
|
||
grad_step <= 5'd0;
|
||
grad_settle <= 5'd0;
|
||
grad_writing <= 1'b0;
|
||
if (GRAD_SEQ_DIVIDER) div_start <= 1'b1; // kick the first step's sequential divide
|
||
grad_det_q <= grad_load_det;
|
||
for (int g = 0; g < GRAD_STEPS; g = g + 1)
|
||
grad_num_q[g] <= grad_load_num[g];
|
||
end else if (grad_prefetching) begin
|
||
// Ch328 1c — stage 2: the single read port issued attr_ram[grad_slot] last cycle, so
|
||
// attr_rd_q is valid now → capture it into grad_word_q.
|
||
grad_prefetching <= 1'b0;
|
||
grad_prefetch2 <= 1'b1;
|
||
grad_word_q <= attr_rd_q;
|
||
end else if (grad_pending_any) begin
|
||
// Ch328 1c — stage 1: idle + work waiting. grad_rd_issue (combinational, above) drives
|
||
// attr_rd_addr = grad_pending_slot THIS cycle; attr_rd_q holds it next cycle. pop_ok
|
||
// holds the slot until grad_pending clears, so the extra latency stays render-neutral.
|
||
grad_prefetching <= 1'b1;
|
||
grad_slot <= grad_pending_slot;
|
||
end
|
||
|
||
// ------------------------------------------------------
|
||
// Pop side: dequeue into ras_*_q when FSM is ready.
|
||
// ------------------------------------------------------
|
||
if (pop_ok) begin
|
||
// Streaming pop stays a SINGLE-cycle register read (Ch328 1c did NOT move it to the
|
||
// M20K port): the full-FIFO push+pop concurrency relies on the slot freeing the SAME
|
||
// cycle pop_ok asserts, which a multi-cycle M20K read would break — and streaming is
|
||
// the shallow, non-capacity path that gains nothing from M20K. M20K is for the
|
||
// multiprim grid (mp+grad), where deep per-tile capacity is the goal.
|
||
// INPUT fields from attr_ram[fifo_rptr]; HOT + 20 gradient OUTPUTS from fifo_*.
|
||
{ ras_v0_x, ras_v0_y, ras_v1_x, ras_v1_y, ras_v2_x, ras_v2_y, ras_bias, ras_sa_q,
|
||
ras_color, ras_c0_q, ras_c1_q, ras_c2_q, ras_fbp, ras_fbw, ras_psm, ras_bpp_shift,
|
||
ras_tme, ras_u0, ras_v0t, ras_u1, ras_v1t, ras_tex_base, ras_tbw, ras_tpsm,
|
||
ras_wms, ras_wmt, ras_tw, ras_th, ras_du_dx_q, ras_dv_dy_q, ras_abe, ras_filter_lin,
|
||
ras_alpha_a, ras_alpha_b, ras_alpha_c, ras_alpha_d, ras_alpha_fix,
|
||
ras_zte, ras_ztst, ras_zmsk, ras_zbp, ras_z_value, ras_z0, ras_u0_base, ras_v0_base,
|
||
pp_dump_u1v, pp_dump_u2v, ras_persp, ras_s0_base, ras_t0_base, ras_q0_base,
|
||
pp_dump_stq1, pp_dump_stq2, pp_dump_q1, pp_dump_q2, pp_dump_v1z, pp_dump_v2z } <= attr_ram[fifo_rptr];
|
||
ras_mode <= fifo_mode [fifo_rptr]; // HOT (register)
|
||
ras_x_min <= fifo_x_min [fifo_rptr];
|
||
ras_x_max <= fifo_x_max [fifo_rptr];
|
||
ras_y_min <= fifo_y_min [fifo_rptr];
|
||
ras_y_max <= fifo_y_max [fifo_rptr];
|
||
ras_cur_x <= fifo_x_min [fifo_rptr];
|
||
ras_cur_y <= fifo_y_min [fifo_rptr];
|
||
prim_x_min <= fifo_x_min [fifo_rptr];
|
||
prim_x_max <= fifo_x_max [fifo_rptr];
|
||
prim_y_min <= fifo_y_min [fifo_rptr];
|
||
prim_y_max <= fifo_y_max [fifo_rptr];
|
||
tile_col_r <= 12'd0;
|
||
tile_row_r <= 12'd0;
|
||
ras_tri_active <= fifo_tri_active [fifo_rptr]; // HOT
|
||
ras_combined_r <= fifo_combined [fifo_rptr]; // HOT
|
||
comb_state_r <= CB_Z;
|
||
// 20 ENGINE-WRITTEN gradient OUTPUTS — sideband registers (NOT in attr_ram).
|
||
ras_dr_dx <= fifo_dr_dx [fifo_rptr]; ras_dr_dy <= fifo_dr_dy [fifo_rptr];
|
||
ras_dg_dx <= fifo_dg_dx [fifo_rptr]; ras_dg_dy <= fifo_dg_dy [fifo_rptr];
|
||
ras_db_dx <= fifo_db_dx [fifo_rptr]; ras_db_dy <= fifo_db_dy [fifo_rptr];
|
||
ras_da_dx <= fifo_da_dx [fifo_rptr]; ras_da_dy <= fifo_da_dy [fifo_rptr];
|
||
ras_dz_dx <= fifo_dz_dx [fifo_rptr]; ras_dz_dy <= fifo_dz_dy [fifo_rptr];
|
||
ras_du_dx_t <= fifo_du_dx_t [fifo_rptr]; ras_du_dy_t <= fifo_du_dy_t [fifo_rptr];
|
||
ras_dv_dx_t <= fifo_dv_dx_t [fifo_rptr]; ras_dv_dy_t <= fifo_dv_dy_t [fifo_rptr];
|
||
ras_ds_dx <= fifo_ds_dx [fifo_rptr]; ras_ds_dy <= fifo_ds_dy [fifo_rptr];
|
||
ras_dt_dx <= fifo_dt_dx [fifo_rptr]; ras_dt_dy <= fifo_dt_dy [fifo_rptr];
|
||
ras_dq_dx <= fifo_dq_dx [fifo_rptr]; ras_dq_dy <= fifo_dq_dy [fifo_rptr];
|
||
// Ch171 — same wrap fix as wptr above.
|
||
fifo_rptr <= fifo_rptr + FIFO_PTR_W'(1);
|
||
raster_state <= R_SCAN;
|
||
raster_active <= 1'b1;
|
||
// Ch304 — TILE-LOCAL: if the popped primitive is the
|
||
// combined T+A+Z triangle (fifo_combined), begin with the
|
||
// CLEAR phase before the combined render runs. Otherwise
|
||
// the tile machine stays OFF. TILE_LOCAL gates the whole
|
||
// block; at TILE_LOCAL=0 tile_phase_r stays TP_OFF and the
|
||
// sweep counter stays 0 (dead logic).
|
||
if (TILE_LOCAL) begin
|
||
if (fifo_combined[fifo_rptr]) begin
|
||
tile_phase_r <= TP_CLEAR;
|
||
tile_sweep_r <= 9'd0;
|
||
end else begin
|
||
tile_phase_r <= TP_OFF;
|
||
end
|
||
end
|
||
end else if (mp_grid_start) begin
|
||
// Ch305+ MULTIPRIM — start a batch grid render. Latch the
|
||
// batch (slot 0 = current fifo_rptr, count = whole FIFO),
|
||
// reset the per-tile primitive index, start the grid at tile
|
||
// (0,0), and enter the CLEAR phase. The per-primitive geometry
|
||
// load happens at CLEAR-done (prim 0) and each RENDER advance.
|
||
// Dead at the default TILE_MULTIPRIM=0 (mp_grid_start const 0).
|
||
grid_base_rptr <= fifo_rptr;
|
||
prim_count_r <= fifo_count;
|
||
prim_idx_r <= '0;
|
||
tile_col_r <= 12'd0;
|
||
tile_row_r <= 12'd0;
|
||
ras_combined_r <= 1'b1;
|
||
// Ch316 — load the batch's FRAME-derived FLUSH params NOW (from the
|
||
// oldest FIFO entry = fifo_rptr) so the per-tile flush address is
|
||
// valid for EVERY tile, including EMPTY tiles that PRECEDE the first
|
||
// non-empty tile (which load no primitive of their own). The flush
|
||
// row stride is flush_y * (ras_fbw<<6); a leading-empty tile would
|
||
// otherwise flush with the RESET ras_fbw=0 → stride 0 → all rows
|
||
// collapse onto row 0 → the tile's screen region keeps the FB-init
|
||
// value (black). All prims in a batch share FRAME, so this is exactly
|
||
// what the per-prim load (mp_load_prim) sets at render — hence
|
||
// byte-identical for any batch whose first tile is non-empty.
|
||
ras_fbp <= fifo_fbp [fifo_rptr];
|
||
ras_fbw <= fifo_fbw [fifo_rptr];
|
||
ras_psm <= fifo_psm [fifo_rptr];
|
||
ras_bpp_shift <= fifo_bpp_shift [fifo_rptr];
|
||
raster_active <= 1'b1;
|
||
raster_state <= R_SCAN;
|
||
tile_sweep_r <= 9'd0;
|
||
if (BIN_BUFFER_ENABLE) begin
|
||
// BIN BUFFER — run the classification sweep BEFORE rendering.
|
||
// Clear all bins, reset the (t,p) sweep counters, enter TP_BIN.
|
||
// When the sweep completes it enters TP_CLEAR of tile (0,0).
|
||
bin_t <= '0;
|
||
bin_p <= '0;
|
||
bin_slot_r <= '0;
|
||
for (int bt = 0; bt < NTILES; bt = bt + 1)
|
||
bin_n[bt] <= '0;
|
||
tile_phase_r <= TP_BIN;
|
||
end else begin
|
||
tile_phase_r <= TP_CLEAR;
|
||
end
|
||
end else if (ras_drain_done) begin
|
||
// Pipeline drained, no more work — go idle.
|
||
raster_state <= R_IDLE;
|
||
raster_active <= 1'b0;
|
||
ras_mode <= RM_NONE;
|
||
end
|
||
|
||
// FIFO occupancy: net change = push_ok - pop_ok.
|
||
case ({push_ok, pop_ok})
|
||
2'b10: fifo_count <= fifo_count + FIFO_CNT_W'(1);
|
||
2'b01: fifo_count <= fifo_count - FIFO_CNT_W'(1);
|
||
default: ; // 00 or 11 (concurrent push+pop) — no net change
|
||
endcase
|
||
|
||
// ------------------------------------------------------
|
||
// Ch88 — pipeline stages
|
||
//
|
||
// S0 (this cycle, while in R_SCAN): drive the pipe with
|
||
// ras_cur_x/y; advance for next cycle. On reaching the
|
||
// bbox corner, transition to R_DRAIN (S0 stops
|
||
// producing valid pixels but S1/S2 still propagate).
|
||
//
|
||
// S1 (one cycle after S0): latch s1_x_q / s1_y_q and
|
||
// s1_valid_q. S1's combinational `s1_pixel_inside` is
|
||
// already evaluated for the next-cycle latch into S2.
|
||
//
|
||
// S2 (two cycles after S0): latch s2_*_q. Emit if
|
||
// s2_valid_q && s2_inside_q.
|
||
// ------------------------------------------------------
|
||
|
||
// Brick 2b — half-rate beat toggle. While a Z-tested scan is
|
||
// in flight, z_beat_q alternates every cycle so each pixel
|
||
// gets a "primary" beat (z_beat=0; pipeline advances, color
|
||
// write) and a "z" beat (z_beat=1; pipeline frozen, Z write).
|
||
// When ras_zte=0, z_advance is always 1 and z_beat_q stays 0,
|
||
// so the pipeline runs full rate — byte-identical to pre-2b.
|
||
if (z_scan_active) z_beat_q <= ~z_beat_q;
|
||
else z_beat_q <= 1'b0;
|
||
|
||
// Ch344 — textured-alpha SPRITE half-rate beat (mirrors z_beat_q). Mutually exclusive
|
||
// with z_scan_active (a textured-alpha sprite has ras_zte=0), so at most one half-rate
|
||
// scan is active and `z_advance && ta_advance` collapses to whichever is in flight.
|
||
if (ta_scan_active) ta_beat_q <= ~ta_beat_q;
|
||
else ta_beat_q <= 1'b0;
|
||
|
||
// Ch344 — capture the dest pixel on the FROZEN beat (ta_beat=1), when fb_rd fired for
|
||
// this S2 pixel (s2_tex_abe_dest). Combinational read2 returns it the same cycle; the
|
||
// blend+emit consumes it on the following primary beat. Mirrors Brick 2b's frozen-beat
|
||
// s3_zstored capture. (Registered read2 would return it 1 cyc later, still before the
|
||
// primary beat — the frozen beat holds fb_rd_addr, so a single capture point is correct.)
|
||
// Texel capture: UNCONDITIONAL on every frozen beat (a texel request was issued on the
|
||
// preceding primary beat, so s1_tex_color is freshly valid here) + 1-deep delay for S2 align.
|
||
if (ta_scan_active && ta_beat_q) begin
|
||
ta_tex_q <= s1_tex_color;
|
||
ta_tex_q1 <= ta_tex_q;
|
||
end
|
||
// Dest capture (combinational-read fallback; registered uses fb_rd_data live on the emit beat).
|
||
if (ta_scan_active && ta_beat_q && s2_valid_q && s2_inside_q && ras_tex_abe)
|
||
ta_dest_q <= fb_rd_data;
|
||
|
||
// Pipeline advance (S0 walker + S0->S1->S2->S3 latches) is
|
||
            // GATED by z_advance && ta_advance. On a frozen (z or textured-alpha) beat all pipeline regs
|
||
// hold and only the queued Z write fires (below).
|
||
if (z_advance && ta_advance) begin // Ch344: freeze on the textured-alpha frozen beat too
|
||
// S0 → S1 latch.
|
||
// COMBINED probe: while a combined scan owns the walker,
|
||
// the standard pixel pipeline is kept EMPTY (s1_valid_q
|
||
// forced 0) so no legacy emit / S3 capture fires for the
|
||
// combined primitive — the combined FSM (below) drives all
|
||
// reads/writes. !ras_combined is constant 1 at param=0, so
|
||
// this is byte-identical to the legacy latch then.
|
||
s1_x_q <= ras_cur_x;
|
||
s1_y_q <= ras_cur_y;
|
||
s1_valid_q <= (raster_state == R_SCAN) && !ras_combined;
|
||
|
||
// S1 → S2 latch
|
||
s2_x_q <= s1_x_q;
|
||
s2_y_q <= s1_y_q;
|
||
s2_valid_q <= s1_valid_q;
|
||
s2_inside_q <= s1_pixel_inside;
|
||
s2_L0_q <= -$signed(s1_e1);
|
||
s2_L1_q <= -$signed(s1_e2);
|
||
s2_L2_q <= -$signed(s1_e0);
|
||
s2_mode_q <= ras_mode;
|
||
// Brick 1 — carry the sampled texel + texturing-active
|
||
// flag forward in lockstep with the S2 pixel it belongs
|
||
// to. With TEX_RD_LATENCY=1 against a combinational read2
|
||
// port, s1_tex_color is already the texel for this S1
|
||
// pixel, so a single register aligns it with S2 emit.
|
||
s2_tex_active_q <= s1_tex_active;
|
||
s2_tex_color_q <= s1_tex_color;
|
||
|
||
// S2 emit / S2->S3 blend or Z-test latch.
|
||
//
|
||
// Opaque non-Z pixel (ras_abe=0, ras_zte=0): drive the
|
||
// emit outputs directly, exactly as pre-Brick-2a/2b.
|
||
//
|
||
// Blended pixel (ras_abe=1): capture into the alpha S3
|
||
// stage (no Z scan can also be alpha — mutually exclusive
|
||
// by new_zte_active).
|
||
//
|
||
// Z-tested pixel (ras_zte=1): capture into the Z S3 stage.
|
||
// z_rd_* (above) presented the Z-buffer address this S2
|
||
// cycle. For the combinational read port latch the stored
|
||
// Z now; for the registered port the live z_rd_data is
|
||
// compared at S3.
|
||
s3_valid_q <= 1'b0;
|
||
s3_zvalid_q <= 1'b0;
|
||
if (s2_valid_q && s2_inside_q) begin
|
||
if (ras_tex_abe) begin
|
||
// Ch344 — textured-alpha SPRITE: emit the blended pixel on this primary beat.
|
||
// The texel is live in s2_tex_color_q; the dest was captured into ta_dest_q on
|
||
// the prior frozen beat. ta_blended_color64 = source-over(texel*vtx, ta_dest_q).
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= s2_x_q;
|
||
raster_pixel_y_q <= s2_y_q;
|
||
raster_pixel_color_q <= ta_blended_color64;
|
||
raster_pixel_fb_addr_q <= s2_fb_addr;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= ras_psm;
|
||
end else if (ras_abe) begin
|
||
s3_valid_q <= 1'b1;
|
||
s3_x_q <= s2_x_q;
|
||
s3_y_q <= s2_y_q;
|
||
s3_fb_addr_q <= s2_fb_addr;
|
||
s3_be_q <= s2_emit_be;
|
||
s3_mask_q <= s2_emit_mask;
|
||
s3_psm_q <= ras_psm;
|
||
// Flat SPRITE source color is ras_color (no texturing
|
||
// when blending in v1). Source alpha = RGBAQ.A.
|
||
s3_cs_q <= ras_color;
|
||
s3_as_q <= ras_color[31:24];
|
||
s3_dest_q <= fb_rd_data; // used when FB_RD_REGISTERED=0
|
||
end else if (ras_zte) begin
|
||
s3_zvalid_q <= 1'b1;
|
||
s3_zx_q <= s2_x_q;
|
||
s3_zy_q <= s2_y_q;
|
||
s3_zfb_addr_q <= s2_fb_addr;
|
||
s3_z_addr_q <= s2_z_addr;
|
||
s3_zcolor_q <= s2_emit_color64; // flat (sprite) or Gouraud (TRI) PSMCT32 color
|
||
// Brick 3 — TRI uses the per-pixel INTERPOLATED Z
|
||
// (s2_interp_z); SPRITE uses the flat ras_z_value.
|
||
s3_zval_q <= ras_tri_active ? s2_interp_z : ras_z_value;
|
||
s3_ztst_q <= ras_ztst;
|
||
s3_zmsk_q <= ras_zmsk;
|
||
// Combinational read port: z_rd_data is valid THIS
|
||
// S2 primary beat (mem[s2_z_addr] same cycle), so
|
||
// capture P's stored Z now. The registered read is
|
||
// 1 cycle late and is captured on the following
|
||
// frozen z beat instead (below).
|
||
if (!Z_RD_REGISTERED)
|
||
s3_zstored_q <= z_rd_data;
|
||
end else if (!ras_persp && !ras_combined) begin
|
||
// COMBINED probe: suppress the legacy opaque DECAL
|
||
// emit for the combined primitive (the combined FSM
|
||
// drives the blended color + Z writes instead). This
|
||
// branch is already unreachable in combined mode
|
||
// (s2_valid_q is held 0), but the explicit guard
|
||
// documents the suppression. Constant 1 at param=0.
|
||
// Ch302 — SUPPRESS the affine S2 opaque emit for a
|
||
// perspective primitive. The S2 pixel carries the
|
||
// STALE affine texel (the texture unit is muxed to
|
||
// the perspective u/v), so it must NOT drive the
|
||
// raster_pixel_* port. The perspective emit (below)
|
||
// drives them instead, from the +5 stage. When
|
||
// ras_persp=0 (always at PERSPECTIVE_CORRECT=0) this
|
||
// is byte-identical to the legacy opaque emit.
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= s2_x_q;
|
||
raster_pixel_y_q <= s2_y_q;
|
||
raster_pixel_color_q <= s2_emit_color64;
|
||
raster_pixel_fb_addr_q <= s2_fb_addr;
|
||
raster_pixel_be_q <= s2_emit_be;
|
||
raster_pixel_mask_q <= s2_emit_mask;
|
||
raster_pixel_psm_q <= ras_psm;
|
||
end
|
||
end
|
||
|
||
// Ch302 — PERSPECTIVE emit. Fires at S1+5 (persp_emit5),
|
||
// when s1_tex_color is the perspective DECAL texel for the
|
||
// pixel at (persp_x5, persp_y5). PSMCT32 full-word write
|
||
// (be=1111, mask=all). The DECAL texel color is the sampled
|
||
// ABGR zero-extended to 64 bits (upper 32 = Q, don't-care
|
||
// for the PSMCT32 fb write — mirrors the affine TRI DECAL
|
||
// packing {.., s2_tex_color_q} whose upper half is also
|
||
// unused by the write). Ordered AFTER the affine opaque
|
||
// emit so it wins the raster_pixel_* port on a perspective
|
||
// prim; the affine branch is suppressed via !ras_persp
|
||
// above, so there is never a same-cycle double-drive.
|
||
// ras_persp is constant 0 at PERSPECTIVE_CORRECT=0, so this
|
||
// block is dead in the affine-only build.
|
||
if (ras_persp && persp_emit5) begin
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= persp_x5;
|
||
raster_pixel_y_q <= persp_y5;
|
||
raster_pixel_color_q <= {32'd0, s1_tex_color};
|
||
raster_pixel_fb_addr_q <= persp_fb_addr;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= ras_psm;
|
||
end
|
||
|
||
// S3 emit (Brick 2a) — drive the emit outputs from the
|
||
// blended color. PSMCT32 full-word write (be=1111, mask=all).
|
||
if (s3_valid_q) begin
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= s3_x_q;
|
||
raster_pixel_y_q <= s3_y_q;
|
||
raster_pixel_color_q <= s3_blended_color64;
|
||
raster_pixel_fb_addr_q <= s3_fb_addr_q;
|
||
raster_pixel_be_q <= s3_be_q;
|
||
raster_pixel_mask_q <= s3_mask_q;
|
||
raster_pixel_psm_q <= s3_psm_q;
|
||
end
|
||
|
||
// S3 Z-test (Brick 2b) — PRIMARY-beat write.
|
||
//
|
||
// This always runs on z_advance (the primary beat for a
|
||
// Z scan, OR every cycle if somehow not a Z scan — but
|
||
// s3_zvalid_q is only ever set under ras_zte). On a
|
||
// passing pixel: write the COLOR to the framebuffer this
|
||
// beat (full-word PSMCT32), and queue the Z write for the
|
||
// following frozen z beat (unless ZMSK=1, which suppresses
|
||
// the Z update). On a failing pixel: write NOTHING (the
|
||
// pixel is occluded) and queue nothing.
|
||
zw_pending_q <= 1'b0;
|
||
if (s3_zvalid_q) begin
|
||
if (z_test_pass) begin
|
||
// Framebuffer color write (PSMCT32, full word).
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= s3_zx_q;
|
||
raster_pixel_y_q <= s3_zy_q;
|
||
raster_pixel_color_q <= s3_zcolor_q;
|
||
raster_pixel_fb_addr_q <= s3_zfb_addr_q;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'h00; // PSMCT32 color fb
|
||
// Queue the Z-buffer write for the z beat unless
|
||
// ZMSK masks Z updates. The Z value is stored as a
|
||
// plain 32-bit word (PSMZ32 == 32-bit, bit-identical
|
||
// to a PSMCT32 word write at the Z address).
|
||
if (!s3_zmsk_q) begin
|
||
zw_pending_q <= 1'b1;
|
||
zw_addr_q <= s3_z_addr_q;
|
||
zw_val_q <= s3_zval_q;
|
||
end
|
||
end
|
||
// FAIL: no fb write, no Z write — pixel discarded.
|
||
end
|
||
|
||
// S0 advance + R_SCAN → R_DRAIN transition.
|
||
// COMBINED probe: the legacy 1-pixel/cycle advance is
|
||
// gated OFF while a combined scan owns the walker — the
|
||
// combined FSM (below) advances ras_cur_x/y only after a
|
||
// pixel's full 5-beat sequence (or early-out) completes.
|
||
// !ras_combined is constant 1 at param=0 (byte-identical).
|
||
if ((raster_state == R_SCAN) && !ras_combined) begin
|
||
if (ras_at_end_of_s0) begin
|
||
raster_state <= R_DRAIN;
|
||
end else if (ras_at_x_end) begin
|
||
ras_cur_x <= ras_x_min;
|
||
ras_cur_y <= ras_cur_y + 12'd1;
|
||
end else begin
|
||
ras_cur_x <= ras_cur_x + 12'd1;
|
||
end
|
||
end
|
||
|
||
// ======================================================
|
||
// COMBINED PROBE — per-pixel 5-beat FSM.
|
||
//
|
||
// Drives the SAME raster_pixel_* write port and the SAME
|
||
// tex/fb/z read ports as the legacy paths, but ONE beat
|
||
// at a time so the read2 mux + mutual-exclusion asserts in
|
||
// the BRAM top never see two consumers in one cycle. The
|
||
// read enables themselves (comb_z_req / comb_tex_req /
|
||
// comb_fb_req feeding z_rd_en / tex in_valid / fb_rd_en)
|
||
// are driven combinationally above; this block sequences
|
||
// the beat register, latches returned data, issues the two
|
||
// writes, and advances the walker. Guarded so it is dead
|
||
// logic at COMBINED_TAZ=0 (comb_scan_active const 0).
|
||
// ======================================================
|
||
if (comb_scan_active) begin
|
||
unique case (comb_state_r)
|
||
// Beat 0 — Z read issued this cycle (comb_z_req).
|
||
CB_Z: begin
|
||
if (comb_pix_inside_w) begin
|
||
comb_state_r <= CB_ZW;
|
||
end else begin
|
||
// Outside the triangle: NO reads, NO writes
|
||
// for this pixel — advance immediately.
|
||
comb_advance_walker;
|
||
end
|
||
end
|
||
|
||
// Beat 1 — stored Z ready (z_rd_data). Compute +
|
||
// latch the depth test. On PASS issue the texel
|
||
// read (comb_tex_req high this cycle). On FAIL no
|
||
// texel/dest read, no write — advance.
|
||
CB_ZW: begin
|
||
comb_ztest_pass_r <= comb_ztest_pass_w;
|
||
// Ch304 — capture the Z actually compared
|
||
// (tile Z in tile mode, VRAM Z otherwise).
|
||
// comb_z_compare_w == z_rd_data at TILE_LOCAL=0.
|
||
comb_zstored_r <= comb_z_compare_w;
|
||
if (comb_ztest_pass_w) begin
|
||
// Ch310 — on the bilinear path the texel read
|
||
// issued THIS cycle (comb_tex_req pulsed
|
||
// u_tex.in_valid; the 4-tap sampler started and
|
||
// tex_busy is now high). STALL in CB_TWAIT until
|
||
// it completes, then run the legacy CB_T beat.
|
||
// On the nearest path (bili_now=0, always so at
|
||
// BILINEAR_ENABLE=0) go straight to CB_T as
|
||
// before — byte-identical.
|
||
if (bili_now) comb_state_r <= CB_TWAIT;
|
||
else comb_state_r <= CB_T;
|
||
end else begin
|
||
comb_advance_walker;
|
||
end
|
||
end
|
||
|
||
// Ch310 — Beat 1.5 (BILINEAR ONLY) — hold while the
|
||
// 4-tap sampler runs its ~9-cycle multi-beat read. The
|
||
// sampler runs full-rate; this FSM only steps on
|
||
// z_advance beats, so we wait on the LEVEL !tex_busy
|
||
// (which stays low once the sampler returns to BS_IDLE)
|
||
// rather than the single-cycle out_valid pulse, so a
|
||
// frozen z beat can never miss the completion. in_valid
|
||
// is NOT re-pulsed here: comb_tex_req is gated on
|
||
// (comb_state_r==CB_ZW), and CB_TWAIT!=CB_ZW. When the
|
||
// sampler is done, fall through to CB_T — which latches
|
||
// s1_tex_color (HELD stable since out_valid per the
|
||
// gs_texture_unit hold register) and issues the dest-fb
|
||
// read exactly as the nearest path. This state is dead
|
||
// logic at BILINEAR_ENABLE=0 (never entered).
|
||
CB_TWAIT: begin
|
||
if (!tex_busy) comb_state_r <= CB_T;
|
||
// else: hold (do not advance, do not re-pulse).
|
||
end
|
||
|
||
// Beat 2 — texel ready (s1_tex_color). Latch as the
|
||
// source color Cs + As; issue the dest-fb read
|
||
// (comb_fb_req high this cycle). On the bilinear path
|
||
// s1_tex_color is the HELD blended ABGR captured above.
|
||
CB_T: begin
|
||
// Ch333/335 — MODULATE (TFX=0): texel * interpolated vertex color (Gouraud;
|
||
// flat collapses to the constant); DECAL (TFX=1): texel replaces (unchanged).
|
||
// Alpha keeps the texel value either way.
|
||
comb_cs_r <= comb_modulate ? gs_modulate_abgr(s1_tex_color, comb_interp_color)
|
||
: s1_tex_color;
|
||
comb_as_r <= s1_tex_color[31:24];
|
||
comb_state_r <= CB_FB;
|
||
end
|
||
|
||
// Beat 3 — dest color ready (fb_rd_data). WRITE #1:
|
||
// the blended PSMCT32 color to the FB address.
|
||
CB_FB: begin
|
||
// Ch304 — in tile-local mode the blended color
|
||
// write lands in the on-chip color tile (driven
|
||
// combinationally by the tile-RAM driver this
|
||
// same beat), so SUPPRESS the VRAM framebuffer
|
||
// emit here. tile_active is constant 0 at
|
||
// TILE_LOCAL=0, so this is the legacy emit then.
|
||
if (!tile_active) begin
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= ras_cur_x;
|
||
raster_pixel_y_q <= ras_cur_y;
|
||
// Upper 32 bits unused for a PSMCT32 (32-bit word)
|
||
// write. comb_cs_r is only [31:0], so the original
|
||
// comb_cs_r[63:32] was an out-of-range select (iverilog
|
||
// X-filled it; Quartus rejects it) — zero-fill instead.
|
||
raster_pixel_color_q <= {32'd0,
|
||
comb_blend_a, comb_blend_b,
|
||
comb_blend_g, comb_blend_r};
|
||
raster_pixel_fb_addr_q <= comb_fb_addr;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'h00; // PSMCT32
|
||
end
|
||
comb_state_r <= CB_ZWR;
|
||
end
|
||
|
||
// Beat 4 — WRITE #2: the fragment Z to the Z addr
|
||
// (PSMCT32 full word at the Z address). If ZMSK,
|
||
// skip the write but STILL consume the beat. Then
|
||
// advance the walker (both writes are now done).
|
||
CB_ZWR: begin
|
||
// Ch304 — in tile-local mode the fragment-Z
|
||
// write lands in the on-chip Z tile (driven by
|
||
// the tile-RAM driver this same beat, also ZMSK-
|
||
// gated), so SUPPRESS the VRAM Z emit here.
|
||
// tile_active const 0 at TILE_LOCAL=0 (legacy).
|
||
if (!ras_zmsk && !tile_active) begin
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= ras_cur_x;
|
||
raster_pixel_y_q <= ras_cur_y;
|
||
raster_pixel_color_q <= {32'd0, comb_frag_z};
|
||
raster_pixel_fb_addr_q <= comb_z_addr;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'h00; // 32-bit word
|
||
end
|
||
comb_advance_walker;
|
||
end
|
||
|
||
default: comb_state_r <= CB_Z;
|
||
endcase
|
||
end
|
||
end else begin
|
||
// ----------------------------------------------------
|
||
                // Frozen beat (Brick 2b z beat, or Ch344 textured-alpha beat): pipeline held. Fire the
|
||
// queued Z-buffer write through the single VRAM write
|
||
// port (emitted as a PSMCT32 full-word write at the Z
|
||
// address). No new color contends because S2/S3 are
|
||
// frozen this cycle.
|
||
// ----------------------------------------------------
|
||
if (zw_pending_q) begin
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= s3_zx_q;
|
||
raster_pixel_y_q <= s3_zy_q;
|
||
raster_pixel_color_q <= {32'd0, zw_val_q};
|
||
raster_pixel_fb_addr_q <= zw_addr_q;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'h00; // 32-bit word write
|
||
end
|
||
zw_pending_q <= 1'b0;
|
||
|
||
// Registered read port: z_rd_addr was held across the
|
||
// primary->frozen edge, so z_rd_data on THIS frozen beat
|
||
// is the 1-cycle-late read of the S3-resident pixel's
|
||
// Z-buffer address — i.e. this pixel's stored Z. Capture
|
||
// it for the compare on the next primary beat. (The
|
||
// combinational port already captured at the S2 latch.)
|
||
if (Z_RD_REGISTERED && s3_zvalid_q)
|
||
s3_zstored_q <= z_rd_data;
|
||
end
|
||
|
||
// ==========================================================
|
||
// Ch304 TILE-LOCAL — phase FSM + CLEAR/FLUSH sweeps.
|
||
//
|
||
// Placed OUTSIDE the z_advance/frozen split so the CLEAR and
|
||
// FLUSH sweeps run at FULL rate (one tile RAM op / clock) and
|
||
// the registered tile-RAM read aligns to the very next clock —
|
||
// the combined prim is ZTE so z_beat toggles, and a sweep gated
|
||
// on z_advance would lose the read/emit alignment across frozen
|
||
// beats. The combined RENDER (above) still steps on z_advance;
|
||
// CLEAR/FLUSH touch only the tile RAMs, not the pixel pipeline.
|
||
// Entirely gated by tile_active → dead at TILE_LOCAL=0.
|
||
//
|
||
// TP_CLEAR : the combinational driver writes the clear
|
||
// color/Z at index tile_sweep_r this clock; here
|
||
// we advance the sweep. After all 256 entries
|
||
// (sweep 255 → next 256), move to TP_RENDER.
|
||
// TP_RENDER : the combined FSM (above) runs and, at the bbox
|
||
// corner, comb_advance_walker sets raster_state
|
||
// <= R_DRAIN. Observing R_DRAIN here closes the
|
||
// render → kick off TP_FLUSH.
|
||
// TP_FLUSH : the combinational driver presents color
|
||
// raddr=tile_sweep_r; ONE clock later we emit the
|
||
// FB write (flush_emit_q / flush_idx_q). After all
|
||
// 256 indices presented + the last emit drained,
|
||
// return to TP_OFF → ras_drain_done can fire and
|
||
// the FSM idles / pops the next primitive.
|
||
// ==========================================================
|
||
flush_emit_q <= 1'b0;
|
||
z_flush_emit_q <= 1'b0; // Ch323 — Z-flush pipeline strobe (pulse), default each cycle
|
||
if (tile_active) begin
|
||
if (mp_rd_state != RDS_IDLE) begin
|
||
// Ch328 1c — attr read IN FLIGHT: hold the tile phase until attr_rd_q is valid.
|
||
// RDS_ISSUE presented attr_rd_addr=mp_pend_slot this cycle (comb); attr_rd_q is
|
||
// valid in RDS_CONSUME, where mp_load_prim latches it and the phase advances.
|
||
if (mp_rd_state == RDS_ISSUE) begin
|
||
mp_rd_state <= RDS_CONSUME;
|
||
end else begin // RDS_CONSUME
|
||
`ifndef SYNTHESIS
|
||
// Ch329 — the binning gate must guarantee only COMBINED prims reach the grid
|
||
// render (non-combined have no tile-local path). If one leaks, the walk would
|
||
// stall (tile_active=0) — catch it here instead.
|
||
if (!fifo_combined[mp_pend_slot])
|
||
$error("ch329: multiprim grid loaded NON-combined prim (slot %0d) — binning gate failed", mp_pend_slot);
|
||
`endif
|
||
mp_load_prim(mp_pend_slot, mp_pend_cxlo, mp_pend_cxhi,
|
||
mp_pend_cylo, mp_pend_cyhi);
|
||
raster_state <= R_SCAN;
|
||
comb_state_r <= CB_Z;
|
||
// tile_phase is already TP_RENDER (set at the issuing site, so the clear/flush
|
||
// combinational drivers don't keep writing during the stall).
|
||
mp_rd_state <= RDS_IDLE;
|
||
end
|
||
end else
|
||
unique case (tile_phase_r)
|
||
// ==================================================
|
||
// TP_BIN (BIN_BUFFER_ENABLE only) — classification sweep.
|
||
// One (prim bin_p, tile bin_t) pair per cycle. If the pair
|
||
// overlaps, APPEND prim bin_p to tile bin_t's bin in p order.
|
||
// p (inner) advances every cycle; when p reaches the last
|
||
// prim of the batch, reset p to 0 and advance t (outer).
|
||
// When t passes the last tile, all pairs are classified ->
|
||
// enter TP_CLEAR of tile (0,0). Touches NO tile RAM, so the
|
||
// combinational tile-RAM driver's case-default leaves it inert.
|
||
// Schedule: prim_count_r * NTILES cycles (4 prims x 4 tiles
|
||
// = 16 cycles for a full 2x2 batch), then 1 cycle to TP_CLEAR.
|
||
// ==================================================
|
||
TP_BIN: if (BIN_BUFFER_ENABLE) begin
|
||
// Ch329 — count each non-combined prim ONCE (on the first tile pass); it is
|
||
// refused below. Non-combined prims have no tile-local path (combined-only grid).
|
||
if (bin_t == BIN_T_W'(0) && !fifo_combined[bin_p])
|
||
tile_refused_count <= tile_refused_count + 16'd1;
|
||
// Ch329 — only COMBINED prims are binned (they alone have a tile-local color/Z
|
||
// path). A non-combined prim is skipped → never loaded by the bin-walk → cannot
|
||
// stall the grid (the old behavior: tile_active=0 froze the walk).
|
||
if (bin_overlap && fifo_combined[bin_p]) begin
|
||
// Ch315 — bin-capacity guard. With the bin sized to the
|
||
// FIFO (M=N) bin_n can never reach FIFO_DEPTH (the FIFO
|
||
// drops the prim first), so the else-arm is defensive and
|
||
// would only fire under a future M<N decoupling.
|
||
if (bin_n[bin_t] < FIFO_CNT_W'(FIFO_DEPTH)) begin
|
||
bin_prim[bin_t][bin_n[bin_t]] <= bin_p;
|
||
bin_n[bin_t] <= bin_n[bin_t] + FIFO_CNT_W'(1);
|
||
// peak per-tile occupancy (post-append count).
|
||
if ((bin_n[bin_t] + FIFO_CNT_W'(1)) > bin_occ_max_r)
|
||
bin_occ_max_r <= bin_n[bin_t] + FIFO_CNT_W'(1);
|
||
end else begin
|
||
bin_overflow_r <= 1'b1;
|
||
end
|
||
end
|
||
if (bin_p >= prim_count_r - FIFO_CNT_W'(1)) begin
|
||
// last prim for this tile -> advance to next tile.
|
||
bin_p <= '0;
|
||
if (bin_t >= BIN_T_W'(NTILES - 1)) begin
|
||
// All (p,t) pairs classified -> start rendering.
|
||
tile_phase_r <= TP_CLEAR;
|
||
tile_sweep_r <= 9'd0;
|
||
end else begin
|
||
bin_t <= bin_t + BIN_T_W'(1);
|
||
end
|
||
end else begin
|
||
bin_p <= bin_p + FIFO_CNT_W'(1);
|
||
end
|
||
end
|
||
|
||
TP_CLEAR: begin
|
||
if (tile_sweep_r == 9'd255) begin
|
||
tile_sweep_r <= 9'd0;
|
||
if (BIN_BUFFER_ENABLE) begin
|
||
// BIN BUFFER — walk cur_t's PRECOMPUTED bin
|
||
// instead of re-scanning. Empty bin (bin_n==0)
|
||
// -> FLUSH (tile = clear). Else load bin slot 0's
|
||
// prim (bin_prim[cur_t][0]) with its tile-clipped
|
||
// bbox and render. Bin order == mp_next_nonempty
|
||
// order, so this is image-equivalent to the
|
||
// re-test path. (Reached only at TILE_MULTIPRIM=1,
|
||
// the only mode that ever enters TP_BIN.)
|
||
if (bin_n[cur_t] == FIFO_CNT_W'(0)) begin
|
||
bin_slot_r <= '0;
|
||
prim_idx_r <= '0;
|
||
tile_phase_r <= TP_FLUSH;
|
||
end else begin
|
||
bin_slot_r <= '0;
|
||
prim_idx_r <= binw_first_idx;
|
||
// Ch328 1c — issue single-port read; stall FSM consumes + → TP_RENDER.
|
||
mp_rd_state <= RDS_ISSUE;
|
||
mp_pend_slot <= binw_first_slot;
|
||
mp_pend_cxlo <= binw_first_cx_lo; mp_pend_cxhi <= binw_first_cx_hi;
|
||
mp_pend_cylo <= binw_first_cy_lo; mp_pend_cyhi <= binw_first_cy_hi;
|
||
tile_phase_r <= TP_RENDER;
|
||
end
|
||
end else if (TILE_MULTIPRIM) begin
|
||
// Ch305+ MULTIPRIM — the tile RAMs are now
|
||
// cleared. Load the FIRST non-empty primitive of
|
||
// the batch (skipping primitives whose clip
|
||
// against this tile is empty) and start
|
||
// rendering it. If NO primitive overlaps this
|
||
// tile, go straight to FLUSH (tile = clear).
|
||
// prim_idx_r is set to the chosen index so the
|
||
// RENDER-advance below continues from there.
|
||
if (mp_first_idx >= prim_count_r) begin
|
||
// No primitive overlaps this tile.
|
||
prim_idx_r <= '0;
|
||
tile_phase_r <= TP_FLUSH;
|
||
end else begin
|
||
prim_idx_r <= mp_first_idx;
|
||
// Ch328 1c — issue single-port read; stall FSM consumes + → TP_RENDER.
|
||
mp_rd_state <= RDS_ISSUE;
|
||
mp_pend_slot <= mp_first_slot;
|
||
mp_pend_cxlo <= mp_first_cx_lo; mp_pend_cxhi <= mp_first_cx_hi;
|
||
mp_pend_cylo <= mp_first_cy_lo; mp_pend_cyhi <= mp_first_cy_hi;
|
||
tile_phase_r <= TP_RENDER;
|
||
end
|
||
end else begin
|
||
// Ch305 — CLIP the primitive bbox to THIS tile and
|
||
// load it into the walker bbox. tile_idx_w (low 4
|
||
// bits) gives the tile-local RAM address for ANY
|
||
// 16-aligned tile, so no other retargeting needed.
|
||
// If the clip is EMPTY (prim doesn't overlap this
|
||
// tile) SKIP the RENDER → go straight to FLUSH so
|
||
// the tile shows the clear color. Re-arm R_SCAN so
|
||
// the combined walker runs again for this tile (it
|
||
// is left in R_DRAIN by the previous tile's render;
|
||
// at COLS=ROWS=1 raster_state is already R_SCAN, so
|
||
// this is a no-op → byte-identical).
|
||
ras_x_min <= clip_x_min;
|
||
ras_x_max <= clip_x_max;
|
||
ras_y_min <= clip_y_min;
|
||
ras_y_max <= clip_y_max;
|
||
ras_cur_x <= clip_x_min;
|
||
ras_cur_y <= clip_y_min;
|
||
if (tile_clip_empty) begin
|
||
tile_phase_r <= TP_FLUSH;
|
||
end else if (TILE_SPILL_ENABLE && spill_valid[tile_id_w]) begin
|
||
// Ch323 — RELOAD this tile's color+Z from LPDDR before RENDER, but
|
||
// ONLY if it has been spilled before (valid backing). Arm the
|
||
// staging fill, then TP_RELOAD sweeps it into the tile RAMs → RENDER.
|
||
tile_phase_r <= TP_RELOAD;
|
||
reload_start_o <= 1'b1; // pulse: arm de25 staging fill
|
||
reload_wait <= 1'b1;
|
||
tile_sweep_r <= 9'd0;
|
||
end else begin
|
||
// No valid backing yet (first pass) OR spill disabled: render from
|
||
// the local CLEAR (clear Z/color already written by TP_CLEAR). This
|
||
// is the clean-Z bootstrap — never reload garbage on the first batch.
|
||
tile_phase_r <= TP_RENDER;
|
||
raster_state <= R_SCAN;
|
||
comb_state_r <= CB_Z;
|
||
end
|
||
end
|
||
end else begin
|
||
tile_sweep_r <= tile_sweep_r + 9'd1;
|
||
end
|
||
end
|
||
|
||
TP_RENDER: begin
|
||
if (BIN_BUFFER_ENABLE) begin
|
||
// BIN BUFFER — same pipeline-drain gate as multiprim,
|
||
// but advance through cur_t's PRECOMPUTED bin: the
|
||
// next slot is bin_slot_r+1. If it is past bin_n[cur_t]
|
||
// the tile's bin is exhausted -> FLUSH. Else step the
|
||
// slot, load that bin entry's prim (tile-clipped), and
|
||
// STAY in TP_RENDER (no inter-prim clear). The walk
|
||
// order is the bin's append order == draw order ==
|
||
// mp_next_nonempty order, so image-equivalent.
|
||
if (comb_pipe_empty) begin
|
||
if (bin_slot_r + FIFO_CNT_W'(1) >= bin_n[cur_t]) begin
|
||
tile_phase_r <= TP_FLUSH;
|
||
tile_sweep_r <= 9'd0;
|
||
end else begin
|
||
bin_slot_r <= bin_slot_r + FIFO_CNT_W'(1);
|
||
prim_idx_r <= binw_next_idx;
|
||
// Ch328 1c — issue single-port read; stall FSM consumes (STAY TP_RENDER).
|
||
mp_rd_state <= RDS_ISSUE;
|
||
mp_pend_slot <= binw_next_fslot;
|
||
mp_pend_cxlo <= binw_next_cx_lo; mp_pend_cxhi <= binw_next_cx_hi;
|
||
mp_pend_cylo <= binw_next_cy_lo; mp_pend_cyhi <= binw_next_cy_hi;
|
||
end
|
||
end
|
||
end else if (TILE_MULTIPRIM) begin
|
||
// Ch305+ MULTIPRIM — wait for the per-pixel pipeline to
|
||
// FULLY flush (comb_pipe_empty), not just the walker
|
||
// reaching R_DRAIN, before touching ras_* / starting the
|
||
// next primitive. Otherwise primitive N's in-flight
|
||
// color/Z writes are lost and N+1's tile_z reads see
|
||
// stale data (broken occlusion). prim_idx_r just
|
||
// finished compositing; advance to the next NON-EMPTY
|
||
// primitive (mp_after_idx), or FLUSH if the batch is
|
||
// exhausted for this tile.
|
||
if (comb_pipe_empty) begin
|
||
if (mp_after_idx >= prim_count_r) begin
|
||
tile_phase_r <= TP_FLUSH;
|
||
tile_sweep_r <= 9'd0;
|
||
end else begin
|
||
prim_idx_r <= mp_after_idx;
|
||
// Ch328 1c — issue single-port read; stall FSM consumes (STAY TP_RENDER).
|
||
mp_rd_state <= RDS_ISSUE;
|
||
mp_pend_slot <= mp_after_slot;
|
||
mp_pend_cxlo <= mp_after_cx_lo; mp_pend_cxhi <= mp_after_cx_hi;
|
||
mp_pend_cylo <= mp_after_cy_lo; mp_pend_cyhi <= mp_after_cy_hi;
|
||
end
|
||
end
|
||
end else begin
|
||
// Ch304 single-primitive path — unchanged (byte-identical).
|
||
if (raster_state == R_DRAIN) begin
|
||
tile_phase_r <= TP_FLUSH;
|
||
tile_sweep_r <= 9'd0;
|
||
end
|
||
end
|
||
end
|
||
|
||
TP_FLUSH: begin
|
||
if (tile_sweep_r < 9'd256) begin
|
||
// Ch336 — sparse flush: a non-first batch (batch_full_r=0) writes ONLY the
|
||
// pixels it drew, preserving the accumulated FB. Always-full when not
|
||
// accumulating, or on a scene's first batch -> byte-identical legacy flush.
|
||
flush_emit_q <= (!TILE_ACCUM_ENABLE) || batch_full_r
|
||
|| tile_written_r[tile_sweep_r[7:0]];
|
||
flush_idx_q <= tile_sweep_r[7:0];
|
||
// Latch the tile origin alongside the emit so the
|
||
// registered-read +1-cycle FB write lands at the
|
||
// right screen offset. 0 at COLS=ROWS=1.
|
||
flush_ox_q <= tile_ox;
|
||
flush_oy_q <= tile_oy;
|
||
tile_sweep_r <= tile_sweep_r + 9'd1;
|
||
end else begin
|
||
// All 256 raddrs presented; the final emit (for
|
||
// index 255) drains THIS clock (flush_emit_q set
|
||
// last clock). Ch305 — ADVANCE to the next tile:
|
||
// if this was the last tile, the grid render is
|
||
// done → TP_OFF; otherwise step the tile counter
|
||
// (col++, wrap to row++) and re-CLEAR. At
|
||
// COLS=ROWS=1 tile_is_last is always true → TP_OFF
|
||
// after one tile (byte-identical to Ch303).
|
||
// Ch323 — when spilling, flush THIS tile's Z to LPDDR first
|
||
// (TP_ZFLUSH) and advance from there. Default (no spill) falls
|
||
// straight through to the unchanged advance below.
|
||
if (TILE_SPILL_ENABLE) begin
|
||
tile_phase_r <= TP_ZFLUSH;
|
||
tile_sweep_r <= 9'd0;
|
||
end else
|
||
if (tile_is_last) begin
|
||
if (TILE_MULTIPRIM) begin
|
||
// Ch305+ MULTIPRIM — last tile of the grid
|
||
// done. The batch IS the whole FIFO and the
|
||
// feed finished before grid start, so DRAIN
|
||
// the entire FIFO and end the grid, mirroring
|
||
// the normal ras_drain_done idle tidy-up.
|
||
// NOTE: streaming-multiprim (partial drain
|
||
// with count arithmetic) is future work.
|
||
fifo_rptr <= fifo_wptr;
|
||
fifo_count <= '0;
|
||
for (int di = 0; di < FIFO_DEPTH; di = di + 1)
|
||
fifo_grad_pending[di] <= 1'b0;
|
||
tile_phase_r <= TP_OFF;
|
||
raster_active <= 1'b0;
|
||
raster_state <= R_IDLE;
|
||
ras_mode <= RM_NONE;
|
||
end else begin
|
||
tile_phase_r <= TP_OFF;
|
||
end
|
||
end else begin
|
||
tile_phase_r <= TP_CLEAR;
|
||
tile_sweep_r <= 9'd0;
|
||
if (tile_col_r == 12'(TILE_COLS - 1)) begin
|
||
tile_col_r <= 12'd0;
|
||
tile_row_r <= tile_row_r + 12'd1;
|
||
end else begin
|
||
tile_col_r <= tile_col_r + 12'd1;
|
||
end
|
||
end
|
||
end
|
||
end
|
||
|
||
// Ch323 (TILE_SPILL_ENABLE) — RELOAD this tile's color+Z from the staging
|
||
// engine (filled from LPDDR) into the tile RAMs BEFORE rendering. Wait for
|
||
// the staging fill (tile_reload_ready_i), then sweep 0..255 presenting
|
||
// tile_reload_raddr_o; the returned color/Z (1-cyc later) is written via
|
||
// reload_wr_we (the comb arm above). ALL 256 written before TP_RENDER.
|
||
TP_RELOAD: begin
|
||
if (reload_wait) begin
|
||
// staging fill in flight — hold until warm, no tile-RAM writes yet.
|
||
if (tile_reload_ready_i) begin
|
||
reload_wait <= 1'b0;
|
||
tile_sweep_r <= 9'd0;
|
||
end
|
||
end else if (tile_sweep_r < 9'd256) begin
|
||
// present raddr=sweep this cycle; write idx=sweep NEXT cycle when its
|
||
// staging data is valid (reload_wr_we/addr registered 1-cyc behind).
|
||
reload_wr_we <= 1'b1;
|
||
reload_wr_addr <= tile_sweep_r[7:0];
|
||
tile_sweep_r <= tile_sweep_r + 9'd1;
|
||
end else begin
|
||
// sweep done; the final write (idx 255) drains THIS cycle via the
|
||
// reload_wr_we set last cycle. All 256 now resident → RENDER.
|
||
tile_phase_r <= TP_RENDER;
|
||
raster_state <= R_SCAN;
|
||
comb_state_r <= CB_Z;
|
||
end
|
||
end
|
||
|
||
// Ch323 (TILE_SPILL_ENABLE) — emit this tile's Z RAM to the LPDDR
|
||
// Z-backing, then run the SAME tile-advance as TP_FLUSH. Separate phase
|
||
// from color flush (Codex): color flush completes first, then Z flush;
|
||
// distinct counters/errors. Never entered at the default.
|
||
TP_ZFLUSH: begin
|
||
if (tile_sweep_r < 9'd256) begin
|
||
z_flush_emit_q <= 1'b1;
|
||
z_flush_idx_q <= tile_sweep_r[7:0];
|
||
z_flush_ox_q <= tile_ox;
|
||
z_flush_oy_q <= tile_oy;
|
||
tile_sweep_r <= tile_sweep_r + 9'd1;
|
||
end else begin
|
||
// 256 Z reads presented; the final Z emit (index 255) drains
|
||
// THIS clock. The tile's color+Z are now spilled to LPDDR, so its
|
||
// backing is VALID — later passes of this tile may reload it.
|
||
spill_valid[tile_id_w] <= 1'b1;
|
||
// Now advance exactly like TP_FLUSH does.
|
||
if (tile_is_last) begin
|
||
if (TILE_MULTIPRIM) begin
|
||
fifo_rptr <= fifo_wptr;
|
||
fifo_count <= '0;
|
||
for (int di = 0; di < FIFO_DEPTH; di = di + 1)
|
||
fifo_grad_pending[di] <= 1'b0;
|
||
tile_phase_r <= TP_OFF;
|
||
raster_active <= 1'b0;
|
||
raster_state <= R_IDLE;
|
||
ras_mode <= RM_NONE;
|
||
end else begin
|
||
tile_phase_r <= TP_OFF;
|
||
end
|
||
end else begin
|
||
tile_phase_r <= TP_CLEAR;
|
||
tile_sweep_r <= 9'd0;
|
||
if (tile_col_r == 12'(TILE_COLS - 1)) begin
|
||
tile_col_r <= 12'd0;
|
||
tile_row_r <= tile_row_r + 12'd1;
|
||
end else begin
|
||
tile_col_r <= tile_col_r + 12'd1;
|
||
end
|
||
end
|
||
end
|
||
end
|
||
|
||
default: ; // TP_OFF — idle.
|
||
endcase
|
||
end
|
||
|
||
// Ch304 — TILE-LOCAL flush emit. One clock after a FLUSH raddr
|
||
// was presented, the registered tile color rdata is valid; emit
|
||
// it as a PSMCT32 framebuffer write at the linear address for
|
||
// (x,y)=(flush_idx[3:0], flush_idx[7:4]). During CLEAR/RENDER no
|
||
// flush_emit_q is set, and the combined FSM suppresses its own
|
||
// emit in tile mode, so there is never a same-clock double-drive
|
||
// of raster_pixel_*. tile_active const 0 at TILE_LOCAL=0 (dead).
|
||
if (tile_active && flush_emit_q) begin
|
||
raster_pixel_emit <= 1'b1;
|
||
raster_pixel_emit_count <= raster_pixel_emit_count + 32'd1;
|
||
raster_pixel_x_q <= {8'd0, flush_idx_q[3:0]};
|
||
raster_pixel_y_q <= {8'd0, flush_idx_q[7:4]};
|
||
if (TILE_COLOR_PSMCT16) begin
|
||
// Ch305+ PSMCT16 flush. tile_color_rdata[15:0] is already
|
||
// the packed pix16; emit it exactly as the S2 PSMCT16
|
||
// raster path does (s2_psm_is_16bit arm ~line 3973):
|
||
// * pix16 in the LOW halfword of the color word,
|
||
// * be = 4'b0011, mask = 0xFFFF_FFFF, psm = 0x02,
|
||
// * fb_addr = PSMCT16 byte address (2 bytes/pixel).
|
||
// vram_normalize_pkg::normalize_write then takes
|
||
// payload[15:0] and commits it to the low or high
|
||
// halfword per byte_addr[1] (flush_fb_addr16_w supplies
|
||
// that bit), so a PSMCT16 scanout reads back this pix16.
|
||
raster_pixel_color_q <= {48'd0, tile_color_rdata[15:0]};
|
||
raster_pixel_fb_addr_q <= flush_fb_addr16_w;
|
||
raster_pixel_be_q <= 4'b0011;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'h02; // PSMCT16
|
||
end else begin
|
||
// Legacy PSMCT32 flush (byte-identical).
|
||
raster_pixel_color_q <= {32'd0, tile_color_rdata32};
|
||
raster_pixel_fb_addr_q <= flush_fb_addr_w;
|
||
raster_pixel_be_q <= 4'b1111;
|
||
raster_pixel_mask_q <= 32'hFFFF_FFFF;
|
||
raster_pixel_psm_q <= 6'h00; // PSMCT32
|
||
end
|
||
end
|
||
|
||
// Ch323 — TILE Z-FLUSH emit (TILE_SPILL_ENABLE). One clock after a TP_ZFLUSH Z
|
||
// raddr was presented, the registered tile_z_rdata is valid; emit it to the
|
||
// LPDDR Z-backing via the SEPARATE z_flush_* channel (the color flush path above
|
||
// is untouched). z_flush_emit_q is only ever set in TP_ZFLUSH, so at the default
|
||
// this is dead and z_flush_emit_o stays 0.
|
||
if (TILE_SPILL_ENABLE && tile_active && z_flush_emit_q) begin
|
||
z_flush_emit_o <= 1'b1;
|
||
z_flush_addr_o <= z_flush_addr_w; // Z-backing-relative byte offset (index*4)
|
||
z_flush_data_o <= tile_z_rdata; // 32-bit Z for this tile pixel
|
||
end
|
||
|
||
// Ch323 — DEDICATED color-flush spill emit: gated EXACTLY like the Z-flush above
|
||
// (tile_active && flush_emit_q = TP_FLUSH only), so the color spill writer sees one
|
||
// emit per tile pixel ON FLUSH and NOT the RENDER-phase raster_pixel_emit traffic.
|
||
// Same pixel_index*4 addressing as Z (no FBP). flush_emit_q is set only in TP_FLUSH.
|
||
if (TILE_SPILL_ENABLE && tile_active && flush_emit_q) begin
|
||
tile_color_flush_emit_o <= 1'b1;
|
||
tile_color_flush_addr_o <= flush_pixel_index_w << 2;
|
||
tile_color_flush_data_o <= tile_color_rdata32;
|
||
end
|
||
end
|
||
end
|
||
|
||
// ------------------------------------------------------------------
|
||
// Trace emission
|
||
// ------------------------------------------------------------------
|
||
|
||
always_ff @(posedge clk) begin
    if (!rst_n) begin
        // Synchronous, active-low reset: no event pending, payload
        // fields cleared, subsystem/event parked at GS / EV_MODE.
        ev_valid  <= 1'b0;
        ev_flags  <= 32'd0;
        ev_subsys <= SUBSYS_GS;
        ev_event  <= EV_MODE;
        ev_arg3   <= 64'd0;
        ev_arg2   <= 64'd0;
        ev_arg1   <= 64'd0;
        ev_arg0   <= 64'd0;
    end else if (reg_wr_en || gif_reg_wr_en) begin
        // A write on either port produces one trace record this cycle.
        // The record header is identical for both sources.
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_GS;
        ev_flags  <= 32'd0;

        if (reg_wr_en) begin
            // Privileged CRTC/MMIO write port. It takes priority: if
            // both strobes are high in the same cycle, only this write
            // is traced. arg3 is unused on this path.
            ev_arg3 <= 64'd0;
            if (reg_wr_addr == BGCOLOR_OFFSET) begin
                // BGCOLOR is the only privileged register decoded
                // today: the three low bytes of the write data are
                // split out, highest byte first, into arg0..arg2.
                ev_event <= EV_BGCOLOR;
                ev_arg0  <= {56'd0, reg_wr_data[23:16]};
                ev_arg1  <= {56'd0, reg_wr_data[15:8]};
                ev_arg2  <= {56'd0, reg_wr_data[7:0]};
            end else begin
                // Every other privileged write is reported generically
                // as EV_MODE with the raw address and data.
                ev_event <= EV_MODE;
                ev_arg0  <= {48'd0, reg_wr_addr};
                ev_arg1  <= reg_wr_data;
                ev_arg2  <= 64'd0;
            end
        end else begin
            // GIF A+D write port (Ch75/Ch98).
            if (prim_complete_now) begin
                // Ch76: this write closes a primitive, so the more
                // specific EV_PRIM_DRAW replaces the plain register
                // record for this cycle.
                //   arg0 = prim_type
                //   arg1 = vert_threshold
                //   arg2 = prim_complete_count + 1 (post-increment view)
                //   arg3 = data of the closing write
                ev_event <= EV_PRIM_DRAW;
                ev_arg0  <= {61'd0, prim_type};
                ev_arg1  <= {61'd0, vert_threshold};
                ev_arg2  <= {32'd0, prim_complete_count + 32'd1};
                ev_arg3  <= gif_reg_data;
            end else begin
                // Plain register write: EV_WRITE for known registers,
                // EV_MODE for anything else.
                //   arg0 = register number
                //   arg1 = write data
                //   arg2 = stable per-register selector:
                //          1=PRIM    2=RGBAQ   3=XYZF2   4=XYZ2
                //          5=FRAME_1 6=ZBUF_1  7=TEX0_1  8=ALPHA_1
                //          9=TEST_1  0=any other register
                //   arg3 = 0
                ev_event <= gif_is_known_reg ? EV_WRITE : EV_MODE;
                ev_arg0  <= {56'd0, gif_reg_num};
                ev_arg1  <= gif_reg_data;
                ev_arg2  <= (gif_reg_num == GIF_REG_PRIM)    ? 64'd1 :
                            (gif_reg_num == GIF_REG_RGBAQ)   ? 64'd2 :
                            (gif_reg_num == GIF_REG_XYZF2)   ? 64'd3 :
                            (gif_reg_num == GIF_REG_XYZ2)    ? 64'd4 :
                            (gif_reg_num == GIF_REG_FRAME_1) ? 64'd5 :
                            (gif_reg_num == GIF_REG_ZBUF_1)  ? 64'd6 :
                            (gif_reg_num == GIF_REG_TEX0_1)  ? 64'd7 :
                            (gif_reg_num == GIF_REG_ALPHA_1) ? 64'd8 :
                            (gif_reg_num == GIF_REG_TEST_1)  ? 64'd9 :
                                                               64'd0;
                ev_arg3  <= 64'd0;
            end
        end
    end else begin
        // Idle cycle: drop the valid strobe. The payload registers
        // simply hold their previous contents.
        ev_valid <= 1'b0;
    end
end
|
||
|
||
endmodule : gs_stub
|