Files
retroDE_ps2/rtl/top/top_psmct32_raster_demo_bram.sv
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

1673 lines
87 KiB
Systemverilog
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// retroDE_ps2 — top_psmct32_raster_demo_bram (Ch157 BRAM
// wrapper with writer-side PSM normalization + PSMT4 RMW pipe)
//
// **Ch155 origin / Ch156 generalization / Ch157 PSMT4 enable.**
// Ch156 plumbs `vram_normalize_pkg::normalize_write` between the
// writer engines (gs_stub raster + gif_image_xfer_stub) and
// `vram_bram_stub`. Ch157 adds a 1-cycle wrapper-site read-modify-
// write pipeline that lets `normalize_write` splice the PSMT4
// nibble against the live `old_byte` from VRAM (instead of
// `old_byte=0`, which Ch156 worked around with a hard-gate). The
// VRAM instance is `vram_bram_stub` (Ch154 BRAM-friendly: 2048 ×
// 32-bit, sync read 1-cycle latency, byte-WE only, no per-bit
// mask) instead of `vram_stub` (legacy 8192 × 8-bit byte-
// addressable + per-bit mask RMW + combinational reads).
//
// **PSM coverage** (Ch157):
// - CT32: passthrough (byte_addr is word-aligned, payload is
// the ABGR word, be=4'b1111). Same-cycle write.
// - CT16: byte_addr[1] selects low/high halfword lane; payload
// shifted to the right 16-bit lane; be=4'b0011 or 4'b1100.
// Same-cycle write.
// - PSMT8: byte_addr[1:0] selects 1 of 4 byte lanes; payload
// shifted; be picks one byte. Same-cycle write.
// - PSMT4: 1-cycle delayed write through the RMW pipe. The
// wrapper drives `read2_addr = byte_addr` on the emit cycle
// and uses the registered `read2_data` one cycle later as
// `old_byte` for `normalize_write`. The Ch156 hard-gate is
// gone. Back-to-back same-byte T4 emits (e.g. PSMT4 SPRITE
// pixels at x=2k and x=2k+1 share a byte) hazard-forward
// through `t4_prev_*` registers — see the comment block at
// the pipe instantiation below.
//
// **Ch158 PCRTC sync-read alignment**: `gs_pcrtc_stub` is now
// instantiated with `VRAM_SYNC_READ=1` so its data-decode +
// sync-output stages are delayed by 1 cycle to align with
// `vram_bram_stub`'s registered `read_data`. The address-side
// (`vram_read_addr`) keeps using the current scanout coords so
// the read is issued one pixel "ahead"; the registered
// `vram_read_data` returns one cycle later, paired with the
// matching delayed counter view inside the PCRTC. Captured
// scanout pixels are no longer 1-column shifted, so the
// integration TB can now verify the captured frame matches the
// rasterized VRAM contents end-to-end (Ch155 had to skip frame
// capture, Ch156/Ch157 still skipped it; Ch158 unblocks it).
//
// Topology — same as the Ch146 wrapper (see
// rtl/top/top_psmct32_raster_demo.sv) except for the VRAM
// instance:
//
// bios_rom_stub#(.IMAGE_FILE(BIOS_IMAGE_FILE)) — EE bootlet at 0xBFC0_0000
// ee_ram_stub#(.IMAGE_FILE(PAYLOAD_IMAGE_FILE)) — GIF payload at phys 0x100
// ee_memory_map_stub#(.USEG_SHADOW_WORDS_PARAM(1024)) — Ch145 BRAM shrink
// ee_core_stub#(.PC_RESET(0xBFC00000)) — MIPS R5900 core
// ee_gs_priv_bridge_stub — 32-bit MMIO → 64-bit GS-priv
// dmac_reg_stub — DMAC ch2
// gif_packed_stub#(.REAL_AD_REG_MAP(1'b1)) — GIFtag + PACKED A+D parser
// gs_stub#(.PSMCT32_SWIZZLE(1'b1)) — GS register file + raster
// gif_image_xfer_stub#(.PSMCT32_SWIZZLE(1'b1)) — TRXDIR/IMAGE engine (idle in Ch123)
// **vram_bram_stub#(.BYTES(8192)) — Ch154 BRAM-friendly VRAM** (DIFF)
// gs_pcrtc_stub#(.PSMCT32_SWIZZLE(1'b1), .VRAM_SYNC_READ(1'b1)) — PCRTC sync-read scanout (Ch158)
//
// All other behavior is inherited from the Ch146 wrapper:
// `$readmemh` from IMAGE_FILE parameters, useg_shadow trimmed
// to 1024 words via Ch145, status bundle (core_halt /
// dma_done_seen / frame_seen) exposed for LEDs, no procedural
// drives.
//
// Top-level ports:
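// clk, rst_n — single clock domain, active-low reset (described as synchronous; note the wrapper-local heartbeat flops list `negedge rst_n`, i.e. they reset asynchronously)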
// core_go — pulsed high for one cycle to start the EE bootlet
// (a board reset-release sequencer can tie it high
// after rst_n deasserts)
// r/g/b, hsync, vsync, de — 8-bit RGB scanout (PCRTC active region)
// core_halt — high once SYSCALL halts the EE
// dma_done_seen — sticky: high once DMAC channel-2 fires its DONE event
// frame_seen — sticky: high once one full PCRTC frame end-of-frame fires
//
// Parameters:
// H_ACTIVE / V_ACTIVE — PCRTC active region (defaults to the Ch123 16×8)
// BIOS_SIZE_BYTES — bios_rom_stub size (default 4 KiB)
// RAM_SIZE_BYTES — ee_ram_stub size (default 4 KiB)
// VRAM_BYTES — vram_bram_stub size (default 8 KiB)
// USEG_SHADOW_WORDS_PARAM — Ch145 useg-shadow size (default 1024 = 4 KiB)
//
// Macros (NOT parameters — iverilog-12 string-parameter forwarding
// limitation forced them to be macros; see the `define block
// just below the `timescale directive):
// TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE — path to bios.mem
// (one 32-bit hex word/line)
// TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE — path to payload.mem
// (one 128-bit hex qword/line)
// Both default to "" so the wrapper is still elaborable without
// fixtures (synthetic NOP-sled in bios_rom_stub + zero-init
// ee_ram_stub, which produces no DMAC payload but a stable PCRTC
// frame). On synthesis these become FPGA-tool defines.
//
// PASS for the integration TB (`tb_top_psmct32_raster_demo_bram`):
// - all 128 PSMCT32 pixel words at canonical swizzled byte
// addresses match expected ABGR via hierarchical probe of
// `dut.u_vram.mem[byte_addr >> 2]` (Phase 1, Ch155)
// - core_halt + dma_done_seen + frame_seen latched after the
// EE bootlet SYSCALLs and the DMAC drains
// - one full PCRTC frame captured and per-pixel verified
// against the rasterized image (Phase 2, Ch158)
`timescale 1ns/1ps
// BIOS / payload image paths are passed via macros (iverilog-12
// limitation: string parameter forwarding through hierarchy
// elaborates inconsistently). On synthesis the same macros become
// FPGA-tool defines pointing at .mem fixtures or board-specific
// files. The macros default to empty strings (synthetic NOP-sled +
// zero-RAM fallback in bios_rom_stub / ee_ram_stub) so the wrapper
// is still elaborable without bake artifacts present.
`ifndef TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE
`define TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE ""
`endif
`ifndef TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE
`define TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE ""
`endif
module top_psmct32_raster_demo_bram
import trace_pkg::*;
#(
parameter int H_ACTIVE = 16,
parameter int V_ACTIVE = 8,
// Ch169 — expose PCRTC blanking parameters so the board wrapper can
// override them to a standard HDMI mode (e.g. VGA 640x480@60). The
// defaults (1-pixel borders) preserve the Ch123 tiny-frame behavior
// used by every existing sim TB.
parameter int H_FRONT = 1,
parameter int H_SYNC = 1,
parameter int H_BACK = 1,
parameter int V_FRONT = 1,
parameter int V_SYNC = 1,
parameter int V_BACK = 1,
parameter int BIOS_SIZE_BYTES = 4 * 1024,
parameter int RAM_SIZE_BYTES = 4 * 1024,
parameter int VRAM_BYTES = 8 * 1024,
// Ch251.4 — VRAM second-read-port enable. Default = 1 keeps every
// simulation TB byte-identical (PSMT4 RMW path is live). Hardware
// build overrides to 0 to halve VRAM's M20K footprint by avoiding
// the 1W+2R → 2×(1W+1R) replication. See vram_bram_stub for the
// full contract.
parameter bit VRAM_ENABLE_READ2 = 1'b1,
parameter int unsigned USEG_SHADOW_WORDS_PARAM = 1024,
// Ch296 — useg-shadow backing enable, threaded down to
// ee_memory_map_stub. Default 1 keeps every sim TB byte-identical
// (useg shadow live). The board build overrides to 0 to remove the
// ~33k-FF useg_shadow_mem array; the PSMCT32 SPRITE-only bootlet
// runs from BIOS + EE-RAM and issues no useg traffic, so the shadow
// is dead on the board path. See ee_memory_map_stub.USEG_SHADOW_ENABLE.
parameter bit USEG_SHADOW_ENABLE = 1'b1,
// Ch162 — passes through to `ee_core_stub.STRIP_HW_DIVIDER`. Set
// to 1 on hardware builds (the PSMCT32 SPRITE-only bootlet
// doesn't execute DIVU) so Quartus doesn't infer the 32-bit
// hardware divider and can close timing on a faster clock.
// Default 0 keeps every existing sim TB unchanged.
parameter bit STRIP_HW_DIVIDER = 1'b0,
// Ch163 — passes through to `gs_pcrtc_stub.STRIP_PCRTC_MAG_DIV`.
// Hardware builds set this to 1 (the demo locks MAGH=MAGV=0 so
// the divisor is constant 1 and the math collapses to a
// passthrough); Quartus then can't infer the PCRTC magnification
// divider, retiring the Ch162-onwards STA worst path. Default 0
// keeps every existing scanout MAG TB unchanged.
parameter bit STRIP_PCRTC_MAG_DIV = 1'b0,
// Ch295 — PSMCT32 page/block swizzle gate, mirroring the
// vram_stub variant (top_psmct32_raster_demo). Default 1'b1
// preserves the Ch123/Ch251 swizzled raster+scanout behavior and
// every existing TB that drives this BRAM top (the flat
// production demo). A TEXTURED-sprite fixture sets this to 0 so
// the linear gs_texel_addr fetch and the BITBLT texture upload
// land in the SAME (linear) VRAM layout — the v1 textured-path
// scope. Forwarded to gs_stub / gif_image_xfer_stub / gs_pcrtc_stub
// together so all three VRAM views stay consistent.
parameter bit PSMCT32_SWIZZLE = 1'b1,
// Ch298 — SWIZZLED PSMT4 texture path. When 1, the PSMT4 texture UPLOAD
// (gif_image_xfer_stub) writes the real PS2 block layout AND the texture
// SAMPLER (gs_stub -> gs_texture_unit) reads it back swizzled, so the two
// VRAM views are consistent. Default 0 keeps every linear PSMT4/PSMT8/
// PSMCT32 demo + TB byte-identical; the swizzle demo sets it to 1.
parameter bit PSMT4_SWIZZLE = 1'b0,
// Ch299 — SWIZZLED PSMT8 texture path. The sibling of PSMT4_SWIZZLE,
// MINUS the nibble (PSMT8 is 1 byte/texel). When 1, the PSMT8 texture
// UPLOAD (gif_image_xfer_stub) writes the real PS2 block layout AND the
// texture SAMPLER (gs_stub -> gs_texture_unit) reads it back swizzled, so
// the two VRAM views are consistent. The framebuffer SCANOUT stays linear
// PSMCT32 (gs_pcrtc PSMT8_SWIZZLE untouched). Default 0 keeps every linear
// PSMT8/PSMT4/PSMCT32 demo + TB byte-identical; the swizzle demo sets it 1.
parameter bit PSMT8_SWIZZLE = 1'b0,
// Ch301 — PERSPECTIVE-CORRECT textured triangles (forwarded to gs_stub).
// When 1, a TME TRIANGLE supplied via ST (S=u/w,T=v/w) + RGBAQ.Q (=1/w)
// is interpolated perspective-correctly via the pipelined reciprocal LUT.
// Default 0 generate-guards all perspective logic out (zero cost); only the
// GS_PERSP_DEMO board profile sets it 1.
parameter bit PERSPECTIVE_CORRECT = 1'b0,
parameter int PERSP_RECIP_IDX_BITS = 8, // Ch351 — perspective reciprocal LUT width (far-W -> 11)
parameter int GRAD_DIV_CYCLES = 1, // Ch352 — triangle-setup divide settle cycles (board fits -> 4)
parameter bit GRAD_SEQ_DIVIDER = 1'b0, // Ch352 — sequential gradient divider (board fits -> 1)
// Ch344 — TEXTURED + source-over ALPHA SPRITE path (forwarded to gs_stub). Default 0 -> byte-identical.
parameter bit SPRITE_TEX_ALPHA = 1'b0,
parameter bit SPRITE_TEX_ALPHA_CLUT = 1'b0, // Ch347 — admit PSMT8 (CLUT) textures into the alpha-sprite path
parameter bit CLUT_CSM1_ENABLE = 1'b0, // Ch350 — CSM1 16x16 CT32 grid CLUT load (SH3 indexed env path)
// Ch302 — COMBINED textured+alpha+depth probe (forwarded to gs_stub). When 1,
// a TME+ABE+ZTE triangle runs the multi-beat per-pixel FSM (Zread->Ztest->
// texel->dest->colorwrite->Zwrite). Default 0 generate-guards it out (every
// existing demo byte-identical); only the GS_COMBINED_DEMO board profile sets 1.
parameter bit COMBINED_TAZ = 1'b0,
// Ch303 — TILE-LOCAL render mode (forwarded to gs_stub). When 1, a combined
// TME+ABE+ZTE triangle renders into an on-chip 16x16 color+Z tile
// (CLEAR->RENDER->FLUSH); texture still from VRAM, only color/Z move on-chip.
// Default 0 generate-guards it out (every existing demo byte-identical); only
// the GS_TILE_DEMO board profile sets 1 (implies COMBINED_TAZ=1).
parameter bit TILE_LOCAL = 1'b0,
// Ch304 — tile GRID dimensions (forwarded to gs_stub). Default 1x1 = the
// Ch303 single tile (byte-identical); GS_TILE2X2_DEMO sets 2x2 to render one
// primitive across a 2x2 grid of 16x16 tiles (per-tile clear/render/flush).
parameter int TILE_COLS = 1,
parameter int TILE_ROWS = 1,
// Ch305 — MULTI-PRIMITIVE tiled scene. When TILE_MULTIPRIM=1 the tile grid
// re-renders a LIST of TILE_PRIM_COUNT primitives (all buffered in the FIFO)
// per tile, in order, so later primitives depth-test/alpha-blend over earlier
// ones within each tile. Default 0 = the Ch304 single-primitive grid
// (byte-identical); GS_TILE_MULTIPRIM_DEMO sets 1 + the batch size.
parameter bit TILE_MULTIPRIM = 1'b0,
parameter int TILE_PRIM_COUNT = 1,
// Ch329 — the Ch255 heartbeat read-splicer forcibly patches EE-RAM qword 115's low 32
// bits (0x730) with hb_rgbaq on every DMAC read, for the input-driven heartbeat demo.
// That CORRUPTS any GIF payload that happens to occupy qword 115 (e.g. a depth-64 multi-
// prim batch whose 17th prim's first XYZ2 lands there → vertex reads 0xFFF/0xFF0). Gate it
// OFF for non-heartbeat profiles (GS tile/capacity tests). Default ON = byte-identical.
parameter bit HEARTBEAT_SPLICE_ENABLE = 1'b1,
// Ch330 — runtime primitive-list feeder. When 1, gs_prim_list_feeder becomes the EXCLUSIVE
// owner of the gif_reg_* stream into gs_stub (the GIF unpacker is muxed out), expanding a
// normalized combined-TAZ list from staging instead of the baked DMA payload. Default 0 =
// dead logic, byte-identical for every existing profile.
parameter bit FEEDER_ENABLE = 1'b0,
parameter int FEEDER_STG_WORDS = 256,
// Ch315 — primitive FIFO / per-tile bin depth (capacity). Power-of-2, default
// 4 = byte-identical. GS_TILE_CAP_DEMO sets 8 to scale capacity past the old 4.
parameter int TILE_FIFO_DEPTH = 4,
// Ch317 — LPDDR-backed framebuffer (tile-flush only). When 1, the PSMCT16 tile
// FLUSH stream is ALSO committed to an LPDDR-style framebuffer model
// (gs_lpddr_fb_writer) — a transitional additive MIRROR alongside the on-chip
// BRAM FB, so the LPDDR write path (linear address, per-row bursts, 4 KiB cap)
// can be proven by readback while scanout still comes from BRAM. Tile color/Z
// and texture stay on-chip. Default 0 → the writer is inert (byte-identical).
parameter bit LPDDR_FB_ENABLE = 1'b0,
parameter int LPDDR_FB_BYTES = 8192, // 64x64 PSMCT16 = 8 KiB
// Ch306 — GS SCISSOR_1 rectangular clipping baked into the tile walker bounds
// (effective bounds = primitive bbox ∩ tile bbox ∩ scissor rect). Default 0 =
// no scissor (byte-identical); GS_TILE_SCISSOR_DEMO sets 1.
parameter bit SCISSOR_ENABLE = 1'b0,
// Ch307 — GS texture WRAP MODES (CLAMP_1 WMS/WMT: REPEAT/CLAMP) applied in the
// sampler before texel-address gen. Default 0 = pass-through (byte-identical);
// GS_TILE_WRAP_DEMO sets 1.
parameter bit TEX_WRAP_ENABLE = 1'b0,
// Ch308 — on-chip tile COLOR buffer stored as PSMCT16 (RGB5A1, 16-bit) instead
// of PSMCT32 (32-bit): halves the tile color RAM + flushes a PSMCT16 framebuffer.
// Default 0 = PSMCT32 (byte-identical); GS_TILE_PSMCT16_DEMO sets 1.
parameter bit TILE_COLOR_PSMCT16 = 1'b0,
// Ch309 — generic GS ALPHA blend modes (Cv=((A-B)*C>>7)+D selectors + FIX) per
// primitive, instead of only source-over. Default 0 = source-over (byte-identical);
// GS_TILE_ALPHA_DEMO sets 1.
parameter bit ALPHA_MODES_ENABLE = 1'b0,
// Ch310 — 4-tap BILINEAR texture filtering (PSMCT32) in the combined tile path,
// per-primitive via TEX1.MMAG. Default 0 = nearest (byte-identical);
// GS_TILE_BILINEAR_DEMO sets 1.
parameter bit BILINEAR_ENABLE = 1'b0,
// Ch314 — bilinear for PALETTIZED (PSMT8/PSMT4) textures in the combined
// path (CLUT-before-interp). Default 0 = byte-identical; GS_TILE_PALBILINEAR_DEMO sets 1.
parameter bit PALETTE_BILINEAR = 1'b0,
// Ch311 — real per-tile BIN BUFFER: precompute per-tile primitive lists in a
// binning pass, render walks each tile's bin (vs re-testing all prims per tile).
// Default 0 = Ch305 re-test path (byte-identical); GS_TILE_BIN_DEMO sets 1.
parameter bit BIN_BUFFER_ENABLE = 1'b0,
// Ch323 — tile COLOR+Z spill/reload to LPDDR (forwarded to gs_stub). Default 0 →
// byte-identical (no RELOAD/ZFLUSH phases, no Z-flush stream). The GS_TILE_SPILL_DEMO
// board/e2e profile sets 1.
parameter bit TILE_SPILL_ENABLE = 1'b0,
parameter bit SPILL_FORCE_VALID = 1'b0, // Ch323 test hook (negative bootstrap test)
// Ch326 — LPDDR-ONLY framebuffer: when 1, the tile color flush does NOT mirror into the
// on-chip vram_stub FB (it only spills to LPDDR via the dedicated color channel), so the
// 64 KiB BRAM FB mirror is reclaimed and the displayed FB is the EXTERNAL LPDDR one. Only
// valid with TILE_SPILL_ENABLE + LPDDR scanout (no BRAM-scanout fallback). vram_stub then
// holds only the texture (place it low, e.g. TBP0=0, and shrink VRAM_BYTES accordingly).
parameter bit FB_LPDDR_ONLY = 1'b0,
// Ch322 — LPDDR-backed texture (PREFILLED cache). When 1, the texel-fetch
// read port is MUXED to an external prefilled texture cache (gs_texture_cache,
// instantiated in the de25 top on emif_clk, filled from LPDDR4B before raster)
// once that cache reports ready — at the SAME 1-cycle latency as the BRAM read2,
// so the nearest-path sampler's fixed-latency contract is preserved. Default 0
// generate-guards the mux to a constant pass-through (byte-identical); only the
// GS_LPDDR_TEX board/e2e profile sets 1. TEX_VRAM_BASE/TEX_CACHE_BYTES bound the
// cached VRAM byte range (reads outside it still come from BRAM).
parameter bit GS_LPDDR_TEX = 1'b0,
parameter int TEX_VRAM_BASE = 2048, // TBP0*256 (tritex: TBP0=8)
parameter int TEX_CACHE_BYTES = 256 // 8x8 PSMCT32
) (
input logic clk,
input logic rst_n,
input logic core_go,
output logic [7:0] r,
output logic [7:0] g,
output logic [7:0] b,
output logic hsync,
output logic vsync,
output logic de,
// Ch320 — PCRTC scanout VRAM byte address, for the LPDDR4B scanout reader.
// LPDDR mirrors BRAM VRAM byte-for-byte, so indexing the LPDDR frame cache
// by this address yields the identical pixel (seamless video-source mux).
output logic [31:0] vram_read_addr_o,
// Ch320 — high when the scanout pixel is inside the displayed frame window
// (for gating an external LPDDR4B scanout so it shows one frame, not a tiled fill).
output logic pix_window_o,
output logic core_halt,
output logic dma_done_seen,
output logic frame_seen,
// Ch173 — pass the gs_stub raster_overflow flag out so the
// board wrapper can route it into the HPS bridge's
// RASTER_OVERFLOW_COUNT register. Under Ch172 backpressure
// this should stay LOW forever; non-zero on hardware reads
// means the backpressure path broke somewhere.
output logic raster_overflow,
// Ch174 — event toggles (not raw pulses). These flip on every
// PCRTC end-of-frame / DMAC done pulse in the design clock
// domain. The bridge 2-FF syncs the toggle, XORs against its
// last sample, and increments the matching counter on each
// detected edge. This is the textbook pulse-CDC primitive —
// the toggle stays at its new value until the next event
// (~16.7 ms for frames), so the synchronizer has megacycles
// of slack and cannot miss an event. Do NOT "simplify" this
// back into raw 1-cycle pulses crossing CLOCK2_50: a 25 MHz
// pulse is borderline against a 50 MHz 2-FF sync.
output logic frame_toggle,
output logic dma_done_toggle,
// Ch255 — heartbeat color override from the controller. Tied
// straight to INPUT_P1_RAW[9] (Sony ○ / JOY_A) and INPUT_P1_RAW[7]
// (Sony × / JOY_B) at the board top. While the demo's normal EE-
// driven cyan↔red toggle keeps running in the background, holding
// a face button overrides the heartbeat color that the splicer
// injects into the DMAC read response on the NEXT drain:
//
// joy_a_pressed_i alone : force RED (0xFF0000FF)
// joy_b_pressed_i alone : force CYAN (0xFFFFFF00)
// both pressed : invert the current EE value
// (XOR with 0x00FFFFFF, swaps cyan↔red)
// neither pressed : EE's hb_rgbaq_reg passes through
//
// Response latency is one DMAC drain cycle (~2 s at Ch254 cadence)
// since the GS only repaints the heartbeat sprite once per drain.
// Sim TBs default these to 0 to keep regression byte-identical
// (tb_ch171 exercises the four combinations explicitly).
input logic joy_a_pressed_i,
input logic joy_b_pressed_i,
// Ch318 — PSMCT16 tile-FLUSH stream exposed for an external LPDDR AXI writer
// (gs_lpddr_axi_master, instantiated in the de25 top). These mirror the internal
// flush emit; unused (left open) by every existing consumer. flush_psm lets the
// writer gate on PSMCT16 (0x02).
output logic flush_emit_o,
output logic [31:0] flush_addr_o, // linear FB byte address
output logic [15:0] flush_pix16_o,
output logic [31:0] flush_color32_o, // Ch323 — full 32-bit flushed color (for spill round-trip capture)
output logic [5:0] flush_psm_o,
// Ch323 — tile Z-FLUSH stream (TILE_SPILL_ENABLE; the de25 routes it to an LPDDR Z-backing
// writer). z_flush_addr_o is Z-backing-relative (pixel_index*4, 32-bit Z). Inert/0 unless
// spilling; unused (left open) by every existing consumer.
output logic z_flush_emit_o,
output logic [31:0] z_flush_addr_o,
output logic [31:0] z_flush_data_o,
// Ch323 — DEDICATED color-flush spill stream (TP_FLUSH only; the de25 feeds the color writer
// from THIS, not the generic flush_emit_o which also carries RENDER-phase raster emits).
output logic tile_color_flush_emit_o,
output logic [31:0] tile_color_flush_addr_o,
output logic [31:0] tile_color_flush_data_o,
// Ch323 — tile RELOAD staging interface (TILE_SPILL_ENABLE; de25 attaches gs_tile_reload).
// reload_start_o arms the staging fill; tile_reload_raddr_o sweeps tile indices; the engine
// returns color/Z (1-cyc) on tile_reload_color_i/z_i with tile_reload_ready_i = warm.
output logic reload_start_o,
output logic [7:0] tile_reload_raddr_o,
output logic [29:0] reload_base_o, // Ch324 — current tile's raster-FB byte offset
input logic tile_reload_ready_i,
input logic [31:0] tile_reload_color_i,
input logic [31:0] tile_reload_z_i,
output logic [2:0] tile_phase_o, // Ch323 diag — current tile phase (de25 event counters)
// Ch322 — texel-fetch request exposed for an external prefilled texture cache
// (gs_texture_cache in the de25 top). The cache returns the texel on
// tex_cache_data_i (1-cycle registered, matching read2) and asserts
// tex_cache_ready_i once warm. Unused (left open / tied 0) by every existing
// consumer; only meaningful under GS_LPDDR_TEX=1.
output logic gs_tex_rd_en_o,
output logic [31:0] gs_tex_rd_addr_o,
input logic [31:0] tex_cache_data_i,
input logic tex_cache_ready_i,
// Ch322 — texel-source proof counters (design_clk; reset with the core/rst_n, so each
// render's counts are fresh). cache_hits = texel reads served from the LPDDR cache;
// bram_hits = texel reads served from BRAM (fallback). After a render, cache_hits>0 is
// the bridge-visible proof the triangle's texels came from LPDDR. Tied 0 unless GS_LPDDR_TEX.
output logic [31:0] tex_cache_hits_o,
output logic [31:0] tex_bram_hits_o,
// Ch330 Brick 4 — HPS/bridge runtime command-list feeder interface. The bridge
// writes the staging RAM one 64-bit record word at a time ({we,waddr,wdata}) and
// pulses feeder_go to retrigger; the feeder exposes ready + the records/wait
// counters back. Tied off / left open by non-feeder profiles (FEEDER_ENABLE=0 ->
// g_no_feeder doesn't use them), so default builds are byte-identical.
input logic feeder_stg_we_i,
input logic [11:0] feeder_stg_waddr_i,
input logic [63:0] feeder_stg_wdata_i,
input logic feeder_go_i, // retrigger pulse (honoured only in C_READY)
output logic feeder_ready_o, // control FSM is in C_READY (a new list may start)
output logic [15:0] feeder_records_o, // primitives emitted by the current list
output logic [31:0] feeder_waits_o // cycles the feeder paused under fifo_full
);
// Byte-address widths of the two on-chip memories, derived from their size
// parameters ($clog2 of the byte count).
localparam int RAM_ADDR_W  = $clog2(RAM_SIZE_BYTES),   // ee_ram_stub address bits
               BIOS_ADDR_W = $clog2(BIOS_SIZE_BYTES);  // bios_rom_stub address bits
// ---------------------------------------------------------------------
// ee_ram_stub — DMAC-side GIF payload
// ---------------------------------------------------------------------
// Read-side handshake into ee_ram_stub; read data is one 128-bit qword.
logic                  ram_rd_en, ram_rd_valid;
logic [RAM_ADDR_W-1:0] ram_rd_addr;
logic [127:0]          ram_rd_data;
// This top has no TB-direct write path: u_ram's wr_* inputs are tied off
// and ee_memory_map_stub's ram_wr_* outputs are left unconnected (Ch251
// fitter-rescue — see the comment on u_ram). The heartbeat snoop further
// down taps the EE core's write bus (`ee_cpu_wr_*`) instead.
logic [7:0] ram_master_id;
// Master id presented to ee_ram_stub: 1 on cycles where ram_rd_en is high,
// 0 otherwise.
assign ram_master_id = ram_rd_en ? 8'h01 : 8'h00;
// Ch251 fitter-rescue: keep ee_ram_stub.wr_* tied off so the
// synthesizer keeps inferring it as a ROM (the pre-Ch251 fit
// that the Agilex 5 M20K budget had headroom for). Enabling the
// live write port — the natural way to let the bootlet update
// the heartbeat RGBAQ qword — turned the 128-bit-wide × 16-byte-
// enable backing into a heavy distributed memory + per-byte M20K
// split, blowing the budget by ~160 blocks even with explicit
// `ramstyle = "M20K"`. The Ch251.3 "patch register + DMAC-read
// splicer" below sidesteps that: the EE's SW to 0x8000_0730
// captures into a 32-bit register; when the DMAC fetches qword
// 115 for the heartbeat SPRITE, the register's value is muxed
// into the low 32 bits of the read response. ee_ram_stub stays
// a cheap ROM, the bootlet still gets the live RGBAQ update.
// ee_ram_stub instance holding the GIF payload image. Only the read port is
// live; the write port is tied off (see the Ch251 fitter-rescue note above).
ee_ram_stub #(
    .SIZE_BYTES (RAM_SIZE_BYTES),
    .IMAGE_FILE (`TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE)
) u_ram (
    .clk       (clk),
    .rst_n     (rst_n),
    // Read port
    .rd_en     (ram_rd_en),
    .rd_addr   (ram_rd_addr),
    .rd_data   (ram_rd_data),
    .rd_valid  (ram_rd_valid),
    // Write port: permanently disabled
    .wr_en     (1'b0),
    .wr_addr   ('0),
    .wr_data   (128'd0),
    .wr_be     (16'd0),
    .master_id (ram_master_id),
    // Trace-event outputs are left open by this wrapper
    .ev_valid  (),
    .ev_subsys (),
    .ev_event  (),
    .ev_arg0   (),
    .ev_arg1   (),
    .ev_arg2   (),
    .ev_arg3   (),
    .ev_flags  ()
);
// ---------------------------------------------------------------------
// Ch251.3 — Heartbeat RGBAQ patch register (M20K-budget-safe).
//
// The Ch251 animated bootlet does `SW r5, 0(r3)` with r3 =
// 0x8000_0730 to alternate the heartbeat SPRITE's RGBAQ between
// CYAN (0xFFFFFF00) and RED (0xFF0000FF). 0x8000_0730 maps to
// ee_ram_stub byte offset 0x730 = qword 115 byte 0, the low
// 32 bits of the 17th SPRITE's RGBAQ A+D packet.
//
// **M20K-budget-safe capture path.** We snoop directly from
// `ee_cpu_wr_*` (the EE core's output, already routed into
// ee_memory_map_stub via the wires declared further down) rather than from
// ee_memory_map_stub's `ram_wr_*` output bundle. Consuming the
// latter ran the design past the Agilex 5's 358-M20K budget
// even though the bundle is purely combinational — Quartus's
// inference around the 128-bit lane-expansion + bridge-mux
// path was substantially heavier than expected. ee_cpu_wr_*
// was already a live consumer of those signals (going into
// ee_memory_map_stub.ee_wr_*) so adding a parallel snoop is
// free.
//
// The address-decode below recognizes the heartbeat as any SW
// (full-word BE) whose physical address (low 29 bits, kseg0
// bit stripped) is exactly 0x730. The bootlet uses kseg0
// (0x8000_0730) but useg (0x0000_0730) would map to the same
// RAM location too, so we mask off the segment bit.
//
// Splice path: ee_ram_stub.rd_data has a 1-cycle latency. We
// register the "is-heartbeat-qword" decoded read address one
// cycle so the splice mux fires on the same cycle that
// ram_rd_data presents qword 115.
// ---------------------------------------------------------------------
// Qword index of the heartbeat RGBAQ word inside ee_ram_stub: byte offset
// 0x730 divided by 16 bytes per qword.
localparam logic [RAM_ADDR_W-5:0] HEARTBEAT_QW_INDEX =
    25'h0000_0730 >> 4; // = qword 115

// Latest RGBAQ word the EE stored to the heartbeat location.
logic [31:0] hb_rgbaq_reg;

// A heartbeat store is a full-word write (all four byte enables set) whose
// low 29 address bits equal 0x730.
// NOTE(review): ee_cpu_wr_* are declared later in this file (memory-map
// section); confirm every target tool accepts use-before-declaration here.
wire hb_write_hit;
assign hb_write_hit = (ee_cpu_wr_be == 4'b1111)
                   && (ee_cpu_wr_addr[28:0] == 29'h0000_0730)
                   && ee_cpu_wr_en;

always_ff @(posedge clk or negedge rst_n)
    if (!rst_n)
        // Reset to CYAN, matching bake.py's initial heartbeat RGBAQ. The
        // first DMAC kick already paints CYAN from the $readmemh image
        // (qword 115); this only avoids a reset-window artifact.
        hb_rgbaq_reg <= 32'hFFFF_FF00;
    else if (hb_write_hit)
        hb_rgbaq_reg <= ee_cpu_wr_data;
// Ch255 — controller-driven override layer ahead of the splicer.
// The EE keeps animating hb_rgbaq_reg in the background; this mux
// only changes what the splicer INJECTS into the DMAC read response
// when the GS is repainting the heartbeat sprite. Priority order:
//
// both buttons : invert EE's current value (XOR 0x00FFFFFF)
// A only : force RED (0xFF0000FF)
// B only : force CYAN (0xFFFFFF00)
// neither : hb_rgbaq_reg (EE pass-through)
//
// Pure combinational — joy_*_pressed_i comes from the bridge
// input_p1_raw_o (already design_clk-synced) and is stable for
// millions of cycles relative to a human button press. The next
// DMAC drain captures whatever value this mux outputs.
// Ch255 override constants (32-bit RGBAQ words).
localparam logic [31:0] HB_OVERRIDE_RED  = 32'hFF00_00FF,  // A only  -> forced RED
                        HB_OVERRIDE_CYAN = 32'hFFFF_FF00,  // B only  -> forced CYAN
                        HB_OVERRIDE_XOR  = 32'h00FF_FFFF;  // both    -> flips the low 24 bits

// Value the read splicer injects. Priority chain, highest first:
// both buttons > A only > B only > EE pass-through.
wire [31:0] hb_rgbaq_effective;
assign hb_rgbaq_effective =
    (joy_a_pressed_i && joy_b_pressed_i) ? (hb_rgbaq_reg ^ HB_OVERRIDE_XOR) :
    joy_a_pressed_i                      ? HB_OVERRIDE_RED                  :
    joy_b_pressed_i                      ? HB_OVERRIDE_CYAN                 :
                                           hb_rgbaq_reg;
// Read splicer: delay the "qword 115" detection by 1 cycle to
// align with ram_rd_data (registered output of ee_ram_stub).
// One-cycle-delayed "this read targets the heartbeat qword" flag, so it
// lines up with ram_rd_data (ee_ram_stub's read data arrives one cycle
// after the request).
logic hb_read_hit_d;
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        hb_read_hit_d <= 1'b0;
    end else begin
        hb_read_hit_d <= (ram_rd_addr[RAM_ADDR_W-1:4] == HEARTBEAT_QW_INDEX)
                      && ram_rd_en
                      && HEARTBEAT_SPLICE_ENABLE; // Ch329 — off for non-heartbeat profiles
    end
end

// Patched read data: the upper 96 bits always pass through; the low 32 bits
// are swapped for the effective heartbeat colour on a delayed hit.
wire [127:0] ram_rd_data_patched;
assign ram_rd_data_patched[127:32] = ram_rd_data[127:32];
assign ram_rd_data_patched[31:0]   = hb_read_hit_d ? hb_rgbaq_effective
                                                   : ram_rd_data[31:0];
// ---------------------------------------------------------------------
// bios_rom_stub — EE bootlet at 0xBFC0_0000
// ---------------------------------------------------------------------
// BIOS read channel. The memory map drives a 22-bit address
// (bios_rd_addr_full); only the low BIOS_ADDR_W bits reach the ROM.
logic                   bios_rd_en, bios_rd_valid;
logic [21:0]            bios_rd_addr_full;
logic [BIOS_ADDR_W-1:0] bios_rd_addr;
logic [31:0]            bios_rd_data;

assign bios_rd_addr = bios_rd_addr_full[BIOS_ADDR_W-1:0];

bios_rom_stub #(
    .SIZE_BYTES (BIOS_SIZE_BYTES),
    .IMAGE_FILE (`TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE)
) u_bios (
    .clk       (clk),
    .rst_n     (rst_n),
    .rd_en     (bios_rd_en),
    .rd_addr   (bios_rd_addr),
    .rd_data   (bios_rd_data),
    .rd_valid  (bios_rd_valid),
    // Trace-event outputs are left open by this wrapper
    .ev_valid  (),
    .ev_subsys (),
    .ev_event  (),
    .ev_arg0   (),
    .ev_arg1   (),
    .ev_arg2   (),
    .ev_arg3   (),
    .ev_flags  ()
);
// ---------------------------------------------------------------------
// dmac_reg_stub — channel-2 NORMAL transfer
// ---------------------------------------------------------------------
// Register-write channel into the DMAC (driven by the memory map).
logic         dmac_reg_wr_en;
logic [7:0]   dmac_reg_offset;
logic [31:0]  dmac_reg_wr_data;
// DMAC memory-read request and the data/valid returned by the memory map.
logic         dmac_mem_rd_en;
logic [31:0]  dmac_mem_rd_addr;
logic [127:0] map_to_dmac_rd_data;
logic         map_to_dmac_rd_valid;
// DMAC endpoint stream (128-bit beats with valid/last/ready).
logic         dmac_gif_valid, dmac_gif_last, dmac_gif_ready;
logic [127:0] dmac_gif_data;
// DMAC trace-event outputs that this wrapper keeps (valid/subsys/event).
logic         dmac_ev_valid;
subsys_e      dmac_ev_subsys;
event_e       dmac_ev_event;

dmac_reg_stub u_dmac (
    .clk              (clk),
    .rst_n            (rst_n),
    // Register interface: writes only; the read side is tied off / open
    .reg_wr_en        (dmac_reg_wr_en),
    .reg_offset       (dmac_reg_offset),
    .reg_wr_data      (dmac_reg_wr_data),
    .reg_rd_en        (1'b0),
    .reg_rd_data      (),
    .reg_rd_valid     (),
    // Memory read channel
    .mem_rd_en        (dmac_mem_rd_en),
    .mem_rd_addr      (dmac_mem_rd_addr),
    .mem_rd_data      (map_to_dmac_rd_data),
    .mem_rd_valid     (map_to_dmac_rd_valid),
    // Endpoint stream
    .ep_valid         (dmac_gif_valid),
    .ep_data          (dmac_gif_data),
    .ep_last          (dmac_gif_last),
    .ep_ready         (dmac_gif_ready),
    .irq_completion_o (),
    // Trace events: argument and flag outputs are left open
    .ev_valid         (dmac_ev_valid),
    .ev_subsys        (dmac_ev_subsys),
    .ev_event         (dmac_ev_event),
    .ev_arg0          (),
    .ev_arg1          (),
    .ev_arg2          (),
    .ev_arg3          (),
    .ev_flags         ()
);
// ---------------------------------------------------------------------
// ee_memory_map_stub — bus arbiter (USEG_SHADOW shrunk per Ch145)
// ---------------------------------------------------------------------
// EE CPU <-> memory-map wires. The read/write request side is driven by
// u_core's map_* ports; read data/valid are returned to it.
logic ee_cpu_rd_en;
logic [31:0] ee_cpu_rd_addr;
logic [31:0] ee_cpu_rd_data;
logic ee_cpu_rd_valid;
logic ee_cpu_wr_en;
logic [31:0] ee_cpu_wr_addr;
logic [31:0] ee_cpu_wr_data;
logic [3:0] ee_cpu_wr_be;
// GS privileged-register writes decoded by the map; consumed by
// ee_gs_priv_bridge_stub (u_priv_bridge).
logic map_gs_priv_wr_en;
logic [15:0] map_gs_priv_wr_addr;
logic [31:0] map_gs_priv_wr_data;
logic [3:0] map_gs_priv_wr_be;
// RAM read request out of the map (25-bit address, truncated to
// RAM_ADDR_W bits after the instance).
logic map_ram_rd_en;
logic [24:0] map_ram_rd_addr;
// Memory-map instance. In this wrapper only the EE CPU port, the DMAC read
// port, the BIOS/RAM read ports, the DMAC ch2 register writes and the GS
// privileged writes are connected; every other output is left open and
// every unused input is tied to zero.
ee_memory_map_stub #(
.USEG_SHADOW_WORDS_PARAM(USEG_SHADOW_WORDS_PARAM),
.USEG_SHADOW_ENABLE(USEG_SHADOW_ENABLE)
) u_map (
.clk(clk), .rst_n(rst_n),
.ee_rd_en (ee_cpu_rd_en),
.ee_rd_addr(ee_cpu_rd_addr),
.ee_rd_data(ee_cpu_rd_data),
.ee_rd_valid(ee_cpu_rd_valid),
.ee_wr_en (ee_cpu_wr_en),
.ee_wr_addr(ee_cpu_wr_addr),
.ee_wr_data(ee_cpu_wr_data),
.ee_wr_be (ee_cpu_wr_be),
.dmac_rd_en(dmac_mem_rd_en), .dmac_rd_addr(dmac_mem_rd_addr),
.dmac_rd_data(map_to_dmac_rd_data),
.dmac_rd_valid(map_to_dmac_rd_valid),
.bios_rd_en (bios_rd_en),
.bios_rd_addr(bios_rd_addr_full),
.bios_rd_data(bios_rd_data),
.bios_rd_valid(bios_rd_valid),
// RAM read data is the heartbeat-patched word, not the raw RAM output.
.ram_rd_en(map_ram_rd_en), .ram_rd_addr(map_ram_rd_addr),
.ram_rd_data(ram_rd_data_patched), .ram_rd_valid(ram_rd_valid),
// Bridge write port is unused here: all of its inputs are tied inactive.
.bridge_wr_en(1'b0), .bridge_wr_addr(32'd0),
.bridge_wr_data(128'd0), .bridge_wr_be(16'd0),
.bridge_master_id(8'd0),
// Ch251 fitter-rescue: leave ee_memory_map_stub's ram_wr_*
// outputs unconnected at the wrapper. Consuming them forced
// Quartus to materialize the 128-bit lane-expansion + bridge-
// mux logic that lives behind those outputs — combinational
// alone, but the M20K usage cascaded past the 358-block
// Agilex 5 budget when the wide data paths got wired up. The
// heartbeat-patch register below snoops the EE's SW directly
// from `ee_cpu_wr_*` (already-existing consumers) instead.
.ram_wr_en(), .ram_wr_addr(), .ram_wr_data(),
.ram_wr_be(), .ram_master_id(),
// DMAC channel-2 register writes feed u_dmac; its read-back path is
// tied off (data 0, never valid).
.ee_dmac_ch2_wr_en (dmac_reg_wr_en),
.ee_dmac_ch2_wr_addr(dmac_reg_offset),
.ee_dmac_ch2_wr_data(dmac_reg_wr_data),
.ee_dmac_ch2_rd_en(), .ee_dmac_ch2_rd_addr(),
.ee_dmac_ch2_rd_data(32'd0), .ee_dmac_ch2_rd_valid(1'b0),
// INTC, misc-MMIO and BIU windows have no target in this wrapper:
// write/read request outputs open, read responses tied to zero.
.ee_intc_wr_en(), .ee_intc_wr_addr(), .ee_intc_wr_data(),
.ee_intc_rd_en(), .ee_intc_rd_addr(),
.ee_intc_rd_data(32'd0), .ee_intc_rd_valid(1'b0),
.ee_misc_mmio_wr_en(), .ee_misc_mmio_wr_addr(), .ee_misc_mmio_wr_data(), .ee_misc_mmio_wr_be(),
.ee_misc_mmio_rd_en(), .ee_misc_mmio_rd_addr(),
.ee_misc_mmio_rd_data(32'd0), .ee_misc_mmio_rd_valid(1'b0),
.ee_biu_wr_en(), .ee_biu_wr_addr(), .ee_biu_wr_data(), .ee_biu_wr_be(),
.ee_biu_rd_en(), .ee_biu_rd_addr(),
.ee_biu_rd_data(32'd0), .ee_biu_rd_valid(1'b0),
.ee_gs_priv_wr_en (map_gs_priv_wr_en),
.ee_gs_priv_wr_addr(map_gs_priv_wr_addr),
.ee_gs_priv_wr_data(map_gs_priv_wr_data),
.ee_gs_priv_wr_be (map_gs_priv_wr_be),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// Forward the map's RAM read request to the RAM; the 25-bit map address is
// truncated to the RAM's RAM_ADDR_W-bit address.
assign ram_rd_en = map_ram_rd_en;
assign ram_rd_addr = map_ram_rd_addr[RAM_ADDR_W-1:0];
// ---------------------------------------------------------------------
// ee_core_stub
// ---------------------------------------------------------------------
// Core status taps: current PC and the trap flag.
logic        core_trap;
logic [31:0] core_pc;

// EE core. Execution starts at the BIOS reset vector 0xBFC0_0000 once
// core_go is asserted. The interrupt input is tied low, and the trap
// PC/instruction and trace-event outputs are left open.
ee_core_stub #(
    .STRIP_HW_DIVIDER   (STRIP_HW_DIVIDER),
    .STRICT_UNSUPPORTED (1'b0),
    .PC_RESET           (32'hBFC0_0000)
) u_core (
    .clk          (clk),
    .rst_n        (rst_n),
    .go_i         (core_go),
    // read channel to the memory map
    .map_rd_en    (ee_cpu_rd_en),
    .map_rd_addr  (ee_cpu_rd_addr),
    .map_rd_data  (ee_cpu_rd_data),
    .map_rd_valid (ee_cpu_rd_valid),
    // write channel to the memory map
    .map_wr_en    (ee_cpu_wr_en),
    .map_wr_addr  (ee_cpu_wr_addr),
    .map_wr_data  (ee_cpu_wr_data),
    .map_wr_be    (ee_cpu_wr_be),
    .cpu_irq      (1'b0),
    // status
    .halt_o       (core_halt),
    .pc_o         (core_pc),
    .trap_o       (core_trap),
    .trap_pc_o    (),
    .trap_instr_o (),
    // trace events (unused)
    .ev_valid     (),
    .ev_subsys    (),
    .ev_event     (),
    .ev_arg0      (),
    .ev_arg1      (),
    .ev_arg2      (),
    .ev_arg3      (),
    .ev_flags     ()
);
// ---------------------------------------------------------------------
// gif_packed_stub
// ---------------------------------------------------------------------
// GIF unpacker outputs: stream ready back to the DMAC, the decoded
// register-write triple, and the IMAGE-mode data stream.
logic         gif_in_ready;
logic         gif_gif_reg_wr_en;
logic [7:0]   gif_gif_reg_num;
logic [63:0]  gif_gif_reg_data;
logic         gif_image_data_valid;
logic [127:0] gif_image_data;
logic         gif_image_data_last;
// Ready for the IMAGE data stream (connected to u_gif.image_data_ready).
logic         xfer_data_ready;
// Ch172 — raster FIFO full from gs_stub feeds gif_packed_stub's
// backpressure input. Declared here, consumed by u_gif immediately below,
// and driven by u_gs further down the file.
logic         gs_raster_fifo_full;

// The unpacker's direct gs_wr_* outputs and trace events are left open;
// only the gif_reg_* triple and the image stream are used.
gif_packed_stub #(
    .REAL_AD_REG_MAP (1'b1)
) u_gif (
    .clk              (clk),
    .rst_n            (rst_n),
    // input stream from the DMAC
    .in_valid         (dmac_gif_valid),
    .in_data          (dmac_gif_data),
    .in_last          (dmac_gif_last),
    .in_ready         (gif_in_ready),
    // IMAGE-mode payload stream
    .image_data_valid (gif_image_data_valid),
    .image_data       (gif_image_data),
    .image_data_last  (gif_image_data_last),
    .image_data_ready (xfer_data_ready),
    // raster back-pressure
    .raster_fifo_full (gs_raster_fifo_full),
    // decoded register writes
    .gs_wr_en         (),
    .gs_wr_addr       (),
    .gs_wr_data       (),
    .gif_reg_wr_en    (gif_gif_reg_wr_en),
    .gif_reg_num      (gif_gif_reg_num),
    .gif_reg_data     (gif_gif_reg_data),
    // trace events (unused)
    .ev_valid         (),
    .ev_subsys        (),
    .ev_event         (),
    .ev_arg0          (),
    .ev_arg1          (),
    .ev_arg2          (),
    .ev_arg3          (),
    .ev_flags         ()
);
// ---------------------------------------------------------------------
// Ch330 — runtime primitive-list feeder with a PHASE OWNER on gif_reg_*.
// The bootlet's setup/upload/state writes (BITBLTBUF/TRXPOS/TRXREG/TRXDIR/
// TEX0/FRAME/ALPHA/TEST/ZBUF) are latched by gs_stub from the SAME gif_reg_*
// stream — so a blanket feeder mux from reset would mux out the texture-upload
// arming. Instead:
// Phase 0 (feeder_owns_bus=0): the UNPACKER owns gif_reg_*; the bootlet does
// texture upload + state setup, reaching gs_stub exactly as proven.
// Handoff: dma_done_seen && !xfer_busy (setup + TRX/IMAGE upload complete).
// Phase 1 (feeder_owns_bus=1): the FEEDER owns gif_reg_* and emits the prims.
// FEEDER_ENABLE=0 → feeder_owns_bus tied 0 → dead logic, byte-identical.
// ---------------------------------------------------------------------
// Feeder-side gif_reg_* triple and status. All of these are driven inside
// the generate below: by the feeder logic when FEEDER_ENABLE is set, or by
// constant tie-offs in the g_no_feeder branch.
logic feeder_gif_reg_wr_en;
logic [7:0] feeder_gif_reg_num;
logic [63:0] feeder_gif_reg_data;
logic feeder_owns_bus; // drives the mux (feeder owns from the first handoff onward)
logic gs_raster_active; // from gs_stub — raster pipeline active (dips between batches)
logic gs_scene_busy; // Ch337 — from gs_stub — whole multi-batch scene busy (no inter-batch dip)
logic [15:0] feeder_records; // Ch330 observability counters
logic [31:0] feeder_waits;
logic feeder_done_p; // feeder finished emitting the current list (1-cyc)
logic feeder_list_flush; // Ch331 — feeder end-of-list grid flush (delayed done)
generate
if (FEEDER_ENABLE) begin : g_feeder
logic [63:0] feeder_stg [0:FEEDER_STG_WORDS-1]; // staging list (1R1W: feeder reads, bridge writes)
`ifdef FEEDER_STG_INIT_FILE
// Ch330 Brick 4 — board power-up list (list A): bitstream-init the staging RAM so the
// first feeder auto-run after setup draws a real scene instead of uninitialized garbage.
// Synth/board-only — sim TBs $readmemh the staging directly and never define this macro.
initial $readmemh(`FEEDER_STG_INIT_FILE, feeder_stg);
`endif
logic [11:0] fdr_rd_addr;
logic [63:0] fdr_rd_data;
// 1R1W: the feeder's sequential read + the bridge's runtime staging write.
// The read is registered (data one cycle after the address) and, being a
// nonblocking read in the same block as the write, returns the pre-write
// contents on a same-address collision.
// NOTE(review): both address slices use $clog2(FEEDER_STG_WORDS) bits; this
// assumes 2 <= FEEDER_STG_WORDS <= 4096 so the slice fits the 12-bit
// fdr_rd_addr (and is not zero-width) — confirm against the parameter range.
always_ff @(posedge clk) begin
fdr_rd_data <= feeder_stg[fdr_rd_addr[$clog2(FEEDER_STG_WORDS)-1:0]];
if (feeder_stg_we_i)
feeder_stg[feeder_stg_waddr_i[$clog2(FEEDER_STG_WORDS)-1:0]] <= feeder_stg_wdata_i;
end
logic fdr_start, fdr_busy, fdr_done;
logic grid_ran; // gs_stub raster_active was high for the in-flight list
// Control FSM enforcing Codex's "known-empty" rule: a new list never starts until the
// previous list's grid has actually RUN and DRAINED (raster_active high->low) — so list B
// can never append onto list A's still-resident FIFO/grid state.
//   C_SETUP : waiting for the bootlet (dma_done_seen && !xfer_busy); then pulse fdr_start.
//   C_RUN   : feeder emitting; leaves on fdr_done.
//   C_DRAIN : waits until gs_scene_busy has been seen high and is low again.
//   C_READY : idle; feeder_go_i pulses fdr_start for the next list.
// fdr_start defaults to 0 every cycle, so it is a 1-cycle pulse.
// NOTE(review): C_DRAIN requires grid_ran, i.e. gs_scene_busy must have been
// observed high at least once in C_RUN/C_DRAIN. If a list never raises
// gs_scene_busy (e.g. an empty list), the FSM would stay in C_DRAIN and
// feeder_ready_o would never assert — confirm gs_stub raises
// raster_scene_busy for every flushed list.
localparam logic [1:0] C_SETUP = 2'd0, C_RUN = 2'd1, C_DRAIN = 2'd2, C_READY = 2'd3;
logic [1:0] cst;
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin cst <= C_SETUP; fdr_start <= 1'b0; grid_ran <= 1'b0; end
else begin
fdr_start <= 1'b0;
unique case (cst)
C_SETUP: if (dma_done_seen && !xfer_busy) begin // bootlet setup + TRX/IMAGE done
fdr_start <= 1'b1; grid_ran <= 1'b0; cst <= C_RUN; end
C_RUN: begin if (gs_scene_busy) grid_ran <= 1'b1;
if (fdr_done) cst <= C_DRAIN; end
// Ch337 — gate on gs_scene_busy (NOT gs_raster_active): a >FIFO_DEPTH scene
// renders in multiple batches and raster_active DIPS between them, but the
// next batch's prims are still queued (gs_scene_busy stays high). Waiting on
// raster_active alone would reach C_READY mid-scene and let a retrigger race
// the last batch's render/flush. gs_scene_busy only clears once EVERY batch has
// rendered + flushed and the FIFO is empty.
C_DRAIN: begin if (gs_scene_busy) grid_ran <= 1'b1;
if (grid_ran && !gs_scene_busy) cst <= C_READY; end // whole scene drained
C_READY: if (feeder_go_i) begin fdr_start <= 1'b1; grid_ran <= 1'b0; cst <= C_RUN; end
endcase
end
end
assign feeder_owns_bus = (cst != C_SETUP); // feeder owns gif_reg_* from the first handoff onward
assign feeder_done_p = fdr_done;
assign feeder_ready_o = (cst == C_READY); // a new list may be retriggered
// Ch331 — end-of-list grid flush: fdr_done delayed a few cycles so the LAST primitive's
// raster-FIFO push + gradient-pending are guaranteed set before gs_stub latches the flush
// (gs_stub's all_grad_done gate then holds the grid until every prim is graded). The
// delay (vs a same-cycle done) closes the "done arrives before the last prim commits" race.
// Implemented as a 4-stage shift register: feeder_list_flush is fdr_done
// delayed by four clock cycles.
logic [3:0] fdr_done_dly;
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) fdr_done_dly <= 4'd0;
else fdr_done_dly <= {fdr_done_dly[2:0], fdr_done};
end
assign feeder_list_flush = fdr_done_dly[3];
// List feeder: reads the staging RAM through stg_rd_*, stalls on the raster
// FIFO full flag, and emits the gif_reg_* triple. fdr_busy is connected but
// not consumed anywhere in this generate block.
gs_prim_list_feeder #(.STG_ADDR_W(12)) u_feeder (
.clk(clk), .rst_n(rst_n), .start(fdr_start), .busy(fdr_busy), .done(fdr_done),
.records_emitted(feeder_records), .fifo_wait_cycles(feeder_waits),
.stg_rd_addr(fdr_rd_addr), .stg_rd_data(fdr_rd_data),
.fifo_full(gs_raster_fifo_full),
.gif_reg_wr_en(feeder_gif_reg_wr_en),
.gif_reg_num(feeder_gif_reg_num),
.gif_reg_data(feeder_gif_reg_data));
end else begin : g_no_feeder
// Feeder disabled: every feeder output is tied inactive, so the gif_reg_*
// mux always selects the unpacker and feeder_ready_o never asserts.
assign feeder_gif_reg_wr_en = 1'b0;
assign feeder_gif_reg_num = 8'd0;
assign feeder_gif_reg_data = 64'd0;
assign feeder_owns_bus = 1'b0;
assign feeder_records = 16'd0;
assign feeder_waits = 32'd0;
assign feeder_done_p = 1'b0;
assign feeder_ready_o = 1'b0;
assign feeder_list_flush = 1'b0;
end
endgenerate
// Ch330 Brick 4 — surface the feeder counters to the bridge (both profiles).
// Ch330 Brick 4 — feeder counters exported to the bridge in both profiles
// (they read as zero when the feeder is not generated).
assign feeder_waits_o   = feeder_waits;
assign feeder_records_o = feeder_records;

// Phased gif_reg_* source select for gs_stub: the feeder's triple once
// feeder_owns_bus is set (post-setup), the GIF unpacker's triple before that.
wire        gs_gif_reg_wr_en;
wire [7:0]  gs_gif_reg_num;
wire [63:0] gs_gif_reg_data;

assign gs_gif_reg_wr_en = feeder_owns_bus ? feeder_gif_reg_wr_en : gif_gif_reg_wr_en;
assign gs_gif_reg_num   = feeder_owns_bus ? feeder_gif_reg_num   : gif_gif_reg_num;
assign gs_gif_reg_data  = feeder_owns_bus ? feeder_gif_reg_data  : gif_gif_reg_data;

// The DMAC's endpoint ready is gif_packed_stub's in_ready, unmodified
// (Ch110 image-xfer backpressure propagates through gif_packed_stub).
assign dmac_gif_ready = gif_in_ready;
// ---------------------------------------------------------------------
// gs_stub — PSMCT32 raster, swizzled
// ---------------------------------------------------------------------
// Privileged-register write into gs_stub (produced by u_priv_bridge).
logic        priv_reg_wr_en;
logic [15:0] priv_reg_wr_addr;
logic [63:0] priv_reg_wr_data;

// gs_stub register mirrors used elsewhere in this wrapper.
logic [63:0] pmode_q;
logic [63:0] dispfb1_q;
logic [63:0] display1_q;
logic [63:0] bitbltbuf_q;
logic [63:0] trxpos_q;
logic [63:0] trxreg_q;
logic [63:0] trxdir_q;
logic        trxdir_wr_q;

// Raster pixel emit stream out of gs_stub: strobe plus colour, framebuffer
// byte address, byte enables, bit mask and pixel storage mode.
logic        raster_pixel_emit;
logic [5:0]  raster_pixel_psm_q;
logic [31:0] raster_pixel_fb_addr_q;
logic [63:0] raster_pixel_color_q;
logic [3:0]  raster_pixel_be_q;
logic [31:0] raster_pixel_mask_q;

// Ch318 — the same stream re-exported for an external LPDDR AXI writer
// (de25 top): strobe, address, PSM, the low 16 colour bits, and (Ch323)
// the full low 32 colour bits for spill capture.
assign flush_emit_o    = raster_pixel_emit;
assign flush_psm_o     = raster_pixel_psm_q;
assign flush_addr_o    = raster_pixel_fb_addr_q;
assign flush_color32_o = raster_pixel_color_q[31:0];
assign flush_pix16_o   = raster_pixel_color_q[15:0];
// Ch295 — texture-sampler read port out of gs_stub. Wired to
// vram_bram_stub's SECOND read port (read2) below, MUXED with the
// PSMT4 RMW old-byte read. See the read2 arbitration block.
//
// TEX_RD_REGISTERED(1): vram_bram_stub.read2 is a 1-cycle
// REGISTERED (sync) read — unlike vram_stub's combinational read2
// that the Brick-1 demo top uses. gs_stub therefore generates the
// texel address one pipeline stage earlier (S0 coords) so the
// registered data lands at the S1 stage, where the existing single
// S1->S2 texel register samples it. This keeps the emit-stage
// timing identical to the combinational-read variant.
// Texel fetch port of gs_stub (request strobe, byte address, returned word).
logic        gs_tex_rd_en;
logic [31:0] gs_tex_rd_addr;
logic [31:0] gs_tex_rd_data;

// Ch296 — PSMT8 indexed-texture CLUT lookup. The 8-bit index fetched through
// the texel port is presented on gs_clut_rd_idx; clut_stub's second
// (combinational) read port answers with the PSMCT32 colour on
// gs_clut_rd_data. The palette itself is filled by clut_loader_stub at
// TEX0 commit.
logic [7:0]  gs_clut_rd_idx;
logic [31:0] gs_clut_rd_data;

// Ch296 — CLUT-load-busy. Declared ahead of the gs_stub instance, which
// takes it as clut_load_busy to hold its FIFO pop while the VRAM->CLUT load
// runs; it is driven later in the file by clut_loader_stub.load_busy.
logic        clut_ld_busy;

// Ch296 — TEX0_1 decode taps from gs_stub that steer clut_loader_stub: the
// 1-cycle commit pulse plus the CBP/CPSM/CSM/CSA/CLD fields that decide when
// (and from where) the VRAM→CLUT copy fires.
logic        gs_tex0_wr;
logic [13:0] gs_tex0_cbp;
logic [3:0]  gs_tex0_cpsm;
logic        gs_tex0_csm;
logic [4:0]  gs_tex0_csa;
logic [2:0]  gs_tex0_cld;

// Brick 2a — destination-framebuffer read port used for alpha blending.
// Shares vram_bram_stub.read2 with the texel fetch (mutually exclusive: a
// flat blend never textures) and follows the 1-cycle registered read model
// (FB_RD_REGISTERED=1) to match the BRAM read2 latency.
logic        gs_fb_rd_en;
logic [31:0] gs_fb_rd_addr;
logic [31:0] gs_fb_rd_data;

// Brick 2b — stored-Z read port for the Z test. Also arbitrated onto
// vram_bram_stub.read2 alongside the texel and dest-fb ports (mutually
// exclusive: a flat Z-tested sprite never textures and never blends), with
// the 1-cycle registered read model (Z_RD_REGISTERED=1).
logic        gs_z_rd_en;
logic [31:0] gs_z_rd_addr;
logic [31:0] gs_z_rd_data;
// gs_stub instance — the GS register file + rasterizer of this wrapper.
// Parameters: the three swizzle switches and the feature knobs are forwarded
// from this module's own parameters; the read ports are configured for
// registered (1-cycle) reads; the three feeder-mode options (MP_FLUSH_ONLY,
// TILE_ACCUM_ENABLE, TILE_Z_PERSIST) all follow FEEDER_ENABLE.
// Ports: observability outputs that this wrapper does not consume (bg_*,
// prim_*, pixel_*, several register mirrors, trace events) are left open.
gs_stub #(
.PSMCT32_SWIZZLE (PSMCT32_SWIZZLE),
.PSMT4_SWIZZLE (PSMT4_SWIZZLE),
.PSMT8_SWIZZLE (PSMT8_SWIZZLE),
.TEX_RD_LATENCY (1),
.CLUT_STALL (1'b1), // hold pop while VRAM->CLUT load runs
.TEX_RD_REGISTERED(1'b1),
.FB_RD_REGISTERED (1'b1),
.Z_RD_REGISTERED (1'b1),
.PERSPECTIVE_CORRECT(PERSPECTIVE_CORRECT), // Ch301 — forwarded board param (default 0)
.PERSP_RECIP_IDX_BITS(PERSP_RECIP_IDX_BITS), // Ch351 — perspective reciprocal LUT width (far-W -> 11)
.GRAD_DIV_CYCLES(GRAD_DIV_CYCLES), // Ch352 — triangle-setup divide settle cycles (board -> 4)
.GRAD_SEQ_DIVIDER(GRAD_SEQ_DIVIDER), // Ch352 — sequential gradient divider (board -> 1)
.SPRITE_TEX_ALPHA (SPRITE_TEX_ALPHA), // Ch344 — textured + source-over alpha SPRITE (default 0)
.SPRITE_TEX_ALPHA_CLUT (SPRITE_TEX_ALPHA_CLUT), // Ch347 — PSMT8 CLUT into the alpha-sprite path (default 0)
.COMBINED_TAZ (COMBINED_TAZ), // Ch302 — combined tex+alpha+depth probe (default 0)
.TILE_LOCAL (TILE_LOCAL), // Ch303 — on-chip tile color+Z render (default 0)
.TILE_COLS (TILE_COLS), // Ch304 — tile grid cols (default 1)
.TILE_ROWS (TILE_ROWS), // Ch304 — tile grid rows (default 1)
.TILE_MULTIPRIM (TILE_MULTIPRIM), // Ch305 — render a primitive LIST per tile (default 0)
.TILE_PRIM_COUNT (TILE_PRIM_COUNT), // Ch305 — primitives in the batch (default 1)
.TILE_FIFO_DEPTH (TILE_FIFO_DEPTH), // Ch315 — prim FIFO / bin depth (capacity, default 4)
.SCISSOR_ENABLE (SCISSOR_ENABLE), // Ch306 — SCISSOR_1 rect clip in tile walker (default 0)
.TEX_WRAP_ENABLE (TEX_WRAP_ENABLE), // Ch307 — texture wrap/clamp in sampler (default 0)
.TILE_COLOR_PSMCT16 (TILE_COLOR_PSMCT16), // Ch308 — PSMCT16 tile color buffer (default 0)
.ALPHA_MODES_ENABLE (ALPHA_MODES_ENABLE), // Ch309 — generic ALPHA blend modes (default 0)
.BILINEAR_ENABLE (BILINEAR_ENABLE), // Ch310 — bilinear filtering in combined path (default 0)
.PALETTE_BILINEAR (PALETTE_BILINEAR), // Ch314 — bilinear for PSMT8/PSMT4 indexed textures (default 0)
.BIN_BUFFER_ENABLE (BIN_BUFFER_ENABLE), // Ch311 — per-tile bin buffer (default 0)
.TILE_SPILL_ENABLE (TILE_SPILL_ENABLE), // Ch323 — tile color+Z spill/reload (default 0)
.SPILL_FORCE_VALID (SPILL_FORCE_VALID), // Ch323 — test hook (negative bootstrap test)
.MP_FLUSH_ONLY (FEEDER_ENABLE), // Ch331 — feeder mode: grid fires on end-of-list flush
.TILE_ACCUM_ENABLE (FEEDER_ENABLE), // Ch336 — >FIFO_DEPTH framebuffer accumulation (feeder mode)
.TILE_Z_PERSIST (FEEDER_ENABLE) // Ch338 — persistent cross-batch Z (feeder mode)
) u_gs (
.clk(clk), .rst_n(rst_n),
// Privileged-register write port, fed by u_priv_bridge.
.reg_wr_en (priv_reg_wr_en),
.reg_wr_addr(priv_reg_wr_addr),
.reg_wr_data(priv_reg_wr_data), // Ch330: gif_reg_* below is muxed (feeder vs unpacker)
.gif_reg_wr_en(gs_gif_reg_wr_en),
.gif_reg_num (gs_gif_reg_num),
.gif_reg_data (gs_gif_reg_data),
.prim_list_flush_i(feeder_list_flush), // Ch331 — feeder end-of-list grid flush (0 in non-feeder)
.bg_r(), .bg_g(), .bg_b(),
.pmode_q(pmode_q), .dispfb1_q(dispfb1_q), .display1_q(display1_q),
.prim_q(), .rgbaq_q(),
.xyz2_q(), .xyzf2_q(),
.frame_1_q(), .zbuf_1_q(),
// TEX0_1 field taps + commit pulse (Ch296 CLUT-load signals).
.tex0_1_q(), .tex0_1_cbp_q(gs_tex0_cbp), .tex0_1_cpsm_q(gs_tex0_cpsm),
.tex0_1_csm_q(gs_tex0_csm), .tex0_1_csa_q(gs_tex0_csa),
.tex0_1_cld_q(gs_tex0_cld), .tex0_1_wr_q(gs_tex0_wr),
// Transfer registers consumed by gif_image_xfer_stub (u_xfer).
.bitbltbuf_q(bitbltbuf_q),
.trxpos_q(trxpos_q),
.trxreg_q(trxreg_q),
.trxdir_q(trxdir_q),
.trxdir_wr_q(trxdir_wr_q),
.prim_complete(), .prim_complete_count(),
.prim_v0_q(), .prim_v1_q(), .prim_v2_q(),
.prim_color_q(),
.prim_color_v0_q(), .prim_color_v1_q(), .prim_color_v2_q(),
.prim_v0_decoded_q(), .prim_v1_decoded_q(), .prim_v2_decoded_q(),
.prim_v0_color_decoded_q(), .prim_v1_color_decoded_q(), .prim_v2_color_decoded_q(),
.pixel_emit(), .pixel_emit_count(),
.pixel_x_q(), .pixel_y_q(),
.pixel_color_q(),
.pixel_fbp_q(), .pixel_fbw_q(), .pixel_psm_q(), .pixel_fb_addr_q(),
// Raster pixel emit stream: feeds the VRAM write mux, the flush_*_o
// exports and the optional LPDDR framebuffer writer.
.raster_pixel_emit(raster_pixel_emit),
.raster_pixel_emit_count(),
.raster_pixel_x_q(), .raster_pixel_y_q(),
.raster_pixel_color_q(raster_pixel_color_q),
.raster_pixel_fb_addr_q(raster_pixel_fb_addr_q),
.raster_pixel_be_q(raster_pixel_be_q),
.raster_pixel_mask_q(raster_pixel_mask_q),
.raster_pixel_psm_q(raster_pixel_psm_q),
.z_flush_emit_o(z_flush_emit_o), // Ch323 — tile Z-flush stream (de25 Z-writer)
.z_flush_addr_o(z_flush_addr_o),
.z_flush_data_o(z_flush_data_o),
.tile_color_flush_emit_o(tile_color_flush_emit_o), // Ch323 — dedicated color-flush spill stream
.tile_color_flush_addr_o(tile_color_flush_addr_o),
.tile_color_flush_data_o(tile_color_flush_data_o),
.reload_start_o(reload_start_o), // Ch323 — tile reload staging interface
.tile_reload_raddr_o(tile_reload_raddr_o),
.reload_base_o(reload_base_o), // Ch324 — per-tile raster-FB byte offset
.tile_reload_ready_i(tile_reload_ready_i),
// Only the low 16 bits (PSMCT16 tile colour) or low 32 bits of the reload
// colour input are passed down, selected by TILE_COLOR_PSMCT16.
.tile_reload_color_i(tile_reload_color_i[(TILE_COLOR_PSMCT16?16:32)-1:0]),
.tile_reload_z_i(tile_reload_z_i),
.tile_phase_o(tile_phase_o), // Ch323 diag — current tile phase
.raster_active(gs_raster_active), // Ch330 — drives the feeder retrigger 'known-empty' gate
.raster_scene_busy(gs_scene_busy), // Ch337 — whole-scene drain gate (no inter-batch dip)
.raster_overflow(raster_overflow),
.raster_fifo_full(gs_raster_fifo_full),
.raster_degenerate(),
// Read ports: texel fetch, CLUT lookup, dest-framebuffer and stored-Z.
.tex_rd_en (gs_tex_rd_en),
.tex_rd_addr(gs_tex_rd_addr),
.tex_rd_data(gs_tex_rd_data),
.clut_rd_idx (gs_clut_rd_idx),
.clut_rd_data(gs_clut_rd_data),
.clut_load_busy(clut_ld_busy), // hold pop while VRAM->CLUT load runs
.fb_rd_en (gs_fb_rd_en),
.fb_rd_addr(gs_fb_rd_addr),
.fb_rd_data(gs_fb_rd_data),
.z_rd_en (gs_z_rd_en),
.z_rd_addr(gs_z_rd_addr),
.z_rd_data(gs_z_rd_data),
.ev_valid(), .ev_subsys(), .ev_event(),
.ev_arg0(), .ev_arg1(), .ev_arg2(), .ev_arg3(), .ev_flags()
);
// ---------------------------------------------------------------------
// Ch317 — LPDDR-backed framebuffer write sink (tile-flush MIRROR). Fed by the
// PSMCT16 tile-FLUSH stream (emit + linear fb byte addr + pix16). Commits to an
// LPDDR-style model with per-row bursts + 4 KiB cap + bandwidth/over-underflow
// counters for write/readback proof. enable=0 → inert. Scanout still comes from
// the on-chip BRAM FB; a later rung swaps this model for the real EMIF AXI master
// + LPDDR scanout. Gated to PSMCT16 flushes (psm==0x02) so it only fires in a
// full-PSMCT16-FB demo (TILE_COLOR_PSMCT16=1); harmless otherwise.
// ---------------------------------------------------------------------
// Status counters of the LPDDR framebuffer write sink (all zero when the
// sink is not generated).
logic [31:0] lpfb_bytes_written;
logic [31:0] lpfb_burst_count;
logic [31:0] lpfb_busy_cycles;
logic [31:0] lpfb_overflow_count;
logic [31:0] lpfb_underflow_count;
logic [15:0] lpfb_occ;

generate
    if (LPDDR_FB_ENABLE) begin : g_lpddr_fb
        // Only PSMCT16 flush pixels (psm == 0x02) are forwarded to the sink.
        logic lpfb_px_emit;
        assign lpfb_px_emit = raster_pixel_emit && (raster_pixel_psm_q == 6'h02);

        gs_lpddr_fb_writer #(
            .FB_BYTES        (LPDDR_FB_BYTES),
            .FIFO_DEPTH      (32),
            .MAX_BURST_BYTES (4096)
        ) u_lpddr_fb (
            .clk                  (clk),
            .rst_n                (rst_n),
            .enable               (1'b1),
            // pixel stream: strobe, linear fb byte address, 16-bit pixel
            .px_emit              (lpfb_px_emit),
            .px_addr              (raster_pixel_fb_addr_q),
            .px_pix16             (raster_pixel_color_q[15:0]),
            // status
            .bytes_written        (lpfb_bytes_written),
            .burst_count          (lpfb_burst_count),
            .busy_cycles          (lpfb_busy_cycles),
            .fifo_overflow_count  (lpfb_overflow_count),
            .fifo_underflow_count (lpfb_underflow_count),
            .fifo_occ             (lpfb_occ)
        );
    end else begin : g_no_lpddr_fb
        // default OFF — no fbmem / FIFO instantiated (pruned), status tied to 0.
        assign lpfb_bytes_written   = '0;
        assign lpfb_burst_count     = '0;
        assign lpfb_busy_cycles     = '0;
        assign lpfb_overflow_count  = '0;
        assign lpfb_underflow_count = '0;
        assign lpfb_occ             = '0;
    end
endgenerate
// ---------------------------------------------------------------------
// ee_gs_priv_bridge_stub
// ---------------------------------------------------------------------
// Bridges the memory map's 32-bit GS privileged-register writes (with byte
// enables) onto gs_stub's 64-bit privileged-register write port.
ee_gs_priv_bridge_stub u_priv_bridge (
    .clk            (clk),
    .rst_n          (rst_n),
    // EE side (from u_map)
    .ee_wr_en       (map_gs_priv_wr_en),
    .ee_wr_addr     (map_gs_priv_wr_addr),
    .ee_wr_data     (map_gs_priv_wr_data),
    .ee_wr_be       (map_gs_priv_wr_be),
    // GS side (to u_gs)
    .gs_reg_wr_en   (priv_reg_wr_en),
    .gs_reg_wr_addr (priv_reg_wr_addr),
    .gs_reg_wr_data (priv_reg_wr_data)
);
// ---------------------------------------------------------------------
// gif_image_xfer_stub — idle in Ch123 (no TRXDIR/IMAGE), but
// instantiated for symmetry. The TRXDIR-driven Ch124 demo would
// turn it load-bearing.
// ---------------------------------------------------------------------
// VRAM write request produced by the image-transfer engine, plus its busy
// flag (busy selects the xfer side of the VRAM write mux).
// NOTE(review): xfer_busy is also read by the feeder control FSM earlier in
// this file, i.e. textually before this declaration — confirm the toolchain
// accepts that use-before-declaration.
logic        xfer_busy;
logic        xfer_we;
logic [3:0]  xfer_wbe;
logic [31:0] xfer_waddr;
logic [31:0] xfer_wdata;
logic [31:0] xfer_wmask;

// Image transfer engine: armed by the TRXDIR write pulse with the
// BITBLTBUF/TRXPOS/TRXREG snapshot from gs_stub, fed by the GIF unpacker's
// IMAGE data stream.
gif_image_xfer_stub #(
    .PSMT8_SWIZZLE   (PSMT8_SWIZZLE),
    .PSMT4_SWIZZLE   (PSMT4_SWIZZLE),
    .PSMCT32_SWIZZLE (PSMCT32_SWIZZLE)
) u_xfer (
    .clk             (clk),
    .rst_n           (rst_n),
    // transfer setup registers
    .trxdir_wr_pulse (trxdir_wr_q),
    .trxdir          (trxdir_q),
    .bitbltbuf       (bitbltbuf_q),
    .trxpos          (trxpos_q),
    .trxreg          (trxreg_q),
    // IMAGE payload stream
    .data_valid      (gif_image_data_valid),
    .data_qword      (gif_image_data),
    .data_last       (gif_image_data_last),
    .data_ready      (xfer_data_ready),
    // VRAM write request
    .vram_we         (xfer_we),
    .vram_waddr      (xfer_waddr),
    .vram_wdata      (xfer_wdata),
    .vram_wbe        (xfer_wbe),
    .vram_wmask      (xfer_wmask),
    .busy            (xfer_busy)
);
// ---------------------------------------------------------------------
// VRAM mux: xfer-OWNED when xfer.busy, raster-OWNED otherwise.
// (Sequenced: in Ch123 raster fills exclusively; xfer never fires.
// In a future TRXDIR variant the mux still works — payload upload
// finishes before raster starts.)
//
// Ch156: the legacy writer engines emit at byte-addressable
// granularity with per-bit `vram_wmask`. `vram_bram_stub` is
// word-aligned + byte-WE only. We mux the writer engines'
// pre-normalize signals first, then run the result through
// `vram_normalize_pkg::normalize_write` to translate into the
// BRAM contract. The PSM source is the raster-side
// `raster_pixel_psm_q` during raster emits and `bitbltbuf_q`'s
// DPSM field (bits [61:56]) during xfer emits — both match
// what the corresponding writer engine used to compute its
// emit shape.
// ---------------------------------------------------------------------
// Pre-normalization VRAM write request: the xfer engine's request while
// xfer_busy is set, the raster emit stream otherwise.
logic        vram_we_pre;
logic [5:0]  vram_psm_pre;
logic [31:0] vram_waddr_pre;
logic [31:0] vram_wdata_pre;
logic [31:0] vram_mask_pre;

// Ch326 — FB_LPDDR_ONLY suppresses only the raster flush's BRAM-mirror
// write; texture-upload (xfer) writes into vram_bram_stub are not gated, and
// the dedicated LPDDR colour flush (tile_color_flush_emit_o) is unaffected.
logic        vram_pre_raster_we;
assign vram_pre_raster_we = raster_pixel_emit && !FB_LPDDR_ONLY;

assign vram_we_pre    = xfer_busy ? xfer_we            : vram_pre_raster_we;
assign vram_mask_pre  = xfer_busy ? xfer_wmask         : raster_pixel_mask_q;
// PSM source: BITBLTBUF.DPSM (bits [61:56]) for xfer, the raster PSM otherwise.
assign vram_psm_pre   = xfer_busy ? bitbltbuf_q[61:56] : raster_pixel_psm_q;
assign vram_wdata_pre = xfer_busy ? xfer_wdata         : raster_pixel_color_q[31:0];
assign vram_waddr_pre = xfer_busy ? xfer_waddr         : raster_pixel_fb_addr_q;
// Ch157 PSMT4 RMW pipeline (replaces Ch156's hard-gate).
//
// PSMT4 packs 2 pixels per byte: the writer emits a 4-bit
// nibble pre-shifted into either the LOW or HIGH nibble of
// `vram_wdata_pre[7:0]`, with `vram_mask_pre[7:0]` set to 0x0F
// (low) or 0xF0 (high). To commit one nibble while preserving
// the other, vram_bram_stub (byte-WE only, no per-bit RMW)
// needs the FULL byte value spliced upstream — that is what
// `vram_normalize_pkg::normalize_write`'s PSMT4 branch does
// when handed the LIVE `old_byte` from `mem[byte_addr]`.
//
// The pipe drives `read2_addr = byte_addr` on the T4 emit
// cycle. One cycle later (`vram_bram_stub` registers reads
// with 1-cycle latency), `vram_read2_data` is mem[byte_addr]
// BEFORE any pending writes; we extract `old_byte` from the
// `byte_addr[1:0]` lane, splice in the new nibble, and drive
// a full-byte write at the same address — one cycle after the
// emit fired. Non-T4 emits skip the pipe entirely and write
// same-cycle through `vram_norm` (CT32/CT16/T8 normalize_write
// is pure-comb and doesn't need a read).
//
// **Forwarding hazard**: a PSMT4 SPRITE rasters adjacent
// pixels x=2k and x=2k+1 to the SAME byte_addr (low + high
// nibble). At cycle N+1 we read mem[byte_addr] for emit-2
// while emit-1's write is firing in the SAME posedge. With
// separate always_ff blocks for write and read inside
// vram_bram_stub, the read sees the PRE-write value due to
// NBA semantics. We forward emit-1's just-computed
// `t4_prev_new_byte_q` when `byte_addr` of the in-flight
// emit-2 matches the previous emit's `byte_addr`. This keeps
// the chain correct across any number of back-to-back same-
// byte writes — emit-N reads emit-(N-1)'s new_byte from the
// forward register, splices on top, and emit-(N+1) reads
// emit-N's new_byte from that same register.
// PSMT4 emit decode on the pre-normalization write request.
logic       is_t4_emit;       // a write is requested and its PSM is PSMT4
logic       t4_nibble_hi;     // mask selects the high nibble (0xF0)
logic [3:0] t4_nibble_value;  // the 4-bit payload, taken from the selected nibble

assign t4_nibble_hi    = (vram_mask_pre[7:0] == 8'hF0);
assign is_t4_emit      = (vram_psm_pre == vram_normalize_pkg::PSM_PSMT4) && vram_we_pre;
assign t4_nibble_value = t4_nibble_hi ? vram_wdata_pre[7:4]
                                      : vram_wdata_pre[3:0];
// Ch157 writer-side normalization. CT32/CT16/T8 use the pure-
// comb path (same-cycle write). T4 uses `vram_norm_t4` below
// with the read-back `old_byte` plumbed in.
// Ch157 writer-side normalization for the same-cycle (non-PSMT4) path.
// nibble_hi and old_byte are passed as constants because the PSMT4 splice is
// performed by the separate RMW pipe, not through this call.
vram_normalize_pkg::norm_out_t vram_norm;
always_comb begin
    vram_norm = vram_normalize_pkg::normalize_write(
        vram_waddr_pre,   // byte address from the selected writer
        vram_psm_pre,     // pixel storage mode of that writer
        vram_wdata_pre,   // write payload
        1'b0,             // nibble_hi — non-T4 paths ignore it
        8'd0              // old_byte  — non-T4 paths ignore it
    );
end
// Pipe stage 1 — captured T4 emit signals one cycle after the
// emit fired. Read2 has produced mem[byte_addr] at this point.
// RMW pipe stage 1: a registered copy of the PSMT4 emit, valid in the cycle
// after the emit. Address, nibble select and nibble value are captured every
// cycle; t4_pipe_valid_q says whether they belong to a real PSMT4 emit.
logic        t4_pipe_valid_q;
logic        t4_pipe_nibble_hi_q;
logic [3:0]  t4_pipe_nibble_q;
logic [31:0] t4_pipe_addr_q;

// Synchronous, active-low reset.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        t4_pipe_addr_q      <= '0;
        t4_pipe_nibble_q    <= '0;
        t4_pipe_nibble_hi_q <= 1'b0;
        t4_pipe_valid_q     <= 1'b0;
    end else begin
        t4_pipe_addr_q      <= vram_waddr_pre;
        t4_pipe_nibble_q    <= t4_nibble_value;
        t4_pipe_nibble_hi_q <= t4_nibble_hi;
        t4_pipe_valid_q     <= is_t4_emit;
    end
end
// Pipe stage 2 forward — the just-completed RMW's address +
// produced byte. Used to forward when the next T4 emit hits
// the same byte_addr.
// RMW pipe stage 2 (forwarding registers): address and resulting byte of the
// RMW that was in stage 1 during the previous cycle.
logic        t4_prev_valid_q;
logic [31:0] t4_prev_addr_q;
logic [7:0]  t4_prev_new_byte_q;

// Byte lane of the read-back word addressed by the stage-1 byte address
// (lane 0 = bits [7:0] ... lane 3 = bits [31:24]).
logic [7:0]  t4_read_byte_lane;
assign t4_read_byte_lane = vram_read2_data[{t4_pipe_addr_q[1:0], 3'b000} +: 8];

// Forwarding hit: the previous RMW was valid and targeted the same byte, so
// its result supersedes the (pre-write) value returned by the read port.
logic        t4_fwd_hit;
assign t4_fwd_hit = t4_prev_valid_q && (t4_prev_addr_q == t4_pipe_addr_q);

logic [7:0]  t4_effective_old_byte;
assign t4_effective_old_byte = t4_fwd_hit ? t4_prev_new_byte_q
                                          : t4_read_byte_lane;

// Splice: start from the old byte and overwrite only the selected nibble.
logic [7:0]  t4_new_byte;
always_comb begin
    t4_new_byte = t4_effective_old_byte;
    if (t4_pipe_nibble_hi_q)
        t4_new_byte[7:4] = t4_pipe_nibble_q;
    else
        t4_new_byte[3:0] = t4_pipe_nibble_q;
end

// Capture this cycle's RMW for next cycle's forwarding check
// (synchronous, active-low reset).
always_ff @(posedge clk) begin
    if (!rst_n) begin
        t4_prev_new_byte_q <= '0;
        t4_prev_addr_q     <= '0;
        t4_prev_valid_q    <= 1'b0;
    end else begin
        t4_prev_new_byte_q <= t4_new_byte;
        t4_prev_addr_q     <= t4_pipe_addr_q;
        t4_prev_valid_q    <= t4_pipe_valid_q;
    end
end
// T4 RMW write-side: word-aligned addr, full-byte data shifted
// to the byte_addr[1:0] lane, single-byte write_be.
// PSMT4 RMW write request: word-aligned address, the spliced byte moved into
// the lane selected by the byte address' low two bits, and a one-hot byte
// enable for that lane.
logic [31:0] t4_write_addr;
logic [31:0] t4_write_data;
logic [3:0]  t4_write_be;

assign t4_write_addr = {t4_pipe_addr_q[31:2], 2'b00};

always_comb begin
    // lane N -> byte enable bit N, data shifted left by 8*N bits
    t4_write_be   = 4'b0001 << t4_pipe_addr_q[1:0];
    t4_write_data = {24'd0, t4_new_byte} << {t4_pipe_addr_q[1:0], 3'b000};
end
// VRAM final mux: T4 RMW pipe write OR same-cycle non-T4 write.
// gs_stub keeps PSM constant within a raster, so a T4 pipe
// entry never overlaps with a non-T4 emit cycle from the
// raster path (the raster pipe drains before any FRAME_1
// swap can re-arm a different PSM). Non-T4 emits route
// through `vram_norm` directly.
//
// **Ch157 audit Medium fix — overlap assertion**: the wrapper
// also muxes `gif_image_xfer_stub` writes through `vram_we_pre`
// when `xfer_busy=1`. The "no T4-pipe / non-T4-emit overlap"
// property is therefore an *unenforced* invariant in the design
// today (raster + xfer aren't allowed to overlap PSM-mismatched
// writes by the surrounding flow, but nothing in this wrapper
// checks it). The mux below gives the T4 pipe unconditional
// priority — if a caller ever violates the invariant, the non-
// T4 write would be silently dropped. The sim-only block at
// the end of the file asserts the invariant loudly so the
// violation surfaces as a $error instead of as silent data
// loss. Synthesis sees only the mux logic; the assertion is
// bracketed by translate_off/_on.
// Final VRAM write request. A valid PSMT4 RMW pipe entry always wins;
// otherwise the same-cycle normalized write goes through, provided the
// current request is not itself a PSMT4 emit (that one enters the pipe and
// is written one cycle later).
logic        vram_we_final;
logic [3:0]  vram_wbe_final;
logic [31:0] vram_waddr_final;
logic [31:0] vram_wdata_final;

// Same-cycle (non-PSMT4) write request.
logic        vram_direct_we;
assign vram_direct_we = vram_we_pre && !is_t4_emit;

assign vram_we_final    = t4_pipe_valid_q || vram_direct_we;
assign vram_wbe_final   = t4_pipe_valid_q ? t4_write_be   : vram_norm.write_be;
assign vram_wdata_final = t4_pipe_valid_q ? t4_write_data : vram_norm.write_data;
assign vram_waddr_final = t4_pipe_valid_q ? t4_write_addr : vram_norm.write_addr;

// Simulation-only check: report the cycle in which a pipe write and a
// same-cycle non-PSMT4 write coincide, since the mux above would drop the
// latter.
// synthesis translate_off
always_ff @(posedge clk) begin
    if (rst_n && t4_pipe_valid_q && vram_direct_we)
        $error("Ch157: T4 RMW pipe write @%0t collides with non-T4 vram_we_pre (psm=0x%02h, addr=0x%08h); non-T4 write would be dropped — caller violated the no-overlap invariant",
               $time, vram_psm_pre, vram_waddr_pre);
end
// synthesis translate_on
// ---------------------------------------------------------------------
// Ch296 — CLUT load engine + palette table for PSMT8 indexed textures.
//
// The read2 bus signals are declared here (ahead of the arbitration
// mux) so the CLUT loader instance below can tap vram_read2_data.
// (clut_ld_busy is declared up by the gs_clut_* signals so the gs_stub
// instance above can reference it.)
// VRAM read buses, declared early so the CLUT loader instance can tap
// the read data before the arbitration logic is written out:
//   vram_read2_*           — second read port (read2) address / data
//   vram_raddr, vram_rdata — scanout (first read port) address / data
logic [31:0] vram_read2_addr, vram_read2_data;
logic [31:0] vram_raddr,      vram_rdata;

// Build-time choice of which VRAM read port feeds the CLUT loader.
// When the CSM1 CLUT path is enabled but the read2 array is not
// instantiated (textures come from the external LPDDR cache), the
// short boot-time CLUT load borrows the scanout read port instead.
// The load precedes raster, so temporarily pausing scanout reads is
// harmless; both ports have the same registered one-cycle latency.
localparam bit CLUT_LOAD_USE_READ0 = !VRAM_ENABLE_READ2 && CLUT_CSM1_ENABLE;
//
// clut_loader_stub copies 256 PSMCT32 entries from VRAM[CBP*256] into
// clut_stub when a TEX0_1 commit (gs_tex0_wr) carries a load-enabling
// CLD/CSM/CPSM. This runs at TEX0 commit — BEFORE the raster scan of
// the textured primitive — so the loader's read2 use is time-disjoint
// from the texel fetch (the mutual exclusion the architecture relies
// on; it is the FIFTH read2 consumer, given top priority below).
//
// The loader was written against a COMBINATIONAL VRAM read (data in
// the SAME cycle as the address). vram_bram_stub.read2 is a 1-cycle
// REGISTERED read, so the word for entry N arrives one cycle AFTER the
// loader presents addr(N)+write_idx=N. The loader's clut_write_data
// (= vram_read_data = the registered read) is therefore ALREADY
// aligned to the late entry, while clut_write_en/idx are one cycle
// early. To realign we delay ONLY en+idx by one cycle (so en/idx for
// entry N land in the same cycle the registered data for entry N
// returns) and use the loader's LIVE write_data — which at that cycle
// already carries entry N's word.
// ---------------------------------------------------------------------
// CLUT loader. Outputs with a `_c` suffix are the loader's own
// (un-delayed) palette write strobes; clut_ld_rd_addr is the VRAM
// address it wants to read.
logic [31:0] clut_ld_rd_addr;
logic [31:0] clut_ld_wr_data_c;
logic [7:0]  clut_ld_wr_idx_c;
logic        clut_ld_wr_en_c;

clut_loader_stub #(
    .CLUT_CSM1_ENABLE (CLUT_CSM1_ENABLE)
) u_clut_loader (
    .clk             (clk),
    .rst_n           (rst_n),
    // TEX0 commit pulse plus the CLUT-related TEX0 fields from gs_stub.
    .tex0_wr_pulse   (gs_tex0_wr),
    .tex0_cbp        (gs_tex0_cbp),
    .tex0_cpsm       (gs_tex0_cpsm),
    .tex0_csm        (gs_tex0_csm),
    .tex0_csa        (gs_tex0_csa),
    .tex0_cld        (gs_tex0_cld),
    // VRAM read side: data comes from the scanout port when
    // CLUT_LOAD_USE_READ0 is set, otherwise from read2.
    .vram_read_addr  (clut_ld_rd_addr),
    .vram_read_data  (CLUT_LOAD_USE_READ0 ? vram_rdata : vram_read2_data),
    // Palette write side (realigned downstream before reaching clut_stub).
    .clut_write_en   (clut_ld_wr_en_c),
    .clut_write_idx  (clut_ld_wr_idx_c),
    .clut_write_data (clut_ld_wr_data_c),
    .load_busy       (clut_ld_busy)
);
// 1-cycle realignment: delay en+idx only; write_data is taken LIVE
// (it is the registered read, already aligned to the late entry).
// One-cycle delay of the loader's write-enable and write-index so they
// line up with the registered VRAM read data. The data itself is NOT
// delayed here. Both registers clear to zero under synchronous reset.
logic [7:0] clut_ld_wr_idx_q;
logic       clut_ld_wr_en_q;

always_ff @(posedge clk) begin
    if (!rst_n) begin
        {clut_ld_wr_en_q, clut_ld_wr_idx_q} <= {1'b0, 8'd0};
    end else begin
        {clut_ld_wr_en_q, clut_ld_wr_idx_q} <= {clut_ld_wr_en_c, clut_ld_wr_idx_c};
    end
end
// Palette table. Written by the CLUT loader (delayed enable/index, live
// data) and read by the texture sampler through the tex_read_* port.
clut_stub u_clut (
    .clk           (clk),
    .rst_n         (rst_n),
    // Write port: en/idx are the 1-cycle-delayed copies; data is the
    // loader's live output, i.e. the registered VRAM read word.
    .write_en      (clut_ld_wr_en_q),
    .write_idx     (clut_ld_wr_idx_q),
    .write_data    (clut_ld_wr_data_c),
    // Texture-sampler read port — drives the PSMT8 index lookup.
    .tex_read_idx  (gs_clut_rd_idx),
    .tex_read_data (gs_clut_rd_data),
    // PCRTC scanout read port — unused in this textured-board top
    // (PCRTC clut_enable=0): index tied to 0, data left unconnected.
    .read_idx      (8'd0),
    .read_data     ()
);
// ---------------------------------------------------------------------
// Ch295 — read2 (second VRAM read port) ARBITRATION.
//
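// vram_bram_stub exposes ONE second read port (read2). As of Ch295
// two consumers wanted it, (A) and (B) below; Brick 2a, Brick 2b and
// Ch296 add three more, described further down. (A) and (B) are
// MUTUALLY EXCLUSIVE BY PIXEL FORMAT, so a static-priority mux is
// collision-free: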
//
// (A) PSMT4 RMW old-byte read — fires only on `is_t4_emit`, i.e.
// when the FRAME buffer PSM == PSMT4 (the raster/xfer writer
// is committing a 4-bit nibble and needs the live byte to
// splice). gs_stub holds PSM constant within a raster.
// (B) Textured-SPRITE texel fetch — fires only on `gs_tex_rd_en`,
// i.e. SPRITE + TME + texture-PSM==PSMCT32 (see gs_stub
// s1_tex_active). A textured PSMCT32 SPRITE writes the FRAME
// buffer as PSMCT32, so `is_t4_emit` is 0 for every pixel of
// that primitive — the T4 path is dormant.
//
// Because a given primitive is EITHER a PSMT4 write (consumer A) OR
// a PSMCT32 textured SPRITE (consumer B), the two enables never
// assert in the same cycle. T4 gets static priority (it is the
// load-bearing RMW path for the PSMT4 production cases); the texel
// fetch takes the port whenever T4 is idle. The runtime assertion
// below flags any overlap loudly in sim (it must never fire).
//
// Latency: vram_bram_stub.read2 is a 1-cycle REGISTERED read. The
// T4 pipe already assumes this (it presents addr on the emit cycle
// and consumes read2_data one cycle later). gs_stub is built with
// TEX_RD_REGISTERED=1 so it presents the texel address one stage
// early (S0) and consumes the registered data at S1 — the same
// 1-cycle round trip. Both consumers therefore see the identical
// BRAM read latency they were each designed against.
//
// Brick 2a — THIRD read2 consumer: the alpha-blend dest-fb read
// (gs_fb_rd_en/gs_fb_rd_addr). It is mutually exclusive with BOTH
// existing consumers by pixel format:
// - vs PSMT4 RMW (A): a flat alpha-blend SPRITE writes PSMCT32, so
// is_t4_emit is 0 for every pixel of that primitive.
// - vs texel fetch (B): gs_stub only sets fb_rd_en for a FLAT
// (non-textured) blended SPRITE, and only sets tex_rd_en for a
// TEXTURED SPRITE. A given SPRITE is one or the other, never
// both — gs_stub.new_abe_active requires !close_tme_effective.
// Brick 2b — FOURTH read2 consumer: the Z-buffer stored-Z read
// (gs_z_rd_en/gs_z_rd_addr). Mutually exclusive with ALL three
// existing consumers by pixel format / feature:
// - vs PSMT4 RMW (A): a flat Z-tested SPRITE writes PSMCT32, so
// is_t4_emit is 0 for every pixel of that primitive.
// - vs texel fetch (B): gs_stub only sets z_rd_en for a FLAT
// (non-textured) Z-tested SPRITE (new_zte_active requires
// !close_tme_effective); tex_rd_en is set only for TEXTURED.
// - vs alpha dest-fb (C): new_zte_active requires !new_abe_active,
// so a primitive is EITHER Z-tested OR alpha-blended, never both.
// Static priority T4 > texel > dest-fb > Z-read; the assertion below
// flags any overlap loudly in sim (it must never fire).
// Ch296 — FIFTH read2 consumer: the CLUT load (clut_ld_busy). It runs
// at TEX0 commit, strictly BEFORE the raster scan, so it is
// time-disjoint from the four raster-time consumers below. Given top
// priority; the assertion confirms it never coincides with a raster
// read in practice. (vram_read2_addr/_data are declared above.)
// Static-priority address select for the single read2 port. The first
// asserted enable, top to bottom, wins; lower-priority requesters get
// no read that cycle. With no requester the address parks at 0.
assign vram_read2_addr = clut_ld_busy ? clut_ld_rd_addr // 1: CLUT load
: is_t4_emit ? (vram_waddr_pre & ~32'd3) // 2: PSMT4 RMW old-data read; low 2 address bits cleared (word-aligned)
: gs_tex_rd_en ? gs_tex_rd_addr // 3: texel fetch
: gs_fb_rd_en ? gs_fb_rd_addr // 4: alpha-blend dest-fb read
: gs_z_rd_en ? gs_z_rd_addr // 5: Z-buffer stored-Z read
: 32'd0; // idle
// Texel fetch, dest-fb read AND Z-read consume the SAME registered
// read2_data. (The T4 lane extraction above also reads
// vram_read2_data; none of the four raster-time consumers overlap,
// and the CLUT load is time-disjoint from all of them.)
// Ch322 — TEXEL data is muxed to the external prefilled texture cache when
// GS_LPDDR_TEX and the cache is warm and the byte address is inside the cached
// texture range. The select is registered ONE cycle so it aligns with the cache's
// 1-cycle registered data AND vram_read2_data (both land the cycle after the
// address is presented) — identical timing to the BRAM texel path. fb/Z reads
// always come from BRAM (the cache only services texture).
// The texel read request (enable + byte address) is always exported on
// the *_o outputs, regardless of GS_LPDDR_TEX.
assign gs_tex_rd_addr_o = gs_tex_rd_addr;
assign gs_tex_rd_en_o   = gs_tex_rd_en;

generate
    if (GS_LPDDR_TEX) begin : g_lpddr_tex
        // Request address lies inside [TEX_VRAM_BASE, TEX_VRAM_BASE + TEX_CACHE_BYTES).
        wire tex_in_range = (gs_tex_rd_addr >= TEX_VRAM_BASE)
                         && (gs_tex_rd_addr < (TEX_VRAM_BASE + TEX_CACHE_BYTES));

        logic        tex_cache_sel_q;   // 1-cycle-delayed "serve from cache" select
        logic        gs_tex_rd_en_d;    // 1-cycle-delayed texel read enable
        logic [31:0] cache_hits_q;      // texel reads served by the cache
        logic [31:0] bram_hits_q;       // texel reads that fell back to BRAM

        // Select pipeline: register the decision one cycle so it lines up
        // with the data returned for that request.
        always_ff @(posedge clk) begin
            if (!rst_n) begin
                tex_cache_sel_q <= 1'b0;
                gs_tex_rd_en_d  <= 1'b0;
            end else begin
                tex_cache_sel_q <= tex_cache_ready_i && gs_tex_rd_en && tex_in_range;
                gs_tex_rd_en_d  <= gs_tex_rd_en;
            end
        end

        // Hit counters, keyed off the delayed select/enable so each texel
        // read is counted exactly once, by the source that served it.
        always_ff @(posedge clk) begin
            if (!rst_n) begin
                cache_hits_q <= 32'd0;
                bram_hits_q  <= 32'd0;
            end else begin
                if (tex_cache_sel_q) begin
                    cache_hits_q <= cache_hits_q + 32'd1;
                end else if (gs_tex_rd_en_d) begin
                    bram_hits_q <= bram_hits_q + 32'd1;
                end
            end
        end

        assign gs_tex_rd_data   = tex_cache_sel_q ? tex_cache_data_i : vram_read2_data;
        assign tex_cache_hits_o = cache_hits_q;
        assign tex_bram_hits_o  = bram_hits_q;
    end else begin : g_no_lpddr_tex
        // No external texture cache: texels always come from read2 and
        // the hit counters read as zero.
        assign gs_tex_rd_data   = vram_read2_data;
        assign tex_cache_hits_o = 32'd0;
        assign tex_bram_hits_o  = 32'd0;
    end
endgenerate

// Dest-fb and Z reads are always served from the BRAM read2 data.
assign gs_z_rd_data  = vram_read2_data;
assign gs_fb_rd_data = vram_read2_data;
// synthesis translate_off
// Simulation-only checks on the read2 arbiter: any cycle with two or more
// requesters active means a lower-priority read is lost, so each pairing
// that is expected to be impossible is reported. All checks are
// suppressed while reset is asserted.
always_ff @(posedge clk) begin
    if (rst_n) begin
        // PSMT4 RMW vs texel fetch.
        if (is_t4_emit && gs_tex_rd_en) begin
            $error("Ch295: read2 arbitration overlap @%0t — PSMT4 RMW (is_t4_emit) and texel fetch (gs_tex_rd_en) both active; one read is being dropped. These are supposed to be mutually exclusive by pixel format.",
                   $time);
        end
        // Alpha dest-fb read vs either higher-priority raster consumer.
        if (gs_fb_rd_en && (is_t4_emit || gs_tex_rd_en)) begin
            $error("Brick2a: read2 arbitration overlap @%0t — alpha dest-fb read (gs_fb_rd_en) collides with %s; one read is being dropped. Flat-blend must be mutually exclusive with both.",
                   $time, is_t4_emit ? "PSMT4 RMW" : "texel fetch");
        end
        // Z-buffer read vs any other raster consumer.
        if (gs_z_rd_en && (is_t4_emit || gs_tex_rd_en || gs_fb_rd_en)) begin
            $error("Brick2b: read2 arbitration overlap @%0t — Z-buffer read (gs_z_rd_en) collides with another consumer; one read is being dropped. Z-tested flat sprite must be mutually exclusive with T4/texel/alpha.",
                   $time);
        end
        // CLUT load vs any raster-time consumer.
        if (clut_ld_busy && (is_t4_emit || gs_tex_rd_en || gs_fb_rd_en || gs_z_rd_en)) begin
            $error("Ch296: read2 arbitration overlap @%0t — CLUT load (clut_ld_busy) collides with a raster-time consumer; one read is being dropped. CLUT load must complete BEFORE the raster scan begins.",
                   $time);
        end
    end
end
// synthesis translate_on
// Ch320 — forward the PCRTC scanout address to the wrapper. This is the
// raw PCRTC address, not the post-mux address actually sent to VRAM.
assign vram_read_addr_o = vram_raddr;

// Address for the first VRAM read port. When the build borrows this port
// for the CLUT load (CLUT_LOAD_USE_READ0), the loader's address replaces
// the scanout address for as long as clut_ld_busy is high; in every
// other build the port always carries the scanout address.
logic [31:0] vram_read0_addr;
generate
    if (CLUT_LOAD_USE_READ0) begin : g_read0_shared
        assign vram_read0_addr = clut_ld_busy ? clut_ld_rd_addr : vram_raddr;
    end else begin : g_read0_scanout
        assign vram_read0_addr = vram_raddr;
    end
endgenerate
// Ch157 — vram_bram_stub fed by the normalize_write output for
// CT32/CT16/T8 (same-cycle) and by the T4 RMW pipe (1-cycle
// delayed). PCRTC's `vram_read_data` arrives one cycle late vs
// the legacy combinational read, and gs_pcrtc_stub now consumes
// it via VRAM_SYNC_READ=1 (Ch158): the data-decode + sync-output
// pipeline is shifted right by 1 cycle so it lines up with the
// BRAM's registered output, and the sub-word lane extract
// resolves the right byte/halfword/nibble inside the 32-bit
// word for CT16/T8/T4 reads.
// VRAM instance: one byte-enabled write port and two read ports. The
// read_valid / read2_valid outputs are left unconnected here.
vram_bram_stub #(
    .ENABLE_READ2 (VRAM_ENABLE_READ2),
    .BYTES        (VRAM_BYTES)
) u_vram (
    .clk         (clk),
    .rst_n       (rst_n),
    // Write port — output of the T4 / non-T4 final write mux.
    .write_en    (vram_we_final),
    .write_be    (vram_wbe_final),
    .write_addr  (vram_waddr_final),
    .write_data  (vram_wdata_final),
    // First read port — scanout, or the CLUT loader when it borrows it.
    .read_addr   (vram_read0_addr),
    .read_data   (vram_rdata),
    .read_valid  (),
    // Second read port — arbitrated between the read2 consumers.
    .read2_addr  (vram_read2_addr),
    .read2_data  (vram_read2_data),
    .read2_valid ()
);
// ---------------------------------------------------------------------
// gs_pcrtc_stub — PSMCT32 swizzled scanout
// ---------------------------------------------------------------------
// Frame-boundary pulse; driven further down from the vsync edge.
logic end_of_frame;

// Scanout engine. Configured for a registered (1-cycle) VRAM read via
// VRAM_SYNC_READ; its CLUT port is disabled and its event outputs are
// left unconnected in this top.
gs_pcrtc_stub #(
    .H_ACTIVE            (H_ACTIVE),
    .H_FRONT             (H_FRONT),
    .H_SYNC              (H_SYNC),
    .H_BACK              (H_BACK),
    .V_ACTIVE            (V_ACTIVE),
    .V_FRONT             (V_FRONT),
    .V_SYNC              (V_SYNC),
    .V_BACK              (V_BACK),
    .PSMCT32_SWIZZLE     (PSMCT32_SWIZZLE),
    .VRAM_SYNC_READ      (1'b1),
    .STRIP_PCRTC_MAG_DIV (STRIP_PCRTC_MAG_DIV)
) u_pcrtc (
    .clk            (clk),
    .rst_n          (rst_n),
    // Display-control register values.
    .pmode_q        (pmode_q),
    .dispfb1_q      (dispfb1_q),
    .display1_q     (display1_q),
    // VRAM scanout read. The same address is also exported as
    // vram_read_addr_o by a continuous assign earlier in this module.
    .vram_read_addr (vram_raddr),
    .vram_read_data (vram_rdata),
    // CLUT lookup disabled: inputs tied off, index output unconnected.
    .clut_enable    (1'b0),
    .clut_csa       (5'd0),
    .clut_read_idx  (),
    .clut_read_data (32'd0),
    // Video timing and pixel outputs.
    .hsync          (hsync),
    .vsync          (vsync),
    .de             (de),
    .r              (r),
    .g              (g),
    .b              (b),
    .pix_window_o   (pix_window_o),
    // Event/trace outputs — unused here.
    .ev_valid       (),
    .ev_subsys      (),
    .ev_event       (),
    .ev_arg0        (),
    .ev_arg1        (),
    .ev_arg2        (),
    .ev_arg3        (),
    .ev_flags       ()
);
// gs_pcrtc_stub doesn't expose end_of_frame as a port; the Ch123 TB
// taps it via hierarchical ref. For the top wrapper we synthesize an
// equivalent edge by watching vsync rise.
// Rising-edge detector on vsync: vsync_d holds the previous cycle's
// vsync (0 under reset), and end_of_frame is high for the one cycle in
// which vsync is 1 while the previous sample was 0.
logic vsync_d;

always_ff @(posedge clk) begin
    if (!rst_n) begin
        vsync_d <= 1'b0;
    end else begin
        vsync_d <= vsync;
    end
end

assign end_of_frame = !vsync_d && vsync;
// ---------------------------------------------------------------------
// Sticky status outputs.
// ---------------------------------------------------------------------
// Sticky "has ever happened" flags. Each one is set by its event and is
// only cleared by reset:
//   frame_seen_q    — an end_of_frame pulse has occurred
//   dma_done_seen_q — a DMAC event with code EV_DMA_DONE has occurred
logic frame_seen_q;
logic dma_done_seen_q;

always_ff @(posedge clk) begin
    if (!rst_n) begin
        frame_seen_q    <= 1'b0;
        dma_done_seen_q <= 1'b0;
    end else begin
        if (end_of_frame) begin
            frame_seen_q <= 1'b1;
        end
        if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE)) begin
            dma_done_seen_q <= 1'b1;
        end
    end
end

assign frame_seen    = frame_seen_q;
assign dma_done_seen = dma_done_seen_q;
// ---------------------------------------------------------------------
// Ch174 — event toggles for HPS-visible counters. Separate from the
// sticky status latches above so LED bits / CORE_STATUS keep their
// existing one-shot semantics.
// ---------------------------------------------------------------------
// Event toggles: each bit flips once per occurrence of its event, so an
// observer can count events by watching for changes. Both start at 0
// after reset.
//   dma_done_toggle_q — flips on a DMAC event with code EV_DMA_DONE
//   frame_toggle_q    — flips on each end_of_frame pulse
logic dma_done_toggle_q;
logic frame_toggle_q;

always_ff @(posedge clk) begin
    if (!rst_n) begin
        dma_done_toggle_q <= 1'b0;
        frame_toggle_q    <= 1'b0;
    end else begin
        if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE)) begin
            dma_done_toggle_q <= ~dma_done_toggle_q;
        end
        if (end_of_frame) begin
            frame_toggle_q <= ~frame_toggle_q;
        end
    end
end

assign dma_done_toggle = dma_done_toggle_q;
assign frame_toggle    = frame_toggle_q;
endmodule : top_psmct32_raster_demo_bram