ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1673 lines
87 KiB
Systemverilog
1673 lines
87 KiB
Systemverilog
// retroDE_ps2 — top_psmct32_raster_demo_bram (Ch157 BRAM
|
||
// wrapper with writer-side PSM normalization + PSMT4 RMW pipe)
|
||
//
|
||
// **Ch155 origin / Ch156 generalization / Ch157 PSMT4 enable.**
|
||
// Ch156 plumbs `vram_normalize_pkg::normalize_write` between the
|
||
// writer engines (gs_stub raster + gif_image_xfer_stub) and
|
||
// `vram_bram_stub`. Ch157 adds a 1-cycle wrapper-site read-modify-
|
||
// write pipeline that lets `normalize_write` splice the PSMT4
|
||
// nibble against the live `old_byte` from VRAM (instead of
|
||
// `old_byte=0`, which Ch156 worked around with a hard-gate). The
|
||
// VRAM instance is `vram_bram_stub` (Ch154 BRAM-friendly: 2048 ×
|
||
// 32-bit, sync read 1-cycle latency, byte-WE only, no per-bit
|
||
// mask) instead of `vram_stub` (legacy 8192 × 8-bit byte-
|
||
// addressable + per-bit mask RMW + combinational reads).
|
||
//
|
||
// **PSM coverage** (Ch157):
|
||
// - CT32: passthrough (byte_addr is word-aligned, payload is
|
||
// the ABGR word, be=4'b1111). Same-cycle write.
|
||
// - CT16: byte_addr[1] selects low/high halfword lane; payload
|
||
// shifted to the right 16-bit lane; be=4'b0011 or 4'b1100.
|
||
// Same-cycle write.
|
||
// - PSMT8: byte_addr[1:0] selects 1 of 4 byte lanes; payload
|
||
// shifted; be picks one byte. Same-cycle write.
|
||
// - PSMT4: 1-cycle delayed write through the RMW pipe. The
|
||
// wrapper drives `read2_addr = byte_addr` on the emit cycle
|
||
// and uses the registered `read2_data` one cycle later as
|
||
// `old_byte` for `normalize_write`. The Ch156 hard-gate is
|
||
// gone. Back-to-back same-byte T4 emits (e.g. PSMT4 SPRITE
|
||
// pixels at x=2k and x=2k+1 share a byte) hazard-forward
|
||
// through `t4_prev_*` registers — see the comment block at
|
||
// the pipe instantiation below.
|
||
//
|
||
// **Ch158 PCRTC sync-read alignment**: `gs_pcrtc_stub` is now
|
||
// instantiated with `VRAM_SYNC_READ=1` so its data-decode +
|
||
// sync-output stages are delayed by 1 cycle to align with
|
||
// `vram_bram_stub`'s registered `read_data`. The address-side
|
||
// (`vram_read_addr`) keeps using the current scanout coords so
|
||
// the read is issued one pixel "ahead"; the registered
|
||
// `vram_read_data` returns one cycle later, paired with the
|
||
// matching delayed counter view inside the PCRTC. Captured
|
||
// scanout pixels are no longer 1-column shifted, so the
|
||
// integration TB can now verify the captured frame matches the
|
||
// rasterized VRAM contents end-to-end (Ch155 had to skip frame
|
||
// capture, Ch156/Ch157 still skipped it; Ch158 unblocks it).
|
||
//
|
||
// Topology — same as the Ch146 wrapper (see
|
||
// rtl/top/top_psmct32_raster_demo.sv) except for the VRAM
|
||
// instance:
|
||
//
|
||
// bios_rom_stub#(.IMAGE_FILE(BIOS_IMAGE_FILE)) — EE bootlet at 0xBFC0_0000
|
||
// ee_ram_stub#(.IMAGE_FILE(PAYLOAD_IMAGE_FILE)) — GIF payload at phys 0x100
|
||
// ee_memory_map_stub#(.USEG_SHADOW_WORDS_PARAM(1024)) — Ch145 BRAM shrink
|
||
// ee_core_stub#(.PC_RESET(0xBFC00000)) — MIPS R5900 core
|
||
// ee_gs_priv_bridge_stub — 32-bit MMIO → 64-bit GS-priv
|
||
// dmac_reg_stub — DMAC ch2
|
||
// gif_packed_stub#(.REAL_AD_REG_MAP(1'b1)) — GIFtag + PACKED A+D parser
|
||
// gs_stub#(.PSMCT32_SWIZZLE(1'b1)) — GS register file + raster
|
||
// gif_image_xfer_stub#(.PSMCT32_SWIZZLE(1'b1)) — TRXDIR/IMAGE engine (idle in Ch123)
|
||
// **vram_bram_stub#(.BYTES(8192)) — Ch154 BRAM-friendly VRAM** (DIFF)
|
||
// gs_pcrtc_stub#(.PSMCT32_SWIZZLE(1'b1), .VRAM_SYNC_READ(1'b1)) — PCRTC sync-read scanout (Ch158)
|
||
//
|
||
// All other behavior is inherited from the Ch146 wrapper:
|
||
// `$readmemh` from IMAGE_FILE parameters, useg_shadow trimmed
|
||
// to 1024 words via Ch145, status bundle (core_halt /
|
||
// dma_done_seen / frame_seen) exposed for LEDs, no procedural
|
||
// drives.
|
||
//
|
||
// Top-level ports:
|
||
// clk, rst_n — single clock domain; active-low reset (the flops declared in this wrapper reset asynchronously: `posedge clk or negedge rst_n`)
|
||
// core_go — pulsed high for one cycle to start the EE bootlet
|
||
// (a board reset-release sequencer can tie it high
|
||
// after rst_n deasserts)
|
||
// r/g/b, hsync, vsync, de — 8-bit RGB scanout (PCRTC active region)
|
||
// core_halt — high once SYSCALL halts the EE
|
||
// dma_done_seen — sticky: high once DMAC channel-2 fires its DONE event
|
||
// frame_seen — sticky: high once one full PCRTC frame end-of-frame fires
|
||
//
|
||
// Parameters:
|
||
// H_ACTIVE / V_ACTIVE — PCRTC active region (defaults to the Ch123 16×8)
|
||
// BIOS_SIZE_BYTES — bios_rom_stub size (default 4 KiB)
|
||
// RAM_SIZE_BYTES — ee_ram_stub size (default 4 KiB)
|
||
// VRAM_BYTES — vram_bram_stub size (default 8 KiB)
|
||
// USEG_SHADOW_WORDS_PARAM — Ch145 useg-shadow size (default 1024 = 4 KiB)
|
||
//
|
||
// Macros (NOT parameters — iverilog-12 string-parameter forwarding
|
||
// limitation forced them to be macros; see the `\`define` block
|
||
// below the `timescale directive):
|
||
// TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE — path to bios.mem
|
||
// (one 32-bit hex word/line)
|
||
// TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE — path to payload.mem
|
||
// (one 128-bit hex qword/line)
|
||
// Both default to "" so the wrapper is still elaborable without
|
||
// fixtures (synthetic NOP-sled in bios_rom_stub + zero-init
|
||
// ee_ram_stub, which produces no DMAC payload but a stable PCRTC
|
||
// frame). On synthesis these become FPGA-tool defines.
|
||
//
|
||
// PASS for the integration TB (`tb_top_psmct32_raster_demo_bram`):
|
||
// - all 128 PSMCT32 pixel words at canonical swizzled byte
|
||
// addresses match expected ABGR via hierarchical probe of
|
||
// `dut.u_vram.mem[byte_addr >> 2]` (Phase 1, Ch155)
|
||
// - core_halt + dma_done_seen + frame_seen latched after the
|
||
// EE bootlet SYSCALLs and the DMAC drains
|
||
// - one full PCRTC frame captured and per-pixel verified
|
||
// against the rasterized image (Phase 2, Ch158)
|
||
|
||
`timescale 1ns/1ps
|
||
|
||
// BIOS / payload image paths are passed via macros (iverilog-12
|
||
// limitation: string parameter forwarding through hierarchy
|
||
// elaborates inconsistently). On synthesis the same macros become
|
||
// FPGA-tool defines pointing at .mem fixtures or board-specific
|
||
// files. The macros default to empty strings (synthetic NOP-sled +
|
||
// zero-RAM fallback in bios_rom_stub / ee_ram_stub) so the wrapper
|
||
// is still elaborable without bake artifacts present.
|
||
// Fallback definitions: each image-path macro becomes the empty string
// unless the build has already supplied a value for it.
`ifdef TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE
// BIOS image path supplied externally — leave it untouched.
`else
`define TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE ""
`endif

`ifdef TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE
// Payload image path supplied externally — leave it untouched.
`else
`define TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE ""
`endif
|
||
|
||
module top_psmct32_raster_demo_bram
|
||
import trace_pkg::*;
|
||
#(
|
||
parameter int H_ACTIVE = 16,
|
||
parameter int V_ACTIVE = 8,
|
||
// Ch169 — expose PCRTC blanking parameters so the board wrapper can
|
||
// override them to a standard HDMI mode (e.g. VGA 640x480@60). The
|
||
// defaults (1-pixel borders) preserve the Ch123 tiny-frame behavior
|
||
// used by every existing sim TB.
|
||
parameter int H_FRONT = 1,
|
||
parameter int H_SYNC = 1,
|
||
parameter int H_BACK = 1,
|
||
parameter int V_FRONT = 1,
|
||
parameter int V_SYNC = 1,
|
||
parameter int V_BACK = 1,
|
||
parameter int BIOS_SIZE_BYTES = 4 * 1024,
|
||
parameter int RAM_SIZE_BYTES = 4 * 1024,
|
||
parameter int VRAM_BYTES = 8 * 1024,
|
||
// Ch251.4 — VRAM second-read-port enable. Default = 1 keeps every
|
||
// simulation TB byte-identical (PSMT4 RMW path is live). Hardware
|
||
// build overrides to 0 to halve VRAM's M20K footprint by avoiding
|
||
// the 1W+2R → 2×(1W+1R) replication. See vram_bram_stub for the
|
||
// full contract.
|
||
parameter bit VRAM_ENABLE_READ2 = 1'b1,
|
||
parameter int unsigned USEG_SHADOW_WORDS_PARAM = 1024,
|
||
// Ch296 — useg-shadow backing enable, threaded down to
|
||
// ee_memory_map_stub. Default 1 keeps every sim TB byte-identical
|
||
// (useg shadow live). The board build overrides to 0 to remove the
|
||
// ~33k-FF useg_shadow_mem array; the PSMCT32 SPRITE-only bootlet
|
||
// runs from BIOS + EE-RAM and issues no useg traffic, so the shadow
|
||
// is dead on the board path. See ee_memory_map_stub.USEG_SHADOW_ENABLE.
|
||
parameter bit USEG_SHADOW_ENABLE = 1'b1,
|
||
// Ch162 — passes through to `ee_core_stub.STRIP_HW_DIVIDER`. Set
|
||
// to 1 on hardware builds (the PSMCT32 SPRITE-only bootlet
|
||
// doesn't execute DIVU) so Quartus doesn't infer the 32-bit
|
||
// hardware divider and can close timing on a faster clock.
|
||
// Default 0 keeps every existing sim TB unchanged.
|
||
parameter bit STRIP_HW_DIVIDER = 1'b0,
|
||
// Ch163 — passes through to `gs_pcrtc_stub.STRIP_PCRTC_MAG_DIV`.
|
||
// Hardware builds set this to 1 (the demo locks MAGH=MAGV=0 so
|
||
// the divisor is constant 1 and the math collapses to a
|
||
// passthrough); Quartus then can't infer the PCRTC magnification
|
||
// divider, retiring the Ch162-onwards STA worst path. Default 0
|
||
// keeps every existing scanout MAG TB unchanged.
|
||
parameter bit STRIP_PCRTC_MAG_DIV = 1'b0,
|
||
|
||
// Ch295 — PSMCT32 page/block swizzle gate, mirroring the
|
||
// vram_stub variant (top_psmct32_raster_demo). Default 1'b1
|
||
// preserves the Ch123/Ch251 swizzled raster+scanout behavior and
|
||
// every existing TB that drives this BRAM top (the flat
|
||
// production demo). A TEXTURED-sprite fixture sets this to 0 so
|
||
// the linear gs_texel_addr fetch and the BITBLT texture upload
|
||
// land in the SAME (linear) VRAM layout — the v1 textured-path
|
||
// scope. Forwarded to gs_stub / gif_image_xfer_stub / gs_pcrtc_stub
|
||
// together so all three VRAM views stay consistent.
|
||
parameter bit PSMCT32_SWIZZLE = 1'b1,
|
||
// Ch298 — SWIZZLED PSMT4 texture path. When 1, the PSMT4 texture UPLOAD
|
||
// (gif_image_xfer_stub) writes the real PS2 block layout AND the texture
|
||
// SAMPLER (gs_stub -> gs_texture_unit) reads it back swizzled, so the two
|
||
// VRAM views are consistent. Default 0 keeps every linear PSMT4/PSMT8/
|
||
// PSMCT32 demo + TB byte-identical; the swizzle demo sets it to 1.
|
||
parameter bit PSMT4_SWIZZLE = 1'b0,
|
||
// Ch299 — SWIZZLED PSMT8 texture path. The sibling of PSMT4_SWIZZLE,
|
||
// MINUS the nibble (PSMT8 is 1 byte/texel). When 1, the PSMT8 texture
|
||
// UPLOAD (gif_image_xfer_stub) writes the real PS2 block layout AND the
|
||
// texture SAMPLER (gs_stub -> gs_texture_unit) reads it back swizzled, so
|
||
// the two VRAM views are consistent. The framebuffer SCANOUT stays linear
|
||
// PSMCT32 (gs_pcrtc PSMT8_SWIZZLE untouched). Default 0 keeps every linear
|
||
// PSMT8/PSMT4/PSMCT32 demo + TB byte-identical; the swizzle demo sets it 1.
|
||
parameter bit PSMT8_SWIZZLE = 1'b0,
|
||
// Ch301 — PERSPECTIVE-CORRECT textured triangles (forwarded to gs_stub).
|
||
// When 1, a TME TRIANGLE supplied via ST (S=u/w,T=v/w) + RGBAQ.Q (=1/w)
|
||
// is interpolated perspective-correctly via the pipelined reciprocal LUT.
|
||
// Default 0 generate-guards all perspective logic out (zero cost); only the
|
||
// GS_PERSP_DEMO board profile sets it 1.
|
||
parameter bit PERSPECTIVE_CORRECT = 1'b0,
|
||
parameter int PERSP_RECIP_IDX_BITS = 8, // Ch351 — perspective reciprocal LUT width (far-W -> 11)
|
||
parameter int GRAD_DIV_CYCLES = 1, // Ch352 — triangle-setup divide settle cycles (board fits -> 4)
|
||
parameter bit GRAD_SEQ_DIVIDER = 1'b0, // Ch352 — sequential gradient divider (board fits -> 1)
|
||
// Ch344 — TEXTURED + source-over ALPHA SPRITE path (forwarded to gs_stub). Default 0 -> byte-identical.
|
||
parameter bit SPRITE_TEX_ALPHA = 1'b0,
|
||
parameter bit SPRITE_TEX_ALPHA_CLUT = 1'b0, // Ch347 — admit PSMT8 (CLUT) textures into the alpha-sprite path
|
||
parameter bit CLUT_CSM1_ENABLE = 1'b0, // Ch350 — CSM1 16x16 CT32 grid CLUT load (SH3 indexed env path)
|
||
// Ch302 — COMBINED textured+alpha+depth probe (forwarded to gs_stub). When 1,
|
||
// a TME+ABE+ZTE triangle runs the multi-beat per-pixel FSM (Zread->Ztest->
|
||
// texel->dest->colorwrite->Zwrite). Default 0 generate-guards it out (every
|
||
// existing demo byte-identical); only the GS_COMBINED_DEMO board profile sets 1.
|
||
parameter bit COMBINED_TAZ = 1'b0,
|
||
// Ch303 — TILE-LOCAL render mode (forwarded to gs_stub). When 1, a combined
|
||
// TME+ABE+ZTE triangle renders into an on-chip 16x16 color+Z tile
|
||
// (CLEAR->RENDER->FLUSH); texture still from VRAM, only color/Z move on-chip.
|
||
// Default 0 generate-guards it out (every existing demo byte-identical); only
|
||
// the GS_TILE_DEMO board profile sets 1 (implies COMBINED_TAZ=1).
|
||
parameter bit TILE_LOCAL = 1'b0,
|
||
// Ch304 — tile GRID dimensions (forwarded to gs_stub). Default 1x1 = the
|
||
// Ch303 single tile (byte-identical); GS_TILE2X2_DEMO sets 2x2 to render one
|
||
// primitive across a 2x2 grid of 16x16 tiles (per-tile clear/render/flush).
|
||
parameter int TILE_COLS = 1,
|
||
parameter int TILE_ROWS = 1,
|
||
// Ch305 — MULTI-PRIMITIVE tiled scene. When TILE_MULTIPRIM=1 the tile grid
|
||
// re-renders a LIST of TILE_PRIM_COUNT primitives (all buffered in the FIFO)
|
||
// per tile, in order, so later primitives depth-test/alpha-blend over earlier
|
||
// ones within each tile. Default 0 = the Ch304 single-primitive grid
|
||
// (byte-identical); GS_TILE_MULTIPRIM_DEMO sets 1 + the batch size.
|
||
parameter bit TILE_MULTIPRIM = 1'b0,
|
||
parameter int TILE_PRIM_COUNT = 1,
|
||
// Ch329 — the Ch255 heartbeat read-splicer forcibly patches EE-RAM qword 115's low 32
|
||
// bits (0x730) with hb_rgbaq on every DMAC read, for the input-driven heartbeat demo.
|
||
// That CORRUPTS any GIF payload that happens to occupy qword 115 (e.g. a depth-64 multi-
|
||
// prim batch whose 17th prim's first XYZ2 lands there → vertex reads 0xFFF/0xFF0). Gate it
|
||
// OFF for non-heartbeat profiles (GS tile/capacity tests). Default ON = byte-identical.
|
||
parameter bit HEARTBEAT_SPLICE_ENABLE = 1'b1,
|
||
// Ch330 — runtime primitive-list feeder. When 1, gs_prim_list_feeder becomes the EXCLUSIVE
|
||
// owner of the gif_reg_* stream into gs_stub (the GIF unpacker is muxed out), expanding a
|
||
// normalized combined-TAZ list from staging instead of the baked DMA payload. Default 0 =
|
||
// dead logic, byte-identical for every existing profile.
|
||
parameter bit FEEDER_ENABLE = 1'b0,
|
||
parameter int FEEDER_STG_WORDS = 256,
|
||
// Ch315 — primitive FIFO / per-tile bin depth (capacity). Power-of-2, default
|
||
// 4 = byte-identical. GS_TILE_CAP_DEMO sets 8 to scale capacity past the old 4.
|
||
parameter int TILE_FIFO_DEPTH = 4,
|
||
// Ch317 — LPDDR-backed framebuffer (tile-flush only). When 1, the PSMCT16 tile
|
||
// FLUSH stream is ALSO committed to an LPDDR-style framebuffer model
|
||
// (gs_lpddr_fb_writer) — a transitional additive MIRROR alongside the on-chip
|
||
// BRAM FB, so the LPDDR write path (linear address, per-row bursts, 4 KiB cap)
|
||
// can be proven by readback while scanout still comes from BRAM. Tile color/Z
|
||
// and texture stay on-chip. Default 0 → the writer is inert (byte-identical).
|
||
parameter bit LPDDR_FB_ENABLE = 1'b0,
|
||
parameter int LPDDR_FB_BYTES = 8192, // 64x64 PSMCT16 = 8 KiB
|
||
// Ch306 — GS SCISSOR_1 rectangular clipping baked into the tile walker bounds
|
||
// (effective bounds = primitive bbox ∩ tile bbox ∩ scissor rect). Default 0 =
|
||
// no scissor (byte-identical); GS_TILE_SCISSOR_DEMO sets 1.
|
||
parameter bit SCISSOR_ENABLE = 1'b0,
|
||
// Ch307 — GS texture WRAP MODES (CLAMP_1 WMS/WMT: REPEAT/CLAMP) applied in the
|
||
// sampler before texel-address gen. Default 0 = pass-through (byte-identical);
|
||
// GS_TILE_WRAP_DEMO sets 1.
|
||
parameter bit TEX_WRAP_ENABLE = 1'b0,
|
||
// Ch308 — on-chip tile COLOR buffer stored as PSMCT16 (RGB5A1, 16-bit) instead
|
||
// of PSMCT32 (32-bit): halves the tile color RAM + flushes a PSMCT16 framebuffer.
|
||
// Default 0 = PSMCT32 (byte-identical); GS_TILE_PSMCT16_DEMO sets 1.
|
||
parameter bit TILE_COLOR_PSMCT16 = 1'b0,
|
||
// Ch309 — generic GS ALPHA blend modes (Cv=((A-B)*C>>7)+D selectors + FIX) per
|
||
// primitive, instead of only source-over. Default 0 = source-over (byte-identical);
|
||
// GS_TILE_ALPHA_DEMO sets 1.
|
||
parameter bit ALPHA_MODES_ENABLE = 1'b0,
|
||
// Ch310 — 4-tap BILINEAR texture filtering (PSMCT32) in the combined tile path,
|
||
// per-primitive via TEX1.MMAG. Default 0 = nearest (byte-identical);
|
||
// GS_TILE_BILINEAR_DEMO sets 1.
|
||
parameter bit BILINEAR_ENABLE = 1'b0,
|
||
// Ch314 — bilinear for PALETTIZED (PSMT8/PSMT4) textures in the combined
|
||
// path (CLUT-before-interp). Default 0 = byte-identical; GS_TILE_PALBILINEAR_DEMO sets 1.
|
||
parameter bit PALETTE_BILINEAR = 1'b0,
|
||
// Ch311 — real per-tile BIN BUFFER: precompute per-tile primitive lists in a
|
||
// binning pass, render walks each tile's bin (vs re-testing all prims per tile).
|
||
// Default 0 = Ch305 re-test path (byte-identical); GS_TILE_BIN_DEMO sets 1.
|
||
parameter bit BIN_BUFFER_ENABLE = 1'b0,
|
||
// Ch323 — tile COLOR+Z spill/reload to LPDDR (forwarded to gs_stub). Default 0 →
|
||
// byte-identical (no RELOAD/ZFLUSH phases, no Z-flush stream). The GS_TILE_SPILL_DEMO
|
||
// board/e2e profile sets 1.
|
||
parameter bit TILE_SPILL_ENABLE = 1'b0,
|
||
parameter bit SPILL_FORCE_VALID = 1'b0, // Ch323 test hook (negative bootstrap test)
|
||
// Ch326 — LPDDR-ONLY framebuffer: when 1, the tile color flush does NOT mirror into the
|
||
// on-chip vram_stub FB (it only spills to LPDDR via the dedicated color channel), so the
|
||
// 64 KiB BRAM FB mirror is reclaimed and the displayed FB is the EXTERNAL LPDDR one. Only
|
||
// valid with TILE_SPILL_ENABLE + LPDDR scanout (no BRAM-scanout fallback). vram_stub then
|
||
// holds only the texture (place it low, e.g. TBP0=0, and shrink VRAM_BYTES accordingly).
|
||
parameter bit FB_LPDDR_ONLY = 1'b0,
|
||
// Ch322 — LPDDR-backed texture (PREFILLED cache). When 1, the texel-fetch
|
||
// read port is MUXED to an external prefilled texture cache (gs_texture_cache,
|
||
// instantiated in the de25 top on emif_clk, filled from LPDDR4B before raster)
|
||
// once that cache reports ready — at the SAME 1-cycle latency as the BRAM read2,
|
||
// so the nearest-path sampler's fixed-latency contract is preserved. Default 0
|
||
// generate-guards the mux to a constant pass-through (byte-identical); only the
|
||
// GS_LPDDR_TEX board/e2e profile sets 1. TEX_VRAM_BASE/TEX_CACHE_BYTES bound the
|
||
// cached VRAM byte range (reads outside it still come from BRAM).
|
||
parameter bit GS_LPDDR_TEX = 1'b0,
|
||
parameter int TEX_VRAM_BASE = 2048, // TBP0*256 (tritex: TBP0=8)
|
||
parameter int TEX_CACHE_BYTES = 256 // 8x8 PSMCT32
|
||
) (
|
||
input logic clk,
|
||
input logic rst_n,
|
||
input logic core_go,
|
||
|
||
output logic [7:0] r,
|
||
output logic [7:0] g,
|
||
output logic [7:0] b,
|
||
output logic hsync,
|
||
output logic vsync,
|
||
output logic de,
|
||
// Ch320 — PCRTC scanout VRAM byte address, for the LPDDR4B scanout reader.
|
||
// LPDDR mirrors BRAM VRAM byte-for-byte, so indexing the LPDDR frame cache
|
||
// by this address yields the identical pixel (seamless video-source mux).
|
||
output logic [31:0] vram_read_addr_o,
|
||
// Ch320 — high when the scanout pixel is inside the displayed frame window
|
||
// (for gating an external LPDDR4B scanout so it shows one frame, not a tiled fill).
|
||
output logic pix_window_o,
|
||
|
||
output logic core_halt,
|
||
output logic dma_done_seen,
|
||
output logic frame_seen,
|
||
// Ch173 — pass the gs_stub raster_overflow flag out so the
|
||
// board wrapper can route it into the HPS bridge's
|
||
// RASTER_OVERFLOW_COUNT register. Under Ch172 backpressure
|
||
// this should stay LOW forever; non-zero on hardware reads
|
||
// means the backpressure path broke somewhere.
|
||
output logic raster_overflow,
|
||
// Ch174 — event toggles (not raw pulses). These flip on every
|
||
// PCRTC end-of-frame / DMAC done pulse in the design clock
|
||
// domain. The bridge 2-FF syncs the toggle, XORs against its
|
||
// last sample, and increments the matching counter on each
|
||
// detected edge. This is the textbook pulse-CDC primitive —
|
||
// the toggle stays at its new value until the next event
|
||
// (~16.7 ms for frames), so the synchronizer has megacycles
|
||
// of slack and cannot miss an event. Do NOT "simplify" this
|
||
// back into raw 1-cycle pulses crossing CLOCK2_50: a 25 MHz
|
||
// pulse is borderline against a 50 MHz 2-FF sync.
|
||
output logic frame_toggle,
|
||
output logic dma_done_toggle,
|
||
|
||
// Ch255 — heartbeat color override from the controller. Tied
|
||
// straight to INPUT_P1_RAW[9] (Sony ○ / JOY_A) and INPUT_P1_RAW[7]
|
||
// (Sony × / JOY_B) at the board top. While the demo's normal EE-
|
||
// driven cyan↔red toggle keeps running in the background, holding
|
||
// a face button overrides the heartbeat color that the splicer
|
||
// injects into the DMAC read response on the NEXT drain:
|
||
//
|
||
// joy_a_pressed_i alone : force RED (0xFF0000FF)
|
||
// joy_b_pressed_i alone : force CYAN (0xFFFFFF00)
|
||
// both pressed : invert the current EE value
|
||
// (XOR with 0x00FFFFFF, swaps cyan↔red)
|
||
// neither pressed : EE's hb_rgbaq_reg passes through
|
||
//
|
||
// Response latency is one DMAC drain cycle (~2 s at Ch254 cadence)
|
||
// since the GS only repaints the heartbeat sprite once per drain.
|
||
// Sim TBs default these to 0 to keep regression byte-identical
|
||
// (tb_ch171 exercises the four combinations explicitly).
|
||
input logic joy_a_pressed_i,
|
||
input logic joy_b_pressed_i,
|
||
|
||
// Ch318 — PSMCT16 tile-FLUSH stream exposed for an external LPDDR AXI writer
|
||
// (gs_lpddr_axi_master, instantiated in the de25 top). These mirror the internal
|
||
// flush emit; unused (left open) by every existing consumer. flush_psm lets the
|
||
// writer gate on PSMCT16 (0x02).
|
||
output logic flush_emit_o,
|
||
output logic [31:0] flush_addr_o, // linear FB byte address
|
||
output logic [15:0] flush_pix16_o,
|
||
output logic [31:0] flush_color32_o, // Ch323 — full 32-bit flushed color (for spill round-trip capture)
|
||
output logic [5:0] flush_psm_o,
|
||
// Ch323 — tile Z-FLUSH stream (TILE_SPILL_ENABLE; the de25 routes it to an LPDDR Z-backing
|
||
// writer). z_flush_addr_o is Z-backing-relative (pixel_index*4, 32-bit Z). Inert/0 unless
|
||
// spilling; unused (left open) by every existing consumer.
|
||
output logic z_flush_emit_o,
|
||
output logic [31:0] z_flush_addr_o,
|
||
output logic [31:0] z_flush_data_o,
|
||
// Ch323 — DEDICATED color-flush spill stream (TP_FLUSH only; the de25 feeds the color writer
|
||
// from THIS, not the generic flush_emit_o which also carries RENDER-phase raster emits).
|
||
output logic tile_color_flush_emit_o,
|
||
output logic [31:0] tile_color_flush_addr_o,
|
||
output logic [31:0] tile_color_flush_data_o,
|
||
// Ch323 — tile RELOAD staging interface (TILE_SPILL_ENABLE; de25 attaches gs_tile_reload).
|
||
// reload_start_o arms the staging fill; tile_reload_raddr_o sweeps tile indices; the engine
|
||
// returns color/Z (1-cyc) on tile_reload_color_i/z_i with tile_reload_ready_i = warm.
|
||
output logic reload_start_o,
|
||
output logic [7:0] tile_reload_raddr_o,
|
||
output logic [29:0] reload_base_o, // Ch324 — current tile's raster-FB byte offset
|
||
input logic tile_reload_ready_i,
|
||
input logic [31:0] tile_reload_color_i,
|
||
input logic [31:0] tile_reload_z_i,
|
||
output logic [2:0] tile_phase_o, // Ch323 diag — current tile phase (de25 event counters)
|
||
|
||
// Ch322 — texel-fetch request exposed for an external prefilled texture cache
|
||
// (gs_texture_cache in the de25 top). The cache returns the texel on
|
||
// tex_cache_data_i (1-cycle registered, matching read2) and asserts
|
||
// tex_cache_ready_i once warm. Unused (left open / tied 0) by every existing
|
||
// consumer; only meaningful under GS_LPDDR_TEX=1.
|
||
output logic gs_tex_rd_en_o,
|
||
output logic [31:0] gs_tex_rd_addr_o,
|
||
input logic [31:0] tex_cache_data_i,
|
||
input logic tex_cache_ready_i,
|
||
// Ch322 — texel-source proof counters (design_clk; reset with the core/rst_n, so each
|
||
// render's counts are fresh). cache_hits = texel reads served from the LPDDR cache;
|
||
// bram_hits = texel reads served from BRAM (fallback). After a render, cache_hits>0 is
|
||
// the bridge-visible proof the triangle's texels came from LPDDR. Tied 0 unless GS_LPDDR_TEX.
|
||
output logic [31:0] tex_cache_hits_o,
|
||
output logic [31:0] tex_bram_hits_o,
|
||
|
||
// Ch330 Brick 4 — HPS/bridge runtime command-list feeder interface. The bridge
|
||
// writes the staging RAM one 64-bit record word at a time ({we,waddr,wdata}) and
|
||
// pulses feeder_go to retrigger; the feeder exposes ready + the records/wait
|
||
// counters back. Tied off / left open by non-feeder profiles (FEEDER_ENABLE=0 ->
|
||
// g_no_feeder doesn't use them), so default builds are byte-identical.
|
||
input logic feeder_stg_we_i,
|
||
input logic [11:0] feeder_stg_waddr_i,
|
||
input logic [63:0] feeder_stg_wdata_i,
|
||
input logic feeder_go_i, // retrigger pulse (honoured only in C_READY)
|
||
output logic feeder_ready_o, // control FSM is in C_READY (a new list may start)
|
||
output logic [15:0] feeder_records_o, // primitives emitted by the current list
|
||
output logic [31:0] feeder_waits_o // cycles the feeder paused under fifo_full
|
||
);
|
||
|
||
// Byte-address widths of the BIOS ROM and the EE RAM, derived from their
// configured sizes.
localparam int BIOS_ADDR_W = $clog2(BIOS_SIZE_BYTES),
               RAM_ADDR_W  = $clog2(RAM_SIZE_BYTES);
|
||
|
||
// ---------------------------------------------------------------------
|
||
// ee_ram_stub — DMAC-side GIF payload
|
||
// ---------------------------------------------------------------------
|
||
// Read channel into ee_ram_stub: enable + byte address towards the RAM,
// 128-bit data + valid coming back.
logic                  ram_rd_en;
logic                  ram_rd_valid;
logic [RAM_ADDR_W-1:0] ram_rd_addr;
logic [127:0]          ram_rd_data;

// This top has no TB-direct write path: the wr_* inputs of ee_ram_stub are
// tied off, and the ram_wr_* outputs of ee_memory_map_stub are left
// unconnected (Ch251 fitter-rescue — see the comment on u_ram). The
// heartbeat snoop further down taps the EE core's already-routed map_wr_*
// wires (`ee_cpu_wr_*`) instead.
//
// master_id handed to ee_ram_stub: 1 while a read is requested, else 0.
logic [7:0] ram_master_id;
assign ram_master_id = ram_rd_en ? 8'h01 : 8'h00;
|
||
|
||
// Ch251 fitter-rescue: keep ee_ram_stub.wr_* tied off so the
|
||
// synthesizer keeps inferring it as a ROM (the pre-Ch251 fit
|
||
// that the Agilex 5 M20K budget had headroom for). Enabling the
|
||
// live write port — the natural way to let the bootlet update
|
||
// the heartbeat RGBAQ qword — turned the 128-bit-wide × 16-byte-
|
||
// enable backing into a heavy distributed memory + per-byte M20K
|
||
// split, blowing the budget by ~160 blocks even with explicit
|
||
// `ramstyle = "M20K"`. The Ch251.3 "patch register + DMAC-read
|
||
// splicer" below sidesteps that: the EE's SW to 0x8000_0730
|
||
// captures into a 32-bit register; when the DMAC fetches qword
|
||
// 115 for the heartbeat SPRITE, the register's value is muxed
|
||
// into the low 32 bits of the read response. ee_ram_stub stays
|
||
// a cheap ROM, the bootlet still gets the live RGBAQ update.
|
||
// GIF payload memory. Only the read port is live; the write port is tied to
// constants and every ev_* event output is left unconnected.
ee_ram_stub #(
    .IMAGE_FILE (`TOP_PSMCT32_RASTER_DEMO_PAYLOAD_IMAGE_FILE),
    .SIZE_BYTES (RAM_SIZE_BYTES)
) u_ram (
    .clk       (clk),
    .rst_n     (rst_n),

    // Read port
    .rd_en     (ram_rd_en),
    .rd_addr   (ram_rd_addr),
    .rd_data   (ram_rd_data),
    .rd_valid  (ram_rd_valid),

    // Write port — tied off
    .wr_en     (1'b0),
    .wr_addr   ('0),
    .wr_data   (128'd0),
    .wr_be     (16'd0),

    .master_id (ram_master_id),

    // Trace/event outputs — unused here
    .ev_valid  (),
    .ev_subsys (),
    .ev_event  (),
    .ev_arg0   (),
    .ev_arg1   (),
    .ev_arg2   (),
    .ev_arg3   (),
    .ev_flags  ()
);
|
||
|
||
// ---------------------------------------------------------------------
|
||
// Ch251.3 — Heartbeat RGBAQ patch register (M20K-budget-safe).
|
||
//
|
||
// The Ch251 animated bootlet does `SW r5, 0(r3)` with r3 =
|
||
// 0x8000_0730 to alternate the heartbeat SPRITE's RGBAQ between
|
||
// CYAN (0xFFFFFF00) and RED (0xFF0000FF). 0x8000_0730 maps to
|
||
// ee_ram_stub byte offset 0x730 = qword 115 byte 0, the low
|
||
// 32 bits of the 17th SPRITE's RGBAQ A+D packet.
|
||
//
|
||
// **M20K-budget-safe capture path.** We snoop directly from
|
||
// `ee_cpu_wr_*` (the EE core's output, already routed into
|
||
// ee_memory_map_stub via the wires above) rather than from
|
||
// ee_memory_map_stub's `ram_wr_*` output bundle. Consuming the
|
||
// latter ran the design past the Agilex 5's 358-M20K budget
|
||
// even though the bundle is purely combinational — Quartus's
|
||
// inference around the 128-bit lane-expansion + bridge-mux
|
||
// path was substantially heavier than expected. ee_cpu_wr_*
|
||
// was already a live consumer of those signals (going into
|
||
// ee_memory_map_stub.ee_wr_*) so adding a parallel snoop is
|
||
// free.
|
||
//
|
||
// The address-decode below recognizes the heartbeat as any SW
|
||
// (full-word BE) whose physical address (low 29 bits, kseg0
|
||
// bit stripped) is exactly 0x730. The bootlet uses kseg0
|
||
// (0x8000_0730) but useg (0x0000_0730) would map to the same
|
||
// RAM location too, so we mask off the segment bit.
|
||
//
|
||
// Splice path: ee_ram_stub.rd_data has a 1-cycle latency. We
|
||
// register the "is-heartbeat-qword" decoded read address one
|
||
// cycle so the splice mux fires on the same cycle that
|
||
// ram_rd_data presents qword 115.
|
||
// ---------------------------------------------------------------------
|
||
// qword index of the heartbeat RGBAQ word: byte offset 0x730, 16 bytes per
// qword -> qword 115. The constant is RAM_ADDR_W-4 bits wide so it lines up
// with ram_rd_addr[RAM_ADDR_W-1:4].
localparam logic [RAM_ADDR_W-5:0] HEARTBEAT_QW_INDEX = (25'h0000_0730) >> 4;

// 32-bit patch register holding the most recent heartbeat RGBAQ value.
logic [31:0] hb_rgbaq_reg;

// Capture condition: an EE write with all four byte enables set whose low
// 29 address bits equal 0x730 (segment bits above bit 28 are ignored).
wire hb_write_hit = (ee_cpu_wr_be == 4'b1111)
                 && (ee_cpu_wr_addr[28:0] == 29'h0000_0730)
                 && ee_cpu_wr_en;

always_ff @(posedge clk or negedge rst_n) begin
    // Reset value is CYAN, matching bake.py's initial heartbeat RGBAQ. The
    // first DMAC kick would paint CYAN even without the patch (the
    // $readmemh image in ee_ram_stub already holds it at qword 115); using
    // the same value here only avoids a reset-window artifact.
    if (!rst_n)            hb_rgbaq_reg <= 32'hFFFF_FF00;
    else if (hb_write_hit) hb_rgbaq_reg <= ee_cpu_wr_data;
end
|
||
|
||
// Ch255 — controller-driven override layer ahead of the splicer.
|
||
// The EE keeps animating hb_rgbaq_reg in the background; this mux
|
||
// only changes what the splicer INJECTS into the DMAC read response
|
||
// when the GS is repainting the heartbeat sprite. Priority order:
|
||
//
|
||
// both buttons : invert EE's current value (XOR 0x00FFFFFF)
|
||
// A only : force RED (0xFF0000FF)
|
||
// B only : force CYAN (0xFFFFFF00)
|
||
// neither : hb_rgbaq_reg (EE pass-through)
|
||
//
|
||
// Pure combinational — joy_*_pressed_i comes from the bridge
|
||
// input_p1_raw_o (already design_clk-synced) and is stable for
|
||
// millions of cycles relative to a human button press. The next
|
||
// DMAC drain captures whatever value this mux outputs.
|
||
// Override constants: the two forced colors and the mask used to invert
// the low 24 bits of the EE-provided value.
localparam logic [31:0] HB_OVERRIDE_CYAN = 32'hFFFF_FF00;
localparam logic [31:0] HB_OVERRIDE_RED  = 32'hFF00_00FF;
localparam logic [31:0] HB_OVERRIDE_XOR  = 32'h00FF_FFFF;

// Both buttons held -> inverted EE value wins over the single-button cases.
wire        hb_both_pressed = joy_a_pressed_i && joy_b_pressed_i;
wire [31:0] hb_rgbaq_flipped = hb_rgbaq_reg ^ HB_OVERRIDE_XOR;

// Single-button (or no-button) selection: A forces RED, otherwise B forces
// CYAN, otherwise the EE's register value passes through.
wire [31:0] hb_rgbaq_single = joy_a_pressed_i ? HB_OVERRIDE_RED
                            : (joy_b_pressed_i ? HB_OVERRIDE_CYAN
                                               : hb_rgbaq_reg);

// Value the read splicer injects into the DMAC read response.
wire [31:0] hb_rgbaq_effective = hb_both_pressed ? hb_rgbaq_flipped
                                                 : hb_rgbaq_single;
|
||
|
||
// Read splicer. ram_rd_data is the registered output of ee_ram_stub, so the
// "qword 115" address match is delayed by one cycle to line up with the data
// it qualifies.
wire hb_read_hit = HEARTBEAT_SPLICE_ENABLE   // Ch329 — off for non-heartbeat profiles
                && ram_rd_en
                && (ram_rd_addr[RAM_ADDR_W-1:4] == HEARTBEAT_QW_INDEX);

logic hb_read_hit_d;
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        hb_read_hit_d <= 1'b0;
    end else begin
        hb_read_hit_d <= hb_read_hit;
    end
end

// Only the low 32-bit word of the qword is ever replaced; the upper 96 bits
// always pass straight through from RAM.
wire [31:0]  hb_spliced_word     = hb_read_hit_d ? hb_rgbaq_effective
                                                 : ram_rd_data[31:0];
wire [127:0] ram_rd_data_patched = {ram_rd_data[127:32], hb_spliced_word};
// ---------------------------------------------------------------------
// bios_rom_stub — EE bootlet at 0xBFC0_0000
// ---------------------------------------------------------------------
logic                   bios_rd_en, bios_rd_valid;
logic [21:0]            bios_rd_addr_full;   // address as issued by u_map
logic [BIOS_ADDR_W-1:0] bios_rd_addr;        // truncated to the ROM's width
logic [31:0]            bios_rd_data;

// The ROM only sees the low BIOS_ADDR_W bits of the memory map's address.
assign bios_rd_addr = bios_rd_addr_full[BIOS_ADDR_W-1:0];

bios_rom_stub #(
    .SIZE_BYTES (BIOS_SIZE_BYTES),
    .IMAGE_FILE (`TOP_PSMCT32_RASTER_DEMO_BIOS_IMAGE_FILE)
) u_bios (
    .clk      (clk),
    .rst_n    (rst_n),
    .rd_en    (bios_rd_en),
    .rd_addr  (bios_rd_addr),
    .rd_data  (bios_rd_data),
    .rd_valid (bios_rd_valid),
    // Event/trace outputs are left unconnected in this wrapper.
    .ev_valid (),
    .ev_subsys(),
    .ev_event (),
    .ev_arg0  (),
    .ev_arg1  (),
    .ev_arg2  (),
    .ev_arg3  (),
    .ev_flags ()
);
// ---------------------------------------------------------------------
// dmac_reg_stub — channel-2 NORMAL transfer
// ---------------------------------------------------------------------

// Register-write bus (driven by u_map's ee_dmac_ch2_wr_* outputs).
logic         dmac_reg_wr_en;
logic [7:0]   dmac_reg_offset;
logic [31:0]  dmac_reg_wr_data;

// Memory read request issued by the DMAC, and the response routed back to it.
logic         dmac_mem_rd_en;
logic [31:0]  dmac_mem_rd_addr;
logic [127:0] map_to_dmac_rd_data;
logic         map_to_dmac_rd_valid;

// Qword stream towards the GIF unpacker (valid/ready handshake with `last`).
logic         dmac_gif_valid, dmac_gif_last, dmac_gif_ready;
logic [127:0] dmac_gif_data;

// Event taps; only valid/subsys/event are brought out of the instance.
logic         dmac_ev_valid;
subsys_e      dmac_ev_subsys;
event_e       dmac_ev_event;

dmac_reg_stub u_dmac (
    .clk             (clk),
    .rst_n           (rst_n),
    // register interface — write-only here, the read side is tied off
    .reg_wr_en       (dmac_reg_wr_en),
    .reg_offset      (dmac_reg_offset),
    .reg_wr_data     (dmac_reg_wr_data),
    .reg_rd_en       (1'b0),
    .reg_rd_data     (),
    .reg_rd_valid    (),
    // memory read port
    .mem_rd_en       (dmac_mem_rd_en),
    .mem_rd_addr     (dmac_mem_rd_addr),
    .mem_rd_data     (map_to_dmac_rd_data),
    .mem_rd_valid    (map_to_dmac_rd_valid),
    // endpoint stream
    .ep_valid        (dmac_gif_valid),
    .ep_data         (dmac_gif_data),
    .ep_last         (dmac_gif_last),
    .ep_ready        (dmac_gif_ready),
    .irq_completion_o(),
    // events
    .ev_valid        (dmac_ev_valid),
    .ev_subsys       (dmac_ev_subsys),
    .ev_event        (dmac_ev_event),
    .ev_arg0         (),
    .ev_arg1         (),
    .ev_arg2         (),
    .ev_arg3         (),
    .ev_flags        ()
);
// ---------------------------------------------------------------------
// ee_memory_map_stub — bus arbiter (USEG_SHADOW shrunk per Ch145)
// ---------------------------------------------------------------------

// EE core <-> memory map: read channel.
logic        ee_cpu_rd_en, ee_cpu_rd_valid;
logic [31:0] ee_cpu_rd_addr, ee_cpu_rd_data;

// EE core <-> memory map: write channel.
logic        ee_cpu_wr_en;
logic [31:0] ee_cpu_wr_addr, ee_cpu_wr_data;
logic [3:0]  ee_cpu_wr_be;

// Memory map -> GS privileged-register bridge.
logic        map_gs_priv_wr_en;
logic [15:0] map_gs_priv_wr_addr;
logic [31:0] map_gs_priv_wr_data;
logic [3:0]  map_gs_priv_wr_be;

// Memory map -> EE RAM read request (address narrowed to RAM_ADDR_W below).
logic        map_ram_rd_en;
logic [24:0] map_ram_rd_addr;

ee_memory_map_stub #(
    .USEG_SHADOW_WORDS_PARAM(USEG_SHADOW_WORDS_PARAM),
    .USEG_SHADOW_ENABLE     (USEG_SHADOW_ENABLE)
) u_map (
    .clk  (clk),
    .rst_n(rst_n),

    // EE core master
    .ee_rd_en   (ee_cpu_rd_en),
    .ee_rd_addr (ee_cpu_rd_addr),
    .ee_rd_data (ee_cpu_rd_data),
    .ee_rd_valid(ee_cpu_rd_valid),
    .ee_wr_en   (ee_cpu_wr_en),
    .ee_wr_addr (ee_cpu_wr_addr),
    .ee_wr_data (ee_cpu_wr_data),
    .ee_wr_be   (ee_cpu_wr_be),

    // DMAC master (read only)
    .dmac_rd_en   (dmac_mem_rd_en),
    .dmac_rd_addr (dmac_mem_rd_addr),
    .dmac_rd_data (map_to_dmac_rd_data),
    .dmac_rd_valid(map_to_dmac_rd_valid),

    // BIOS ROM slave
    .bios_rd_en   (bios_rd_en),
    .bios_rd_addr (bios_rd_addr_full),
    .bios_rd_data (bios_rd_data),
    .bios_rd_valid(bios_rd_valid),

    // EE RAM slave — read data arrives already heartbeat-spliced
    .ram_rd_en   (map_ram_rd_en),
    .ram_rd_addr (map_ram_rd_addr),
    .ram_rd_data (ram_rd_data_patched),
    .ram_rd_valid(ram_rd_valid),

    // Bridge write master is unused in this wrapper: tied inactive.
    .bridge_wr_en    (1'b0),
    .bridge_wr_addr  (32'd0),
    .bridge_wr_data  (128'd0),
    .bridge_wr_be    (16'd0),
    .bridge_master_id(8'd0),

    // Ch251 fitter-rescue: ee_memory_map_stub's ram_wr_* outputs stay
    // unconnected at the wrapper. Consuming them forced Quartus to
    // materialize the 128-bit lane-expansion + bridge-mux logic behind
    // those outputs — combinational alone, but the M20K usage cascaded past
    // the 358-block Agilex 5 budget once the wide data paths were wired up.
    // The heartbeat-patch register (hb_rgbaq_reg) snoops the EE's SW
    // directly from `ee_cpu_wr_*` (already-existing consumers) instead.
    .ram_wr_en    (),
    .ram_wr_addr  (),
    .ram_wr_data  (),
    .ram_wr_be    (),
    .ram_master_id(),

    // DMAC channel-2 registers: writes forwarded, reads tied off
    .ee_dmac_ch2_wr_en   (dmac_reg_wr_en),
    .ee_dmac_ch2_wr_addr (dmac_reg_offset),
    .ee_dmac_ch2_wr_data (dmac_reg_wr_data),
    .ee_dmac_ch2_rd_en   (),
    .ee_dmac_ch2_rd_addr (),
    .ee_dmac_ch2_rd_data (32'd0),
    .ee_dmac_ch2_rd_valid(1'b0),

    // INTC: not present in this wrapper
    .ee_intc_wr_en   (),
    .ee_intc_wr_addr (),
    .ee_intc_wr_data (),
    .ee_intc_rd_en   (),
    .ee_intc_rd_addr (),
    .ee_intc_rd_data (32'd0),
    .ee_intc_rd_valid(1'b0),

    // Misc MMIO: not present in this wrapper
    .ee_misc_mmio_wr_en   (),
    .ee_misc_mmio_wr_addr (),
    .ee_misc_mmio_wr_data (),
    .ee_misc_mmio_wr_be   (),
    .ee_misc_mmio_rd_en   (),
    .ee_misc_mmio_rd_addr (),
    .ee_misc_mmio_rd_data (32'd0),
    .ee_misc_mmio_rd_valid(1'b0),

    // BIU: not present in this wrapper
    .ee_biu_wr_en   (),
    .ee_biu_wr_addr (),
    .ee_biu_wr_data (),
    .ee_biu_wr_be   (),
    .ee_biu_rd_en   (),
    .ee_biu_rd_addr (),
    .ee_biu_rd_data (32'd0),
    .ee_biu_rd_valid(1'b0),

    // GS privileged-register writes
    .ee_gs_priv_wr_en  (map_gs_priv_wr_en),
    .ee_gs_priv_wr_addr(map_gs_priv_wr_addr),
    .ee_gs_priv_wr_data(map_gs_priv_wr_data),
    .ee_gs_priv_wr_be  (map_gs_priv_wr_be),

    // events (unused)
    .ev_valid (),
    .ev_subsys(),
    .ev_event (),
    .ev_arg0  (),
    .ev_arg1  (),
    .ev_arg2  (),
    .ev_arg3  (),
    .ev_flags ()
);

// Hand the RAM read request to the RAM-side signals, keeping only the
// address bits the RAM implements.
assign ram_rd_en   = map_ram_rd_en;
assign ram_rd_addr = map_ram_rd_addr[RAM_ADDR_W-1:0];
// ---------------------------------------------------------------------
// ee_core_stub
// ---------------------------------------------------------------------
logic [31:0] core_pc;     // program counter output of the core
logic        core_trap;   // trap indication output of the core

ee_core_stub #(
    .PC_RESET          (32'hBFC0_0000),   // reset vector = BIOS bootlet base
    .STRICT_UNSUPPORTED(1'b0),
    .STRIP_HW_DIVIDER  (STRIP_HW_DIVIDER)
) u_core (
    .clk  (clk),
    .rst_n(rst_n),
    .go_i (core_go),

    // memory-map master port
    .map_rd_en   (ee_cpu_rd_en),
    .map_rd_addr (ee_cpu_rd_addr),
    .map_rd_data (ee_cpu_rd_data),
    .map_rd_valid(ee_cpu_rd_valid),
    .map_wr_en   (ee_cpu_wr_en),
    .map_wr_addr (ee_cpu_wr_addr),
    .map_wr_data (ee_cpu_wr_data),
    .map_wr_be   (ee_cpu_wr_be),

    // the interrupt input is tied inactive in this wrapper
    .cpu_irq(1'b0),

    // status
    .halt_o      (core_halt),
    .pc_o        (core_pc),
    .trap_o      (core_trap),
    .trap_pc_o   (),
    .trap_instr_o(),

    // events (unused)
    .ev_valid (),
    .ev_subsys(),
    .ev_event (),
    .ev_arg0  (),
    .ev_arg1  (),
    .ev_arg2  (),
    .ev_arg3  (),
    .ev_flags ()
);
// ---------------------------------------------------------------------
// gif_packed_stub
// ---------------------------------------------------------------------

// Input-side ready back to the DMAC stream.
logic         gif_in_ready;

// Unpacked GIF register writes (one candidate source for gs_stub's gif_reg_*).
logic         gif_gif_reg_wr_en;
logic [7:0]   gif_gif_reg_num;
logic [63:0]  gif_gif_reg_data;

// IMAGE-mode payload stream towards gif_image_xfer_stub, with its ready.
logic         gif_image_data_valid, gif_image_data_last;
logic [127:0] gif_image_data;
logic         xfer_data_ready;

// Ch172 — raster FIFO full from gs_stub feeds gif_packed_stub's backpressure
// input. Declared here, consumed by the u_gif instance right below, and
// driven by u_gs further down.
logic         gs_raster_fifo_full;

gif_packed_stub #(
    .REAL_AD_REG_MAP(1'b1)
) u_gif (
    .clk  (clk),
    .rst_n(rst_n),

    // qword stream from the DMAC
    .in_valid(dmac_gif_valid),
    .in_data (dmac_gif_data),
    .in_last (dmac_gif_last),
    .in_ready(gif_in_ready),

    // IMAGE payload out
    .image_data_valid(gif_image_data_valid),
    .image_data      (gif_image_data),
    .image_data_last (gif_image_data_last),
    .image_data_ready(xfer_data_ready),

    // backpressure from the rasterizer FIFO
    .raster_fifo_full(gs_raster_fifo_full),

    // gs_wr_* outputs are not consumed in this wrapper
    .gs_wr_en  (),
    .gs_wr_addr(),
    .gs_wr_data(),

    // unpacked GIF register writes
    .gif_reg_wr_en(gif_gif_reg_wr_en),
    .gif_reg_num  (gif_gif_reg_num),
    .gif_reg_data (gif_gif_reg_data),

    // events (unused)
    .ev_valid (),
    .ev_subsys(),
    .ev_event (),
    .ev_arg0  (),
    .ev_arg1  (),
    .ev_arg2  (),
    .ev_arg3  (),
    .ev_flags ()
);
// ---------------------------------------------------------------------
// Ch330 — runtime primitive-list feeder with a PHASE OWNER on gif_reg_*.
// The bootlet's setup/upload/state writes (BITBLTBUF/TRXPOS/TRXREG/TRXDIR/
// TEX0/FRAME/ALPHA/TEST/ZBUF) are latched by gs_stub from the SAME gif_reg_*
// stream — so a blanket feeder mux from reset would mux out the
// texture-upload arming. Instead:
//   Phase 0 (feeder_owns_bus=0): the UNPACKER owns gif_reg_*; the bootlet
//            does texture upload + state setup, reaching gs_stub as before.
//   Handoff: dma_done_seen && !xfer_busy (setup + TRX/IMAGE upload complete).
//   Phase 1 (feeder_owns_bus=1): the FEEDER owns gif_reg_* and emits the prims.
// FEEDER_ENABLE=0 → feeder_owns_bus tied 0 → dead logic, byte-identical.
// ---------------------------------------------------------------------

// Feeder-sourced GIF register writes (second input of the gif_reg_* mux).
logic        feeder_gif_reg_wr_en;
logic [7:0]  feeder_gif_reg_num;
logic [63:0] feeder_gif_reg_data;

logic        feeder_owns_bus;    // mux select: feeder owns from the first handoff onward
logic        gs_raster_active;   // from gs_stub — raster pipeline active (dips between batches)
logic        gs_scene_busy;      // Ch337 — from gs_stub — whole multi-batch scene busy (no inter-batch dip)
logic [15:0] feeder_records;     // Ch330 observability counters
logic [31:0] feeder_waits;
logic        feeder_done_p;      // 1-cycle: feeder finished emitting the current list
logic        feeder_list_flush;  // Ch331 — end-of-list grid flush (delayed done)

generate
    if (FEEDER_ENABLE) begin : g_feeder
        // -------------------------------------------------------------
        // Staging list RAM, 1R1W: the feeder reads sequentially, the
        // bridge writes at runtime.
        // -------------------------------------------------------------
        localparam int STG_IDX_W = $clog2(FEEDER_STG_WORDS);

        logic [63:0] feeder_stg [0:FEEDER_STG_WORDS-1];
`ifdef FEEDER_STG_INIT_FILE
        // Ch330 Brick 4 — board power-up list (list A): bitstream-init the
        // staging RAM so the first feeder auto-run after setup draws a real
        // scene instead of uninitialized garbage. Synth/board-only — sim TBs
        // $readmemh the staging directly and never define this macro.
        initial $readmemh(`FEEDER_STG_INIT_FILE, feeder_stg);
`endif
        logic [11:0] fdr_rd_addr;
        logic [63:0] fdr_rd_data;

        always_ff @(posedge clk) begin
            // registered read: data follows the address by one cycle
            fdr_rd_data <= feeder_stg[fdr_rd_addr[STG_IDX_W-1:0]];
            if (feeder_stg_we_i) begin
                feeder_stg[feeder_stg_waddr_i[STG_IDX_W-1:0]] <= feeder_stg_wdata_i;
            end
        end

        // -------------------------------------------------------------
        // Control FSM enforcing the "known-empty" rule: a new list never
        // starts until the previous list's scene has actually RUN and
        // DRAINED (busy seen high, then low) — so list B can never append
        // onto list A's still-resident FIFO/grid state.
        // -------------------------------------------------------------
        localparam logic [1:0] C_SETUP = 2'd0;   // unpacker owns the bus
        localparam logic [1:0] C_RUN   = 2'd1;   // feeder emitting the list
        localparam logic [1:0] C_DRAIN = 2'd2;   // list emitted, scene draining
        localparam logic [1:0] C_READY = 2'd3;   // idle, retrigger allowed

        logic       fdr_start, fdr_busy, fdr_done;
        logic       grid_ran;   // sticky: scene busy was observed for the in-flight list
        logic [1:0] cst;

        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                cst       <= C_SETUP;
                fdr_start <= 1'b0;
                grid_ran  <= 1'b0;
            end else begin
                fdr_start <= 1'b0;   // default: start is a single-cycle pulse

                unique case (cst)
                    C_SETUP: begin
                        // bootlet setup + TRX/IMAGE upload done
                        if (dma_done_seen && !xfer_busy) begin
                            fdr_start <= 1'b1;
                            grid_ran  <= 1'b0;
                            cst       <= C_RUN;
                        end
                    end

                    C_RUN: begin
                        if (gs_scene_busy) grid_ran <= 1'b1;
                        if (fdr_done)      cst      <= C_DRAIN;
                    end

                    // Ch337 — gate on gs_scene_busy (NOT gs_raster_active): a
                    // >FIFO_DEPTH scene renders in multiple batches and
                    // raster_active DIPS between them, but the next batch's
                    // prims are still queued (gs_scene_busy stays high).
                    // Waiting on raster_active alone would reach C_READY
                    // mid-scene and let a retrigger race the last batch's
                    // render/flush. gs_scene_busy only clears once EVERY
                    // batch has rendered + flushed and the FIFO is empty.
                    C_DRAIN: begin
                        if (gs_scene_busy) grid_ran <= 1'b1;
                        // whole scene drained (uses grid_ran's registered value)
                        if (grid_ran && !gs_scene_busy) cst <= C_READY;
                    end

                    C_READY: begin
                        if (feeder_go_i) begin
                            fdr_start <= 1'b1;
                            grid_ran  <= 1'b0;
                            cst       <= C_RUN;
                        end
                    end
                endcase
            end
        end

        assign feeder_owns_bus = (cst != C_SETUP);   // feeder owns gif_reg_* from the first handoff onward
        assign feeder_ready_o  = (cst == C_READY);   // a new list may be retriggered
        assign feeder_done_p   = fdr_done;

        // -------------------------------------------------------------
        // Ch331 — end-of-list grid flush: fdr_done delayed a few cycles so
        // the LAST primitive's raster-FIFO push + gradient-pending are
        // guaranteed set before gs_stub latches the flush (gs_stub's
        // all_grad_done gate then holds the grid until every prim is
        // graded). The delay (vs a same-cycle done) closes the "done
        // arrives before the last prim commits" race.
        // -------------------------------------------------------------
        localparam int FLUSH_DLY = 4;   // shift-register depth in cycles
        logic [FLUSH_DLY-1:0] fdr_done_dly;

        always_ff @(posedge clk or negedge rst_n) begin
            if (!rst_n) begin
                fdr_done_dly <= '0;
            end else begin
                fdr_done_dly <= {fdr_done_dly[FLUSH_DLY-2:0], fdr_done};
            end
        end
        assign feeder_list_flush = fdr_done_dly[FLUSH_DLY-1];

        gs_prim_list_feeder #(
            .STG_ADDR_W(12)
        ) u_feeder (
            .clk             (clk),
            .rst_n           (rst_n),
            .start           (fdr_start),
            .busy            (fdr_busy),
            .done            (fdr_done),
            .records_emitted (feeder_records),
            .fifo_wait_cycles(feeder_waits),
            .stg_rd_addr     (fdr_rd_addr),
            .stg_rd_data     (fdr_rd_data),
            .fifo_full       (gs_raster_fifo_full),
            .gif_reg_wr_en   (feeder_gif_reg_wr_en),
            .gif_reg_num     (feeder_gif_reg_num),
            .gif_reg_data    (feeder_gif_reg_data)
        );
    end else begin : g_no_feeder
        // Feeder absent: every feeder-side signal is tied inactive, so the
        // gif_reg_* mux permanently selects the unpacker.
        assign feeder_owns_bus      = 1'b0;
        assign feeder_gif_reg_wr_en = 1'b0;
        assign feeder_gif_reg_num   = 8'd0;
        assign feeder_gif_reg_data  = 64'd0;
        assign feeder_records       = 16'd0;
        assign feeder_waits         = 32'd0;
        assign feeder_done_p        = 1'b0;
        assign feeder_list_flush    = 1'b0;
        assign feeder_ready_o       = 1'b0;
    end
endgenerate
// Ch330 Brick 4 — surface the feeder counters to the bridge (both profiles).
assign feeder_waits_o   = feeder_waits;
assign feeder_records_o = feeder_records;

// Phased gif_reg_* mux: the feeder drives gs_stub ONLY once it owns the bus
// (post-setup); until then the GIF unpacker does.
wire        gs_gif_reg_wr_en;
wire [7:0]  gs_gif_reg_num;
wire [63:0] gs_gif_reg_data;

assign gs_gif_reg_wr_en = feeder_owns_bus ? feeder_gif_reg_wr_en
                                          : gif_gif_reg_wr_en;
assign gs_gif_reg_num   = feeder_owns_bus ? feeder_gif_reg_num
                                          : gif_gif_reg_num;
assign gs_gif_reg_data  = feeder_owns_bus ? feeder_gif_reg_data
                                          : gif_gif_reg_data;

// DMAC ready follows gif_packed_stub's in_ready directly (Ch110 image-xfer
// backpressure propagates through gif_packed_stub).
assign dmac_gif_ready = gif_in_ready;
// ---------------------------------------------------------------------
// gs_stub — PSMCT32 raster, swizzled
// ---------------------------------------------------------------------

// Privileged-register write bus into gs_stub.
logic        priv_reg_wr_en;
logic [15:0] priv_reg_wr_addr;
logic [63:0] priv_reg_wr_data;

// Register-value outputs of gs_stub: display registers ...
logic [63:0] pmode_q;
logic [63:0] dispfb1_q;
logic [63:0] display1_q;
// ... and the transfer registers consumed by gif_image_xfer_stub.
logic [63:0] bitbltbuf_q;
logic [63:0] trxpos_q;
logic [63:0] trxreg_q;
logic [63:0] trxdir_q;
logic        trxdir_wr_q;

// Raster pixel emit stream out of gs_stub.
logic        raster_pixel_emit;
logic [63:0] raster_pixel_color_q;
logic [31:0] raster_pixel_fb_addr_q, raster_pixel_mask_q;
logic [3:0]  raster_pixel_be_q;
logic [5:0]  raster_pixel_psm_q;

// Ch318 — expose the flush stream for an external LPDDR AXI writer (de25 top).
assign flush_emit_o    = raster_pixel_emit;
assign flush_addr_o    = raster_pixel_fb_addr_q;
assign flush_psm_o     = raster_pixel_psm_q;
assign flush_pix16_o   = raster_pixel_color_q[15:0];
assign flush_color32_o = raster_pixel_color_q[31:0];   // Ch323 — full color for spill capture

// Ch295 — texture-sampler read port out of gs_stub. Wired to vram_bram_stub's
// SECOND read port (read2) below, MUXED with the PSMT4 RMW old-byte read. See
// the read2 arbitration block.
//
// TEX_RD_REGISTERED(1): vram_bram_stub.read2 is a 1-cycle REGISTERED (sync)
// read — unlike vram_stub's combinational read2 that the Brick-1 demo top
// uses. gs_stub therefore generates the texel address one pipeline stage
// earlier (S0 coords) so the registered data lands at the S1 stage, where the
// existing single S1->S2 texel register samples it. This keeps the emit-stage
// timing identical to the combinational-read variant.
logic        gs_tex_rd_en;
logic [31:0] gs_tex_rd_addr, gs_tex_rd_data;

// Ch296 — PSMT8 indexed-texture CLUT lookup. gs_stub fetches an 8-bit index
// through the texel read2 port, then `gs_clut_rd_idx` looks it up in
// clut_stub (the palette filled by clut_loader_stub at TEX0 commit) to
// produce the PSMCT32 texel color. clut_stub's second (combinational) read
// port returns it on `gs_clut_rd_data`.
logic [7:0]  gs_clut_rd_idx;
logic [31:0] gs_clut_rd_data;

// Ch296 — CLUT-load-busy (declared here, ahead of the gs_stub instance that
// consumes it as clut_load_busy to hold the FIFO pop while the VRAM->CLUT
// load runs). Driven by clut_loader_stub.load_busy below.
logic        clut_ld_busy;

// Ch296 — gs_stub TEX0_1 decode taps that feed clut_loader_stub: the 1-cycle
// commit pulse + CBP/CPSM/CSM/CSA/CLD that decide when (and from where) the
// VRAM→CLUT copy fires.
logic        gs_tex0_wr;
logic [13:0] gs_tex0_cbp;
logic [3:0]  gs_tex0_cpsm;
logic        gs_tex0_csm;
logic [4:0]  gs_tex0_csa;
logic [2:0]  gs_tex0_cld;

// Brick 2a — dest-framebuffer read port for alpha blending. Wired to
// vram_bram_stub.read2 below, arbitrated with the texel-fetch port (mutually
// exclusive: a flat blend never textures). Uses the 1-cycle registered read
// model (FB_RD_REGISTERED=1) to match the BRAM read2 latency.
logic        gs_fb_rd_en;
logic [31:0] gs_fb_rd_addr, gs_fb_rd_data;

// Brick 2b — Z-buffer stored-Z read port. Wired to vram_bram_stub.read2
// below, arbitrated with the texel-fetch + alpha dest-fb ports (mutually
// exclusive: a flat Z-tested sprite never textures and never blends). Uses
// the 1-cycle registered read model (Z_RD_REGISTERED=1) to match the BRAM
// read2 latency.
logic        gs_z_rd_en;
logic [31:0] gs_z_rd_addr, gs_z_rd_data;
gs_stub #(
    // --- swizzle selection (forwarded) ---
    .PSMCT32_SWIZZLE      (PSMCT32_SWIZZLE),
    .PSMT4_SWIZZLE        (PSMT4_SWIZZLE),
    .PSMT8_SWIZZLE        (PSMT8_SWIZZLE),
    // --- read-port models fixed by this wrapper (1-cycle registered BRAM reads) ---
    .TEX_RD_LATENCY       (1),
    .CLUT_STALL           (1'b1),                  // hold pop while VRAM->CLUT load runs
    .TEX_RD_REGISTERED    (1'b1),
    .FB_RD_REGISTERED     (1'b1),
    .Z_RD_REGISTERED      (1'b1),
    // --- feature parameters forwarded from the wrapper ---
    .PERSPECTIVE_CORRECT  (PERSPECTIVE_CORRECT),   // Ch301 — forwarded board param (default 0)
    .PERSP_RECIP_IDX_BITS (PERSP_RECIP_IDX_BITS),  // Ch351 — perspective reciprocal LUT width (far-W -> 11)
    .GRAD_DIV_CYCLES      (GRAD_DIV_CYCLES),       // Ch352 — triangle-setup divide settle cycles (board -> 4)
    .GRAD_SEQ_DIVIDER     (GRAD_SEQ_DIVIDER),      // Ch352 — sequential gradient divider (board -> 1)
    .SPRITE_TEX_ALPHA     (SPRITE_TEX_ALPHA),      // Ch344 — textured + source-over alpha SPRITE (default 0)
    .SPRITE_TEX_ALPHA_CLUT(SPRITE_TEX_ALPHA_CLUT), // Ch347 — PSMT8 CLUT into the alpha-sprite path (default 0)
    .COMBINED_TAZ         (COMBINED_TAZ),          // Ch302 — combined tex+alpha+depth probe (default 0)
    .TILE_LOCAL           (TILE_LOCAL),            // Ch303 — on-chip tile color+Z render (default 0)
    .TILE_COLS            (TILE_COLS),             // Ch304 — tile grid cols (default 1)
    .TILE_ROWS            (TILE_ROWS),             // Ch304 — tile grid rows (default 1)
    .TILE_MULTIPRIM       (TILE_MULTIPRIM),        // Ch305 — render a primitive LIST per tile (default 0)
    .TILE_PRIM_COUNT      (TILE_PRIM_COUNT),       // Ch305 — primitives in the batch (default 1)
    .TILE_FIFO_DEPTH      (TILE_FIFO_DEPTH),       // Ch315 — prim FIFO / bin depth (capacity, default 4)
    .SCISSOR_ENABLE       (SCISSOR_ENABLE),        // Ch306 — SCISSOR_1 rect clip in tile walker (default 0)
    .TEX_WRAP_ENABLE      (TEX_WRAP_ENABLE),       // Ch307 — texture wrap/clamp in sampler (default 0)
    .TILE_COLOR_PSMCT16   (TILE_COLOR_PSMCT16),    // Ch308 — PSMCT16 tile color buffer (default 0)
    .ALPHA_MODES_ENABLE   (ALPHA_MODES_ENABLE),    // Ch309 — generic ALPHA blend modes (default 0)
    .BILINEAR_ENABLE      (BILINEAR_ENABLE),       // Ch310 — bilinear filtering in combined path (default 0)
    .PALETTE_BILINEAR     (PALETTE_BILINEAR),      // Ch314 — bilinear for PSMT8/PSMT4 indexed textures (default 0)
    .BIN_BUFFER_ENABLE    (BIN_BUFFER_ENABLE),     // Ch311 — per-tile bin buffer (default 0)
    .TILE_SPILL_ENABLE    (TILE_SPILL_ENABLE),     // Ch323 — tile color+Z spill/reload (default 0)
    .SPILL_FORCE_VALID    (SPILL_FORCE_VALID),     // Ch323 — test hook (negative bootstrap test)
    // --- modes that simply track FEEDER_ENABLE ---
    .MP_FLUSH_ONLY        (FEEDER_ENABLE),         // Ch331 — feeder mode: grid fires on end-of-list flush
    .TILE_ACCUM_ENABLE    (FEEDER_ENABLE),         // Ch336 — >FIFO_DEPTH framebuffer accumulation (feeder mode)
    .TILE_Z_PERSIST       (FEEDER_ENABLE)          // Ch338 — persistent cross-batch Z (feeder mode)
) u_gs (
    .clk  (clk),
    .rst_n(rst_n),

    // privileged-register writes
    .reg_wr_en  (priv_reg_wr_en),
    .reg_wr_addr(priv_reg_wr_addr),
    .reg_wr_data(priv_reg_wr_data),

    // Ch330: gif_reg_* is muxed (feeder vs unpacker)
    .gif_reg_wr_en(gs_gif_reg_wr_en),
    .gif_reg_num  (gs_gif_reg_num),
    .gif_reg_data (gs_gif_reg_data),
    .prim_list_flush_i(feeder_list_flush),   // Ch331 — feeder end-of-list grid flush (0 in non-feeder)

    // background color outputs — unused
    .bg_r(),
    .bg_g(),
    .bg_b(),

    // register taps consumed by the wrapper
    .pmode_q    (pmode_q),
    .dispfb1_q  (dispfb1_q),
    .display1_q (display1_q),
    .bitbltbuf_q(bitbltbuf_q),
    .trxpos_q   (trxpos_q),
    .trxreg_q   (trxreg_q),
    .trxdir_q   (trxdir_q),
    .trxdir_wr_q(trxdir_wr_q),

    // TEX0_1 decode taps feeding the CLUT loader
    .tex0_1_q     (),
    .tex0_1_cbp_q (gs_tex0_cbp),
    .tex0_1_cpsm_q(gs_tex0_cpsm),
    .tex0_1_csm_q (gs_tex0_csm),
    .tex0_1_csa_q (gs_tex0_csa),
    .tex0_1_cld_q (gs_tex0_cld),
    .tex0_1_wr_q  (gs_tex0_wr),

    // register taps left unconnected
    .prim_q   (),
    .rgbaq_q  (),
    .xyz2_q   (),
    .xyzf2_q  (),
    .frame_1_q(),
    .zbuf_1_q (),

    // primitive-assembly observability — unused
    .prim_complete          (),
    .prim_complete_count    (),
    .prim_v0_q              (),
    .prim_v1_q              (),
    .prim_v2_q              (),
    .prim_color_q           (),
    .prim_color_v0_q        (),
    .prim_color_v1_q        (),
    .prim_color_v2_q        (),
    .prim_v0_decoded_q      (),
    .prim_v1_decoded_q      (),
    .prim_v2_decoded_q      (),
    .prim_v0_color_decoded_q(),
    .prim_v1_color_decoded_q(),
    .prim_v2_color_decoded_q(),

    // pixel_* outputs — unused
    .pixel_emit      (),
    .pixel_emit_count(),
    .pixel_x_q       (),
    .pixel_y_q       (),
    .pixel_color_q   (),
    .pixel_fbp_q     (),
    .pixel_fbw_q     (),
    .pixel_psm_q     (),
    .pixel_fb_addr_q (),

    // raster pixel emit stream
    .raster_pixel_emit      (raster_pixel_emit),
    .raster_pixel_emit_count(),
    .raster_pixel_x_q       (),
    .raster_pixel_y_q       (),
    .raster_pixel_color_q   (raster_pixel_color_q),
    .raster_pixel_fb_addr_q (raster_pixel_fb_addr_q),
    .raster_pixel_be_q      (raster_pixel_be_q),
    .raster_pixel_mask_q    (raster_pixel_mask_q),
    .raster_pixel_psm_q     (raster_pixel_psm_q),

    // Ch323 — tile Z-flush stream (de25 Z-writer)
    .z_flush_emit_o(z_flush_emit_o),
    .z_flush_addr_o(z_flush_addr_o),
    .z_flush_data_o(z_flush_data_o),

    // Ch323 — dedicated color-flush spill stream
    .tile_color_flush_emit_o(tile_color_flush_emit_o),
    .tile_color_flush_addr_o(tile_color_flush_addr_o),
    .tile_color_flush_data_o(tile_color_flush_data_o),

    // Ch323 — tile reload staging interface
    .reload_start_o     (reload_start_o),
    .tile_reload_raddr_o(tile_reload_raddr_o),
    .reload_base_o      (reload_base_o),     // Ch324 — per-tile raster-FB byte offset
    .tile_reload_ready_i(tile_reload_ready_i),
    // color reload width follows the tile color format: 16 bits when
    // TILE_COLOR_PSMCT16 is set, otherwise 32
    .tile_reload_color_i(tile_reload_color_i[(TILE_COLOR_PSMCT16?16:32)-1:0]),
    .tile_reload_z_i    (tile_reload_z_i),
    .tile_phase_o       (tile_phase_o),      // Ch323 diag — current tile phase

    // raster status
    .raster_active    (gs_raster_active),    // Ch330 — drives the feeder retrigger 'known-empty' gate
    .raster_scene_busy(gs_scene_busy),       // Ch337 — whole-scene drain gate (no inter-batch dip)
    .raster_overflow  (raster_overflow),
    .raster_fifo_full (gs_raster_fifo_full),
    .raster_degenerate(),

    // texel read port
    .tex_rd_en  (gs_tex_rd_en),
    .tex_rd_addr(gs_tex_rd_addr),
    .tex_rd_data(gs_tex_rd_data),

    // CLUT lookup
    .clut_rd_idx   (gs_clut_rd_idx),
    .clut_rd_data  (gs_clut_rd_data),
    .clut_load_busy(clut_ld_busy),           // hold pop while VRAM->CLUT load runs

    // destination-framebuffer read port
    .fb_rd_en  (gs_fb_rd_en),
    .fb_rd_addr(gs_fb_rd_addr),
    .fb_rd_data(gs_fb_rd_data),

    // stored-Z read port
    .z_rd_en  (gs_z_rd_en),
    .z_rd_addr(gs_z_rd_addr),
    .z_rd_data(gs_z_rd_data),

    // events (unused)
    .ev_valid (),
    .ev_subsys(),
    .ev_event (),
    .ev_arg0  (),
    .ev_arg1  (),
    .ev_arg2  (),
    .ev_arg3  (),
    .ev_flags ()
);
// ---------------------------------------------------------------------
// Ch317 — LPDDR-backed framebuffer write sink (tile-flush MIRROR). Fed by the
// PSMCT16 tile-FLUSH stream (emit + linear fb byte addr + pix16). Commits to
// an LPDDR-style model with per-row bursts + 4 KiB cap + bandwidth/over-
// underflow counters for write/readback proof. Scanout still comes from the
// on-chip BRAM FB; a later rung swaps this model for the real EMIF AXI master
// + LPDDR scanout. Gated to PSMCT16 flushes (psm==0x02) so it only fires in a
// full-PSMCT16-FB demo (TILE_COLOR_PSMCT16=1); harmless otherwise.
// ---------------------------------------------------------------------

// Status/counter outputs of the sink (all zero when the sink is not built).
logic [31:0] lpfb_bytes_written;
logic [31:0] lpfb_burst_count;
logic [31:0] lpfb_busy_cycles;
logic [31:0] lpfb_overflow_count;
logic [31:0] lpfb_underflow_count;
logic [15:0] lpfb_occ;

generate
    if (LPDDR_FB_ENABLE) begin : g_lpddr_fb
        // Only PSMCT16 raster emits are mirrored into the sink.
        wire lpfb_px_emit = raster_pixel_emit && (raster_pixel_psm_q == 6'h02);

        gs_lpddr_fb_writer #(
            .FB_BYTES       (LPDDR_FB_BYTES),
            .FIFO_DEPTH     (32),
            .MAX_BURST_BYTES(4096)
        ) u_lpddr_fb (
            .clk                 (clk),
            .rst_n               (rst_n),
            .enable              (1'b1),
            .px_emit             (lpfb_px_emit),
            .px_addr             (raster_pixel_fb_addr_q),
            .px_pix16            (raster_pixel_color_q[15:0]),
            .bytes_written       (lpfb_bytes_written),
            .burst_count         (lpfb_burst_count),
            .busy_cycles         (lpfb_busy_cycles),
            .fifo_overflow_count (lpfb_overflow_count),
            .fifo_underflow_count(lpfb_underflow_count),
            .fifo_occ            (lpfb_occ)
        );
    end else begin : g_no_lpddr_fb
        // default OFF — no fbmem / FIFO instantiated (pruned), status tied to 0.
        assign lpfb_bytes_written   = '0;
        assign lpfb_burst_count     = '0;
        assign lpfb_busy_cycles     = '0;
        assign lpfb_overflow_count  = '0;
        assign lpfb_underflow_count = '0;
        assign lpfb_occ             = '0;
    end
endgenerate
// ---------------------------------------------------------------------
// ee_gs_priv_bridge_stub
// ---------------------------------------------------------------------
// Takes the memory map's GS-privileged write port (32-bit data with byte
// enables) and produces the register-write bus that feeds gs_stub's
// reg_wr_* inputs (64-bit data).
ee_gs_priv_bridge_stub u_priv_bridge (
    .clk  (clk),
    .rst_n(rst_n),

    // EE side
    .ee_wr_en  (map_gs_priv_wr_en),
    .ee_wr_addr(map_gs_priv_wr_addr),
    .ee_wr_data(map_gs_priv_wr_data),
    .ee_wr_be  (map_gs_priv_wr_be),

    // GS side
    .gs_reg_wr_en  (priv_reg_wr_en),
    .gs_reg_wr_addr(priv_reg_wr_addr),
    .gs_reg_wr_data(priv_reg_wr_data)
);
// ---------------------------------------------------------------------
// gif_image_xfer_stub — idle in Ch123 (no TRXDIR/IMAGE), but instantiated
// for symmetry. The TRXDIR-driven Ch124 demo would turn it load-bearing.
// NOTE(review): the Ch330 feeder notes in this file describe the bootlet
// performing a TRX/IMAGE texture upload, so the "idle" wording here may be
// stale — confirm.
// ---------------------------------------------------------------------

// VRAM write request produced by the transfer engine (pre-normalization:
// byte-granular address plus a per-bit write mask).
logic        xfer_we;
logic [31:0] xfer_waddr, xfer_wdata, xfer_wmask;
logic [3:0]  xfer_wbe;
logic        xfer_busy;   // also selects the transfer engine in the VRAM writer mux

gif_image_xfer_stub #(
    .PSMCT32_SWIZZLE(PSMCT32_SWIZZLE),
    .PSMT4_SWIZZLE  (PSMT4_SWIZZLE),
    .PSMT8_SWIZZLE  (PSMT8_SWIZZLE)
) u_xfer (
    .clk  (clk),
    .rst_n(rst_n),

    // transfer registers latched by gs_stub
    .trxdir_wr_pulse(trxdir_wr_q),
    .trxdir         (trxdir_q),
    .bitbltbuf      (bitbltbuf_q),
    .trxpos         (trxpos_q),
    .trxreg         (trxreg_q),

    // IMAGE payload from gif_packed_stub
    .data_valid(gif_image_data_valid),
    .data_qword(gif_image_data),
    .data_last (gif_image_data_last),
    .data_ready(xfer_data_ready),

    // VRAM write request
    .vram_we   (xfer_we),
    .vram_waddr(xfer_waddr),
    .vram_wdata(xfer_wdata),
    .vram_wbe  (xfer_wbe),
    .vram_wmask(xfer_wmask),

    .busy(xfer_busy)
);
// ---------------------------------------------------------------------
// VRAM mux: xfer-OWNED when xfer.busy, raster-OWNED otherwise.
// (Sequenced: in Ch123 raster fills exclusively; xfer never fires. In a
// future TRXDIR variant the mux still works — payload upload finishes before
// raster starts.)
//
// Ch156: the legacy writer engines emit at byte-addressable granularity with
// per-bit `vram_wmask`. `vram_bram_stub` is word-aligned + byte-WE only. The
// writer engines' pre-normalize signals are muxed first, and the result is
// then run through `vram_normalize_pkg::normalize_write` to translate it into
// the BRAM contract. The PSM source is the raster-side `raster_pixel_psm_q`
// during raster emits and `bitbltbuf_q`'s DPSM field (bits [61:56]) during
// xfer emits — both match what the corresponding writer engine used to
// compute its emit shape.
// ---------------------------------------------------------------------
logic        vram_we_pre;
logic [31:0] vram_waddr_pre;
logic [31:0] vram_wdata_pre;
logic [31:0] vram_mask_pre;
logic [5:0]  vram_psm_pre;

always_comb begin
    // Ch326 — FB_LPDDR_ONLY gates the flush's BRAM-mirror write; texture
    // upload (xfer) still writes VRAM (vram_bram_stub). The dedicated LPDDR
    // color flush (tile_color_flush_emit_o) is unaffected.
    vram_we_pre    = xfer_busy ? xfer_we
                               : (raster_pixel_emit && !FB_LPDDR_ONLY);
    vram_waddr_pre = xfer_busy ? xfer_waddr         : raster_pixel_fb_addr_q;
    vram_wdata_pre = xfer_busy ? xfer_wdata         : raster_pixel_color_q[31:0];
    vram_mask_pre  = xfer_busy ? xfer_wmask         : raster_pixel_mask_q;
    vram_psm_pre   = xfer_busy ? bitbltbuf_q[61:56] : raster_pixel_psm_q;
end
// Ch157 PSMT4 RMW pipeline (replaces Ch156's hard-gate).
//
// PSMT4 packs 2 pixels per byte: the writer emits a 4-bit nibble pre-shifted
// into either the LOW or HIGH nibble of `vram_wdata_pre[7:0]`, with
// `vram_mask_pre[7:0]` set to 0x0F (low) or 0xF0 (high). To commit one nibble
// while preserving the other, vram_bram_stub (byte-WE only, no per-bit RMW)
// needs the FULL byte value spliced upstream — that is what
// `vram_normalize_pkg::normalize_write`'s PSMT4 branch does when handed the
// LIVE `old_byte` from `mem[byte_addr]`.
//
// The pipe drives `read2_addr = byte_addr` on the T4 emit cycle. One cycle
// later (`vram_bram_stub` registers reads with 1-cycle latency),
// `vram_read2_data` is mem[byte_addr] BEFORE any pending writes; `old_byte`
// is extracted from the `byte_addr[1:0]` lane, the new nibble is spliced in,
// and a full-byte write is driven at the same address — one cycle after the
// emit fired. Non-T4 emits skip the pipe entirely and write same-cycle
// through `vram_norm` (CT32/CT16/T8 normalize_write is pure-comb and doesn't
// need a read).
//
// **Forwarding hazard**: a PSMT4 SPRITE rasters adjacent pixels x=2k and
// x=2k+1 to the SAME byte_addr (low + high nibble). At cycle N+1 the pipe
// reads mem[byte_addr] for emit-2 while emit-1's write is firing in the SAME
// posedge. With separate always_ff blocks for write and read inside
// vram_bram_stub, the read sees the PRE-write value due to NBA semantics. So
// emit-1's just-computed `t4_prev_new_byte_q` is forwarded when `byte_addr`
// of the in-flight emit-2 matches the previous emit's `byte_addr`. This keeps
// the chain correct across any number of back-to-back same-byte writes —
// emit-N reads emit-(N-1)'s new_byte from the forward register, splices on
// top, and emit-(N+1) reads emit-N's new_byte from that same register.
logic       is_t4_emit;        // a write is firing this cycle and its PSM is PSMT4
logic       t4_nibble_hi;      // the emit targets the HIGH nibble (mask byte == 0xF0)
logic [3:0] t4_nibble_value;   // the 4-bit payload, taken from whichever nibble is targeted

always_comb begin
    is_t4_emit      = vram_we_pre
                   && (vram_psm_pre == vram_normalize_pkg::PSM_PSMT4);
    t4_nibble_hi    = (vram_mask_pre[7:0] == 8'hF0);
    t4_nibble_value = t4_nibble_hi ? vram_wdata_pre[7:4]
                                   : vram_wdata_pre[3:0];
end
// Ch157 writer-side normalization for the same-cycle formats
// (CT32 / CT16 / T8). The result is purely combinational. The two
// PSMT4-only arguments are tied off here because PSMT4 writes do not use
// this result: they are committed one cycle later by the RMW pipe, which
// splices against the read-back `old_byte`.
vram_normalize_pkg::norm_out_t vram_norm;

always_comb begin
    vram_norm = vram_normalize_pkg::normalize_write(
        vram_waddr_pre,   // byte address from the pre-normalization bus
        vram_psm_pre,     // pixel storage format of this write
        vram_wdata_pre,   // raw payload
        1'b0,             // nibble_hi — ignored by the non-T4 paths
        8'd0              // old_byte  — ignored by the non-T4 paths
    );
end

// RMW pipe, stage 1: a one-cycle-delayed copy of the emit-cycle signals.
// In the cycle these registers are valid, read2 is expected to be
// returning mem[byte_addr] for the same emit. Address and nibble fields
// are captured every cycle; only `t4_pipe_valid_q` qualifies them.
logic        t4_pipe_valid_q;      // previous cycle was a PSMT4 emit
logic [31:0] t4_pipe_addr_q;       // byte address of that emit
logic        t4_pipe_nibble_hi_q;  // which half of the byte it targets
logic [3:0]  t4_pipe_nibble_q;     // the nibble to splice in

always_ff @(posedge clk) begin
    if (!rst_n) begin
        // Synchronous, active-low reset.
        t4_pipe_valid_q     <= '0;
        t4_pipe_addr_q      <= '0;
        t4_pipe_nibble_hi_q <= '0;
        t4_pipe_nibble_q    <= '0;
    end else begin
        t4_pipe_nibble_q    <= t4_nibble_value;
        t4_pipe_nibble_hi_q <= t4_nibble_hi;
        t4_pipe_addr_q      <= vram_waddr_pre;
        t4_pipe_valid_q     <= is_t4_emit;
    end
end

// RMW pipe, stage 2 (forwarding): address and resulting byte of the RMW
// that completed in the previous cycle. Consulted when the next T4 emit
// lands on the same byte_addr, because the BRAM read for that emit was
// sampled before the previous write became visible.
logic        t4_prev_valid_q;
logic [31:0] t4_prev_addr_q;
logic [7:0]  t4_prev_new_byte_q;

// Pick the byte lane addressed by the stage-1 emit out of the 32-bit
// read-back word. All four selector values are covered, so no default
// arm is used.
logic [7:0] t4_read_byte_lane;
always_comb begin
    case (t4_pipe_addr_q[1:0])
        2'd3: t4_read_byte_lane = vram_read2_data[31:24];
        2'd2: t4_read_byte_lane = vram_read2_data[23:16];
        2'd1: t4_read_byte_lane = vram_read2_data[15: 8];
        2'd0: t4_read_byte_lane = vram_read2_data[ 7: 0];
    endcase
end

// `old_byte` actually used for the splice: the freshly read lane, unless
// the previous RMW hit the very same byte address — then its result is
// forwarded instead.
logic [7:0] t4_effective_old_byte;
always_comb begin
    t4_effective_old_byte =
        (!t4_prev_valid_q || (t4_prev_addr_q != t4_pipe_addr_q))
            ? t4_read_byte_lane
            : t4_prev_new_byte_q;
end

// Splice (same math as the PSMT4 branch of `normalize_write`): begin
// with the old byte and overwrite only the targeted half.
logic [7:0] t4_new_byte;
always_comb begin
    t4_new_byte = t4_effective_old_byte;
    if (t4_pipe_nibble_hi_q)
        t4_new_byte[7:4] = t4_pipe_nibble_q;
    else
        t4_new_byte[3:0] = t4_pipe_nibble_q;
end

// Remember this cycle's RMW so the next cycle can forward from it.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        t4_prev_valid_q    <= '0;
        t4_prev_addr_q     <= '0;
        t4_prev_new_byte_q <= '0;
    end else begin
        t4_prev_new_byte_q <= t4_new_byte;
        t4_prev_addr_q     <= t4_pipe_addr_q;
        t4_prev_valid_q    <= t4_pipe_valid_q;
    end
end

// RMW pipe, write side: the write targets the word containing the byte
// (low two address bits cleared), carries the spliced byte in the lane
// chosen by byte_addr[1:0], and enables exactly that one byte.
logic [31:0] t4_write_addr;
logic [31:0] t4_write_data;
logic [3:0]  t4_write_be;

assign t4_write_addr = {t4_pipe_addr_q[31:2], 2'b00};

always_comb begin
    case (t4_pipe_addr_q[1:0])
        2'd0: begin
            t4_write_be   = 4'b0001;
            t4_write_data = {24'd0, t4_new_byte};
        end
        2'd1: begin
            t4_write_be   = 4'b0010;
            t4_write_data = {16'd0, t4_new_byte, 8'd0};
        end
        2'd2: begin
            t4_write_be   = 4'b0100;
            t4_write_data = {8'd0, t4_new_byte, 16'd0};
        end
        2'd3: begin
            t4_write_be   = 4'b1000;
            t4_write_data = {t4_new_byte, 24'd0};
        end
    endcase
end

// Final VRAM write mux: either the delayed T4 RMW write or the
// same-cycle non-T4 write from `vram_norm`.
//
// Per the design notes, gs_stub keeps PSM constant within a raster, so a
// T4 pipe entry never coincides with a non-T4 emit from the raster path
// (the raster pipe drains before any FRAME_1 swap can re-arm a different
// PSM).
//
// Ch157 audit (Medium): `gif_image_xfer_stub` writes also arrive through
// `vram_we_pre` while `xfer_busy=1`, so "no T4-pipe / non-T4-emit
// overlap" is an invariant upheld by the surrounding flow, NOT by this
// wrapper. The mux gives the T4 pipe unconditional priority; if a caller
// ever breaks the invariant, the non-T4 write is silently dropped. The
// simulation-only check that immediately follows this mux reports such
// a collision via $error; synthesis sees only the mux (the check is
// bracketed by translate_off/_on).
//
// A T4 emit itself never writes in its own cycle (`!is_t4_emit` gate);
// its write appears one cycle later via `t4_pipe_valid_q`.
logic        vram_we_final;
logic [31:0] vram_waddr_final, vram_wdata_final;
logic [3:0]  vram_wbe_final;

assign vram_we_final    = t4_pipe_valid_q || (vram_we_pre && !is_t4_emit);
assign vram_waddr_final = t4_pipe_valid_q ? t4_write_addr
                                          : vram_norm.write_addr;
assign vram_wdata_final = t4_pipe_valid_q ? t4_write_data
                                          : vram_norm.write_data;
assign vram_wbe_final   = t4_pipe_valid_q ? t4_write_be
                                          : vram_norm.write_be;

// synthesis translate_off
// Simulation-only guard for the final write mux: outside reset, report
// any cycle in which the delayed T4 RMW write coincides with a non-T4
// write request, because the mux would drop the latter.
always_ff @(posedge clk) begin
    if (rst_n) begin
        if (t4_pipe_valid_q && vram_we_pre && !is_t4_emit) begin
            $error("Ch157: T4 RMW pipe write @%0t collides with non-T4 vram_we_pre (psm=0x%02h, addr=0x%08h); non-T4 write would be dropped — caller violated the no-overlap invariant",
                   $time, vram_psm_pre, vram_waddr_pre);
        end
    end
end
// synthesis translate_on

// ---------------------------------------------------------------------
// Ch296 — CLUT load engine + palette table for PSMT8 indexed textures.
//
// VRAM read-bus signals. They are declared at this point, ahead of the
// read2 arbitration mux, so that the CLUT loader instance can tap
// `vram_read2_data`. (`clut_ld_busy` is declared further up, next to the
// gs_clut_* signals, so the gs_stub instance can reference it.)
//
// NOTE(review): the PSMT4 lane-extraction logic earlier in this module
// reads `vram_read2_data` textually before this declaration. Tools that
// enforce declare-before-use strictly may reject that ordering —
// consider hoisting these declarations; confirm against the target
// toolchain.
// ---------------------------------------------------------------------
logic [31:0] vram_read2_addr, vram_read2_data;  // second read port (read2)
logic [31:0] vram_raddr,      vram_rdata;       // scanout read port

// Build-time choice of the VRAM port used for the CLUT load.
//
// A CSM1-only profile whose textures come from the external LPDDR cache
// does not need the replicated VRAM read2 array during rasterization. In
// that profile (CSM1 enabled, read2 disabled) the short boot-time CLUT
// load borrows the existing scanout read port instead. The load precedes
// raster, so briefly pausing scanout reads is harmless, and both VRAM
// ports have the same registered one-cycle latency.
localparam bit CLUT_LOAD_USE_READ0 = !VRAM_ENABLE_READ2 && CLUT_CSM1_ENABLE;
// ---------------------------------------------------------------------
// CLUT loader.
//
// Per the design notes, clut_loader_stub copies 256 PSMCT32 entries from
// VRAM[CBP*256] into clut_stub when a TEX0_1 commit (gs_tex0_wr) carries
// a load-enabling CLD/CSM/CPSM. This happens at TEX0 commit — BEFORE the
// raster scan of the textured primitive — so the loader's read2 use is
// time-disjoint from the texel fetch. That mutual exclusion is what the
// architecture relies on; the loader is the FIFTH read2 consumer and
// gets top priority in the read2 address mux.
//
// Latency note. The loader was written against a COMBINATIONAL VRAM
// read (data in the same cycle as the address), whereas vram_bram_stub
// returns data one cycle after the address. The word for entry N thus
// arrives one cycle AFTER the loader presents addr(N) with write_idx=N:
// the loader's clut_write_data (= its vram_read_data input) is ALREADY
// aligned to the late entry, while clut_write_en/idx are one cycle
// early. The realignment stage downstream delays ONLY en+idx by one
// cycle and pairs them with the loader's LIVE write_data.
// ---------------------------------------------------------------------
logic [31:0] clut_ld_rd_addr;    // VRAM address requested by the loader
logic        clut_ld_wr_en_c;    // loader write strobe (one cycle early)
logic [7:0]  clut_ld_wr_idx_c;   // loader write index  (one cycle early)
logic [31:0] clut_ld_wr_data_c;  // loader write data   (live, aligned)

clut_loader_stub #(
    .CLUT_CSM1_ENABLE (CLUT_CSM1_ENABLE)
) u_clut_loader (
    .clk             (clk),
    .rst_n           (rst_n),
    // TEX0 commit strobe and the CLUT-related TEX0 fields
    .tex0_wr_pulse   (gs_tex0_wr),
    .tex0_cbp        (gs_tex0_cbp),
    .tex0_cpsm       (gs_tex0_cpsm),
    .tex0_csm        (gs_tex0_csm),
    .tex0_csa        (gs_tex0_csa),
    .tex0_cld        (gs_tex0_cld),
    // VRAM read side: data comes from read port 0 or from read2,
    // depending on the build-time profile
    .vram_read_addr  (clut_ld_rd_addr),
    .vram_read_data  (CLUT_LOAD_USE_READ0 ? vram_rdata : vram_read2_data),
    // palette write side
    .clut_write_en   (clut_ld_wr_en_c),
    .clut_write_idx  (clut_ld_wr_idx_c),
    .clut_write_data (clut_ld_wr_data_c),
    .load_busy       (clut_ld_busy)
);

// One-cycle realignment of the CLUT write strobe and index. Only en+idx
// are delayed; the write data is used LIVE because it is the registered
// VRAM read, which already lags the loader's request by one cycle.
logic       clut_ld_wr_en_q;
logic [7:0] clut_ld_wr_idx_q;

always_ff @(posedge clk) begin
    if (!rst_n) begin
        clut_ld_wr_idx_q <= '0;
        clut_ld_wr_en_q  <= '0;
    end else begin
        clut_ld_wr_idx_q <= clut_ld_wr_idx_c;
        clut_ld_wr_en_q  <= clut_ld_wr_en_c;
    end
end

// Palette table. Written by the CLUT loader (delayed en/idx paired with
// live data) and read by the texture sampler.
clut_stub u_clut (
    .clk           (clk),
    .rst_n         (rst_n),
    // write port
    .write_en      (clut_ld_wr_en_q),
    .write_idx     (clut_ld_wr_idx_q),
    .write_data    (clut_ld_wr_data_c),  // LIVE registered-read data
    // pcrtc scanout read port — unused in this textured-board top
    // (PCRTC clut_enable=0), so the index is tied to 0 and the data is
    // left unconnected
    .read_idx      (8'd0),
    .read_data     (),
    // texture-sampler read port — drives the PSMT8 index lookup
    .tex_read_idx  (gs_clut_rd_idx),
    .tex_read_data (gs_clut_rd_data)
);

// ---------------------------------------------------------------------
// Ch295 — read2 (second VRAM read port) ARBITRATION.
//
// vram_bram_stub exposes ONE second read port. Five consumers share it
// through a static-priority mux. According to the design notes they are
// mutually exclusive, so the mux is collision-free; the simulation-only
// checks elsewhere in this module report any overlap (they must never
// fire).
//
//   CLUT load (Ch296, FIFTH consumer, TOP priority) — `clut_ld_busy`.
//       Runs at TEX0 commit, strictly BEFORE the raster scan, so it is
//       time-disjoint from the four raster-time consumers.
//
//   (A) PSMT4 RMW old-byte read — `is_t4_emit`. Fires only when the
//       FRAME buffer PSM == PSMT4 (the raster/xfer writer is committing
//       a 4-bit nibble and needs the live byte to splice). gs_stub holds
//       PSM constant within a raster. This is the load-bearing RMW path
//       for the PSMT4 production cases, hence its priority over B/C/D.
//
//   (B) Textured-SPRITE texel fetch — `gs_tex_rd_en`. Fires only for
//       SPRITE + TME + texture-PSM==PSMCT32 (see gs_stub s1_tex_active).
//       Such a SPRITE writes the FRAME buffer as PSMCT32, so
//       `is_t4_emit` is 0 for every pixel of it.
//
//   (C) Brick 2a, THIRD consumer — alpha-blend dest-fb read
//       (`gs_fb_rd_en` / `gs_fb_rd_addr`).
//       - vs (A): a flat alpha-blend SPRITE writes PSMCT32, so
//         is_t4_emit is 0 for every pixel of that primitive.
//       - vs (B): gs_stub sets fb_rd_en only for a FLAT (non-textured)
//         blended SPRITE and tex_rd_en only for a TEXTURED one —
//         gs_stub.new_abe_active requires !close_tme_effective.
//
//   (D) Brick 2b, FOURTH consumer — Z-buffer stored-Z read
//       (`gs_z_rd_en` / `gs_z_rd_addr`).
//       - vs (A): a flat Z-tested SPRITE writes PSMCT32, so is_t4_emit
//         is 0 for every pixel of that primitive.
//       - vs (B): z_rd_en is set only for a FLAT Z-tested SPRITE
//         (new_zte_active requires !close_tme_effective); tex_rd_en only
//         for TEXTURED.
//       - vs (C): new_zte_active requires !new_abe_active, so a
//         primitive is EITHER Z-tested OR alpha-blended, never both.
//
// Priority: CLUT load > T4 > texel > dest-fb > Z-read; address 0 when no
// consumer is active.
//
// Latency. read2 is a 1-cycle REGISTERED read. The T4 pipe presents its
// address on the emit cycle and consumes read2_data one cycle later.
// gs_stub is built with TEX_RD_REGISTERED=1, so it presents the texel
// address one stage early (S0) and consumes the registered data at S1 —
// the same 1-cycle round trip. Both therefore see the BRAM read latency
// they were designed against.
// ---------------------------------------------------------------------
assign vram_read2_addr =
      clut_ld_busy ? clut_ld_rd_addr
    : is_t4_emit   ? (vram_waddr_pre & ~32'd3)  // word containing the T4 byte
    : gs_tex_rd_en ? gs_tex_rd_addr
    : gs_fb_rd_en  ? gs_fb_rd_addr
    : gs_z_rd_en   ? gs_z_rd_addr
    :                32'd0;

// Fan-out of the registered read2 data. The texel fetch, the dest-fb
// read and the Z read all consume the SAME `vram_read2_data` (as does
// the PSMT4 lane extraction); by design none of them overlap.
//
// Ch322 — when GS_LPDDR_TEX is set, TEXEL data is taken from the
// external prefilled texture cache whenever the cache is warm
// (`tex_cache_ready_i`) and the byte address lies inside the cached
// range [TEX_VRAM_BASE, TEX_VRAM_BASE + TEX_CACHE_BYTES). The select is
// registered for ONE cycle so that it lines up with the cache's 1-cycle
// registered data and with vram_read2_data (both arrive the cycle after
// the address is presented) — the same timing as the BRAM texel path.
// fb/Z reads always come from BRAM; the cache only services textures.

// Texel request forwarded out of this module unchanged.
assign gs_tex_rd_en_o   = gs_tex_rd_en;
assign gs_tex_rd_addr_o = gs_tex_rd_addr;

generate
    if (GS_LPDDR_TEX) begin : g_lpddr_tex
        wire         tex_in_range;     // request address is inside the cached window
        logic        tex_cache_sel_q;  // registered: this texel is served by the cache
        logic        gs_tex_rd_en_d;   // registered copy of the texel read enable
        logic [31:0] cache_hits_q;     // texel reads served by the cache
        logic [31:0] bram_hits_q;      // texel reads served by BRAM (fallback)

        assign tex_in_range = (gs_tex_rd_addr >= TEX_VRAM_BASE)
                           && (gs_tex_rd_addr < (TEX_VRAM_BASE + TEX_CACHE_BYTES));

        always_ff @(posedge clk) begin
            if (!rst_n) begin
                tex_cache_sel_q <= 1'b0;
                gs_tex_rd_en_d  <= 1'b0;
                cache_hits_q    <= 32'd0;
                bram_hits_q     <= 32'd0;
            end else begin
                tex_cache_sel_q <= tex_cache_ready_i && gs_tex_rd_en && tex_in_range;
                gs_tex_rd_en_d  <= gs_tex_rd_en;
                // Count each served texel read by its source. sel_q and
                // en_d are both one cycle behind the same request, so
                // they describe the same read.
                if (tex_cache_sel_q)
                    cache_hits_q <= cache_hits_q + 32'd1;
                else if (gs_tex_rd_en_d)
                    bram_hits_q <= bram_hits_q + 32'd1;
            end
        end

        assign gs_tex_rd_data   = tex_cache_sel_q ? tex_cache_data_i
                                                  : vram_read2_data;
        assign tex_cache_hits_o = cache_hits_q;
        assign tex_bram_hits_o  = bram_hits_q;
    end else begin : g_no_lpddr_tex
        // No external cache: texels always come from BRAM and both hit
        // counters read as zero.
        assign gs_tex_rd_data   = vram_read2_data;
        assign tex_cache_hits_o = 32'd0;
        assign tex_bram_hits_o  = 32'd0;
    end
endgenerate

// dest-fb and Z reads are always served from BRAM read2.
assign gs_fb_rd_data = vram_read2_data;
assign gs_z_rd_data  = vram_read2_data;

// synthesis translate_off
// Simulation-only checks for the read2 arbitration: outside reset, any
// cycle with two simultaneously active read2 consumers is reported,
// because the static-priority mux serves only one of them.
always_ff @(posedge clk) begin
    if (rst_n) begin
        // (A) PSMT4 RMW vs (B) texel fetch
        if (is_t4_emit && gs_tex_rd_en) begin
            $error("Ch295: read2 arbitration overlap @%0t — PSMT4 RMW (is_t4_emit) and texel fetch (gs_tex_rd_en) both active; one read is being dropped. These are supposed to be mutually exclusive by pixel format.",
                   $time);
        end
        // (C) alpha dest-fb read vs (A)/(B)
        if (gs_fb_rd_en && (is_t4_emit || gs_tex_rd_en)) begin
            $error("Brick2a: read2 arbitration overlap @%0t — alpha dest-fb read (gs_fb_rd_en) collides with %s; one read is being dropped. Flat-blend must be mutually exclusive with both.",
                   $time, is_t4_emit ? "PSMT4 RMW" : "texel fetch");
        end
        // (D) Z-buffer read vs (A)/(B)/(C)
        if (gs_z_rd_en && (is_t4_emit || gs_tex_rd_en || gs_fb_rd_en)) begin
            $error("Brick2b: read2 arbitration overlap @%0t — Z-buffer read (gs_z_rd_en) collides with another consumer; one read is being dropped. Z-tested flat sprite must be mutually exclusive with T4/texel/alpha.",
                   $time);
        end
        // CLUT load vs any raster-time consumer
        if (clut_ld_busy && (is_t4_emit || gs_tex_rd_en || gs_fb_rd_en || gs_z_rd_en)) begin
            $error("Ch296: read2 arbitration overlap @%0t — CLUT load (clut_ld_busy) collides with a raster-time consumer; one read is being dropped. CLUT load must complete BEFORE the raster scan begins.",
                   $time);
        end
    end
end
// synthesis translate_on

// Ch320 — forward the PCRTC scanout address to the wrapper.
assign vram_read_addr_o = vram_raddr;

// Address for VRAM read port 0: normally the PCRTC scanout address; in
// the CLUT_LOAD_USE_READ0 profile the CLUT loader borrows the port while
// it is busy.
logic [31:0] vram_read0_addr;
always_comb begin
    vram_read0_addr = (CLUT_LOAD_USE_READ0 && clut_ld_busy) ? clut_ld_rd_addr
                                                           : vram_raddr;
end

// Ch157 — the VRAM itself. Writes arrive from the final write mux:
// normalize_write output for CT32/CT16/T8 (same cycle) or the T4 RMW
// pipe (one cycle delayed).
//
// On the read side, PCRTC's `vram_read_data` arrives one cycle later
// than with the legacy combinational read. gs_pcrtc_stub consumes it
// with VRAM_SYNC_READ=1 (Ch158): its data-decode + sync-output pipeline
// is shifted right by one cycle to line up with the BRAM's registered
// output, and its sub-word lane extract picks the right byte / halfword
// / nibble inside the 32-bit word for CT16/T8/T4 reads.
vram_bram_stub #(
    .BYTES        (VRAM_BYTES),
    .ENABLE_READ2 (VRAM_ENABLE_READ2)
) u_vram (
    .clk         (clk),
    .rst_n       (rst_n),
    // write port (byte-enable granularity)
    .write_en    (vram_we_final),
    .write_addr  (vram_waddr_final),
    .write_data  (vram_wdata_final),
    .write_be    (vram_wbe_final),
    // read port 0 — scanout, or the CLUT load in the read0 profile
    .read_addr   (vram_read0_addr),
    .read_data   (vram_rdata),
    .read_valid  (),
    // read port 2 — arbitrated among the read2 consumers
    .read2_addr  (vram_read2_addr),
    .read2_data  (vram_read2_data),
    .read2_valid ()
);

// ---------------------------------------------------------------------
// gs_pcrtc_stub — PSMCT32 swizzled scanout
// ---------------------------------------------------------------------
logic end_of_frame;  // one-cycle frame marker, derived from vsync

gs_pcrtc_stub #(
    // horizontal timing
    .H_ACTIVE            (H_ACTIVE),
    .H_FRONT             (H_FRONT),
    .H_SYNC              (H_SYNC),
    .H_BACK              (H_BACK),
    // vertical timing
    .V_ACTIVE            (V_ACTIVE),
    .V_FRONT             (V_FRONT),
    .V_SYNC              (V_SYNC),
    .V_BACK              (V_BACK),
    .PSMCT32_SWIZZLE     (PSMCT32_SWIZZLE),
    .VRAM_SYNC_READ      (1'b1),   // VRAM read data is registered (1-cycle)
    .STRIP_PCRTC_MAG_DIV (STRIP_PCRTC_MAG_DIV)
) u_pcrtc (
    .clk            (clk),
    .rst_n          (rst_n),
    // display registers
    .pmode_q        (pmode_q),
    .dispfb1_q      (dispfb1_q),
    .display1_q     (display1_q),
    // VRAM scanout read port (vram_raddr is also forwarded out of this
    // module as vram_read_addr_o by a separate continuous assign)
    .vram_read_addr (vram_raddr),
    .vram_read_data (vram_rdata),
    // CLUT lookup disabled for scanout in this top
    .clut_enable    (1'b0),
    .clut_csa       (5'd0),
    .clut_read_idx  (),
    .clut_read_data (32'd0),
    // video outputs
    .hsync          (hsync),
    .vsync          (vsync),
    .de             (de),
    .r              (r),
    .g              (g),
    .b              (b),
    .pix_window_o   (pix_window_o),
    // event outputs left unconnected
    .ev_valid       (),
    .ev_subsys      (),
    .ev_event       (),
    .ev_arg0        (),
    .ev_arg1        (),
    .ev_arg2        (),
    .ev_arg3        (),
    .ev_flags       ()
);

// gs_pcrtc_stub has no end_of_frame port (the Ch123 TB taps the internal
// signal through a hierarchical reference). This top builds an
// equivalent marker instead: a pulse in the cycle where vsync is high
// and was low one clock earlier.
logic vsync_d;  // vsync delayed by one clock

always_ff @(posedge clk) begin
    if (!rst_n) begin
        vsync_d <= 1'b0;
    end else begin
        vsync_d <= vsync;
    end
end

assign end_of_frame = !vsync_d && vsync;

// ---------------------------------------------------------------------
// Sticky status outputs: each flag is cleared by reset, set by the first
// occurrence of its event, and then stays set.
// ---------------------------------------------------------------------
logic dma_done_seen_q;  // an EV_DMA_DONE event has been observed
logic frame_seen_q;     // an end_of_frame pulse has been observed

always_ff @(posedge clk) begin
    if (!rst_n) begin
        frame_seen_q    <= 1'b0;
        dma_done_seen_q <= 1'b0;
    end else begin
        if (end_of_frame) begin
            frame_seen_q <= 1'b1;
        end
        if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE)) begin
            dma_done_seen_q <= 1'b1;
        end
    end
end

assign frame_seen    = frame_seen_q;
assign dma_done_seen = dma_done_seen_q;

// ---------------------------------------------------------------------
// Ch174 — event toggles for HPS-visible counters. Each bit flips once
// per event. They are kept separate from the sticky status latches so
// that the LED bits / CORE_STATUS retain their one-shot semantics.
// ---------------------------------------------------------------------
logic frame_toggle_q;     // flips on every end_of_frame pulse
logic dma_done_toggle_q;  // flips on every EV_DMA_DONE event

always_ff @(posedge clk) begin
    if (!rst_n) begin
        dma_done_toggle_q <= 1'b0;
        frame_toggle_q    <= 1'b0;
    end else begin
        if (dmac_ev_valid && (dmac_ev_event == EV_DMA_DONE)) begin
            dma_done_toggle_q <= ~dma_done_toggle_q;
        end
        if (end_of_frame) begin
            frame_toggle_q <= ~frame_toggle_q;
        end
    end
end

assign dma_done_toggle = dma_done_toggle_q;
assign frame_toggle    = frame_toggle_q;

endmodule : top_psmct32_raster_demo_bram
|