// retroDE_ps2 — gs_pcrtc_stub (Ch90)
//
// Minimal PCRTC (Programmable CRT Controller) scanout engine.
// Real PS2 PCRTC reads VRAM via a DISPFB (display framebuffer)
// configuration register and feeds the analog video DAC. This
// stub is the SCANOUT side of the GS pipeline — its dual is
// gs_stub, which is the WRITE side. Together they close the loop
// from `raster_pixel_emit` (Ch88) → vram_stub (Ch89) → visible
// pixels (Ch90).
//
// Architectural note. `platform_video_stub` is a flood-fill video
// adapter that always paints BGCOLOR within its active area —
// it predates VRAM persistence and stays as-is for back-compat.
// `gs_pcrtc_stub` is the SCANOUT-AWARE alternative, used by TBs
// that want to verify the round trip "gs_stub writes a pixel →
// vram_stub stores it → pcrtc reads it back as video." We did
// not extend platform_video_stub (which would have rippled
// through 6 existing TBs); pcrtc is a parallel module that owns
// its own raster timing AND vram read addressing, so a TB picks
// the one that fits.
//
// Scope:
//   - Single DISPFB context: pcrtc consumes `pmode_q` and
//     `dispfb1_q` directly from gs_stub's privileged CPU MMIO
//     latches (Ch91). The Ch90 sideband ports
//     (scanout_enable / dispfb_fbp / dispfb_fbw) are gone — TBs
//     drive scanout configuration the way a real driver would,
//     by writing PMODE and DISPFB1 through the gs_stub.reg_wr_*
//     port. This means `wait (raster_done); write PMODE.EN1=1`
//     is the canonical sequence, not a sideband poke.
//   - Addressing: linear by DEFAULT — fb_addr math mirrors
//     gs_stub's pixel fb_addr math byte-exactly so a pixel
//     written at (x,y) reads back at (x,y) without swizzle
//     reconciliation. Four OPTIONAL per-PSM swizzle paths gated
//     by parameters: `PSMCT32_SWIZZLE=1` (Ch120) routes PSMCT32
//     reads through gs_swizzle_psmct32_stub; `PSMCT16_SWIZZLE=1`
//     (Ch126) routes PSMCT16 reads through gs_swizzle_psmct16_stub;
//     `PSMT8_SWIZZLE=1` (Ch132) routes PSMT8 reads through
//     gs_swizzle_psmt8_stub (page=128×64 px, bw_pg=FBW>>1 — FBW
//     must be even for PSMT8); `PSMT4_SWIZZLE=1` (Ch138) routes
//     PSMT4 reads through gs_swizzle_psmt4_stub (page=128×128 px,
//     bw_pg=FBW>>1 — FBW must be even for PSMT4; module also
//     outputs nibble_hi selector since PSMT4 packs 2 pixels/byte).
//     The four parameters are independent. All four defaults are
//     0 → existing TBs see legacy linear behavior.
//   - PSMCT32 (PSM=0), PSMCT16 (PSM=2), PSMT8 (PSM=0x13), and
//     PSMT4 (PSM=0x14) are honored at this scope. Any other
//     PSM forces scanout off rather than mis-decoding the byte
//     layout. PSMCT16 reads 2 bytes/pixel and unpacks RGB5A1 →
//     RGB888 via bit-replicate. PSMT8 reads 1 byte/pixel and
//     PSMT4 reads 4 bits/pixel (2 pixels/byte, low nibble =
//     even pixel). For PSMT8 / PSMT4, with `clut_enable=1` the
//     index is looked up in clut_stub for real RGB; with
//     `clut_enable=0`, the index/nibble surfaces as grayscale.
//     gs_stub's raster channel emits PSMCT32 + PSMCT16 (Ch95) +
//     PSMT8 (Ch105) + PSMT4 (Ch106). CLUT contents come from a
//     TB-direct write OR from a VRAM→CLUT load triggered by
//     TEX0_1.CLD via clut_loader_stub (Ch99..Ch102).
//   - Single CRTC: one display, one DISPFB context. Real PS2 has
//     two (DISPFB1/DISPLAY1 and DISPFB2/DISPLAY2) for interlace/
//     merge. The PMODE.EN2 + DISPFB2/DISPLAY2 path is deferred.
//   - DISPLAY1 DX/DY/DW/DH ARE honored (Ch92): they define the
//     display window inside the active area. Outside the window,
//     pcrtc emits 0 for r/g/b even with scanout_enable=1.
//     MAGH/MAGV ARE honored (Ch93): each VRAM column shows for
//     (MAGH+1) consecutive VCK pulses before advancing, and each
//     VRAM line shows for (MAGV+1) raster lines. Practically,
//     a 4-pixel-wide VRAM sprite with MAGH=1 (2×) appears 8
//     pixels wide on screen. The H/V totals still come from
//     module parameters at instantiation. Real PS2 driver-
//     equivalent bring-up is now "configure DISPFB1 → configure
//     DISPLAY1 → render → set PMODE.EN1=1." Note: DISPLAY1=0
//     (post-reset default) means a 1×1 window at (0,0); a TB
//     MUST configure DISPLAY1 for anything visible to scan out.
//   - When scanout_enable
//     (= PMODE.EN1 & (PSMCT32 || PSMCT16 || PSMT8 || PSMT4))
//     is 0, r/g/b output is forced to 0 across the active area.
//     There's no BGCOLOR fallback in this module — that lives in
//     platform_video_stub.
//
// Trace payload: one EV_MODE pulse per completed frame, mirroring
// platform_video_stub's schema (arg0=frame_count, arg1=H*V).
//   PLAT MODE  arg0=frame_number  arg1=pixels_per_frame  arg2=-  arg3=-

`timescale 1ns/1ps

module gs_pcrtc_stub
    import trace_pkg::*;
#(
    // Horizontal timing (in pixel clocks). Defaults match
    // platform_video_stub's tiny-TB convention.
    parameter int H_ACTIVE = 16,
    parameter int H_FRONT  = 2,
    parameter int H_SYNC   = 4,
    parameter int H_BACK   = 2,
    // Vertical timing (in lines)
    parameter int V_ACTIVE = 8,
    parameter int V_FRONT  = 1,
    parameter int V_SYNC   = 1,
    parameter int V_BACK   = 1,
    parameter bit HSYNC_ACTIVE_LOW = 1'b1,
    parameter bit VSYNC_ACTIVE_LOW = 1'b1,

    // Ch120 — when set, PSMCT32 scanout reads VRAM via the real PS2
    // GS page/block swizzle (gs_swizzle_psmct32_stub) instead of the
    // legacy linear `FBW*64*y + x*4` formula. PSMCT16 / PSMT8 / PSMT4
    // are governed by their own gates (PSMCT16_SWIZZLE Ch126,
    // PSMT8_SWIZZLE Ch132, PSMT4_SWIZZLE Ch138 — see below).
    // Default 0 keeps every existing PSMCT32 scanout TB on the
    // original linear addressing.
    parameter bit PSMCT32_SWIZZLE = 1'b0,

    // Ch126 — when set, PSMCT16 scanout reads VRAM via the real PS2
    // GS page/block/column swizzle (gs_swizzle_psmct16_stub) instead
    // of the legacy linear `FBW*64*y + x*2` formula. PSMCT32 / PSMT8
    // / PSMT4 are governed by their own gates (PSMCT32_SWIZZLE /
    // PSMT8_SWIZZLE / PSMT4_SWIZZLE) or stay linear. Default 0 keeps
    // every existing PSMCT16 scanout TB (Ch94 PSM-aware, Ch95 raster,
    // Ch103 PSMT4-via-CT16-CLUT, etc.) on the original linear
    // addressing.
    parameter bit PSMCT16_SWIZZLE = 1'b0,

    // Ch132 — when set, PSMT8 scanout reads VRAM via the real PS2 GS
    // page/block/column swizzle (gs_swizzle_psmt8_stub) instead of
    // the legacy linear `FBW*64*y + x` formula. PSMT8 pages are 128
    // px wide (vs 64 px for CT32/CT16) so the swizzle internally uses
    // bw_pg = FBW>>1 — PCSX2 asserts FBW must be even for PSMT8.
    // Default 0 keeps every existing PSMT8 scanout TB (Ch96, Ch97,
    // Ch103 PSMT4-via-CT16-CLUT, Ch107 PSMT4-e2e palette path, etc.)
    // on the original linear addressing. PSMCT32 / PSMCT16 / PSMT4
    // are governed by their own gates or stay linear.
    parameter bit PSMT8_SWIZZLE = 1'b0,

    // Ch138 — when set, PSMT4 scanout reads VRAM via the real PS2 GS
    // page/block/column swizzle (gs_swizzle_psmt4_stub) instead of
    // the legacy linear `byte_offset = pixel_index >> 1` formula.
    // PSMT4 pixels are 4 bits each (2 pixels per byte); the swizzle
    // module outputs both an absolute byte address AND a `nibble_hi`
    // selector that picks the high or low nibble of the byte at
    // that address. PSMT4 pages are 128 px wide (same as PSMT8) so
    // the swizzle internally uses bw_pg = FBW>>1 — PCSX2 asserts
    // FBW must be even for PSMT4. The grayscale + CLUT lookup paths
    // BOTH use the same swizzle output: the byte at `addr` is read
    // from VRAM, and `nibble_hi` (instead of pixel_index[0]) picks
    // which nibble. Default 0 keeps every existing PSMT4 scanout TB
    // (Ch103 PSMT4+CLUT, Ch104 PSMT4 round-trip, Ch107 PSMT4 e2e,
    // etc.) on the original linear addressing. PSMCT32 / PSMCT16 /
    // PSMT8 are governed by their own gates.
    parameter bit PSMT4_SWIZZLE = 1'b0,

    // Ch158 — when set, the data-decode + sync-output pipeline is
    // delayed by 1 cycle so it aligns with a sync-read VRAM (e.g.
    // `vram_bram_stub`, Ch154) whose `read_data` is registered.
    // The address-driving stage (`vram_read_addr`) keeps using the
    // current `(hcnt, vcnt)` so the read is issued one pixel
    // "ahead"; the registered `vram_read_data` returns a cycle
    // later, and the decode comb consumes the matching delayed
    // counter view via the `*_dec` signals.
    //
    // Default 0 preserves the legacy combinational-read behavior
    // every existing PCRTC TB (Ch90+ scanout TBs) is written
    // against — those TBs drive `vram_read_data` via legacy
    // `vram_stub` (comb read) and consume r/g/b on the same
    // cycle as the addr drive. Set to 1 in the BRAM wrapper /
    // board top once `vram_bram_stub` is the storage.
    parameter bit VRAM_SYNC_READ = 1'b0,

    // Ch163 — bypass the magnification dividers
    // `vram_x_unshift = hwin_rel / hmag_factor` and the matching y
    // form when the demo locks `MAGH = MAGV = 0`. Quartus infers a
    // 32-bit hardware divider from the `/` operators (the Ch162 STA
    // worst path after STRIP_HW_DIVIDER closed the EE-core divider).
    // For demos that never write MAGH/MAGV non-zero — which includes
    // the PSMCT32 raster demo and every other hardware-target
    // wrapper today — the divisors are constant 1 and the math
    // collapses to a passthrough.
    //
    // Default 0 keeps the existing divider math live so every
    // Ch93-era scanout MAG TB stays green (the TBs that drive
    // MAGH != 0 / MAGV != 0 such as `tb_gs_scanout_magh_magv`
    // continue to use the default).
    //
    // When 1, `vram_x_unshift = hwin_rel` / `vram_y_unshift =
    // vwin_rel` — equivalent to the MAGH=MAGV=0 case but without
    // the divider. The hardware-demo path forwards this parameter
    // through `top_psmct32_raster_demo_bram` and the DE25-Nano
    // board top sets it to 1'b1.
    parameter bit STRIP_PCRTC_MAG_DIV = 1'b0
) (
    input  logic        clk,
    input  logic        rst_n,    // active-low, sampled synchronously at posedge clk

    // Ch91/Ch92/Ch93/Ch94/Ch96/Ch103 — PMODE + DISPFB1 + DISPLAY1
    // latches from gs_stub's privileged CPU MMIO port.
    // EN1 (PMODE bit 0) gates scanout. DISPFB1 carries the
    // framebuffer base / width / PSM the PCRTC reads from
    // (PSMCT32, PSMCT16, PSMT8, and PSMT4 honored at this scope;
    // any other PSM forces scanout off). DISPLAY1 carries the
    // display window: DX/DY = origin within the active area;
    // DW/DH = width/height MINUS one (real PS2 semantics).
    // MAGH/MAGV (Ch93) scale the window-relative coordinate so
    // each VRAM column/line repeats for (MAGH+1)/(MAGV+1)
    // displayed pulses/lines; pcrtc still takes H/V TOTALS from
    // module parameters at instantiation, not from registers.
    input  logic [63:0] pmode_q,
    input  logic [63:0] dispfb1_q,
    input  logic [63:0] display1_q,

    // VRAM read port. Combinational read from vram_stub when
    // VRAM_SYNC_READ=0; registered (1-cycle-late) data from a
    // sync-read VRAM when VRAM_SYNC_READ=1.
    output logic [31:0] vram_read_addr,
    input  logic [31:0] vram_read_data,

    // Ch97/Ch103 — CLUT (palette) read port for indexed-color
    // scanout. `clut_read_idx` is always driven as
    //   (PSMT4 ? zero-extended nibble : PSMT8 byte) + (clut_csa << 4)
    // truncated to 8 bits, so the lookup wraps mod 256 (bit 4 of
    // `clut_csa` therefore falls outside the 8-bit sum). When
    // `clut_enable` is high AND the active PSM is PSMT8 or PSMT4,
    // r/g/b come from the returned PSMCT32 entry; when
    // `clut_enable` is low the CLUT is bypassed and the
    // index/nibble scans out as grayscale (Ch96 default). CSM is
    // implicitly CSM2 (linear).
    input  logic        clut_enable,
    input  logic [4:0]  clut_csa,
    output logic [7:0]  clut_read_idx,
    input  logic [31:0] clut_read_data,

    // Video out
    output logic        hsync,
    output logic        vsync,
    output logic        de,
    output logic [7:0]  r,
    output logic [7:0]  g,
    output logic [7:0]  b,

    // Ch320 — high exactly when this scanout pixel is inside the
    // displayed frame (scanout enabled AND within the DX/DY/DW/DH
    // display window). Aligned to r/g/b. An LPDDR4B scanout reader
    // gates its pixels by this so it shows ONE frame, not a tiled
    // fill of the whole active line.
    output logic        pix_window_o,

    // Trace
    output logic        ev_valid,
    output subsys_e     ev_subsys,
    output event_e      ev_event,
    output logic [63:0] ev_arg0,
    output logic [63:0] ev_arg1,
    output logic [63:0] ev_arg2,
    output logic [63:0] ev_arg3,
    output logic [31:0] ev_flags
);

    // ------------------------------------------------------------------
    // Raster timing constants and counters.
    // ------------------------------------------------------------------
    localparam int H_TOTAL      = H_ACTIVE + H_FRONT + H_SYNC + H_BACK;
    localparam int V_TOTAL      = V_ACTIVE + V_FRONT + V_SYNC + V_BACK;
    localparam int H_SYNC_START = H_ACTIVE + H_FRONT;
    localparam int H_SYNC_END   = H_SYNC_START + H_SYNC;
    localparam int V_SYNC_START = V_ACTIVE + V_FRONT;
    localparam int V_SYNC_END   = V_SYNC_START + V_SYNC;

    // Counter widths. $clog2(1) is 0, which would declare a
    // zero-width vector, so the width is clamped to at least 1 bit.
    // The window math further down zero-extends the counters to
    // 12 bits, so both widths must not exceed 12 (a larger total
    // yields a negative replication count and fails elaboration).
    localparam int HCNT_W = (H_TOTAL > 1) ? $clog2(H_TOTAL) : 1;
    localparam int VCNT_W = (V_TOTAL > 1) ? $clog2(V_TOTAL) : 1;

    logic [HCNT_W-1:0] hcnt;
    logic [VCNT_W-1:0] vcnt;
    logic              end_of_line;
    logic              end_of_frame;

    assign end_of_line  = (hcnt == HCNT_W'(H_TOTAL - 1));
    assign end_of_frame = end_of_line && (vcnt == VCNT_W'(V_TOTAL - 1));

    // hcnt counts 0..H_TOTAL-1; vcnt advances once per line and
    // wraps after V_TOTAL lines.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            hcnt <= '0;
            vcnt <= '0;
        end else if (end_of_line) begin
            hcnt <= '0;
            vcnt <= end_of_frame ? '0 : (vcnt + VCNT_W'(1));
        end else begin
            hcnt <= hcnt + HCNT_W'(1);
        end
    end

    logic active_h;
    logic active_v;
    logic in_hsync;
    logic in_vsync;

    // The region compares are done at 32 bits rather than at the
    // counter width. Casting a boundary to HCNT_W/VCNT_W bits
    // truncates it to 0 whenever the boundary equals 2^W (e.g.
    // H_BACK=0 with a power-of-two H_TOTAL, or no blanking at all),
    // which would make the region permanently empty. For every
    // configuration where the boundaries fit in the counter width
    // the result is unchanged.
    assign active_h = (32'(hcnt) < 32'(H_ACTIVE));
    assign active_v = (32'(vcnt) < 32'(V_ACTIVE));
    assign in_hsync = (32'(hcnt) >= 32'(H_SYNC_START)) && (32'(hcnt) < 32'(H_SYNC_END));
    assign in_vsync = (32'(vcnt) >= 32'(V_SYNC_START)) && (32'(vcnt) < 32'(V_SYNC_END));

    // ------------------------------------------------------------------
    // DISPFB1 decode, per real PS2 GS register layout
    // (PCSX2 GSRegs.h — DISPFB structure):
    //   FBP : [8:0]    base address in 2048-byte units
    //   FBW : [14:9]   width in 64-pixel units
    //   PSM : [19:15]  pixel storage mode (PSMCT32 / PSMCT16 /
    //                  PSMT8 / PSMT4 honored; others disable scanout)
    //   DBX : [42:32]  display-buffer X origin (Ch91-audit fix)
    //   DBY : [53:43]  display-buffer Y origin (Ch91-audit fix)
    //
    // DBX/DBY shift the scanout's VRAM origin: the pixel shown at
    // the display-window origin is VRAM (DBX, DBY), not (0, 0).
    // Useful for double-buffered framebuffers and offset display
    // windows.
    // ------------------------------------------------------------------
    logic [8:0]  dispfb_fbp;
    logic [5:0]  dispfb_fbw;
    logic [4:0]  dispfb_psm;
    logic [10:0] dispfb_dbx;
    logic [10:0] dispfb_dby;
    logic        dispfb_psm_ok;
    logic        pmode_en1;
    logic        scanout_enable;

    assign dispfb_fbp = dispfb1_q[8:0];
    assign dispfb_fbw = dispfb1_q[14:9];
    assign dispfb_psm = dispfb1_q[19:15];
    assign dispfb_dbx = dispfb1_q[42:32];
    assign dispfb_dby = dispfb1_q[53:43];

    // Ch94/Ch96/Ch97/Ch103 — scanout PSM awareness. Four formats:
    //   PSMCT32 (5'h00) — 4 bytes/pixel, byte order {A,B,G,R}.
    //   PSMCT16 (5'h02) — 2 bytes/pixel, RGB5A1 packed:
    //                     R[4:0] G[9:5] B[14:10] A[15].
    //   PSMT8   (5'h13) — 1 byte/pixel, 8-bit index.
    //   PSMT4   (5'h14) — 4 bits/pixel = 2 pixels/byte. Linear byte
    //                     offset = pixel_index >> 1; linear nibble
    //                     selector = pixel_index[0] (low = even,
    //                     high = odd). The 4-bit nibble zero-extends
    //                     to an 8-bit CLUT index; CSA picks the
    //                     16-entry palette window.
    // Any other PSM (PSMCT24 / PSMCT16S / PSMZ32 / ...) forces
    // scanout off rather than mis-decoding bytes.
    logic       dispfb_psm_ct32;
    logic       dispfb_psm_ct16;
    logic       dispfb_psm_t8;
    logic       dispfb_psm_t4;
    logic [1:0] dispfb_bpp_shift;

    assign dispfb_psm_ct32 = (dispfb_psm == 5'h00);
    assign dispfb_psm_ct16 = (dispfb_psm == 5'h02);
    assign dispfb_psm_t8   = (dispfb_psm == 5'h13);
    assign dispfb_psm_t4   = (dispfb_psm == 5'h14);
    assign dispfb_psm_ok   = dispfb_psm_ct32 | dispfb_psm_ct16 |
                             dispfb_psm_t8   | dispfb_psm_t4;

    // log2(bytes per pixel) for the whole-byte formats. The final
    // 2'd2 is a don't-care default: PSMT4 takes the `>> 1` path in
    // byte_offset and unsupported PSMs have scanout disabled.
    assign dispfb_bpp_shift = dispfb_psm_ct32 ? 2'd2 :   // 4 bytes/pixel
                              dispfb_psm_ct16 ? 2'd1 :   // 2 bytes/pixel
                              dispfb_psm_t8   ? 2'd0 :   // 1 byte/pixel
                                                2'd2;

    assign pmode_en1      = pmode_q[0];
    assign scanout_enable = pmode_en1 & dispfb_psm_ok;

    // ------------------------------------------------------------------
    // Ch92/Ch93 — DISPLAY1 decode, per real PS2 GS register layout
    // (PCSX2 GSRegs.h — DISPLAY structure):
    //   DX   : [11:0]   display window X start (in VCK pulses)
    //   DY   : [22:12]  display window Y start (in raster lines)
    //   MAGH : [26:23]  horizontal magnification - 1 (Ch93)
    //   MAGV : [28:27]  vertical magnification - 1   (Ch93)
    //   DW   : [43:32]  display width  - 1 (in VCK pulses)
    //   DH   : [54:44]  display height - 1 (in raster lines)
    //
    // The display window is the sub-rect (DX..DX+DW, DY..DY+DH)
    // inside the active area. Outside the window, r/g/b is 0
    // even when scanout_enable is 1. Inside, the VRAM index is
    // measured RELATIVE to the window origin, scaled DOWN by the
    // magnification factors (MAGH+1 / MAGV+1), then shifted by
    // DBX/DBY. The pixel at displayed (DX, DY) therefore maps to
    // VRAM (DBX, DBY); successive displayed pixels along H map to
    // the SAME VRAM column for (MAGH+1) VCK pulses before advancing.
    // ------------------------------------------------------------------
    logic [11:0] display_dx;
    logic [10:0] display_dy;
    logic [3:0]  display_magh;
    logic [1:0]  display_magv;
    logic [11:0] display_dw;
    logic [10:0] display_dh;

    assign display_dx   = display1_q[11:0];
    assign display_dy   = display1_q[22:12];
    assign display_magh = display1_q[26:23];
    assign display_magv = display1_q[28:27];
    assign display_dw   = display1_q[43:32];
    assign display_dh   = display1_q[54:44];

    // Raster counters zero-extended to the 12-bit width of the
    // DISPLAY1 window fields (requires HCNT_W, VCNT_W <= 12).
    logic [11:0] hcnt12;
    logic [11:0] vcnt12;
    assign hcnt12 = {{(12-HCNT_W){1'b0}}, hcnt};
    assign vcnt12 = {{(12-VCNT_W){1'b0}}, vcnt};

    // Window inside-test: (hcnt - DX) in [0, DW] AND (vcnt - DY)
    // in [0, DH]. The lower bound is an explicit >= compare; the
    // upper bound compares the (mod-4096) relative coordinate,
    // which is only meaningful once the lower bound holds.
    logic [11:0] hwin_rel;
    logic [11:0] vwin_rel;
    logic        in_display_window;

    assign hwin_rel = hcnt12 - display_dx;
    assign vwin_rel = vcnt12 - {1'b0, display_dy};
    assign in_display_window =
        (hcnt12 >= display_dx) &&
        (hwin_rel <= display_dw) &&
        (vcnt12 >= {1'b0, display_dy}) &&
        (vwin_rel <= {1'b0, display_dh});

    // ------------------------------------------------------------------
    // VRAM addressing. Mirror gs_stub's fb_addr math byte-exactly
    // so written-then-scanned pixels round-trip without
    // reconciliation:
    //   fbp_bytes      = dispfb_fbp << 11            (FBP * 2048)
    //   pixels_per_row = dispfb_fbw << 6             (FBW * 64)
    //   effective_x    = (hcnt - DX) / (MAGH+1) + DBX   (Ch92/Ch93)
    //   effective_y    = (vcnt - DY) / (MAGV+1) + DBY
    //   pixel_index    = effective_y * pixels_per_row + effective_x
    //   byte_offset    = PSMT4 ? pixel_index >> 1
    //                          : pixel_index << dispfb_bpp_shift
    //   fb_addr        = fbp_bytes + byte_offset
    // dispfb_bpp_shift is PSM-aware (Ch94/Ch96): 2 for PSMCT32,
    // 1 for PSMCT16, 0 for PSMT8.
    //
    // MAGH=MAGV=0 → factors=1×, math collapses to the pre-Ch93
    // form. `/` on unsigned 32-bit operands truncates toward zero;
    // (hcnt-DX) is non-negative inside the window, and the pixel is
    // blanked outside it, so the wrapped value never reaches r/g/b.
    // ------------------------------------------------------------------
    logic [31:0] fbp_bytes;
    logic [31:0] pixels_per_row;
    logic [31:0] hmag_factor;     // MAGH + 1, range 1..16
    logic [31:0] vmag_factor;     // MAGV + 1, range 1..4
    logic [31:0] vram_x_unshift;
    logic [31:0] vram_y_unshift;
    logic [31:0] effective_x;
    logic [31:0] effective_y;
    logic [31:0] pixel_index;
    logic [31:0] byte_offset;

    assign fbp_bytes      = {23'd0, dispfb_fbp} << 11;
    assign pixels_per_row = {26'd0, dispfb_fbw} << 6;
    assign hmag_factor    = {28'd0, display_magh} + 32'd1;
    assign vmag_factor    = {30'd0, display_magv} + 32'd1;

    // Ch163 — when STRIP_PCRTC_MAG_DIV is 1, bypass the divisions
    // and use the window-relative coords directly, so synthesis has
    // no magnification divider to infer. Behavior-neutral only when
    // MAGH=MAGV=0; the default 0 keeps the live divider math for
    // the Ch93 magnification TBs (`tb_gs_scanout_magh_magv` etc.).
    assign vram_x_unshift = STRIP_PCRTC_MAG_DIV
                          ? {20'd0, hwin_rel}
                          : ({20'd0, hwin_rel} / hmag_factor);
    assign vram_y_unshift = STRIP_PCRTC_MAG_DIV
                          ? {20'd0, vwin_rel}
                          : ({20'd0, vwin_rel} / vmag_factor);

    assign effective_x = vram_x_unshift + {21'd0, dispfb_dbx};
    assign effective_y = vram_y_unshift + {21'd0, dispfb_dby};
    assign pixel_index = (effective_y * pixels_per_row) + effective_x;

    // PSMT4 packs 2 pixels per byte → byte_offset = pixel_index/2;
    // all other supported PSMs are integer-bytes-per-pixel and
    // use the standard left-shift by bpp_shift.
    assign byte_offset = dispfb_psm_t4
                       ? (pixel_index >> 1)
                       : (pixel_index << dispfb_bpp_shift);

    logic [31:0] vram_linear_addr;
    assign vram_linear_addr = fbp_bytes + byte_offset;

    // ------------------------------------------------------------------
    // Optional per-PSM swizzled addressing. Each swizzle module is
    // fed dispfb_fbp / dispfb_fbw and the low 12 bits of the
    // magnification-aware effective_x / effective_y. All four are
    // always instantiated; the parameter gates below decide whether
    // an output is actually used.
    // ------------------------------------------------------------------

    // Ch120 — PSMCT32.
    logic [31:0] vram_swizzled_addr;
    gs_swizzle_psmct32_stub u_swizzle (
        .fbp (dispfb_fbp),
        .fbw (dispfb_fbw),
        .x   (effective_x[11:0]),
        .y   (effective_y[11:0]),
        .addr(vram_swizzled_addr)
    );

    // Ch126 — PSMCT16 (64×64 page, 4×8 block grid, own column table).
    logic [31:0] vram_swizzled16_addr;
    gs_swizzle_psmct16_stub u_swizzle16 (
        .fbp (dispfb_fbp),
        .fbw (dispfb_fbw),
        .x   (effective_x[11:0]),
        .y   (effective_y[11:0]),
        .addr(vram_swizzled16_addr)
    );

    // Ch132 — PSMT8 (128-px-wide pages; the module halves FBW
    // internally, so FBW must be even).
    logic [31:0] vram_swizzled8_addr;
    gs_swizzle_psmt8_stub u_swizzle8 (
        .fbp (dispfb_fbp),
        .fbw (dispfb_fbw),
        .x   (effective_x[11:0]),
        .y   (effective_y[11:0]),
        .addr(vram_swizzled8_addr)
    );

    // Ch138 — PSMT4. 4 bits/pixel, so the module outputs both an
    // absolute byte address AND a `nibble_hi` selector.
    logic [31:0] vram_swizzled4_addr;
    logic        swizzle4_nibble_hi;
    gs_swizzle_psmt4_stub u_swizzle4 (
        .fbp      (dispfb_fbp),
        .fbw      (dispfb_fbw),
        .x        (effective_x[11:0]),
        .y        (effective_y[11:0]),
        .addr     (vram_swizzled4_addr),
        .nibble_hi(swizzle4_nibble_hi)
    );

    // A swizzled address is used only when BOTH the matching
    // parameter gate is set and that PSM is active; everything else
    // falls through to the linear address.
    assign vram_read_addr =
        (PSMCT32_SWIZZLE && dispfb_psm_ct32) ? vram_swizzled_addr   :
        (PSMCT16_SWIZZLE && dispfb_psm_ct16) ? vram_swizzled16_addr :
        (PSMT8_SWIZZLE   && dispfb_psm_t8)   ? vram_swizzled8_addr  :
        (PSMT4_SWIZZLE   && dispfb_psm_t4)   ? vram_swizzled4_addr  :
                                               vram_linear_addr;

    // Ch103/Ch138 — PSMT4 nibble selector at the address stage.
    // Linear layout: pixel_index[0] (low nibble = even pixel).
    // Swizzled layout: the swizzle module's `nibble_hi`, because
    // the swizzle reorders pixels within a block.
    logic psm4_nibble_select;
    assign psm4_nibble_select = (PSMT4_SWIZZLE && dispfb_psm_t4)
                              ? swizzle4_nibble_hi
                              : pixel_index[0];

    // ------------------------------------------------------------------
    // Ch158 — decode-stage pipeline. When `VRAM_SYNC_READ=1`, every
    // hcnt/vcnt-derived signal that the data-decode stage consumes
    // is delayed by 1 cycle so it lines up with a sync-read VRAM's
    // 1-cycle-late `vram_read_data`. The address side
    // (`vram_read_addr`) keeps using the current `hcnt`/`vcnt` so
    // the read is issued one pixel "ahead".
    //
    // The registers always exist; the `*_dec` muxes select between
    // the registered view (sync) and the live signal (legacy
    // comb-read passthrough). Every signal sampled here is declared
    // above this point, so the module does not rely on
    // use-before-declaration tool leniency.
    // ------------------------------------------------------------------
    logic in_hsync_q, in_vsync_q;
    logic active_h_q, active_v_q;
    logic in_display_window_q, scanout_enable_q;
    logic dispfb_psm_ct32_q, dispfb_psm_ct16_q, dispfb_psm_t8_q, dispfb_psm_t4_q;
    logic psm4_nibble_select_q;
    logic end_of_frame_q;

    logic in_hsync_dec, in_vsync_dec;
    logic active_h_dec, active_v_dec;
    logic in_display_window_dec, scanout_enable_dec;
    logic dispfb_psm_ct32_dec, dispfb_psm_ct16_dec, dispfb_psm_t8_dec, dispfb_psm_t4_dec;
    logic psm4_nibble_select_dec;
    logic end_of_frame_dec;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            in_hsync_q           <= 1'b0;
            in_vsync_q           <= 1'b0;
            active_h_q           <= 1'b0;
            active_v_q           <= 1'b0;
            in_display_window_q  <= 1'b0;
            scanout_enable_q     <= 1'b0;
            dispfb_psm_ct32_q    <= 1'b0;
            dispfb_psm_ct16_q    <= 1'b0;
            dispfb_psm_t8_q      <= 1'b0;
            dispfb_psm_t4_q      <= 1'b0;
            psm4_nibble_select_q <= 1'b0;
            end_of_frame_q       <= 1'b0;
        end else begin
            in_hsync_q           <= in_hsync;
            in_vsync_q           <= in_vsync;
            active_h_q           <= active_h;
            active_v_q           <= active_v;
            in_display_window_q  <= in_display_window;
            scanout_enable_q     <= scanout_enable;
            dispfb_psm_ct32_q    <= dispfb_psm_ct32;
            dispfb_psm_ct16_q    <= dispfb_psm_ct16;
            dispfb_psm_t8_q      <= dispfb_psm_t8;
            dispfb_psm_t4_q      <= dispfb_psm_t4;
            psm4_nibble_select_q <= psm4_nibble_select;
            end_of_frame_q       <= end_of_frame;
        end
    end

    assign in_hsync_dec           = VRAM_SYNC_READ ? in_hsync_q           : in_hsync;
    assign in_vsync_dec           = VRAM_SYNC_READ ? in_vsync_q           : in_vsync;
    assign active_h_dec           = VRAM_SYNC_READ ? active_h_q           : active_h;
    assign active_v_dec           = VRAM_SYNC_READ ? active_v_q           : active_v;
    assign in_display_window_dec  = VRAM_SYNC_READ ? in_display_window_q  : in_display_window;
    assign scanout_enable_dec     = VRAM_SYNC_READ ? scanout_enable_q     : scanout_enable;
    assign dispfb_psm_ct32_dec    = VRAM_SYNC_READ ? dispfb_psm_ct32_q    : dispfb_psm_ct32;
    assign dispfb_psm_ct16_dec    = VRAM_SYNC_READ ? dispfb_psm_ct16_q    : dispfb_psm_ct16;
    assign dispfb_psm_t8_dec      = VRAM_SYNC_READ ? dispfb_psm_t8_q      : dispfb_psm_t8;
    assign dispfb_psm_t4_dec      = VRAM_SYNC_READ ? dispfb_psm_t4_q      : dispfb_psm_t4;
    assign psm4_nibble_select_dec = VRAM_SYNC_READ ? psm4_nibble_select_q : psm4_nibble_select;
    assign end_of_frame_dec       = VRAM_SYNC_READ ? end_of_frame_q       : end_of_frame;

    // Sync / data-enable outputs follow the decode-stage view so
    // they stay aligned with r/g/b in both read modes.
    assign hsync = HSYNC_ACTIVE_LOW ? ~in_hsync_dec : in_hsync_dec;
    assign vsync = VSYNC_ACTIVE_LOW ? ~in_vsync_dec : in_vsync_dec;
    assign de    = active_h_dec && active_v_dec;

    // Ch320 — same gate the r/g/b mux uses, minus `de` (the
    // downstream video path applies `de` itself). Lets an external
    // scanout reader blank outside the displayed frame.
    assign pix_window_o = scanout_enable_dec && in_display_window_dec;

    // ------------------------------------------------------------------
    // Ch158 (audit Medium fix) — sub-word PSM lane selection.
    //
    // `vram_stub` returns the 4 bytes STARTING at `byte_addr`, so
    // for the legacy comb-read shape the sub-word value is always
    // at the LOW lane of `vram_read_data` (CT16 → [15:0], T8 → [7:0],
    // T4 byte → [7:0]). `vram_bram_stub` is word-addressable
    // (returns mem[byte_addr >> 2]), so the sub-word value lives
    // at lane `byte_addr[1:0]` within the returned 32-bit word.
    //
    // `vram_addr_lane_q` is a 1-cycle-delayed copy of
    // `vram_read_addr[1:0]`, matching the registered data. The
    // `data_lane` mux is forced to 0 in legacy mode and uses the
    // registered LSBs in sync mode.
    // ------------------------------------------------------------------
    logic [1:0] vram_addr_lane_q;
    logic [1:0] vram_addr_lane_dec;
    logic [1:0] data_lane;

    always_ff @(posedge clk) begin
        if (!rst_n) vram_addr_lane_q <= 2'd0;
        else        vram_addr_lane_q <= vram_read_addr[1:0];
    end

    assign vram_addr_lane_dec = VRAM_SYNC_READ ? vram_addr_lane_q : vram_read_addr[1:0];
    assign data_lane          = VRAM_SYNC_READ ? vram_addr_lane_dec : 2'd0;

    // PSM-aware color decode sources.
    //   PSMCT32: data[7:0]=R, [15:8]=G, [23:16]=B, [31:24]=A
    //            (alpha is not exposed at the video output).
    //   PSMCT16: RGB5A1 halfword, 5→8 bit-replicate ({c5, c5[4:2]}).
    //   PSMT8  : one index byte.
    //   PSMT4  : one nibble of the addressed byte.
    logic [15:0] psm16_pixel;
    logic [4:0]  psm16_r5, psm16_g5, psm16_b5;
    logic [7:0]  psm16_r8, psm16_g8, psm16_b8;
    logic [7:0]  psm8_idx;
    logic [3:0]  psm4_nibble;
    logic [7:0]  psm4_idx;
    logic [7:0]  psm4_gray;
    logic [7:0]  vram_byte_lane;

    // CT16 halfword: lane bit [1] picks the low or high halfword.
    // (The CT16 address formula only yields even byte addresses.)
    assign psm16_pixel = data_lane[1] ? vram_read_data[31:16]
                                      : vram_read_data[15:0];

    // PSMT8/PSMT4 byte: lane [1:0] picks 1 of 4 bytes
    // (bit offset = lane * 8).
    assign vram_byte_lane = vram_read_data[{data_lane, 3'b000} +: 8];

    assign psm16_r5 = psm16_pixel[4:0];
    assign psm16_g5 = psm16_pixel[9:5];
    assign psm16_b5 = psm16_pixel[14:10];
    assign psm16_r8 = {psm16_r5, psm16_r5[4:2]};
    assign psm16_g8 = {psm16_g5, psm16_g5[4:2]};
    assign psm16_b8 = {psm16_b5, psm16_b5[4:2]};

    assign psm8_idx = vram_byte_lane;

    // PSMT4: the decode-stage nibble selector is paired with the
    // (possibly registered) VRAM data. The nibble zero-extends to
    // the CLUT index; the grayscale fallback replicates it into
    // both halves of the channel (4'hF → 8'hFF, 4'h5 → 8'h55).
    assign psm4_nibble = psm4_nibble_select_dec ? vram_byte_lane[7:4]
                                                : vram_byte_lane[3:0];
    assign psm4_idx    = {4'd0, psm4_nibble};
    assign psm4_gray   = {psm4_nibble, psm4_nibble};

    // Ch97/Ch103 — CLUT effective index. `clut_csa` shifts the
    // lookup window in 16-entry units; the sum is truncated to
    // 8 bits, i.e. wraps mod 256. Derived from decode-stage
    // signals so the lookup is issued on the pixel-emit cycle.
    logic [7:0] clut_idx_base;
    assign clut_idx_base = dispfb_psm_t4_dec ? psm4_idx : psm8_idx;
    assign clut_read_idx = clut_idx_base + {clut_csa, 4'd0};

    // ------------------------------------------------------------------
    // Pixel output mux. Black unless the pixel is in the active
    // area, scanout is enabled, and the raster position is inside
    // the display window.
    // ------------------------------------------------------------------
    always_comb begin
        r = 8'd0;
        g = 8'd0;
        b = 8'd0;
        if (de && scanout_enable_dec && in_display_window_dec) begin
            if (dispfb_psm_ct16_dec) begin
                r = psm16_r8;
                g = psm16_g8;
                b = psm16_b8;
            end else if (dispfb_psm_t8_dec || dispfb_psm_t4_dec) begin
                if (clut_enable) begin
                    // Indexed + CLUT: each entry is PSMCT32,
                    // [7:0]=R, [15:8]=G, [23:16]=B, [31:24]=A.
                    r = clut_read_data[7:0];
                    g = clut_read_data[15:8];
                    b = clut_read_data[23:16];
                end else if (dispfb_psm_t8_dec) begin
                    // Ch96 fallback: surface the index as grayscale.
                    r = psm8_idx;
                    g = psm8_idx;
                    b = psm8_idx;
                end else begin
                    // PSMT4 grayscale fallback (nibble replicated).
                    r = psm4_gray;
                    g = psm4_gray;
                    b = psm4_gray;
                end
            end else begin
                // PSMCT32 — the only remaining format that
                // dispfb_psm_ok admits at this scope.
                r = vram_read_data[7:0];
                g = vram_read_data[15:8];
                b = vram_read_data[23:16];
            end
        end
    end

    // ------------------------------------------------------------------
    // Trace: one EV_MODE per completed frame.
    //   arg0 = index of the frame that just completed (pre-increment)
    //   arg1 = H_ACTIVE * V_ACTIVE
    // Ch158: with VRAM_SYNC_READ=1, end_of_frame_dec lags the
    // counter-side end_of_frame by 1 cycle, matching the delayed
    // decode stage. In legacy mode end_of_frame_dec == end_of_frame,
    // so existing TBs are unaffected.
    // ------------------------------------------------------------------
    logic [31:0] frame_count;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            frame_count <= 32'd0;
            ev_valid    <= 1'b0;
            ev_subsys   <= SUBSYS_PLAT;
            ev_event    <= EV_MODE;
            ev_arg0     <= 64'd0;
            ev_arg1     <= 64'd0;
            ev_arg2     <= 64'd0;
            ev_arg3     <= 64'd0;
            ev_flags    <= 32'd0;
        end else if (end_of_frame_dec) begin
            frame_count <= frame_count + 32'd1;
            ev_valid    <= 1'b1;
            ev_subsys   <= SUBSYS_PLAT;
            ev_event    <= EV_MODE;
            ev_arg0     <= {32'd0, frame_count};
            ev_arg1     <= {32'd0, 32'(H_ACTIVE * V_ACTIVE)};
            ev_arg2     <= 64'd0;
            ev_arg3     <= 64'd0;
            ev_flags    <= 32'd0;
        end else begin
            // Single-cycle pulse: payload registers hold their value.
            ev_valid <= 1'b0;
        end
    end

endmodule : gs_pcrtc_stub