Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

276 lines
13 KiB
Systemverilog
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// retroDE_ps2 — clut_loader_stub (Ch99 + Ch100 + Ch101)
//
// VRAM→CLUT load engine triggered by GIF TEX0.CLD. Watches the
// 1-cycle `tex0_wr_pulse` from gs_stub and starts a 256-entry
// load when the just-written TEX0 satisfies all three:
// - CSM == 1 (CSM2 linear), or — only when CLUT_CSM1_ENABLE=1 (Ch350) —
// CSM == 0 (CSM1 16×16 grid) with CPSM == PSMCT32
// - CPSM ∈ {PSMCT32, PSMCT16}
// - CLD permits a load under the change-detect policy:
// 0 = never, 1 = always,
// 2 = CBP changed since last load,
// 3 = CBP, CPSM, or CSA changed since last load,
// 4 = always, but write only the 16-entry CSA window
// (Ch102) — destination indices CSA*16..CSA*16+15
// wrap mod 256; the rest of clut_stub is preserved.
// 5..7 = reserved/edge cases at this scope (no-op).
// Per-CPSM stride: PSMCT32 reads 4 bytes/entry from
// VRAM[CBP*256 + i*4]; PSMCT16 reads 2 bytes/entry from
// VRAM[CBP*256 + i*2] and unpacks RGB5A1 → PSMCT32 ABGR with
// 5→8 bit-replicate. clut_stub always sees PSMCT32 entries.
//
// Scope (Ch99 + Ch100):
// - CSM2 (linear addressing) by default — entry i lives at byte
// offset i*entry_stride from CBP*256, where entry_stride is
// 4 (PSMCT32) or 2 (PSMCT16). With CLUT_CSM1_ENABLE=0 (the
// default) the loader gates start on tex0_csm == 1'b1 (CSM2),
// so a TEX0_1 write with CSM=0 (CSM1, 16×16 grid swizzle) is
// silently ignored rather than performing a wrong linear load.
// With CLUT_CSM1_ENABLE=1 (Ch350) a CSM=0 write with
// CPSM=PSMCT32 starts a CSM1 grid-order load instead.
// - CPSM=PSMCT32 (=0) and CPSM=PSMCT16 (=2) accepted. PSMCT16
// entries are unpacked from RGB5A1 to PSMCT32 ABGR via 5→8
// bit-replicate ({c5, c5[4:2]}) so clut_stub always stores
// PSMCT32 regardless of source format and pcrtc's existing
// PSMT8+CLUT lookup path stays unchanged. Alpha is replicated
// across 8 bits ({8{a1}}). Other CPSM codes (PSMCT24, PSMT8H,
// etc.) are silently ignored.
// - CLD modes (Ch101 + Ch102): full conditional policy
// honored for CLD ∈ {0, 1, 2, 3, 4}.
// 0 = no load.
// 1 = always load — full 256 entries.
// 2 = load only when CBP changed since last load.
// 3 = load when CBP, CPSM, or CSA changed since last load.
// 4 = partial CSA-window load (Ch102) — always fires, but
// writes only 16 entries at clut_stub[CSA*16 + i] for
// i ∈ 0..15 (CSA*16 wraps mod 256). The other 240
// entries are preserved; the VRAM source still starts
// at CBP*256 and uses the same per-CPSM byte stride.
// CLD ∈ {5, 6, 7} silently no-op at this scope (reserved /
// edge cases). The change-detect compares against `prev_*`
// regs latched on entry to S_LOAD; reset clears them to 0,
// so a first CLD=2 with CBP==0 is silently skipped (matches
// the "nothing changed" interpretation).
// - Reference (kept for posterity): real PS2 CLD encodes:
// 1 = always
// 2 = CBP changed
// 3 = CBP, CPSM, or CSA changed
// 4 = CSA changed (partial 16-entry load at CSA)
// 5..7 = reserved/edge cases
// Modeling those needs a full-CLUT register snapshot for
// change detection — deferred.
// - CSA is consumed two ways. (a) For CLD=3 it's a
// change-detect input (any prev-vs-new CSA delta triggers
// a full reload). (b) For CLD=4 it picks the destination
// window: load_csa_base = {CSA, 4'd0} (8-bit, so CSA=16..31
// wrap to base 0..240). Full-CLUT loads (CLD ∈ {1,2,3})
// overwrite all 256 entries regardless of CSA.
// - One in-flight load at a time. A new TEX0_1 write while
// `load_busy=1` is silently ignored at this scope.
//
// Timing: full load = 256 clocks; partial (CLD=4) = 16 clocks.
// `load_busy` is high throughout. TBs typically `wait (load_busy == 0)` to
// gate scanout configuration on the load completing.
`timescale 1ns/1ps
module clut_loader_stub #(
    // Ch350 — CSM1 (16×16 CT32 grid) CLUT-load path. Default OFF, in which case a CSM=0 TEX0 never
    // starts a load (the CSM1 term of the start gate is constant-0) and only the CSM2-linear path
    // exists. When 1, a CSM=0 / CPSM=PSMCT32 TEX0 commit loads palette entry i from
    // (x=i[3:0], y=i[7:4]) of a 16×16 PSMCT32 surface based at CBP (see addr_offset_csm1).
    parameter bit CLUT_CSM1_ENABLE = 1'b0
) (
    input  logic        clk,
    input  logic        rst_n,            // synchronous, active-low
    // From gs_stub: 1-cycle pulse on TEX0_1 commit + the
    // newly-decoded sub-fields.
    input  logic        tex0_wr_pulse,
    input  logic [13:0] tex0_cbp,
    input  logic [3:0]  tex0_cpsm,
    input  logic        tex0_csm,         // 1 = CSM2 linear; 0 = CSM1 grid (honored only if CLUT_CSM1_ENABLE)
    input  logic [4:0]  tex0_csa,         // Ch101: change-detect for CLD=3; Ch102: window base for CLD=4
    input  logic [2:0]  tex0_cld,
    // VRAM second read port — combinational byte-addressed read.
    output logic [31:0] vram_read_addr,
    input  logic [31:0] vram_read_data,
    // CLUT staging-area write port.
    output logic        clut_write_en,
    output logic [7:0]  clut_write_idx,
    output logic [31:0] clut_write_data,
    // Status: high while a load is in flight.
    output logic        load_busy
);

    // ------------------------------------------------------------------
    // Field encodings this module compares against.
    // ------------------------------------------------------------------
    localparam logic [3:0] CPSM_PSMCT32     = 4'd0;
    localparam logic [3:0] CPSM_PSMCT16     = 4'd2;
    localparam logic [2:0] CLD_PARTIAL_CSA  = 3'd4;   // Ch102: 16-entry CSA-window load
    localparam logic [7:0] LAST_IDX_FULL    = 8'hFF;  // 256-entry walk
    localparam logic [7:0] LAST_IDX_PARTIAL = 8'h0F;  // 16-entry walk

    typedef enum logic [0:0] {
        S_IDLE,
        S_LOAD
    } state_e;

    // ------------------------------------------------------------------
    // State and per-load latched configuration.
    // ------------------------------------------------------------------
    state_e      state;
    logic [7:0]  load_idx;            // entry counter within the current load
    logic [13:0] load_cbp;            // CBP latched at start
    logic        load_cpsm_is_ct16;   // Ch100: latched CPSM mode.
    logic        load_csm1;           // Ch350: latched CSM1-grid mode (PSMCT32 only).
    // Ch102 — partial CSA-window load mode (CLD=4). When set,
    // walks 16 entries instead of 256 and writes them to
    // clut_stub[load_csa_base + load_idx] (8-bit wrap).
    logic        load_partial;
    logic [7:0]  load_csa_base;
    logic [7:0]  load_terminal;

    // Ch101 — change-detect snapshots. Updated on every entry to
    // S_LOAD (i.e., every successful start, including CLD=4
    // partial loads). Used by CLD=2 (CBP change) and CLD=3
    // (CBP/CPSM/CSA any-change).
    logic [13:0] prev_cbp;
    logic [3:0]  prev_cpsm;
    logic [4:0]  prev_csa;

    // Address-generation intermediates.
    logic [31:0] cbp_bytes;
    logic [31:0] addr_offset_ct32;
    logic [31:0] addr_offset_ct16;
    logic [31:0] addr_offset_csm1;
    logic [31:0] addr_offset;

    // Ch100 — PSMCT16 unpack intermediates.
    logic [15:0] psm16_entry;
    logic [4:0]  psm16_r5, psm16_g5, psm16_b5;
    logic        psm16_a1;
    logic [7:0]  psm16_r8, psm16_g8, psm16_b8, psm16_a8;
    logic [31:0] write_data_ct16;

    // Start-gate terms.
    logic        cld_match;
    logic        src_csm2_ok;
    logic        src_csm1_ok;
    logic        load_start;

    assign load_busy     = (state == S_LOAD);
    assign load_terminal = load_partial ? LAST_IDX_PARTIAL : LAST_IDX_FULL;

    // ------------------------------------------------------------------
    // VRAM source address.
    // ------------------------------------------------------------------
    // CBP is in 256-byte units, so the base byte address is CBP << 8.
    assign cbp_bytes = {18'd0, load_cbp} << 8;

    // Per-PSM byte offset within the staging block.
    // PSMCT32 entries are 4 bytes → byte offset = idx * 4.
    // PSMCT16 entries are 2 bytes → byte offset = idx * 2.
    assign addr_offset_ct32 = {22'd0, load_idx, 2'd0};
    assign addr_offset_ct16 = {23'd0, load_idx, 1'd0};

    // Ch350 — CSM1 16×16 CT32 grid offset for entry load_idx (ix=load_idx[3:0], iy=load_idx[7:4]):
    // block = {iy[3], ix[3]} (0..3) → block*256 ; byte_in_block = iy[2:0]*32 + ix[2:0]*4.
    // PSMCT32 only.
    assign addr_offset_csm1 = ({30'd0, load_idx[7], load_idx[3]} << 8)   // block * 256
                            + ({29'd0, load_idx[6:4]} << 5)              // iy[2:0] * 32
                            + ({29'd0, load_idx[2:0]} << 2);             // ix[2:0] * 4

    assign addr_offset = load_csm1         ? addr_offset_csm1
                       : load_cpsm_is_ct16 ? addr_offset_ct16
                       :                     addr_offset_ct32;

    assign vram_read_addr = cbp_bytes + addr_offset;

    // ------------------------------------------------------------------
    // Ch100 — PSMCT16 → PSMCT32 unpack. RGB5A1 packing in the
    // low 16 bits of vram_read_data: R[4:0] G[9:5] B[14:10] A[15].
    // Colour channels use 5→8 bit-replicate ({c5, c5[4:2]});
    // alpha is replicated across 8 bits.
    // ------------------------------------------------------------------
    assign psm16_entry = vram_read_data[15:0];
    assign psm16_r5    = psm16_entry[4:0];
    assign psm16_g5    = psm16_entry[9:5];
    assign psm16_b5    = psm16_entry[14:10];
    assign psm16_a1    = psm16_entry[15];
    assign psm16_r8    = {psm16_r5, psm16_r5[4:2]};
    assign psm16_g8    = {psm16_g5, psm16_g5[4:2]};
    assign psm16_b8    = {psm16_b5, psm16_b5[4:2]};
    assign psm16_a8    = {8{psm16_a1}};
    assign write_data_ct16 = {psm16_a8, psm16_b8, psm16_g8, psm16_r8};

    // ------------------------------------------------------------------
    // CLUT write port. Only clut_write_en is gated by state; idx/data
    // are plain combinational functions of the latched load registers
    // and the VRAM read data. In partial (CLD=4) mode the destination
    // index is the CSA window base + load_idx, with 8-bit wrap; in full
    // mode it's just load_idx (0..255).
    // ------------------------------------------------------------------
    assign clut_write_en   = (state == S_LOAD);
    assign clut_write_idx  = load_partial ? (load_csa_base + load_idx)
                                          : load_idx;
    assign clut_write_data = load_cpsm_is_ct16 ? write_data_ct16
                                               : vram_read_data;

    // ------------------------------------------------------------------
    // Ch101 — CLD-mode trigger policy. cld_match says "the CLD
    // value alone permits a load (assuming CSM/CPSM also OK)."
    // ------------------------------------------------------------------
    always_comb begin
        unique case (tex0_cld)
            3'd0:            cld_match = 1'b0;                          // no load
            3'd1:            cld_match = 1'b1;                          // always (full)
            3'd2:            cld_match = (tex0_cbp != prev_cbp);        // CBP changed
            3'd3:            cld_match = (tex0_cbp  != prev_cbp)
                                      || (tex0_cpsm != prev_cpsm)
                                      || (tex0_csa  != prev_csa);
            CLD_PARTIAL_CSA: cld_match = 1'b1;                          // always (partial CSA window)
            default:         cld_match = 1'b0;                          // CLD ∈ {5..7} reserved
        endcase
    end

    // ------------------------------------------------------------------
    // Start gate.
    //   CSM2 source: csm=1 with CPSM ∈ {PSMCT32, PSMCT16}.
    //   CSM1 source: csm=0 with CPSM = PSMCT32, and only when
    //                CLUT_CSM1_ENABLE is set (constant-0 otherwise, so a
    //                csm=0 TEX0 is ignored when the parameter is off).
    // Any other CSM/CPSM combination is silently ignored.
    // ------------------------------------------------------------------
    assign src_csm2_ok = (tex0_csm == 1'b1)
                      && ((tex0_cpsm == CPSM_PSMCT32) || (tex0_cpsm == CPSM_PSMCT16));
    assign src_csm1_ok = CLUT_CSM1_ENABLE
                      && (tex0_csm == 1'b0)
                      && (tex0_cpsm == CPSM_PSMCT32);
    assign load_start  = tex0_wr_pulse && cld_match && (src_csm2_ok || src_csm1_ok);

    // ------------------------------------------------------------------
    // Load FSM. One in-flight load at a time: load_start is only
    // examined in S_IDLE, so a TEX0 pulse during S_LOAD is dropped.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state             <= S_IDLE;
            load_idx          <= 8'd0;
            load_cbp          <= 14'd0;
            load_cpsm_is_ct16 <= 1'b0;
            load_csm1         <= 1'b0;
            load_partial      <= 1'b0;
            load_csa_base     <= 8'd0;
            prev_cbp          <= 14'd0;
            prev_cpsm         <= 4'd0;
            prev_csa          <= 5'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    if (load_start) begin
                        state             <= S_LOAD;
                        load_idx          <= 8'd0;
                        load_cbp          <= tex0_cbp;
                        load_cpsm_is_ct16 <= (tex0_cpsm == CPSM_PSMCT16);
                        load_csm1         <= (tex0_csm == 1'b0);   // CSM1-grid addressing (PSMCT32)
                        load_partial      <= (tex0_cld == CLD_PARTIAL_CSA);
                        // Window base = CSA*16 mod 256. CSA[4] is dropped
                        // explicitly (CSA=16..31 alias to bases 0..240)
                        // rather than relying on an implicit 9→8-bit
                        // assignment truncation.
                        load_csa_base     <= {tex0_csa[3:0], 4'd0};
                        // Snapshot for future CLD=2 / CLD=3 change detection.
                        prev_cbp          <= tex0_cbp;
                        prev_cpsm         <= tex0_cpsm;
                        prev_csa          <= tex0_csa;
                    end
                end
                S_LOAD: begin
                    // Terminal index is 0xFF for a full load, 0x0F for a
                    // partial (CSA window) load. The counter keeps
                    // incrementing on the terminal cycle; it is re-zeroed
                    // at the next start.
                    if (load_idx == load_terminal) begin
                        state <= S_IDLE;
                    end
                    load_idx <= load_idx + 8'd1;
                end
            endcase
        end
    end

endmodule : clut_loader_stub