ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
276 lines
13 KiB
Systemverilog
276 lines
13 KiB
Systemverilog
// retroDE_ps2 — clut_loader_stub (Ch99 + Ch100 + Ch101 + Ch102 + Ch350)
//
// VRAM→CLUT load engine triggered by GIF TEX0.CLD. Watches the
// 1-cycle `tex0_wr_pulse` from gs_stub and starts a load (256
// entries, or 16 entries for CLD=4) when the just-written TEX0
// satisfies all three:
//   - CSM == 1 (CSM2 linear). CSM == 0 (CSM1 16×16 grid) is accepted
//     only for CPSM=PSMCT32 and only when CLUT_CSM1_ENABLE=1 (Ch350);
//     with the default CLUT_CSM1_ENABLE=0 it is ignored.
//   - CPSM ∈ {PSMCT32, PSMCT16}
//   - CLD permits a load under the change-detect policy:
//       0    = never, 1 = always,
//       2    = CBP changed since last load,
//       3    = CBP, CPSM, or CSA changed since last load,
//       4    = always, but write only the 16-entry CSA window
//              (Ch102) — destination indices CSA*16..CSA*16+15
//              wrap mod 256; the rest of clut_stub is preserved.
//       5..7 = reserved/edge cases at this scope (no-op).
// Per-CPSM stride: PSMCT32 reads 4 bytes/entry from
// VRAM[CBP*256 + i*4]; PSMCT16 reads 2 bytes/entry from
// VRAM[CBP*256 + i*2] and unpacks RGB5A1 → PSMCT32 ABGR with
// 5→8 bit-replicate. clut_stub always sees PSMCT32 entries.
//
// Scope (Ch99 + Ch100):
//   - CSM2 (linear addressing) — entry i lives at byte
//     offset i*entry_stride from CBP*256, where entry_stride is
//     4 (PSMCT32) or 2 (PSMCT16). With the default
//     CLUT_CSM1_ENABLE=0 the loader explicitly gates start on
//     tex0_csm == 1'b1 (CSM2): a TEX0_1 write with CSM=0 (CSM1,
//     16×16 grid swizzle) is silently ignored rather than
//     performing a wrong linear load. With CLUT_CSM1_ENABLE=1
//     (Ch350) a CSM=0 / CPSM=PSMCT32 write is loaded in CSM1 grid
//     order instead.
//   - CPSM=PSMCT32 (=0) and CPSM=PSMCT16 (=2) accepted. PSMCT16
//     entries are unpacked from RGB5A1 to PSMCT32 ABGR via 5→8
//     bit-replicate ({c5, c5[4:2]}) so clut_stub always stores
//     PSMCT32 regardless of source format and pcrtc's existing
//     PSMT8+CLUT lookup path stays unchanged. Alpha is replicated
//     across 8 bits ({8{a1}}). Other CPSM codes (PSMCT24, PSMT8H,
//     etc.) are silently ignored.
//   - CLD modes (Ch101 + Ch102): full conditional policy
//     honored for CLD ∈ {0, 1, 2, 3, 4}.
//       0 = no load.
//       1 = always load — full 256 entries.
//       2 = load only when CBP changed since last load.
//       3 = load when CBP, CPSM, or CSA changed since last load.
//       4 = partial CSA-window load (Ch102) — always fires, but
//           writes only 16 entries at clut_stub[CSA*16 + i] for
//           i ∈ 0..15 (CSA*16 wraps mod 256). The other 240
//           entries are preserved; the VRAM source still starts
//           at CBP*256 and uses the same per-CPSM byte stride.
//     CLD ∈ {5, 6, 7} silently no-op at this scope (reserved /
//     edge cases). The change-detect compares against `prev_*`
//     regs latched on entry to S_LOAD; reset clears them to 0,
//     so a first CLD=2 with CBP==0 (or a first CLD=3 with CBP,
//     CPSM and CSA all 0) is silently skipped (matches the
//     "nothing changed" interpretation).
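//   - Reference (kept for posterity): real PS2 CLD encodes, as
//     originally noted for this stub:
//       1    = always
//       2    = CBP changed
//       3    = CBP, CPSM, or CSA changed
//       4    = CSA changed (partial 16-entry load at CSA)
//       5..7 = reserved/edge cases
//     This note originally deferred modeling those, on the grounds
//     that change detection needs a full-CLUT register snapshot.
//     Ch101/Ch102 have since implemented CLD 1..4 using the
//     prev_cbp / prev_cpsm / prev_csa snapshots. Note that CLD=4 is
//     implemented as an unconditional partial load, not as
//     "CSA changed".
//     NOTE(review): verify this list against the GS User's Manual
//     TEX0.CLD table before relying on it as a hardware reference.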
//   - CSA is consumed two ways. (a) For CLD=3 it's a
//     change-detect input (any prev-vs-new CSA delta triggers
//     a full reload). (b) For CLD=4 it picks the destination
//     window: load_csa_base = {CSA, 4'd0} (8-bit, so CSA=16..31
//     wrap to base 0..240). Full-CLUT loads (CLD ∈ {1,2,3})
//     overwrite all 256 entries regardless of CSA.
//   - One in-flight load at a time. A new TEX0_1 write while
//     `load_busy=1` is silently ignored at this scope.
//
// Timing: full load = 256 clocks; partial (CLD=4) = 16 clocks.
// `load_busy` is high throughout. TBs typically `wait (load_busy == 0)` to
// gate scanout configuration on the load completing.

`timescale 1ns/1ps

// clut_loader_stub — VRAM→CLUT load engine.
//
// On a 1-cycle `tex0_wr_pulse` the module inspects the TEX0 sub-fields
// presented alongside it and, if the start gate passes, walks a run of
// palette entries: each clock in S_LOAD it drives one VRAM byte address
// on `vram_read_addr`, takes the combinational `vram_read_data`, and
// writes one PSMCT32-format entry to the CLUT write port.
//
//   Start gate   : tex0_wr_pulse && cld_match && (CSM2 with CPSM ∈
//                  {PSMCT32, PSMCT16}, or — only when CLUT_CSM1_ENABLE —
//                  CSM1 with CPSM = PSMCT32). Evaluated in S_IDLE only,
//                  so a TEX0 write during a load is ignored.
//   Length       : 256 entries, or 16 entries when CLD = 4.
//   Source addr  : CBP*256 + per-entry offset (linear 4 B / 2 B stride,
//                  or the CSM1 16×16 grid offset).
//   Destination  : load_idx, or (CSA window base + load_idx) mod 256
//                  when CLD = 4.
//   Reset        : synchronous, active-low `rst_n`.
module clut_loader_stub #(
    // Ch350 — CSM1 (16×16 CT32 grid) CLUT-load path. Default OFF so all existing CSM2-linear behaviour is
    // BYTE-IDENTICAL (a CSM=0 TEX0 is still silently ignored when this is 0, exactly as Ch99..Ch102). When 1,
    // a CSM=0 / CPSM=PSMCT32 TEX0 commit loads the palette in the real GS CSM1 grid order: palette entry i is
    // read at (x=i[3:0], y=i[7:4]) of a 16×16 PSMCT32 surface based at CBP, via the CT32 block+byte swizzle.
    // This is the order Ch349 proved SH3 uses (host gs_localmem 'grid'); CSM2-linear scatters those colours.
    parameter bit CLUT_CSM1_ENABLE = 1'b0
) (
    input  logic        clk,
    input  logic        rst_n,            // synchronous, active-low

    // From gs_stub: 1-cycle pulse on TEX0_1 commit + the
    // newly-decoded sub-fields.
    input  logic        tex0_wr_pulse,
    input  logic [13:0] tex0_cbp,         // CLUT base, 256-byte units
    input  logic [3:0]  tex0_cpsm,        // 0 = PSMCT32, 2 = PSMCT16; others ignored
    input  logic        tex0_csm,         // 1 = CSM2 linear; 0 = CSM1 grid (needs CLUT_CSM1_ENABLE)
    input  logic [4:0]  tex0_csa,         // Ch101: change-detect for CLD=3; Ch102: CLD=4 window
    input  logic [2:0]  tex0_cld,

    // VRAM second read port — combinational byte-addressed read.
    output logic [31:0] vram_read_addr,
    input  logic [31:0] vram_read_data,

    // CLUT staging-area write port.
    output logic        clut_write_en,
    output logic [7:0]  clut_write_idx,
    output logic [31:0] clut_write_data,

    // Status: high while a load is in flight.
    output logic        load_busy
);

    // ------------------------------------------------------------------
    // Named encodings for the literals used by the start gate and walker.
    // ------------------------------------------------------------------
    localparam logic [3:0] CPSM_PSMCT32     = 4'd0;
    localparam logic [3:0] CPSM_PSMCT16     = 4'd2;
    localparam logic [2:0] CLD_PARTIAL      = 3'd4;   // Ch102 CSA-window load
    localparam logic [7:0] LAST_IDX_FULL    = 8'hFF;  // 256-entry walk
    localparam logic [7:0] LAST_IDX_PARTIAL = 8'h0F;  // 16-entry walk

    typedef enum logic [0:0] {
        S_IDLE,
        S_LOAD
    } state_e;

    state_e      state;
    logic [7:0]  load_idx;            // entry counter within the current load
    logic [13:0] load_cbp;            // CBP latched at load start
    logic        load_cpsm_is_ct16;   // Ch100: latched CPSM mode.
    logic        load_csm1;           // Ch350: latched CSM1-grid mode (PSMCT32 only).

    // Ch101 — change-detect snapshots. Updated on every entry to
    // S_LOAD (i.e., every successful start). Used by CLD=2 (CBP
    // change) and CLD=3 (CBP/CPSM/CSA any-change).
    logic [13:0] prev_cbp;
    logic [3:0]  prev_cpsm;
    logic [4:0]  prev_csa;

    // Ch102 — partial CSA-window load mode (CLD=4). When set,
    // walks 16 entries instead of 256 and writes them to
    // clut_stub[load_csa_base + load_idx] (8-bit wrap).
    logic        load_partial;
    logic [7:0]  load_csa_base;
    logic [7:0]  load_terminal;

    assign load_terminal = load_partial ? LAST_IDX_PARTIAL : LAST_IDX_FULL;
    assign load_busy     = (state == S_LOAD);

    // ------------------------------------------------------------------
    // VRAM source address.
    // ------------------------------------------------------------------
    logic [31:0] cbp_bytes;
    logic [31:0] addr_offset_ct32;
    logic [31:0] addr_offset_ct16;
    logic [31:0] addr_offset_csm1;
    logic [31:0] addr_offset;

    // CBP is in 256-byte units (matches PS2 GS docs for the CLUT
    // staging area: each CBP step covers one 256-byte block).
    assign cbp_bytes = {18'd0, load_cbp} << 8;

    // Per-PSM byte offset within the staging block.
    //   PSMCT32 entries are 4 bytes → byte offset = idx * 4.
    //   PSMCT16 entries are 2 bytes → byte offset = idx * 2.
    assign addr_offset_ct32 = {22'd0, load_idx, 2'd0};
    assign addr_offset_ct16 = {23'd0, load_idx, 1'd0};

    // Ch350 — CSM1 16×16 CT32 grid offset for entry load_idx (ix=load_idx[3:0], iy=load_idx[7:4]):
    //   block = {iy[3], ix[3]} (0..3) → block*256 ; byte_in_block = iy[2:0]*32 + ix[2:0]*4.
    // Matches gs_localmem.ct32_addr(cbp,dbw=1,ix,iy) (page_index=0 for a 16×16 region). PSMCT32 only.
    // The three terms occupy disjoint bit ranges ([9:8], [7:5], [4:2]), so
    // the sum is written directly as a concatenation.
    assign addr_offset_csm1 = {22'd0,
                               load_idx[7], load_idx[3],   // block * 256
                               load_idx[6:4],              // iy[2:0] * 32
                               load_idx[2:0],              // ix[2:0] * 4
                               2'd0};

    always_comb begin
        if (load_csm1)              addr_offset = addr_offset_csm1;
        else if (load_cpsm_is_ct16) addr_offset = addr_offset_ct16;
        else                        addr_offset = addr_offset_ct32;
    end

    assign vram_read_addr = cbp_bytes + addr_offset;

    // ------------------------------------------------------------------
    // Ch100 — PSMCT16 → PSMCT32 unpack. RGB5A1 packing in the
    // low 16 bits of vram_read_data: R[4:0] G[9:5] B[14:10] A[15].
    // 5→8 bit-replicate matches the same expansion pcrtc uses
    // for direct PSMCT16 framebuffer scanout (Ch94). Alpha is
    // replicated across 8 bits.
    // ------------------------------------------------------------------
    logic [15:0] psm16_entry;
    logic [4:0]  psm16_r5, psm16_g5, psm16_b5;
    logic        psm16_a1;
    logic [7:0]  psm16_r8, psm16_g8, psm16_b8, psm16_a8;
    logic [31:0] write_data_ct16;

    assign psm16_entry = vram_read_data[15:0];
    assign psm16_r5    = psm16_entry[4:0];
    assign psm16_g5    = psm16_entry[9:5];
    assign psm16_b5    = psm16_entry[14:10];
    assign psm16_a1    = psm16_entry[15];

    assign psm16_r8    = {psm16_r5, psm16_r5[4:2]};
    assign psm16_g8    = {psm16_g5, psm16_g5[4:2]};
    assign psm16_b8    = {psm16_b5, psm16_b5[4:2]};
    assign psm16_a8    = {8{psm16_a1}};

    assign write_data_ct16 = {psm16_a8, psm16_b8, psm16_g8, psm16_r8};

    // ------------------------------------------------------------------
    // Combinational addr/data feed for vram_stub port 1 and
    // clut_stub write port. Idle when not loading. In partial
    // (CLD=4) mode the destination index is the CSA window base
    // + load_idx, with 8-bit wrap; in full mode it's just
    // load_idx (0..255).
    // ------------------------------------------------------------------
    assign clut_write_en   = (state == S_LOAD);
    assign clut_write_idx  = load_partial ? (load_csa_base + load_idx)
                                          : load_idx;
    assign clut_write_data = load_cpsm_is_ct16 ? write_data_ct16
                                               : vram_read_data;

    // ------------------------------------------------------------------
    // Ch101 — CLD-mode trigger policy. cld_match says "the CLD
    // value alone permits a load (assuming CSM/CPSM also OK)."
    // The full start gate ANDs this with the CSM/CPSM checks below.
    // NOTE(review): this table is the stub's own policy — confirm the
    // encoding against the GS User's Manual TEX0.CLD description.
    // ------------------------------------------------------------------
    logic cld_match;

    always_comb begin
        unique case (tex0_cld)
            3'd0:    cld_match = 1'b0;                          // no load
            3'd1:    cld_match = 1'b1;                          // always (full)
            3'd2:    cld_match = (tex0_cbp != prev_cbp);        // CBP changed
            3'd3:    cld_match = (tex0_cbp  != prev_cbp)
                              || (tex0_cpsm != prev_cpsm)
                              || (tex0_csa  != prev_csa);
            3'd4:    cld_match = 1'b1;                          // always (partial CSA window)
            default: cld_match = 1'b0;                          // CLD ∈ {5..7} reserved
        endcase
    end

    // ------------------------------------------------------------------
    // Start gate.
    //   start_csm2 — Ch99/Ch100: CSM2 linear with CPSM ∈ {PSMCT32, PSMCT16}.
    //                Other CPSMs are silently ignored.
    //   start_csm1 — Ch350: CSM1 grid with CPSM = PSMCT32. With
    //                CLUT_CSM1_ENABLE OFF this term is constant-0, so a
    //                csm=0 TEX0 is ignored exactly as before rather than
    //                being mis-loaded as CSM2-linear.
    // ------------------------------------------------------------------
    logic start_csm2;
    logic start_csm1;
    logic start_load;

    assign start_csm2 = (tex0_csm == 1'b1)
                     && ((tex0_cpsm == CPSM_PSMCT32) || (tex0_cpsm == CPSM_PSMCT16));
    assign start_csm1 = CLUT_CSM1_ENABLE
                     && (tex0_csm == 1'b0)
                     && (tex0_cpsm == CPSM_PSMCT32);
    assign start_load = tex0_wr_pulse && cld_match && (start_csm2 || start_csm1);

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state             <= S_IDLE;
            load_idx          <= 8'd0;
            load_cbp          <= 14'd0;
            load_cpsm_is_ct16 <= 1'b0;
            load_csm1         <= 1'b0;
            load_partial      <= 1'b0;
            load_csa_base     <= 8'd0;
            // Snapshots reset to 0, so a first CLD=2 with CBP==0 sees
            // "nothing changed" and does not load.
            prev_cbp          <= 14'd0;
            prev_cpsm         <= 4'd0;
            prev_csa          <= 5'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    // Ch101: a successful start both fires the load AND
                    // records the new TEX0 CBP/CPSM/CSA in prev_* for
                    // future change detection.
                    // Ch102: CLD=4 starts a 16-entry partial load at the
                    // CSA window; load_partial / load_csa_base latch the
                    // mode + destination base.
                    if (start_load) begin
                        state             <= S_LOAD;
                        load_idx          <= 8'd0;
                        load_cbp          <= tex0_cbp;
                        load_cpsm_is_ct16 <= (tex0_cpsm == CPSM_PSMCT16);
                        load_csm1         <= (tex0_csm == 1'b0);   // CSM1-grid addressing (PSMCT32)
                        load_partial      <= (tex0_cld == CLD_PARTIAL);
                        // CSA*16 mod 256: only CSA[3:0] reaches the 8-bit
                        // base, so CSA=16..31 alias onto bases 0..240.
                        load_csa_base     <= {tex0_csa[3:0], 4'd0};
                        prev_cbp          <= tex0_cbp;
                        prev_cpsm         <= tex0_cpsm;
                        prev_csa          <= tex0_csa;
                    end
                end

                S_LOAD: begin
                    // Terminal index is 0xFF for full load, 0x0F for
                    // partial (CSA window). The entry at the terminal
                    // index is still written this cycle (clut_write_en is
                    // combinational on state); load_idx keeps counting but
                    // is re-zeroed on the next start.
                    if (load_idx == load_terminal) begin
                        state <= S_IDLE;
                    end
                    load_idx <= load_idx + 8'd1;
                end
            endcase
        end
    end

endmodule : clut_loader_stub