// Snapshot ec82764bef — SystemVerilog, 664 lines, 30 KiB.
// Commit note: RTL (GS rasterizer, EE core stub, platform bridge,
// LPDDR4B path), sim regression (272 TBs), docs, and tooling.
// Copyrighted PS2 content (BIOS, game code, GS dumps, and all
// dump-derived textures/traces) is excluded via .gitignore and stays
// local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
// retroDE_ps2 — gif_image_xfer_stub (Ch110)
//
// Host→local image-transfer engine. On a TRXDIR write that arms a
// host→local upload (XDIR == 0), the engine snapshots the
// already-latched BITBLTBUF / TRXPOS / TRXREG fields and consumes
// IMAGE-mode quadwords from gif_packed_stub, unpacking them into
// per-pixel VRAM writes at the destination region defined by
// (DBP, DBW, DPSM, DSAX, DSAY, RRW, RRH).
//
// Scope (after Ch139):
//   - PSMCT32 (DPSM == 6'h00): 4 bytes/pixel, 4 pixels/qword,
//     row_stride = DBW * 256, write_be = 4'b1111, mask=0xFFFFFFFF.
//   - PSMCT16 (DPSM == 6'h02): 2 bytes/pixel, 8 pixels/qword,
//     row_stride = DBW * 128, write_be = 4'b0011, mask=0xFFFFFFFF.
//   - PSMT8 (DPSM == 6'h13): 1 byte/pixel (an 8-bit CLUT index),
//     16 pixels/qword, row_stride = DBW * 64, write_be = 4'b0001,
//     mask = 0xFFFFFFFF.
//   - PSMT4 (DPSM == 6'h14): 0.5 bytes/pixel (a 4-bit CLUT index),
//     32 pixels/qword (2 px/byte × 16 bytes), row_stride = DBW * 32,
//     write_be = 4'b0001 with a per-emit nibble mask: 0x0000_000F
//     for the LOW nibble of the byte (when (DSAX+x) is even) or
//     0x0000_00F0 for the HIGH nibble (when (DSAX+x) is odd). The
//     4-bit index sits at the matching nibble position in
//     write_data[7:0]; vram_stub's per-bit merge commits exactly
//     that nibble — the OTHER nibble of the same byte is preserved.
//     Back-to-back emits to the same byte (e.g. x=0 + x=1 of the
//     same row) chain through NBA semantics without bypass logic
//     (same trick the raster channel uses since Ch106).
//   - Other PSMs (PSMCT24/PSMZ-*): the engine still consumes
//     IMAGE qwords (so gif_packed_stub doesn't desync) but emits
//     zero VRAM writes. Lane cadence falls back to PSMCT32
//     (4 lanes/qword).
//   - Addressing: linear by DEFAULT — the destination address
//     math is
//         dest_base  = DBP * 256
//         row_stride = DBW * 64 * bpp
//         addr(x, y) = dest_base + (DSAY + y) * row_stride
//                                + (DSAX + x) * bpp
//     Four OPTIONAL per-PSM swizzle paths gated by parameters:
//     `PSMCT32_SWIZZLE=1` (Ch121) routes PSMCT32 uploads through
//     gs_swizzle_psmct32_stub; `PSMCT16_SWIZZLE=1` (Ch127) routes
//     PSMCT16 uploads through gs_swizzle_psmct16_stub;
//     `PSMT8_SWIZZLE=1` (Ch133) routes PSMT8 uploads through
//     gs_swizzle_psmt8_stub (page=128×64 px, bw_pg=DBW>>1 — DBW
//     must be even for PSMT8); `PSMT4_SWIZZLE=1` (Ch139) routes
//     PSMT4 uploads through gs_swizzle_psmt4_stub (page=128×128
//     px, bw_pg=DBW>>1 — DBW must be even for PSMT4 too; module
//     also outputs nibble_hi selector since PSMT4 packs 2 pixels
//     per byte). In all four cases the per-pixel byte address is
//     `dest_base + swizzle(FBP=0, FBW=DBW, x=DSAX+cur_x,
//     y=DSAY+cur_y)`. The PSMT4 path additionally uses the
//     swizzle's `nibble_hi` output (instead of the linear
//     formula's x_eff[0]) to pick which nibble of the byte gets
//     the upload's 4-bit pixel — the existing Ch118 nibble RMW
//     write-mask machinery (write_be=4'b0001, write_mask=
//     0x0F or 0xF0) layers on top of the swizzled byte address.
//     The four parameters are independent. All four parameter
//     defaults are 0 → legacy linear behavior.
//   - One pending qword buffer + a 5-bit lane counter (0..3 for
//     PSMCT32, 0..7 for PSMCT16, 0..15 for PSMT8, 0..31 for
//     PSMT4; the last-lane index is snapshotted at TRXDIR-arm
//     time per `lane_last_q`).
//     Backpressure to the upstream is exposed via `data_ready`.
//     Wired into `gif_packed_stub.image_data_ready` (Ch110), so
//     the GIF gates `in_ready` only in S_IMAGE state with FLG=2;
//     the DMAC's ep_ready follows gif_in_ready directly. Outside
//     S_IMAGE the gate is a no-op.
//
// Wiring contract (TB-level):
//   trxdir_wr_pulse ← gs_stub.trxdir_wr_q
//   trxdir          ← gs_stub.trxdir_q
//   bitbltbuf       ← gs_stub.bitbltbuf_q
//   trxpos          ← gs_stub.trxpos_q
//   trxreg          ← gs_stub.trxreg_q
//   data_valid      ← gif_packed_stub.image_data_valid
//   data_qword      ← gif_packed_stub.image_data
//   data_last       ← gif_packed_stub.image_data_last
//   data_ready      → gif_packed_stub.image_data_ready (Ch110).
//                     The GIF FSM uses it to gate in_ready only in
//                     S_IMAGE+FLG=2; dmac.ep_ready follows
//                     gif.in_ready directly (no TB-level AND).
//   vram_we / waddr / wdata / wbe / wmask → muxed into vram_stub's
//     write port (the TB selects between the engine, the raster
//     channel, and any TB-direct path).
//   busy → high while a transfer is active (between trxdir_wr arm
//     and the last lane emit). TB uses this for the vram_stub
//     write-port mux.
//
// What this stub does NOT do:
//   - Source-direction (local→host or local→local) transfers.
//   - PSMCT24 / PSMZ-* image transfers (not currently exercised
//     in the demo flow).
//   - Mid-transfer TRXDIR re-arm or interleaving with REGLIST.
//   - HWREG-side legacy/non-PSM-aware swizzle (out of scope —
//     PSMCT32 since Ch121, PSMCT16 since Ch127, PSMT8 since
//     Ch133, PSMT4 since Ch139 all support the canonical PCSX2
//     swizzle behind their respective parameter gates).
//   - HWREG via privileged-MMIO (the real PS2 path that reads
//     pixel data through the privileged HWREG register at
//     0x12001000); IMAGE-mode GIF qwords are the only data
//     source modelled here.

`timescale 1ns/1ps

// gif_image_xfer_stub — host→local GS image upload engine.
//
// Armed by a one-cycle `trxdir_wr_pulse` with XDIR == 0 and a non-empty
// rect. At arm time the destination fields of BITBLTBUF / TRXPOS /
// TRXREG are snapshotted; afterwards each accepted IMAGE qword is
// buffered and drained one pixel ("lane") per clock onto the VRAM
// write port. The transfer ends when RRW * RRH pixels have been
// stepped; `data_last` is not consulted.
module gif_image_xfer_stub
    import trace_pkg::*;
#(
    // The four swizzle gates below are independent of each other and
    // all default to 0 (legacy linear addressing), so existing
    // image-xfer TBs keep their expectations.

    // Ch121 — when set, PSMCT32 uploads take the per-pixel VRAM byte
    // address from the PS2 GS page/block swizzle
    // (gs_swizzle_psmct32_stub) instead of the linear formula
    // `dest_base + (DSAY+y)*row_stride + (DSAX+x)*4`. No other PSM is
    // affected by this parameter.
    parameter bit PSMCT32_SWIZZLE = 1'b0,

    // Ch127 — when set, PSMCT16 uploads take the address from the
    // canonical page/block/column swizzle (gs_swizzle_psmct16_stub)
    // instead of `dest_base + (DSAY+y)*row_stride + (DSAX+x)*2`.
    // Mirrors the Ch126 PCRTC read-side wiring at the upload write
    // side (second integration point for the Ch125 primitive).
    parameter bit PSMCT16_SWIZZLE = 1'b0,

    // Ch133 — when set, PSMT8 uploads take the address from
    // gs_swizzle_psmt8_stub instead of
    // `dest_base + (DSAY+y)*row_stride + (DSAX+x)*1`. PSMT8 pages are
    // 128 px wide so the swizzle internally uses `bw_pg = DBW >> 1` —
    // PCSX2 asserts DBW must be even for PSMT8 at
    // GSLocalMemory.h:553. Default 0 keeps the Ch117 PSMT8 and Ch107
    // palette-path TBs linear. Mirrors the Ch132 PCRTC read-side
    // wiring (second integration point for the Ch131 primitive).
    parameter bit PSMT8_SWIZZLE   = 1'b0,

    // Ch139 — when set, PSMT4 uploads take the address from
    // gs_swizzle_psmt4_stub instead of
    // `dest_base + (DSAY+y)*row_stride + (DSAX+x)*0.5`. PSMT4 pages
    // are 128 px wide AND 128 px tall; the swizzle internally uses
    // `bw_pg = DBW >> 1` — PCSX2 asserts DBW must be even for PSMT4
    // at GSLocalMemory.h:560. The swizzle also outputs `nibble_hi`,
    // which replaces the linear x_eff[0] nibble selector because the
    // canonical PCSX2 column table reorders nibbles within a block.
    // The Ch118 nibble RMW machinery (write_be=4'b0001 + write_mask
    // 0x0F or 0xF0) is unchanged; only its selector differs. Default
    // 0 keeps the Ch118 PSMT4 and Ch107 PSMT4-e2e TBs linear.
    // Mirrors the Ch138 PCRTC read-side wiring (second integration
    // point for the Ch137 primitive).
    parameter bit PSMT4_SWIZZLE   = 1'b0
)(
    input  logic         clk,
    input  logic         rst_n,            // synchronous, active low

    // Arm input — pulses for one cycle on TRXDIR commit.
    input  logic         trxdir_wr_pulse,
    input  logic [63:0]  trxdir,
    input  logic [63:0]  bitbltbuf,
    input  logic [63:0]  trxpos,
    input  logic [63:0]  trxreg,

    // IMAGE qword stream from gif_packed_stub.
    input  logic         data_valid,
    input  logic [127:0] data_qword,
    input  logic         data_last,        // not consulted; the pixel count ends the transfer
    output logic         data_ready,

    // VRAM write port. PSM-aware byte enables + per-bit merge mask:
    //   PSMCT32 (Ch110): be = 4'b1111, mask = 0xFFFFFFFF.
    //   PSMCT16 (Ch116): be = 4'b0011, mask = 0xFFFFFFFF.
    //   PSMT8   (Ch117): be = 4'b0001, mask = 0xFFFFFFFF.
    //   PSMT4   (Ch118): be = 4'b0001, mask = 0x0000_000F (low
    //                    nibble) or 0x0000_00F0 (high nibble). The
    //                    4-bit index sits at the matching nibble
    //                    position in write_data[7:0].
    output logic         vram_we,
    output logic [31:0]  vram_waddr,
    output logic [31:0]  vram_wdata,
    output logic [3:0]   vram_wbe,
    output logic [31:0]  vram_wmask,

    // Engine status: high from arm until the last pixel has stepped.
    output logic         busy
);

    // ------------------------------------------------------------------
    // Live register field decode (sampled only on the arm cycle).
    // ------------------------------------------------------------------

    // BITBLTBUF (real PS2 layout, per PCSX2 GSRegs.h):
    //   [13:0] SBP  [21:16] SBW  [29:24] SPSM
    //   [45:32] DBP [53:48] DBW  [61:56] DPSM
    logic [13:0] dbp;
    logic [5:0]  dbw;
    logic [5:0]  dpsm;
    assign dbp  = bitbltbuf[45:32];
    assign dbw  = bitbltbuf[53:48];
    assign dpsm = bitbltbuf[61:56];

    // TRXPOS:
    //   [10:0] SSAX [26:16] SSAY [42:32] DSAX [58:48] DSAY [60:59] DIR
    logic [10:0] dsax;
    logic [10:0] dsay;
    assign dsax = trxpos[42:32];
    assign dsay = trxpos[58:48];

    // TRXREG:
    //   [11:0] RRW [43:32] RRH
    logic [11:0] rrw;
    logic [11:0] rrh;
    assign rrw = trxreg[11:0];
    assign rrh = trxreg[43:32];

    // TRXDIR: XDIR is bits [1:0]; 0 selects host→local.
    logic [1:0]  xdir;
    assign xdir = trxdir[1:0];

    // Destination pixel format of the transfer being armed.
    logic arm_is_ct32;
    logic arm_is_ct16;
    logic arm_is_t8;
    logic arm_is_t4;
    assign arm_is_ct32 = (dpsm == 6'h00);
    assign arm_is_ct16 = (dpsm == 6'h02);
    assign arm_is_t8   = (dpsm == 6'h13);
    assign arm_is_t4   = (dpsm == 6'h14);

    // Target pixel count = RRW * RRH. 4095 * 4095 fits in 24 bits.
    logic [23:0] pix_total_calc;
    assign pix_total_calc = {12'd0, rrw} * {12'd0, rrh};

    // Arm only host→local transfers with at least one pixel. A
    // zero-area rect (RRW == 0 or RRH == 0) never arms: there is
    // nothing to write, and any IMAGE qwords that follow are accepted
    // and dropped in S_IDLE (data_ready stays high there).
    logic arm_c;
    assign arm_c = trxdir_wr_pulse && (xdir == 2'd0)
                   && (pix_total_calc != 24'd0);

    // ------------------------------------------------------------------
    // Transfer state.
    // ------------------------------------------------------------------

    // Parameters snapshotted at arm time.
    logic [13:0] dbp_q;
    logic [5:0]  dbw_q;
    logic [5:0]  dpsm_q;
    logic [10:0] dsax_q;
    logic [10:0] dsay_q;
    logic [11:0] rrw_q;
    logic [11:0] rrh_q;
    logic [31:0] dest_base_q;   // DBP * 256 (bytes)
    logic [31:0] row_stride_q;  // DBW * 64 * bpp (bytes)
    logic        psmct32_q;     // DPSM == 0x00 → 4 bytes/pixel
    logic        psmct16_q;     // DPSM == 0x02 → 2 bytes/pixel (Ch116)
    logic        psmt8_q;       // DPSM == 0x13 → 1 byte/pixel (Ch117)
    logic        psmt4_q;       // DPSM == 0x14 → 0.5 byte/pixel (Ch118)
    // Index of the last lane in a qword: 3 (PSMCT32), 7 (PSMCT16),
    // 15 (PSMT8), 31 (PSMT4). Unsupported PSMs use 3 (silent consume).
    logic [4:0]  lane_last_q;

    // Buffered qword being drained and the lane currently on the port.
    logic [127:0] qword_q;
    logic [4:0]   lane_q;
    logic         lane_valid_q;

    // Pixel cursor within the destination rect, relative to
    // (DSAX, DSAY); cur_x wraps at RRW.
    logic [11:0] cur_x_q;
    logic [11:0] cur_y_q;
    logic [23:0] pix_total_q;
    logic [23:0] pix_done_q;

    typedef enum logic [1:0] {
        S_IDLE = 2'd0,
        S_RUN  = 2'd1
    } state_e;
    state_e state;

    assign busy = (state == S_RUN);

    // Ready whenever no qword is being drained. This is also true in
    // S_IDLE, where an offered qword is accepted but not latched; the
    // upstream is expected to pulse data_valid only in S_IMAGE.
    assign data_ready = !lane_valid_q;

    // ------------------------------------------------------------------
    // Swizzled address candidates (Ch121 / Ch127 / Ch133 / Ch139).
    //
    // Declared ahead of the lane mux because that mux reads
    // swizzle4_nibble_hi. Every swizzle stub is fed FBP=0, FBW=dbw_q
    // and the effective coordinates (DSAX+cur_x, DSAY+cur_y), so its
    // output is treated as a within-buffer byte offset; dest_base_q
    // (= DBP*256) anchors it at the same base the linear path uses.
    // The stubs are combinational instances whose outputs are only
    // selected when the matching parameter gate is set.
    // ------------------------------------------------------------------
    logic [11:0] swizzle_x_in;
    logic [11:0] swizzle_y_in;
    assign swizzle_x_in = dsax_q + cur_x_q;   // 12-bit wrapped sum
    assign swizzle_y_in = dsay_q + cur_y_q;   // 12-bit wrapped sum

    // PSMCT32 (page 64 px wide; DBW and FBW share 64-pixel units).
    logic [31:0] cur_addr_swizzled_c;
    logic [31:0] swizzle_addr_off;
    gs_swizzle_psmct32_stub u_swizzle (
        .fbp (9'd0),
        .fbw (dbw_q),
        .x   (swizzle_x_in),
        .y   (swizzle_y_in),
        .addr(swizzle_addr_off)
    );
    assign cur_addr_swizzled_c = dest_base_q + swizzle_addr_off;

    // PSMCT16 (64×64 page, 16×8 blocks, columnTable16 inside the stub).
    logic [31:0] cur_addr_swizzled16_c;
    logic [31:0] swizzle16_addr_off;
    gs_swizzle_psmct16_stub u_swizzle16 (
        .fbp (9'd0),
        .fbw (dbw_q),
        .x   (swizzle_x_in),
        .y   (swizzle_y_in),
        .addr(swizzle16_addr_off)
    );
    assign cur_addr_swizzled16_c = dest_base_q + swizzle16_addr_off;

    // PSMT8 (128 px wide pages; stub uses bw_pg = DBW>>1, DBW even).
    logic [31:0] cur_addr_swizzled8_c;
    logic [31:0] swizzle8_addr_off;
    gs_swizzle_psmt8_stub u_swizzle8 (
        .fbp (9'd0),
        .fbw (dbw_q),
        .x   (swizzle_x_in),
        .y   (swizzle_y_in),
        .addr(swizzle8_addr_off)
    );
    assign cur_addr_swizzled8_c = dest_base_q + swizzle8_addr_off;

    // PSMT4 (128×128 px pages; stub uses bw_pg = DBW>>1, DBW even).
    // Besides the byte offset the stub reports which nibble of that
    // byte holds the pixel.
    logic [31:0] cur_addr_swizzled4_c;
    logic [31:0] swizzle4_addr_off;
    logic        swizzle4_nibble_hi;
    gs_swizzle_psmt4_stub u_swizzle4 (
        .fbp      (9'd0),
        .fbw      (dbw_q),
        .x        (swizzle_x_in),
        .y        (swizzle_y_in),
        .addr     (swizzle4_addr_off),
        .nibble_hi(swizzle4_nibble_hi)
    );
    assign cur_addr_swizzled4_c = dest_base_q + swizzle4_addr_off;

    // ------------------------------------------------------------------
    // Linear address + per-PSM lane extraction for the in-flight lane.
    //
    //   addr = dest_base + (DSAY+cur_y) * row_stride + x_off
    //   x_off = (DSAX+cur_x) * 4   PSMCT32 (and unsupported PSMs)
    //           (DSAX+cur_x) * 2   PSMCT16
    //           (DSAX+cur_x)       PSMT8
    //           (DSAX+cur_x) >> 1  PSMT4, nibble picked by bit 0
    //   (row_stride already carries the bpp factor.)
    // ------------------------------------------------------------------
    logic [31:0] cur_addr_c;
    logic [31:0] cur_data_c;
    logic [3:0]  cur_be_c;
    logic [31:0] cur_mask_c;
    always_comb begin
        logic [31:0] x_sum;                // DSAX + cur_x, full width
        logic [31:0] x_off;
        logic [11:0] x_eff;                // DSAX + cur_x, 12-bit wrapped
        logic [3:0]  t4_nibble;
        logic        psmt4_nibble_select;  // 1 → high nibble of the byte

        x_sum = 32'(dsax_q) + 32'(cur_x_q);
        x_eff = dsax_q + cur_x_q;

        if (psmt4_q)
            x_off = {21'd0, x_eff[11:1]};
        else if (psmt8_q)
            x_off = x_sum;
        else if (psmct16_q)
            x_off = x_sum * 32'd2;
        else
            x_off = x_sum * 32'd4;

        cur_addr_c = dest_base_q
                   + (32'(dsay_q) + 32'(cur_y_q)) * row_stride_q
                   + x_off;

        // Nibble `lane_q` of the qword (lane*4 +: 4, lane in 0..31).
        t4_nibble = qword_q[{lane_q, 2'b00} +: 4];

        // Linear layout: even pixel → low nibble, odd → high nibble.
        // Swizzled layout (Ch139): the swizzle stub decides.
        psmt4_nibble_select = PSMT4_SWIZZLE ? swizzle4_nibble_hi
                                            : x_eff[0];

        cur_mask_c = 32'hFFFF_FFFF;
        if (psmt4_q) begin
            // 32 pixels/qword. Single-byte commit; the mask narrows
            // it to the targeted nibble so the other nibble of the
            // same byte is left for vram_stub's merge to preserve.
            cur_be_c = 4'b0001;
            if (psmt4_nibble_select) begin
                cur_data_c = {24'd0, t4_nibble, 4'd0};
                cur_mask_c = 32'h0000_00F0;
            end else begin
                cur_data_c = {28'd0, t4_nibble};
                cur_mask_c = 32'h0000_000F;
            end
        end else if (psmt8_q) begin
            // 16 pixels/qword; index goes in the low byte.
            cur_be_c   = 4'b0001;
            cur_data_c = {24'd0, qword_q[{lane_q[3:0], 3'b000} +: 8]};
        end else if (psmct16_q) begin
            // 8 pixels/qword; value goes in the low halfword.
            cur_be_c   = 4'b0011;
            cur_data_c = {16'd0, qword_q[{lane_q[2:0], 4'b0000} +: 16]};
        end else begin
            // PSMCT32 (and the silent-consume fallback): 4 pixels/qword.
            cur_be_c   = 4'b1111;
            cur_data_c = qword_q[{lane_q[1:0], 5'b00000} +: 32];
        end
    end

    // ------------------------------------------------------------------
    // VRAM write port — one write per drained lane, and only for a
    // supported DPSM. Unsupported PSMs still drain lanes (keeping
    // gif_packed_stub in step) but never assert vram_we.
    // ------------------------------------------------------------------
    logic emit_now;
    assign emit_now = lane_valid_q &&
                      (psmct32_q || psmct16_q || psmt8_q || psmt4_q);

    assign vram_we    = emit_now;
    assign vram_waddr = (PSMCT32_SWIZZLE && psmct32_q) ? cur_addr_swizzled_c   :
                        (PSMCT16_SWIZZLE && psmct16_q) ? cur_addr_swizzled16_c :
                        (PSMT8_SWIZZLE   && psmt8_q)   ? cur_addr_swizzled8_c  :
                        (PSMT4_SWIZZLE   && psmt4_q)   ? cur_addr_swizzled4_c  :
                                                         cur_addr_c;
    assign vram_wdata = cur_data_c;
    assign vram_wbe   = cur_be_c;
    assign vram_wmask = cur_mask_c;

    // ------------------------------------------------------------------
    // Cursor step: advance x, wrapping to the next row at RRW.
    // ------------------------------------------------------------------
    logic [11:0] next_x;
    logic [11:0] next_y;
    logic        wrap_row;
    assign wrap_row = (cur_x_q + 12'd1 == rrw_q);
    assign next_x   = wrap_row ? 12'd0 : (cur_x_q + 12'd1);
    assign next_y   = wrap_row ? (cur_y_q + 12'd1) : cur_y_q;

    // ------------------------------------------------------------------
    // Control FSM.
    // ------------------------------------------------------------------
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state        <= S_IDLE;
            dbp_q        <= 14'd0;
            dbw_q        <= 6'd0;
            dpsm_q       <= 6'd0;
            dsax_q       <= 11'd0;
            dsay_q       <= 11'd0;
            rrw_q        <= 12'd0;
            rrh_q        <= 12'd0;
            dest_base_q  <= 32'd0;
            row_stride_q <= 32'd0;
            psmct32_q    <= 1'b0;
            psmct16_q    <= 1'b0;
            psmt8_q      <= 1'b0;
            psmt4_q      <= 1'b0;
            lane_last_q  <= 5'd3;
            qword_q      <= 128'd0;
            lane_q       <= 5'd0;
            lane_valid_q <= 1'b0;
            cur_x_q      <= 12'd0;
            cur_y_q      <= 12'd0;
            pix_total_q  <= 24'd0;
            pix_done_q   <= 24'd0;
        end else begin
            unique case (state)
                S_IDLE: begin
                    if (arm_c) begin
                        // Snapshot every transfer parameter.
                        dbp_q       <= dbp;
                        dbw_q       <= dbw;
                        dpsm_q      <= dpsm;
                        dsax_q      <= dsax;
                        dsay_q      <= dsay;
                        rrw_q       <= rrw;
                        rrh_q       <= rrh;
                        dest_base_q <= {10'd0, dbp, 8'd0};   // DBP * 256

                        // row_stride = DBW * 64 * bpp:
                        //   PSMT4 → DBW*32, PSMT8 → DBW*64,
                        //   PSMCT16 → DBW*128, PSMCT32 → DBW*256.
                        // Unsupported PSMs reuse the PSMCT32 stride
                        // (they never emit a write anyway).
                        row_stride_q <= arm_is_t4   ? (32'(dbw) << 5)
                                      : arm_is_t8   ? (32'(dbw) << 6)
                                      : arm_is_ct16 ? (32'(dbw) << 7)
                                      :               (32'(dbw) << 8);

                        psmct32_q <= arm_is_ct32;
                        psmct16_q <= arm_is_ct16;
                        psmt8_q   <= arm_is_t8;
                        psmt4_q   <= arm_is_t4;

                        // Lanes per qword minus one.
                        lane_last_q <= arm_is_t4   ? 5'd31
                                     : arm_is_t8   ? 5'd15
                                     : arm_is_ct16 ? 5'd7
                                     :               5'd3;

                        cur_x_q      <= 12'd0;
                        cur_y_q      <= 12'd0;
                        pix_total_q  <= pix_total_calc;
                        pix_done_q   <= 24'd0;
                        lane_valid_q <= 1'b0;
                        state        <= S_RUN;
                    end
                end

                S_RUN: begin
                    if (data_valid && data_ready) begin
                        // Buffer empty (data_ready == !lane_valid_q):
                        // latch a fresh qword to drain.
                        qword_q      <= data_qword;
                        lane_q       <= 5'd0;
                        lane_valid_q <= 1'b1;
                    end else if (lane_valid_q) begin
                        // Drain one lane per cycle. The cursor and
                        // done-count step for unsupported PSMs too.
                        if (lane_q == lane_last_q) begin
                            lane_valid_q <= 1'b0;
                            lane_q       <= 5'd0;
                        end else begin
                            lane_q <= lane_q + 5'd1;
                        end

                        cur_x_q    <= next_x;
                        cur_y_q    <= next_y;
                        pix_done_q <= pix_done_q + 24'd1;

                        // Last pixel of the rect: return to S_IDLE in
                        // the same cycle and drop any lanes left in
                        // the buffered qword. These assignments come
                        // last so they override the lane step above.
                        // Rects whose pixel count is not a multiple of
                        // the lanes-per-qword lose those trailing
                        // lanes silently.
                        if (pix_done_q + 24'd1 >= pix_total_q) begin
                            state        <= S_IDLE;
                            lane_valid_q <= 1'b0;
                            lane_q       <= 5'd0;
                        end
                    end
                end

                default: state <= S_IDLE;
            endcase
        end
    end

endmodule : gif_image_xfer_stub
|