Files
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

174 lines
7.8 KiB
Systemverilog

// ============================================================================
// gs_lpddr_scanout.sv (Ch320 Brick 1)
//
// LPDDR4B-backed scanout for the SMALL framebuffer demo (explicitly scoped to
// a tiny frame — default 64x64 PSMCT16 = 8 KiB). Instead of ao486-style line
// buffering/streaming, it copies the WHOLE framebuffer from LPDDR4B into a
// small on-chip cache once per frame, then serves scanout pixels from the
// cache. When the framebuffer grows past "tiny demo", revisit ao486's
// vga_fb_ddr line-buffer approach.
//
// Pixel mapping is automatic: the GS writer mirrors BRAM VRAM byte-for-byte
// into LPDDR4B, so cache[addr] == BRAM_VRAM[addr]. We index the cache by the
// PCRTC's own `vram_read_addr`, so the decoded pixel is identical to the BRAM
// scanout pixel for the same raster position — the video-source mux is seamless
// regardless of swizzle/MAG.
//
// Two clock domains:
// axi_clk (emif_clk, ~310 MHz) — fill the cache from LPDDR4B over AXI4
// video_clk (design_clk) — index the cache by vram_read_addr -> r/g/b
//
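// Fill starts on a frame_start (vsync) transition — either edge of the synchronized
// signal, sampled while the fill FSM is idle and `enable` is high — and is expected
// to complete during vblank (256 serialized single-beat reads, estimated ~ 1 us);
// scanout reads the stable cache during the active region. The read channel is
// shared with the Ch319 read-probe via an external arbiter.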
// ============================================================================
`timescale 1ns/1ps
// ----------------------------------------------------------------------------
// gs_lpddr_scanout
//
// Copies CACHE_BEATS 256-bit beats from LPDDR (starting at FB_BASE) into an
// on-chip cache using single-beat AXI4 reads, then serves r/g/b pixels from that
// cache, indexed by the PCRTC's `vram_read_addr`.
//
// Fill side (axi_clk):
//   * `frame_start` is passed through a 3-flop shift register; ANY transition
//     (rising or falling) seen between stages 1 and 2 while the FSM is idle and
//     `enable` is high starts a refill. Transitions that arrive while a fill is
//     in progress are ignored.
//   * One AR is issued per beat and the FSM waits for its R beat before issuing
//     the next AR (reads are fully serialized). `rlast` is not examined.
//   * `rresp` only feeds the `rd_errs` counter; `cache_valid` is raised after
//     the last beat regardless of reported errors.
//
// Scanout side (video_clk):
//   * The cache lookup is registered (1-cycle latency).
//   * Output is forced to black when the cache is not valid or the address is
//     outside [VRAM_BASE, VRAM_BASE + CACHE_BEATS*32).
//   * `cache_valid` is produced in the axi_clk domain, so it is brought into
//     the video_clk domain through a 2-flop synchronizer before it gates the
//     pixel output.
//
// NOTE(review): the address slicing assumes CACHE_BEATS >= 2 (BEAT_BITS > 0) —
// confirm if smaller caches are ever instantiated.
// ----------------------------------------------------------------------------
module gs_lpddr_scanout #(
    parameter [29:0] FB_BASE     = 30'd0,   // LPDDR byte base to FETCH the framebuffer from
    parameter int    CACHE_BEATS = 256,     // 256 * 32 B = 8 KiB = 64x64 PSMCT16
    // Ch324 — the LPDDR fetch base (FB_BASE) and the PCRTC vram_read_addr base can DIFFER:
    // the spill framebuffer lives at COLOR_SPILL_BASE in LPDDR but the PCRTC addresses it
    // BRAM-relative (0-based). VRAM_BASE is the vram_read_addr origin (defaults to FB_BASE
    // for the Ch320/321 mirror case where they coincide).
    parameter [29:0] VRAM_BASE   = FB_BASE,
    // Ch324 — pixel format: 0 = PSMCT16 (RGBA5551, 16 px/beat), 1 = PSMCT32 (ABGR, 8 px/beat).
    parameter bit    PSMCT32     = 1'b0
)(
    // ---- AXI read clock domain (emif_clk) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,      // synchronous, active-low reset of the fill side
    input  logic         enable,         // 1 = refill the cache on frame_start
    input  logic         frame_start,    // video-domain pulse/level (vsync); synced internally
    // ---- video clock domain (design_clk) ----
    input  logic         video_clk,
    input  logic [31:0]  vram_read_addr, // byte address, VRAM_BASE-origin
    output logic [7:0]   r,
    output logic [7:0]   g,
    output logic [7:0]   b,
    // ---- status (axi_clk domain; the bridge syncs these) ----
    output logic         cache_valid,    // a full frame has been loaded
    output logic [31:0]  rd_beats,       // beats read (cumulative)
    output logic [31:0]  rd_errs,        // non-OKAY read responses (cumulative)
    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,          // not used: every transaction is a single beat
    input  logic         rvalid,
    output logic         rready
);

    localparam int BEAT_BITS = $clog2(CACHE_BEATS);  // 8 for 256
    localparam int FB_SPAN   = CACHE_BEATS * 32;     // 8192 bytes

    // Constant AR attributes: single-beat, 32-byte INCR reads.
    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd2;    // distinct from writer (0) and read-probe (1)
    assign arlen   = 8'd0;    // single beat
    assign arsize  = 3'b101;  // 32 bytes

    // Frame cache: one 256-bit word per 32-byte beat.
    // Written on axi_clk, read on video_clk.
    logic [255:0] cache [0:CACHE_BEATS-1];

    // ---------------- fill side (axi_clk) ----------------
    // frame_start synchronizer / edge detector. Comparing stage 2 with stage 1
    // fires on BOTH rising and falling transitions of the synchronized signal.
    logic [2:0] fs_sync;
    wire        fs_edge = (fs_sync[2] != fs_sync[1]);

    typedef enum logic [1:0] { F_IDLE, F_AR, F_R } fstate_t;
    fstate_t fst;

    logic [BEAT_BITS:0] beat;  // 0..CACHE_BEATS (extra bit for the terminal compare)

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            fs_sync     <= 3'd0;
            fst         <= F_IDLE;
            beat        <= '0;
            araddr      <= '0;
            arvalid     <= 1'b0;
            rready      <= 1'b0;
            cache_valid <= 1'b0;
            rd_beats    <= 32'd0;
            rd_errs     <= 32'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], frame_start};

            case (fst)
                // Wait for a frame_start transition; invalidate the cache and
                // request beat 0.
                F_IDLE: begin
                    if (fs_edge && enable) begin
                        beat        <= '0;
                        cache_valid <= 1'b0;
                        araddr      <= FB_BASE;
                        arvalid     <= 1'b1;
                        fst         <= F_AR;
                    end
                end

                // Hold arvalid/araddr until the address is accepted.
                F_AR: begin
                    if (arready) begin
                        arvalid <= 1'b0;
                        rready  <= 1'b1;
                        fst     <= F_R;
                    end
                end

                // rready is high for the whole state, so rvalid alone marks
                // the data handshake.
                F_R: begin
                    if (rvalid) begin
                        cache[beat[BEAT_BITS-1:0]] <= rdata;
                        rready   <= 1'b0;
                        rd_beats <= rd_beats + 32'd1;
                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;

                        if (beat == CACHE_BEATS-1) begin
                            // Last beat stored: the cache now holds a full frame.
                            cache_valid <= 1'b1;
                            fst         <= F_IDLE;
                        end else begin
                            // Next beat lives 32 bytes further on.
                            beat    <= beat + 1'b1;
                            araddr  <= FB_BASE + (({{(30-BEAT_BITS-1){1'b0}}, (beat + 1'b1)}) << 5);
                            arvalid <= 1'b1;
                            fst     <= F_AR;
                        end
                    end
                end

                default: fst <= F_IDLE;
            endcase
        end
    end

    // ---------------- scanout side (video_clk) ----------------
    // Byte offset of the requested pixel within the framebuffer (vram_read_addr is BRAM-relative
    // = VRAM_BASE-origin; the cache holds the SAME bytes fetched from FB_BASE in LPDDR).
    wire [31:0]          off     = vram_read_addr - {2'b00, VRAM_BASE};
    wire [BEAT_BITS-1:0] beat_ix = off[BEAT_BITS+4 -: BEAT_BITS];  // off>>5, low BEAT_BITS
    wire [3:0]           hw_sel  = off[4:1];                       // PSMCT16: 16 halfwords / beat
    wire [2:0]           w_sel   = off[4:2];                       // PSMCT32: 8 words / beat

    // cache_valid is an axi_clk-domain register. Synchronize it into video_clk
    // with two flops before it gates the pixel output, so that no metastable
    // value can reach r/g/b and so that "valid" is observed only after the last
    // cache write has been committed. The video side has no reset, hence the
    // declaration-time initial value (keeps the output black from time zero).
    logic [1:0] cv_sync = 2'b00;
    always_ff @(posedge video_clk) begin
        cv_sync <= {cv_sync[0], cache_valid};
    end

    wire in_range = cv_sync[1]
                 && (vram_read_addr >= {2'b00, VRAM_BASE})
                 && (off < FB_SPAN);

    // Registered (sync-read) cache lookup — 1-cycle latency to match the PCRTC's
    // VRAM_SYNC_READ pixel timing so the muxed output aligns with PCRTC de/sync.
    // Split the array-index and the part-select across the register boundary
    // (chained index+part-select in one expr trips iverilog-12).
    logic [255:0] word_q;
    logic [3:0]   hw_q;
    logic [2:0]   w_q;
    logic         in_range_q;

    always_ff @(posedge video_clk) begin
        word_q     <= cache[beat_ix];
        hw_q       <= hw_sel;
        w_q        <= w_sel;
        in_range_q <= in_range;
    end

    wire [15:0] px16_q = word_q[hw_q*16 +: 16];
    wire [31:0] px32_q = word_q[w_q*32 +: 32];

    // PSMCT16 (RGBA5551): R[4:0] G[9:5] B[14:10], 5->8 by bit-replication ({c5,c5[4:2]}).
    wire [4:0] r5  = px16_q[4:0];
    wire [4:0] g5  = px16_q[9:5];
    wire [4:0] b5  = px16_q[14:10];
    wire [7:0] r16 = {r5, r5[4:2]}, g16 = {g5, g5[4:2]}, b16 = {b5, b5[4:2]};

    // PSMCT32 (ABGR8888): R[7:0] G[15:8] B[23:16] (A discarded) — identical decode to gs_pcrtc.
    wire [7:0] r32 = px32_q[7:0], g32 = px32_q[15:8], b32 = px32_q[23:16];

    // Out-of-range or not-yet-valid lookups produce black.
    assign r = !in_range_q ? 8'd0 : (PSMCT32 ? r32 : r16);
    assign g = !in_range_q ? 8'd0 : (PSMCT32 ? g32 : g16);
    assign b = !in_range_q ? 8'd0 : (PSMCT32 ? b32 : b16);

endmodule