Files
retroDE_ps2/rtl/gif_gs/gs_lpddr_scanout_lb.sv
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

214 lines
11 KiB
Systemverilog

// ============================================================================
// gs_lpddr_scanout_lb.sv (Ch321 Brick 2)
//
// LINE-BUFFER LPDDR4B scanout — the architectural successor to the whole-frame
// cache (gs_lpddr_scanout). Instead of mirroring the entire framebuffer in
// on-chip RAM (which defeats the point of putting the FB in LPDDR), this holds
// just TWO scanlines: it displays row L from one buffer while prefetching row
// L+1 into the other. On-chip cost is O(width), not O(width*height).
//
// NARROW SCOPE (Ch321): the 128x128 PSMCT16 demo. The frame is LINEAR (the GS
// writer mirrors the rasterizer's linear flush addresses), display window at
// origin, 1:1 (MAG off) — so the reader serves pixel (col=pixel_x, line=pixel_y)
// directly when inside the window. No general MAG/window handling beyond that.
//
// Two clock domains:
// axi_clk (emif_clk) — ROW_BEATS single-beat AXI4 reads (arlen=0) load one row into a buffer
// video_clk (design) — pixel_x/pixel_y index the active line buffer -> r/g/b
//
// Prefetch: there is no request handshake. The video side exposes the current
// in-window display row; the axi side free-runs, fetching rows sequentially into
// alternating buffers (row L -> buffer L&1) and staying at most one row ahead.
// `underflow` flags any in-window line displayed before its row finished loading.
// ============================================================================
`timescale 1ns/1ps
// ----------------------------------------------------------------------------
// gs_lpddr_scanout_lb — two-line-buffer scanout reader.
//
// axi_clk side : a small FSM issues ROW_BEATS single-beat reads per framebuffer
//                row and stores them in lb0 (even rows) or lb1 (odd rows). It
//                only fetches row `next_fetch` while next_fetch <= disp_row + 1.
// video_clk side: pixel_x / pixel_y select buffer, beat and lane; the decoded
//                pixel appears on r/g/b one video_clk later (registered read).
//
// Clock-domain crossings inside this module:
//   frame_start -> 3-stage shift register in each domain; ANY change of level
//                  (rising or falling) is treated as a frame restart.
//   disp_row_v  -> axi_clk, plain binary through two flops (see note at the FSM).
//   next_fetch  -> video_clk, GRAY-CODED through two flops (see underflow logic).
// ----------------------------------------------------------------------------
module gs_lpddr_scanout_lb #(
    parameter [29:0] FB_BASE      = 30'd0,
    parameter int    STRIDE_BYTES = 256,   // PSMCT16 128px*2B=256; PSMCT32 128px*4B=512
    parameter int    ROW_BEATS    = 8,     // STRIDE_BYTES / 32 (PSMCT16 128px=8; PSMCT32 128px=16)
    parameter int    N_ROWS       = 128,
    // Ch327a — PSMCT32 (ABGR8888, 8 px/256-bit beat) vs the original PSMCT16 (RGBA5551,
    // 16 px/beat). The Ch326 LPDDR-only spill framebuffer is PSMCT32 @ COLOR_SPILL_BASE, so the
    // line-buffer must decode it — NOT a config flip of the Ch321 PSMCT16/FB-at-0 path.
    parameter bit    PSMCT32      = 1'b0
)(
    // ---- AXI read clock domain (emif_clk) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    // NOTE(review): `enable` is sampled directly in BOTH clock domains without a
    // synchronizer — presumably quasi-static; confirm with the integrating bridge.
    input  logic         enable,        // 1 = active (prefetch + serve)
    // ---- video clock domain (design_clk) ----
    input  logic         video_clk,
    input  logic         frame_start,   // vsync pulse/level (synced internally; either edge restarts)
    input  logic [11:0]  pixel_x,       // raster column (display)
    input  logic [11:0]  pixel_y,       // raster line (display)
    input  logic         in_window,     // PCRTC displayed-frame window gate
    output logic [7:0]   r,
    output logic [7:0]   g,
    output logic [7:0]   b,
    // ---- status ----
    // line_valid and rd_errs are registered on axi_clk; underflow is registered on
    // video_clk (it is computed next to the pixel read). The bridge syncs as needed.
    output logic         line_valid,    // at least one row has been loaded (axi_clk)
    output logic         underflow,     // a pixel was read before its row was ready (sticky, video_clk)
    output logic [31:0]  rd_errs,       // non-OKAY read responses (cumulative, axi_clk)
    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,         // unused: every read is a single beat (arlen=0)
    input  logic         rvalid,
    output logic         rready
);
    localparam int RB_BITS = $clog2(ROW_BEATS);   // 3 for 8
    localparam int ROW_W   = $clog2(N_ROWS) + 1;  // row counters must be able to hold N_ROWS itself

    // Binary <-> Gray helpers for the next_fetch clock-domain crossing.
    function automatic logic [ROW_W-1:0] bin2gray(input logic [ROW_W-1:0] bin);
        return bin ^ (bin >> 1);
    endfunction

    function automatic logic [ROW_W-1:0] gray2bin(input logic [ROW_W-1:0] gray);
        logic [ROW_W-1:0] bin;
        // bin[i] is the XOR of gray[ROW_W-1:i].
        for (int i = 0; i < ROW_W; i++) bin[i] = ^(gray >> i);
        return bin;
    endfunction

    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd3;    // distinct: writer=0, probe=1, frame-cache=2, line-buf=3
    assign arlen   = 8'd0;    // SINGLE-BEAT per read — the only AXI read pattern proven on this
                              // EMIF (writer/probe/frame-cache all use arlen=0). A multi-beat
                              // burst (arlen=ROW_BEATS-1) was untested and garbled on hardware.
    assign arsize  = 3'b101;  // 32 bytes

    // Two line buffers, ROW_BEATS x 256-bit each (one display row).
    // Written on axi_clk (row-fill FSM), read on video_clk (pixel fetch).
    logic [255:0] lb0 [0:ROW_BEATS-1];
    logic [255:0] lb1 [0:ROW_BEATS-1];

    // ================= video side (video_clk) =================
    // No miss-prone request toggle. The video side just exposes the current
    // in-window display row; the axi side free-runs, fetching rows sequentially
    // and staying one row ahead (see below). disp_row_v resets on vsync.
    logic [ROW_W-1:0] disp_row_v;
    logic [2:0]       fs_sync_v;
    wire              fs_edge_v = (fs_sync_v[2] != fs_sync_v[1]);   // any level change

    // The buffer holding display line L is L&1 (row L is fetched into L&1). Select
    // it DIRECTLY from pixel_y[0] (tracks the current pixel) — a separately-registered
    // "disp_buf" lags by one cycle and corrupts col 0 of each line.
    wire disp_buf = pixel_y[0];

    always_ff @(posedge video_clk) begin
        if (!enable) begin
            disp_row_v <= '0;
            fs_sync_v  <= 3'd0;
        end else begin
            fs_sync_v <= {fs_sync_v[1:0], frame_start};
            if (fs_edge_v)
                disp_row_v <= '0;
            else if (in_window && (pixel_y < N_ROWS))
                disp_row_v <= ROW_W'(pixel_y);   // only tracks lines that map to a FB row
        end
    end

    // Registered (sync-read) pixel: pick buffer + beat + within-beat lane from pixel_x.
    // PSMCT32: 8 px/256-bit beat -> beat = pixel_x>>3, lane = pixel_x[2:0] (32-bit).
    // PSMCT16: 16 px/beat -> beat = pixel_x>>4, lane = pixel_x[3:0] (16-bit).
    localparam int PXSH       = PSMCT32 ? 3 : 4;                             // px-per-beat shift
    localparam int PX_PER_ROW = PSMCT32 ? (STRIDE_BYTES/4) : (STRIDE_BYTES/2);

    wire [RB_BITS-1:0] col_beat = pixel_x[RB_BITS+PXSH-1 -: RB_BITS];
    wire [3:0]         col_lane = PSMCT32 ? {1'b0, pixel_x[2:0]} : pixel_x[3:0];

    logic [255:0] word_q;   // beat read from the selected line buffer
    logic [3:0]   lane_q;   // lane within word_q, delayed to match word_q
    logic         in_q;     // pixel is inside the window AND inside the stored row/frame

    always_ff @(posedge video_clk) begin
        word_q <= disp_buf ? lb1[col_beat] : lb0[col_beat];
        lane_q <= col_lane;
        in_q   <= in_window && (pixel_x < PX_PER_ROW) && (pixel_y < N_ROWS);
    end

    // PSMCT32 ABGR8888 (r=[7:0],g=[15:8],b=[23:16]) — matches gs_lpddr_scanout (frame-cache).
    wire [31:0] px32 = word_q[lane_q[2:0]*32 +: 32];   // 3-bit lane: always in-range (0..224)
    wire [7:0]  r32 = px32[7:0], g32 = px32[15:8], b32 = px32[23:16];

    // PSMCT16 RGBA5551 5-bit lanes expanded to 8-bit (top bits replicated into the low bits).
    wire [15:0] px16 = word_q[lane_q*16 +: 16];
    wire [4:0]  r5 = px16[4:0], g5 = px16[9:5], b5 = px16[14:10];

    // Outside the served area the output is forced to black.
    assign r = !in_q ? 8'd0 : (PSMCT32 ? r32 : {r5, r5[4:2]});
    assign g = !in_q ? 8'd0 : (PSMCT32 ? g32 : {g5, g5[4:2]});
    assign b = !in_q ? 8'd0 : (PSMCT32 ? b32 : {b5, b5[4:2]});

    // ================= axi side (axi_clk) — row fill FSM =================
    // free-running prefetcher: fetch rows sequentially, staying <= disp_row+1 ahead.
    // disp_row crosses video->axi (slowly-changing; the +1 throttle tolerates a 1-off
    // transient). frame_start is edge-detected here to reset next_fetch every frame.
    //
    // NOTE(review): disp_row_v crosses as plain binary. Within a frame it only steps
    // L -> L+1, and next_fetch never exceeds (old disp_row)+2, so a mis-sampled value
    // can at most permit a fetch that the settled value permits one cycle later —
    // this looks benign, but confirm if pixel_y can ever jump by more than one line.
    logic [2:0]       fs_sync_e;
    wire              fs_edge_e = (fs_sync_e[2] != fs_sync_e[1]);
    logic [ROW_W-1:0] disp_row_s0, disp_row_e;
    logic [ROW_W-1:0] next_fetch;    // next row to load (0..N_ROWS)
    logic [ROW_W-1:0] nf_gray_e;     // registered Gray-coded copy of next_fetch (CDC source)

    typedef enum logic [1:0] { L_IDLE, L_AR, L_R } lstate_t;
    lstate_t          lst;
    logic [ROW_W-1:0] cur_row;
    logic             cur_buf;       // buffer being filled: 0 = lb0, 1 = lb1
    logic [RB_BITS:0] beat;          // beat index within the row being filled
    logic             fs_pending;    // a vsync restart is pending; applied in L_IDLE (never mid-read)

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            fs_sync_e   <= 3'd0;
            disp_row_s0 <= '0;
            disp_row_e  <= '0;
            next_fetch  <= '0;
            nf_gray_e   <= '0;
            lst         <= L_IDLE;
            araddr      <= '0;
            arvalid     <= 1'b0;
            rready      <= 1'b0;
            cur_row     <= '0;
            cur_buf     <= 1'b0;
            beat        <= '0;
            line_valid  <= 1'b0;
            rd_errs     <= 32'd0;
            fs_pending  <= 1'b0;
        end else begin
            fs_sync_e   <= {fs_sync_e[1:0], frame_start};
            disp_row_s0 <= disp_row_v;   // 2-FF sync of the display row
            disp_row_e  <= disp_row_s0;

            // vsync: mark a prefetch restart. DEFER it to L_IDLE so an in-flight AXI
            // read is never aborted mid-handshake (which would deadlock the slave).
            if (fs_edge_e) fs_pending <= 1'b1;

            case (lst)
                L_IDLE: begin
                    if (fs_pending) begin
                        next_fetch <= '0;   // restart prefetch sequence from row 0
                        nf_gray_e  <= '0;   // gray(0) == 0; keeps the copy in lock-step
                        fs_pending <= 1'b0;
                    end else if (enable && (next_fetch < N_ROWS) && (next_fetch <= disp_row_e + 1'b1)) begin
                        cur_row <= next_fetch;
                        cur_buf <= next_fetch[0];
                        araddr  <= FB_BASE + (next_fetch * STRIDE_BYTES);
                        beat    <= '0;
                        arvalid <= 1'b1;
                        lst     <= L_AR;
                    end
                end

                L_AR: begin
                    // Hold arvalid until the slave accepts the address.
                    if (arready) begin
                        arvalid <= 1'b0;
                        rready  <= 1'b1;
                        lst     <= L_R;
                    end
                end

                L_R: begin
                    if (rvalid) begin
                        if (cur_buf) lb1[beat[RB_BITS-1:0]] <= rdata;
                        else         lb0[beat[RB_BITS-1:0]] <= rdata;
                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                        rready <= 1'b0;
                        if (beat == ROW_BEATS-1) begin
                            line_valid <= 1'b1;
                            next_fetch <= next_fetch + 1'b1;   // advance prefetch (rows 0..next_fetch-1 loaded)
                            // Same-cycle update: nf_gray_e always equals gray(next_fetch),
                            // so the crossing adds no latency versus sampling next_fetch.
                            nf_gray_e  <= bin2gray(next_fetch + 1'b1);
                            lst        <= L_IDLE;
                        end else begin
                            // next single-beat read of this row (arlen=0 each).
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;
                            arvalid <= 1'b1;
                            lst     <= L_AR;
                        end
                    end
                end

                default: lst <= L_IDLE;
            endcase
        end
    end

    // underflow (sticky, video domain): an in-window pixel for line pixel_y is read
    // before that row was prefetched. The axi side loads rows 0..next_fetch-1, so row
    // pixel_y is ready iff pixel_y < next_fetch. Resets on vsync.
    //
    // next_fetch crosses axi->video GRAY-CODED. A binary increment can flip several
    // bits at once (e.g. 3 -> 4 is 011 -> 100); sampled with bit skew it could read
    // as 0 for one video_clk and latch a FALSE sticky underflow. A Gray increment
    // flips exactly one bit, so the synchronized value is always either the old or
    // the new count. The vsync restart (any value -> 0) is the one multi-bit step;
    // underflow_v is itself cleared on the video-side vsync edge.
    logic [ROW_W-1:0] nf_gray_s0, nf_gray_v;
    wire  [ROW_W-1:0] nf_v = gray2bin(nf_gray_v);
    logic             underflow_v;

    always_ff @(posedge video_clk) begin
        nf_gray_s0 <= nf_gray_e;     // 2-FF sync of the Gray-coded fetch count
        nf_gray_v  <= nf_gray_s0;
        if (!enable || fs_edge_v)
            underflow_v <= 1'b0;
        else if (in_window && (pixel_y < N_ROWS) && (ROW_W'(pixel_y) >= nf_v))
            underflow_v <= 1'b1;
    end

    assign underflow = underflow_v;
endmodule