ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
214 lines
11 KiB
Systemverilog
214 lines
11 KiB
Systemverilog
// ============================================================================
// gs_lpddr_scanout_lb.sv (Ch321 Brick 2)
//
// LINE-BUFFER LPDDR4B scanout — the architectural successor to the whole-frame
// cache (gs_lpddr_scanout). Instead of mirroring the entire framebuffer in
// on-chip RAM (which defeats the point of putting the FB in LPDDR), this holds
// just TWO scanlines: it displays row L from one buffer while prefetching row
// L+1 into the other. On-chip cost is O(width), not O(width*height).
//
// NARROW SCOPE (Ch321): the 128x128 PSMCT16 demo. The frame is LINEAR (the GS
// writer mirrors the rasterizer's linear flush addresses), display window at
// origin, 1:1 (MAG off) — so the reader serves pixel (col=pixel_x, line=pixel_y)
// directly when inside the window. No general MAG/window handling beyond that.
//
// Two clock domains:
// axi_clk (emif_clk) — AXI4 single-beat reads (ROW_BEATS per row) fill a buffer
// video_clk (design) — pixel_x/pixel_y index the active line buffer -> r/g/b
//
// Prefetch: there is no request toggle. The video side exposes the current
// in-window display row; the axi side free-runs, fetching rows sequentially and
// staying at most one row ahead of it, restarting from row 0 on a frame_start
// edge. Row L is loaded into buffer L&1, so row L+1 fills the OTHER buffer while
// row L is displayed. `underflow` flags any pixel read before its row finished
// loading.
// ============================================================================
`timescale 1ns/1ps

// Two-scanline LPDDR scanout reader.
//
// axi_clk side:   a small FSM issues ROW_BEATS single-beat AXI reads per row and
//                 stores them in line buffer (row & 1), staying at most one row
//                 ahead of the row the video side reports.
// video_clk side: pixel_x / pixel_y select buffer, beat and lane; the decoded
//                 pixel appears on r/g/b one video_clk later (black when outside
//                 the window).
module gs_lpddr_scanout_lb #(
    parameter [29:0] FB_BASE      = 30'd0,
    parameter int    STRIDE_BYTES = 256,   // PSMCT16 128px*2B=256; PSMCT32 128px*4B=512
    parameter int    ROW_BEATS    = 8,     // STRIDE_BYTES / 32 (PSMCT16 128px=8; PSMCT32 128px=16)
    parameter int    N_ROWS       = 128,
    // Ch327a — PSMCT32 (ABGR8888, 8 px/256-bit beat) vs the original PSMCT16 (RGBA5551,
    // 16 px/beat). The Ch326 LPDDR-only spill framebuffer is PSMCT32 @ COLOR_SPILL_BASE, so the
    // line-buffer must decode it — NOT a config flip of the Ch321 PSMCT16/FB-at-0 path.
    parameter bit    PSMCT32      = 1'b0
)(
    // ---- AXI read clock domain (emif_clk) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,
    input  logic         enable,        // 1 = active (prefetch + serve)
                                        // NOTE(review): also sampled directly on video_clk
                                        // below — presumably quasi-static; confirm.

    // ---- video clock domain (design_clk) ----
    input  logic         video_clk,
    input  logic         frame_start,   // vsync pulse/level (synced internally)
    input  logic [11:0]  pixel_x,       // raster column (display)
    input  logic [11:0]  pixel_y,       // raster line   (display)
    input  logic         in_window,     // PCRTC displayed-frame window gate
    output logic [7:0]   r,
    output logic [7:0]   g,
    output logic [7:0]   b,

    // ---- status ----
    // line_valid and rd_errs are registered on axi_clk; underflow is registered on
    // video_clk. (The original note says the bridge synchronizes these.)
    output logic         line_valid,    // at least one row has been loaded
    output logic         underflow,     // a pixel was read before its row was ready (sticky)
    output logic [31:0]  rd_errs,       // non-OKAY read responses (cumulative)

    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,         // unused: every read is a single beat
    input  logic         rvalid,
    output logic         rready
);

    localparam int RB_BITS = $clog2(ROW_BEATS);      // 3 for 8
    localparam int ROW_W   = $clog2(N_ROWS) + 1;     // row counters span 0..N_ROWS inclusive

    assign arburst = 2'b01;   // INCR
    assign arid    = 7'd3;    // distinct: writer=0, probe=1, frame-cache=2, line-buf=3
    assign arlen   = 8'd0;    // SINGLE-BEAT per read — the only AXI read pattern proven on this
                              // EMIF (writer/probe/frame-cache all use arlen=0). A multi-beat
                              // burst (arlen=ROW_BEATS-1) was untested and garbled on hardware.
    assign arsize  = 3'b101;  // 32 bytes

    // Two line buffers, ROW_BEATS x 256-bit each (one display row).
    logic [255:0] lb0 [0:ROW_BEATS-1];
    logic [255:0] lb1 [0:ROW_BEATS-1];

    // ================= video side (video_clk) =================
    // No miss-prone request toggle. The video side just exposes the current
    // in-window display row; the axi side free-runs, fetching rows sequentially
    // and staying one row ahead (see below). disp_row_v resets on vsync.
    logic [ROW_W-1:0] disp_row_v;
    logic [2:0]       fs_sync_v;
    // Any change of the delayed frame_start (rising or falling) counts as an edge.
    wire fs_edge_v = (fs_sync_v[2] != fs_sync_v[1]);

    // The buffer holding display line L is L&1 (row L is fetched into L&1). Select
    // it DIRECTLY from pixel_y[0] (tracks the current pixel) — a separately-registered
    // "disp_buf" lags by one cycle and corrupts col 0 of each line.
    wire disp_buf = pixel_y[0];

    always_ff @(posedge video_clk) begin
        if (!enable) begin
            disp_row_v <= '0;
            fs_sync_v  <= 3'd0;
        end else begin
            fs_sync_v <= {fs_sync_v[1:0], frame_start};
            if (fs_edge_v)
                disp_row_v <= '0;
            else if (in_window && (pixel_y < N_ROWS))
                disp_row_v <= ROW_W'(pixel_y);
        end
    end

    // Registered (sync-read) pixel: pick buffer + beat + within-beat lane from pixel_x.
    //   PSMCT32: 8 px/256-bit beat  -> beat = pixel_x>>3, lane = pixel_x[2:0] (32-bit).
    //   PSMCT16: 16 px/beat         -> beat = pixel_x>>4, lane = pixel_x[3:0] (16-bit).
    localparam int PXSH       = PSMCT32 ? 3 : 4;   // px-per-beat shift
    localparam int PX_PER_ROW = PSMCT32 ? (STRIDE_BYTES/4) : (STRIDE_BYTES/2);

    wire [RB_BITS-1:0] col_beat = pixel_x[RB_BITS+PXSH-1 -: RB_BITS];
    wire [3:0]         col_lane = PSMCT32 ? {1'b0, pixel_x[2:0]} : pixel_x[3:0];

    logic [255:0] word_q;   // beat read from the active line buffer
    logic [3:0]   lane_q;   // lane within word_q, aligned with word_q
    logic         in_q;     // pixel is inside window and frame bounds, aligned with word_q

    always_ff @(posedge video_clk) begin
        word_q <= disp_buf ? lb1[col_beat] : lb0[col_beat];
        lane_q <= col_lane;
        in_q   <= in_window && (pixel_x < PX_PER_ROW) && (pixel_y < N_ROWS);
    end

    // PSMCT32 ABGR8888 (r=[7:0],g=[15:8],b=[23:16]) — matches gs_lpddr_scanout (frame-cache).
    wire [31:0] px32 = word_q[lane_q[2:0]*32 +: 32];   // 3-bit lane: always in-range (0..224)
    wire [7:0]  r32 = px32[7:0], g32 = px32[15:8], b32 = px32[23:16];

    // PSMCT16 RGBA5551 5-bit lanes expanded to 8-bit (top bits replicated into the low 3).
    wire [15:0] px16 = word_q[lane_q*16 +: 16];
    wire [4:0]  r5 = px16[4:0], g5 = px16[9:5], b5 = px16[14:10];

    // Outside the window the output is forced to black.
    assign r = !in_q ? 8'd0 : (PSMCT32 ? r32 : {r5, r5[4:2]});
    assign g = !in_q ? 8'd0 : (PSMCT32 ? g32 : {g5, g5[4:2]});
    assign b = !in_q ? 8'd0 : (PSMCT32 ? b32 : {b5, b5[4:2]});

    // ================= axi side (axi_clk) — row fill FSM =================
    // free-running prefetcher: fetch rows sequentially, staying <= disp_row+1 ahead.
    // disp_row crosses video->axi (slowly-changing; the +1 throttle tolerates a 1-off
    // transient). frame_start is edge-detected here to reset next_fetch every frame.
    // NOTE(review): disp_row still crosses as plain binary, so a carry can be sampled
    // as a mix of old and new bits for a cycle — confirm the throttle tolerates that,
    // or Gray-code it like next_fetch below.
    logic [2:0] fs_sync_e;
    wire fs_edge_e = (fs_sync_e[2] != fs_sync_e[1]);

    logic [ROW_W-1:0] disp_row_s0, disp_row_e;
    logic [ROW_W-1:0] next_fetch;    // next row to load (0..N_ROWS)
    logic [ROW_W-1:0] nf_gray_e;     // Gray-coded, registered copy of next_fetch for the
                                     // axi->video crossing (one bit changes per increment)

    typedef enum logic [1:0] { L_IDLE, L_AR, L_R } lstate_t;
    lstate_t lst;

    logic [ROW_W-1:0] cur_row;
    logic             cur_buf;
    logic [RB_BITS:0] beat;
    logic             fs_pending;    // a vsync restart is pending; applied in L_IDLE (never mid-read)

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            fs_sync_e   <= 3'd0;
            disp_row_s0 <= '0;
            disp_row_e  <= '0;
            next_fetch  <= '0;
            nf_gray_e   <= '0;
            lst         <= L_IDLE;
            araddr      <= '0;
            arvalid     <= 1'b0;
            rready      <= 1'b0;
            cur_row     <= '0;
            cur_buf     <= 1'b0;
            beat        <= '0;
            line_valid  <= 1'b0;
            rd_errs     <= 32'd0;
            fs_pending  <= 1'b0;
        end else begin
            fs_sync_e   <= {fs_sync_e[1:0], frame_start};
            disp_row_s0 <= disp_row_v;            // 2-FF sync of the display row
            disp_row_e  <= disp_row_s0;

            // Binary -> Gray, registered here so the video domain never samples
            // combinational glitches. Lags next_fetch by one axi_clk.
            nf_gray_e   <= next_fetch ^ (next_fetch >> 1);

            // vsync: mark a prefetch restart. DEFER it to L_IDLE so an in-flight AXI
            // read is never aborted mid-handshake (which would deadlock the slave).
            if (fs_edge_e) fs_pending <= 1'b1;

            case (lst)
                L_IDLE: begin
                    if (fs_pending) begin
                        next_fetch <= '0;         // restart prefetch sequence from row 0
                        fs_pending <= 1'b0;
                    end else if (enable && (next_fetch < N_ROWS)
                                        && (next_fetch <= disp_row_e + 1'b1)) begin
                        cur_row <= next_fetch;
                        cur_buf <= next_fetch[0];
                        araddr  <= FB_BASE + (next_fetch * STRIDE_BYTES);
                        beat    <= '0;
                        arvalid <= 1'b1;
                        lst     <= L_AR;
                    end
                end

                L_AR: begin
                    // Hold arvalid until the slave accepts the address.
                    if (arready) begin
                        arvalid <= 1'b0;
                        rready  <= 1'b1;
                        lst     <= L_R;
                    end
                end

                L_R: begin
                    if (rvalid) begin
                        if (cur_buf) lb1[beat[RB_BITS-1:0]] <= rdata;
                        else         lb0[beat[RB_BITS-1:0]] <= rdata;
                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;
                        rready <= 1'b0;
                        if (beat == ROW_BEATS-1) begin
                            line_valid <= 1'b1;
                            next_fetch <= next_fetch + 1'b1;   // advance prefetch (rows 0..next_fetch-1 loaded)
                            lst        <= L_IDLE;
                        end else begin
                            // next single-beat read of this row (arlen=0 each).
                            beat    <= beat + 1'b1;
                            araddr  <= araddr + 30'd32;
                            arvalid <= 1'b1;
                            lst     <= L_AR;
                        end
                    end
                end

                default: lst <= L_IDLE;
            endcase
        end
    end

    // underflow (sticky, video domain): an in-window pixel for line pixel_y is read
    // before that row was prefetched. The axi side loads rows 0..next_fetch-1, so row
    // pixel_y is ready iff pixel_y < next_fetch. Resets on vsync.
    //
    // next_fetch crosses axi->video GRAY-CODED. A binary counter sampled during a
    // carry (e.g. 0111 -> 1000) can be seen as any mix of old and new bits, including
    // 0, which would latch a false sticky underflow. With Gray code each increment
    // flips one bit, so the synchronized value is always the old or the new count.
    // NOTE(review): the restart to 0 is not a single-bit change; presumably it lands
    // in vertical blanking while in_window is low — confirm.
    function automatic logic [ROW_W-1:0] gray2bin(input logic [ROW_W-1:0] gcode);
        logic [ROW_W-1:0] bin;
        bin[ROW_W-1] = gcode[ROW_W-1];
        for (int i = ROW_W-2; i >= 0; i--)
            bin[i] = bin[i+1] ^ gcode[i];
        return bin;
    endfunction

    logic [ROW_W-1:0] nf_s0, nf_s1;   // 2-FF synchronizer (Gray domain)
    logic [ROW_W-1:0] nf_v;           // decoded binary next_fetch as seen by video_clk
    logic             underflow_v;

    assign nf_v = gray2bin(nf_s1);

    always_ff @(posedge video_clk) begin
        nf_s0 <= nf_gray_e;
        nf_s1 <= nf_s0;
        if (!enable || fs_edge_v)
            underflow_v <= 1'b0;
        else if (in_window && (pixel_y < N_ROWS) && (ROW_W'(pixel_y) >= nf_v))
            underflow_v <= 1'b1;
    end

    assign underflow = underflow_v;

endmodule