ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
174 lines
7.8 KiB
Systemverilog
174 lines
7.8 KiB
Systemverilog
// ============================================================================
// gs_lpddr_scanout.sv (Ch320 Brick 1)
//
// LPDDR4B-backed scanout for the SMALL framebuffer demo (explicitly scoped to
// a tiny frame — default 64x64 PSMCT16 = 8 KiB). Instead of ao486-style line
// buffering/streaming, it copies the WHOLE framebuffer from LPDDR4B into a
// small on-chip cache once per frame, then serves scanout pixels from the
// cache. When the framebuffer grows past "tiny demo", revisit ao486's
// vga_fb_ddr line-buffer approach.
//
// Pixel mapping is automatic: the GS writer mirrors BRAM VRAM byte-for-byte
// into LPDDR4B, so cache[addr] == BRAM_VRAM[addr]. We index the cache by the
// PCRTC's own `vram_read_addr`, so the decoded pixel is identical to the BRAM
// scanout pixel for the same raster position — the video-source mux is seamless
// regardless of swizzle/MAG.
//
// Two clock domains:
// axi_clk (emif_clk, ~310 MHz) — fill the cache from LPDDR4B over AXI4
// video_clk (design_clk) — index the cache by vram_read_addr -> r/g/b
//
// Fill happens on frame_start (vsync) and completes during vblank (256 single
// beats ~ 1 us); scanout reads the stable cache during the active region. The
// read channel is shared with the Ch319 read-probe via an external arbiter.
// ============================================================================
`timescale 1ns/1ps
|
|
|
|
// gs_lpddr_scanout
//
// Whole-frame cache between an AXI4 read port (axi_clk) and a pixel scanout
// lookup (video_clk).
//
//   * Fill side (axi_clk): on an edge of the synchronised `frame_start`, while
//     `enable` is high, issue CACHE_BEATS single-beat 32-byte reads starting at
//     FB_BASE and store each 256-bit beat in `cache`. `cache_valid` is cleared
//     when a fill starts and set when the last beat has been written.
//   * Scanout side (video_clk): `vram_read_addr - VRAM_BASE` selects a cache
//     word and a pixel inside it; the decoded r/g/b appear one video_clk later.
//     Outside the cached span, or while the cache is not valid, r/g/b are 0.
//
// Parameters
//   FB_BASE     byte address the fill reads from.
//   CACHE_BEATS number of 32-byte beats cached (must be >= 2 so that
//               $clog2(CACHE_BEATS) >= 1; the index part-selects need it).
//   VRAM_BASE   origin of `vram_read_addr` (defaults to FB_BASE).
//   PSMCT32     0 = 16-bit pixels (16 per beat), 1 = 32-bit pixels (8 per beat).
//
// Notes
//   * `rlast` is not used: every request is a single beat (arlen = 0).
//   * A non-OKAY `rresp` only increments `rd_errs`; the beat is still cached.
//   * `fs_edge` fires on BOTH edges of `frame_start`; edges that arrive while a
//     fill is in progress are ignored because they are only sampled in F_IDLE.
module gs_lpddr_scanout #(
    parameter [29:0] FB_BASE     = 30'd0,   // LPDDR byte base to FETCH the framebuffer from
    parameter int    CACHE_BEATS = 256,     // 256 * 32 B = 8 KiB = 64x64 PSMCT16
    // Ch324 — the LPDDR fetch base (FB_BASE) and the PCRTC vram_read_addr base can DIFFER:
    // the spill framebuffer lives at COLOR_SPILL_BASE in LPDDR but the PCRTC addresses it
    // BRAM-relative (0-based). VRAM_BASE is the vram_read_addr origin (defaults to FB_BASE
    // for the Ch320/321 mirror case where they coincide).
    parameter [29:0] VRAM_BASE   = FB_BASE,
    // Ch324 — pixel format: 0 = PSMCT16 (RGBA5551, 16 px/beat), 1 = PSMCT32 (ABGR, 8 px/beat).
    parameter bit    PSMCT32     = 1'b0
)(
    // ---- AXI read clock domain (emif_clk) ----
    input  logic         axi_clk,
    input  logic         axi_rst_n,      // synchronous, active-low
    input  logic         enable,         // 1 = refill the cache on frame_start
    input  logic         frame_start,    // video-domain pulse/level (vsync); synced internally

    // ---- video clock domain (design_clk) ----
    input  logic         video_clk,
    input  logic [31:0]  vram_read_addr,
    output logic [7:0]   r,
    output logic [7:0]   g,
    output logic [7:0]   b,

    // ---- status (axi_clk domain; the bridge syncs these) ----
    output logic         cache_valid,    // a full frame has been loaded
    output logic [31:0]  rd_beats,       // beats read (cumulative)
    output logic [31:0]  rd_errs,        // non-OKAY read responses (cumulative)

    // ---- AXI4 read channel to the EMIF user port (axi_clk, 256-bit) ----
    output logic [29:0]  araddr,
    output logic [1:0]   arburst,
    output logic [6:0]   arid,
    output logic [7:0]   arlen,
    output logic [2:0]   arsize,
    output logic         arvalid,
    input  logic         arready,
    input  logic [255:0] rdata,
    input  logic [1:0]   rresp,
    input  logic         rlast,
    input  logic         rvalid,
    output logic         rready
);

    localparam int BEAT_BITS = $clog2(CACHE_BEATS);   // 8 for 256
    localparam int FB_SPAN   = CACHE_BEATS * 32;      // 8192 bytes

    // Constant AR attributes: every request is one 32-byte INCR beat.
    assign arburst = 2'b01;    // INCR
    assign arid    = 7'd2;     // distinct from writer (0) and read-probe (1)
    assign arlen   = 8'd0;     // single beat
    assign arsize  = 3'b101;   // 32 bytes

    // Frame cache: one 256-bit word per 32-byte beat.
    // Written in axi_clk, read in video_clk.
    logic [255:0] cache [0:CACHE_BEATS-1];

    // ---------------- fill side (axi_clk) ----------------
    // frame_start comes from the video domain: shift it through three flops and
    // compare the two oldest taps, which yields a one-cycle pulse on either edge.
    logic [2:0] fs_sync;
    wire        fs_edge = (fs_sync[2] != fs_sync[1]);

    typedef enum logic [1:0] { F_IDLE, F_AR, F_R } fstate_t;
    fstate_t fst;
    logic [BEAT_BITS:0] beat;   // 0..CACHE_BEATS (extra bit for the terminal compare)

    always_ff @(posedge axi_clk) begin
        if (!axi_rst_n) begin
            fs_sync     <= 3'd0;
            fst         <= F_IDLE;
            beat        <= '0;
            araddr      <= '0;
            arvalid     <= 1'b0;
            rready      <= 1'b0;
            cache_valid <= 1'b0;
            rd_beats    <= 32'd0;
            rd_errs     <= 32'd0;
        end else begin
            fs_sync <= {fs_sync[1:0], frame_start};

            case (fst)
                // Wait for a frame_start edge; start the fill at beat 0.
                F_IDLE: begin
                    if (fs_edge && enable) begin
                        beat        <= '0;
                        cache_valid <= 1'b0;   // contents are about to be overwritten
                        araddr      <= FB_BASE;
                        arvalid     <= 1'b1;
                        fst         <= F_AR;
                    end
                end

                // Hold arvalid until the address is accepted, then open the R channel.
                F_AR: begin
                    if (arready) begin
                        arvalid <= 1'b0;
                        rready  <= 1'b1;
                        fst     <= F_R;
                    end
                end

                // Take the single data beat, then either finish or request the next one.
                F_R: begin
                    if (rvalid) begin
                        cache[beat[BEAT_BITS-1:0]] <= rdata;
                        rready   <= 1'b0;
                        rd_beats <= rd_beats + 32'd1;
                        if (rresp != 2'b00) rd_errs <= rd_errs + 32'd1;

                        if (beat == CACHE_BEATS-1) begin
                            cache_valid <= 1'b1;
                            fst         <= F_IDLE;
                        end else begin
                            beat    <= beat + 1'b1;
                            // Next beat's byte address: FB_BASE + (beat+1)*32. The zero
                            // padding widens (beat+1) to 30 bits before the shift.
                            araddr  <= FB_BASE + (({{(30-BEAT_BITS-1){1'b0}}, (beat + 1'b1)}) << 5);
                            arvalid <= 1'b1;
                            fst     <= F_AR;
                        end
                    end
                end

                default: fst <= F_IDLE;
            endcase
        end
    end

    // ---------------- scanout side (video_clk) ----------------
    // cache_valid is a flop in the axi_clk domain. Bring it into video_clk through
    // a two-flop synchroniser BEFORE it is combined with any video-domain logic;
    // using it directly would put combinational logic on an asynchronous crossing
    // in front of a single capturing flop. The only functional effect is that the
    // video side observes cache_valid changes two video_clk cycles later.
    logic [1:0] cv_sync;
    always_ff @(posedge video_clk) begin
        cv_sync <= {cv_sync[0], cache_valid};
    end
    wire cache_valid_v = cv_sync[1];

    // Byte offset of the requested pixel within the framebuffer (vram_read_addr is BRAM-relative
    // = VRAM_BASE-origin; the cache holds the SAME bytes fetched from FB_BASE in LPDDR).
    wire [31:0]          off     = vram_read_addr - {2'b00, VRAM_BASE};
    wire [BEAT_BITS-1:0] beat_ix = off[BEAT_BITS+4 -: BEAT_BITS];   // off>>5, low BEAT_BITS
    wire [3:0]           hw_sel  = off[4:1];                        // PSMCT16: 16 halfwords / beat
    wire [2:0]           w_sel   = off[4:2];                        // PSMCT32: 8 words / beat

    // The >= test rejects addresses below VRAM_BASE, where `off` would wrap.
    wire in_range = cache_valid_v
                 && (vram_read_addr >= {2'b00, VRAM_BASE})
                 && (off < FB_SPAN);

    // Registered (sync-read) cache lookup — 1-cycle latency to match the PCRTC's
    // VRAM_SYNC_READ pixel timing so the muxed output aligns with PCRTC de/sync.
    // Split the array-index and the part-select across the register boundary
    // (chained index+part-select in one expr trips iverilog-12).
    logic [255:0] word_q;
    logic [3:0]   hw_q;
    logic [2:0]   w_q;
    logic         in_range_q;

    always_ff @(posedge video_clk) begin
        word_q     <= cache[beat_ix];
        hw_q       <= hw_sel;
        w_q        <= w_sel;
        in_range_q <= in_range;
    end

    wire [15:0] px16_q = word_q[hw_q*16 +: 16];
    wire [31:0] px32_q = word_q[w_q*32 +: 32];

    // PSMCT16 (RGBA5551): R[4:0] G[9:5] B[14:10], 5->8 by bit-replication ({c5,c5[4:2]}).
    wire [4:0] r5  = px16_q[4:0];
    wire [4:0] g5  = px16_q[9:5];
    wire [4:0] b5  = px16_q[14:10];
    wire [7:0] r16 = {r5, r5[4:2]};
    wire [7:0] g16 = {g5, g5[4:2]};
    wire [7:0] b16 = {b5, b5[4:2]};

    // PSMCT32 (ABGR8888): R[7:0] G[15:8] B[23:16] (A discarded) — identical decode to gs_pcrtc.
    wire [7:0] r32 = px32_q[7:0];
    wire [7:0] g32 = px32_q[15:8];
    wire [7:0] b32 = px32_q[23:16];

    // Black when the address is outside the cached span or the cache is not valid.
    assign r = !in_range_q ? 8'd0 : (PSMCT32 ? r32 : r16);
    assign g = !in_range_q ? 8'd0 : (PSMCT32 ? g32 : g16);
    assign b = !in_range_q ? 8'd0 : (PSMCT32 ? b32 : b16);

endmodule
|