Files
retroDE_ps2/tools/gs_make_sh3_real_draw_fixture.py
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

383 lines
24 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""retroDE_ps2 — Ch350 fixtures: the EXACT Ch349 SH3 draw for the full-res LPDDR integration TB.
Codex scope (no crop/downscale/repack): the actual draw geometry (f1 idx89761 TRI_STRIP -> triangle LIST),
the reconstructed 512x512 PSMT8 texture (in LPDDR), the real CSM1 CLUT (grid/CT32 order in BRAM), perspective
ST/Q, DECAL. Pixel-diff vs the Ch349 host reference with the Ch348 bounded <=1-texel acceptance.
Emits (LOCAL/gitignored -> sim/data/top_psmct32_raster_demo/):
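sh3_real_tex_lpddr.mem 65536 LE words = the LINEAR (de-swizzled) 512x512 PSMT8 indices packed 4/word
(loaded into the behavioral LPDDR model; read with PSMT8_SWIZZLE=0).
sh3_real_tex_lpddr_swz.mem 65536 LE words = the raw PSMT8-SWIZZLED texture bytes mem[TBP*256 : +262144]
(only for the PSMT8_SWIZZLE=1 variant).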
sh3_real_idx.mem 65536 words = the DE-SWIZZLED 512x512 indices packed 4/word (TB reference search).
sh3_real_clut.mem 256 words = the CT32-grid CLUT bytes mem[CBP*256 : +1024] (backdoored into BRAM;
the Ch350 CSM1 clut_loader reads them in grid order).
feeder_sh3_real.mem feeder staging: 68 tris (translated to FB origin) + TEX0(PSMT8,CSM1,CLD=1,DECAL).
sh3_real_refmap.mem per FB pixel (FBW*64 x FBH): covered|interior|deep|tu|tv for the bounded TB check.
sh3_real_ref.png eyeball reference (host DECAL render at FB scale); written to
captures/gs/silenthill3/extracted/recon/, not to the sim data directory.
Geometry/addressing constants are printed + emitted as `sh3_real_params.vh` for the TB to include.
Usage: gs_make_sh3_real_draw_fixture.py [dump.gs.zst] [--draw-idx 89761]
"""
import sys, os, glob
HERE=os.path.dirname(os.path.abspath(__file__)); ROOT=os.path.normpath(os.path.join(HERE,".."))
DATA=os.path.join(ROOT,"sim","data","top_psmct32_raster_demo")
sys.path.insert(0, HERE)
import gs_sh3_draw_census as C
import gs_sh3_recon as RC
import gs_localmem as LM
sys.path.insert(0, DATA); import bake
# ---- address map (Ch350 BRAM-CROP diagnostic: read2 needs VRAM < 256 KiB; FB cropped, texture/CLUT/geometry
# FULL-RES). VRAM_BYTES = 0x20000 (128 KiB, 2^15 words) so the read2 tripwire (>=256 KiB) is NOT tripped.
# Only the FRAMEBUFFER/VIEWPORT is cropped to a deterministic CH-tall band; the texture stays full 512x512
# (in LPDDR, cache-intercepted base 0x40000), the CSM1 CLUT is full, the geometry/ST/Q are unchanged. ----
# Derived values are computed from their inputs (instead of repeating the literal) so that changing FBW/CH or
# the texture size cannot silently leave CBP/TEX_BYTES stale.
FBW = 4                          # 256 px wide FB (64-px units) — full draw width (247) fits
FBPXW = FBW * 64                 # 256
CH = 120                         # FB rows: 256*120*4 = 122880 B; + CLUT(1 KiB) < 128 KiB
VRAM_BYTES = 0x20000             # 128 KiB BRAM VRAM (2^15 words) — under the 256 KiB read2 tripwire
_FB_BYTES = FBPXW * CH * 4       # PSMCT32 framebuffer footprint (4 bytes/pixel) = 0x1E000
_CLUT_BYTES = 256 * 4            # 256-entry CT32 CLUT = 1 KiB
CBP = _FB_BYTES // 256           # 480 (CLUT right after the 256x120 FB = 0x1E000)
NEW_TBP = 0x40000 // 256         # 1024 (texture VRAM base; cache-intercepted, NOT in the 128 KiB BRAM)
TEX_VRAM_BASE = NEW_TBP * 256    # 0x40000
TW_LOG, TH_LOG = 9, 9            # 512x512
TEX_BYTES = (1 << TW_LOG) * (1 << TH_LOG)  # 262144 (PSMT8, 1 byte/texel) — FULL texture, no downscale
LPDDR_TEX_BASE = 0x00200000      # texture byte base in LPDDR4B
TBW_TEX = 8                      # texture TBW (64-px units) — MUST match the original draw's swizzle
STG_WORDS = 768
# CBP is a 256-byte-unit address, so the FB must end on a 256-byte boundary, and FB + CLUT must fit the BRAM.
if _FB_BYTES % 256 or _FB_BYTES + _CLUT_BYTES > VRAM_BYTES:
    raise ValueError(f"FB ({_FB_BYTES} B) + CLUT ({_CLUT_BYTES} B) does not fit/align in VRAM_BYTES={VRAM_BYTES}")
def recip8(q, IDX_BITS=8, SCALE=24, Q_W=24):
    """Python model of gs_reciprocal_stub: a mantissa-truncated fixed-point reciprocal of ``q``.

    ``q`` is normalized so that its most significant set bit sits at bit ``IDX_BITS-1``; only those
    ``IDX_BITS`` bits (the mantissa M) take part in the division. The result is
    ``floor(2^(SCALE+IDX_BITS-1) / M) >> e`` with ``e`` the MSB index of ``q``, saturated to
    ``2^(SCALE+1) - 1``. Non-positive ``q`` (and a zero mantissa) return the saturated value.
    It feeds the RTL-faithful reference, which separates reciprocal quantization from S1 attribute
    under-interpolation.

    ``q`` must be an int (``bit_length`` is used). ``Q_W`` is accepted for interface parity but is not
    read by this model.
    """
    saturated = (2 << SCALE) - 1
    if q <= 0:
        return saturated

    msb = IDX_BITS - 1
    exponent = q.bit_length() - 1
    # Align the leading one of q onto bit `msb`: drop low bits of wide values, pad narrow ones.
    shift = exponent - msb
    if shift >= 0:
        aligned = q >> shift
    else:
        aligned = q << -shift

    mantissa = aligned % (1 << IDX_BITS)
    if not mantissa:
        return saturated

    quotient = (1 << (SCALE + msb)) // mantissa
    scaled = quotient >> exponent
    return scaled if scaled < saturated else saturated
def persp_texel_recip(uq, vq, q, tw, th, idx_bits=8, SCALE=24):
    """Model of gs_persp_uv using the ``idx_bits`` reciprocal LUT.

    Rounds ``uq``, ``vq`` and ``q`` to integers, multiplies each numerator by ``recip8(q)``, shifts right
    by ``SCALE``, clamps the result to at most 2047, then REPEAT-wraps it into the texture
    (``% tw`` / ``% th``). Returns the ``(u, v)`` texel pair.
    """
    inv_q = recip8(int(round(q)), IDX_BITS=idx_bits, SCALE=SCALE)
    wrapped = []
    for numerator, extent in ((uq, tw), (vq, th)):
        texel = (int(round(numerator)) * inv_q) >> SCALE
        texel = min(texel, 2047)  # upper clamp only, as in the original
        wrapped.append(texel % extent)
    return wrapped[0], wrapped[1]
def f32_to(v):
    """Placeholder conversion: hands ``v`` back unchanged."""
    unchanged = v
    return unchanged
def tex0_real(tbp, cbp):
    """Pack the TEX0 word for the real SH3 draw.

    The texture half comes from ``bake.tex0_pack``: PSMT8 (psm=0x13), TBW_TEX wide, 2^TW_LOG x 2^TH_LOG,
    TFX=DECAL(1). The CLUT half is OR-ed in here: CBP at bit 37, CPSM=PSMCT32(0) at bit 51,
    CSM=0 (CSM1 16x16 CT32 grid — the Ch350 path) at bit 55, CSA=0 at bit 56 and CLD=1 (always load)
    at bit 61.
    """
    word = bake.tex0_pack(tbp, TBW_TEX, psm=0x13, tw=TW_LOG, th=TH_LOG, tfx=1)
    clut_fields = (
        # (value, field mask, bit position)
        (cbp, 0x3FFF, 37),  # CBP
        (0, 0xF, 51),       # CPSM = PSMCT32
        (0, 0x1, 55),       # CSM = 0 -> CSM1
        (0, 0x1F, 56),      # CSA = 0
        (1, 0x7, 61),       # CLD = 1 -> always load
    )
    for value, mask, shift in clut_fields:
        word |= (value & mask) << shift
    return word
def main(argv):
    """Generate every SH3 real-draw fixture for the full-res LPDDR integration TB.

    ``argv`` is ``sys.argv``-shaped: an optional leading dump path (``.gs.zst``) and an optional
    ``--draw-idx N`` (default 89761). When no dump is given, the first (sorted) match of
    ``captures/gs/silenthill3/*224139*.gs.zst`` under ROOT is used.

    Steps: fetch the draw, reconstruct GS local memory at draw time, translate the strip to the draw
    origin, pick the CH-tall band with the most covered pixels, clip every triangle to that band, build
    the feeder staging, rasterize three host references (float perspective, reciprocal model, affine),
    build the CLUT-upload payload/bootlet, and write all ``.mem``/``.vh``/``.png`` outputs.

    Exits via ``sys.exit`` with a message on bad arguments, a missing dump/draw/VRAM snapshot, an
    unexpected TEX0, a fixed-point field overflow, or a staging buffer overflow.
    """
    # ---- arguments ----
    dump = None
    draw_idx = 89761
    args = argv[1:]
    if args and not args[0].startswith("--"):
        dump = args[0]
    if "--draw-idx" in args:
        try:
            draw_idx = int(args[args.index("--draw-idx") + 1])
        except (IndexError, ValueError):
            # A missing or non-numeric value used to surface as a bare traceback.
            sys.exit("--draw-idx requires an integer draw index")
    if dump is None:
        # sorted(): glob order is arbitrary, so pick the default dump deterministically.
        c = sorted(glob.glob(os.path.join(ROOT, "captures", "gs", "silenthill3", "*224139*.gs.zst")))
        if not c:
            sys.exit("no SH3 dump found; pass the .gs.zst path")
        dump = c[0]

    PERSP_FRAC = bake.PERSP_FRAC
    dr = C.get_draw(dump, draw_idx)
    if dr is None:
        sys.exit(f"draw idx {draw_idx} not found")
    t0 = dr["tex0"]
    ORIG_TBP = t0["tbp"]
    ORIG_CBP = t0["cbp"]
    TW, TH = t0["tw"], t0["th"]
    # Explicit check rather than `assert`, so it still runs under `python -O`.
    if not (TW == 512 and TH == 512 and t0["psm"] == 0x13):
        sys.exit(f"unexpected TEX0 {t0}")

    # --- reconstruct GS local memory at draw time (Ch349) ---
    mem, _replayed, _uploads, _events, _vram = RC.build_localmem_to(dump, draw_idx)
    if mem is None:
        sys.exit("VRAM snapshot absent")
    # de-swizzled 512x512 index image (for TB reference) + swizzled bytes (for LPDDR)
    idx = mem.read_psmt8(ORIG_TBP, t0["tbw"], TW, TH)                       # de-swizzled indices
    tex_swz = bytes(mem.m[ORIG_TBP*256 : ORIG_TBP*256 + TEX_BYTES])         # swizzled bytes
    clut_bytes = bytes(mem.m[ORIG_CBP*256 : ORIG_CBP*256 + 1024])           # CT32-grid CLUT bytes -> BRAM
    pal = RC.read_clut32(mem, ORIG_CBP, order="grid")                       # for the reference PNG
    # The same 256 little-endian CLUT words feed both the upload payload and sh3_real_clut.mem.
    clut_words = [int.from_bytes(clut_bytes[i*4:i*4+4], "little") for i in range(256)]
    recon_dir = os.path.join(ROOT, "captures", "gs", "silenthill3", "extracted", "recon")

    # --- geometry: translate to draw origin (full frame), then choose a deterministic CH-tall VIEWPORT crop ---
    xmin = min(v["x"] for v in dr["verts"])
    ymin = min(v["y"] for v in dr["verts"])
    OX, OY = int(xmin), int(ymin)
    fverts = [dict(x=v["x"]-OX, y=v["y"]-OY, s=v["s"], t=v["t"], q=v["q"]) for v in dr["verts"]]
    # TRI_STRIP -> triangle LIST: every vertex from the third on closes a triangle with its two predecessors.
    ftris = [(i-2, i-1, i) for i in range(2, len(fverts))]
    full_h = int(max(v["y"] for v in fverts)) + 1

    def edge(ax, ay, bx, by, px, py):
        """Signed edge function of point P against the directed edge A->B (2x the triangle area)."""
        return (px-ax)*(by-ay) - (py-ay)*(bx-ax)

    # per-row coverage histogram over the FULL frame -> pick CY0 = argmax covered pixels in a CH-tall band.
    row_cov = [0]*(full_h+CH+2)
    for (a0, b0, c0) in ftris:
        v0, v1, v2 = fverts[a0], fverts[b0], fverts[c0]
        x0, y0 = v0["x"], v0["y"]
        x1, y1 = v1["x"], v1["y"]
        x2, y2 = v2["x"], v2["y"]
        ar = edge(x0, y0, x1, y1, x2, y2)
        if abs(ar) < 1e-9:
            continue  # degenerate triangle
        inv = 1.0/ar
        for py in range(max(0, int(min(y0, y1, y2))), min(full_h-1, int(max(y0, y1, y2))+1)+1):
            for px in range(max(0, int(min(x0, x1, x2))), min(FBPXW-1, int(max(x0, x1, x2))+1)+1):
                cx, cy = px+0.5, py+0.5
                w0 = edge(x1, y1, x2, y2, cx, cy)*inv
                w1 = edge(x2, y2, x0, y0, cx, cy)*inv
                w2 = 1.0-w0-w1
                if w0 >= -0.001 and w1 >= -0.001 and w2 >= -0.001:
                    row_cov[py] += 1
    best_cy0, best_sum = 0, -1
    for cy0 in range(0, max(1, full_h-CH+1)):
        s = sum(row_cov[cy0:cy0+CH])
        if s > best_sum:  # strict '>' keeps the first (topmost) band on ties
            best_sum, best_cy0 = s, cy0
    CY0 = best_cy0
    CX0 = 0

    # apply the viewport crop: shift Y by -CY0 (ST/Q UNCHANGED — only the framebuffer window moves), then
    # CLIP each triangle to the crop rect [0,FBPXW]x[0,CH] (Sutherland-Hodgman, interpolating S/T/Q linearly in
    # screen space — correct since S,T,Q are already premultiplied by 1/w). This is the VIEWPORT scissor done at
    # the host: geometry SHAPE + per-vertex ST/Q are preserved; only the framebuffer window is cropped.
    def lerp(p1, p2, a):
        """Vertex at parameter ``a`` along p1->p2, interpolating x, y, s, t and q linearly."""
        return dict(x=p1["x"]+a*(p2["x"]-p1["x"]), y=p1["y"]+a*(p2["y"]-p1["y"]),
                    s=p1["s"]+a*(p2["s"]-p1["s"]), t=p1["t"]+a*(p2["t"]-p1["t"]),
                    q=p1["q"]+a*(p2["q"]-p1["q"]))

    def clip_edge(poly, inside, isect):
        """One Sutherland-Hodgman pass: clip ``poly`` against a half-plane given by ``inside``/``isect``."""
        out = []
        for i in range(len(poly)):
            cur = poly[i]
            prv = poly[i-1]  # i == 0 wraps to the last vertex, closing the polygon
            ci = inside(cur)
            pi = inside(prv)
            if ci:
                if not pi:
                    out.append(isect(prv, cur))
                out.append(cur)
            elif pi:
                out.append(isect(prv, cur))
        return out

    def clip_rect(poly):
        """Clip ``poly`` to the crop rectangle: left x>=0, right x<=FBPXW, top y>=0, bottom y<=CH."""
        poly = clip_edge(poly, lambda p: p["x"] >= 0.0, lambda a, b: lerp(a, b, (0.0-a["x"])/(b["x"]-a["x"])))
        if not poly:
            return poly
        poly = clip_edge(poly, lambda p: p["x"] <= FBPXW, lambda a, b: lerp(a, b, (FBPXW-a["x"])/(b["x"]-a["x"])))
        if not poly:
            return poly
        poly = clip_edge(poly, lambda p: p["y"] >= 0.0, lambda a, b: lerp(a, b, (0.0-a["y"])/(b["y"]-a["y"])))
        if not poly:
            return poly
        poly = clip_edge(poly, lambda p: p["y"] <= CH, lambda a, b: lerp(a, b, (CH-a["y"])/(b["y"]-a["y"])))
        return poly

    sverts = [dict(x=v["x"]-CX0, y=v["y"]-CY0, s=v["s"], t=v["t"], q=v["q"]) for v in fverts]

    def rnd(v):
        """Round XY to integer screen coords (the feeder gets ints) — the host ref uses the SAME ints."""
        return dict(x=float(int(round(v["x"]))), y=float(int(round(v["y"]))), s=v["s"], t=v["t"], q=v["q"])

    tris = []  # list of (v0,v1,v2) explicit clipped vertex dicts with INTEGER screen XY
    for (a0, b0, c0) in ftris:  # sverts has the same strip topology as fverts
        poly = [rnd(p) for p in clip_rect([sverts[a0], sverts[b0], sverts[c0]])]
        for k in range(1, len(poly)-1):  # fan the clipped polygon into triangles
            tris.append((poly[0], poly[k], poly[k+1]))
    print(f"[Ch350] draw idx{draw_idx}: {len(sverts)} verts; full frame {FBPXW}x{full_h}; DETERMINISTIC crop "
          f"CX0={CX0} CY0={CY0} -> FB {FBPXW}x{CH} ({best_sum} covered px in band); clipped to {len(tris)} tris")

    # --- feeder staging ---
    # NOTE: gs_persp_uv contract is uq=(u/w)*2^FRAC, q=(1/w)*2^FRAC, u=(uq*floor(2^SCALE/q))>>SCALE. Scaling
    # S/T/Q by a common K cancels in the divide. The texel-accuracy limit is the gs_reciprocal_stub 8-bit
    # (256-entry) LUT: ~0.4% relative -> <=1 texel for Ch348's TW=64 but ~2+ texels for this TW=512 texture
    # (plus the S1-path under-interpolation banding). A perspective-precision limit.
    # Ch351 — EFFECTIVE PERSP_FRAC. Because the divide cancels the frac scale, "widening PERSP_FRAC" is a HOST
    # PACKING choice: pack S/T/Q with more frac bits so the far-surface denominator q=(1/w)*2^FRACeff does not
    # round to 1-2. PSCALE=2^k gives FRACeff = PERSP_FRAC + k; this script uses PSCALE=4096 (k=12).
    # The 24-bit signed S/T field bounds FRACeff for a given S/T range — checked in vert_words.
    PSCALE = 4096
    S24_MAX = (1 << 23) - 1
    max_sfp = 0  # largest |s_fp| / |t_fp| seen, reported after staging

    def vert_words(v):
        """Return the three staging words (RGBAQ+Q, ST, XYZ2) for one clipped vertex."""
        nonlocal max_sfp
        s_fp = round(v["s"] * TW * (1 << PERSP_FRAC) * PSCALE)  # s_fp/q_fp = (S/Q)*TW = texel_u (perspective)
        t_fp = round(v["t"] * TH * (1 << PERSP_FRAC) * PSCALE)
        q_fp = round(v["q"] * (1 << PERSP_FRAC) * PSCALE)
        max_sfp = max(max_sfp, abs(s_fp), abs(t_fp))
        if abs(s_fp) > S24_MAX or abs(t_fp) > S24_MAX:  # 24-bit signed ST field overflow guard (Codex #3)
            sys.exit(f"[Ch351] OVERFLOW: |s_fp|={abs(s_fp)} or |t_fp|={abs(t_fp)} > 2^23-1 at PSCALE={PSCALE} "
                     f"(FRACeff={PERSP_FRAC}+{PSCALE.bit_length()-1}). Lower PSCALE for this S/T range.")
        if abs(q_fp) > 0x7FFFFFFF:
            sys.exit(f"[Ch351] OVERFLOW: |q_fp|={abs(q_fp)} > 2^31-1 (Q field). Lower PSCALE.")
        # NOTE(review): clip_rect keeps x==FBPXW / y==CH, which this clamp pulls to FBPXW-1 / CH-1, while the
        # host reference below uses the unclamped rnd() coordinates — confirm that difference is intended.
        sx = max(0, min(FBPXW-1, int(round(v["x"]))))
        sy = max(0, min(CH-1, int(round(v["y"]))))
        return [bake.rgbaq_with_q(0, 0, 0, q_fp & 0xFFFFFFFF),
                bake.st_data(s_fp & 0xFFFFFF, t_fp & 0xFFFFFF),
                bake.xyz2_dataz(sx, sy, 0x0000_5000)]

    stg = [
        len(tris) | (1 << 32),            # word0: ntris | perspective-format flag
        bake.frame_1_psmct32(FBW),
        bake.alpha_pack(0, 1, 0, 1),
        0,                                # TEST_1 = 0 (ZTE=0, ATE=0): no depth/alpha test
        bake.zbuf1_pack(0, zmsk=1),       # ZMSK=1: no Z writes -> no Z buffer needed
        tex0_real(NEW_TBP, CBP),          # PSMT8 + CSM1 CLUT (CLD=1) -> feeder commit fires the load
        3 | (1 << 4),                     # TRI + TME, ABE=0 (S1 perspective path)
    ]
    for (v0, v1, v2) in tris:
        for v in (v0, v1, v2):
            stg += vert_words(v)
    if len(stg) > STG_WORDS:
        sys.exit(f"staging {len(stg)} > {STG_WORDS} (raise STG_WORDS)")
    print(f"[Ch350] feeder staging: {len(stg)} words (<= {STG_WORDS})")
    print(f"[Ch351] effective PERSP_FRAC = {PERSP_FRAC}+{PSCALE.bit_length()-1} = {PERSP_FRAC+PSCALE.bit_length()-1} "
          f"(PSCALE={PSCALE}); max |s_fp/t_fp|={max_sfp} of 2^23-1 ({100.0*max_sfp/((1<<23)-1):.1f}% of the 24-bit ST field)")

    # --- host reference + per-pixel texel map. THREE references over the SAME clipped geometry:
    #   refmap     = FLOAT perspective (ideal) — the Codex pixel-diff oracle.
    #   refmap_rec = reciprocal model: fixed-point vertex attrs (uq=s*TW*2^FRAC, q=Q*2^FRAC), float interp,
    #                then the 8-bit gs_reciprocal_stub. Comparing the RTL FB vs BOTH isolates reciprocal
    #                quantization (RTL≈refmap_rec, refmap_rec≠refmap) from S1 under-interpolation banding.
    #   refmap_aff = AFFINE: per-vertex texel, linear u,v interp (NOT perspective-correct).
    refmap = [0]*(FBPXW*CH)
    refpix = [(0, 0, 0)]*(FBPXW*CH)
    refmap_rec = [0]*(FBPXW*CH)
    refpix_rec = [(0, 0, 0)]*(FBPXW*CH)
    refmap_aff = [0]*(FBPXW*CH)
    F = 1 << PERSP_FRAC
    for (v0, v1, v2) in tris:
        x0, y0 = v0["x"], v0["y"]
        x1, y1 = v1["x"], v1["y"]
        x2, y2 = v2["x"], v2["y"]
        area = edge(x0, y0, x1, y1, x2, y2)
        if abs(area) < 1e-9:
            continue
        inv = 1.0/area
        # per-vertex FIXED-POINT attributes at scale 2^PERSP_FRAC.
        # NOTE(review): the feeder staging above additionally multiplies by PSCALE; these do not — confirm
        # whether the reciprocal reference should model the PSCALE-packed values.
        uqv = [round(v["s"]*TW*F) for v in (v0, v1, v2)]
        vqv = [round(v["t"]*TH*F) for v in (v0, v1, v2)]
        qv = [round(v["q"]*F) for v in (v0, v1, v2)]
        # per-vertex TEXEL (perspective divide at the vertex) for the affine reference
        auv = [((v["s"]/v["q"])*TW if abs(v["q"]) > 1e-12 else 0.0) for v in (v0, v1, v2)]
        avv = [((v["t"]/v["q"])*TH if abs(v["q"]) > 1e-12 else 0.0) for v in (v0, v1, v2)]
        minx = max(0, int(min(x0, x1, x2)))
        maxx = min(FBPXW-1, int(max(x0, x1, x2))+1)
        miny = max(0, int(min(y0, y1, y2)))
        maxy = min(CH-1, int(max(y0, y1, y2))+1)
        for py in range(miny, maxy+1):
            for px in range(minx, maxx+1):
                cx, cy = px+0.5, py+0.5
                w0 = edge(x1, y1, x2, y2, cx, cy)*inv
                w1 = edge(x2, y2, x0, y0, cx, cy)*inv
                w2 = 1.0-w0-w1
                if w0 < -0.001 or w1 < -0.001 or w2 < -0.001:
                    continue
                # Ch351 convention fix: coverage/interior at pixel CENTER (px+0.5), but the RTL interpolates
                # the perspective ATTRIBUTES at the INTEGER pixel coord (tex_dx_s1 = s1_x_q - v0_x, no +0.5).
                # Use a CORNER barycentric for S/T/Q to match -> removes the ~0.5-texel drift.
                a0w = edge(x1, y1, x2, y2, float(px), float(py))*inv
                a1w = edge(x2, y2, x0, y0, float(px), float(py))*inv
                a2w = 1.0-a0w-a1w
                S = a0w*v0["s"]+a1w*v1["s"]+a2w*v2["s"]
                T = a0w*v0["t"]+a1w*v1["t"]+a2w*v2["t"]
                Q = a0w*v0["q"]+a1w*v1["q"]+a2w*v2["q"]
                if abs(Q) < 1e-12:
                    continue
                # Python's % with a positive modulus is already in [0, TW), so no negative fix-up is needed.
                tu = int((S/Q)*TW) % TW
                tv = int((T/Q)*TH) % TH
                # Reciprocal model: interp the FIXED-POINT attrs, then the 8-bit reciprocal.
                # NOTE(review): this uses the pixel-CENTER weights, unlike the corner weights above — confirm.
                uq = w0*uqv[0]+w1*uqv[1]+w2*uqv[2]
                vq = w0*vqv[0]+w1*vqv[1]+w2*vqv[2]
                qq = w0*qv[0]+w1*qv[1]+w2*qv[2]
                tur, tvr = persp_texel_recip(uq, vq, qq, TW, TH, idx_bits=8)
                # AFFINE texel: linear interp of the per-vertex texels (the under-interpolation hypothesis)
                au = int(w0*auv[0]+w1*auv[1]+w2*auv[2]) % TW
                av = int(w0*avv[0]+w1*avv[1]+w2*avv[2]) % TH
                mw = min(w0, w1, w2)
                interior = 1 if mw > 0.04 else 0  # away from this triangle's own edges
                deep = 1 if mw > 0.15 else 0      # DEEP interior — far from any edge => seam-free zone
                o = py*FBPXW+px
                # word layout: bit31 covered | bit30 interior | bit29 deep (refmap only) | tu[17:9] | tv[8:0]
                refmap[o] = (1 << 31) | (interior << 30) | (deep << 29) | ((tu & 0x1FF) << 9) | (tv & 0x1FF)
                refmap_rec[o] = (1 << 31) | (interior << 30) | ((tur & 0x1FF) << 9) | (tvr & 0x1FF)
                refmap_aff[o] = (1 << 31) | (interior << 30) | ((au & 0x1FF) << 9) | (av & 0x1FF)
                p = pal[idx[tv*TW+tu] & 0xFF]
                refpix[o] = (p & 0xFF, (p >> 8) & 0xFF, (p >> 16) & 0xFF)
                pr = pal[idx[tvr*TW+tur] & 0xFF]
                refpix_rec[o] = (pr & 0xFF, (pr >> 8) & 0xFF, (pr >> 16) & 0xFF)
    covered = sum(1 for w in refmap if w >> 31)
    print(f"[Ch350] host reference: {covered} covered FB pixels")

    # emit the reciprocal-model + affine refmaps and the reciprocal PNG for the Ch351 oracle
    with open(os.path.join(DATA, "sh3_real_refmap_recip.mem"), "w", encoding="utf-8") as f:
        f.write("// Ch351 RTL-faithful (8-bit reciprocal) per-pixel texel map. gitignored.\n")
        for x in refmap_rec:
            f.write(f"{x & 0xFFFFFFFF:08x}\n")
    with open(os.path.join(DATA, "sh3_real_refmap_affine.mem"), "w", encoding="utf-8") as f:
        f.write("// Ch351 AFFINE (per-vertex texel, linear interp) per-pixel texel map. gitignored.\n")
        for x in refmap_aff:
            f.write(f"{x & 0xFFFFFFFF:08x}\n")
    try:  # best-effort: the PNG is only an eyeball aid, so a missing PIL must not fail the run
        from PIL import Image
        os.makedirs(recon_dir, exist_ok=True)  # otherwise a missing directory silently skipped the PNG
        im2 = Image.new("RGB", (FBPXW, CH))
        im2.putdata(refpix_rec)
        im2.save(os.path.join(recon_dir, "sh3_real_ref_recip.png"))
    except Exception as e:
        print("(PIL skip recip png:", e, ")")

    # --- setup bootlet: BOARD-READY CLUT upload (Ch347 pattern). The CSM1 loader reads the CLUT from VRAM at
    # CBP via GRID offsets; sh3_real_clut.mem IS the raw grid bytes, so a LINEAR 256x1 BITBLT of those 256 words
    # (PSMCT32_SWIZZLE=0) places byte CBP+k*4 = word k = the grid byte verbatim -> the loader reads it correctly.
    # The upload also fires dma_done_seen, which auto-starts the feeder (C_SETUP->C_RUN). On the board this is the
    # ONLY CLUT path (no TB backdoor); the SIM TB still backdoors the same bytes (redundant + consistent).
    RAM_QWORDS = 512
    pay = [
        bake.giftag(1, 0, 0, 4, int('E'*4, 16)),
        bake.aplusd(bake.R_BITBLTBUF, bake.bitbltbuf_pack(CBP, 1, 0x00)),  # PSMCT32 dest @CBP
        bake.aplusd(bake.R_TRXPOS, bake.trxpos_pack(0, 0)),
        bake.aplusd(bake.R_TRXREG, bake.trxreg_pack(256, 1)),              # 256x1 contiguous
        bake.aplusd(bake.R_TRXDIR, bake.trxdir_pack(0)),
        bake.giftag(256//4, 1, 2, 0, 0),                                   # 64 qwords image data, EOP
    ]
    for q in range(256//4):
        word = 0
        for lane in range(4):  # four 32-bit CLUT words per 128-bit qword, lane 0 in the low bits
            word |= (clut_words[q*4+lane] & 0xFFFFFFFF) << (32*lane)
        pay.append(word)
    qwc = len(pay)
    disp_hi = ((CH-1) << 12) | (FBPXW-1)
    with open(os.path.join(DATA, "payload_sh3_real.mem"), "w", encoding="utf-8") as f:
        f.write(f"// Ch352 LOCAL SH3 real-draw setup payload (CSM1 CLUT 256x1 -> CBP={CBP}, grid bytes verbatim). gitignored. QWC={qwc}.\n")
        for _ in range(16):  # 16 zero qwords precede the payload
            f.write(f"{0:032x}\n")
        for x in pay:
            f.write(f"{x&((1<<128)-1):032x}\n")
        for _ in range(RAM_QWORDS-16-qwc):  # zero-pad to RAM_QWORDS lines
            f.write(f"{0:032x}\n")
    bake.write_bios_mem("bios_sh3_real.mem",
                        bake.build_textured_demo_bootlet_disp(qwc, disp_hi, FBW),
                        f"Ch352 LOCAL SH3 real-draw setup bootlet (QWC={qwc}, DISPLAY1={FBPXW}x{CH}). gitignored.")
    print(f"[Ch352] setup bootlet: payload {qwc} qw (CSM1 CLUT 256x1 upload to CBP={CBP}).")

    # --- emit ---
    def wmem(name, words, width_hex, banner):
        """Write ``words`` to DATA/name, one ``width_hex``-digit hex value per line, after a // banner."""
        with open(os.path.join(DATA, name), "w", encoding="utf-8") as f:
            f.write(f"// {banner}\n")
            for x in words:
                f.write(f"{x & ((1<<(4*width_hex))-1):0{width_hex}x}\n")

    # de-swizzled index image, packed 4 idx/word (LINEAR row-major: byte v*TW+u = idx(u,v))
    idx_words = [idx[i*4] | (idx[i*4+1] << 8) | (idx[i*4+2] << 16) | (idx[i*4+3] << 24) for i in range(TW*TH//4)]
    wmem("sh3_real_idx.mem", idx_words, 8, "Ch350 LOCAL SH3 512x512 de-swizzled indices (4/word) for TB ref. gitignored.")
    # LPDDR texture: the bram-top defaults PSMT8_SWIZZLE=0 (LINEAR read, like Ch347/348) — so store the texture
    # LINEAR (de-swizzled, = idx_words). The texture unit's linear PSMT8 addr (base + v*TBW*64 + u) then reads
    # texel(u,v)=idx(u,v). (The raw SWIZZLED bytes would need PSMT8_SWIZZLE=1; kept as sh3_real_tex_lpddr_swz.mem
    # for that variant.) This is the Ch299/Ch350 root-cause fix: linear texture <-> linear read.
    wmem("sh3_real_tex_lpddr.mem", idx_words, 8, "Ch350 LOCAL SH3 512x512 LINEAR de-swizzled indices -> LPDDR model (PSMT8_SWIZZLE=0). gitignored.")
    # Ch352 guardrail #2 — board-side READBACK CHECKSUM: after the HPS write-probe uploads these 65536 words to
    # LPDDR @0x200000, the HPS read-probe should read them back and confirm sum32 + xor32 BEFORE the cache fill.
    tex_sum = sum(idx_words) & 0xFFFFFFFF
    tex_xor = 0
    for w in idx_words:
        tex_xor ^= w
    print(f"[Ch352] TEXTURE→LPDDR upload checksum (verify via read-probe before cache-fill): "
          f"{len(idx_words)} words @ LPDDR 0x{LPDDR_TEX_BASE:07x} sum32=0x{tex_sum:08x} xor32=0x{tex_xor:08x}")
    tex_swz_words = [int.from_bytes(tex_swz[i*4:i*4+4], "little") for i in range(TEX_BYTES//4)]
    wmem("sh3_real_tex_lpddr_swz.mem", tex_swz_words, 8, "Ch350 LOCAL SH3 512x512 SWIZZLED bytes -> LPDDR (for PSMT8_SWIZZLE=1 variant). gitignored.")
    wmem("sh3_real_clut.mem", clut_words, 8, "Ch350 LOCAL SH3 CSM1 CLUT (raw CT32-grid bytes @CBP) -> BRAM (HW CSM1 loader reads these in grid order). gitignored.")
    # de-gridded palette pal[i] (what the HW CSM1 grid-read produces) -> TB reference expected colors
    wmem("sh3_real_pal.mem", [p & 0xFFFFFFFF for p in pal], 8,
         "Ch350 LOCAL SH3 de-gridded palette pal[i] (grid-read of the CBP bytes) for the TB reference. gitignored.")
    bake.write_feeder_stg_mem("feeder_sh3_real.mem", stg,
                              "Ch350 LOCAL SH3 REAL draw (idx89761) feeder staging: triangle list + TEX0(PSMT8,CSM1,CLD=1,DECAL). gitignored.",
                              total=STG_WORDS)
    wmem("sh3_real_refmap.mem", refmap, 8, "Ch350 LOCAL per-FB-pixel covered|interior|tu|tv reference map. gitignored.")

    # params include for the TB
    with open(os.path.join(DATA, "sh3_real_params.vh"), "w", encoding="utf-8") as f:
        f.write("// Ch350 LOCAL generated params for tb_top_psmct32_sh3_real_draw_demo. gitignored.\n")
        f.write(f"localparam int FBW = {FBW};\n")
        f.write(f"localparam int FBPXW = {FBPXW};\n")
        f.write(f"localparam int FBH = {CH};\n")
        f.write(f"localparam int VRAM_BYTES_P = {VRAM_BYTES};\n")
        f.write(f"localparam int CROP_CX0 = {CX0};\n")
        f.write(f"localparam int CROP_CY0 = {CY0};\n")
        f.write(f"localparam int CLUT_CBP = {CBP};\n")
        f.write(f"localparam int NEW_TBP = {NEW_TBP};\n")
        f.write(f"localparam int TEX_VRAM_BASE= {TEX_VRAM_BASE};\n")
        f.write(f"localparam int TEX_BYTES = {TEX_BYTES};\n")
        f.write(f"localparam [29:0] LPDDR_TEX_BASE = 30'h{LPDDR_TEX_BASE:07x};\n")
        f.write(f"localparam int N_BEATS = {TEX_BYTES//32};\n")
        f.write(f"localparam int STG_WORDS = {STG_WORDS};\n")
        f.write(f"localparam int TW = {TW};\n")
        f.write(f"localparam int TH = {TH};\n")

    # eyeball PNG (best-effort, same policy as the reciprocal PNG above)
    try:
        from PIL import Image
        os.makedirs(recon_dir, exist_ok=True)
        im = Image.new("RGB", (FBPXW, CH))
        im.putdata(refpix)
        im.save(os.path.join(recon_dir, "sh3_real_ref.png"))
        print("[Ch350] wrote sh3_real_ref.png")
    except Exception as e:
        print("(PIL skipped:", e, ")")
    print(f"[Ch350] emitted fixtures -> {DATA} (LOCAL). TEX_VRAM_BASE=0x{TEX_VRAM_BASE:x} TBP={NEW_TBP} CBP={CBP} "
          f"LPDDR_TEX_BASE=0x{LPDDR_TEX_BASE:x} N_BEATS={TEX_BYTES//32}")
# Script entry point: run the fixture generator with the process arguments when executed directly
# (importing the module does not trigger generation).
if __name__ == "__main__":
    main(sys.argv)