ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
383 lines
24 KiB
Python
383 lines
24 KiB
Python
#!/usr/bin/env python3
"""retroDE_ps2 — Ch350 fixtures: the EXACT Ch349 SH3 draw for the full-res LPDDR integration TB.

Codex scope (no crop/downscale/repack): the actual draw geometry (f1 idx89761 TRI_STRIP -> triangle LIST),
the reconstructed 512x512 PSMT8 texture (in LPDDR), the real CSM1 CLUT (grid/CT32 order in BRAM), perspective
ST/Q, DECAL. Pixel-diff vs the Ch349 host reference with the Ch348 bounded <=1-texel acceptance.

Emits (LOCAL/gitignored -> sim/data/top_psmct32_raster_demo/):
  sh3_real_tex_lpddr.mem      65536 LE words = the DE-SWIZZLED (linear) 512x512 indices packed 4/word
                              (loaded into the behavioral LPDDR model; read with PSMT8_SWIZZLE=0).
  sh3_real_tex_lpddr_swz.mem  65536 LE words = the PSMT8-SWIZZLED texture bytes mem[TBP*256 : +262144]
                              (for the PSMT8_SWIZZLE=1 variant).
  sh3_real_idx.mem            65536 words = the DE-SWIZZLED 512x512 indices packed 4/word (TB reference search).
  sh3_real_clut.mem           256 words = the CT32-grid CLUT bytes mem[CBP*256 : +1024] (backdoored into BRAM;
                              the Ch350 CSM1 clut_loader reads them in grid order).
  sh3_real_pal.mem            256 words = the de-gridded palette pal[i] (TB reference expected colors).
  feeder_sh3_real.mem         feeder staging: 68 tris (translated to FB origin) + TEX0(PSMT8,CSM1,CLD=1,DECAL).
  sh3_real_refmap.mem         per FB pixel (FBW*64 x FBH): covered|interior|deep|tu|tv for the bounded TB check.
  sh3_real_refmap_recip.mem   same layout (no deep bit), texels from the 8-bit reciprocal-LUT reference.
  sh3_real_refmap_affine.mem  same layout (no deep bit), texels from the affine (non-perspective) reference.
  payload_sh3_real.mem        setup payload: 256x1 CSM1 CLUT upload to CBP.
  bios_sh3_real.mem           setup bootlet (written through bake.write_bios_mem).

Also writes, best-effort (skipped if PIL or the directory is unavailable), into
captures/gs/silenthill3/extracted/recon/:
  sh3_real_ref.png            eyeball reference (host DECAL render at FB scale).
  sh3_real_ref_recip.png      the same render using the reciprocal-LUT texels.

Geometry/addressing constants are printed + emitted as `sh3_real_params.vh` for the TB to include.

Usage: gs_make_sh3_real_draw_fixture.py [dump.gs.zst] [--draw-idx 89761]
"""
|
||
import glob
import os
import sys

# Directory anchors: ROOT is the parent of this script's directory; DATA is where the fixtures are written.
HERE = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.normpath(os.path.join(HERE, ".."))
DATA = os.path.join(ROOT, "sim", "data", "top_psmct32_raster_demo")

# The sibling tooling modules are imported by bare name, so HERE has to be on sys.path before they load.
sys.path.insert(0, HERE)
import gs_sh3_draw_census as C  # noqa: E402
import gs_sh3_recon as RC  # noqa: E402
import gs_localmem as LM  # noqa: E402,F401  (not referenced in this file's visible code; kept as-is)

# DATA is pushed to the front of sys.path right before `bake` is imported — presumably that is where
# bake.py lives (TODO confirm).
sys.path.insert(0, DATA)
import bake  # noqa: E402
|
||
|
||
# ---- address map (Ch350 BRAM-CROP diagnostic: read2 needs VRAM < 256 KiB; FB cropped, texture/CLUT/geometry
# FULL-RES). VRAM_BYTES = 0x20000 (128 KiB, 2^15 words) so the read2 tripwire (>=256 KiB) is NOT tripped.
# Only the FRAMEBUFFER/VIEWPORT is cropped to a deterministic CH-tall band; the texture stays full 512x512
# (in LPDDR, cache-intercepted base 0x40000), the CSM1 CLUT is full, the geometry/ST/Q are unchanged. ----
FBW = 4                            # 256 px wide FB (64-px units) — full draw width (247) fits
FBPXW = FBW * 64                   # 256
CH = 120                           # FB rows: 256*120*4 = 122880 B; + CLUT(1 KiB) < 128 KiB
VRAM_BYTES = 0x20000               # 128 KiB BRAM VRAM (2^15 words) — under the 256 KiB read2 tripwire
# CLUT sits right after the 32-bit framebuffer. Derived from the FB size (256*120*4 = 0x1E000 -> CBP 480)
# instead of a second hard-coded literal, so changing CH cannot silently overlap the CLUT with the FB.
CBP = (FBPXW * CH * 4) // 256      # 480
NEW_TBP = 0x40000 // 256           # 1024 (texture VRAM base; cache-intercepted, NOT in the 128 KiB BRAM)
TEX_VRAM_BASE = NEW_TBP * 256      # 0x40000
TW_LOG, TH_LOG = 9, 9              # 512x512
TEX_BYTES = (1 << TW_LOG) * (1 << TH_LOG)   # 262144 (PSMT8, 1 byte/texel) — FULL texture, no downscale
LPDDR_TEX_BASE = 0x00200000        # texture byte base in LPDDR4B
TBW_TEX = 8                        # texture TBW (64-px units) — MUST match the original draw's swizzle
STG_WORDS = 768

# Enforce the invariant the comments above rely on: framebuffer + 1 KiB CLUT must fit in the BRAM VRAM.
if CBP * 256 + 1024 > VRAM_BYTES:
    raise ValueError(f"FB ({FBPXW}x{CH}x4 B) + 1 KiB CLUT does not fit in VRAM_BYTES=0x{VRAM_BYTES:x}")
|
||
|
||
def recip8(q, IDX_BITS=8, SCALE=24, Q_W=24):
    """Exact Python replica of gs_reciprocal_stub.

    recip = (floor(2^(SCALE+IDX_BITS-1)/M) >> e), where M is q normalized to an IDX_BITS-wide mantissa
    (MSB at TOP_BIT) and e is the MSB index of q. Used to build the RTL-FAITHFUL reference that isolates
    reciprocal quantization from S1 attribute under-interpolation.

    Args:
        q: integer denominator (``int.bit_length`` is used, so it must be an int).
        IDX_BITS: mantissa width in bits.
        SCALE: fixed-point scale of the result, i.e. the result approximates 2^SCALE / q.
        Q_W: not used by this function; kept so existing call signatures stay valid.

    Returns:
        The quantized reciprocal, saturated to 2^(SCALE+1)-1. That saturation value is also returned
        for q <= 0 and for a zero mantissa.
    """
    saturated = (1 << (SCALE + 1)) - 1
    if q <= 0:
        return saturated

    top_bit = IDX_BITS - 1
    msb = q.bit_length() - 1
    # Align the MSB of q with top_bit: large values drop low bits, small values are shifted up.
    if msb >= top_bit:
        normalized = q >> (msb - top_bit)
    else:
        normalized = q << (top_bit - msb)

    mantissa = normalized & ((1 << IDX_BITS) - 1)
    if mantissa == 0:
        return saturated

    # Divide by the mantissa, then undo the normalization with the exponent shift.
    return min(((1 << (SCALE + top_bit)) // mantissa) >> msb, saturated)
|
||
|
||
def persp_texel_recip(uq, vq, q, tw, th, idx_bits=8, SCALE=24):
    """gs_persp_uv with the idx_bits reciprocal LUT: u=(uq*recip)>>SCALE clamped to 2047, then REPEAT-wrap.

    Args:
        uq, vq: interpolated numerators; rounded to the nearest int before use.
        q: interpolated denominator; rounded to the nearest int and fed to ``recip8``.
        tw, th: texture width/height used for the final modulo wrap.
        idx_bits: mantissa width forwarded to ``recip8`` as ``IDX_BITS``.
        SCALE: fixed-point scale shared by the reciprocal and the final right shift.

    Returns:
        ``(u % tw, v % th)`` texel coordinates.
    """
    recip = recip8(int(round(q)), IDX_BITS=idx_bits, SCALE=SCALE)
    # Only the upper bound is clamped; anything below zero is left to the modulo wrap.
    u = min((int(round(uq)) * recip) >> SCALE, 2047)
    v = min((int(round(vq)) * recip) >> SCALE, 2047)
    return (u % tw), (v % th)
|
||
|
||
def f32_to(v):
    """Placeholder hook: hands back ``v`` untouched (nothing in the visible file calls it)."""
    return v
|
||
|
||
def tex0_real(tbp, cbp):
    """Build the TEX0 value for the SH3 draw: PSMT8 512x512 DECAL texture plus a CSM1 PSMCT32 CLUT.

    The texture half comes from ``bake.tex0_pack`` (psm=0x13, tw/th = TW_LOG/TH_LOG, tfx=1, TBW_TEX);
    the CLUT fields are OR-ed in here at their bit positions.

    Args:
        tbp: texture base pointer handed to ``bake.tex0_pack``.
        cbp: CLUT base pointer; masked to 14 bits.

    Returns:
        The packed TEX0 register value as an int.
    """
    cpsm = 0  # CPSM = PSMCT32
    csm = 0   # CSM = 0 -> CSM1 (16x16 CT32 grid) — the Ch350 path
    csa = 0   # CSA = 0
    cld = 1   # CLD = 1 -> always load

    tex_fields = bake.tex0_pack(tbp, TBW_TEX, psm=0x13, tw=TW_LOG, th=TH_LOG, tfx=1)
    clut_fields = (
        ((cbp & 0x3FFF) << 37)
        | ((cpsm & 0xF) << 51)
        | ((csm & 0x1) << 55)
        | ((csa & 0x1F) << 56)
        | ((cld & 0x7) << 61)
    )
    return tex_fields | clut_fields
|
||
|
||
def main(argv):
    """Generate every SH3 real-draw fixture consumed by the Ch350-Ch352 testbenches.

    Steps, in order:
      1. Parse ``argv``: an optional positional dump path and an optional ``--draw-idx N``.
      2. Fetch the draw with ``C.get_draw`` and rebuild GS local memory with ``RC.build_localmem_to``.
      3. Translate the strip to its own origin, choose the CH-tall row band with the most covered pixels,
         clip each strip triangle to the FB window and fan the result into an explicit triangle list.
      4. Pack the feeder staging words (7 header words + 3 words per vertex).
      5. Rasterize three host reference maps: float perspective, reciprocal-LUT, and affine.
      6. Write the .mem/.vh fixtures into DATA, the CLUT setup payload + bootlet, and best-effort PNGs.

    Args:
        argv: the full argument vector; ``argv[0]`` is ignored.

    The function returns None. It terminates through ``sys.exit`` with a message when: no dump is found,
    ``--draw-idx`` has no integer value, the draw is missing, TEX0 is not the expected 512x512 PSMT8,
    the VRAM snapshot is absent, a packed S/T/Q value overflows its field, or staging exceeds STG_WORDS.
    """
    # ---- command line ----
    args = argv[1:]
    dump = args[0] if args and not args[0].startswith("--") else None
    draw_idx = 89761
    if "--draw-idx" in args:
        try:
            draw_idx = int(args[args.index("--draw-idx") + 1])
        except (IndexError, ValueError):
            sys.exit("--draw-idx requires an integer value")
    if dump is None:
        # glob order is unspecified; sort so the default pick is reproducible when several dumps match.
        found = sorted(glob.glob(os.path.join(ROOT, "captures", "gs", "silenthill3", "*224139*.gs.zst")))
        if not found:
            sys.exit("no SH3 dump found; pass the .gs.zst path")
        dump = found[0]

    PERSP_FRAC = bake.PERSP_FRAC

    dr = C.get_draw(dump, draw_idx)
    if dr is None:
        sys.exit(f"draw idx {draw_idx} not found")
    t0 = dr["tex0"]
    ORIG_TBP = t0["tbp"]
    ORIG_CBP = t0["cbp"]
    TW, TH = t0["tw"], t0["th"]
    # Explicit exit rather than `assert`, so the check is not stripped under `python -O`.
    if not (TW == 512 and TH == 512 and t0["psm"] == 0x13):
        sys.exit(f"unexpected TEX0 {t0}")

    # --- reconstruct GS local memory at draw time (Ch349) ---
    mem, _replayed, _uploads, _events, _vram = RC.build_localmem_to(dump, draw_idx)
    if mem is None:
        sys.exit("VRAM snapshot absent")
    # de-swizzled 512x512 index image (for TB reference) + swizzled bytes (for LPDDR)
    idx = mem.read_psmt8(ORIG_TBP, t0["tbw"], TW, TH)                  # de-swizzled indices
    tex_swz = bytes(mem.m[ORIG_TBP*256 : ORIG_TBP*256 + TEX_BYTES])    # swizzled bytes -> LPDDR
    clut_bytes = bytes(mem.m[ORIG_CBP*256 : ORIG_CBP*256 + 1024])      # CT32-grid CLUT bytes -> BRAM
    pal = RC.read_clut32(mem, ORIG_CBP, order="grid")                  # for the reference PNG
    # The 256 CLUT words are needed twice (setup payload and sh3_real_clut.mem); build them once.
    clut_words = [int.from_bytes(clut_bytes[i*4:i*4+4], "little") for i in range(256)]

    # --- geometry: translate to draw origin (full frame), then choose a deterministic CH-tall VIEWPORT crop ---
    xmin = min(v["x"] for v in dr["verts"])
    ymin = min(v["y"] for v in dr["verts"])
    OX, OY = int(xmin), int(ymin)
    fverts = [dict(x=v["x"]-OX, y=v["y"]-OY, s=v["s"], t=v["t"], q=v["q"]) for v in dr["verts"]]
    ftris = [(i-2, i-1, i) for i in range(2, len(fverts))]   # TRI_STRIP -> index triples
    full_h = int(max(v["y"] for v in fverts)) + 1

    def edge(ax, ay, bx, by, px, py):
        # 2D edge function (signed doubled area of a,b,p).
        return (px-ax)*(by-ay) - (py-ay)*(bx-ax)

    # per-row coverage histogram over the FULL frame -> pick CY0 = argmax covered pixels in a CH-tall band.
    row_cov = [0]*(full_h+CH+2)
    for (a0, b0, c0) in ftris:
        v0, v1, v2 = fverts[a0], fverts[b0], fverts[c0]
        x0, y0 = v0["x"], v0["y"]
        x1, y1 = v1["x"], v1["y"]
        x2, y2 = v2["x"], v2["y"]
        ar = edge(x0, y0, x1, y1, x2, y2)
        if abs(ar) < 1e-9:
            continue   # degenerate triangle
        inv = 1.0/ar
        for py in range(max(0, int(min(y0, y1, y2))), min(full_h-1, int(max(y0, y1, y2))+1)+1):
            for px in range(max(0, int(min(x0, x1, x2))), min(FBPXW-1, int(max(x0, x1, x2))+1)+1):
                cx, cy = px+0.5, py+0.5
                w0 = edge(x1, y1, x2, y2, cx, cy)*inv
                w1 = edge(x2, y2, x0, y0, cx, cy)*inv
                w2 = 1.0-w0-w1
                if w0 >= -0.001 and w1 >= -0.001 and w2 >= -0.001:
                    row_cov[py] += 1
    best_cy0, best_sum = 0, -1
    for cy0 in range(0, max(1, full_h-CH+1)):
        s = sum(row_cov[cy0:cy0+CH])
        if s > best_sum:   # strict '>' keeps the FIRST best band on ties
            best_sum, best_cy0 = s, cy0
    CY0 = best_cy0
    CX0 = 0

    # apply the viewport crop: shift Y by -CY0 (ST/Q UNCHANGED — only the framebuffer window moves), then
    # CLIP each triangle to the crop rect [0,FBPXW]x[0,CH] (Sutherland-Hodgman, interpolating S/T/Q linearly in
    # screen space — correct since S,T,Q are already premultiplied by 1/w). This is the VIEWPORT scissor done at
    # the host: every emitted vertex lands inside the FB (no out-of-bounds writes), geometry SHAPE + per-vertex
    # ST/Q are preserved exactly; only the framebuffer window is cropped. Codex's "cropped or scissored" rule.
    def lerp(p1, p2, a):
        return dict(x=p1["x"]+a*(p2["x"]-p1["x"]), y=p1["y"]+a*(p2["y"]-p1["y"]),
                    s=p1["s"]+a*(p2["s"]-p1["s"]), t=p1["t"]+a*(p2["t"]-p1["t"]),
                    q=p1["q"]+a*(p2["q"]-p1["q"]))

    def clip_edge(poly, inside, isect):
        # One Sutherland-Hodgman pass: keep inside vertices, insert an intersection on each boundary crossing.
        out = []
        for i in range(len(poly)):
            cur = poly[i]
            prv = poly[i-1]   # i == 0 wraps to the last vertex (closed polygon)
            ci = inside(cur)
            pi = inside(prv)
            if ci:
                if not pi:
                    out.append(isect(prv, cur))
                out.append(cur)
            elif pi:
                out.append(isect(prv, cur))
        return out

    def clip_rect(poly):
        # left x>=0, right x<=FBPXW, top y>=0, bottom y<=CH
        poly = clip_edge(poly, lambda p: p["x"] >= 0.0,
                         lambda a, b: lerp(a, b, (0.0-a["x"])/(b["x"]-a["x"])))
        if not poly:
            return poly
        poly = clip_edge(poly, lambda p: p["x"] <= FBPXW,
                         lambda a, b: lerp(a, b, (FBPXW-a["x"])/(b["x"]-a["x"])))
        if not poly:
            return poly
        poly = clip_edge(poly, lambda p: p["y"] >= 0.0,
                         lambda a, b: lerp(a, b, (0.0-a["y"])/(b["y"]-a["y"])))
        if not poly:
            return poly
        poly = clip_edge(poly, lambda p: p["y"] <= CH,
                         lambda a, b: lerp(a, b, (CH-a["y"])/(b["y"]-a["y"])))
        return poly

    sverts = [dict(x=v["x"]-CX0, y=v["y"]-CY0, s=v["s"], t=v["t"], q=v["q"]) for v in fverts]

    def rnd(v):
        # round XY to integer screen coords (the feeder gets ints) — host ref MUST use the SAME ints.
        return dict(x=float(int(round(v["x"]))), y=float(int(round(v["y"]))), s=v["s"], t=v["t"], q=v["q"])

    tris = []   # list of (v0,v1,v2) explicit clipped vertex dicts with INTEGER screen XY
    for (a0, b0, c0) in ftris:   # sverts has the same length as fverts, so the strip triples are identical
        poly = clip_rect([sverts[a0], sverts[b0], sverts[c0]])
        poly = [rnd(p) for p in poly]
        for k in range(1, len(poly)-1):   # fan the clipped polygon into triangles
            tris.append((poly[0], poly[k], poly[k+1]))
    print(f"[Ch350] draw idx{draw_idx}: {len(sverts)} verts; full frame {FBPXW}x{full_h}; DETERMINISTIC crop "
          f"CX0={CX0} CY0={CY0} -> FB {FBPXW}x{CH} ({best_sum} covered px in band); clipped to {len(tris)} tris")

    # --- feeder staging ---
    # NOTE: gs_persp_uv contract is uq=(u/w)*2^FRAC, q=(1/w)*2^FRAC, u=(uq*floor(2^SCALE/q))>>SCALE. Scaling
    # S/T/Q by K is INVARIANT (cancels) — confirmed it doesn't move the residual. The texel-accuracy limit is
    # the gs_reciprocal_stub 8-bit (256-entry) LUT: ~0.4% relative -> <=1 texel for Ch348's TW=64 but ~2+ texels
    # for this TW=512 texture (plus the S1-path under-interpolation banding). A perspective-precision limit.
    # Ch351 — EFFECTIVE PERSP_FRAC. The hardware u=s/q divide cancels the frac scale, so "widening PERSP_FRAC"
    # is a HOST PACKING choice: pack S/T/Q with more frac bits so the far-surface denominator q=(1/w)*2^FRACeff
    # doesn't round to 1-2 (FRAC=12 collapses for w~2048). PSCALE=2^k gives FRACeff = PERSP_FRAC + k. PSCALE=256
    # -> FRACeff=20, which took the SH3 crop 20%->80% (Codex's "Q×256 ≈ +8 frac bits"). Default PSCALE=1 keeps
    # Ch342/348 at FRAC=12 (canaries). The 24-bit signed S/T field bounds FRACeff for a given S/T range — checked.
    PSCALE = 4096
    S24_MAX = (1 << 23) - 1
    max_sfp = [0]   # one-element list so the nested function can update the running maximum

    def vert_words(v):
        s_fp = round(v["s"] * TW * (1 << PERSP_FRAC) * PSCALE)   # s_fp/q_fp = (S/Q)*TW = texel_u (perspective)
        t_fp = round(v["t"] * TH * (1 << PERSP_FRAC) * PSCALE)
        q_fp = round(v["q"] * (1 << PERSP_FRAC) * PSCALE)
        max_sfp[0] = max(max_sfp[0], abs(s_fp), abs(t_fp))
        if abs(s_fp) > S24_MAX or abs(t_fp) > S24_MAX:   # 24-bit signed ST field overflow guard (Codex #3)
            sys.exit(f"[Ch351] OVERFLOW: |s_fp|={abs(s_fp)} or |t_fp|={abs(t_fp)} > 2^23-1 at PSCALE={PSCALE} "
                     f"(FRACeff={PERSP_FRAC}+{PSCALE.bit_length()-1}). Lower PSCALE for this S/T range.")
        if abs(q_fp) > 0x7FFFFFFF:
            sys.exit(f"[Ch351] OVERFLOW: |q_fp|={abs(q_fp)} > 2^31-1 (Q field). Lower PSCALE.")
        # NOTE(review): the clip rect allows x==FBPXW / y==CH, and the host reference below rasterizes with
        # those rounded values, while the staged vertex is clamped to FBPXW-1 / CH-1 here. On the right and
        # bottom borders the staged geometry can therefore differ from the reference by one pixel — confirm
        # this is intended.
        sx = max(0, min(FBPXW-1, int(round(v["x"]))))
        sy = max(0, min(CH-1, int(round(v["y"]))))
        return [bake.rgbaq_with_q(0, 0, 0, q_fp & 0xFFFFFFFF),
                bake.st_data(s_fp & 0xFFFFFF, t_fp & 0xFFFFFF),
                bake.xyz2_dataz(sx, sy, 0x0000_5000)]

    stg = []
    stg.append(len(tris) | (1 << 32))            # word0: ntris | perspective-format flag
    stg.append(bake.frame_1_psmct32(FBW))
    stg.append(bake.alpha_pack(0, 1, 0, 1))
    stg.append(0)                                # TEST_1 = 0 (ZTE=0, ATE=0): no depth/alpha test
    stg.append(bake.zbuf1_pack(0, zmsk=1))       # ZMSK=1: no Z writes -> no Z buffer needed
    stg.append(tex0_real(NEW_TBP, CBP))          # PSMT8 + CSM1 CLUT (CLD=1) -> feeder commit fires the load
    stg.append(3 | (1 << 4))                     # TRI + TME, ABE=0 (S1 perspective path)
    for (v0, v1, v2) in tris:
        for v in (v0, v1, v2):
            stg += vert_words(v)
    if len(stg) > STG_WORDS:
        sys.exit(f"staging {len(stg)} > {STG_WORDS} (raise STG_WORDS)")
    print(f"[Ch350] feeder staging: {len(stg)} words (<= {STG_WORDS})")
    print(f"[Ch351] effective PERSP_FRAC = {PERSP_FRAC}+{PSCALE.bit_length()-1} = {PERSP_FRAC+PSCALE.bit_length()-1} "
          f"(PSCALE={PSCALE}); max |s_fp/t_fp|={max_sfp[0]} of 2^23-1 ({100.0*max_sfp[0]/((1<<23)-1):.1f}% of the 24-bit ST field)")

    # --- host reference + per-pixel texel map. TWO references over the SAME clipped geometry:
    #   refmap     = FLOAT perspective (ideal) — the Codex pixel-diff oracle.
    #   refmap_rec = RTL-FAITHFUL: fixed-point vertex attrs (uq=s*TW*2^FRAC, q=Q*2^FRAC), float interp, then the
    #                8-bit gs_reciprocal_stub. Comparing the RTL FB vs BOTH isolates reciprocal quantization
    #                (RTL≈refmap_rec, refmap_rec≠refmap) from S1 under-interpolation banding (RTL≠refmap_rec).
    refmap = [0]*(FBPXW*CH)
    refpix = [(0, 0, 0)]*(FBPXW*CH)
    refmap_rec = [0]*(FBPXW*CH)
    refpix_rec = [(0, 0, 0)]*(FBPXW*CH)
    refmap_aff = [0]*(FBPXW*CH)   # AFFINE: per-vertex texel, linear u,v interp (NOT perspective-correct)
    F = 1 << PERSP_FRAC
    for (v0, v1, v2) in tris:
        x0, y0 = v0["x"], v0["y"]
        x1, y1 = v1["x"], v1["y"]
        x2, y2 = v2["x"], v2["y"]
        area = edge(x0, y0, x1, y1, x2, y2)
        if abs(area) < 1e-9:
            continue
        inv = 1.0/area
        # per-vertex FIXED-POINT attributes at 2^PERSP_FRAC.
        # NOTE(review): vert_words() stages these with an extra PSCALE factor, so these values are not the
        # staged words bit-for-bit (the rounded q in particular is far coarser here) — confirm intended.
        uqv = [round(v["s"]*TW*F) for v in (v0, v1, v2)]
        vqv = [round(v["t"]*TH*F) for v in (v0, v1, v2)]
        qv = [round(v["q"]*F) for v in (v0, v1, v2)]
        # per-vertex TEXEL (perspective divide at the vertex) for the affine reference
        auv = [((v["s"]/v["q"])*TW if abs(v["q"]) > 1e-12 else 0.0) for v in (v0, v1, v2)]
        avv = [((v["t"]/v["q"])*TH if abs(v["q"]) > 1e-12 else 0.0) for v in (v0, v1, v2)]
        minx = max(0, int(min(x0, x1, x2)))
        maxx = min(FBPXW-1, int(max(x0, x1, x2))+1)
        miny = max(0, int(min(y0, y1, y2)))
        maxy = min(CH-1, int(max(y0, y1, y2))+1)
        for py in range(miny, maxy+1):
            for px in range(minx, maxx+1):
                cx, cy = px+0.5, py+0.5
                w0 = edge(x1, y1, x2, y2, cx, cy)*inv
                w1 = edge(x2, y2, x0, y0, cx, cy)*inv
                w2 = 1.0-w0-w1
                if w0 < -0.001 or w1 < -0.001 or w2 < -0.001:
                    continue
                # Ch351 convention fix: coverage/interior at pixel CENTER (px+0.5), but the RTL interpolates
                # the perspective ATTRIBUTES at the INTEGER pixel coord (tex_dx_s1 = s1_x_q - v0_x, no +0.5).
                # Use a CORNER barycentric for S/T/Q to match -> removes the ~0.5-texel drift.
                a0w = edge(x1, y1, x2, y2, float(px), float(py))*inv
                a1w = edge(x2, y2, x0, y0, float(px), float(py))*inv
                a2w = 1.0-a0w-a1w
                S = a0w*v0["s"]+a1w*v1["s"]+a2w*v2["s"]
                T = a0w*v0["t"]+a1w*v1["t"]+a2w*v2["t"]
                Q = a0w*v0["q"]+a1w*v1["q"]+a2w*v2["q"]
                if abs(Q) < 1e-12:
                    continue
                # Python's % with a positive modulus is already in [0, TW), so no negative fix-up is needed.
                # NOTE(review): int() truncates toward zero, so a texel coordinate in (-1, 0) maps to 0
                # rather than wrapping to TW-1 — confirm that matches the hardware for negative S/Q.
                tu = int((S/Q)*TW) % TW
                tv = int((T/Q)*TH) % TH
                # RTL-faithful: interp the FIXED-POINT attrs, then the 8-bit reciprocal
                # NOTE(review): these use the pixel-CENTER weights w0..w2, unlike the CORNER weights used
                # for the float reference just above — confirm which convention the RTL path should get.
                uq = w0*uqv[0]+w1*uqv[1]+w2*uqv[2]
                vq = w0*vqv[0]+w1*vqv[1]+w2*vqv[2]
                qq = w0*qv[0]+w1*qv[1]+w2*qv[2]
                tur, tvr = persp_texel_recip(uq, vq, qq, TW, TH, idx_bits=8)
                # AFFINE texel: linear interp of the per-vertex texels (the under-interpolation hypothesis)
                au = int(w0*auv[0]+w1*auv[1]+w2*auv[2]) % TW
                av = int(w0*avv[0]+w1*avv[1]+w2*avv[2]) % TH
                mw = min(w0, w1, w2)
                interior = 1 if mw > 0.04 else 0   # away from this triangle's own edges
                deep = 1 if mw > 0.15 else 0       # DEEP interior — far from any edge => seam-free zone
                o = py*FBPXW+px
                # word layout: [31] covered, [30] interior, [29] deep (refmap only), [17:9] tu, [8:0] tv
                refmap[o] = (1 << 31) | (interior << 30) | (deep << 29) | ((tu & 0x1FF) << 9) | (tv & 0x1FF)
                refmap_rec[o] = (1 << 31) | (interior << 30) | ((tur & 0x1FF) << 9) | (tvr & 0x1FF)
                refmap_aff[o] = (1 << 31) | (interior << 30) | ((au & 0x1FF) << 9) | (av & 0x1FF)
                p = pal[idx[tv*TW+tu] & 0xFF]
                refpix[o] = (p & 0xFF, (p >> 8) & 0xFF, (p >> 16) & 0xFF)
                pr = pal[idx[tvr*TW+tur] & 0xFF]
                refpix_rec[o] = (pr & 0xFF, (pr >> 8) & 0xFF, (pr >> 16) & 0xFF)

    covered = sum(1 for w in refmap if w >> 31)
    print(f"[Ch350] host reference: {covered} covered FB pixels")
    # emit the RTL-faithful refmap + PNG for the Ch351 oracle
    with open(os.path.join(DATA, "sh3_real_refmap_recip.mem"), "w") as f:
        f.write("// Ch351 RTL-faithful (8-bit reciprocal) per-pixel texel map. gitignored.\n")
        for x in refmap_rec:
            f.write(f"{x & 0xFFFFFFFF:08x}\n")
    with open(os.path.join(DATA, "sh3_real_refmap_affine.mem"), "w") as f:
        f.write("// Ch351 AFFINE (per-vertex texel, linear interp) per-pixel texel map. gitignored.\n")
        for x in refmap_aff:
            f.write(f"{x & 0xFFFFFFFF:08x}\n")
    # Best-effort PNG: any failure (PIL missing, output directory missing, ...) is reported and skipped.
    try:
        from PIL import Image
        Image.new("RGB", (FBPXW, CH)).copy()   # noop guard
        im2 = Image.new("RGB", (FBPXW, CH))
        im2.putdata(refpix_rec)
        im2.save(os.path.join(ROOT, "captures", "gs", "silenthill3", "extracted", "recon", "sh3_real_ref_recip.png"))
    except Exception as e:
        print("(PIL skip recip png:", e, ")")

    # --- setup bootlet: BOARD-READY CLUT upload (Ch347 pattern). The CSM1 loader reads the CLUT from VRAM at
    # CBP via GRID offsets; sh3_real_clut.mem IS the raw grid bytes, so a LINEAR 256x1 BITBLT of those 256 words
    # (PSMCT32_SWIZZLE=0) places byte CBP+k*4 = word k = the grid byte verbatim -> the loader reads it correctly.
    # The upload also fires dma_done_seen, which auto-starts the feeder (C_SETUP->C_RUN). On the board this is the
    # ONLY CLUT path (no TB backdoor); the SIM TB still backdoors the same bytes (redundant + consistent).
    RAM_QWORDS = 512
    pay = []
    pay.append(bake.giftag(1, 0, 0, 4, int('E'*4, 16)))
    pay.append(bake.aplusd(bake.R_BITBLTBUF, bake.bitbltbuf_pack(CBP, 1, 0x00)))   # PSMCT32 dest @CBP
    pay.append(bake.aplusd(bake.R_TRXPOS, bake.trxpos_pack(0, 0)))
    pay.append(bake.aplusd(bake.R_TRXREG, bake.trxreg_pack(256, 1)))               # 256x1 contiguous
    pay.append(bake.aplusd(bake.R_TRXDIR, bake.trxdir_pack(0)))
    pay.append(bake.giftag(256//4, 1, 2, 0, 0))                                    # 64 qwords image data, EOP
    for q in range(256//4):
        word = 0
        for lane in range(4):   # four 32-bit CLUT words per 128-bit qword, lane 0 in the low bits
            word |= (clut_words[q*4+lane] & 0xFFFFFFFF) << (32*lane)
        pay.append(word)
    qwc = len(pay)
    disp_hi = ((CH-1) << 12) | (FBPXW-1)
    with open(os.path.join(DATA, "payload_sh3_real.mem"), "w") as f:
        f.write(f"// Ch352 LOCAL SH3 real-draw setup payload (CSM1 CLUT 256x1 -> CBP={CBP}, grid bytes verbatim). gitignored. QWC={qwc}.\n")
        for _ in range(16):
            f.write(f"{0:032x}\n")
        for x in pay:
            f.write(f"{x&((1<<128)-1):032x}\n")
        for _ in range(RAM_QWORDS-16-qwc):
            f.write(f"{0:032x}\n")
    bake.write_bios_mem("bios_sh3_real.mem",
                        bake.build_textured_demo_bootlet_disp(qwc, disp_hi, FBW),
                        f"Ch352 LOCAL SH3 real-draw setup bootlet (QWC={qwc}, DISPLAY1={FBPXW}x{CH}). gitignored.")
    print(f"[Ch352] setup bootlet: payload {qwc} qw (CSM1 CLUT 256x1 upload to CBP={CBP}).")

    # --- emit ---
    def wmem(name, words, width_hex, banner):
        # One `//` banner line, then one zero-padded hex word per line, masked to width_hex nibbles.
        with open(os.path.join(DATA, name), "w") as f:
            f.write(f"// {banner}\n")
            for x in words:
                f.write(f"{x & ((1<<(4*width_hex))-1):0{width_hex}x}\n")

    # de-swizzled index image, packed 4 idx/word (LINEAR row-major: byte v*TW+u = idx(u,v))
    idx_words = [idx[i*4] | (idx[i*4+1] << 8) | (idx[i*4+2] << 16) | (idx[i*4+3] << 24) for i in range(TW*TH//4)]
    wmem("sh3_real_idx.mem", idx_words, 8, "Ch350 LOCAL SH3 512x512 de-swizzled indices (4/word) for TB ref. gitignored.")
    # LPDDR texture: the bram-top defaults PSMT8_SWIZZLE=0 (LINEAR read, like Ch347/348) — so store the texture
    # LINEAR (de-swizzled, = idx_words). The texture unit's linear PSMT8 addr (base + v*TBW*64 + u) then reads
    # texel(u,v)=idx(u,v). (The raw SWIZZLED bytes would need PSMT8_SWIZZLE=1; kept as sh3_real_tex_lpddr_swz.mem
    # for that variant.) This is the Ch299/Ch350 root-cause fix: linear texture <-> linear read.
    wmem("sh3_real_tex_lpddr.mem", idx_words, 8, "Ch350 LOCAL SH3 512x512 LINEAR de-swizzled indices -> LPDDR model (PSMT8_SWIZZLE=0). gitignored.")
    # Ch352 guardrail #2 — board-side READBACK CHECKSUM: after the HPS write-probe uploads these 65536 words to
    # LPDDR @0x200000, the HPS read-probe should read them back and confirm sum32 + xor32 BEFORE the cache fill.
    tex_sum = sum(idx_words) & 0xFFFFFFFF
    tex_xor = 0
    for w in idx_words:
        tex_xor ^= w
    print(f"[Ch352] TEXTURE→LPDDR upload checksum (verify via read-probe before cache-fill): "
          f"{len(idx_words)} words @ LPDDR 0x{LPDDR_TEX_BASE:07x} sum32=0x{tex_sum:08x} xor32=0x{tex_xor:08x}")
    tex_swz_words = [int.from_bytes(tex_swz[i*4:i*4+4], "little") for i in range(TEX_BYTES//4)]
    wmem("sh3_real_tex_lpddr_swz.mem", tex_swz_words, 8, "Ch350 LOCAL SH3 512x512 SWIZZLED bytes -> LPDDR (for PSMT8_SWIZZLE=1 variant). gitignored.")
    wmem("sh3_real_clut.mem", clut_words, 8, "Ch350 LOCAL SH3 CSM1 CLUT (raw CT32-grid bytes @CBP) -> BRAM (HW CSM1 loader reads these in grid order). gitignored.")
    # de-gridded palette pal[i] (what the HW CSM1 grid-read produces) -> TB reference expected colors
    wmem("sh3_real_pal.mem", [p & 0xFFFFFFFF for p in pal], 8,
         "Ch350 LOCAL SH3 de-gridded palette pal[i] (grid-read of the CBP bytes) for the TB reference. gitignored.")
    bake.write_feeder_stg_mem("feeder_sh3_real.mem", stg,
                              "Ch350 LOCAL SH3 REAL draw (idx89761) feeder staging: triangle list + TEX0(PSMT8,CSM1,CLD=1,DECAL). gitignored.",
                              total=STG_WORDS)
    wmem("sh3_real_refmap.mem", refmap, 8, "Ch350 LOCAL per-FB-pixel covered|interior|tu|tv reference map. gitignored.")
    # params include for the TB
    with open(os.path.join(DATA, "sh3_real_params.vh"), "w") as f:
        f.write("// Ch350 LOCAL generated params for tb_top_psmct32_sh3_real_draw_demo. gitignored.\n")
        f.write(f"localparam int FBW = {FBW};\n")
        f.write(f"localparam int FBPXW = {FBPXW};\n")
        f.write(f"localparam int FBH = {CH};\n")
        f.write(f"localparam int VRAM_BYTES_P = {VRAM_BYTES};\n")
        f.write(f"localparam int CROP_CX0 = {CX0};\n")
        f.write(f"localparam int CROP_CY0 = {CY0};\n")
        f.write(f"localparam int CLUT_CBP = {CBP};\n")
        f.write(f"localparam int NEW_TBP = {NEW_TBP};\n")
        f.write(f"localparam int TEX_VRAM_BASE= {TEX_VRAM_BASE};\n")
        f.write(f"localparam int TEX_BYTES = {TEX_BYTES};\n")
        f.write(f"localparam [29:0] LPDDR_TEX_BASE = 30'h{LPDDR_TEX_BASE:07x};\n")
        f.write(f"localparam int N_BEATS = {TEX_BYTES//32};\n")
        f.write(f"localparam int STG_WORDS = {STG_WORDS};\n")
        f.write(f"localparam int TW = {TW};\n")
        f.write(f"localparam int TH = {TH};\n")
    # eyeball PNG (best-effort, same policy as the recip PNG above)
    try:
        from PIL import Image
        im = Image.new("RGB", (FBPXW, CH))
        im.putdata(refpix)
        im.save(os.path.join(ROOT, "captures", "gs", "silenthill3", "extracted", "recon", "sh3_real_ref.png"))
        print("[Ch350] wrote sh3_real_ref.png")
    except Exception as e:
        print("(PIL skipped:", e, ")")
    print(f"[Ch350] emitted fixtures -> {DATA} (LOCAL). TEX_VRAM_BASE=0x{TEX_VRAM_BASE:x} TBP={NEW_TBP} CBP={CBP} "
          f"LPDDR_TEX_BASE=0x{LPDDR_TEX_BASE:x} N_BEATS={TEX_BYTES//32}")
|
||
|
||
# Script entry point: hand the raw argument vector to main(), which does its own parsing.
if __name__ == "__main__":
    main(sys.argv)
|