retroDE_ps2/sim/data/top_psmct32_raster_demo/bake.py

#!/usr/bin/env python3
# retroDE_ps2 — .mem bake for top_psmct32_raster_demo.
#
# Produces two parallel fixture sets:
#
#   Ch171 (original production card; per the Ch251 section below, bios.mem /
#   payload.mem are now baked from the Ch251 animated demo that replaced it):
#     bios.mem     — 19-word EE bootlet at 0xBFC0_0000
#     payload.mem  — 40 qwords: 16 zero + 24 GIF qwords (4 SPRITEs)
#     Pattern: 320x240 four-quadrant test card (160x120 each,
#              R / G / B / W) painted at PCRTC origin. DISPLAY1
#              configured to (DW=319, DH=239) so PCRTC scans the
#              full 320x240; the board wrapper bumps VRAM_BYTES to
#              512 KiB so 320*240*4 = 307,200 bytes fits.
#
#   Ch146 (legacy — used by the Ch155/Ch158 16x8 sim TBs):
#     bios_ch146.mem    — 19-word EE bootlet at 0xBFC0_0000
#     payload_ch146.mem — 40 qwords: 16 zero + 24 GIF qwords (4 SPRITEs)
#     Pattern: 16x8 four-quadrant test card (8x4 each) with the
#              pre-Ch171 RGB tints (0x55/0xAA/0xCC etc) that
#              tb_top_psmct32_raster_demo_bram and friends still
#              spot-check against. DISPLAY1 = (DW=15, DH=7).
#
# Both bootlet variants drive the same DMAC channel-2 kickoff
# (MADR=0x100, QWC=24, CHCR=START); only DISPLAY1 + GIF payload
# differ.

import os

OUT = os.path.dirname(os.path.abspath(__file__))


# ---------------------------------------------------------------------------
# MIPS opcode helpers.
# ---------------------------------------------------------------------------
def enc_lui(rt, imm):
    """opcode 0x0F: LUI rt, imm. rt is masked to 5 bits, imm to 16 bits."""
    opcode_bits = 0x0F << 26
    rt_bits = (rt & 0x1F) << 16
    imm_bits = imm & 0xFFFF
    return opcode_bits | rt_bits | imm_bits


def enc_ori(rt, rs, imm):
    """opcode 0x0D: ORI rt, rs, imm. Registers masked to 5 bits, imm to 16 bits."""
    word = 0x0D << 26
    word |= (rs & 0x1F) << 21
    word |= (rt & 0x1F) << 16
    word |= imm & 0xFFFF
    return word


def enc_sw(rt, rs, imm):
    """opcode 0x2B: SW rt, imm(rs). Registers masked to 5 bits, imm to 16 bits."""
    fields = (
        0x2B << 26,
        (rs & 0x1F) << 21,
        (rt & 0x1F) << 16,
        imm & 0xFFFF,
    )
    word = 0
    for field in fields:
        word |= field
    return word


def enc_syscall():
    """Return the SYSCALL instruction word (all fields zero except funct = 0x0C)."""
    funct_syscall = 0x0C
    return funct_syscall


# Ch251 — additional opcodes for the animated bootlet's main loop.
def enc_addiu(rt, rs, imm):
    """opcode 0x09: ADDIU rt, rs, imm.

    imm is a signed 16-bit value; negative values are stored in their
    two's-complement 16-bit form by the mask.
    """
    opcode_bits = 0x09 << 26
    src_bits = (rs & 0x1F) << 21
    dst_bits = (rt & 0x1F) << 16
    return opcode_bits | src_bits | dst_bits | (imm & 0xFFFF)


def enc_bne(rs, rt, offset_words):
    """opcode 0x05: BNE rs, rt, offset.

    offset_words is the signed 16-bit word offset measured from PC+4
    (MIPS-standard); negative offsets are masked into 16 bits.
    """
    word = 0x05 << 26
    word |= (rs & 0x1F) << 21
    word |= (rt & 0x1F) << 16
    return word | (offset_words & 0xFFFF)


def enc_j(target_pc):
    """opcode 0x02: J target, with the 26-bit field set to target_pc >> 2.

    On execute the hardware supplies the top 4 address bits from the
    delay-slot PC, so the jump only reaches targets inside the same
    256 MiB region as the calling PC. That holds for our BIOS-resident
    bootlet, which lives wholly in 0xBFC0_0xxx.
    """
    index26 = (target_pc >> 2) & 0x03FFFFFF
    return (0x02 << 26) | index26


def enc_special(rs, rt, rd, sa, funct):
    """Encode an R-type word with a zero opcode field.

    Layout: rs[25:21] rt[20:16] rd[15:11] sa[10:6] funct[5:0]; each
    argument is masked to its field width.
    """
    layout = (
        (rs, 0x1F, 21),
        (rt, 0x1F, 16),
        (rd, 0x1F, 11),
        (sa, 0x1F, 6),
        (funct, 0x3F, 0),
    )
    word = 0
    for value, mask, shift in layout:
        word |= (value & mask) << shift
    return word


def enc_xor(rd, rs, rt):
    """XOR rd, rs, rt — R-type word with funct 0x26 and sa = 0."""
    funct_xor = 0x26
    return enc_special(rs=rs, rt=rt, rd=rd, sa=0, funct=funct_xor)


def enc_andi(rt, rs, imm):
    """opcode 0x0C: ANDI rt, rs, imm (zero-extended 16-bit imm)."""
    opcode_bits = 0x0C << 26
    reg_bits = ((rs & 0x1F) << 21) | ((rt & 0x1F) << 16)
    return opcode_bits | reg_bits | (imm & 0xFFFF)


def enc_lw(rt, rs, imm):
    """opcode 0x23: LW rt, imm(rs); imm is a signed 16-bit byte offset."""
    base_bits = (rs & 0x1F) << 21
    dest_bits = (rt & 0x1F) << 16
    offset_bits = imm & 0xFFFF
    return (0x23 << 26) | base_bits | dest_bits | offset_bits


def enc_nop():
    """Return the canonical MIPS NOP, encoded as SLL $0, $0, 0 (all fields zero)."""
    return enc_special(rs=0, rt=0, rd=0, sa=0, funct=0)


# ---------------------------------------------------------------------------
# GIF / GS field encoders.
# ---------------------------------------------------------------------------
# GS register numbers. Each is passed as the register field of an A+D
# qword (see aplusd) by the payload builders in this file.
R_PRIM    = 0x00
R_RGBAQ   = 0x01
R_XYZ2    = 0x05
R_FRAME_1 = 0x4C

# PRIM primitive-type code for SPRITE (PRIM[2:0] = 6).
PRIM_SPRITE     = 6
# GS FRAME_1 layout: bits[8:0]=FBP, bits[21:16]=FBW (in 64-pixel units),
# bits[29:24]=PSM. For PSMCT32 (PSM=0), FBP=0, FBW depends on the
# framebuffer width we're rendering — see per-fixture overrides below.
def frame_1_psmct32(fbw):
    """Return a FRAME_1 value with FBP=0, PSM=0 (PSMCT32) and the given FBW.

    fbw is in 64-pixel units and is masked to the 6-bit field at bits [21:16].
    """
    fbw_field = fbw & 0x3F
    return fbw_field << 16

# GS DISPFB1 layout: bits[8:0]=FBP, bits[13:9]=FBW (in 64-pixel units),
# bits[19:15]=PSM, bits[20+]=DBX/DBY. Same FBW units as FRAME_1.
def dispfb1_psmct32(fbw):
    """Return a DISPFB1 value with FBP=0, PSM=0 (PSMCT32) and the given FBW.

    fbw is in 64-pixel units and is masked to 5 bits before being placed
    at bit 9.
    """
    fbw_field = fbw & 0x1F
    return fbw_field << 9

# PSMCT16 (PSM=2) variants for the Ch308 PSMCT16-tile/framebuffer demo: same FBW
# (64-pixel) units, PSM field set to 0x02 so the GS flush + PCRTC scanout use the
# 16-bit RGB5A1 format. FRAME_1.PSM=[29:24], DISPFB1.PSM=[19:15].
def frame_1_psmct16(fbw):
    """Return a FRAME_1 value with FBP=0, PSM=0x02 (PSMCT16) and the given FBW.

    fbw is in 64-pixel units, masked to 6 bits at [21:16]; PSM sits at [29:24].
    """
    psm_bits = 0x02 << 24
    fbw_bits = (fbw & 0x3F) << 16
    return fbw_bits | psm_bits

def dispfb1_psmct16(fbw):
    """Return a DISPFB1 value with FBP=0, PSM=0x02 (PSMCT16) and the given FBW.

    fbw is in 64-pixel units, masked to 5 bits at bit 9; PSM starts at bit 15.
    """
    psm_bits = 0x02 << 15
    fbw_bits = (fbw & 0x1F) << 9
    return fbw_bits | psm_bits


def giftag(nloop, eop, flg, nreg, regs):
    """Pack a 128-bit GIFtag qword.

    Low 64 bits: NLOOP[14:0], EOP[15], FLG[59:58], NREG[63:60].
    High 64 bits: the register-descriptor list `regs`.
    Every field is masked to its width.
    """
    mask64 = (1 << 64) - 1
    tag_lo = (
        (nloop & 0x7FFF)
        | ((eop & 0x1) << 15)
        | ((flg & 0x3) << 58)
        | ((nreg & 0xF) << 60)
    )
    tag_hi = regs & mask64
    return (tag_hi << 64) | (tag_lo & mask64)


def aplusd(reg_num, data64):
    """Pack one A+D qword: 8-bit register number at bits [71:64], 64-bit data below it."""
    data_bits = data64 & ((1 << 64) - 1)
    addr_bits = (reg_num & 0xFF) << 64
    return addr_bits | data_bits


def xyz2_data(x, y):
    """Pack integer pixel coordinates into XYZ2 data.

    x and y are masked to 12 bits; x is placed at bits [15:4] and y at
    bits [31:20]. The bits below each field (bits [3:0] and [19:16]) and
    everything above bit 31 stay zero.
    """
    x_bits = (x & 0xFFF) << 4
    y_bits = (y & 0xFFF) << 20
    return x_bits | y_bits


def rgbaq_data(r, g, b, a=0xFF):
    """Pack 8-bit colour channels into the low 32 bits of RGBAQ data.

    Layout: R[7:0], G[15:8], B[23:16], A[31:24]; bits above 31 stay zero.
    a defaults to 0xFF.

    Each channel is masked to 8 bits, as every other field encoder in
    this file does, so an out-of-range value cannot bleed into the
    neighbouring channel (or, for alpha, into the bits above bit 31).
    """
    return ((a & 0xFF) << 24) | ((b & 0xFF) << 16) | ((g & 0xFF) << 8) | (r & 0xFF)


# ---------------------------------------------------------------------------
# Bootlet template. The two variants differ only in the DISPLAY1_hi
# constant: Ch171 wants DW=319 / DH=239 (320x240 scanout window),
# Ch146 wants DW=15 / DH=7 (16x8 window).
# ---------------------------------------------------------------------------
def bootlet_for_display1_hi(value_32, fbw):
    """Return the one-shot EE bootlet as a list of 32-bit instruction words.

    The bootlet programs DISPFB1 (PSMCT32, FBP=0, FBW=fbw), clears
    DISPLAY1_lo, writes value_32 to DISPLAY1_hi, sets PMODE.EN1, then
    starts DMAC channel 2 with MADR=0x100 and QWC=24 and halts with
    SYSCALL.
    """
    # value_32 is loaded with a LUI (upper half) + ORI (lower half) pair.
    display1_upper = (value_32 >> 16) & 0xFFFF
    display1_lower = value_32 & 0xFFFF
    dispfb1_val = dispfb1_psmct32(fbw)  # must fit the ORI immediate
    assert dispfb1_val <= 0xFFFF

    gs_setup = [
        enc_lui(1, 0x1200),             # r1 = 0x1200_0000 (GS-priv base)
        enc_lui(2, 0x0000),             # r2 = 0
        enc_ori(2, 2, dispfb1_val),     # r2 = DISPFB1 value
        enc_sw(2, 1, 0x0070),           # *DISPFB1 = r2
        enc_sw(0, 1, 0x0080),           # *DISPLAY1_lo = 0
        enc_lui(2, display1_upper),     # r2 = upper half << 16
        enc_ori(2, 2, display1_lower),  # r2 = value_32
        enc_sw(2, 1, 0x0084),           # *DISPLAY1_hi = r2
        enc_ori(2, 0, 0x0001),          # r2 = PMODE.EN1
        enc_sw(2, 1, 0x0000),           # *PMODE = r2
    ]
    dma_kick = [
        enc_lui(10, 0x1000),            # r10 = 0x1000_0000
        enc_ori(10, 10, 0xA000),        # r10 = 0x1000_A000 (DMAC ch2 base)
        enc_ori(11, 0, 0x0100),         # r11 = payload MADR (0x100)
        enc_sw(11, 10, 0x0010),         # *MADR = r11
        enc_ori(11, 0, 24),             # r11 = QWC (4 SPRITEs x 6 qwords)
        enc_sw(11, 10, 0x0020),         # *QWC = r11
        enc_ori(11, 0, 0x0001),         # r11 = CHCR.start
        enc_sw(11, 10, 0x0000),         # *CHCR = r11 (kicks DMA)
    ]
    halt = [enc_syscall()]
    return gs_setup + dma_kick + halt


def payload_for_sprites(sprites, fbw):
    """Return the GIF payload qwords for a list of flat PSMCT32 SPRITEs.

    Each sprite is a tuple (r, g, b, x0, y0, x1, y1) and expands to six
    qwords: a PACKED GIFtag (NLOOP=1, NREG=5, all A+D) followed by
    PRIM, FRAME_1, RGBAQ and the two XYZ2 corners. Only the last
    sprite's GIFtag has EOP set. The result therefore holds
    6 * len(sprites) qwords.
    """
    frame_1_val = frame_1_psmct32(fbw)
    packets = []
    for index, sprite in enumerate(sprites):
        r, g, b, x0, y0, x1, y1 = sprite
        is_last = index == len(sprites) - 1
        packets.extend([
            giftag(1, 1 if is_last else 0, 0, 5, 0x0000_0000_000E_EEEE),
            aplusd(R_PRIM, PRIM_SPRITE),
            aplusd(R_FRAME_1, frame_1_val),
            aplusd(R_RGBAQ, rgbaq_data(r, g, b)),
            aplusd(R_XYZ2, xyz2_data(x0, y0)),
            aplusd(R_XYZ2, xyz2_data(x1, y1)),
        ])
    return packets


# ---------------------------------------------------------------------------
# .mem writers.
# ---------------------------------------------------------------------------
# Image sizes the writers below zero-pad every .mem file out to.
BIOS_TOTAL_WORDS = 1024     # matches top_psmct32_raster_demo BIOS_SIZE_BYTES default
RAM_TOTAL_QWORDS = 256      # matches top_psmct32_raster_demo RAM_SIZE_BYTES default


def write_bios_mem(filename, bootlet, banner):
    """Write a BIOS image as a $readmemh text file next to this script.

    filename -- output file name, joined onto OUT.
    bootlet  -- sequence of 32-bit instruction words, one hex line each.
    banner   -- free text for the first `//` comment line.

    The bootlet words are followed by zero words up to BIOS_TOTAL_WORDS.
    A bootlet longer than that is written unpadded.

    The file is always UTF-8: banners contain non-ASCII characters
    (e.g. an em dash), so the platform default encoding would make the
    output bytes locale-dependent. All lines are formatted before the
    file is opened, so a formatting error does not leave a truncated
    image behind.
    """
    pad_words = BIOS_TOTAL_WORDS - len(bootlet)
    lines = [
        f"// {banner}\n",
        "// Loaded via $readmemh into bios_rom_stub.mem (32-bit/word).\n",
    ]
    lines.extend(f"{w:08x}\n" for w in bootlet)
    # range() of a negative count is empty, so an oversized bootlet gets no padding.
    lines.extend(f"{0:08x}\n" for _ in range(pad_words))
    with open(os.path.join(OUT, filename), "w", encoding="utf-8") as f:
        f.writelines(lines)


def write_payload_mem(filename, payload_qwords, banner):
    """Write an EE-RAM image as a $readmemh text file next to this script.

    filename       -- output file name, joined onto OUT.
    payload_qwords -- sequence of 128-bit qwords, one 32-digit hex line each.
    banner         -- free text for the first `//` comment line.

    Layout: 16 zero qwords, then the payload (so it starts at qword 16 =
    byte 0x100), then zero qwords up to RAM_TOTAL_QWORDS. A payload that
    does not fit is written without trailing padding.

    The file is always UTF-8 so the output bytes do not depend on the
    platform default encoding when a banner contains non-ASCII text.
    All lines are formatted before the file is opened, so a formatting
    error does not leave a truncated image behind.
    """
    lead_qwords = 16
    tail_qwords = RAM_TOTAL_QWORDS - lead_qwords - len(payload_qwords)
    zero_line = f"{0:032x}\n"
    lines = [
        f"// {banner}\n",
        "// Loaded via $readmemh into ee_ram_stub.mem (128-bit qword).\n",
        "// qw 0..15: zero; qw 16..N: SPRITE PACKED packets.\n",
    ]
    lines.extend(zero_line for _ in range(lead_qwords))
    lines.extend(f"{qw:032x}\n" for qw in payload_qwords)
    # range() of a negative count is empty, so an oversized payload gets no padding.
    lines.extend(zero_line for _ in range(tail_qwords))
    with open(os.path.join(OUT, filename), "w", encoding="utf-8") as f:
        f.writelines(lines)


# ---------------------------------------------------------------------------
# Ch251 — animated 320x240 demo (production, used by synth).
#
# Replaces the Ch171 four-quadrant card with a richer scene:
#   - 8 vertical SMPTE-style color bars (full height, 40 px wide each)
#   - 4 thin grey border strips (top / bottom / left / right, 4 px)
#   - 4 small orange corner-alignment markers (8x8)
#   - 1 center "heartbeat" SPRITE (16x16) whose RGBAQ is updated by
#     the bootlet's main loop between alternating cyan/red.
#
# DISPLAY1_hi layout: bits[11:0]=DW (width-1), bits[22:12]=DH (height-1).
# DW=319=0x13F, DH=239=0xEF → 0xEF13F.
# FBW=5 because the framebuffer is 320 pixels wide (320 / 64 = 5 pages).
#
# The bootlet does the same initial setup as Ch171 (DISPFB1 / DISPLAY1 /
# PMODE / DMAC ch2 MADR+QWC) and the first DMAC kick, then enters a
# forever loop that:
#   - busy-waits a delay counter
#   - XORs the heartbeat RGBAQ qword in EE-RAM kseg0 between two colors
#   - re-arms DMAC (MADR=0x100, QWC) and re-fires CHCR.start
#
# **Important semantic shift from Ch171:** the bootlet never SYSCALLs.
# `core_halt` stays 0 in steady state — that is the EXPECTED state for
# the animated demo. The new boot-success indicators are FRAME_COUNT
# advancing + visible heartbeat blink + RASTER_OVERFLOW_COUNT=0.
# Runbook LED ledger and tb_de25_nano_psmct32_raster_demo_top were
# updated accordingly.
# ---------------------------------------------------------------------------

# 320-pixel-wide framebuffer / 64 pixels per FBW unit.
CH251_FBW = 5
CH251_DISPLAY1_HI = 0x000EF13F  # DW=319 (320 px), DH=239 (240 px)
# 8 color bars + 4 border strips + 4 corner markers + 1 heartbeat.
CH251_NUM_SPRITES = 17
CH251_QWC = CH251_NUM_SPRITES * 6   # 102 qwords (6 per SPRITE)
# The bootlet loads QWC with a single ORI, so it must fit a 16-bit immediate.
assert CH251_QWC <= 0xFFFF, "QWC must fit in an ORI imm field"

# Heartbeat byte offset in EE-RAM (kseg0 view).
# Payload starts at byte 0x100 (= 16 qwords of zero pre-padding).
# Heartbeat is the LAST SPRITE; each SPRITE = 6 qwords; RGBAQ is the
# 4th qword within (0-indexed offset +3). So:
#   = 0x100 + (NUM_SPRITES-1)*6*16 + 3*16
#   = 256 + 16*96 + 48 = 1840 = 0x730
CH251_HB_OFFSET = 0x100 + (CH251_NUM_SPRITES - 1) * 6 * 16 + 3 * 16
# Pins the derived offset so a change to the sprite count or packet
# layout is noticed here rather than in the baked bootlet.
assert CH251_HB_OFFSET == 0x730

# Heartbeat alternating colors (packed RGBAQ word, low-half of qword).
#   Bit layout (low 32 bits of RGBAQ data): {A[31:24], B[23:16], G[15:8], R[7:0]}.
#   CYAN = A=FF, B=FF, G=FF, R=00 -> 0xFFFFFF00
#   RED  = A=FF, B=00, G=00, R=FF -> 0xFF0000FF
#   XOR  = 0x00FFFFFF (flips R/G/B, keeps alpha)
CH251_HB_COLOR_A = 0xFFFFFF00
CH251_HB_COLOR_B = 0xFF0000FF
# XOR-ing the stored word with this mask swaps it between the two colors.
CH251_HB_XOR     = CH251_HB_COLOR_A ^ CH251_HB_COLOR_B
assert CH251_HB_XOR == 0x00FFFFFF


def build_ch251_sprites():
    """Return the Ch251 scene as a list of (r, g, b, x0, y0, x1, y1) tuples.

    Draw order, back to front: 8 full-height color bars, 4 grey border
    strips, 4 orange corner markers, then the heartbeat square. The
    heartbeat must stay last because the bootlet patches its RGBAQ qword
    at the fixed offset 0x730.
    """
    width, height = 320, 240
    right, bottom = width - 1, height - 1

    # --- Color bars (full height, 40 px wide each) ---
    bar_width = 40
    bar_colors = (
        (0xFF, 0xFF, 0xFF),  # 0 white
        (0xFF, 0xFF, 0x00),  # 1 yellow
        (0x00, 0xFF, 0xFF),  # 2 cyan
        (0x00, 0xFF, 0x00),  # 3 green
        (0xFF, 0x00, 0xFF),  # 4 magenta
        (0xFF, 0x00, 0x00),  # 5 red
        (0x00, 0x00, 0xFF),  # 6 blue
        (0x00, 0x00, 0x00),  # 7 black
    )
    sprites = [
        (r, g, b, col * bar_width, 0, col * bar_width + bar_width - 1, bottom)
        for col, (r, g, b) in enumerate(bar_colors)
    ]

    # --- Thin grey border, drawn over the bars. Grey rather than white so
    # it stays visible against bar 0 (white) and bar 7 (black). ---
    grey = (0x80, 0x80, 0x80)
    border = 4  # border width in pixels
    border_rects = (
        (0, 0, right, border - 1),              # top
        (0, height - border, right, bottom),    # bottom
        (0, 0, border - 1, bottom),             # left
        (width - border, 0, right, bottom),     # right
    )
    for rect in border_rects:
        sprites.append(grey + rect)

    # --- Orange corner-alignment markers, drawn over the border ---
    orange = (0xFF, 0x80, 0x00)
    side = 8  # corner-square side
    corner_rects = (
        (0, 0, side - 1, side - 1),                         # TL
        (width - side, 0, right, side - 1),                 # TR
        (0, height - side, side - 1, bottom),               # BL
        (width - side, height - side, right, bottom),       # BR
    )
    for rect in corner_rects:
        sprites.append(orange + rect)

    # --- Heartbeat (drawn LAST so its RGBAQ qword is at offset 0x730) ---
    sprites.append((0x00, 0xFF, 0xFF, 152, 112, 167, 127))  # cyan
    assert len(sprites) == CH251_NUM_SPRITES
    return sprites


def build_ch251_animated_bootlet():
    """Build the Ch251 animated bootlet — initial setup + first DMAC
    kick + forever loop that updates the heartbeat color and re-fires
    DMAC.

    Returns the list of 32-bit instruction words in execution order.

    Word indices below MUST line up with LOOP_START_INDEX and the
    BNE / J target arithmetic. Re-count if you reorder. The finished
    list is self-checked against those indices before it is returned,
    so a reorder that forgets the re-count fails the bake instead of
    emitting a bootlet that branches to the wrong word."""
    hi16 = (CH251_DISPLAY1_HI >> 16) & 0xFFFF
    lo16 = CH251_DISPLAY1_HI & 0xFFFF
    dispfb1_val = dispfb1_psmct32(CH251_FBW)
    # Loaded with a single ORI, same guard as the one-shot bootlets.
    assert dispfb1_val <= 0xFFFF

    color_a_hi = (CH251_HB_COLOR_A >> 16) & 0xFFFF
    color_a_lo = CH251_HB_COLOR_A & 0xFFFF
    xor_hi     = (CH251_HB_XOR     >> 16) & 0xFFFF
    xor_lo     = CH251_HB_XOR     & 0xFFFF

    # Delay counter — number of (addiu + bne + nop) iterations between
    # heartbeat updates.
    #
    # Ch254 cadence characterization (NOT a retune). The bootlet's
    # per-iteration time has TWO additive components:
    #
    #   total/toggle = delay_loop_time + fixed_overhead
    #
    # The delay loop is N iterations of ADDIU + BNE + delay-slot NOP,
    # each iter costing ~14-18 cycles through the ee_core_stub FSM
    # (S_IFETCH_REQ → S_IFETCH_WAIT → S_EXECUTE per instruction, ×3
    # instructions, plus a small amount of branch-handling slack).
    #
    # The fixed overhead is the DMAC drain of 102 qwords through the
    # GIF, the GS rasterization of all 17 SPRITEs, the poll loop
    # waiting for DMAC.STR=0, the CHCR re-arm sequence, and the
    # delay-slot NOP after the J. Hardware measurement at Ch251.5
    # (DELAY_HI=0x100 → 6 s/toggle) and Ch253 (DELAY_HI=0x2B → ~2
    # s/toggle) lets us back-solve both:
    #
    #     6 = 0x100_0000 * cyc/iter / 50e6 + overhead
    #     2 = 0x002B_0000 * cyc/iter / 50e6 + overhead
    #     => cyc/iter ≈ 14, overhead ≈ 1.2 s
    #
    # The ~1.2 s overhead is THE FLOOR. Even with DELAY_HI=0 the
    # bootlet can't toggle faster than ~0.8 Hz without restructuring
    # the rasterization or the bootlet's serialization (e.g., letting
    # the delay run *during* the drain instead of after it, or
    # shrinking the 17-SPRITE payload). That restructure is
    # deliberately out of scope here — the heartbeat is a LIVENESS
    # CUE, not a precision timer, and the slightly-sub-1-Hz cadence
    # is visible enough to confirm "this thing is animating" without
    # claiming a rate we cannot actually deliver.
    #
    # DELAY_HI = 0x002B is the locked Ch254 value. Empirical
    # cadence: ~2 s per cyan↔red toggle (~0.5 Hz), with natural
    # ±0.5 s jitter from overhead variation. `ps2_status.sh --delta`
    # accepts DMA_DONE Δ ∈ {0,1,2} per 2 s window as healthy; Δ=0
    # is a phase-miss not a failure (rerun once).
    #
    # In sim the EE runs at 100 MHz with a fast-paint raster, but the
    # TB only waits for the FIRST DMAC completion (LED[1] latch) — the
    # loop's delay countdown happens in parallel and doesn't affect
    # TB termination. The DMAC-poll inside the loop (below) makes the
    # re-arm bullet-proof regardless of how delay vs drain race.
    DELAY_HI = 0x002B
    DELAY_LO = 0x0000

    # Bootlet word indices (must match the instruction list order
    # below — re-count if you reorder).
    LOOP_START_INDEX = 24    # LUI r12, DELAY_HI
    DELAY_INDEX      = 26    # ADDIU r12, r12, -1
    BNE_DELAY_INDEX  = 27    # BNE r12, $0, DELAY
    POLL_INDEX       = 29    # LW r12, 0(r10) — read CHCR
    BNE_POLL_INDEX   = 31    # BNE r12, $0, POLL

    # MIPS BNE offset is signed words from PC+4 (delay slot PC).
    # target_words - (BNE_index + 1)
    BNE_DELAY_OFFSET = DELAY_INDEX - (BNE_DELAY_INDEX + 1)   # 26 - 28 = -2
    BNE_POLL_OFFSET  = POLL_INDEX  - (BNE_POLL_INDEX  + 1)   # 29 - 32 = -3

    LOOP_START_PC = 0xBFC0_0000 + LOOP_START_INDEX * 4

    words = [
        # --- Initial setup (matches the existing one-shot bootlet) ---
        enc_lui(1, 0x1200),             # 0:  r1 = 0x1200_0000 (GS-priv base)
        enc_lui(2, 0x0000),             # 1:  r2 = 0
        enc_ori(2, 2, dispfb1_val),     # 2:  r2 = DISPFB1 value (FBP=0, FBW=5)
        enc_sw(2, 1, 0x0070),           # 3:  *DISPFB1 = r2
        enc_sw(0, 1, 0x0080),           # 4:  *DISPLAY1_lo = 0
        enc_lui(2, hi16),               # 5:  r2 = hi
        enc_ori(2, 2, lo16),            # 6:  r2 = DISPLAY1_hi value
        enc_sw(2, 1, 0x0084),           # 7:  *DISPLAY1_hi = r2
        enc_ori(2, 0, 0x0001),          # 8:  r2 = PMODE.EN1
        enc_sw(2, 1, 0x0000),           # 9:  *PMODE = r2

        # --- DMAC ch2 setup + first kick ---
        enc_lui(10, 0x1000),            # 10: r10 = 0x1000_0000
        enc_ori(10, 10, 0xA000),        # 11: r10 = 0x1000_A000 (DMAC ch2)
        enc_ori(11, 0, 0x0100),         # 12: r11 = MADR (0x100)
        enc_sw(11, 10, 0x0010),         # 13: *MADR = r11
        enc_ori(11, 0, CH251_QWC),      # 14: r11 = QWC (102)
        enc_sw(11, 10, 0x0020),         # 15: *QWC = r11
        enc_ori(11, 0, 0x0001),         # 16: r11 = CHCR.start
        enc_sw(11, 10, 0x0000),         # 17: *CHCR = r11  (first kick)

        # --- Heartbeat-loop state setup ---
        enc_lui(3, 0x8000),             # 18: r3 = 0x8000_0000 (kseg0 base)
        enc_ori(3, 3, CH251_HB_OFFSET), # 19: r3 = 0x8000_0730 (heartbeat RGBAQ)
        enc_lui(5, color_a_hi),         # 20: r5 = current color (hi)
        enc_ori(5, 5, color_a_lo),      # 21: r5 = current color = CYAN
        enc_lui(6, xor_hi),             # 22: r6 = XOR mask (hi)
        enc_ori(6, 6, xor_lo),          # 23: r6 = XOR mask = 0x00FF_FFFF

        # --- Main loop: delay + DMAC-drain poll + toggle + re-fire ---
        enc_lui(12, DELAY_HI),          # 24: <LOOP_START> r12 = delay count hi
        enc_ori(12, 12, DELAY_LO),      # 25: r12 = delay count

        enc_addiu(12, 12, -1),          # 26: <DELAY> r12 -= 1
        enc_bne(12, 0, BNE_DELAY_OFFSET), # 27: BNE r12, $0, DELAY
        enc_nop(),                      # 28: NOP (delay slot)

        enc_lw(12, 10, 0x0000),         # 29: <POLL> r12 = *CHCR
        enc_andi(12, 12, 0x0001),       # 30: r12 &= 0x1 (start bit)
        enc_bne(12, 0, BNE_POLL_OFFSET),# 31: BNE r12, $0, POLL  (wait for DMAC done)
        enc_nop(),                      # 32: NOP (delay slot)

        enc_xor(5, 5, 6),               # 33: r5 ^= r6 (flip color)
        enc_sw(5, 3, 0x0000),           # 34: *0x8000_0730 = r5

        enc_ori(11, 0, 0x0100),         # 35: re-arm MADR
        enc_sw(11, 10, 0x0010),         # 36:
        enc_ori(11, 0, CH251_QWC),      # 37: re-arm QWC
        enc_sw(11, 10, 0x0020),         # 38:
        enc_ori(11, 0, 0x0001),         # 39: CHCR.start
        enc_sw(11, 10, 0x0000),         # 40:  fire DMAC

        enc_j(LOOP_START_PC),           # 41: j LOOP_START
        enc_nop(),                      # 42: NOP (delay slot)
    ]

    # Self-check: every hand-counted index must land on the instruction
    # it is documented to name. The branch offsets and the J target are
    # derived from these indices, so a miscount would otherwise produce
    # a bootlet that silently branches to the wrong word.
    expected_at = {
        LOOP_START_INDEX: enc_lui(12, DELAY_HI),
        DELAY_INDEX:      enc_addiu(12, 12, -1),
        BNE_DELAY_INDEX:  enc_bne(12, 0, BNE_DELAY_OFFSET),
        POLL_INDEX:       enc_lw(12, 10, 0x0000),
        BNE_POLL_INDEX:   enc_bne(12, 0, BNE_POLL_OFFSET),
    }
    for index, expected_word in expected_at.items():
        assert words[index] == expected_word, (
            f"bootlet word index {index} is out of sync with the instruction list"
        )
    return words


# Bake the production pair whenever this module runs: bios.mem holds the
# animated bootlet, payload.mem holds the GIF stream it kicks via DMAC.
ch251_bootlet = build_ch251_animated_bootlet()
ch251_sprites = build_ch251_sprites()
ch251_payload = payload_for_sprites(ch251_sprites, CH251_FBW)
# The bootlet programs QWC from CH251_QWC, so the payload length must agree.
assert len(ch251_payload) == CH251_QWC

write_bios_mem(
    "bios.mem", ch251_bootlet,
    f"Ch251 animated BIOS bootlet ({len(ch251_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 320x240; LOOPS FOREVER — core_halt=0 is expected"
)
write_payload_mem(
    "payload.mem", ch251_payload,
    f"Ch251 GIF payload ({CH251_QWC} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS} qwords); {CH251_NUM_SPRITES} SPRITEs (8 color bars + "
    f"4 border strips + 4 corner markers + 1 heartbeat at 0x{CH251_HB_OFFSET:03x})"
)


# ---------------------------------------------------------------------------
# Ch146 — 16x8 four-quadrant test card (legacy, used by sim TBs that
# pre-date Ch171 and check the original pre-Ch171 RGB tints).
# DISPLAY1_hi: DW=15=0xF, DH=7 → 0x700F (matches pre-Ch171 bootlet).
# ---------------------------------------------------------------------------
CH146_FBW = 1                            # 16-pixel-wide fb fits in 1 page (=64 px)
# 0x0000_700F = DH=7 at bits [22:12] | DW=15 at bits [11:0].
ch146_bootlet = bootlet_for_display1_hi(0x0000_700F, CH146_FBW)
# One (r, g, b, x0, y0, x1, y1) tuple per 8x4 quadrant.
ch146_sprites = [
    (0x55, 0xAA, 0xCC, 0,  0,  7,  3),   # Q0
    (0x66, 0xBB, 0xDD, 8,  0,  15, 3),   # Q1
    (0x77, 0x33, 0x99, 0,  4,  7,  7),   # Q2
    (0x88, 0x44, 0x22, 8,  4,  15, 7),   # Q3
]
ch146_payload = payload_for_sprites(ch146_sprites, CH146_FBW)
# bootlet_for_display1_hi hard-codes QWC=24, so the payload must be exactly 24 qwords.
assert len(ch146_payload) == 24

write_bios_mem(
    "bios_ch146.mem", ch146_bootlet,
    f"Ch146 legacy BIOS bootlet ({len(ch146_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8"
)
write_payload_mem(
    "payload_ch146.mem", ch146_payload,
    f"Ch146 legacy GIF payload (24 qwords active, padded to {RAM_TOTAL_QWORDS} qwords); "
    f"4 SPRITEs covering 16x8 with the pre-Ch171 RGB tints"
)


# ---------------------------------------------------------------------------
# Brick 1 — TEXTURED-sprite demo fixture.
#
# Proves the synthesizable textured-SPRITE path end-to-end through the
# real top: a small texture is uploaded to VRAM via a BITBLT/IMAGE GIF
# packet, then a textured SPRITE (PRIM.TME=1, PSMCT32 DECAL) is drawn
# sampling that texture, plus one FLAT control sprite so the scanout
# shows both.
#
# This fixture is consumed by tb_top_psmct32_textured_demo, which builds
# top_psmct32_raster_demo with PSMCT32_SWIZZLE=0 so the LINEAR
# gs_texel_addr fetch and the BITBLT upload share one VRAM layout (the
# swizzle reconciliation the gs_stub TODO flags is out of scope for v1).
#
# Layout (all PSMCT32, linear):
#   Framebuffer : FBP=0, FBW=1 (64 px/row). Active scanout 16x8.
#   Texture     : TBP0=8 -> base 8*256 = 2048 bytes. 8x8 texels, TBW=1.
#                 FB occupies bytes 0..2047 (8 rows*64px*4 = 2048), so
#                 the texture region at >=2048 never overlaps the FB.
#
# GIF payload structure (DMAC-streamed from byte 0x100):
#   U1  GIFtag PACKED NREG=4 : BITBLTBUF / TRXPOS / TRXREG / TRXDIR
#       -> arms gif_image_xfer_stub for a host->local upload of the
#          8x8 texture to DBP=8.
#   U2  GIFtag IMAGE  NLOOP=16 : 16 qwords (4 PSMCT32 texels each) =
#          the 8x8 = 64 texels, row-major.
#   U3  GIFtag PACKED NREG=6 : PRIM(SPRITE+TME) / FRAME_1 / TEX0_1 /
#       RGBAQ(flat fallback) / UV(0,0) / XYZ2(0,0)
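#   U4  GIFtag PACKED NREG=2 : UV(7,7) / XYZ2(7,7)  (closing vertex)
#          -> textured 8x8 sprite at screen (0,0)..(7,7).
#   U5  GIFtag PACKED NREG=5 : PRIM(SPRITE) / FRAME_1 / RGBAQ / XYZ2 / XYZ2
#          (EOP=1) -> flat control sprite at screen (8,0)..(15,7).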
# ---------------------------------------------------------------------------

# GS register numbers used as A+D addresses by the textured-demo payload.
R_ST     = 0x02
R_UV     = 0x03
R_TEX0_1 = 0x06
# Transfer-setup registers written before the IMAGE upload.
R_BITBLTBUF = 0x50
R_TRXPOS    = 0x51
R_TRXREG    = 0x52
R_TRXDIR    = 0x53

TEX_DEMO_FBW   = 1
TEX_DEMO_TBP0  = 8           # texture base = 8*256 = 2048 bytes
TEX_DEMO_TBW   = 1           # 64 texels/row
# Texture size in texels; also sets how many IMAGE qwords are emitted (4 texels each).
TEX_DEMO_TEXW  = 8
TEX_DEMO_TEXH  = 8
# DISPLAY1_hi: DW=15 (16 px), DH=7 (8 px) -> 0x700F (same window as Ch146).
TEX_DEMO_DISPLAY1_HI = 0x0000_700F


def prim_sprite_tme():
    """Return the PRIM value for a textured SPRITE: PRIM[2:0]=6 with bit 4 (TME) set."""
    sprite_code = 6
    tme_bit = 1 << 4
    return sprite_code | tme_bit


def tex0_pack(tbp0, tbw, psm=0, tw=3, th=3, tfx=1):
    """Pack the texture-side fields of TEX0 (per gs_stub decode).

    Layout: TBP0[13:0], TBW[19:14], PSM[25:20], TW[29:26], TH[33:30],
    TFX[36:35]. Each argument is masked to its field width.

    Ch333: TFX defaults to 1 (DECAL, texel replaces color) so the
    combined path matches the pre-Ch333 behavior for all existing
    scenes. Color scenes pass tfx=0 (MODULATE: texel*RGBAQ).
    """
    layout = (
        (tbp0, 0x3FFF, 0),
        (tbw, 0x3F, 14),
        (psm, 0x3F, 20),
        (tw, 0xF, 26),
        (th, 0xF, 30),
        (tfx, 0x3, 35),
    )
    packed = 0
    for value, mask, shift in layout:
        packed |= (value & mask) << shift
    return packed


def uv_data(ui, vi):
    """Pack integer texel coordinates into UV data as 10.4 fixed point.

    Each coordinate is shifted left by 4 and masked to 14 bits; U lands
    at bits [13:0] and V at bits [27:14].

    NOTE(review): the V position at [27:14] follows this file's own
    layout comment — confirm it against the gs_stub UV decode.
    """
    u_fixed = (ui << 4) & 0x3FFF
    v_fixed = (vi << 4) & 0x3FFF
    return u_fixed | (v_fixed << 14)


def bitbltbuf_pack(dbp, dbw, dpsm):
    """Pack the destination half of BITBLTBUF; the low 32 bits stay zero.

    Layout: DBP[45:32], DBW[53:48], DPSM[61:56], each masked to its width.
    """
    dbp_bits = (dbp & 0x3FFF) << 32
    dbw_bits = (dbw & 0x3F) << 48
    dpsm_bits = (dpsm & 0x3F) << 56
    return dbp_bits | dbw_bits | dpsm_bits


def trxpos_pack(dsax, dsay):
    """Pack the destination start position of TRXPOS; the low 32 bits stay zero.

    Layout: DSAX[42:32], DSAY[58:48], each masked to 11 bits.
    """
    x_bits = (dsax & 0x7FF) << 32
    y_bits = (dsay & 0x7FF) << 48
    return x_bits | y_bits


def trxreg_pack(rrw, rrh):
    """Pack the transfer rectangle size for TRXREG.

    Layout: RRW[11:0], RRH[43:32], each masked to 12 bits.
    """
    width_bits = rrw & 0xFFF
    height_bits = (rrh & 0xFFF) << 32
    return width_bits | height_bits


def trxdir_pack(xdir):
    """Return the TRXDIR value: the transfer direction masked to 2 bits."""
    direction = xdir & 0x3
    return direction


def tex_demo_texel(x, y):
    """Return the ABGR texel of the demo texture at (x, y).

    The texture is a bold 2x2-quadrant pattern split at x=4 / y=4, so
    the sampled result is unmistakable on screen (a flat fill could not
    produce four distinct cells). Alpha is always 0xFF.
    ABGR word = A<<24 | B<<16 | G<<8 | R.
    """
    # Keyed by (is_top, is_left).
    quadrant_rgb = {
        (True, True): (0xFF, 0x00, 0x00),    # top-left     RED
        (True, False): (0x00, 0xFF, 0x00),   # top-right    GREEN
        (False, True): (0x00, 0x00, 0xFF),   # bottom-left  BLUE
        (False, False): (0xFF, 0xFF, 0x00),  # bottom-right YELLOW
    }
    red, green, blue = quadrant_rgb[(y < 4, x < 4)]
    return 0xFF000000 | (blue << 16) | (green << 8) | red


def build_textured_demo_payload():
    """Return the GIF payload qwords for the textured-sprite demo.

    The stream is five packets in order: transfer setup (U1), the IMAGE
    upload of the texture (U2), the opening and closing vertices of the
    textured sprite (U3, U4), and a flat control sprite carrying EOP (U5).
    """
    # --- U1: BITBLTBUF / TRXPOS / TRXREG / TRXDIR (PACKED A+D, NREG=4) ---
    upload_setup = [
        giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE),   # 4x A+D descriptors
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TEX_DEMO_TBP0, TEX_DEMO_TBW, 0)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TEX_DEMO_TEXW, TEX_DEMO_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),            # 0 = host->local
    ]

    # --- U2: IMAGE qwords (FLG=2). 64 texels / 4 per qword = 16 qwords. ---
    # Texels are taken row-major; lane 0 occupies the low 32 bits of a qword.
    n_image = (TEX_DEMO_TEXW * TEX_DEMO_TEXH) // 4
    image_xfer = [giftag(n_image, 0, 2, 0, 0)]      # IMAGE, NLOOP=16
    for qword_index in range(n_image):
        packed = 0
        for lane in range(4):
            ty, tx = divmod(qword_index * 4 + lane, TEX_DEMO_TEXW)
            packed |= (tex_demo_texel(tx, ty) & 0xFFFFFFFF) << (32 * lane)
        image_xfer.append(packed)

    # --- U3: PRIM / FRAME_1 / TEX0_1 / RGBAQ / UV0 / XYZ2_0 (PACKED, NREG=6) ---
    frame_1_val = frame_1_psmct32(TEX_DEMO_FBW)
    tex0_val = tex0_pack(TEX_DEMO_TBP0, TEX_DEMO_TBW, 0, 3, 3)  # TW=TH=3 -> 8x8
    textured_open = [
        giftag(1, 0, 0, 6, 0x0000_0000_00EE_EEEE),
        aplusd(R_PRIM, prim_sprite_tme()),
        aplusd(R_FRAME_1, frame_1_val),
        aplusd(R_TEX0_1, tex0_val),
        aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),  # flat fallback (overridden)
        aplusd(R_UV, uv_data(0, 0)),
        aplusd(R_XYZ2, xyz2_data(0, 0)),
    ]

    # --- U4: UV1 / XYZ2_1 closing the textured sprite (PACKED, NREG=2) ---
    last_u = TEX_DEMO_TEXW - 1
    last_v = TEX_DEMO_TEXH - 1
    textured_close = [
        giftag(1, 0, 0, 2, 0x0000_0000_0000_00EE),   # 2x A+D descriptors
        aplusd(R_UV, uv_data(last_u, last_v)),
        aplusd(R_XYZ2, xyz2_data(last_u, last_v)),
    ]

    # --- U5: a FLAT control sprite at (8,0)..(15,7) so the scanout shows
    #         both textured and flat content side by side. EOP here. ---
    flat_control = [
        giftag(1, 1, 0, 5, 0x0000_0000_000E_EEEE),   # EOP
        aplusd(R_PRIM, PRIM_SPRITE),                  # SPRITE, no TME
        aplusd(R_FRAME_1, frame_1_val),
        aplusd(R_RGBAQ, rgbaq_data(0x20, 0xC0, 0x40)),  # distinct flat green
        aplusd(R_XYZ2, xyz2_data(8, 0)),
        aplusd(R_XYZ2, xyz2_data(15, 7)),
    ]

    return upload_setup + image_xfer + textured_open + textured_close + flat_control


def build_textured_demo_bootlet_disp(qwc, display1_hi, fbw):
    """Return the one-shot bootlet for a caller-chosen scanout window.

    Same instruction shape as build_textured_demo_bootlet — NO new
    EE/BIOS scaffolding — but DISPLAY1_hi and the DISPFB1 FBW come from
    the caller; only those two immediates (and QWC) vary.

    qwc         -- DMAC ch2 quadword count; must fit a 16-bit immediate.
    display1_hi -- 32-bit value written to DISPLAY1_hi.
    fbw         -- framebuffer width in 64-pixel units for DISPFB1.
    """
    hi16 = (display1_hi >> 16) & 0xFFFF
    lo16 = display1_hi & 0xFFFF
    dispfb1_val = dispfb1_psmct32(fbw)
    assert dispfb1_val <= 0xFFFF
    assert qwc <= 0xFFFF

    gs_setup = [
        enc_lui(1, 0x1200),          # r1 = 0x1200_0000
        enc_lui(2, 0x0000),
        enc_ori(2, 2, dispfb1_val),  # r2 = DISPFB1 value
        enc_sw(2, 1, 0x0070),
        enc_sw(0, 1, 0x0080),        # zero stored at offset 0x80
        enc_lui(2, hi16),
        enc_ori(2, 2, lo16),         # r2 = display1_hi
        enc_sw(2, 1, 0x0084),
        enc_ori(2, 0, 0x0001),
        enc_sw(2, 1, 0x0000),
    ]
    dma_kick = [
        enc_lui(10, 0x1000),
        enc_ori(10, 10, 0xA000),     # r10 = 0x1000_A000
        enc_ori(11, 0, 0x0100),
        enc_sw(11, 10, 0x0010),      # offset 0x10 <- 0x100
        enc_ori(11, 0, qwc),
        enc_sw(11, 10, 0x0020),      # offset 0x20 <- qwc
        enc_ori(11, 0, 0x0001),
        enc_sw(11, 10, 0x0000),      # offset 0x00 <- 1
    ]
    return gs_setup + dma_kick + [enc_syscall()]


def build_textured_demo_bootlet(qwc):
    """Bootlet: configure DISPFB1/DISPLAY1/PMODE for a 16x8 PSMCT32
    framebuffer, then kick DMAC ch2 to stream the GIF payload, then
    SYSCALL-halt (one-shot, like the Ch146 fixture).

    qwc is the DMAC ch2 quadword count and must fit a 16-bit immediate.
    Returns the list of 32-bit instruction words.

    The instruction sequence is exactly the one emitted by
    build_textured_demo_bootlet_disp, so this delegates to it with the
    textured demo's fixed window (DISPLAY1_hi = TEX_DEMO_DISPLAY1_HI,
    i.e. DW=15 / DH=7, and FBW = TEX_DEMO_FBW) instead of keeping a
    second hand-maintained copy of the same 19 words."""
    return build_textured_demo_bootlet_disp(
        qwc, TEX_DEMO_DISPLAY1_HI, TEX_DEMO_FBW
    )


# ---------------------------------------------------------------------------
# Brick 2a — ALPHA-BLEND (transparency) demo fixture.
#
# Proves the FLAT alpha-blended SPRITE path end-to-end: an OPAQUE
# background sprite is painted first, then a SEMI-TRANSPARENT flat
# sprite (PRIM.ABE=1, source-over ALPHA config, RGBAQ.A=0x40) overlaps
# it. The overlap region must show the per-pixel blend of source over
# dest — NEITHER the pure source nor the pure dest — while the
# non-overlap region stays the pure background.
#
# Layout (PSMCT32, linear, 16x8 framebuffer; same window as the
# textured demo so the existing scanout plumbing is reused):
#   BG sprite     : solid blue (R=0x00, G=0x00, B=0xC0), ABE=0,
#                   covers the whole 16x8 area (0,0)..(15,7).
#   OVERLAY sprite: red (R=0xFF, G=0x00, B=0x00), A=0x40, ABE=1,
#                   source-over, covers x in [0..7], full height.
#
# Source-over blend Cv = (((Cs - Cd) * As) >> 7) + Cd, As=0x40 (=64):
#   R: (255-0)*64>>7 + 0   = 127  (0x7F)
#   G: (0-0)*64>>7 + 0     = 0    (0x00)
#   B: (0-192)*64>>7 + 192 = 96   (0x60)
# Overlap region (x in [0..7]) -> (0x7F, 0x00, 0x60).
# Non-overlap   (x in [8..15]) -> pure background blue (0x00,0x00,0xC0).
# ---------------------------------------------------------------------------

# GS register number used as the A+D address for the blend configuration.
R_ALPHA_1 = 0x42

# Same FBW and 16x8 scanout window values as the textured demo (TEX_DEMO_*).
ALPHA_DEMO_FBW = 1
ALPHA_DEMO_DISPLAY1_HI = 0x0000_700F   # DW=15 (16 px), DH=7 (8 px)

# Background (opaque) and overlay (semi-transparent) colors.
ALPHA_BG_R, ALPHA_BG_G, ALPHA_BG_B = 0x00, 0x00, 0xC0   # blue
ALPHA_OV_R, ALPHA_OV_G, ALPHA_OV_B = 0xFF, 0x00, 0x00   # red
ALPHA_OV_A = 0x40                                        # ~0.5 (0x80 == 1.0)


def prim_sprite_abe():
    """Return the PRIM value for an alpha-blended SPRITE.

    PRIM[2:0] = 6 selects SPRITE; bit 6 is ABE (alpha-blend enable).
    """
    sprite_type = 6
    abe_bit = 1 << 6
    return sprite_type | abe_bit


def alpha_pack(a, b, c, d, fix=0):
    """Pack an ALPHA_1 register value.

    Field layout: A[1:0], B[3:2], C[5:4], D[7:6], FIX[39:32]. Each
    selector is masked to 2 bits and ``fix`` to 8 bits before being
    placed, so out-of-range inputs are truncated rather than rejected.
    """
    selectors = (
        (a & 0x3)
        | ((b & 0x3) << 2)
        | ((c & 0x3) << 4)
        | ((d & 0x3) << 6)
    )
    return selectors | ((fix & 0xFF) << 32)


def build_alpha_blend_demo_payload():
    """Build the Brick 2a GIF payload as a list of qwords.

    The list holds two A+D register groups: an opaque background SPRITE
    spanning (0,0)..(15,7), then an alpha-blended SPRITE (PRIM.ABE=1)
    spanning (0,0)..(7,7) whose tag carries EOP=1.
    """
    fb_frame = frame_1_psmct32(ALPHA_DEMO_FBW)

    # Group 1 — opaque background: 5 A+D registers, blending disabled.
    background = [
        giftag(1, 0, 0, 5, 0x0000_0000_000E_EEEE),
        aplusd(R_PRIM,    PRIM_SPRITE),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_RGBAQ,   rgbaq_data(ALPHA_BG_R, ALPHA_BG_G, ALPHA_BG_B)),
        aplusd(R_XYZ2,    xyz2_data(0, 0)),
        aplusd(R_XYZ2,    xyz2_data(15, 7)),
    ]

    # Group 2 — semi-transparent overlay: 6 A+D registers, EOP set.
    # ALPHA_1 selects source-over (A=Cs, B=Cd, C=As, D=Cd); the source
    # alpha rides in RGBAQ.A.
    overlay = [
        giftag(1, 1, 0, 6, 0x0000_0000_00EE_EEEE),
        aplusd(R_PRIM,    prim_sprite_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),
        aplusd(R_RGBAQ,   rgbaq_data(ALPHA_OV_R, ALPHA_OV_G, ALPHA_OV_B, ALPHA_OV_A)),
        aplusd(R_XYZ2,    xyz2_data(0, 0)),
        aplusd(R_XYZ2,    xyz2_data(7, 7)),
    ]

    return background + overlay


# Bake the Brick 2a fixture: build the GIF payload, derive the DMA QWC from
# its length, and emit the matching bootlet and payload .mem files.
alpha_demo_payload = build_alpha_blend_demo_payload()
alpha_demo_qwc     = len(alpha_demo_payload)
# NOTE(review): ALPHA_DEMO_DISPLAY1_HI is not passed here; presumably
# build_textured_demo_bootlet() already programs the same 16x8 window — confirm.
alpha_demo_bootlet = build_textured_demo_bootlet(alpha_demo_qwc)

write_bios_mem(
    "bios_alpha.mem", alpha_demo_bootlet,
    f"Brick2a alpha-blend BIOS bootlet ({len(alpha_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={alpha_demo_qwc}"
)
write_payload_mem(
    "payload_alpha.mem", alpha_demo_payload,
    f"Brick2a alpha-blend GIF payload ({alpha_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); opaque BG sprite + semi-transparent overlay"
)


# ---------------------------------------------------------------------------
# Ch344 — TEXTURED + source-over ALPHA SPRITE demo fixture.
#   Upload an 8x8 checkerboard-ALPHA texture, draw an opaque BG sprite (64x64 blue),
#   then a textured-alpha SPRITE (PRIM SPRITE+TME+ABE) blended over it: opaque-white
#   checks show gray, transparent checks reveal the blue BG. Exercises gs_stub's
#   SPRITE_TEX_ALPHA path end-to-end through the bram-top.
# FBW argument handed to frame_1_psmct32() and to the bootlet builder.
TEXALPHA_FBW = 1
TEXALPHA_DISPLAY1_HI = (63 << 12) | 63    # 64x64
TEXALPHA_TBP = 64                          # texture base word = 64*64 = 4096 (right after the 64x64 FB)
# Texture dimensions in texels; also used as the far-corner UV of the sprite.
TEXALPHA_TEXW = 8
TEXALPHA_TEXH = 8
TEXALPHA_X0, TEXALPHA_Y0 = 16, 16          # sprite screen rect (32x32 -> 4 screen px / texel)
TEXALPHA_X1, TEXALPHA_Y1 = 48, 48
TEXALPHA_BG_R, TEXALPHA_BG_G, TEXALPHA_BG_B = 0x00, 0x00, 0xC0   # blue background


def texalpha_texel(u, v):
    """Return the 32-bit texel (A<<24 | 0xC0C0C0) at texture coordinate (u, v).

    The pattern is a checkerboard of 2x2-texel cells: cells of even
    parity are opaque light gray (A=0x80, RGB=0xC0), cells of odd parity
    are fully transparent (0), so the blend keeps the destination there.
    """
    opaque_gray = (0x80 << 24) | (0xC0 << 16) | (0xC0 << 8) | 0xC0
    cell_parity = ((u >> 1) ^ (v >> 1)) & 1
    return 0x00000000 if cell_parity else opaque_gray


def prim_sprite_tme_abe():
    """Return the PRIM value for a textured, alpha-blended SPRITE.

    PRIM[2:0] = 6 (SPRITE), bit 4 = TME, bit 6 = ABE.
    """
    sprite_type = 6
    tme_bit = 1 << 4
    abe_bit = 1 << 6
    return sprite_type | tme_bit | abe_bit


def build_texalpha_demo_payload():
    """Build the Ch344 GIF payload as a list of qwords.

    Contents, in order: the transfer setup plus IMAGE data that uploads
    the alpha checkerboard texture at TEXALPHA_TBP, an opaque background
    SPRITE over (0,0)..(63,63), and a textured alpha-blended SPRITE
    (EOP=1) drawn over the background.
    """
    fb_frame = frame_1_psmct32(TEXALPHA_FBW)

    # U1: transfer registers for the texture upload (4 A+D registers).
    packets = [
        giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TEXALPHA_TBP, 1, 0)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(TEXALPHA_TEXW, TEXALPHA_TEXH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
    ]

    # IMAGE data: four 32-bit texels per qword, lane 0 in the low bits,
    # texels taken in row-major order.
    image_qwords = (TEXALPHA_TEXW * TEXALPHA_TEXH) // 4
    packets.append(giftag(image_qwords, 0, 2, 0, 0))
    for qword_index in range(image_qwords):
        packed = 0
        for lane in range(4):
            row, col = divmod(qword_index * 4 + lane, TEXALPHA_TEXW)
            packed |= (texalpha_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packets.append(packed)

    # U2: opaque background sprite filling 64x64 (5 A+D registers, ABE=0).
    packets.extend([
        giftag(1, 0, 0, 5, 0x0000_0000_000E_EEEE),
        aplusd(R_PRIM,    PRIM_SPRITE),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_RGBAQ,   rgbaq_data(TEXALPHA_BG_R, TEXALPHA_BG_G, TEXALPHA_BG_B)),
        aplusd(R_XYZ2,    xyz2_data(0, 0)),
        aplusd(R_XYZ2,    xyz2_data(63, 63)),
    ])

    # U3: textured-alpha sprite over the background (9 A+D registers, EOP).
    # The 0x80 tint makes MODULATE an identity; As comes from the texel.
    packets.extend([
        giftag(1, 1, 0, 9, 0x0000_000E_EEEE_EEEE),
        aplusd(R_PRIM,    prim_sprite_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),                       # source-over
        aplusd(R_TEX0_1,  tex0_pack(TEXALPHA_TBP, 1, psm=0, tw=3, th=3, tfx=0)),
        aplusd(R_RGBAQ,   rgbaq_data(0x80, 0x80, 0x80, 0x80)),
        aplusd(R_UV,      uv_data(0, 0)),
        aplusd(R_XYZ2,    xyz2_data(TEXALPHA_X0, TEXALPHA_Y0)),
        aplusd(R_UV,      uv_data(TEXALPHA_TEXW, TEXALPHA_TEXH)),
        aplusd(R_XYZ2,    xyz2_data(TEXALPHA_X1, TEXALPHA_Y1)),
    ])
    return packets


# Bake the Ch344 fixture. Unlike the 16x8 demos, this one uses the *_disp
# bootlet builder so the 64x64 DISPLAY1 value and FBW are passed explicitly.
texalpha_demo_payload = build_texalpha_demo_payload()
texalpha_demo_qwc     = len(texalpha_demo_payload)
write_bios_mem(
    "bios_texalpha.mem", build_textured_demo_bootlet_disp(texalpha_demo_qwc, TEXALPHA_DISPLAY1_HI, TEXALPHA_FBW),
    f"Ch344 textured-alpha sprite BIOS bootlet; DISPLAY1=64x64; QWC={texalpha_demo_qwc}")
write_payload_mem(
    "payload_texalpha.mem", texalpha_demo_payload,
    f"Ch344 textured-alpha GIF payload ({texalpha_demo_qwc} qwords): 8x8 alpha texture upload + "
    f"opaque BG sprite + textured-alpha overlay sprite")


# ---------------------------------------------------------------------------
# Brick 2b — Z-BUFFER (depth test) demo fixture.
#
# Proves the FLAT Z-tested PSMCT32 SPRITE path end-to-end: two
# overlapping sprites at DIFFERENT depths are drawn NEAR-first then
# FAR-second; with GEQUAL the FAR sprite must NOT overwrite the NEAR
# sprite in the overlap region (near wins regardless of draw order).
#
# Layout (PSMCT32 fb at FBP=0, FBW=1 → 64 px/row; 16x8 scanout window).
# Z buffer at ZBP=2 → byte base 2*2048 = 0x1000, OUTSIDE the 16x8 fb
# region (fb occupies 0..0x7FF: 8 rows * 256 B/row). ZBP is kept EVEN because
# this fixture's ZBUF_1 packing puts ZMSK at bit 0 (== ZBP[0]). VRAM powers
# on to zero, so the Z buffer starts cleared — the first (NEAR) sprite
# with Z >= 0 always passes and stamps its Z.
#
#   NEAR sprite : RED   (R=0xFF,G=0,B=0), Z=0x200, x[0..11] y[0..7].
#   FAR  sprite : BLUE  (R=0,G=0,B=0xFF), Z=0x100, x[4..15] y[0..7],
#                 drawn AFTER near.
# With GEQUAL:
#   overlap x[4..11]  : FAR Z(0x100) < stored NEAR Z(0x200) -> FAILS ->
#                       stays RED (near wins; proves depth gated, NOT
#                       last-write-wins).
#   near-only x[0..3] : RED.
#   far-only  x[12..15]: stored Z=0 there, FAR Z(0x100)>=0 -> passes -> BLUE.
#
# TEST_1: ZTE=1 (bit16), ZTST=GEQUAL=2 (bits[18:17]) -> 0x0005_0000.
# ZBUF_1: ZBP=2 (bits[8:0]), PSM=PSMZ32=0, ZMSK=0 -> 0x0000_0002.
# ---------------------------------------------------------------------------

# A+D register addresses. R_TEST_1 and R_ZBUF_1 are written by the Z-buffer
# and triangle fixtures below; the other three are not referenced in this
# part of the file (presumably consumed by later fixtures).
R_TEST_1 = 0x47
R_ZBUF_1 = 0x4E
R_SCISSOR_1 = 0x40
# NOTE(review): the published GS register map lists CLAMP_1 at 0x08 and
# TEST_2 at 0x48 — confirm 0x48 is really what the RTL decodes as CLAMP_1.
R_CLAMP_1   = 0x48
R_TEX1_1    = 0x14


def tex1_pack(mmag):
    """Pack a TEX1_1 value carrying only MMAG.

    MMAG lives at bit 5 (0 = NEAREST, 1 = LINEAR magnification); every
    other field is left at 0. Only the low bit of ``mmag`` is used.
    """
    mmag_bit = mmag & 0x1
    return mmag_bit << 5


def clamp_pack(wms, wmt):
    """Pack a CLAMP_1 value from the two wrap modes.

    WMS occupies bits [1:0] and WMT bits [3:2] (0 = REPEAT, 1 = CLAMP).
    The region MIN/MAX fields are left at 0.
    """
    s_mode = wms & 0x3
    t_mode = wmt & 0x3
    return (t_mode << 2) | s_mode


def scissor_pack(x0, x1, y0, y1):
    """Pack a SCISSOR_1 value from an inclusive rectangle.

    Layout: SCAX0[10:0], SCAX1[26:16], SCAY0[42:32], SCAY1[58:48].
    Each coordinate is masked to 11 bits before placement.
    """
    packed = 0
    for coord, shift in ((x0, 0), (x1, 16), (y0, 32), (y1, 48)):
        packed |= (coord & 0x7FF) << shift
    return packed

# FBW argument handed to frame_1_psmct32() for the Z-buffer demo.
ZBUF_DEMO_FBW          = 1
ZBUF_DEMO_DISPLAY1_HI  = 0x0000_700F   # DW=15 (16 px), DH=7 (8 px)
ZBUF_NEAR_R, ZBUF_NEAR_G, ZBUF_NEAR_B = 0xFF, 0x00, 0x00   # red
ZBUF_FAR_R,  ZBUF_FAR_G,  ZBUF_FAR_B  = 0x00, 0x00, 0xFF   # blue
# Vertex depths: NEAR is the larger value, so under GEQUAL the FAR sprite
# fails the test wherever NEAR has already stamped its Z.
ZBUF_NEAR_Z = 0x0000_0200
ZBUF_FAR_Z  = 0x0000_0100
# ZBP must be EVEN: the (task-specified) ZMSK lives at bit 0, which
# overlaps ZBP[0]. ZBP=2 -> Z buffer byte base 2*2048=0x1000 (outside
# the 16x8 fb region 0..0x7FF) AND keeps ZMSK=0 (Z updates enabled).
ZBUF_ZBP    = 2


def test1_geq():
    """Return the TEST_1 value enabling a GEQUAL depth test.

    ZTE is bit 16; ZTST occupies bits [18:17] and GEQUAL is encoded as 2.
    """
    zte = 1 << 16
    ztst_gequal = 2 << 17
    return zte | ztst_gequal


def zbuf1_pack(zbp, zmsk=0, psm=0):
    """Pack a ZBUF_1 value as this fixture lays it out.

    ZBP takes bits [8:0], PSM bits [27:24], and ZMSK is OR-ed into
    bit 0 — i.e. it shares a bit with ZBP[0], so callers that want
    ZMSK=0 must pass an even ``zbp``.
    """
    base_field = zbp & 0x1FF
    format_field = (psm & 0xF) << 24
    mask_field = zmsk & 0x1
    return base_field | format_field | mask_field


def xyz2_dataz(x, y, z):
    """Pack an XYZ2 value with an explicit depth.

    ``x`` and ``y`` are masked to 12 bits and placed at bits 4 and 20
    respectively (leaving the 4 bits below each at zero); ``z`` is
    masked to 32 bits and placed at bits [63:32].
    """
    x_field = (x & 0xFFF) << 4
    y_field = (y & 0xFFF) << 20
    z_field = (z & 0xFFFFFFFF) << 32
    return x_field | y_field | z_field


def build_zbuffer_demo_payload():
    """GIF payload: NEAR red sprite (Z=0x200), then FAR blue sprite
    (Z=0x100) that overlaps it. GEQUAL Z-test active for both.

    Returns:
        A list of qwords: U1 (tag + 7 A+D registers) followed by U2
        (tag with EOP=1 + 3 A+D registers).
    """
    frame_1_val = frame_1_psmct32(ZBUF_DEMO_FBW)
    qw = []

    # --- U1: NEAR red sprite, Z=0x200, x[0..11] y[0..7]. ZTE=1 GEQUAL. ---
    qw.append(giftag(1, 0, 0, 7, 0x0000_0000_0EEE_EEEE))   # 7 A+D descriptors (7 nibbles of 0xE)
    qw.append(aplusd(R_PRIM,    PRIM_SPRITE))
    qw.append(aplusd(R_FRAME_1, frame_1_val))
    qw.append(aplusd(R_TEST_1,  test1_geq()))
    qw.append(aplusd(R_ZBUF_1,  zbuf1_pack(ZBUF_ZBP)))
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(ZBUF_NEAR_R, ZBUF_NEAR_G, ZBUF_NEAR_B)))
    qw.append(aplusd(R_XYZ2,    xyz2_dataz(0,  0, ZBUF_NEAR_Z)))
    qw.append(aplusd(R_XYZ2,    xyz2_dataz(11, 7, ZBUF_NEAR_Z)))

    # --- U2: FAR blue sprite, Z=0x100, x[4..15] y[0..7]. EOP. ---
    # NREG and the REGS nibbles must match the 3 A+D qwords that follow.
    # The tag previously declared NREG=4 (with five 0xE nibbles), which
    # leaves the final packet one register short of what the tag promises,
    # since the DMA QWC is len(payload) and supplies no extra qword.
    qw.append(giftag(1, 1, 0, 3, 0x0000_0000_0000_0EEE))   # 3 A+D descriptors
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(ZBUF_FAR_R, ZBUF_FAR_G, ZBUF_FAR_B)))
    qw.append(aplusd(R_XYZ2,    xyz2_dataz(4,  0, ZBUF_FAR_Z)))
    qw.append(aplusd(R_XYZ2,    xyz2_dataz(15, 7, ZBUF_FAR_Z)))
    # (PRIM/FRAME_1/TEST_1/ZBUF_1 persist across the GIF context from U1.)

    return qw


# Bake the Brick 2b fixture: payload first, then a bootlet whose DMA QWC is
# the payload length, then the two .mem files.
zbuf_demo_payload = build_zbuffer_demo_payload()
zbuf_demo_qwc     = len(zbuf_demo_payload)
# NOTE(review): ZBUF_DEMO_DISPLAY1_HI is not passed here; presumably the
# shared bootlet builder already programs a 16x8 window — confirm.
zbuf_demo_bootlet = build_textured_demo_bootlet(zbuf_demo_qwc)

write_bios_mem(
    "bios_zbuffer.mem", zbuf_demo_bootlet,
    f"Brick2b Z-buffer BIOS bootlet ({len(zbuf_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={zbuf_demo_qwc}"
)
write_payload_mem(
    "payload_zbuffer.mem", zbuf_demo_payload,
    f"Brick2b Z-buffer GIF payload ({zbuf_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); NEAR red (Z=0x200) + FAR blue (Z=0x100), GEQUAL depth test"
)


# ---------------------------------------------------------------------------
# Brick 3 — NON-AXIS-ALIGNED GOURAUD TRIANGLE demo fixture (interpolated
# color + interpolated depth), the first triangle on the BRAM board path.
#
# Two non-axis-aligned triangles are drawn into a 16x8 PSMCT32 framebuffer
# with GEQUAL depth test active and per-pixel INTERPOLATED Z:
#
#   TRI A : a large Gouraud triangle, v0=(1,1) RED, v1=(14,1) GREEN,
#           v2=(7,7) BLUE. Z is FLAT-HIGH everywhere (Z=0x300 at all 3
#           vertices) so it is the NEAR surface and stamps Z=0x300.
#   TRI B : a smaller triangle, v0=(2,5) v1=(13,5) v2=(7,2), painted a
#           solid-ish dim WHITE (all 3 vertices grey 0x80) at FLAT-LOW
#           Z=0x100. Drawn SECOND. With GEQUAL its Z(0x100) is BEHIND
#           TRI A's stamped Z(0x300) in the overlap, so it FAILS there
#           (TRI A wins — proves the interpolated-Z compare gates the
#           write), but PASSES where TRI A did not cover (stored Z=0).
#
# This demonstrates: (1) a real non-axis-aligned triangle renders;
# (2) Gouraud color interpolation (TRI A's RGB gradient); (3) per-pixel
# interpolated depth feeding the Z-test (TRI B occluded by the nearer
# TRI A in the overlap). The focused unit TB (tb_gs_tri_interp) pins the
# exact affine color + Z values; this top-level fixture proves the
# end-to-end board path.
#
# NOTE the two triangles here use FLAT (equal-at-all-3-vertices) Z for a
# tractable top-level assertion; the per-pixel Z INTERPOLATOR is exercised
# with non-equal vertex Z in the focused unit TB. The color interpolator
# is exercised non-trivially here (TRI A has 3 distinct vertex colors).
# ---------------------------------------------------------------------------

# FBW argument handed to frame_1_psmct32() for the triangle demo.
TRI_DEMO_FBW         = 1
TRI_DEMO_DISPLAY1_HI = 0x0000_700F   # DW=15 (16 px), DH=7 (8 px)
TRI_DEMO_ZBP         = 2             # Z base 2*2048 = 0x1000 (outside 16x8 fb)

# PRIM value written by build_triangle_demo_payload() (no TME/ABE bits set).
PRIM_TRIANGLE = 3                    # PRIM[2:0]=3 discrete TRIANGLE


def build_triangle_demo_payload():
    """Build the Brick 3 GIF payload as a list of qwords.

    TRI A (three distinct vertex colors, Z=0x300 at every vertex) is
    emitted first together with the PRIM/FRAME/TEST/ZBUF setup; TRI B
    (grey at every vertex, Z=0x100) follows in a second group with EOP=1
    and relies on the register state written by the first group.
    """
    fb_frame = frame_1_psmct32(TRI_DEMO_FBW)

    # (color, screen position) per vertex, in emission order.
    tri_a = (
        ((0xFF, 0x00, 0x00), (1,  1)),   # v0 RED
        ((0x00, 0xFF, 0x00), (14, 1)),   # v1 GREEN
        ((0x00, 0x00, 0xFF), (7,  7)),   # v2 BLUE (closes)
    )
    tri_b = (
        ((0x80, 0x80, 0x80), (2,  5)),   # v0 grey
        ((0x80, 0x80, 0x80), (13, 5)),   # v1 grey
        ((0x80, 0x80, 0x80), (7,  2)),   # v2 grey (closes)
    )

    # U1: setup registers + TRI A = 10 A+D descriptors, EOP=0.
    packets = [
        giftag(1, 0, 0, 10, 0x0000_00EE_EEEE_EEEE),
        aplusd(R_PRIM,    PRIM_TRIANGLE),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_TEST_1,  test1_geq()),
        aplusd(R_ZBUF_1,  zbuf1_pack(TRI_DEMO_ZBP)),
    ]
    # RGBAQ is written before each XYZ2 so the color is in place when the
    # vertex is emitted.
    for (red, green, blue), (px, py) in tri_a:
        packets.append(aplusd(R_RGBAQ, rgbaq_data(red, green, blue)))
        packets.append(aplusd(R_XYZ2,  xyz2_dataz(px, py, 0x300)))

    # U2: TRI B = 6 A+D descriptors, EOP=1.
    packets.append(giftag(1, 1, 0, 6, 0x0000_0000_00EE_EEEE))
    for (red, green, blue), (px, py) in tri_b:
        packets.append(aplusd(R_RGBAQ, rgbaq_data(red, green, blue)))
        packets.append(aplusd(R_XYZ2,  xyz2_dataz(px, py, 0x100)))

    return packets


# Bake the Brick 3 fixture: payload, bootlet sized to the payload length,
# then the two .mem files.
tri_demo_payload = build_triangle_demo_payload()
tri_demo_qwc     = len(tri_demo_payload)
# NOTE(review): TRI_DEMO_DISPLAY1_HI is not passed here; presumably the
# shared bootlet builder already programs a 16x8 window — confirm.
tri_demo_bootlet = build_textured_demo_bootlet(tri_demo_qwc)

write_bios_mem(
    "bios_triangle.mem", tri_demo_bootlet,
    f"Brick3 triangle-demo BIOS bootlet ({len(tri_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={tri_demo_qwc}"
)
write_payload_mem(
    "payload_triangle.mem", tri_demo_payload,
    f"Brick3 triangle-demo GIF payload ({tri_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); Gouraud TRI A (near) + grey TRI B (far), GEQUAL interp-Z"
)


# ---------------------------------------------------------------------------
# Textured-TRIANGLE demo fixture.
#
# Proves the textured-triangle rung end-to-end: upload a small PSMCT32
# texture via BITBLT, then draw ONE non-axis-aligned TRIANGLE (PRIM type
# 3, TME=1) with per-vertex UV. The rasterizer interpolates U/V affinely
# (the shared-divider gradient engine, steps 10..13) and DECAL-samples
# the uploaded texture. ZTE=0, ABE=0 (read2's only consumer is the texel
# fetch). Reuses the GS_RMW_DEMO board profile (read2 live, small VRAM,
# swizzle=0) and adds NO new EE/BIOS scaffolding — only a GIF payload via
# the shared build_textured_demo_bootlet (one-shot DMAC kick + halt).
#
# UV == screen XY at each vertex, so the affine (u,v) == (x,y) at every
# interior pixel and the sampled texel is exactly tex_demo_texel(x,y) —
# a TB can predict each pixel. Triangle covers part of the 16x8 window
# with a slanted edge (non-axis-aligned).
# ---------------------------------------------------------------------------
def build_textured_triangle_demo_payload():
    """Build the textured-triangle GIF payload as a list of qwords.

    Contents, in order: transfer setup for the texture upload, the IMAGE
    data (four 32-bit texels per qword, row-major), and one textured
    TRIANGLE whose per-vertex UV equals its screen XY (EOP=1).
    """
    fb_frame = frame_1_psmct32(TEX_DEMO_FBW)
    texture_reg = tex0_pack(TEX_DEMO_TBP0, TEX_DEMO_TBW, 0, 3, 3)  # 8x8 texture

    # U1: BITBLTBUF / TRXPOS / TRXREG / TRXDIR — 4 A+D registers.
    packets = [
        giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TEX_DEMO_TBP0, TEX_DEMO_TBW, 0)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(TEX_DEMO_TEXW, TEX_DEMO_TEXH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
    ]

    # U2: IMAGE qwords, lane 0 in the low 32 bits.
    image_qwords = (TEX_DEMO_TEXW * TEX_DEMO_TEXH) // 4
    packets.append(giftag(image_qwords, 0, 2, 0, 0))
    for qword_index in range(image_qwords):
        packed = 0
        for lane in range(4):
            row, col = divmod(qword_index * 4 + lane, TEX_DEMO_TEXW)
            packed |= (tex_demo_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packets.append(packed)

    # U3: PRIM(TRI+TME) / FRAME / TEX0, then RGBAQ + UV + XYZ2 per vertex.
    #   3 setup registers + 3 vertices * 3 registers = 12 A+D descriptors,
    #   each a 0xE nibble in REGS. RGBAQ is still written before every
    #   XYZ2; the original note says the DECAL texel overrides the colour.
    #   All three vertices lie inside [0..7] so every UV (== screen XY)
    #   addresses a texel of the uploaded 8x8 texture, and the edges are
    #   not axis-aligned.
    vertices = ((1, 1), (7, 1), (2, 6))
    packets.append(giftag(1, 1, 0, 12, 0x0000_EEEE_EEEE_EEEE))   # 12 A+D, EOP
    packets.append(aplusd(R_PRIM,    prim_tri_tme()))
    packets.append(aplusd(R_FRAME_1, fb_frame))
    packets.append(aplusd(R_TEX0_1,  texture_reg))
    for px, py in vertices:
        packets.extend([
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV,    uv_data(px, py)),
            aplusd(R_XYZ2,  xyz2_data(px, py)),
        ])

    return packets


def prim_tri_tme():
    """Return the PRIM value for a textured TRIANGLE.

    PRIM[2:0] = 3 selects TRIANGLE; bit 4 is TME.
    """
    triangle_type = 3
    tme_bit = 1 << 4
    return triangle_type | tme_bit


# Bake the textured-triangle fixture: payload, bootlet sized to the payload
# length, then the two .mem files.
tritex_demo_payload = build_textured_triangle_demo_payload()
tritex_demo_qwc     = len(tritex_demo_payload)
tritex_demo_bootlet = build_textured_demo_bootlet(tritex_demo_qwc)

write_bios_mem(
    "bios_tritex.mem", tritex_demo_bootlet,
    f"Textured-triangle demo BIOS bootlet ({len(tritex_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={tritex_demo_qwc}"
)
write_payload_mem(
    "payload_tritex.mem", tritex_demo_payload,
    f"Textured-triangle demo GIF payload ({tritex_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT 8x8 texture upload + 1 textured non-axis-aligned TRI"
)


# Bake the Brick 1 textured-sprite fixture (its payload builder is defined
# earlier in the file): payload, bootlet sized to the payload length, then
# the two .mem files.
tex_demo_payload = build_textured_demo_payload()
tex_demo_qwc     = len(tex_demo_payload)
tex_demo_bootlet = build_textured_demo_bootlet(tex_demo_qwc)

write_bios_mem(
    "bios_textured.mem", tex_demo_bootlet,
    f"Brick1 textured-demo BIOS bootlet ({len(tex_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={tex_demo_qwc}"
)
write_payload_mem(
    "payload_textured.mem", tex_demo_payload,
    f"Brick1 textured-demo GIF payload ({tex_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT 8x8 texture upload + textured SPRITE + 1 flat sprite"
)


# ---------------------------------------------------------------------------
# Ch296 — PSMT8 INDEXED-TEXTURE + CLUT demo fixture.
#
# Proves the PALETTIZED texture path end-to-end through the BRAM board
# top: a small PSMT8 texture (an array of 8-bit INDICES) and a CLUT
# (palette of PSMCT32 colors) are BITBLT-uploaded, a TEX0 with CLUT
# fields (PSM=PSMT8, CLD=1 -> load CLUT, CPSM=PSMCT32, CSM=CSM2) commits
# to fire the VRAM->CLUT load, then ONE PSMT8-textured SPRITE is drawn.
# Mistakes are obvious: each index maps to a boldly distinct color and
# the index pattern is a recognizable shape (a framed 'X').
#
# NO new EE/BIOS scaffolding — GIF payload only (BITBLT for the texture
# indices AND for the CLUT, then the TEX0 + textured primitive). The
# bootlet is the SAME one-shot build_textured_demo_bootlet used by every
# other board fixture.
#
# VRAM layout (linear, no swizzle):
#   - PSMT8 texture indices : DBP = CLUT8_TBP0 (= 8 -> byte 0x800)
#   - CLUT (PSMCT32 entries) : DBP = CLUT8_CBP  (= 12 -> byte 0xC00)
#   The two regions are disjoint (texture spans 0x800.. ; CLUT 0xC00..).

# FBW argument handed to frame_1_psmct32() for the PSMT8 demo.
CLUT8_FBW   = 1
CLUT8_TBP0  = 8            # texture base = 8*256 = 0x800
CLUT8_TBW   = 1            # 64 texels/row stride
# Index-texture dimensions in texels; clut8_index() and the upload use both.
CLUT8_TEXW  = 8
CLUT8_TEXH  = 8
CLUT8_CBP   = 12           # CLUT base = 12*256 = 0xC00 (CBP is 256-B units)
CLUT8_DISPLAY1_HI = 0x0000_700F   # DW=15 (16 px), DH=7 (8 px) — 16x8 window


def tex0_clut_pack(tbp0, tbw, cbp, cpsm=0, csm=1, csa=0, cld=1):
    """Pack a TEX0_1 value for an 8x8 PSMT8 texture with its CLUT fields.

    The texture half (TBP0[13:0] TBW[19:14] PSM[25:20] TW[29:26]
    TH[33:30]) comes from tex0_pack() with PSM=0x13 (PSMT8), TW=TH=3.
    The CLUT half is OR-ed on top: CBP[50:37], CPSM[54:51], CSM[55],
    CSA[60:56], CLD[63:61]. Each CLUT argument is masked to its field
    width. With a non-zero CLD the commit is meant to trigger the
    VRAM->CLUT load in clut_loader_stub.
    """
    texture_side = tex0_pack(tbp0, tbw, psm=0x13, tw=3, th=3)   # PSMT8, 8x8
    clut_side = (
        ((cbp & 0x3FFF) << 37)
        | ((cpsm & 0xF) << 51)
        | ((csm & 0x1) << 55)
        | ((csa & 0x1F) << 56)
        | ((cld & 0x7) << 61)
    )
    return texture_side | clut_side


def clut8_index(x, y):
    """Return the palette index of texel (x, y) in the framed-'X' pattern.

    Priority order: texel (0,0) is index 0; any other texel on either
    diagonal is index 3 (this includes the remaining three corners); the
    rest of the outer border is index 1; everything else is index 2.
    """
    last_col = CLUT8_TEXW - 1
    last_row = CLUT8_TEXH - 1

    if (x, y) == (0, 0):
        return 0

    on_diagonal = x == y or x == last_col - y
    if on_diagonal:
        return 3

    on_border = x in (0, last_col) or y in (0, last_row)
    return 1 if on_border else 2


def clut8_palette(i):
    """Return the PSMCT32 ABGR word (A=0xFF) for palette entry ``i``.

    Entries 0..3 are black, red, green and blue; any other index yields
    mid-gray (0x7F per channel). Word layout: A<<24 | B<<16 | G<<8 | R.
    """
    rgb_by_index = {
        0: (0x00, 0x00, 0x00),   # black
        1: (0xFF, 0x00, 0x00),   # red
        2: (0x00, 0xFF, 0x00),   # green
        3: (0x00, 0x00, 0xFF),   # blue
    }
    red, green, blue = rgb_by_index.get(i, (0x7F, 0x7F, 0x7F))
    return 0xFF000000 | (blue << 16) | (green << 8) | red


def build_clut8_demo_payload():
    """Build the Ch296 PSMT8+CLUT GIF payload as a list of qwords.

    Contents, in order: upload of an 8-entry PSMCT32 CLUT, upload of the
    8x8 PSMT8 index texture, a textured SPRITE whose TEX0 carries the
    CLUT-load fields (split over two register groups), and a flat
    control SPRITE at (8,0)..(15,7) that ends the packet (EOP=1).
    """
    packets = []
    emit = packets.append

    # --- U1: upload the CLUT to VRAM[CBP*256]; 8 entries are sent although
    #         the texture only references 0..3. ---
    clut_entries = 8
    emit(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    emit(aplusd(R_BITBLTBUF, bitbltbuf_pack(CLUT8_CBP, 1, 0)))   # DPSM=PSMCT32
    emit(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    emit(aplusd(R_TRXREG,    trxreg_pack(clut_entries, 1)))      # clut_entries x 1
    emit(aplusd(R_TRXDIR,    trxdir_pack(0)))
    clut_qwords = clut_entries // 4                              # 4 entries per qword
    emit(giftag(clut_qwords, 0, 2, 0, 0))                        # IMAGE
    for qword_index in range(clut_qwords):
        packed = 0
        for lane in range(4):
            entry = clut8_palette(qword_index * 4 + lane)
            packed |= (entry & 0xFFFFFFFF) << (32 * lane)
        emit(packed)

    # --- U2: upload the PSMT8 index texture to VRAM[TBP0*256]. ---
    emit(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    emit(aplusd(R_BITBLTBUF, bitbltbuf_pack(CLUT8_TBP0, CLUT8_TBW, 0x13)))  # DPSM=PSMT8
    emit(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    emit(aplusd(R_TRXREG,    trxreg_pack(CLUT8_TEXW, CLUT8_TEXH)))
    emit(aplusd(R_TRXDIR,    trxdir_pack(0)))
    # One byte per index, 16 indices per qword, byte 0 in the low bits,
    # indices taken in row-major order.
    index_count  = CLUT8_TEXW * CLUT8_TEXH
    index_qwords = index_count // 16
    emit(giftag(index_qwords, 0, 2, 0, 0))
    for qword_index in range(index_qwords):
        packed = 0
        for lane in range(16):
            row, col = divmod(qword_index * 16 + lane, CLUT8_TEXW)
            packed |= (clut8_index(col, row) & 0xFF) << (8 * lane)
        emit(packed)

    # --- U3: PRIM(SPRITE+TME) / FRAME / TEX0(PSMT8 + CLUT load) / RGBAQ /
    #         first UV + XYZ2. The TEX0 write is what requests the CLUT load. ---
    fb_frame    = frame_1_psmct32(CLUT8_FBW)
    texture_reg = tex0_clut_pack(CLUT8_TBP0, CLUT8_TBW, CLUT8_CBP,
                                 cpsm=0, csm=1, csa=0, cld=1)
    emit(giftag(1, 0, 0, 6, 0x0000_0000_00EE_EEEE))
    emit(aplusd(R_PRIM,    prim_sprite_tme()))
    emit(aplusd(R_FRAME_1, fb_frame))
    emit(aplusd(R_TEX0_1,  texture_reg))
    emit(aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0x00)))
    emit(aplusd(R_UV,      uv_data(0, 0)))
    emit(aplusd(R_XYZ2,    xyz2_data(0, 0)))

    # --- U4: second UV + XYZ2, closing the textured sprite. ---
    far_u = CLUT8_TEXW - 1
    far_v = CLUT8_TEXH - 1
    emit(giftag(1, 0, 0, 2, 0x0000_0000_0000_00EE))
    emit(aplusd(R_UV,   uv_data(far_u, far_v)))
    emit(aplusd(R_XYZ2, xyz2_data(far_u, far_v)))

    # --- U5: flat control sprite at (8,0)..(15,7). EOP. ---
    emit(giftag(1, 1, 0, 5, 0x0000_0000_000E_EEEE))
    emit(aplusd(R_PRIM,    PRIM_SPRITE))
    emit(aplusd(R_FRAME_1, fb_frame))
    emit(aplusd(R_RGBAQ,   rgbaq_data(0x20, 0xC0, 0x40)))   # flat green
    emit(aplusd(R_XYZ2,    xyz2_data(8, 0)))
    emit(aplusd(R_XYZ2,    xyz2_data(15, 7)))

    return packets


# Bake the Ch296 fixture: payload, bootlet sized to the payload length,
# then the two .mem files.
clut8_demo_payload = build_clut8_demo_payload()
clut8_demo_qwc     = len(clut8_demo_payload)
# NOTE(review): CLUT8_DISPLAY1_HI is not passed here; presumably the shared
# bootlet builder already programs a 16x8 window — confirm.
clut8_demo_bootlet = build_textured_demo_bootlet(clut8_demo_qwc)

write_bios_mem(
    "bios_clut.mem", clut8_demo_bootlet,
    f"Ch296 PSMT8+CLUT demo BIOS bootlet ({len(clut8_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={clut8_demo_qwc}"
)
write_payload_mem(
    "payload_clut.mem", clut8_demo_payload,
    f"Ch296 PSMT8+CLUT demo GIF payload ({clut8_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT CLUT + PSMT8 index texture + TEX0(CLD) + 1 PSMT8 SPRITE"
)


# ---------------------------------------------------------------------------
# Ch297 — PSMT4 INDEXED-TEXTURE + CLUT demo fixture.
#
# Proves the 4-bit palettized texture path (the MOST common PS2 texture
# format) end-to-end through the BRAM board top, built directly on the
# Ch296 PSMT8+CLUT rung — the ONLY new datapath bit is the nibble extract
# in gs_texture_unit. A 16-entry CLUT (PSMCT32, boldly distinct colors,
# all 16 used) and a PSMT4 index texture (2 indices/byte, LINEAR) are
# BITBLT-uploaded, a TEX0 with CLUT fields (PSM=PSMT4=0x14, CLD=1 ->
# VRAM->CLUT load, CPSM=PSMCT32, CSM=CSM2) commits to fire the load, then
# ONE PSMT4-textured SPRITE is drawn + a FLAT control sprite.
#
# Index pattern: idx(x,y) = (x + 2*y) mod 16 -> a diagonal RAMP that walks
# through ALL 16 palette entries, so the 16-color palette is visibly
# exercised (mistakes are obvious — a stuck nibble collapses the ramp).
#
# NO new EE/BIOS scaffolding — GIF payload only; reuses the SAME one-shot
# build_textured_demo_bootlet as every other board fixture.
#
# VRAM layout (linear, no swizzle):
#   - PSMT4 texture indices : DBP = CLUT4_TBP0 (= 8 -> byte 0x800)
#   - CLUT (PSMCT32 entries) : DBP = CLUT4_CBP  (= 12 -> byte 0xC00)

# FBW argument handed to frame_1_psmct32() for the PSMT4 demo.
CLUT4_FBW   = 1
CLUT4_TBP0  = 8            # texture base = 8*256 = 0x800
CLUT4_TBW   = 1            # 64 texels/row stride (texels)
# Index-texture dimensions in texels.
CLUT4_TEXW  = 8
CLUT4_TEXH  = 8
CLUT4_CBP   = 12           # CLUT base = 12*256 = 0xC00 (CBP is 256-B units)


def tex0_clut4_pack(tbp0, tbw, cbp, cpsm=0, csm=1, csa=0, cld=1):
    """Pack a TEX0_1 value for an 8x8 PSMT4 texture with its CLUT fields.

    The texture half comes from tex0_pack() with PSM=0x14 (PSMT4) and
    TW=TH=3. The CLUT half is OR-ed on top, each argument masked to its
    field width: CBP at bit 37 (14 bits), CPSM at bit 51 (4 bits), CSM
    at bit 55 (1 bit), CSA at bit 56 (5 bits), CLD at bit 61 (3 bits).
    With a non-zero CLD the commit is meant to trigger the VRAM->CLUT
    load in clut_loader_stub.
    """
    texture_side = tex0_pack(tbp0, tbw, psm=0x14, tw=3, th=3)   # PSMT4, 8x8
    clut_side = (
        ((cbp & 0x3FFF) << 37)
        | ((cpsm & 0xF) << 51)
        | ((csm & 0x1) << 55)
        | ((csa & 0x1F) << 56)
        | ((cld & 0x7) << 61)
    )
    return texture_side | clut_side


def clut4_index(x, y):
    """Return the palette index of texel (x, y): (x + 2*y) mod 16.

    The result ramps diagonally across the texture so that every one of
    the 16 palette entries appears.
    """
    ramp_position = x + 2 * y
    return ramp_position % 16


def clut4_palette(i):
    """Return the PSMCT32 ABGR word (A=0xFF) for 16-color palette entry ``i``.

    Only the low 4 bits of ``i`` select the entry. Word layout:
    A<<24 | B<<16 | G<<8 | R. The colors are spread across hue and
    brightness so that a collapsed nibble is easy to spot.
    """
    # (R, G, B) per entry, indexed 0..15.
    rgb_entries = (
        (0x00, 0x00, 0x00),  # 0  black
        (0xFF, 0x00, 0x00),  # 1  red
        (0x00, 0xFF, 0x00),  # 2  green
        (0x00, 0x00, 0xFF),  # 3  blue
        (0xFF, 0xFF, 0x00),  # 4  yellow
        (0xFF, 0x00, 0xFF),  # 5  magenta
        (0x00, 0xFF, 0xFF),  # 6  cyan
        (0xFF, 0xFF, 0xFF),  # 7  white
        (0x80, 0x00, 0x00),  # 8  dk red
        (0x00, 0x80, 0x00),  # 9  dk green
        (0x00, 0x00, 0x80),  # 10 dk blue
        (0x80, 0x80, 0x00),  # 11 olive
        (0x80, 0x00, 0x80),  # 12 purple
        (0x00, 0x80, 0x80),  # 13 teal
        (0xC0, 0x60, 0x20),  # 14 orange-brown
        (0x40, 0x40, 0x40),  # 15 dk gray
    )
    red, green, blue = rgb_entries[i & 0xF]
    opaque_alpha = 0xFF000000
    return opaque_alpha | (blue << 16) | (green << 8) | red


def build_clut4_demo_payload():
    """GIF payload (NO new EE/BIOS scaffolding):
       U1 BITBLT the 16-entry CLUT (PSMCT32, no read2) to VRAM[CBP*256]
       U2 TEX0(CLD=4) LOAD-ONLY -> fires the VRAM->CLUT load (read2)
       U3 BITBLT the PSMT4 index texture (read2 nibble-RMW) to VRAM[TBP0*256]
       U4 PRIM(SPRITE+TME) + TEX0(CLD=2, CBP-unchanged -> NO reload) + sprite
       U5 a FLAT control sprite

    KEY ORDERING: the PSMT4 texture upload is a read2 nibble-RMW (is_t4_emit),
    and so is NOT time-disjoint from the read2 VRAM->CLUT load. PSMT8 uploads
    use the pure-comb writer (no read2), so Ch296 could load the CLUT on the
    same TEX0 that draws. For PSMT4 we therefore fire the CLUT load on its OWN
    TEX0 (U2) BEFORE the texture upload — the CLUT_STALL FIFO-pop hold keeps
    the load fully ahead of the upload, and the sprite's TEX0 (U4) uses CLD=2
    (CBP unchanged) so it does not re-load and collide with the raster scan.

    Returns the list of payload qwords in emission order; the order IS the
    fixture, so statements below must not be reordered."""
    qw = []
    frame_1_val = frame_1_psmct32(CLUT4_FBW)

    # --- U1: BITBLT the CLUT (16 PSMCT32 entries) to VRAM[CBP*256]. ---
    n_clut = 16
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(CLUT4_CBP, 1, 0)))     # DPSM=PSMCT32
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(n_clut, 1)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    n_clut_qw = n_clut // 4                                            # 4 entries/qword
    qw.append(giftag(n_clut_qw, 0, 2, 0, 0))                           # IMAGE
    for i in range(n_clut_qw):
        # Pack four 32-bit palette entries per qword, entry 0 in the low lane.
        word = 0
        for lane in range(4):
            word |= (clut4_palette(i * 4 + lane) & 0xFFFFFFFF) << (32 * lane)
        qw.append(word)

    # --- U2: TEX0(CLD=4) LOAD-ONLY. A bare TEX0_1 commit fires the
    #         VRAM->CLUT load (gs_tex0_wr) with NO PSMT4 upload in flight,
    #         so the read2 CLUT load runs disjoint from the read2 RMW
    #         upload that follows. CLD=4 = PARTIAL 16-entry load into the
    #         CSA=0 window (exactly the 16 entries this 4-bit texture
    #         indexes) — a 16-clock load that finishes before the U3 PSMT4
    #         BITBLT drains into the read2 nibble-RMW path. ---
    tex0_load = tex0_clut4_pack(CLUT4_TBP0, CLUT4_TBW, CLUT4_CBP,
                                cpsm=0, csm=1, csa=0, cld=4)
    qw.append(giftag(1, 0, 0, 1, 0x0000_0000_0000_000E))
    qw.append(aplusd(R_TEX0_1, tex0_load))

    # --- U3: BITBLT the PSMT4 index texture to VRAM[TBP0*256]. ---
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(CLUT4_TBP0, CLUT4_TBW, 0x14)))  # DPSM=PSMT4
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(CLUT4_TEXW, CLUT4_TEXH)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    # PSMT4 IMAGE: 32 pixels (4 bits each) per 128-bit qword. Pixel p sits
    # at nibble p (low nibble of byte p/2 for even p, high for odd p), in
    # raster order across the rect.
    # NOTE: the floor division assumes TEXW*TEXH is a multiple of 32 (it is
    # 64 here); a remainder would be silently dropped.
    n_px    = CLUT4_TEXW * CLUT4_TEXH
    n_px_qw = n_px // 32
    qw.append(giftag(n_px_qw, 0, 2, 0, 0))
    for q in range(n_px_qw):
        word = 0
        for lane in range(32):
            t  = q * 32 + lane          # raster-order texel number
            tx = t % CLUT4_TEXW
            ty = t // CLUT4_TEXW
            word |= (clut4_index(tx, ty) & 0xF) << (4 * lane)
        qw.append(word)

    # --- U4: PRIM(SPRITE+TME) / FRAME / TEX0(CLD=2, no reload) / RGBAQ /
    #         UV0 / XYZ2_0. CLD=2 only re-loads if CBP changed; CBP is the
    #         same as U2, so NO reload fires here -> no read2 collision with
    #         the textured sprite's raster scan. ---
    tex0_draw = tex0_clut4_pack(CLUT4_TBP0, CLUT4_TBW, CLUT4_CBP,
                                cpsm=0, csm=1, csa=0, cld=2)
    qw.append(giftag(1, 0, 0, 6, 0x0000_0000_00EE_EEEE))
    qw.append(aplusd(R_PRIM,    prim_sprite_tme()))
    qw.append(aplusd(R_FRAME_1, frame_1_val))
    qw.append(aplusd(R_TEX0_1,  tex0_draw))
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0x00)))
    qw.append(aplusd(R_UV,      uv_data(0, 0)))
    qw.append(aplusd(R_XYZ2,    xyz2_data(0, 0)))

    # --- U4 (cont.): UV1 / XYZ2_1 closing the textured sprite. ---
    qw.append(giftag(1, 0, 0, 2, 0x0000_0000_0000_00EE))
    qw.append(aplusd(R_UV,   uv_data(CLUT4_TEXW - 1, CLUT4_TEXH - 1)))
    qw.append(aplusd(R_XYZ2, xyz2_data(CLUT4_TEXW - 1, CLUT4_TEXH - 1)))

    # --- U5: a FLAT control sprite at (8,0)..(15,7). EOP. ---
    qw.append(giftag(1, 1, 0, 5, 0x0000_0000_000E_EEEE))
    qw.append(aplusd(R_PRIM,    PRIM_SPRITE))
    qw.append(aplusd(R_FRAME_1, frame_1_val))
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(0x20, 0xC0, 0x40)))   # flat green
    qw.append(aplusd(R_XYZ2,    xyz2_data(8, 0)))
    qw.append(aplusd(R_XYZ2,    xyz2_data(15, 7)))

    return qw


# Bake the Ch297 fixture: build the payload, hand its qword count to the
# shared one-shot textured-demo bootlet builder, then write both .mem images,
# each with a human-readable description string.
clut4_demo_payload = build_clut4_demo_payload()
clut4_demo_qwc     = len(clut4_demo_payload)
clut4_demo_bootlet = build_textured_demo_bootlet(clut4_demo_qwc)

write_bios_mem(
    "bios_clut4.mem", clut4_demo_bootlet,
    f"Ch297 PSMT4+CLUT demo BIOS bootlet ({len(clut4_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x8; QWC={clut4_demo_qwc}"
)
write_payload_mem(
    "payload_clut4.mem", clut4_demo_payload,
    f"Ch297 PSMT4+CLUT demo GIF payload ({clut4_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT 16-CLUT + PSMT4 index texture + TEX0(CLD) + 1 PSMT4 SPRITE"
)


# ---------------------------------------------------------------------------
# Ch298 — SWIZZLED PSMT4 INDEXED-TEXTURE + CLUT demo fixture.
#
# The texture INDICES are stored in VRAM in the REAL PS2 PSMT4 BLOCK layout
# (page/block/column swizzle) and the SAMPLER reads them back swizzled, so
# the two VRAM views are CONSISTENT and the rendered pattern is the intended
# diagonal ramp. Built for a board top with PSMT4_SWIZZLE=1 (both the
# gif_image_xfer UPLOAD and the gs_texture_unit SAMPLE take the swizzle path).
#
# WHY THIS IS "VISIBLY WRONG IF INTERPRETED LINEARLY": the texture is 64x32 —
# WIDER than a 32-px PSMT4 block and TALLER than a 16-px block row — so it
# spans MULTIPLE blocks within page 0. The block/column permutation reorders
# the bytes; the SAME VRAM bytes read with a LINEAR addresser (PSMT4_SWIZZLE=0)
# land in the wrong block and produce a scrambled image. The companion board
# TB (tb_top_psmct32_raster_demo_bram_swz4) renders this with PSMT4_SWIZZLE=1
# and checks the diagonal ramp comes back correct.
#
# The GIF payload itself is raster-order IMAGE data (identical SHAPE to the
# clut4 fixture); the swizzle is applied entirely in HARDWARE on the upload
# write — so this adds NO new EE/BIOS scaffolding (reuses the SAME one-shot
# build_textured_demo_bootlet). CLUT load sequenced on its OWN TEX0 BEFORE the
# texture upload, exactly like the linear clut4 fixture (read2 nibble-RMW is
# not time-disjoint from the read2 VRAM->CLUT load).
#
# VRAM layout (swizzled PSMT4):
#   - PSMT4 texture indices : DBP = SWZ4_TBP0 (=36 -> byte 0x2400)
#   - CLUT (PSMCT32 entries) : DBP = SWZ4_CBP  (=32 -> byte 0x2000)
#   (both sit above the 16x32 framebuffer; the texture base is bumped past the
#    CLUT so the larger 64x32 swizzled texture, which spreads across several
#    256-byte blocks of page 0, never overlaps it)

# Ch298 swizzled-PSMT4 fixture geometry.
#
# The 16x32 PSMCT32 framebuffer at FBP=0 occupies bytes 0..0x2000, so both the
# CLUT and the swizzled texture are placed above it:
#   CLUT (16 PSMCT32 entries = 64 B) at CBP=32           -> 0x2000..0x203F
#   64x32 swizzled PSMT4 texture (page-0 blocks), TBP0=36 -> 0x2400..0x27FF
SWZ4_FBW = 1      # 16x32 visible framebuffer (PSMCT32 output, 64 px per row)
SWZ4_CBP = 32     # CLUT base pointer: 32 * 256 = byte 0x2000
SWZ4_TBP0 = 36    # texture base pointer: 36 * 256 = byte 0x2400
SWZ4_TBW = 2      # 128 texels per row; kept EVEN for the PSMT4 swizzle path
SWZ4_TEXW = 64    # spans two 32-px block columns
SWZ4_TEXH = 32    # spans two 16-px block rows
# DISPLAY1 upper word: DH=31 (32 px tall) in bits 12+, DW=15 (16 px wide) below.
SWZ4_DISPLAY1_HI = (31 << 12) | 15


def tex0_swz4_pack(tbp0, tbw, cbp, cpsm=0, csm=1, csa=0, cld=1):
    """Build TEX0_1 for the swizzled 64x32 PSMT4 texture (TW=6, TH=5).

    tex0_pack() supplies the texture half (PSM=0x14); the CLUT fields are then
    OR-ed in one by one, each masked to its register width.
    """
    packed = tex0_pack(tbp0, tbw, psm=0x14, tw=6, th=5)   # PSMT4, 64x32
    # (value, field mask, bit offset) for CBP, CPSM, CSM, CSA, CLD.
    clut_fields = (
        (cbp,  0x3FFF, 37),
        (cpsm, 0xF,    51),
        (csm,  0x1,    55),
        (csa,  0x1F,   56),
        (cld,  0x7,    61),
    )
    for value, mask, shift in clut_fields:
        packed |= (value & mask) << shift
    return packed


def build_swz4_demo_payload():
    """Swizzled PSMT4 demo payload. Same upload structure as the linear clut4
    fixture: CLUT BITBLT -> TEX0(CLD=4) load -> PSMT4 texture BITBLT (HW
    swizzles the destination addresses) -> one 16x32 textured sprite (EOP).
    Unlike the clut4 fixture, NO flat control sprite is emitted here.
    Reuses clut4_index / clut4_palette so the expected pattern matches.

    Returns the list of payload qwords in emission order."""
    qw = []
    frame_1_val = frame_1_psmct32(SWZ4_FBW)

    # --- U1: BITBLT the 16-entry CLUT (PSMCT32, no swizzle) to VRAM[CBP*256]. ---
    n_clut = 16
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(SWZ4_CBP, 1, 0)))     # DPSM=PSMCT32
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(n_clut, 1)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    n_clut_qw = n_clut // 4
    qw.append(giftag(n_clut_qw, 0, 2, 0, 0))
    for i in range(n_clut_qw):
        # Four 32-bit palette entries per qword, lowest entry in the low lane.
        word = 0
        for lane in range(4):
            word |= (clut4_palette(i * 4 + lane) & 0xFFFFFFFF) << (32 * lane)
        qw.append(word)

    # --- U2: TEX0(CLD=4) LOAD-ONLY -> fires the VRAM->CLUT load disjoint
    #         from the read2 PSMT4 upload that follows. ---
    tex0_load = tex0_swz4_pack(SWZ4_TBP0, SWZ4_TBW, SWZ4_CBP,
                               cpsm=0, csm=1, csa=0, cld=4)
    qw.append(giftag(1, 0, 0, 1, 0x0000_0000_0000_000E))
    qw.append(aplusd(R_TEX0_1, tex0_load))

    # --- U3: BITBLT the 64x32 PSMT4 index texture to VRAM[TBP0*256]. With
    #         PSMT4_SWIZZLE=1 the HW writes each nibble at its SWIZZLED block/
    #         column address; the raster-order IMAGE data below is unchanged. ---
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(SWZ4_TBP0, SWZ4_TBW, 0x14)))  # DPSM=PSMT4
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(SWZ4_TEXW, SWZ4_TEXH)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    n_px    = SWZ4_TEXW * SWZ4_TEXH
    n_px_qw = n_px // 32                          # 32 4-bit pixels per qword
    qw.append(giftag(n_px_qw, 0, 2, 0, 0))
    for q in range(n_px_qw):
        word = 0
        for lane in range(32):
            t  = q * 32 + lane                    # raster-order texel number
            tx = t % SWZ4_TEXW
            ty = t // SWZ4_TEXW
            word |= (clut4_index(tx, ty) & 0xF) << (4 * lane)
        qw.append(word)

    # --- U4: PRIM(SPRITE+TME) + TEX0(CLD=2, no reload) + a 16x32 textured
    #         sprite sampling u in [0..15], v in [0..31]. The sampled u range
    #         stays within block column 0, but v in [0..31] CROSSES the 16-px
    #         block-row boundary — the across-block proof the architect wants
    #         visible on screen. ---
    tex0_draw = tex0_swz4_pack(SWZ4_TBP0, SWZ4_TBW, SWZ4_CBP,
                               cpsm=0, csm=1, csa=0, cld=2)
    qw.append(giftag(1, 0, 0, 6, 0x0000_0000_00EE_EEEE))
    qw.append(aplusd(R_PRIM,    prim_sprite_tme()))
    qw.append(aplusd(R_FRAME_1, frame_1_val))
    qw.append(aplusd(R_TEX0_1,  tex0_draw))
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0x00)))
    qw.append(aplusd(R_UV,      uv_data(0, 0)))
    qw.append(aplusd(R_XYZ2,    xyz2_data(0, 0)))

    # Second sprite corner; this GIF packet carries EOP and ends the payload.
    qw.append(giftag(1, 1, 0, 2, 0x0000_0000_0000_00EE))   # EOP
    qw.append(aplusd(R_UV,   uv_data(15, 31)))
    qw.append(aplusd(R_XYZ2, xyz2_data(15, 31)))

    return qw


# Bake the Ch298 fixture. The bootlet builder variant used here additionally
# takes the DISPLAY1 upper word and the framebuffer width for this demo's
# 16x32 window.
swz4_demo_payload = build_swz4_demo_payload()
swz4_demo_qwc     = len(swz4_demo_payload)
swz4_demo_bootlet = build_textured_demo_bootlet_disp(swz4_demo_qwc, SWZ4_DISPLAY1_HI, SWZ4_FBW)

write_bios_mem(
    "bios_swz4.mem", swz4_demo_bootlet,
    f"Ch298 SWIZZLED PSMT4+CLUT demo BIOS bootlet ({len(swz4_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x32; QWC={swz4_demo_qwc}"
)
write_payload_mem(
    "payload_swz4.mem", swz4_demo_payload,
    f"Ch298 SWIZZLED PSMT4+CLUT demo GIF payload ({swz4_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT 16-CLUT + 64x32 SWIZZLED PSMT4 texture + TEX0(CLD) + 16x32 sprite"
)


# ---------------------------------------------------------------------------
# Ch299 — SWIZZLED PSMT8 INDEXED-TEXTURE + CLUT demo fixture.
#
# The SIBLING of the Ch298 swizzled-PSMT4 fixture, MINUS the nibble (PSMT8 is
# 1 byte/texel). The texture INDICES are stored in VRAM in the REAL PS2 PSMT8
# BLOCK layout (page/block/column swizzle) and the SAMPLER reads them back
# swizzled, so the two VRAM views are CONSISTENT and the rendered pattern is the
# intended diagonal ramp. Built for a board top with PSMT8_SWIZZLE=1 (both the
# gif_image_xfer UPLOAD and the gs_texture_unit SAMPLE take the swizzle path).
#
# WHY THIS IS "VISIBLY WRONG IF INTERPRETED LINEARLY": the texture is 64x48 —
# WIDER than a 16-px PSMT8 block column AND TALLER than a 16-px block row — so
# it spans MULTIPLE blocks within page 0 (PSMT8 page = 128x64 px, an 8x4 grid of
# 16x16 blocks). The block/column permutation reorders the bytes; the SAME VRAM
# bytes read with a LINEAR addresser (PSMT8_SWIZZLE=0) land in the wrong block
# and produce a scrambled image. The companion board TB
# (tb_top_psmct32_raster_demo_bram_swz8) renders this with PSMT8_SWIZZLE=1 and
# checks the diagonal ramp comes back correct across block-row boundaries.
#
# SIMPLER SEQUENCING THAN swz4: the PSMT8 upload uses the PURE-COMB writer (one
# byte per texel, no read2 nibble-RMW), so it is TIME-DISJOINT from the read2
# VRAM->CLUT load. We therefore use a SINGLE TEX0(CLD=1) that both loads the
# CLUT and draws — exactly like the Ch296 LINEAR clut8 fixture — rather than the
# separate load-only TEX0 the swz4 read2 conflict forced. NO new EE/BIOS
# scaffolding (GIF payload only; reuses the one-shot textured-demo bootlet).
#
# The geometry stays within ONE page-COLUMN (64 < 128 wide) and ONE page (48 <
# 64 tall) so page_index = page_y*bw_pg + page_x never aliases (with TBW=2 ->
# bw_pg=1, a >1-page-wide AND >1-page-tall texture would alias page(1,0) onto
# page(0,1)); the across-BLOCK proof (block rows at y=16 and y=32) is what makes
# linear-vs-swizzled diverge on screen.
#
# VRAM layout (swizzled PSMT8):
#   FB (16x48 PSMCT32 at FBP=0) occupies bytes 0..0x3000. Place the CLUT and the
#   swizzled texture ABOVE the FB so neither overlaps it:
#     CLUT (16 PSMCT32 entries = 64 B) at CBP=48  -> 0x3000..0x303F
#     64x48 swizzled PSMT8 texture (spans page-0 blocks) at TBP0=52 -> 0x3400..

# Ch299 swizzled-PSMT8 fixture geometry (base pointers are in 256-byte units).
SWZ8_FBW = 1      # 16x48 visible framebuffer (PSMCT32 output)
SWZ8_CBP = 48     # CLUT base pointer: 48 * 256 = byte 0x3000
SWZ8_TBP0 = 52    # texture base pointer: 52 * 256 = byte 0x3400
SWZ8_TBW = 2      # 128 texels per row; kept EVEN for the PSMT8 swizzle path
SWZ8_TEXW = 64    # spans 16-px block columns (boundaries at x=16,32,48)
SWZ8_TEXH = 48    # spans 16-px block rows (boundaries at y=16,32)
# DISPLAY1 upper word: DH=47 (48 px tall) in bits 12+, DW=15 (16 px wide) below.
SWZ8_DISPLAY1_HI = (47 << 12) | 15


def tex0_swz8_pack(tbp0, tbw, cbp, cpsm=0, csm=1, csa=0, cld=1):
    """Build TEX0_1 for the swizzled PSMT8 texture (PSM=0x13).

    TW=6 and TH=6 declare a 64x64 texture; the fixture only samples the top
    48 rows. The CLUT fields are OR-ed onto the tex0_pack() result, each one
    masked to its register width.
    """
    packed = tex0_pack(tbp0, tbw, psm=0x13, tw=6, th=6)   # PSMT8, 64x64 capacity
    # (value, field mask, bit offset) for CBP, CPSM, CSM, CSA, CLD.
    for value, mask, shift in (
        (cbp,  0x3FFF, 37),
        (cpsm, 0xF,    51),
        (csm,  0x1,    55),
        (csa,  0x1F,   56),
        (cld,  0x7,    61),
    ):
        packed |= (value & mask) << shift
    return packed


def swz8_index(x, y):
    """Return the PSMT8 index for texel (x, y): (x + 2*y) mod 16.

    This is the same diagonal ramp as the PSMT4 fixtures. PSMT8 could address
    256 entries, but staying within 16 lets the fixture reuse the 16-colour
    palette and keeps a correct image easy to tell from a scrambled one.
    """
    diagonal = 2 * y + x
    return diagonal % 16


def build_swz8_demo_payload():
    """Swizzled PSMT8 demo payload. CLUT BITBLT -> PSMT8 texture BITBLT (HW
    swizzles the destination addresses, pure-comb writer) -> single
    TEX0(CLD=1) that loads the CLUT AND draws -> one 16x48 textured sprite
    (EOP). No flat control sprite is emitted. Reuses clut4_palette so the
    expected pattern matches the swz4 TB palette helpers.

    Returns the list of payload qwords in emission order."""
    qw = []
    frame_1_val = frame_1_psmct32(SWZ8_FBW)

    # --- U1: BITBLT the 16-entry CLUT (PSMCT32, no swizzle) to VRAM[CBP*256]. ---
    n_clut = 16
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(SWZ8_CBP, 1, 0)))     # DPSM=PSMCT32
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(n_clut, 1)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    n_clut_qw = n_clut // 4
    qw.append(giftag(n_clut_qw, 0, 2, 0, 0))
    for i in range(n_clut_qw):
        # Four 32-bit palette entries per qword, lowest entry in the low lane.
        word = 0
        for lane in range(4):
            word |= (clut4_palette(i * 4 + lane) & 0xFFFFFFFF) << (32 * lane)
        qw.append(word)

    # --- U2: BITBLT the 64x48 PSMT8 index texture to VRAM[TBP0*256]. With
    #         PSMT8_SWIZZLE=1 the HW writes each byte at its SWIZZLED block/
    #         column address; the raster-order IMAGE data below is unchanged.
    #         Pure-comb writer -> time-disjoint from the VRAM->CLUT load. ---
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(SWZ8_TBP0, SWZ8_TBW, 0x13)))  # DPSM=PSMT8
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(SWZ8_TEXW, SWZ8_TEXH)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    n_px    = SWZ8_TEXW * SWZ8_TEXH
    n_px_qw = n_px // 16                          # 16 bytes (indices) per qword
    qw.append(giftag(n_px_qw, 0, 2, 0, 0))
    for q in range(n_px_qw):
        word = 0
        for lane in range(16):
            t  = q * 16 + lane                    # raster-order texel number
            tx = t % SWZ8_TEXW
            ty = t // SWZ8_TEXW
            word |= (swz8_index(tx, ty) & 0xFF) << (8 * lane)
        qw.append(word)

    # --- U3: PRIM(SPRITE+TME) + TEX0(CLD=1, loads CLUT AND draws) + a 16x48
    #         textured sprite sampling u in [0..15], v in [0..47]. The sampled
    #         u range stays within block column 0, but v in [0..47] CROSSES the
    #         16-px block-row boundaries at y=16 AND y=32 — the across-block
    #         proof the architect wants visible on screen. ---
    tex0_draw = tex0_swz8_pack(SWZ8_TBP0, SWZ8_TBW, SWZ8_CBP,
                               cpsm=0, csm=1, csa=0, cld=1)
    qw.append(giftag(1, 0, 0, 6, 0x0000_0000_00EE_EEEE))
    qw.append(aplusd(R_PRIM,    prim_sprite_tme()))
    qw.append(aplusd(R_FRAME_1, frame_1_val))
    qw.append(aplusd(R_TEX0_1,  tex0_draw))
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0x00)))
    qw.append(aplusd(R_UV,      uv_data(0, 0)))
    qw.append(aplusd(R_XYZ2,    xyz2_data(0, 0)))

    # Second sprite corner; this GIF packet carries EOP and ends the payload.
    qw.append(giftag(1, 1, 0, 2, 0x0000_0000_0000_00EE))   # EOP
    qw.append(aplusd(R_UV,   uv_data(15, 47)))
    qw.append(aplusd(R_XYZ2, xyz2_data(15, 47)))

    return qw


# Bake the Ch299 fixture: payload first, then a bootlet parameterised with
# this demo's DISPLAY1 upper word (16x48 window) and framebuffer width.
swz8_demo_payload = build_swz8_demo_payload()
swz8_demo_qwc     = len(swz8_demo_payload)
swz8_demo_bootlet = build_textured_demo_bootlet_disp(swz8_demo_qwc, SWZ8_DISPLAY1_HI, SWZ8_FBW)

write_bios_mem(
    "bios_swz8.mem", swz8_demo_bootlet,
    f"Ch299 SWIZZLED PSMT8+CLUT demo BIOS bootlet ({len(swz8_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x48; QWC={swz8_demo_qwc}"
)
write_payload_mem(
    "payload_swz8.mem", swz8_demo_payload,
    f"Ch299 SWIZZLED PSMT8+CLUT demo GIF payload ({swz8_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT 16-CLUT + 64x48 SWIZZLED PSMT8 texture + TEX0(CLD) + 16x48 sprite"
)


# ---------------------------------------------------------------------------
# Ch300 — SWIZZLED PSMCT32 (direct-color) texture demo. The CLOSURE rung of the
# swizzle layout family (after PSMT4 swz4 + PSMT8 swz8). PSMCT32 is DIRECT color
# (no CLUT), so this fixture is SIMPLER: just a swizzled BITBLT upload + a sprite.
#
# WHY VISIBLY WRONG IF READ LINEARLY: a PSMCT32 page is 64x32 px (8 blocks wide
# by 4 blocks tall, each block 8x8 px). The sampled 16x40 sprite crosses 8-px
# BLOCK boundaries (x=8; y=8,16,24) AND — UNIQUELY for PSMCT32 vs the taller
# PSMT8 page — the 32-px PAGE-ROW boundary at y=32 (pixels y>=32 live in PAGE 1,
# byte 0x2000+ of the texture region). Reading the swizzled bytes LINEARLY
# (PSMCT32_SWIZZLE=0) lands the y>=32 rows in the wrong page entirely ->
# scrambled bottom rows (y=32..39). PSMCT32_SWIZZLE=1 recovers the intended
# coordinate-encoded gradient.
#
# Each texel ENCODES its own (x,y): R=x<<4, G=y*5, B=(x+y)<<2 — so any swizzle
# misplacement is BOTH visible (the gradient tears) and exactly checkable per
# pixel by the companion board TB.
#
# VRAM (PSMCT32_SWIZZLE=1 -> the FB ITSELF is also swizzled, same single param):
#   FB (16x40 PSMCT32 at FBP=0) occupies swizzled pages 0..1 -> 0x0000..~0x2200
#   22x40 swizzled PSMCT32 texture at TBP0=48 -> 0x3000.. (pages 0..1 of its
#   region, max ~0x5500) — clear of the FB. VRAM_BYTES=32 KiB (matches swz8).
#
# !!! HEARTBEAT-SPLICER DODGE (the reason the texture is 22 WIDE but only 16
# SAMPLED) !!! top_psmct32_raster_demo_bram carries the Ch251.3 production
# "heartbeat splicer": it UNCONDITIONALLY overwrites the low 32 bits of ee_ram
# qword 115 (byte 0x730) with CYAN (0xFFFFFF00) on every read — that is the
# animated demo's 17th-SPRITE RGBAQ. The DMAC fetches our GIF payload from this
# same ee_ram, so whatever texel lands at qword 115 (= image-data qword 93, raster
# texel index 372) gets clobbered with CYAN. Mirroring the swz8 fix, we make the
# texture WIDER than the sampled sprite (22 vs 16): texel 372 then sits at column
# 372 % 22 = 20, which is OUTSIDE the sampled u in [0..15] window, so the splice
# corrupts an UPLOADED-BUT-UNSAMPLED texel and the on-screen sprite is clean.
# (If the header size ever changes, texel 372 moves and the board TB — which
# checks every sampled pixel — will fail loudly, so this is self-verifying, not
# silently fragile.) NO new EE/BIOS scaffolding (GIF payload only).
#
# Budget: 22*40 = 880 texels = 220 image qwords + 16 header/draw = 236 <= 240.

# Ch300 swizzled-PSMCT32 fixture geometry (base pointers are in 256-byte units).
SWZ32_FBW = 1     # 16x40 visible framebuffer (PSMCT32 output)
SWZ32_TBP0 = 48   # texture base pointer: 48 * 256 = byte 0x3000, above the swizzled FB
SWZ32_TBW = 1     # 64 texels per row; a PSMCT32 page is 64 wide (bw_pg=FBW, NO >>1)
# Uploaded texture. It is deliberately WIDER than the sampled window so the
# heartbeat-spliced texel 372 lands at column 20, outside what is sampled.
SWZ32_TEXW = 22
SWZ32_TEXH = 40   # v in [0..39] crosses block rows (8,16,24,32) AND page row y=32
# Sampled sprite window; texture columns 16..21 are uploaded but never sampled.
SWZ32_SPRITE_W = 16   # u in [0..15]
SWZ32_SPRITE_H = 40   # v in [0..39]
# DISPLAY1 upper word: DH=39 (40 px tall) in bits 12+, DW=15 (16 px wide) below.
SWZ32_DISPLAY1_HI = ((SWZ32_SPRITE_H - 1) << 12) | (SWZ32_SPRITE_W - 1)


def swz32_texel(x, y):
    """Return the coordinate-encoded PSMCT32 texel for (x, y).

    Channels are R = x<<4, G = y*5 and B = (x+y)<<2, each truncated to 8 bits,
    with A fixed at 0xFF; the word layout is A<<24 | B<<16 | G<<8 | R. Because
    the colour is derived from the coordinate, a misplaced texel both tears
    the gradient and fails an exact per-pixel comparison. R wraps once x
    reaches 16, so the encoding is only collision-free for x in 0..15.
    """
    channels = (
        (x << 4) & 0xFF,          # R
        (y * 5) & 0xFF,           # G
        ((x + y) << 2) & 0xFF,    # B
    )
    # Fold from alpha downwards so the word ends up A | B | G | R.
    abgr = 0xFF
    for channel in reversed(channels):
        abgr = (abgr << 8) | channel
    return abgr


def build_swz32_demo_payload():
    """Swizzled PSMCT32 demo payload: BITBLT a 22x40 PSMCT32 texture (HW swizzles
    the destination, pure-comb word writer) -> PRIM(SPRITE+TME) + TEX0(PSMCT32,
    no CLUT) -> a 16x40 textured sprite. Direct color throughout: no CLUT load.

    Returns the list of payload qwords in emission order. The absolute qword
    positions matter: the heartbeat-splicer dodge depends on how many qwords
    precede the image data, so nothing may be inserted ahead of it."""
    qw = []
    frame_1_val = frame_1_psmct32(SWZ32_FBW)

    # --- U1: BITBLT the 22x40 PSMCT32 texture to VRAM[TBP0*256]. DPSM=PSMCT32
    #         (0x00); with PSMCT32_SWIZZLE=1 the HW writes each pixel at its
    #         SWIZZLED page/block address. 4 PSMCT32 texels per qword. ---
    qw.append(giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE))
    qw.append(aplusd(R_BITBLTBUF, bitbltbuf_pack(SWZ32_TBP0, SWZ32_TBW, 0x00)))  # DPSM=PSMCT32
    qw.append(aplusd(R_TRXPOS,    trxpos_pack(0, 0)))
    qw.append(aplusd(R_TRXREG,    trxreg_pack(SWZ32_TEXW, SWZ32_TEXH)))
    qw.append(aplusd(R_TRXDIR,    trxdir_pack(0)))
    n_px    = SWZ32_TEXW * SWZ32_TEXH
    n_px_qw = n_px // 4                            # 4 PSMCT32 texels per qword
    qw.append(giftag(n_px_qw, 0, 2, 0, 0))
    for q in range(n_px_qw):
        word = 0
        for lane in range(4):
            t  = q * 4 + lane                      # raster-order texel number
            tx = t % SWZ32_TEXW
            ty = t // SWZ32_TEXW
            word |= (swz32_texel(tx, ty) & 0xFFFFFFFF) << (32 * lane)
        qw.append(word)

    # --- U2: PRIM(SPRITE+TME) + TEX0(PSMCT32, NO CLUT) + a 16x40 textured sprite
    #         sampling u in [0..15], v in [0..39] (a sub-window of the 22x40
    #         texture; cols 16..21 are uploaded-but-unsampled, see the splicer
    #         note above). v crosses the PAGE-row boundary at y=32 — the
    #         across-PAGE proof unique to PSMCT32 (page = 64x32 px). ---
    tex0_draw = tex0_pack(SWZ32_TBP0, SWZ32_TBW, psm=0x00, tw=5, th=6)  # PSMCT32, 32x64 cap
    qw.append(giftag(1, 0, 0, 6, 0x0000_0000_00EE_EEEE))
    qw.append(aplusd(R_PRIM,    prim_sprite_tme()))
    qw.append(aplusd(R_FRAME_1, frame_1_val))
    qw.append(aplusd(R_TEX0_1,  tex0_draw))
    qw.append(aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0x00)))
    qw.append(aplusd(R_UV,      uv_data(0, 0)))
    qw.append(aplusd(R_XYZ2,    xyz2_data(0, 0)))

    # Second sprite corner; this GIF packet carries EOP and ends the payload.
    qw.append(giftag(1, 1, 0, 2, 0x0000_0000_0000_00EE))   # EOP
    qw.append(aplusd(R_UV,   uv_data(SWZ32_SPRITE_W - 1, SWZ32_SPRITE_H - 1)))
    qw.append(aplusd(R_XYZ2, xyz2_data(SWZ32_SPRITE_W - 1, SWZ32_SPRITE_H - 1)))

    return qw


# Bake the Ch300 fixture: payload first, then a bootlet parameterised with this
# demo's DISPLAY1 upper word and framebuffer width.
#
# The description strings are derived from the SWZ32_* constants rather than
# hard-coded: they previously claimed "16x48" for the display window, the
# texture and the sprite, but the fixture is a 16x40 window/sprite sampling a
# 22x40 texture.
swz32_demo_payload = build_swz32_demo_payload()
swz32_demo_qwc     = len(swz32_demo_payload)
swz32_demo_bootlet = build_textured_demo_bootlet_disp(swz32_demo_qwc, SWZ32_DISPLAY1_HI, SWZ32_FBW)

write_bios_mem(
    "bios_swz32.mem", swz32_demo_bootlet,
    f"Ch300 SWIZZLED PSMCT32 demo BIOS bootlet ({len(swz32_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = {SWZ32_SPRITE_W}x{SWZ32_SPRITE_H}; "
    f"QWC={swz32_demo_qwc}"
)
write_payload_mem(
    "payload_swz32.mem", swz32_demo_payload,
    f"Ch300 SWIZZLED PSMCT32 demo GIF payload ({swz32_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); BITBLT {SWZ32_TEXW}x{SWZ32_TEXH} SWIZZLED PSMCT32 texture "
    f"+ TEX0 + {SWZ32_SPRITE_W}x{SWZ32_SPRITE_H} sprite"
)


# ---------------------------------------------------------------------------
# Ch301 — PERSPECTIVE-CORRECT textured-triangle demo. A receding "floor" quad
# (two TRIANGLEs) textured with a 4x4-cell checkerboard. The top edge is FAR
# (w=8) and the bottom edge is NEAR (w=1); the texture coords are supplied via
# the ST register (S=u/w, T=v/w) + RGBAQ.Q (=1/w), so the rasterizer recovers
# per-pixel (u,v)=(S/Q, T/Q) through the pipelined reciprocal LUT. Under correct
# perspective the checkerboard rows COMPRESS toward the far (top) edge; an affine
# build would space them evenly. PERSPECTIVE_CORRECT=1 + PSMCT32_SWIZZLE=0 (the
# texture is LINEAR — this rung isolates perspective from the swizzle family).
#
# Interface contract (must match gs_stub Ch301 + the board TB):
#   S_fp = round((u/w) * 2^FRAC) -> ST[23:0]      FRAC=12
#   T_fp = round((v/w) * 2^FRAC) -> ST[55:32]
#   Q_fp = round((1/w) * 2^FRAC) -> RGBAQ[63:32]
#
# Payload is kept <= ~95 qwords so the DMAC (QWC from MADR=0x100=qword16) stops
# BEFORE absolute ee_ram qword 115 (16+QWC-1 < 115), so the Ch251.3 heartbeat
# splicer at qword 115 is never read — no fixture/splicer collision.
# Ch301 perspective-quad fixture parameters.
PERSP_FBW = 1
# Texture base pointer: 8 * 256 = byte 0x800.
# NOTE(review): the original comment calls this "above the 16x24 linear FB",
# but at 256 bytes per FB row (FBW=1, PSMCT32) a 24-row framebuffer would run
# to byte 0x17FF, which appears to overlap this base. The Ch301b floor fixture
# places its texture at 0x1400 for a 19-row FB. Confirm against the VRAM
# addressing model before relying on this layout.
PERSP_TBP0 = 8
PERSP_TBW = 1
PERSP_TEXW = 16
PERSP_TEXH = 16
PERSP_SCR_W = 16      # on-screen quad width in pixels
PERSP_SCR_H = 24      # on-screen quad height in pixels
PERSP_FRAC = 12       # fractional bits of the fixed-point S/T/Q values
PERSP_W_FAR = 8       # depth assigned to the top (far) edge
PERSP_W_NEAR = 1      # depth assigned to the bottom (near) edge
# DISPLAY1 upper word: DW=15 in the low bits, DH=23 from bit 12 upward.
PERSP_DISPLAY1_HI = (PERSP_SCR_W - 1) | ((PERSP_SCR_H - 1) << 12)


def persp_texel(u, v):
    """Return the checkerboard texel at (u, v) as an opaque ABGR word.

    Cells are 4x4 texels; cells whose (column + row) parity is odd are white,
    the rest are dark blue (R=0x20, G=0x20, B=0x90). The coarse pattern makes
    perspective foreshortening easy to see.
    """
    white = 0xFFFFFFFF
    dark_blue = 0xFF000000 | (0x90 << 16) | (0x20 << 8) | 0x20
    odd_cell = ((u >> 2) + (v >> 2)) & 1
    return white if odd_cell else dark_blue


def st_data(s_fp, t_fp):
    """Pack fixed-point S and T into an ST register value.

    S occupies bits 23:0 and T occupies bits 55:32; both are truncated to
    24 bits.
    """
    field_mask = 0xFFFFFF
    s_field = s_fp & field_mask
    t_field = (t_fp & field_mask) << 32
    return s_field | t_field


def rgbaq_with_q(r, g, b, q_fp):
    """Return an RGBAQ value with fixed-point Q placed in bits 63:32.

    The colour part comes from rgbaq_data(r, g, b); ``q_fp`` is truncated to
    32 bits before being OR-ed into the upper half.
    """
    colour = rgbaq_data(r, g, b)
    q_field = (q_fp & 0xFFFFFFFF) << 32
    return colour | q_field


def persp_attrs(u, v, w):
    """Return fixed-point (S, T, Q) = (u/w, v/w, 1/w) scaled by 2**PERSP_FRAC.

    Each value is rounded with Python's built-in round(). ``w`` must be
    non-zero.
    """
    scale = 1 << PERSP_FRAC
    numerators = (u * scale, v * scale, scale)
    return tuple(round(numerator / w) for numerator in numerators)


def persp_vertex_qws(sx, sy, u, v, w):
    """Return the three A+D qwords for one vertex: RGBAQ (with Q), ST, XYZ2.

    (sx, sy) is the screen position; (u, v, w) are the texel coordinates and
    depth from which the fixed-point S/T/Q attributes are derived.
    """
    s_fp, t_fp, q_fp = persp_attrs(u, v, w)
    # The colour is written as black; per the original note it is overridden
    # by the DECAL texel.
    colour_q = aplusd(R_RGBAQ, rgbaq_with_q(0x00, 0x00, 0x00, q_fp))
    tex_st = aplusd(R_ST, st_data(s_fp, t_fp))
    position = aplusd(R_XYZ2, xyz2_data(sx, sy))
    return [colour_q, tex_st, position]


def build_persp_demo_payload():
    """Build the Ch301 perspective-quad GIF payload.

    Emission order: texture BITBLT setup, the 16x16 checkerboard image data,
    PRIM/FRAME/TEX0 setup, then two discrete triangles (the second carries
    EOP). Returns the list of payload qwords in that order.
    """
    frame_1_val = frame_1_psmct32(PERSP_FBW)
    tex0_val = tex0_pack(PERSP_TBP0, PERSP_TBW, psm=0x00, tw=4, th=4)  # PSMCT32 16x16

    # --- U1: upload the 16x16 LINEAR PSMCT32 checkerboard texture. ---
    payload = [
        giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(PERSP_TBP0, PERSP_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(PERSP_TEXW, PERSP_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    image_qwords = (PERSP_TEXW * PERSP_TEXH) // 4      # 4 texels per qword
    payload.append(giftag(image_qwords, 0, 2, 0, 0))
    for first_texel in range(0, image_qwords * 4, 4):
        packed = 0
        for lane in range(4):
            ty, tx = divmod(first_texel + lane, PERSP_TEXW)
            packed |= (persp_texel(tx, ty) & 0xFFFFFFFF) << (32 * lane)
        payload.append(packed)

    # --- U2: PRIM(TRI+TME) / FRAME / TEX0 setup (3 A+D, no EOP). ---
    payload += [
        giftag(1, 0, 0, 3, 0x0000_0000_000E_EEEE),
        aplusd(R_PRIM, prim_tri_tme()),
        aplusd(R_FRAME_1, frame_1_val),
        aplusd(R_TEX0_1, tex0_val),
    ]

    # Screen corner -> vertex qwords. The top edge (sy == 0) is far, every
    # other row is near; u follows screen x directly and v is screen y
    # rescaled from 0..SCR_H-1 onto 0..TEXH-1.
    v_span = PERSP_TEXH - 1
    y_span = PERSP_SCR_H - 1

    def corner_qws(sx, sy):
        depth = PERSP_W_NEAR if sy != 0 else PERSP_W_FAR
        tex_v = round(sy * v_span / y_span)
        return persp_vertex_qws(sx, sy, sx, tex_v, depth)

    right = PERSP_SCR_W - 1
    bottom = PERSP_SCR_H - 1
    # Discrete TRIANGLEs: A = TL,TR,BL ; B = TR,BR,BL.
    triangles = (
        ((0, 0), (right, 0), (0, bottom)),
        ((right, 0), (right, bottom), (0, bottom)),
    )
    last = len(triangles) - 1
    for index, corners in enumerate(triangles):
        eop = 1 if index == last else 0                 # only the final packet ends the payload
        payload.append(giftag(1, eop, 0, 9, 0x0000_000E_EEEE_EEEE))   # 9 A+D per triangle
        for sx, sy in corners:
            payload += corner_qws(sx, sy)
    return payload


# Bake the Ch301 fixture. The size guard below keeps the payload short enough
# that the DMA transfer (which starts at ee_ram qword 16) ends before the
# heartbeat-spliced qword 115 described in the section comment above; 95 is a
# conservative bound for that.
persp_demo_payload = build_persp_demo_payload()
persp_demo_qwc     = len(persp_demo_payload)
assert persp_demo_qwc <= 95, f"perspective payload {persp_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
persp_demo_bootlet = build_textured_demo_bootlet_disp(persp_demo_qwc, PERSP_DISPLAY1_HI, PERSP_FBW)

write_bios_mem(
    "bios_persp.mem", persp_demo_bootlet,
    f"Ch301 PERSPECTIVE-CORRECT demo BIOS bootlet ({len(persp_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = {PERSP_SCR_W}x{PERSP_SCR_H}; QWC={persp_demo_qwc}"
)
write_payload_mem(
    "payload_persp.mem", persp_demo_payload,
    f"Ch301 PERSPECTIVE-CORRECT demo GIF payload ({persp_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); 16x16 checkerboard + 2 TME TRIs (ST/Q perspective, receding floor)"
)


# ---------------------------------------------------------------------------
# Ch301b — PERSPECTIVE-CORRECT FLOOR demo (the human-recognizable version).
#
# The Ch301 demo above proved the per-pixel divide, but its rectangular quad +
# hand-assigned depths are NOT a single coherent projected plane (S=u/w is
# inconsistent across the diagonal seam), so it renders a sheared/diagonal
# pattern that "looks broken". This version derives ALL FOUR vertices from ONE
# pinhole projection of a flat floor, so both triangles share the SAME
# projective plane → no seam shear → a checkerboard that COMPRESSES toward the
# far (top) edge — the unmistakable perspective signature.
#
# Pinhole floor (camera height H above a Y=-H floor, looking down +Z; f=focal):
#   screen_x = CX + f*X/Z ; screen_y = CY + f*H/Z ; w = Z
#   texture  : u = (X+Xe)/(2Xe)*(TEXW-1) ; v = (Z-Znear)/(Zfar-Znear)*(TEXH-1)
# With f=16, H=1.5, CX=16, CY=-6, Xe=1, Znear=1, Zfar=4 the four corners land on
# integer screen coords (computed below). Because screen_x/y AND (u/w,v/w,1/w)
# all come from this ONE projection, s/w,t/w,1/w are screen-AFFINE over the whole
# quad (the fundamental theorem of perspective texturing) → the two triangles
# agree on the shared diagonal. The board TB fits a SINGLE affine S/T/Q plane and
# checks every pixel against it (a floor-plane / seam-continuity check, per the
# architect — NOT a per-triangle reference that could hide this exact bug).
#
# Floor corners (screen sx,sy ; texel u,v ; depth w):
#   NL (near-left)  sx=0  sy=18  u=0  v=0   w=1
#   NR (near-right) sx=32 sy=18  u=15 v=0   w=1
#   FL (far-left)   sx=12 sy=0   u=0  v=15  w=4
#   FR (far-right)  sx=20 sy=0   u=15 v=15  w=4
# (near edge wide+low, far edge narrow+high → trapezoid). Tri1=NL,NR,FR ;
# Tri2=NL,FR,FL ; shared diagonal seam = NL-FR. Texture LINEAR PSMCT32 at 0x1400
# (clear of the 64x19 FB). Payload < qword 115 (heartbeat-splicer safe).
# Ch301b projected-floor fixture parameters.
PFLOOR_FBW = 1
PFLOOR_TBP0 = 20      # 20 * 256 = byte 0x1400, above the 64-px-stride, 19-row FB
PFLOOR_TBW = 1
PFLOOR_TEXW = 16
PFLOOR_TEXH = 16
# DISPLAY1 upper word: DH=18 (19 rows) from bit 12 upward, DW=33 (34 px) below.
PFLOOR_DISPLAY1_HI = 33 | (18 << 12)

# Projected floor corners as (sx, sy, u, v, w): screen position, texel
# coordinates and depth, all taken from the single pinhole projection.
PFLOOR_NL = (0, 18, 0, 0, 1)        # near-left
PFLOOR_NR = (32, 18, 15, 0, 1)      # near-right
PFLOOR_FL = (12, 0, 0, 15, 4)       # far-left
PFLOOR_FR = (20, 0, 15, 15, 4)      # far-right


def pfloor_vertex_qws(corner):
    """Return the three A+D qwords (RGBAQ, ST, XYZ2) for one floor corner.

    `corner` is a 5-tuple (sx, sy, u, v, w). The texel coordinates and depth
    go through persp_attrs() to obtain the S/T/Q values; Q rides in RGBAQ
    (with an all-zero colour), S/T in ST, and the screen position in XYZ2.
    """
    screen_x, screen_y, tex_u, tex_v, depth = corner
    s_val, t_val, q_val = persp_attrs(tex_u, tex_v, depth)
    rgbaq_qw = aplusd(R_RGBAQ, rgbaq_with_q(0x00, 0x00, 0x00, q_val))
    st_qw = aplusd(R_ST, st_data(s_val, t_val))
    xyz2_qw = aplusd(R_XYZ2, xyz2_data(screen_x, screen_y))
    return [rgbaq_qw, st_qw, xyz2_qw]


def build_persp_floor_demo_payload():
    """Assemble the GIF qword list for the perspective-floor demo.

    Emission order:
      U1   texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
           qwords holding the PFLOOR_TEXW x PFLOOR_TEXH persp_texel() texture
           (four 32-bit texels per qword, lane 0 in the low 32 bits).
      U2   PRIM / FRAME_1 / TEX0_1 state.
      Tri1 NL, NR, FR and Tri2 NL, FR, FL -- three A+D qwords per vertex,
           9 per triangle; the second triangle's tag is the EOP one.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(PFLOOR_FBW)
    tex0_reg = tex0_pack(PFLOOR_TBP0, PFLOOR_TBW, psm=0x00, tw=4, th=4)  # PSMCT32 16x16

    # U1: transfer setup for the 16x16 LINEAR PSMCT32 checkerboard.
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(PFLOOR_TBP0, PFLOOR_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(PFLOOR_TEXW, PFLOOR_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]

    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (PFLOOR_TEXW * PFLOOR_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, PFLOOR_TEXW)
            packed |= (persp_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # U2: PRIM(TRI+TME) / FRAME / TEX0 setup.
    # NOTE(review): the REGS field below carries five 0xE nibbles while the
    # register count passed is 3; the value is kept exactly as baked so far.
    packet += [
        giftag(1, 0, 0, 3, 0xE_EEEE),
        aplusd(R_PRIM, prim_tri_tme()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_TEX0_1, tex0_reg),
    ]

    # Both triangles come from the ONE projected quad and share the NL-FR diagonal.
    first_tri = [
        qword
        for corner in (PFLOOR_NL, PFLOOR_NR, PFLOOR_FR)
        for qword in pfloor_vertex_qws(corner)
    ]
    second_tri = [
        qword
        for corner in (PFLOOR_NL, PFLOOR_FR, PFLOOR_FL)
        for qword in pfloor_vertex_qws(corner)
    ]
    packet.append(giftag(1, 0, 0, 9, int("E" * 9, 16)))   # tri1: 9 A+D
    packet.extend(first_tri)
    packet.append(giftag(1, 1, 0, 9, int("E" * 9, 16)))   # tri2: 9 A+D, EOP
    packet.extend(second_tri)
    return packet


# Bake the Ch301b perspective-floor fixture: build the GIF payload, guard its
# length, build the matching bootlet (it receives the qword count, the DISPLAY1
# high word and FBW), then write both .mem files.
pfloor_demo_payload = build_persp_floor_demo_payload()
pfloor_demo_qwc     = len(pfloor_demo_payload)
# Size guard against the heartbeat splicer. NOTE: a plain `assert` is stripped
# when Python runs with -O, so this only fires in a normal (non-optimised) run.
assert pfloor_demo_qwc <= 95, f"persp-floor payload {pfloor_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
pfloor_demo_bootlet = build_textured_demo_bootlet_disp(pfloor_demo_qwc, PFLOOR_DISPLAY1_HI, PFLOOR_FBW)

# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_persp_floor.mem", pfloor_demo_bootlet,
    f"Ch301b PERSPECTIVE FLOOR demo BIOS bootlet ({len(pfloor_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 34x19; QWC={pfloor_demo_qwc}"
)
write_payload_mem(
    "payload_persp_floor.mem", pfloor_demo_payload,
    f"Ch301b PERSPECTIVE FLOOR demo GIF payload ({pfloor_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); 16x16 checkerboard + 2 TME TRIs from ONE projected floor plane"
)


# ---------------------------------------------------------------------------
# Ch302 — COMBINED textured + alpha + depth demo (the multi-beat read-schedule
# probe). Two primitives:
#   1. Opaque GREEN background sprite, ZTE=GEQUAL over the init-0 Z-buffer, so it
#      writes BOTH the framebuffer color (Cbg=green) AND the Z-buffer (Z=Zbg).
#   2. A COMBINED triangle: TME (translucent red/blue texture, texel A=0x40) +
#      ABE (source-over) + ZTE (GEQUAL). Its interpolated Z runs from 0x6000 at
#      the top (>= Zbg -> PASS -> blend texel over green, write Z) to 0x2000 at
#      the bottom (< Zbg -> FAIL -> green shows through unchanged, no writes).
# With COMBINED_TAZ=1 the gs_stub combined FSM runs the 5-beat per-pixel schedule
# (Zread -> Ztest -> texel -> dest -> colorwrite -> Zwrite). AFFINE UV (this rung
# is about memory scheduling, not perspective).
#
# VRAM (16 KiB): with FBW=1 the FB stride is 256 B/row, so the 16-row FB spans
# 0x0000..0x0FFF; the Z-buffer (ZBP=2) spans 0x1000..0x1FFF; the texture must
# therefore live ABOVE both, at TBP0=32 (0x2000) — NOT inside the FB (an earlier
# 0x800 placement was clobbered by the background sprite). Payload < qword 99
# (heartbeat-splicer safe).
# VRAM layout for the Ch302 combined fixture (FB, then Z, then texture).
COMB_FBW = 1
COMB_ZBP = 0x1000 // 2048       # = 2: Z-buffer at byte 0x1000 (even -> ZMSK=0)
COMB_TBP0 = 0x2000 // 256       # = 32: texture at byte 0x2000, above the 0x1000 FB + 0x1000 Z
COMB_TBW = 1
COMB_TEXW = 8
COMB_TEXH = 8
COMB_DISPLAY1_HI = (15 << 12) | 15   # DH=15 (16 tall) at bit 12, DW=15 (16 wide) below
COMB_CBG = (0x00, 0x80, 0x00)        # (r, g, b) of the opaque green background
COMB_ZBG = 0x4000                    # Z stored by the background sprite
# Combined-triangle vertices as (screen x, screen y, Z, texel u, texel v).
# Z is interpolated, so the top edge (0x6000 > Zbg) passes the depth test and
# the bottom vertex (0x2000 < Zbg) fails it.
COMB_V0 = (2, 1, 0x6000, 0, 0)       # top-left  (PASS)
COMB_V1 = (13, 1, 0x6000, 7, 0)      # top-right (PASS)
COMB_V2 = (7, 14, 0x2000, 3, 7)      # bottom    (FAIL)


def prim_tri_tme_abe():
    """Return the PRIM value for a TRIANGLE with TME and ABE set.

    PRIM[2:0] = 3 selects the triangle primitive; bit 4 is TME and bit 6 is ABE.
    """
    triangle_type = 3
    tme_flag = 1 << 4
    abe_flag = 1 << 6
    return triangle_type | tme_flag | abe_flag


def comb_texel(tx, ty):
    """Return the ABGR texel of the translucent two-region texture.

    Columns tx < 4 are RED, all other columns BLUE; alpha is 0x40 everywhere
    and `ty` does not affect the result. The two halves differ so the TB can
    show that the texel feeds the blend and that the affine UV mapping is
    right. Packing is A<<24 | B<<16 | G<<8 | R.
    """
    alpha = 0x40
    red, green, blue = (0xFF, 0x00, 0x00) if tx < 4 else (0x00, 0x00, 0xFF)
    return red | (green << 8) | (blue << 16) | (alpha << 24)


def build_combined_demo_payload():
    """Assemble the GIF qword list for the Ch302 combined tex+alpha+depth demo.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
          qwords for the COMB_TEXW x COMB_TEXH comb_texel() texture (four
          32-bit texels per qword, lane 0 in the low 32 bits).
      U2  green background sprite: PRIM, FRAME_1, TEST_1, ZBUF_1, RGBAQ and two
          XYZ2 corners (0,0)-(15,15) at Z = COMB_ZBG -> 7 A+D.
      U3  combined triangle: PRIM, ALPHA_1, TEX0_1 plus RGBAQ/UV/XYZ2 for each
          of COMB_V0..COMB_V2 -> 3 + 9 = 12 A+D; this tag is the EOP one.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(COMB_FBW)
    tex0_reg = tex0_pack(COMB_TBP0, COMB_TBW, psm=0x00, tw=3, th=3)  # PSMCT32 8x8

    # --- U1: upload the 8x8 translucent PSMCT32 texture. ---
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(COMB_TBP0, COMB_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(COMB_TEXW, COMB_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (COMB_TEXW * COMB_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, COMB_TEXW)
            packed |= (comb_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # --- U2: opaque GREEN background sprite (ZTE GEQUAL writes FB + Z). 7 A+D. ---
    packet += [
        giftag(1, 0, 0, 7, int("E" * 7, 16)),
        aplusd(R_PRIM, PRIM_SPRITE),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_TEST_1, test1_geq()),
        aplusd(R_ZBUF_1, zbuf1_pack(COMB_ZBP)),
        aplusd(R_RGBAQ, rgbaq_data(*COMB_CBG)),
        aplusd(R_XYZ2, xyz2_dataz(0, 0, COMB_ZBG)),
        aplusd(R_XYZ2, xyz2_dataz(15, 15, COMB_ZBG)),
    ]

    # --- U3: COMBINED triangle: TME + ABE(source-over) + ZTE(GEQUAL). EOP. ---
    #   Only PRIM/ALPHA_1/TEX0 are set here; TEST_1/ZBUF_1/FRAME_1 persist from U2.
    #   12 A+D in total: PRIM, ALPHA_1, TEX0 + 3 * (RGBAQ, UV, XYZ2).
    packet += [
        giftag(1, 1, 0, 12, int("E" * 12, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),   # source-over: Cv=((Cs-Cd)*As)>>7+Cd
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y, vert_z, tex_u, tex_v in (COMB_V0, COMB_V1, COMB_V2):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),   # colour overridden by texel (DECAL)
            aplusd(R_UV, uv_data(tex_u, tex_v)),              # affine texel coordinates
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, vert_z)),
        ]
    return packet


# Bake the Ch302 combined fixture: build the GIF payload, guard its length,
# build the matching bootlet, then write both .mem files.
comb_demo_payload = build_combined_demo_payload()
comb_demo_qwc     = len(comb_demo_payload)
# Size guard against the heartbeat splicer. NOTE: a plain `assert` is stripped
# when Python runs with -O, so this only fires in a normal (non-optimised) run.
assert comb_demo_qwc <= 95, f"combined payload {comb_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
comb_demo_bootlet = build_textured_demo_bootlet_disp(comb_demo_qwc, COMB_DISPLAY1_HI, COMB_FBW)

# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_combined.mem", comb_demo_bootlet,
    f"Ch302 COMBINED tex+alpha+depth BIOS bootlet ({len(comb_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x16; QWC={comb_demo_qwc}"
)
write_payload_mem(
    "payload_combined.mem", comb_demo_payload,
    f"Ch302 COMBINED tex+alpha+depth GIF payload ({comb_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); green Z-bg + translucent textured TRI w/ interpolated Z (half pass/half fail)"
)


# ---------------------------------------------------------------------------
# Ch303 — TILE-LOCAL combined renderer demo. Same combined TME+ABE+ZTE triangle
# as Ch302, but rendered into an ON-CHIP 16x16 color+Z tile (CLEAR→RENDER→FLUSH).
# Differences from the Ch302 fixture:
#   - NO background sprite: the "green background" is the FSM's CLEAR phase
#     (TILE_CLEAR_COLOR=green, TILE_CLEAR_Z=Zbg=0x4000 — gs_stub params), so the
#     fixture is JUST the texture upload + the combined triangle.
#   - The Z-buffer is ON-CHIP (tile_z), so NO Z-buffer in VRAM; the texture moves
#     down to TBP0=16 (0x1000), above the 16-row FB (0..0xFFF). VRAM 8 KiB.
#   - ZBUF_1 is still written but IGNORED in tile mode (Z is tile-local).
# Expected result == Ch302: cleared green, triangle top half blended (red->orange,
# blue->teal over green) where depth passes, bottom half occluded (green).
# Buffer geometry for the Ch303 tile-local fixture.
TILE_FBW = 1
TILE_TBP0 = 0x1000 // 256        # = 16: texture at byte 0x1000, above the 0x1000 FB (no VRAM Z in tile mode)
TILE_TBW = 1
TILE_TEXW = 8
TILE_TEXH = 8
TILE_ZBP = 2                     # still written to ZBUF_1 but ignored (Z is the on-chip tile_z)
TILE_DISPLAY1_HI = (15 << 12) | 15   # DH=15 (16 tall) at bit 12, DW=15 (16 wide) below
# Triangle vertices as (screen x, screen y, Z, texel u, texel v).
TILE_V0 = (2, 1, 0x6000, 0, 0)       # top-left  (PASS)
TILE_V1 = (13, 1, 0x6000, 7, 0)      # top-right (PASS)
TILE_V2 = (7, 14, 0x2000, 3, 7)      # bottom    (FAIL)


def build_tile_demo_payload():
    """Assemble the GIF qword list for the Ch303 tile-local combined demo.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
          qwords for the TILE_TEXW x TILE_TEXH comb_texel() texture (four
          32-bit texels per qword, lane 0 in the low 32 bits).
      U2  the combined TME+ABE+ZTE triangle, EOP. There is no background
          sprite: the green background comes from the tile CLEAR phase.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(TILE_FBW)
    tex0_reg = tex0_pack(TILE_TBP0, TILE_TBW, psm=0x00, tw=3, th=3)  # PSMCT32 8x8

    # --- U1: upload the 8x8 translucent PSMCT32 texture (reuses comb_texel). ---
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TILE_TBP0, TILE_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TILE_TEXW, TILE_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (TILE_TEXW * TILE_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, TILE_TEXW)
            packed |= (comb_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # --- U2: the COMBINED triangle, EOP. 15 A+D in total:
    #         PRIM, FRAME, ALPHA_1, TEST_1, ZBUF_1, TEX0 (6) + 3 * (RGBAQ, UV, XYZ2) (9).
    packet += [
        giftag(1, 1, 0, 15, int("E" * 15, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),    # source-over
        aplusd(R_TEST_1, test1_geq()),                # ZTE GEQUAL
        aplusd(R_ZBUF_1, zbuf1_pack(TILE_ZBP)),       # ignored in tile mode
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y, vert_z, tex_u, tex_v in (TILE_V0, TILE_V1, TILE_V2):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(tex_u, tex_v)),
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, vert_z)),
        ]
    return packet


# ---------------------------------------------------------------------------
# Ch323 — TILE COLOR+Z SPILL/RELOAD two-batch proof fixture.
# TWO TME+ABE+ZTE triangles (the tile-local path's close_combined gate REQUIRES
# triangle+TME+ABE+ZTE — see gs_stub.sv:4764; a flat/sprite/no-texture prim would
# take the non-tile path and never exercise spill/reload). Texture is INCIDENTAL:
# an opaque 2-region texel map (left=color1, right=color2), nearest, sampled with
# uniform UVs so each prim is effectively a flat color. The proof hinges on Z:
#   CLEAR Z = 0x4000 (gs_stub TILE_CLEAR_Z); GEQUAL (frag_z >= stored passes).
#   P1 (batch 1): region A (small top-left tri), Z=0x8000 (near), UV->color1.
#   P2 (batch 2): region A+B (large tri), Z=0x6000 (mid),  UV->color2.
# Single-prim tile mode -> 2 prims = 2 tile cycles; cycle 2 reloads cycle 1's
# flushed color+Z. Region A overlap: reloaded Z=0x8000 makes P2 (0x6000) FAIL ->
# keeps color1 (Z survived). Region B (P2 only): reloaded Z=0x4000 -> P2 PASSES ->
# color2 (control). Negative control (reload off): region A reloads clear 0x4000 ->
# P2 passes -> color2; the color1<->color2 flip proves the result depends on reload.
# Buffer geometry and the two proof colours for the Ch323 spill/reload fixture.
TSPILL_FBW = 1
TSPILL_TBP0 = 0x1000 // 256           # = 16: texture at byte 0x1000, above the 16-row FB
TSPILL_TBW = 1
TSPILL_TEXW = 8
TSPILL_TEXH = 8
TSPILL_ZBP = 2
TSPILL_DISPLAY1_HI = (15 << 12) | 15  # 16x16
# ABGR texel values (A<<24 | B<<16 | G<<8 | R).
TSPILL_COLOR1 = (0xFF << 24) | 0xFF            # opaque RED  (color1, P1 / region A)
TSPILL_COLOR2 = (0xFF << 24) | (0xFF << 16)    # opaque BLUE (color2, P2 / region B)

def tspill_texel(tx, ty):
    """Return the texel of the opaque two-region texture.

    Columns tx < 4 give TSPILL_COLOR1, the rest TSPILL_COLOR2; `ty` is unused.
    Both colours carry A=0xFF so that the source-over blend leaves the rendered
    colour equal to the texel colour.
    NOTE(review): the Ch305 fixture in this file treats A=0x80 as the opaque
    value under the same ALPHA_1 setting -- confirm 0xFF is intended here.
    """
    if tx < 4:
        return TSPILL_COLOR1
    return TSPILL_COLOR2

def build_tile_spill_demo_payload():
    """Assemble the GIF qword list for the Ch323 two-batch spill/reload proof.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
          qwords for the TSPILL_TEXW x TSPILL_TEXH tspill_texel() texture.
      U2  batch 1 -- P1, the small top-left triangle (region A) at Z=0x8000,
          every vertex sampling texel (0,0) -> color1. Full state, 15 A+D.
      U3  batch 2 -- P2, the large triangle (region A+B) at Z=0x6000, every
          vertex sampling texel (7,0) -> color2. Vertices only, 9 A+D, EOP.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(TSPILL_FBW)
    tex0_reg = tex0_pack(TSPILL_TBP0, TSPILL_TBW, psm=0x00, tw=3, th=3)  # 8x8 PSMCT32

    # --- U1: upload the opaque 2-region 8x8 texture ---
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TSPILL_TBP0, TSPILL_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TSPILL_TEXW, TSPILL_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (TSPILL_TEXW * TSPILL_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, TSPILL_TEXW)
            packed |= (tspill_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # --- U2: BATCH 1 -- P1 near triangle. 15 A+D, EOP=0 because P2 follows. ---
    packet += [
        giftag(1, 0, 0, 15, int("E" * 15, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),     # source-over
        aplusd(R_TEST_1, test1_geq()),                 # ZTE GEQUAL
        aplusd(R_ZBUF_1, zbuf1_pack(TSPILL_ZBP)),      # ignored in tile mode
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y in ((0, 0), (8, 0), (0, 8)):    # small top-left tri = region A
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(0, 0)),               # uniform UV in the left (color1) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x8000)),
        ]

    # --- U3: BATCH 2 -- P2 mid triangle. PRIM/FRAME/ALPHA/TEST/ZBUF/TEX0 persist
    #     from U2, so only RGBAQ/UV/XYZ2 are re-sent. 9 A+D, EOP=1. ---
    packet.append(giftag(1, 1, 0, 9, int("E" * 9, 16)))
    for vert_x, vert_y in ((0, 0), (15, 0), (0, 15)):  # large tri = region A+B
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(7, 0)),               # uniform UV in the right (color2) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x6000)),
        ]
    return packet

# Bake the Ch323 tile spill/reload fixture: build the GIF payload, guard its
# length, build the matching bootlet, then write both .mem files.
tspill_demo_payload = build_tile_spill_demo_payload()
tspill_demo_qwc     = len(tspill_demo_payload)
# Size guard against the splicer. NOTE: a plain `assert` is stripped when
# Python runs with -O, so this only fires in a normal (non-optimised) run.
assert tspill_demo_qwc <= 95, f"tile-spill payload {tspill_demo_qwc} qwords may collide with splicer"
tspill_demo_bootlet = build_textured_demo_bootlet_disp(tspill_demo_qwc, TSPILL_DISPLAY1_HI, TSPILL_FBW)
# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_tile_spill.mem", tspill_demo_bootlet,
    f"Ch323 tile spill/reload two-batch proof bootlet ({len(tspill_demo_bootlet)} words; "
    f"DISPLAY1=16x16; QWC={tspill_demo_qwc})"
)
write_payload_mem(
    "payload_tile_spill.mem", tspill_demo_payload,
    f"Ch323 tile spill/reload proof ({tspill_demo_qwc} qwords): P1 near(Z=0x8000,color1) region A + "
    f"P2 mid(Z=0x6000,color2) region A+B; 2 TME+ABE+ZTE tris = 2 tile batches"
)


# ---------------------------------------------------------------------------
# Ch324 — 4x4 MULTI-TILE two-batch spill/reload proof. Same two-triangle Z model
# as Ch323 (P1 near color1 spills, P2 mid color2 reloads + depth-tests), but scaled
# to a 64x64 framebuffer = a 4x4 grid of 16x16 tiles (FBW=1). Geometry is placed to
# satisfy the grid acceptance:
#   - P1 (batch1, near Z=0x8000, color1): SMALL tri (20,20)-(44,20)-(20,44)  [x+y<64]
#   - P2 (batch2, mid  Z=0x6000, color2): LARGE tri (20,20)-(60,20)-(20,60)  [x+y<80], P2 superset P1
#   Overlap (x+y<64) spans tiles (1,1)/(2,1)/(1,2) -> reloaded Z=0x8000 -> P2 FAILS ->
#   color1 SURVIVES in MULTIPLE tiles (depth survival > 1 tile). Region B (64<x+y<80)
#   spans (3,1)/(2,2)/(1,3) -> reloaded clear 0x4000 -> P2 PASSES -> color2 (control).
#   Both tris cross x=32/48 AND y=32/48 seams. Top tile row (y<20) is EMPTY: tile 0
#   (col0,row0) is a LEADING empty tile; tile 15 (col3,row3, x+y >= 96 > 80) is a
#   FULLY EMPTY tile rendered AFTER non-empty tiles. Texture: same boring opaque
#   2-region (left=color1, right=color2) as Ch323; UVs uniform so each prim is flat.
# Buffer geometry and the two proof colours for the Ch324 4x4 fixture.
TS4_FBW = 1
TS4_TBP0 = 0x4000 // 256            # = 64: texture at byte 0x4000, above the 64-row, 16 KiB FB
TS4_TBW = 1
TS4_TEXW = 8
TS4_TEXH = 8
TS4_ZBP = 8
TS4_DISPLAY1_HI = 63 | (63 << 12)   # DW=63 (64 wide) low, DH=63 (64 tall) at bit 12
# ABGR texel values (A<<24 | B<<16 | G<<8 | R).
TS4_COLOR1 = (0xFF << 24) | 0xFF            # opaque RED  (color1, P1 / region A)
TS4_COLOR2 = (0xFF << 24) | (0xFF << 16)    # opaque BLUE (color2, P2 / region B)

def ts4_texel(tx, ty):
    """Return TS4_COLOR1 for columns tx < 4 and TS4_COLOR2 otherwise (`ty` unused)."""
    if tx < 4:
        return TS4_COLOR1
    return TS4_COLOR2

def build_tile_spill_4x4_demo_payload():
    """Assemble the GIF qword list for the Ch324 4x4 multi-tile spill/reload proof.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
          qwords for the TS4_TEXW x TS4_TEXH ts4_texel() texture.
      U2  batch 1 -- P1, the small triangle (20,20)-(44,20)-(20,44) at
          Z=0x8000, uniform UV (0,0) -> color1. Full state, 15 A+D.
      U3  batch 2 -- P2, the large triangle (20,20)-(60,20)-(20,60) at
          Z=0x6000, uniform UV (7,0) -> color2. Vertices only, 9 A+D, EOP.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(TS4_FBW)
    tex0_reg = tex0_pack(TS4_TBP0, TS4_TBW, psm=0x00, tw=3, th=3)  # 8x8 PSMCT32

    # --- U1: upload the opaque 2-region 8x8 texture ---
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TS4_TBP0, TS4_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TS4_TEXW, TS4_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (TS4_TEXW * TS4_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, TS4_TEXW)
            packed |= (ts4_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # --- U2: BATCH 1 -- P1 near tri (region A). 15 A+D, EOP=0. ---
    packet += [
        giftag(1, 0, 0, 15, int("E" * 15, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),     # source-over
        aplusd(R_TEST_1, test1_geq()),                 # ZTE GEQUAL
        aplusd(R_ZBUF_1, zbuf1_pack(TS4_ZBP)),         # ignored in tile mode
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y in ((20, 20), (44, 20), (20, 44)):   # small tri = region A
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(0, 0)),               # uniform UV in the left (color1) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x8000)),
        ]

    # --- U3: BATCH 2 -- P2 mid tri (region A+B). Context persists from U2, so
    #     only RGBAQ/UV/XYZ2 are re-sent. 9 A+D, EOP=1. ---
    packet.append(giftag(1, 1, 0, 9, int("E" * 9, 16)))
    for vert_x, vert_y in ((20, 20), (60, 20), (20, 60)):   # large tri = region A+B
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(7, 0)),               # uniform UV in the right (color2) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x6000)),
        ]
    return packet

# Bake the Ch324 4x4 spill/reload fixture: build the GIF payload, guard its
# length, build the matching bootlet, then write both .mem files.
ts4_demo_payload = build_tile_spill_4x4_demo_payload()
ts4_demo_qwc     = len(ts4_demo_payload)
# Size guard against the splicer. NOTE: a plain `assert` is stripped when
# Python runs with -O, so this only fires in a normal (non-optimised) run.
assert ts4_demo_qwc <= 95, f"tile-spill-4x4 payload {ts4_demo_qwc} qwords may collide with splicer"
ts4_demo_bootlet = build_textured_demo_bootlet_disp(ts4_demo_qwc, TS4_DISPLAY1_HI, TS4_FBW)
# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_tile_spill4x4.mem", ts4_demo_bootlet,
    f"Ch324 4x4 multi-tile spill/reload two-batch proof bootlet ({len(ts4_demo_bootlet)} words; "
    f"DISPLAY1=64x64; QWC={ts4_demo_qwc})"
)
write_payload_mem(
    "payload_tile_spill4x4.mem", ts4_demo_payload,
    f"Ch324 4x4 spill/reload proof ({ts4_demo_qwc} qwords): P1 near(Z=0x8000,color1) small + "
    f"P2 mid(Z=0x6000,color2) large, cross-seam, 4x4 grid; depth survival in overlap tiles"
)


# ---------------------------------------------------------------------------
# Ch325 — 8x8 MULTI-TILE two-batch spill/reload proof. The Ch324 4x4 scene scaled
# 2x to a 128x128 framebuffer = an 8x8 grid of 16x16 tiles (FBW=2). Same Z model:
#   - P1 (batch1, near Z=0x8000, color1): tri (40,40)-(88,40)-(40,88)  [x+y<128]
#   - P2 (batch2, mid  Z=0x6000, color2): tri (40,40)-(120,40)-(40,120)[x+y<160], superset
#   Overlap (x+y<128) keeps color1 (depth survival) across MANY tiles; region B
#   (128<x+y<160) takes color2; top rows (y<40) + bottom-right (x+y>160) tiles stay
#   EMPTY. Both tris cross many x/y=16k seams. Texture: same opaque 2-region.
# Buffer geometry and the two proof colours for the Ch325 8x8 fixture.
TS8_FBW = 2
TS8_TBP0 = 0                          # Ch326 -- texture at byte 0x0 (FB is LPDDR-only, BRAM FB region free)
TS8_TBW = 2
TS8_TEXW = 8
TS8_TEXH = 8
TS8_ZBP = 16
TS8_DISPLAY1_HI = 127 | (127 << 12)   # DW=127 (128 wide) low, DH=127 (128 tall) at bit 12
# ABGR texel values (A<<24 | B<<16 | G<<8 | R).
TS8_COLOR1 = (0xFF << 24) | 0xFF            # opaque RED  (color1, P1)
TS8_COLOR2 = (0xFF << 24) | (0xFF << 16)    # opaque BLUE (color2, P2)

def ts8_texel(tx, ty):
    """Return TS8_COLOR1 for columns tx < 4 and TS8_COLOR2 otherwise (`ty` unused)."""
    if tx < 4:
        return TS8_COLOR1
    return TS8_COLOR2

def build_tile_spill_8x8_demo_payload():
    """Assemble the GIF qword list for the Ch325 8x8 multi-tile spill/reload proof.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
          qwords for the TS8_TEXW x TS8_TEXH ts8_texel() texture.
      U2  batch 1 -- P1, triangle (40,40)-(88,40)-(40,88) at Z=0x8000, uniform
          UV (0,0) -> color1. Full state, 15 A+D.
      U3  batch 2 -- P2, triangle (40,40)-(120,40)-(40,120) at Z=0x6000,
          uniform UV (7,0) -> color2. Vertices only, 9 A+D, EOP.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(TS8_FBW)
    tex0_reg = tex0_pack(TS8_TBP0, TS8_TBW, psm=0x00, tw=3, th=3)  # 8x8 PSMCT32

    # --- U1: upload the opaque 2-region 8x8 texture ---
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TS8_TBP0, TS8_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TS8_TEXW, TS8_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (TS8_TEXW * TS8_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, TS8_TEXW)
            packed |= (ts8_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # --- U2: BATCH 1 -- P1 near tri. 15 A+D, EOP=0. ---
    packet += [
        giftag(1, 0, 0, 15, int("E" * 15, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),     # source-over
        aplusd(R_TEST_1, test1_geq()),                 # ZTE GEQUAL
        aplusd(R_ZBUF_1, zbuf1_pack(TS8_ZBP)),         # ignored in tile mode
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y in ((40, 40), (88, 40), (40, 88)):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(0, 0)),               # uniform UV in the left (color1) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x8000)),
        ]

    # --- U3: BATCH 2 -- P2 mid tri. Context persists from U2. 9 A+D, EOP=1. ---
    packet.append(giftag(1, 1, 0, 9, int("E" * 9, 16)))
    for vert_x, vert_y in ((40, 40), (120, 40), (40, 120)):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(7, 0)),               # uniform UV in the right (color2) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x6000)),
        ]
    return packet

# Bake the Ch325 8x8 spill/reload fixture: build the GIF payload, guard its
# length, build the matching bootlet, then write both .mem files.
ts8_demo_payload = build_tile_spill_8x8_demo_payload()
ts8_demo_qwc     = len(ts8_demo_payload)
# Size guard against the splicer. NOTE: a plain `assert` is stripped when
# Python runs with -O, so this only fires in a normal (non-optimised) run.
assert ts8_demo_qwc <= 95, f"tile-spill-8x8 payload {ts8_demo_qwc} qwords may collide with splicer"
ts8_demo_bootlet = build_textured_demo_bootlet_disp(ts8_demo_qwc, TS8_DISPLAY1_HI, TS8_FBW)
# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_tile_spill8x8.mem", ts8_demo_bootlet,
    f"Ch325 8x8 multi-tile spill/reload two-batch proof bootlet ({len(ts8_demo_bootlet)} words; "
    f"DISPLAY1=128x128; QWC={ts8_demo_qwc})"
)
write_payload_mem(
    "payload_tile_spill8x8.mem", ts8_demo_payload,
    f"Ch325 8x8 spill/reload proof ({ts8_demo_qwc} qwords): P1 near(Z=0x8000,color1) + "
    f"P2 mid(Z=0x6000,color2), cross-seam, 8x8 grid / 128x128; depth survival in overlap tiles"
)

# ===== Ch327b — 16x16 grid = 256x256 PSMCT32 raster FB (2x the Ch325 8x8 scene) =====
# Only the framebuffer geometry differs from the 8x8 scene; the texture and
# ZBP settings are shared with it through the TS8_* constants.
TS16_FBW = 256 // 64                    # = 4: 256-pixel-wide FB in 64-pixel units
TS16_DISPLAY1_HI = 255 | (255 << 12)    # DW=255 (256 wide) low, DH=255 (256 tall) at bit 12
def build_tile_spill_16x16_demo_payload():
    """Assemble the GIF qword list for the Ch327b 16x16-grid spill/reload proof.

    Same structure as the 8x8 scene with the vertex coordinates doubled. The
    texture (TS8_TBP0/TS8_TBW/TS8_TEXW/TS8_TEXH, ts8_texel) and TS8_ZBP are
    reused; only FRAME_1 differs, using TS16_FBW.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR plus image data.
      U2  batch 1 -- P1, triangle (80,80)-(176,80)-(80,176) at Z=0x8000,
          uniform UV (0,0) -> color1. Full state, 15 A+D.
      U3  batch 2 -- P2, triangle (80,80)-(240,80)-(80,240) at Z=0x6000,
          uniform UV (7,0) -> color2. Vertices only, 9 A+D, EOP.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(TS16_FBW)
    tex0_reg = tex0_pack(TS8_TBP0, TS8_TBW, psm=0x00, tw=3, th=3)  # same 8x8 texture @ vram 0

    # --- U1: upload the opaque 2-region 8x8 texture (identical to 8x8) ---
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TS8_TBP0, TS8_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TS8_TEXW, TS8_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (TS8_TEXW * TS8_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, TS8_TEXW)
            packed |= (ts8_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # --- U2: BATCH 1 -- P1 near tri (coords 2x the 8x8). 15 A+D, EOP=0. ---
    packet += [
        giftag(1, 0, 0, 15, int("E" * 15, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),
        aplusd(R_TEST_1, test1_geq()),
        aplusd(R_ZBUF_1, zbuf1_pack(TS8_ZBP)),
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y in ((80, 80), (176, 80), (80, 176)):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(0, 0)),               # uniform UV in the left (color1) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x8000)),
        ]

    # --- U3: BATCH 2 -- P2 mid tri (coords 2x). 9 A+D, EOP=1. ---
    packet.append(giftag(1, 1, 0, 9, int("E" * 9, 16)))
    for vert_x, vert_y in ((80, 80), (240, 80), (80, 240)):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(7, 0)),               # uniform UV in the right (color2) half
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, 0x6000)),
        ]
    return packet

# Bake the Ch327b 16x16 spill/reload fixture: build the GIF payload, guard its
# length, build the matching bootlet, then write both .mem files.
ts16_demo_payload = build_tile_spill_16x16_demo_payload()
ts16_demo_qwc     = len(ts16_demo_payload)
# Size guard against the splicer. NOTE: a plain `assert` is stripped when
# Python runs with -O, so this only fires in a normal (non-optimised) run.
assert ts16_demo_qwc <= 95, f"tile-spill-16x16 payload {ts16_demo_qwc} qwords may collide with splicer"
ts16_demo_bootlet = build_textured_demo_bootlet_disp(ts16_demo_qwc, TS16_DISPLAY1_HI, TS16_FBW)
# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_tile_spill16x16.mem", ts16_demo_bootlet,
    f"Ch327b 16x16 multi-tile spill/reload two-batch proof bootlet ({len(ts16_demo_bootlet)} words; "
    f"DISPLAY1=256x256; QWC={ts16_demo_qwc})"
)
write_payload_mem(
    "payload_tile_spill16x16.mem", ts16_demo_payload,
    f"Ch327b 16x16 spill/reload proof ({ts16_demo_qwc} qwords): P1 near(Z=0x8000,color1) + "
    f"P2 mid(Z=0x6000,color2), cross-seam, 16x16 grid / 256x256; depth survival in overlap tiles"
)


# Bake the Ch303 tile-local fixture (its builder and TILE_* constants are
# defined further up, before the Ch323..Ch327b spill fixtures): build the GIF
# payload, guard its length, build the bootlet, then write both .mem files.
tile_demo_payload = build_tile_demo_payload()
tile_demo_qwc     = len(tile_demo_payload)
# Size guard against the heartbeat splicer. NOTE: a plain `assert` is stripped
# when Python runs with -O, so this only fires in a normal (non-optimised) run.
assert tile_demo_qwc <= 95, f"tile payload {tile_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
tile_demo_bootlet = build_textured_demo_bootlet_disp(tile_demo_qwc, TILE_DISPLAY1_HI, TILE_FBW)

# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_tile.mem", tile_demo_bootlet,
    f"Ch303 TILE-LOCAL combined demo BIOS bootlet ({len(tile_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 16x16; QWC={tile_demo_qwc}"
)
write_payload_mem(
    "payload_tile.mem", tile_demo_payload,
    f"Ch303 TILE-LOCAL combined demo GIF payload ({tile_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); 8x8 translucent texture + combined TRI rendered to on-chip 16x16 tile"
)


# ---------------------------------------------------------------------------
# Ch304 — 2x2 MULTI-TILE demo. ONE combined TME+ABE+ZTE triangle spanning the
# 32x32 region (a 2x2 grid of 16x16 tiles), crossing BOTH tile seams (x=16 AND
# y=16). The renderer re-tests the triangle against each of the 4 tiles, clears+
# renders+flushes each independently; seams must be continuous (attributes are
# screen-space). Same clear/texture model as Ch303.
#   VRAM 16 KiB: FB 32 rows x 256 B/row = 0x0000..0x1FFF (32x32, FBW=1); texture
#   @ TBP0=32 (0x2000); Z is on-chip (tile_z), none in VRAM.
# Buffer geometry for the Ch304 2x2 multi-tile fixture.
TILE2_FBW = 1
TILE2_TBP0 = 0x2000 // 256             # = 32: texture at byte 0x2000, above the 32-row FB
TILE2_TBW = 1
TILE2_TEXW = 8
TILE2_TEXH = 8
TILE2_DISPLAY1_HI = 31 | (31 << 12)    # DW=31 (32 wide) low, DH=31 (32 tall) at bit 12
# Triangle spanning the 2x2 grid and crossing both x=16 and y=16, as
# (screen x, screen y, Z, texel u, texel v).
TILE2_V0 = (3, 3, 0x6000, 0, 0)        # top-left  (PASS)
TILE2_V1 = (28, 3, 0x6000, 7, 0)       # top-right (PASS)
TILE2_V2 = (16, 29, 0x2000, 3, 7)      # bottom    (FAIL)


def build_tile2x2_demo_payload():
    """Assemble the GIF qword list for the Ch304 2x2 multi-tile demo.

    Emission order:
      U1  texture upload: BITBLTBUF/TRXPOS/TRXREG/TRXDIR, then the image-data
          qwords for the TILE2_TEXW x TILE2_TEXH comb_texel() texture.
      U2  one combined TME+ABE+ZTE triangle (TILE2_V0..TILE2_V2), EOP:
          PRIM, FRAME_1, ALPHA_1, TEST_1, ZBUF_1, TEX0_1 (6) plus
          3 * (RGBAQ, UV, XYZ2) (9) = 15 A+D.

    Returns the list of qwords.
    """
    fb_frame = frame_1_psmct32(TILE2_FBW)
    tex0_reg = tex0_pack(TILE2_TBP0, TILE2_TBW, psm=0x00, tw=3, th=3)  # PSMCT32 8x8

    # U1: upload the 8x8 translucent texture.
    packet = [
        giftag(1, 0, 0, 4, int("E" * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TILE2_TBP0, TILE2_TBW, 0x00)),
        aplusd(R_TRXPOS, trxpos_pack(0, 0)),
        aplusd(R_TRXREG, trxreg_pack(TILE2_TEXW, TILE2_TEXH)),
        aplusd(R_TRXDIR, trxdir_pack(0)),
    ]
    # Image data: texel index t maps to (tx, ty) = (t % TEXW, t // TEXW).
    data_qwords = (TILE2_TEXW * TILE2_TEXH) // 4
    packet.append(giftag(data_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * data_qwords, 4):
        packed = 0
        for lane in range(4):
            row, col = divmod(first_texel + lane, TILE2_TEXW)
            packed |= (comb_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)

    # U2: state + triangle in a single 15 A+D packet, EOP.
    packet += [
        giftag(1, 1, 0, 15, int("E" * 15, 16)),
        aplusd(R_PRIM, prim_tri_tme_abe()),
        aplusd(R_FRAME_1, fb_frame),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),
        aplusd(R_TEST_1, test1_geq()),
        aplusd(R_ZBUF_1, zbuf1_pack(2)),               # ignored in tile mode
        aplusd(R_TEX0_1, tex0_reg),
    ]
    for vert_x, vert_y, vert_z, tex_u, tex_v in (TILE2_V0, TILE2_V1, TILE2_V2):
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV, uv_data(tex_u, tex_v)),
            aplusd(R_XYZ2, xyz2_dataz(vert_x, vert_y, vert_z)),
        ]
    return packet


# Bake the Ch304 2x2 multi-tile fixture: build the GIF payload, guard its
# length, build the matching bootlet, then write both .mem files.
tile2_demo_payload = build_tile2x2_demo_payload()
tile2_demo_qwc     = len(tile2_demo_payload)
# Size guard against the heartbeat splicer. NOTE: a plain `assert` is stripped
# when Python runs with -O, so this only fires in a normal (non-optimised) run.
assert tile2_demo_qwc <= 95, f"tile2x2 payload {tile2_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
tile2_demo_bootlet = build_textured_demo_bootlet_disp(tile2_demo_qwc, TILE2_DISPLAY1_HI, TILE2_FBW)

# The third argument of each writer is the descriptive text for that .mem file.
write_bios_mem(
    "bios_tile2x2.mem", tile2_demo_bootlet,
    f"Ch304 2x2 MULTI-TILE demo BIOS bootlet ({len(tile2_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={tile2_demo_qwc}"
)
write_payload_mem(
    "payload_tile2x2.mem", tile2_demo_payload,
    f"Ch304 2x2 MULTI-TILE demo GIF payload ({tile2_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); 8x8 texture + combined TRI spanning 2x2 tiles (crosses x=16,y=16 seams)"
)


# ---------------------------------------------------------------------------
# Ch305 — MULTI-PRIMITIVE tiled scene over the 2x2 grid. A fixed LIST of 3
# combined TME+ABE+ZTE primitives, all spanning the 32x32 region (2x2 grid of
# 16x16 on-chip tiles), re-rendered per tile IN ORDER so later primitives
# depth-test / alpha-blend over earlier ones within each tile. Proves draw
# order + depth + alpha interactions across tile seams.
#
#   Clear      : green (TILE_CLEAR_COLOR), Z=0x4000 (gs_stub params).
#   P0 (blue)  : opaque (texel A=0x80 -> source-over == Cs), FLAT Z=0x5000.
#                Big background triangle; blue replaces green where inside.
#   P1 (red)   : opaque (A=0x80), FLAT Z=0x6000 (always in front of P0).
#                Smaller triangle inside P0 -> red over blue.
#   P2 (white) : translucent (A=0x40 -> 50% blend), FLAT Z=0x5800.
#                Larger triangle: passes vs blue(0x5000)+green(0x4000) -> blends
#                light-blue / light-green; FAILS vs red(0x6000) -> red shows
#                through (occluded). Drawn last -> blends over P0/clear.
#
# FLAT Z per primitive (all 3 verts equal) => no within-triangle Z gradient, so
# every depth decision is a fixed primitive-Z vs stored-Z compare -> the TB's
# software reference is an exact integer Z-buffer + source-over replay (only
# triangle EDGES are fuzzy, skipped via barycentric margins).
#
#   VRAM 16 KiB: 32-row FB @ 0..0x1FFF (FBW=1, 256 B/row); 3 solid 4x4 PSMCT32
#   textures @ TBP0=32/36/40 (0x2000/0x2400/0x2800). With TBW=1 the texel-row
#   stride is 64 texels = 0x100 B, so a 4-tall texture OCCUPIES 4 strided rows
#   = 0x400 B (the upload writes, and the sampler reads, rows at base+0/100/200/
#   300). The triangles' interpolated v reaches ~3, so the full 0x400 span IS
#   sampled — the textures MUST be spaced >=0x400 apart or a later upload's
#   base-row overwrites an earlier texture's row 1/2/3 and the sampler reads a
#   neighbouring primitive's colour (the depth/blend path is correct; this is
#   purely a VRAM-layout collision). 4 TBP0 units = 0x400; all fit under VRAM
#   0x4000 (FB ends 0x2000). Z on-chip (tile_z).
# Buffer geometry for the Ch305 multi-primitive fixture.
TMP_FBW = 1
TMP_TEXW = 4
TMP_TEXH = 4
# Three solid textures spaced 4 TBP0 units (0x400 bytes) apart.
TMP_TBP_BG = 0x2000 // 256        # = 32: blue  @ 0x2000 (rows 0x2000..0x23FF)
TMP_TBP_MID = 0x2400 // 256       # = 36: red   @ 0x2400 (rows 0x2400..0x27FF)
TMP_TBP_FG = 0x2800 // 256        # = 40: white @ 0x2800 (rows 0x2800..0x2BFF)
TMP_TBW = 1
TMP_DISPLAY1_HI = 31 | (31 << 12)   # DW=31 (32 wide) low, DH=31 (32 tall) at bit 12

# Vertex lists hold (screen x, screen y, Z); Z is flat within each primitive.
# UV is irrelevant for solid textures but kept in-range [0..3].
# All three triangles cross BOTH tile seams (x=16, y=16). P1/P2 are narrower
# than P0 so P0's flanks stay pure blue. Regions (verified vs the TB's SW
# reference): blue 24 (opaque bg) / red 29 (P1 over P0) / light-blue 26 (P2
# blend over blue) / occlusion 19 (P2 FAILS vs red -> red shows) / green
# control 416.
TMP_P0 = [(x, y, 0x5000) for (x, y) in ((1, 1), (30, 1), (15, 30))]   # blue  bg    (far)
TMP_P1 = [(x, y, 0x6000) for (x, y) in ((8, 3), (22, 3), (15, 20))]   # red   front (near)
TMP_P2 = [(x, y, 0x5800) for (x, y) in ((7, 9), (23, 9), (15, 28))]   # white blend (mid)
TMP_UV = [(0, 0), (3, 0), (0, 3)]


def solid_texel(r, g, b, a):
    """Pack four 8-bit channels into one PSMCT32 ABGR word.

    Layout is A in bits 31:24, B in 23:16, G in 15:8 and R in 7:0. Each channel
    is masked to 8 bits before packing.
    """
    packed = 0
    # Shift in from the most significant byte down: A, then B, G, R.
    for channel in (a, b, g, r):
        packed = (packed << 8) | (channel & 0xFF)
    return packed


def tmp_texture_upload(tbp0, texel):
    """Build the GIF qwords that upload a 4x4 solid-colour PSMCT32 texture.

    The packet is a 4-register A+D header (BITBLTBUF, TRXPOS, TRXREG, TRXDIR)
    followed by an IMAGE tag and the texel data (linear, PSMCT32_SWIZZLE=0).

    tbp0  -- destination texture base pointer handed to bitbltbuf_pack.
    texel -- 32-bit colour word replicated into every texel.
    Returns the list of qwords.
    """
    packet = [
        giftag(1, 0, 0, 4, 0xEEEE),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(tbp0, TMP_TBW, 0x00)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(TMP_TEXW, TMP_TEXH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
    ]
    # Four 32-bit texels per qword.
    image_qwords = (TMP_TEXW * TMP_TEXH) // 4
    packet.append(giftag(image_qwords, 0, 2, 0, 0))
    # Solid colour: every lane of every image qword carries the same texel.
    lane_value = texel & 0xFFFFFFFF
    solid_qword = sum(lane_value << (32 * lane) for lane in range(4))
    packet.extend([solid_qword] * image_qwords)
    return packet


def tmp_triangle(tbp0, verts, eop, first, frame_val=None, alpha_val=None):
    """Build one combined TME+ABE+ZTE triangle as a single A+D GIF packet.

    Every call re-issues PRIM/ALPHA_1/TEST_1/ZBUF_1/TEX0_1 so draw order does
    not depend on GS state persisting between primitives; FRAME_1 is emitted
    only when `first` is true.

    tbp0      -- texture base pointer for TEX0_1 (4x4 PSMCT32, TBW=TMP_TBW).
    verts     -- three (screen x, y, Z) tuples; UVs come from TMP_UV by index.
    eop       -- end-of-packet flag for the GIF tag (set on the last triangle).
    first     -- true for the first triangle of the packet (adds FRAME_1).
    frame_val -- FRAME_1 value; defaults to frame_1_psmct32(TMP_FBW).
    alpha_val -- ALPHA_1 value; defaults to source-over, alpha_pack(0, 1, 0, 1).
    Returns the list of qwords.
    """
    frame_val = frame_1_psmct32(TMP_FBW) if frame_val is None else frame_val
    alpha_val = alpha_pack(0, 1, 0, 1) if alpha_val is None else alpha_val   # source-over
    tex0_val = tex0_pack(tbp0, TMP_TBW, psm=0x00, tw=2, th=2)   # 4x4 PSMCT32

    # State registers: PRIM, ALPHA_1, TEST_1, ZBUF_1, TEX0_1 (5), plus FRAME_1 on
    # the first triangle; then 3 vertices x (RGBAQ, UV, XYZ2) = 9.
    state_regs = 6 if first else 5
    nreg = state_regs + 9
    packet = [giftag(1, eop, 0, nreg, int('E' * nreg, 16))]   # nreg A+D (0xE) descriptors
    if first:
        packet.append(aplusd(R_FRAME_1, frame_val))
    packet.extend([
        aplusd(R_PRIM,    prim_tri_tme_abe()),
        aplusd(R_ALPHA_1, alpha_val),        # blend mode (default source-over)
        aplusd(R_TEST_1,  test1_geq()),      # ZTE GEQUAL
        aplusd(R_ZBUF_1,  zbuf1_pack(2)),    # ignored in tile mode
        aplusd(R_TEX0_1,  tex0_val),
    ])
    for vtx_idx, vertex in enumerate(verts):
        sx, sy, sz = vertex
        tu, tv = TMP_UV[vtx_idx]
        packet.extend([
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),   # color from texel (DECAL)
            aplusd(R_UV,    uv_data(tu, tv)),
            aplusd(R_XYZ2,  xyz2_dataz(sx, sy, sz)),
        ])
    return packet


def build_tile_multiprim_demo_payload():
    """Assemble the Ch305 payload: three texture uploads, then three triangles.

    Returns the flat list of GIF qwords; only the last triangle carries EOP.
    """
    textures = (
        (TMP_TBP_BG,  (0x00, 0x00, 0xFF, 0x80)),   # opaque blue
        (TMP_TBP_MID, (0xFF, 0x00, 0x00, 0x80)),   # opaque red
        (TMP_TBP_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # translucent white
    )
    triangles = (
        (TMP_TBP_BG,  TMP_P0, 0, True),
        (TMP_TBP_MID, TMP_P1, 0, False),
        (TMP_TBP_FG,  TMP_P2, 1, False),
    )
    payload = []
    for tbp0, rgba in textures:
        payload.extend(tmp_texture_upload(tbp0, solid_texel(*rgba)))
    for tbp0, verts, eop, first in triangles:
        payload.extend(tmp_triangle(tbp0, verts, eop=eop, first=first))
    return payload


# Bake the Ch305 fixture pair: build the GIF payload first, because the bootlet
# is built from the payload's qword count.
tmp_demo_payload = build_tile_multiprim_demo_payload()
tmp_demo_qwc     = len(tmp_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert tmp_demo_qwc <= 95, f"tile_multiprim payload {tmp_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
tmp_demo_bootlet = build_textured_demo_bootlet_disp(tmp_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files. The third argument is a free-text description
# (presumably written out as a header comment -- confirm in write_*_mem).
write_bios_mem(
    "bios_tile_multiprim.mem", tmp_demo_bootlet,
    f"Ch305 MULTI-PRIMITIVE tiled-scene BIOS bootlet ({len(tmp_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={tmp_demo_qwc}"
)
write_payload_mem(
    "payload_tile_multiprim.mem", tmp_demo_payload,
    f"Ch305 MULTI-PRIMITIVE tiled-scene GIF payload ({tmp_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); 3 solid 4x4 textures + 3 combined tris (blue bg / red front / "
    f"white blend) rendered as a LIST per tile over a 2x2 grid"
)


# ---------------------------------------------------------------------------
# Ch306 — GS SCISSOR clipping in the tiled renderer. Same 3-primitive scene as
# Ch305 (multi-primitive 2x2 grid) PLUS a SCISSOR_1 rectangle that clips the
# whole scene. Effective raster bounds = primitive bbox INTERSECT tile bbox
# INTERSECT scissor rect. Pixels outside the scissor stay the CLEAR color (the
# walker never visits them -> no color/Z write). The rect crosses BOTH tile
# seams (x=16, y=16) so the clip is proven across tiles.
#   Scissor (inclusive) = [SCAX0..SCAX1] x [SCAY0..SCAY1] = [9..22] x [6..20].
#   Crosses x=16 (9<16<22) and y=16 (6<16<20); clips a FAT chunk of the scene
#   (the wide blue band/top, the sides, and the apex) so the clip is substantial
#   on every side while keeping the red/blend interactions inside. (Needs
#   SCISSOR_ENABLE=1 on the core.)
# Inclusive scissor rectangle [SX0..SX1] x [SY0..SY1] = [9..22] x [6..20].
TSC_SX0 = 9
TSC_SX1 = 22
TSC_SY0 = 6
TSC_SY1 = 20


def build_tile_scissor_demo_payload():
    """Assemble the Ch306 payload: the Ch305 scene plus one SCISSOR_1 write.

    Order is texture uploads, then a 1-register A+D packet setting SCISSOR_1,
    then the three triangles. Returns the flat list of GIF qwords.
    """
    textures = (
        (TMP_TBP_BG,  (0x00, 0x00, 0xFF, 0x80)),   # opaque blue
        (TMP_TBP_MID, (0xFF, 0x00, 0x00, 0x80)),   # opaque red
        (TMP_TBP_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # translucent white
    )
    triangles = (
        (TMP_TBP_BG,  TMP_P0, 0, True),
        (TMP_TBP_MID, TMP_P1, 0, False),
        (TMP_TBP_FG,  TMP_P2, 1, False),
    )
    payload = []
    for tbp0, rgba in textures:
        payload.extend(tmp_texture_upload(tbp0, solid_texel(*rgba)))
    # SCISSOR_1 is GS state: set it once (its own 1-A+D GIF tag) before the prims.
    payload.extend([
        giftag(1, 0, 0, 1, int('E', 16)),
        aplusd(R_SCISSOR_1, scissor_pack(TSC_SX0, TSC_SX1, TSC_SY0, TSC_SY1)),
    ])
    for tbp0, verts, eop, first in triangles:
        payload.extend(tmp_triangle(tbp0, verts, eop=eop, first=first))
    return payload


# Bake the Ch306 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
tsc_demo_payload = build_tile_scissor_demo_payload()
tsc_demo_qwc     = len(tsc_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert tsc_demo_qwc <= 95, f"tile_scissor payload {tsc_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# Same 32x32 display geometry as Ch305 (TMP_DISPLAY1_HI / TMP_FBW).
tsc_demo_bootlet = build_textured_demo_bootlet_disp(tsc_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files; the description string records the scissor rectangle.
write_bios_mem(
    "bios_tile_scissor.mem", tsc_demo_bootlet,
    f"Ch306 SCISSOR-clipped tiled-scene BIOS bootlet ({len(tsc_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={tsc_demo_qwc}"
)
write_payload_mem(
    "payload_tile_scissor.mem", tsc_demo_payload,
    f"Ch306 SCISSOR-clipped tiled-scene GIF payload ({tsc_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); Ch305 3-prim scene + SCISSOR_1 [{TSC_SX0}..{TSC_SX1}]x[{TSC_SY0}..{TSC_SY1}]"
)


# ---------------------------------------------------------------------------
# Ch307 — GS texture WRAP MODES (REPEAT + CLAMP). Two textured combined prims
# over the tiled path, sampling the SAME striped texture with UV running 0..8
# (2x the 4-wide texture) so out-of-range U exercises the wrap mode:
#   Texture 4x4 PSMCT32 (opaque A=0x80): columns u=0,1 WHITE, u=2,3 BLUE.
#   REPEAT prim (top band):  CLAMP_1 WMS=WMT=0 -> white stripe at u=0 AND u=4
#                            (texture tiles) -> TWO white stripes.
#   CLAMP  prim (bottom band): CLAMP_1 WMS=WMT=1 -> white at u=0..1 only, u>=2
#                            sticks to the blue edge -> ONE white stripe.
#   Both span x2..29 (cross the x=16 tile seam); flat Z=0x5000 (opaque over the
#   green clear). Distinguishing region u in [4,6): REPEAT white, CLAMP blue.
#   (Needs TEX_WRAP_ENABLE=1 on the core.)
TWR_TBP0, TWR_TBW = 32, 1      # texture @ 0x2000 (above the 32-row FB)
TWR_TEXW = TWR_TEXH = 4
TWR_Z = 0x0000_5000
WHITE_TEX = solid_texel(0xFF, 0xFF, 0xFF, 0x80)   # ABGR opaque white
BLUE_TEX  = solid_texel(0x00, 0x00, 0xFF, 0x80)   # ABGR opaque blue
# Both prims are the same right-triangle shape with u running 0->8 along x
# (v_tex=1 constant); only the band differs.
# REPEAT prim: top band.
TWR_RV  = [(x, y, TWR_Z) for x, y in ((2, 4), (29, 4), (2, 15))]
TWR_RUV = [(0, 1), (8, 1), (0, 1)]
# CLAMP prim: bottom band.
TWR_CV  = [(x, y, TWR_Z) for x, y in ((2, 17), (29, 17), (2, 28))]
TWR_CUV = [(0, 1), (8, 1), (0, 1)]


def wrap_texel(tx, ty):
    """Return the striped-texture colour at texel (tx, ty).

    Columns 0 and 1 are WHITE_TEX, every other column is BLUE_TEX; ty is
    accepted for a uniform signature but does not affect the result.
    """
    if tx <= 1:
        return WHITE_TEX
    return BLUE_TEX


def twr_texture_upload(tbp0):
    """Build the GIF qwords that upload the striped 4x4 PSMCT32 wrap texture.

    tbp0 -- destination texture base pointer handed to bitbltbuf_pack.
    Returns the A+D transfer header, the IMAGE tag and the texel qwords.
    """
    packet = [
        giftag(1, 0, 0, 4, int('E' * 4, 16)),   # 4 A+D regs: BITBLTBUF,TRXPOS,TRXREG,TRXDIR
        aplusd(R_BITBLTBUF, bitbltbuf_pack(tbp0, TWR_TBW, 0x00)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(TWR_TEXW, TWR_TEXH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
    ]
    image_qwords = (TWR_TEXW * TWR_TEXH) // 4   # four 32-bit texels per qword
    packet.append(giftag(image_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * image_qwords, 4):
        packed = 0
        # Texels are laid out row-major; lane 0 is the lowest 32 bits.
        for lane, texel_idx in enumerate(range(first_texel, first_texel + 4)):
            ty, tx = divmod(texel_idx, TWR_TEXW)
            packed |= (wrap_texel(tx, ty) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)
    return packet


def twr_triangle(verts, uv3, wms, wmt, eop, first):
    """Build one textured triangle that also sets CLAMP_1 (the wrap mode under test).

    verts    -- three (screen x, y, Z) tuples.
    uv3      -- three (u, v) tuples, indexed in step with verts.
    wms, wmt -- wrap-mode values forwarded to clamp_pack.
    eop      -- end-of-packet flag for the GIF tag.
    first    -- true for the first triangle (adds FRAME_1).
    Returns the list of qwords.
    """
    tex0_val = tex0_pack(TWR_TBP0, TWR_TBW, psm=0x00, tw=2, th=2)   # 4x4 PSMCT32
    # State registers: PRIM,ALPHA,TEST,ZBUF,TEX0,CLAMP (6), plus FRAME on the
    # first triangle (7); then 9 vertex registers.
    # NOTE(review): first=True gives nreg=16 -- presumably giftag encodes that
    # correctly; confirm against giftag's NREG handling.
    nreg = (7 if first else 6) + 9
    packet = [giftag(1, eop, 0, nreg, int('E' * nreg, 16))]
    if first:
        packet.append(aplusd(R_FRAME_1, frame_1_psmct32(TMP_FBW)))
    packet.extend([
        aplusd(R_PRIM,     prim_tri_tme_abe()),
        aplusd(R_ALPHA_1,  alpha_pack(0, 1, 0, 1)),   # source-over (A=0x80 -> opaque)
        aplusd(R_TEST_1,   test1_geq()),
        aplusd(R_ZBUF_1,   zbuf1_pack(2)),            # ignored in tile mode
        aplusd(R_TEX0_1,   tex0_val),
        aplusd(R_CLAMP_1,  clamp_pack(wms, wmt)),     # the wrap mode under test
    ])
    for vtx_idx, vertex in enumerate(verts):
        sx, sy, sz = vertex
        tu, tv = uv3[vtx_idx]
        packet.extend([
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),   # color from texel (DECAL)
            aplusd(R_UV,    uv_data(tu, tv)),
            aplusd(R_XYZ2,  xyz2_dataz(sx, sy, sz)),
        ])
    return packet


def build_tile_wrap_demo_payload():
    """Assemble the Ch307 payload: one texture upload, then the two wrap-mode tris.

    The first triangle uses wms=wmt=0 (REPEAT) and the second wms=wmt=1 (CLAMP,
    carries EOP). Returns the flat list of GIF qwords.
    """
    sections = (
        twr_texture_upload(TWR_TBP0),
        twr_triangle(TWR_RV, TWR_RUV, wms=0, wmt=0, eop=0, first=True),    # REPEAT
        twr_triangle(TWR_CV, TWR_CUV, wms=1, wmt=1, eop=1, first=False),   # CLAMP
    )
    payload = []
    for section in sections:
        payload.extend(section)
    return payload


# Bake the Ch307 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
twr_demo_payload = build_tile_wrap_demo_payload()
twr_demo_qwc     = len(twr_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert twr_demo_qwc <= 95, f"tile_wrap payload {twr_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# Same 32x32 display geometry as Ch305 (TMP_DISPLAY1_HI / TMP_FBW).
twr_demo_bootlet = build_textured_demo_bootlet_disp(twr_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files with their free-text descriptions.
write_bios_mem(
    "bios_tile_wrap.mem", twr_demo_bootlet,
    f"Ch307 texture WRAP (repeat+clamp) BIOS bootlet ({len(twr_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={twr_demo_qwc}"
)
write_payload_mem(
    "payload_tile_wrap.mem", twr_demo_payload,
    f"Ch307 texture WRAP GIF payload ({twr_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); striped 4x4 tex, REPEAT tri (top, 2 stripes) + CLAMP tri (bottom, 1 stripe)"
)


# ---------------------------------------------------------------------------
# Ch308 — PSMCT16 tile color buffer. The SAME Ch305 3-primitive scene, but the
# on-chip tile color RAM + the flushed framebuffer are PSMCT16 (RGB5A1, 16-bit)
# instead of PSMCT32. FRAME_1.PSM stays PSMCT32 in the payload (the combined
# tile path is gated on a PSMCT32 dest); TILE_COLOR_PSMCT16=1 makes the GS flush
# write 16-bit lanes, and DISPFB1.PSM=0x02 makes the PCRTC scanout unpack
# RGB5A1->ABGR (5-bit quantized). Proves tile color can be narrower than 32-bit
# when the frame format allows it.
# (Needs TILE_COLOR_PSMCT16=1 on the core.) VRAM 16 KiB: PSMCT16 32x32 FB =
# 0..0x7FF; PSMCT32 textures @ 0x2000/0x2400/0x2800 (Ch305 spacing) — no overlap.


def build_psmct16_demo_bootlet_disp(qwc, display1_hi, fbw):
    """Build the one-shot bootlet variant whose DISPFB1 selects PSMCT16 scanout.

    qwc         -- payload length in qwords; must fit the 16-bit ORI immediate.
    display1_hi -- 32-bit value stored at offset 0x84 from the 0x1200_0000 base
                   (the upper DISPLAY1 word); offset 0x80 is zeroed.
    fbw         -- framebuffer width forwarded to dispfb1_psmct16.
    Returns the list of encoded instruction words, ending in SYSCALL.
    """
    disp_upper = (display1_hi >> 16) & 0xFFFF
    disp_lower = display1_hi & 0xFFFF
    dispfb1_val = dispfb1_psmct16(fbw)
    # PSMCT16 DISPFB1 PSM field is at bit 15+, so the value exceeds 16 bits and
    # needs a full LUI+ORI load (not ORI-only like the PSMCT32 path).
    fb_upper = (dispfb1_val >> 16) & 0xFFFF
    fb_lower = dispfb1_val & 0xFFFF
    assert qwc <= 0xFFFF

    # Display setup: $1 holds the 0x1200_0000 base, $2 is the scratch value.
    display_setup = [
        enc_lui(1, 0x1200),
        enc_lui(2, fb_upper),
        enc_ori(2, 2, fb_lower),
        enc_sw(2, 1, 0x0070),        # DISPFB1 value
        enc_sw(0, 1, 0x0080),        # zero the word at +0x80
        enc_lui(2, disp_upper),
        enc_ori(2, 2, disp_lower),
        enc_sw(2, 1, 0x0084),        # display1_hi
        enc_ori(2, 0, 0x0001),
        enc_sw(2, 1, 0x0000),        # write 1 at +0x00 (presumably the PMODE enable -- confirm)
    ]
    # DMAC channel-2 kickoff: $10 = 0x1000_A000, then MADR / QWC / CHCR.
    dma_kickoff = [
        enc_lui(10, 0x1000),
        enc_ori(10, 10, 0xA000),
        enc_ori(11, 0, 0x0100),
        enc_sw(11, 10, 0x0010),      # MADR = 0x100
        enc_ori(11, 0, qwc),
        enc_sw(11, 10, 0x0020),      # QWC
        enc_ori(11, 0, 0x0001),
        enc_sw(11, 10, 0x0000),      # CHCR = START
    ]
    return display_setup + dma_kickoff + [enc_syscall()]


def build_tile_psmct16_demo_payload():
    """Assemble the Ch308 payload: the Ch305 scene with FRAME_1 left at PSMCT32.

    FRAME_1.PSM stays PSMCT32: this core's COMBINED tile path is gated on a
    PSMCT32 dest (frame_1_q[29:24]==0), so a PSMCT16 FRAME would disqualify the
    primitives from the combined classification. The PSMCT16-ness lives in the
    on-chip tile RAM + the flush (both gated by TILE_COLOR_PSMCT16, independent
    of FRAME.PSM) and in DISPFB1.PSM=PSMCT16 (so scanout unpacks the flushed
    RGB5A1). Returns the flat list of GIF qwords.
    """
    textures = (
        (TMP_TBP_BG,  (0x00, 0x00, 0xFF, 0x80)),   # opaque blue
        (TMP_TBP_MID, (0xFF, 0x00, 0x00, 0x80)),   # opaque red
        (TMP_TBP_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # translucent white
    )
    triangles = (
        (TMP_TBP_BG,  TMP_P0, 0, True),    # FRAME PSMCT32 (combined eligibility)
        (TMP_TBP_MID, TMP_P1, 0, False),
        (TMP_TBP_FG,  TMP_P2, 1, False),
    )
    payload = []
    for tbp0, rgba in textures:
        payload.extend(tmp_texture_upload(tbp0, solid_texel(*rgba)))
    for tbp0, verts, eop, first in triangles:
        payload.extend(tmp_triangle(tbp0, verts, eop=eop, first=first))
    return payload


# Bake the Ch308 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
t16_demo_payload = build_tile_psmct16_demo_payload()
t16_demo_qwc     = len(t16_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert t16_demo_qwc <= 95, f"tile_psmct16 payload {t16_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# PSMCT16-specific bootlet: DISPFB1 selects PSMCT16 scanout.
t16_demo_bootlet = build_psmct16_demo_bootlet_disp(t16_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

write_bios_mem(
    "bios_tile_psmct16.mem", t16_demo_bootlet,
    f"Ch308 PSMCT16 tile-color BIOS bootlet ({len(t16_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32 PSMCT16; QWC={t16_demo_qwc}"
)
# The description states FRAME_1.PSM=PSMCT32 because the payload builder keeps
# FRAME_1 at PSMCT32 for combined-path eligibility; the 16-bit format comes from
# the tile RAM/flush and from DISPFB1, not from FRAME_1.
write_payload_mem(
    "payload_tile_psmct16.mem", t16_demo_payload,
    f"Ch308 PSMCT16 tile-color GIF payload ({t16_demo_qwc} qwords active at byte 0x100, "
    f"padded to {RAM_TOTAL_QWORDS}); Ch305 3-prim scene, FRAME_1.PSM=PSMCT32 (combined path), "
    f"16-bit tile color/flush + DISPFB1.PSM=PSMCT16 scanout"
)


# ---------------------------------------------------------------------------
# Ch309 — GS ALPHA mode expansion. The Ch305 3-primitive scene, but P1 uses the
# ADDITIVE blend mode instead of source-over, so two visibly-different blend
# equations coexist:
#   P0 (blue bg)      : source-over A=Cs,B=Cd,C=As,D=Cd (opaque, As=0x80).
#   P1 (red, additive): A=Cs,B=0,C=FIX(0x80),D=Cd -> Cv=Cs+Cd (clamped). Over the
#                       blue bg this BRIGHTENS to magenta (255,0,255); a glow/particle
#                       style add. In front (Z=0x6000).
#   P2 (white, src-over translucent A=0x40): blends light-blue over the blue bg
#                       where it passes depth (occluded by P1's nearer Z).
# Proves additive + FIX participate, source-over unchanged, depth-fail suppresses.
# (Needs ALPHA_MODES_ENABLE=1 on the core.)
# ALPHA_1 value for the additive primitive; handed to tmp_triangle through its
# alpha_val argument in place of the default source-over value.
ALPHA_ADDITIVE = alpha_pack(0, 2, 2, 1, fix=0x80)   # A=Cs B=0 C=FIX D=Cd, FIX=0x80


def build_tile_alpha_demo_payload():
    """Assemble the Ch309 payload: the Ch305 scene with P1 drawn additively.

    P0 and P2 use tmp_triangle's default source-over ALPHA_1; P1 passes
    ALPHA_ADDITIVE. Returns the flat list of GIF qwords.
    """
    textures = (
        (TMP_TBP_BG,  (0x00, 0x00, 0xFF, 0x80)),   # opaque blue
        (TMP_TBP_MID, (0xFF, 0x00, 0x00, 0x80)),   # red (additive src)
        (TMP_TBP_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # translucent white
    )
    # alpha_val=None selects tmp_triangle's source-over default.
    triangles = (
        (TMP_TBP_BG,  TMP_P0, 0, True,  None),             # source-over
        (TMP_TBP_MID, TMP_P1, 0, False, ALPHA_ADDITIVE),   # ADDITIVE
        (TMP_TBP_FG,  TMP_P2, 1, False, None),             # source-over
    )
    payload = []
    for tbp0, rgba in textures:
        payload.extend(tmp_texture_upload(tbp0, solid_texel(*rgba)))
    for tbp0, verts, eop, first, alpha_val in triangles:
        payload.extend(tmp_triangle(tbp0, verts, eop=eop, first=first, alpha_val=alpha_val))
    return payload


# Bake the Ch309 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
tal_demo_payload = build_tile_alpha_demo_payload()
tal_demo_qwc     = len(tal_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert tal_demo_qwc <= 95, f"tile_alpha payload {tal_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# Same 32x32 display geometry as Ch305 (TMP_DISPLAY1_HI / TMP_FBW).
tal_demo_bootlet = build_textured_demo_bootlet_disp(tal_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files with their free-text descriptions.
write_bios_mem(
    "bios_tile_alpha.mem", tal_demo_bootlet,
    f"Ch309 ALPHA-mode (additive) BIOS bootlet ({len(tal_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={tal_demo_qwc}"
)
write_payload_mem(
    "payload_tile_alpha.mem", tal_demo_payload,
    f"Ch309 ALPHA-mode GIF payload ({tal_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); Ch305 scene with P1 ADDITIVE (FIX=0x80) + P0/P2 source-over"
)


# ---------------------------------------------------------------------------
# Ch310 — BILINEAR filtering in the combined tile path. Two textured triangles
# sampling the SAME 4x4 CHECKER texture, MAGNIFIED (UV 0..4 over a ~12px-wide
# triangle, ~3 px/texel, so the affine interp produces fractional U/V), with
# different TEX1.MMAG:
#   LEFT  tri (TEX1.MMAG=0, NEAREST):  blocky checker (only the 2 texel colors).
#   RIGHT tri (TEX1.MMAG=1, LINEAR):   smoothed checker (midtone gradients at
#                                      texel boundaries -> bilinear visible).
# Checker = BLUE (0,0,255) / WHITE (255,255,255), opaque A=0x80 (source-over ->
# texel shown). (Needs BILINEAR_ENABLE=1 on the core.)
BIL_TBP0, BIL_TBW = 32, 1
BIL_TW = BIL_TH = 4
BIL_CB = solid_texel(0x00, 0x00, 0xFF, 0x80)   # blue
BIL_CW = solid_texel(0xFF, 0xFF, 0xFF, 0x80)   # white
BIL_Z  = 0x0000_5000
# magnified: UV 0..4 across each ~12-wide / ~22-tall triangle.
BIL_LV = [(x, y, BIL_Z) for x, y in ((2, 4), (14, 4), (2, 26))]   # left  (nearest)
# The right triangle is the left one shifted 16 px in x.
BIL_RV = [(x + 16, y, z) for x, y, z in BIL_LV]                   # right (bilinear)
BIL_UV = [(0, 0), (4, 0), (0, 4)]


def bil_checker_texel(tx, ty):
    """Return the checker colour at texel (tx, ty).

    BIL_CW when tx + ty is odd, BIL_CB when it is even.
    """
    if (tx + ty) & 1:
        return BIL_CW
    return BIL_CB


def bil_texture_upload(tbp0):
    """Build the GIF qwords that upload the 4x4 PSMCT32 checker texture.

    tbp0 -- destination texture base pointer handed to bitbltbuf_pack.
    Returns the A+D transfer header, the IMAGE tag and the texel qwords.
    """
    packet = [
        giftag(1, 0, 0, 4, int('E' * 4, 16)),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(tbp0, BIL_TBW, 0x00)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(BIL_TW, BIL_TH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
    ]
    image_qwords = (BIL_TW * BIL_TH) // 4   # four 32-bit texels per qword
    packet.append(giftag(image_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 4 * image_qwords, 4):
        packed = 0
        # Texels are laid out row-major; lane 0 is the lowest 32 bits.
        for lane, texel_idx in enumerate(range(first_texel, first_texel + 4)):
            ty, tx = divmod(texel_idx, BIL_TW)
            packed |= (bil_checker_texel(tx, ty) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)
    return packet


def bil_triangle(verts, mmag, eop, first):
    """Build one textured triangle that also sets TEX1_1 (the filter under test).

    verts -- three (screen x, y, Z) tuples; UVs come from BIL_UV by index.
    mmag  -- value forwarded to tex1_pack: NEAREST(0) vs LINEAR(1).
    eop   -- end-of-packet flag for the GIF tag.
    first -- true for the first triangle (adds FRAME_1).
    Returns the list of qwords.
    """
    tex0_val = tex0_pack(BIL_TBP0, BIL_TBW, psm=0x00, tw=2, th=2)   # 4x4 PSMCT32
    # A+D: PRIM,ALPHA,TEST,ZBUF,TEX0,TEX1 (6) [+FRAME on first] + 3*(RGBAQ,UV,XYZ2)=9
    # NOTE(review): first=True gives nreg=16 -- presumably giftag encodes that
    # correctly; confirm against giftag's NREG handling.
    if first:
        nreg = 7 + 9
    else:
        nreg = 6 + 9
    packet = [giftag(1, eop, 0, nreg, int('E' * nreg, 16))]
    if first:
        packet.append(aplusd(R_FRAME_1, frame_1_psmct32(TMP_FBW)))
    packet.extend([
        aplusd(R_PRIM,    prim_tri_tme_abe()),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),   # source-over (opaque texel)
        aplusd(R_TEST_1,  test1_geq()),
        aplusd(R_ZBUF_1,  zbuf1_pack(2)),
        aplusd(R_TEX0_1,  tex0_val),
        aplusd(R_TEX1_1,  tex1_pack(mmag)),          # NEAREST(0) vs LINEAR(1)
    ])
    for vtx_idx, vertex in enumerate(verts):
        sx, sy, sz = vertex
        tu, tv = BIL_UV[vtx_idx]
        packet.extend([
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV,    uv_data(tu, tv)),
            aplusd(R_XYZ2,  xyz2_dataz(sx, sy, sz)),
        ])
    return packet


def build_tile_bilinear_demo_payload():
    """Assemble the Ch310 payload: checker upload, then the nearest and linear tris.

    The left triangle uses mmag=0 and the right one mmag=1 (carries EOP).
    Returns the flat list of GIF qwords.
    """
    sections = (
        bil_texture_upload(BIL_TBP0),
        bil_triangle(BIL_LV, mmag=0, eop=0, first=True),    # LEFT nearest
        bil_triangle(BIL_RV, mmag=1, eop=1, first=False),   # RIGHT bilinear
    )
    payload = []
    for section in sections:
        payload.extend(section)
    return payload


# Bake the Ch310 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
bil_demo_payload = build_tile_bilinear_demo_payload()
bil_demo_qwc     = len(bil_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert bil_demo_qwc <= 95, f"tile_bilinear payload {bil_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# Same 32x32 display geometry as Ch305 (TMP_DISPLAY1_HI / TMP_FBW).
bil_demo_bootlet = build_textured_demo_bootlet_disp(bil_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files with their free-text descriptions.
write_bios_mem(
    "bios_tile_bilinear.mem", bil_demo_bootlet,
    f"Ch310 BILINEAR (nearest vs linear) BIOS bootlet ({len(bil_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={bil_demo_qwc}"
)
write_payload_mem(
    "payload_tile_bilinear.mem", bil_demo_payload,
    f"Ch310 BILINEAR GIF payload ({bil_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); 4x4 blue/white checker, LEFT tri TEX1.MMAG=0 nearest / RIGHT MMAG=1 bilinear, magnified"
)


# ---------------------------------------------------------------------------
# Ch314 — BILINEAR for a PALETTIZED (PSMT8 indexed) texture. The Ch310 two-triangle
# scene (LEFT MMAG=0 nearest, RIGHT MMAG=1 bilinear) but the texture is a 4x4 PSMT8
# INDEX checker (idx 0/1) + a CLUT (0=blue, 1=white). Each bilinear tap fetches an
# INDEX, CLUTs it to a color, then the 4 COLORS are interpolated (CLUT-before-interp)
# — so the right triangle shows interpolated blue<->white midtones, NOT blocky steps
# and NOT garbage from interpolating indices. VRAM (16 KiB): 32-row PSMCT32 FB @
# 0..0x1FFF (FBW=1, 256 B/row), PSMT8 index texture @ TBP0=32 (0x2000), CLUT @
# CBP=36 (0x2400).
PB_TBP0, PB_TBW = 32, 1        # index texture @ 0x2000 (above the framebuffer)
PB_TW = PB_TH = 4
PB_CBP = 36                    # CLUT @ 0x2400 (256-B units)
PB_Z = 0x0000_5000
PB_LV = [(x, y, PB_Z) for x, y in ((2, 4), (14, 4), (2, 26))]   # left  (nearest)
# The right triangle is the left one shifted 16 px in x.
PB_RV = [(x + 16, y, z) for x, y, z in PB_LV]                   # right (bilinear)
PB_UV = [(0, 0), (4, 0), (0, 4)]


def pb_palette(i):
    """Return CLUT entry i as a PSMCT32 word (A=0x80 for every entry).

    Entry 0 is blue and entry 1 is white (source-over over the green clear,
    mirroring the BIL checker colours); any other index is mid-grey
    (0x7F, 0x7F, 0x7F).
    """
    if i == 0:
        red, green, blue = 0x00, 0x00, 0xFF   # blue
    elif i == 1:
        red, green, blue = 0xFF, 0xFF, 0xFF   # white
    else:
        red, green, blue = 0x7F, 0x7F, 0x7F
    return solid_texel(red, green, blue, 0x80)


def pb_index(tx, ty):
    """Return the palette index (0 or 1) for texel (tx, ty) of the index checker."""
    # The parity of tx + ty equals the XOR of the two low bits.
    return (tx ^ ty) & 1


def pb_tex0(cld):
    """Build the TEX0 value for the 4x4 PSMT8 texture, including its CLUT fields.

    PSMT8 (0x13) 4x4 texture + CLUT-side fields so the TEX0 commit fires a
    VRAM->CLUT load when cld=1. Mirrors tex0_clut_pack but tw=th=2 (4x4).

    cld -- CLD field value (masked to 3 bits); 1 -> load CLUT on commit.
    """
    tex0_word = tex0_pack(PB_TBP0, PB_TBW, psm=0x13, tw=2, th=2)
    # (value, field mask, bit position) for each CLUT-side field.
    clut_fields = (
        (PB_CBP, 0x3FFF, 37),   # CBP
        (0,      0xF,    51),   # CPSM=PSMCT32
        (1,      0x1,    55),   # CSM
        (0,      0x1F,   56),   # CSA
        (cld,    0x7,    61),   # CLD: 1 -> load CLUT on commit
    )
    for value, mask, shift in clut_fields:
        tex0_word |= (value & mask) << shift
    return tex0_word


def pb_clut_upload():
    """Build the GIF qwords that BITBLT 8 PSMCT32 CLUT entries to VRAM[CBP*256].

    Only entries 0 and 1 are used by the texture. Returns the A+D transfer
    header, the IMAGE tag and the palette qwords.
    """
    entry_count = 8
    image_qwords = entry_count // 4   # four 32-bit entries per qword
    packet = [
        giftag(1, 0, 0, 4, 0xEEEE),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(PB_CBP, 1, 0)),   # DPSM=PSMCT32
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(entry_count, 1)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
        giftag(image_qwords, 0, 2, 0, 0),                    # IMAGE
    ]
    for first_entry in range(0, 4 * image_qwords, 4):
        packed = 0
        # Lane 0 (lowest 32 bits) holds the lowest-numbered entry of the qword.
        for lane, entry in enumerate(range(first_entry, first_entry + 4)):
            packed |= (pb_palette(entry) & 0xFFFFFFFF) << (32 * lane)
        packet.append(packed)
    return packet


def pb_index_upload():
    """Build the GIF qwords that BITBLT the 4x4 PSMT8 index texture to VRAM[TBP0*256].

    The 16 one-byte indices fit in a single image qword. Returns the A+D
    transfer header, the IMAGE tag and the index qword(s).
    """
    packet = [
        giftag(1, 0, 0, 4, 0xEEEE),
        aplusd(R_BITBLTBUF, bitbltbuf_pack(PB_TBP0, PB_TBW, 0x13)),   # DPSM=PSMT8
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(PB_TW, PB_TH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
    ]
    image_qwords = (PB_TW * PB_TH) // 16   # sixteen 8-bit indices per qword
    packet.append(giftag(image_qwords, 0, 2, 0, 0))
    for first_texel in range(0, 16 * image_qwords, 16):
        packed = 0
        # Indices are row-major; lane 0 is the lowest byte of the qword.
        for lane, texel_idx in enumerate(range(first_texel, first_texel + 16)):
            ty, tx = divmod(texel_idx, PB_TW)
            packed |= (pb_index(tx, ty) & 0xFF) << (8 * lane)
        packet.append(packed)
    return packet


def pb_triangle(verts, mmag, cld, eop, first):
    """Build one triangle sampling the PSMT8 texture through the CLUT.

    verts -- three (screen x, y, Z) tuples; UVs come from PB_UV by index.
    mmag  -- value forwarded to tex1_pack: NEAREST(0) vs LINEAR(1).
    cld   -- CLD value forwarded to pb_tex0 (1 requests a CLUT load).
    eop   -- end-of-packet flag for the GIF tag.
    first -- true for the first triangle (adds FRAME_1).
    Returns the list of qwords.
    """
    # PRIM,ALPHA,TEST,ZBUF,TEX0,TEX1 (6) [+FRAME on first] + 9 vertex registers.
    # NOTE(review): first=True gives nreg=16 -- presumably giftag encodes that
    # correctly; confirm against giftag's NREG handling.
    if first:
        nreg = 7 + 9
    else:
        nreg = 6 + 9
    packet = [giftag(1, eop, 0, nreg, int('E' * nreg, 16))]
    if first:
        packet.append(aplusd(R_FRAME_1, frame_1_psmct32(TMP_FBW)))
    packet.extend([
        aplusd(R_PRIM,    prim_tri_tme_abe()),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),   # source-over
        aplusd(R_TEST_1,  test1_geq()),
        aplusd(R_ZBUF_1,  zbuf1_pack(2)),
        aplusd(R_TEX0_1,  pb_tex0(cld)),             # PSMT8 + (first) CLUT load
        aplusd(R_TEX1_1,  tex1_pack(mmag)),          # NEAREST(0) vs LINEAR(1)
    ])
    for vtx_idx, vertex in enumerate(verts):
        sx, sy, sz = vertex
        tu, tv = PB_UV[vtx_idx]
        packet.extend([
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV,    uv_data(tu, tv)),
            aplusd(R_XYZ2,  xyz2_dataz(sx, sy, sz)),
        ])
    return packet


def build_tile_palbilinear_demo_payload():
    """Assemble the Ch314 payload: CLUT upload, index upload, then the two tris.

    The left triangle (mmag=0) passes cld=1 so its TEX0 write requests the CLUT
    load; the right triangle (mmag=1, EOP) passes cld=0. Returns the flat list
    of GIF qwords.
    """
    sections = (
        pb_clut_upload(),
        pb_index_upload(),
        pb_triangle(PB_LV, mmag=0, cld=1, eop=0, first=True),    # LEFT nearest (loads CLUT)
        pb_triangle(PB_RV, mmag=1, cld=0, eop=1, first=False),   # RIGHT bilinear
    )
    payload = []
    for section in sections:
        payload.extend(section)
    return payload


# Bake the Ch314 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
pb_demo_payload = build_tile_palbilinear_demo_payload()
pb_demo_qwc     = len(pb_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert pb_demo_qwc <= 95, f"tile_palbilinear payload {pb_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# Same 32x32 display geometry as Ch305 (TMP_DISPLAY1_HI / TMP_FBW).
pb_demo_bootlet = build_textured_demo_bootlet_disp(pb_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files with their free-text descriptions.
write_bios_mem(
    "bios_tile_palbilinear.mem", pb_demo_bootlet,
    f"Ch314 PALETTIZED BILINEAR BIOS bootlet ({len(pb_demo_bootlet)} words active, "
    f"padded to {BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={pb_demo_qwc}"
)
write_payload_mem(
    "payload_tile_palbilinear.mem", pb_demo_payload,
    f"Ch314 PALETTIZED BILINEAR GIF payload ({pb_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); CLUT(blue/white) + 4x4 PSMT8 index checker, LEFT MMAG=0 nearest / RIGHT MMAG=1 bilinear (CLUT-before-interp)"
)


# ---------------------------------------------------------------------------
# Ch311 — per-tile BIN BUFFER demo. Three primitives with deliberately different
# tile coverage so the bins are checkable:
#   P0 blue bg  (Z=0x5000): big triangle, bbox spans ALL 4 tiles -> in every bin.
#   P1 red      (Z=0x6000): bbox x8..24 y2..12 -> crosses x=16 seam but stays in
#                           tile-row 0 -> in tiles (0,0)+(1,0) ONLY (2 bins).
#   P2 white    (Z=0x5800): bbox x20..28 y20..28 -> tile (1,1) ONLY (1 bin).
# Bins (prim index P0=0,P1=1,P2=2): t0={0,1} t1={0,1} t2={0} t3={0,2}.
# (Needs BIN_BUFFER_ENABLE=1 + TILE_MULTIPRIM=1.) Same render result as the Ch305
# re-test-per-tile path; the bin buffer just precomputes the routing.
# Each primitive is three (screen x, y, Z) vertices sharing one flat Z.
BIN_P0 = [(x, y, 0x0000_5000) for x, y in ((1, 1), (30, 1), (15, 30))]     # blue, all tiles
BIN_P1 = [(x, y, 0x0000_6000) for x, y in ((8, 2), (24, 2), (16, 12))]     # red, tiles 0,1
BIN_P2 = [(x, y, 0x0000_5800) for x, y in ((20, 20), (28, 20), (20, 28))]  # white, tile 3 only


def build_tile_bin_demo_payload():
    """Assemble the Ch311 payload: the Ch305 textures with the BIN_P* triangles.

    Primitives are emitted in index order (prim 0, 1, 2); only the last carries
    EOP. Returns the flat list of GIF qwords.
    """
    textures = (
        (TMP_TBP_BG,  (0x00, 0x00, 0xFF, 0x80)),   # opaque blue
        (TMP_TBP_MID, (0xFF, 0x00, 0x00, 0x80)),   # opaque red
        (TMP_TBP_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # translucent white
    )
    triangles = (
        (TMP_TBP_BG,  BIN_P0, 0, True),    # prim 0
        (TMP_TBP_MID, BIN_P1, 0, False),   # prim 1
        (TMP_TBP_FG,  BIN_P2, 1, False),   # prim 2
    )
    payload = []
    for tbp0, rgba in textures:
        payload.extend(tmp_texture_upload(tbp0, solid_texel(*rgba)))
    for tbp0, verts, eop, first in triangles:
        payload.extend(tmp_triangle(tbp0, verts, eop=eop, first=first))
    return payload


# Bake the Ch311 fixture pair; the payload is built first because the bootlet is
# built from its qword count.
bn_demo_payload = build_tile_bin_demo_payload()
bn_demo_qwc     = len(bn_demo_payload)
# Size guard: cap the payload at 95 qwords so it stays clear of the heartbeat
# splicer position named in the message (qword 115).
assert bn_demo_qwc <= 95, f"tile_bin payload {bn_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
# Same 32x32 display geometry as Ch305 (TMP_DISPLAY1_HI / TMP_FBW).
bn_demo_bootlet = build_textured_demo_bootlet_disp(bn_demo_qwc, TMP_DISPLAY1_HI, TMP_FBW)

# Emit both .mem files. The doubled braces in the payload description are
# f-string escapes that produce literal { } around the bin contents.
write_bios_mem(
    "bios_tile_bin.mem", bn_demo_bootlet,
    f"Ch311 BIN-BUFFER BIOS bootlet ({len(bn_demo_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 32x32; QWC={bn_demo_qwc}"
)
write_payload_mem(
    "payload_tile_bin.mem", bn_demo_payload,
    f"Ch311 BIN-BUFFER GIF payload ({bn_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); P0 all-tiles / P1 2-tiles / P2 1-tile -> bins t0{{0,1}} t1{{0,1}} t2{{0}} t3{{0,2}}"
)


# ---------------------------------------------------------------------------
# Ch312 — scale the tiled renderer to a 4x4 grid (16 tiles, 16x16 each = 64x64).
# Same bin-buffer mechanism (BIN_BUFFER_ENABLE), just TILE_COLS=TILE_ROWS=4. Three
# prims chosen to stress the larger grid + leave empty tiles:
#   P0 blue (Z5000): top-left, bbox tiles (0,0)(1,0)(0,1)(1,1) = t0,1,4,5  (4 tiles)
#   P1 red  (Z6000): mid, bbox x20..50 y18..40 crosses the x=32/48 & y=32 seams,
#                    bbox tiles (1,1)(2,1)(3,1)(1,2)(2,2)(3,2) = t5,6,7,9,10,11 (6 tiles)
#   P2 white(Z5800): bottom-right corner tile (3,3) = t15 ONLY              (1 tile)
#   EMPTY tiles (no prim bbox): t2,3,8,12,13,14 -> stay clear.
# 64x64 FB PSMCT32 = 0x4000 (fills 16 KiB) -> VRAM 32 KiB, textures @ 0x4000+.
B4_FBW = 1
B4_DISPLAY1_HI = 63 | (63 << 12)          # DW=63 (64 wide), DH=63 (64 tall)
# Texture base pointers: 0x4000 / 0x4400 / 0x4800.
B4_TBP0_BG = 64
B4_TBP0_MID = 68
B4_TBP0_FG = 72
# Each primitive is three (screen x, y, Z) vertices sharing one flat Z.
B4_P0 = [(x, y, 0x0000_5000) for x, y in ((2, 2), (30, 2), (2, 30))]       # blue top-left
B4_P1 = [(x, y, 0x0000_6000) for x, y in ((20, 18), (50, 18), (35, 40))]   # red mid, crosses seams
B4_P2 = [(x, y, 0x0000_5800) for x, y in ((52, 52), (60, 52), (52, 60))]   # white corner tile


def b4_texture_upload(tbp0, texel):
    """Upload a 4x4 solid PSMCT32 texture for the Ch312 4x4-grid demo.

    The packet is the same one tmp_texture_upload() builds: same TMP_TBW /
    TMP_TEXW / TMP_TEXH geometry, same 4-register A+D header (0xEEEE), same
    IMAGE tag and replicated texel data. Delegating keeps a single copy of the
    packet layout instead of two that could drift apart.

    tbp0  -- destination texture base pointer (placed above the 64x64 FB).
    texel -- 32-bit colour word replicated into every texel.
    Returns the list of qwords.
    """
    return tmp_texture_upload(tbp0, texel)


def b4_triangle(tbp0, verts, eop, first):
    """Build one textured triangle for the 4x4-grid demo as an A+D GIF packet.

    tbp0  -- texture base pointer for TEX0_1 (4x4 PSMCT32, TBW=TMP_TBW).
    verts -- three (screen x, y, Z) tuples; UVs come from TMP_UV by index.
    eop   -- end-of-packet flag for the GIF tag.
    first -- true for the first triangle (adds FRAME_1 built from B4_FBW).
    Returns the list of qwords.
    """
    tex0_val = tex0_pack(tbp0, TMP_TBW, psm=0x00, tw=2, th=2)
    # PRIM,ALPHA,TEST,ZBUF,TEX0 (5) [+FRAME on first] + 9 vertex registers.
    if first:
        nreg = 6 + 9
    else:
        nreg = 5 + 9
    packet = [giftag(1, eop, 0, nreg, int('E' * nreg, 16))]
    if first:
        packet.append(aplusd(R_FRAME_1, frame_1_psmct32(B4_FBW)))
    packet.extend([
        aplusd(R_PRIM,    prim_tri_tme_abe()),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),
        aplusd(R_TEST_1,  test1_geq()),
        aplusd(R_ZBUF_1,  zbuf1_pack(2)),
        aplusd(R_TEX0_1,  tex0_val),
    ])
    for vtx_idx, vertex in enumerate(verts):
        sx, sy, sz = vertex
        tu, tv = TMP_UV[vtx_idx]
        packet.extend([
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV,    uv_data(tu, tv)),
            aplusd(R_XYZ2,  xyz2_dataz(sx, sy, sz)),
        ])
    return packet


def build_tile_bin4x4_demo_payload():
    """Return the full tile_bin4x4 GIF payload: three texture uploads, then three triangles.

    Only the first triangle carries FRAME_1 and only the last one sets eop=1.
    """
    layers = (
        (B4_TBP0_BG,  solid_texel(0x00, 0x00, 0xFF, 0x80), B4_P0),  # blue
        (B4_TBP0_MID, solid_texel(0xFF, 0x00, 0x00, 0x80), B4_P1),  # red
        (B4_TBP0_FG,  solid_texel(0xFF, 0xFF, 0xFF, 0x40), B4_P2),  # white
    )
    payload = []
    # All uploads precede all draws.
    for tbp, texel, _ in layers:
        payload.extend(b4_texture_upload(tbp, texel))
    final = len(layers) - 1
    for n, (tbp, _, verts) in enumerate(layers):
        payload.extend(b4_triangle(tbp, verts, eop=(1 if n == final else 0), first=(n == 0)))
    return payload


# Bake the tile_bin4x4 fixtures at import time: build the GIF payload, then a bootlet that is
# given the payload's qword count, the 64x64 DISPLAY1 value and the frame-buffer width.
b4_demo_payload = build_tile_bin4x4_demo_payload()
b4_demo_qwc     = len(b4_demo_payload)
# Bake-time guard: this demo caps the payload at 95 qwords (the message names the heartbeat splicer).
assert b4_demo_qwc <= 95, f"tile_bin4x4 payload {b4_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
b4_demo_bootlet = build_textured_demo_bootlet_disp(b4_demo_qwc, B4_DISPLAY1_HI, B4_FBW)

# Third argument of each writer is a descriptive banner string (presumably emitted as the
# .mem header comment — confirm against write_bios_mem / write_payload_mem).
write_bios_mem(
    "bios_tile_bin4x4.mem", b4_demo_bootlet,
    f"Ch312 4x4-GRID bin-buffer BIOS bootlet ({len(b4_demo_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 64x64; QWC={b4_demo_qwc}"
)
write_payload_mem(
    "payload_tile_bin4x4.mem", b4_demo_payload,
    f"Ch312 4x4-GRID bin-buffer GIF payload ({b4_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); P0 4-tile / P1 6-tile cross-seam / P2 1-tile + empty tiles, 64x64 PSMCT32 FB"
)


# ---------------------------------------------------------------------------
# Ch315 — PRIMITIVE/BIN CAPACITY scaling. The 4x4 (64x64) grid but SEVEN prims
# (vs Ch312's 3), exceeding the old FIFO/bin depth of 4. Six of them (P0..P5) all
# fall ENTIRELY inside the first tile t0 (col0,row0 = x[0..15],y[0..15]) so that
# tile's bin holds SIX prims (occupancy 6 > old depth 4); P6 is a lone corner prim
# in t15; the other 14 tiles are empty. One shared opaque-blue texture keeps the
# payload small (subsequent prims re-issue only PRIM + 3 verts; FRAME/TEX0/ALPHA/
# TEST/ZBUF persist). Proves overlap, draw order (bin order {0..5}), full-ish + empty
# bins, and capacity past 4. Needs TILE_FIFO_DEPTH>=7 (the demo profile sets 8).
CAP_TBP0 = 64                       # 0x4000 (above the 64x64 FB), shared blue texture
# CAP_FBW is passed to frame_1_psmct32() and to the bootlet builder by every CAP_* demo below.
CAP_FBW  = 1
CAP_DISPLAY1_HI = (63 << 12) | 63   # 64x64
CAP_BLUE = solid_texel(0x00, 0x00, 0xFF, 0xFF)   # OPAQUE blue (order-independent color)
# Ch333 — a second "unity" texture (0x80 per channel = 1.0 in PS2 modulate fixed-point) so a
# MODULATE prim's output equals the staging RGBAQ color. Uploaded alongside blue by the setup.
CAP_TBP1  = 96                                   # 0x6000 — well clear of FB(0x4000) + blue texture
CAP_UNITY = solid_texel(0x80, 0x80, 0x80, 0x80)  # modulate identity: (0x80 * c) >> 7 == c
# P0..P5 are SIX IDENTICAL right-triangles (same winding as the proven Ch312 prims:
# right-angle at top-left) entirely inside tile t0 [0..15]^2, at increasing Z -> they
# all bin into t0 (depth 6), draw in order, and (opaque, GEQUAL) the top one wins so
# the union is that one blue triangle. P6 is a lone corner triangle in t15 [48..63]^2.
# The deep bin is placed in t0 (the FIRST tile rendered) on purpose: a SEPARATE latent
# bug makes EMPTY tiles that precede the first non-empty tile flush black instead of
# the clear colour (never hit before — prior demos always had a prim in t0). That is
# orthogonal to capacity; keeping t0 non-empty avoids it. Identical shapes keep the SW
# image reference winding-exact; the 6-deep bin (read back from bin_prim/bin_n) is what
# proves capacity, not visual distinctness.
# CAP_T0 holds (sx, sy) pairs only; the Z value is supplied separately per prim.
CAP_T0 = [(1,1),(14,1),(1,14)]
# Each entry is (verts, z), unpacked in that order by build_tile_cap_demo_payload.
CAP_PRIMS = [
    (CAP_T0, 0x0000_5000),  # P0
    (CAP_T0, 0x0000_5100),  # P1
    (CAP_T0, 0x0000_5200),  # P2
    (CAP_T0, 0x0000_5300),  # P3
    (CAP_T0, 0x0000_5400),  # P4
    (CAP_T0, 0x0000_5500),  # P5
    ([(50,50),(62,50),(50,62)], 0x0000_5600),  # P6 corner t15
]


def cap_triangle(verts, z, eop, first):
    """Return one A+D GIF packet drawing a textured triangle at a single depth.

    verts -- three (sx, sy) pairs; vertex i uses the UV pair TMP_UV[i].
    z     -- depth value passed to xyz2_dataz() for every vertex.
    eop   -- forwarded unchanged as giftag()'s second argument.
    first -- when true the packet also writes FRAME_1, ALPHA_1, TEST_1,
             ZBUF_1 and TEX0_1; later packets carry PRIM only and rely on
             that state persisting.
    """
    # PRIM is re-issued for every prim (clean vertex kick).
    prim_reg = aplusd(R_PRIM, prim_tri_tme_abe())
    if first:
        regs = [
            aplusd(R_FRAME_1, frame_1_psmct32(CAP_FBW)),
            prim_reg,
            aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),
            aplusd(R_TEST_1,  test1_geq()),
            aplusd(R_ZBUF_1,  zbuf1_pack(2)),
            aplusd(R_TEX0_1,  tex0_pack(CAP_TBP0, TMP_TBW, psm=0x00, tw=2, th=2)),
        ]
    else:
        regs = [prim_reg]

    for corner, (vx, vy) in enumerate(verts):
        u, v = TMP_UV[corner]
        regs.append(aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)))
        regs.append(aplusd(R_UV,    uv_data(u, v)))
        regs.append(aplusd(R_XYZ2,  xyz2_dataz(vx, vy, z)))

    # first: FRAME+PRIM+ALPHA+TEST+ZBUF+TEX0; rest: PRIM only; plus 9 vertex registers.
    reg_count = (6 if first else 1) + 9
    return [giftag(1, eop, 0, reg_count, int('E' * reg_count, 16))] + regs


def build_tile_cap_demo_payload():
    """Return the capacity-demo payload: blue texture upload plus one triangle per CAP_PRIMS entry.

    The first triangle carries the shared state; the last one sets eop=1.
    """
    payload = list(b4_texture_upload(CAP_TBP0, CAP_BLUE))
    final = len(CAP_PRIMS) - 1
    for n, (tri, depth) in enumerate(CAP_PRIMS):
        end_flag = 1 if n == final else 0
        payload.extend(cap_triangle(tri, depth, eop=end_flag, first=(n == 0)))
    return payload


def build_tile_late_demo_payload():
    """Return the Ch316 payload: blue texture upload plus a single triangle in tile t15.

    The lone prim sits in the LAST tile [48..63]^2, so tiles t0..t14 are empty
    (leading empties); this exercises the empty-tile-before-first-non-empty path.
    """
    corner_tri = [(50,50),(62,50),(50,62)]
    return [
        *b4_texture_upload(CAP_TBP0, CAP_BLUE),
        *cap_triangle(corner_tri, 0x0000_5000, eop=1, first=True),
    ]


# Bake the Ch316 late-only fixtures: payload, matching bootlet, then both .mem files.
late_demo_payload = build_tile_late_demo_payload()
late_demo_qwc     = len(late_demo_payload)
# Same 95-qword bake-time cap as the other tile demos.
assert late_demo_qwc <= 95, f"tile_late payload {late_demo_qwc} qwords"
# Reuses the CAP_* display/frame constants (64x64).
late_demo_bootlet = build_textured_demo_bootlet_disp(late_demo_qwc, CAP_DISPLAY1_HI, CAP_FBW)
write_bios_mem("bios_tile_late.mem", late_demo_bootlet,
    f"Ch316 LATE-ONLY BIOS bootlet ({len(late_demo_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 64x64; QWC={late_demo_qwc}")
write_payload_mem("payload_tile_late.mem", late_demo_payload,
    f"Ch316 LATE-ONLY GIF payload ({late_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); 1 prim in t15 only, t0..t14 empty (leading-empty-tile traversal)")


# Bake the Ch315 capacity fixtures: payload, matching bootlet, then both .mem files.
cap_demo_payload = build_tile_cap_demo_payload()
cap_demo_qwc     = len(cap_demo_payload)
# Bake-time guard: this demo caps the payload at 95 qwords (the message names the heartbeat splicer).
assert cap_demo_qwc <= 95, f"tile_cap payload {cap_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
cap_demo_bootlet = build_textured_demo_bootlet_disp(cap_demo_qwc, CAP_DISPLAY1_HI, CAP_FBW)

write_bios_mem(
    "bios_tile_cap.mem", cap_demo_bootlet,
    f"Ch315 CAPACITY (7-prim) BIOS bootlet ({len(cap_demo_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 64x64; QWC={cap_demo_qwc}"
)
# The banner names tile t0: CAP_T0 = [(1,1),(14,1),(1,14)] lies inside [0..15]^2, so P0..P5 bin
# into t0, not the centre tile.
write_payload_mem(
    "payload_tile_cap.mem", cap_demo_payload,
    f"Ch315 CAPACITY GIF payload ({cap_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); 7 prims: P0..P5 all in tile t0 (bin depth 6 > old 4), P6 corner t15"
)

# ===== Ch328 1b — DEPTH-64 CAPACITY PROOF: 18 solid-blue tris stacked in tile t0 (bin depth 18 > 16) =====
# The prims are COMBINED-textured (TME+ABE) triangles sampling one solid-blue texture (see the note
# just above cap64_triangle). The shared state is set once on the first prim and persists, so each
# later prim costs only PRIM + 3*XYZ2 (~5 qwords) and the whole 18-prim payload is ~107 qwords.
# Exercises the TRI path + the M20K grad-prefetch (uniform color gradients) at a bin depth that the
# old register FIFO (depth-4/8, ~600 ALM/1033 reg per slot) could never have held. All 18 inside t0
# [0..15]^2 (the FIRST tile -> avoids the leading-empty-tile bug); other 15 tiles empty.
# Number of prims stacked in tile t0 by build_tile_cap64_demo_payload; also reused as the
# sprite count by build_tile_sprite18_demo_payload.
CAP64_N = 18
# COMBINED-textured tris (TME+ABE+GEQUAL Z) — the proven tile-multiprim grid path (tile_active =
# ras_combined; a non-combined/flat prim would drop tile_active and stall the grid walk). Shared
# UV/RGBAQ/TEX0/ALPHA/TEST/ZBUF set ONCE (persist) so each prim costs only PRIM + 3*XYZ2 (~5 qw),
# fitting 18 in ~107 qw (a per-vert-UV combined tri like Ch315's would be ~213 qw). All verts UV is
# one texel of the solid-blue texture -> uniform blue, order-independent.
def cap64_triangle(z, first, eop):
    """Return one A+D GIF packet drawing the CAP_T0 triangle at depth *z*.

    z     -- depth value passed to xyz2_dataz() for all three vertices.
    first -- when true, the packet also sets the seven shared registers
             (FRAME, ALPHA, TEST, ZBUF, TEX0, RGBAQ, UV) after PRIM; later
             packets rely on them persisting.
    eop   -- forwarded unchanged as giftag()'s second argument.
    """
    # PRIM is re-issued for every prim (clean vertex kick).
    regs = [aplusd(R_PRIM, prim_tri_tme_abe())]
    if first:
        regs.extend([
            aplusd(R_FRAME_1, frame_1_psmct32(CAP_FBW)),
            aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),   # source-over
            aplusd(R_TEST_1,  test1_geq()),              # GEQUAL Z
            aplusd(R_ZBUF_1,  zbuf1_pack(2)),
            aplusd(R_TEX0_1,  tex0_pack(CAP_TBP0, TMP_TBW, psm=0x00, tw=2, th=2)),
            aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV,      uv_data(TMP_UV[0][0], TMP_UV[0][1])),   # shared UV -> solid blue
        ])
    regs.extend(aplusd(R_XYZ2, xyz2_dataz(vx, vy, z)) for (vx, vy) in CAP_T0)

    shared = 7 if first else 0
    reg_count = 1 + shared + 3       # PRIM + shared state + 3*XYZ2
    return [giftag(1, eop, 0, reg_count, int('E' * reg_count, 16))] + regs
def build_tile_cap64_demo_payload():
    """Return the Ch328 payload: blue texture upload plus CAP64_N stacked triangles.

    Depth rises by 0x100 per prim from 0x5000; the first prim carries the
    shared state and the last one sets eop=1.
    """
    payload = list(b4_texture_upload(CAP_TBP0, CAP_BLUE))
    for n in range(CAP64_N):
        depth = 0x0000_5000 + n * 0x100
        end_flag = 1 if n == CAP64_N - 1 else 0
        payload.extend(cap64_triangle(depth, first=(n == 0), eop=end_flag))
    return payload

# ===== Ch329 Bug 1 — 18 NON-COMBINED SPRITES in tile t0 (multiprim-grid refusal proof) =====
# Sprites are never combined (no tile-local color/Z path), so the multiprim grid must REFUSE them
# cleanly (tile_refused_count==18, grid completes, NO stall) rather than freeze the bin-walk. All 18
# sit in t0; shared FRAME/RGBAQ so each prim is just PRIM + 2*XYZ2 (~4 qw).
def build_tile_sprite18_demo_payload():
    """Return the Ch329 payload: CAP64_N identical sprites spanning (1,1)-(14,14).

    The first packet also sets FRAME_1 and RGBAQ; every later packet is just
    PRIM + 2*XYZ2. The last packet sets eop=1. No texture upload is emitted.
    """
    payload = []
    final = CAP64_N - 1
    for n in range(CAP64_N):
        regs = [aplusd(R_PRIM, PRIM_SPRITE)]
        if n == 0:
            regs.append(aplusd(R_FRAME_1, frame_1_psmct32(CAP_FBW)))
            regs.append(aplusd(R_RGBAQ,   rgbaq_data(0x00, 0x00, 0xFF)))   # blue (would-be, if rendered)
        regs.append(aplusd(R_XYZ2, xyz2_data(1, 1)))
        regs.append(aplusd(R_XYZ2, xyz2_data(14, 14)))
        reg_count = len(regs)               # 5 for the first packet, 3 afterwards
        end_flag = 1 if n == final else 0
        payload.append(giftag(1, end_flag, 0, reg_count, int('E' * reg_count, 16)))
        payload.extend(regs)
    return payload
# Bake the Ch329 sprite-refusal fixtures. With CAP64_N = 18 the builder emits
# 6 qwords for the first packet and 4 for each of the other 17 (74 in total).
sprite18_payload = build_tile_sprite18_demo_payload()
sprite18_qwc     = len(sprite18_payload)
# NOTE(review): this limit (114) is looser than the 95-qword cap the other tile demos assert;
# confirm it is intentional for this fixture.
assert sprite18_qwc <= 114, f"tile_sprite18 payload {sprite18_qwc} qwords (limit 114)"
sprite18_bootlet = build_textured_demo_bootlet_disp(sprite18_qwc, CAP_DISPLAY1_HI, CAP_FBW)
write_bios_mem("bios_tile_sprite18.mem", sprite18_bootlet,
    f"Ch329 18-sprite (non-combined) refusal bootlet ({len(sprite18_bootlet)} words; QWC={sprite18_qwc})")
write_payload_mem("payload_tile_sprite18.mem", sprite18_payload,
    f"Ch329 18 NON-combined sprites in t0 ({sprite18_qwc} qwords) — multiprim grid must refuse all 18")

# Bake the Ch328 depth-64 capacity fixtures: payload, matching bootlet, then both .mem files.
cap64_demo_payload = build_tile_cap64_demo_payload()
cap64_demo_qwc     = len(cap64_demo_payload)
# Payload loads at byte 0x100 = qword 16, so the absolute EE-RAM range is [16 .. 16+qwc). It CROSSES
# the legacy heartbeat read-splicer's qword 115 (Ch255) — harmless here because the cap64 TB gates the
# splicer OFF (HEARTBEAT_SPLICE_ENABLE=0); just keep the whole payload inside EE RAM.
assert 16 + cap64_demo_qwc <= RAM_TOTAL_QWORDS, f"tile_cap64 payload ends at qword {16+cap64_demo_qwc} > RAM {RAM_TOTAL_QWORDS}"
cap64_demo_bootlet = build_textured_demo_bootlet_disp(cap64_demo_qwc, CAP_DISPLAY1_HI, CAP_FBW)
write_bios_mem(
    "bios_tile_cap64.mem", cap64_demo_bootlet,
    f"Ch328 DEPTH-64 capacity bootlet ({len(cap64_demo_bootlet)} words; DISPLAY1=64x64; QWC={cap64_demo_qwc})"
)
# The banner describes the prims as textured: cap64_triangle emits TME+ABE triangles that sample
# the solid-blue texture, not flat (untextured) ones.
write_payload_mem(
    "payload_tile_cap64.mem", cap64_demo_payload,
    f"Ch328 DEPTH-64 capacity ({cap64_demo_qwc} qwords): {CAP64_N} combined-textured solid-blue tris all in tile t0 "
    f"(bin depth {CAP64_N} > 16 — impossible at the old register-FIFO per-slot cost)"
)


# ===== Ch330 Brick 2 — runtime-feeder image-equivalence fixtures =====
# SETUP-ONLY payload: upload the blue texture to CAP_TBP0 (plus, since Ch333, the unity texture to
# CAP_TBP1) with EOP on the last image packet — NO baked tris (the feeder draws the prims in
# phase 1; a baked tri here would double-render). And feeder_stg_cap4.mem:
# the SAME 4 combined-TAZ tris (CAP_T0, increasing Z) as a normalized feeder list. Both reference
# the same texture/state, so the feeder render must match the proven baked combined render.
# Feeder prim lists: each entry is (verts, z), the shape build_feeder_staging iterates over.
# Both lists hold 4 prims with Z = 0x5000, 0x5100, 0x5200, 0x5300.
FEEDER_TRIS   = [(CAP_T0, 0x0000_5000 + i*0x100) for i in range(4)]              # list A: tile t0 (top-left)
FEEDER_T15    = [(49,49),(62,49),(49,62)]                                        # tri in tile t15 (col3,row3 = bottom-right)
FEEDER_TRIS_B = [(FEEDER_T15, 0x0000_5000 + i*0x100) for i in range(4)]          # list B: tile t15 (diagonal opposite of t0)

# Ch331 — feeder EXPRESSIVENESS: one small triangle placed in an arbitrary 4x4-grid tile.
def tri_in_tile(t):                                                              # t = row*4 + col, 0..15
    """Return the three (x, y) vertices of the small right-triangle inside grid tile *t*.

    Tiles are 16x16 on a 4-wide grid; the triangle has its right angle at the
    tile's top-left corner inset by 1 pixel and legs of 13 pixels.
    """
    row, col = divmod(t, 4)
    left = col * 16 + 1
    top = row * 16 + 1
    return [(left, top), (left + 13, top), (left, top + 13)]
def scene_from_tiles(tiles):                                                     # one prim per tile, increasing Z
    """Return [(verts, z), ...] with one tri_in_tile() triangle per entry of *tiles*.

    Z starts at 0x5000 and rises by 0x100 with each list position.
    """
    scene = []
    for order, tile in enumerate(tiles):
        scene.append((tri_in_tile(tile), 0x0000_5000 + order * 0x100))
    return scene
# Tile-index lists (t = row*4 + col) fed to scene_from_tiles for the Ch331 scenes.
SCENE_C1_TILES = [0, 5, 10]                          # 3 prims (< TILE_PRIM_COUNT=4): diagonal
SCENE_C2_TILES = [0, 3, 5, 9, 12, 15]                # 6 prims (> 4): scattered, both diagonals
SCENE_C3_TILES = [0, 1, 2, 3, 12, 13, 14, 15]        # 8 prims (== FIFO_DEPTH): top + bottom rows

# Ch332 — second shape: a RECTANGLE (filled quad) = two textured triangles sharing a diagonal.
# (lowest-risk vocabulary expansion: no new feeder record type, just two triangle records.)
def rect_tris_in_tile(t):
    """Return the two triangles that tile a filled quad inside grid tile *t*.

    Result is [upper_left, lower_right]: upper_left = (TL, TR, BL) and
    lower_right = (TR, BL, BR), so both halves share the TR-BL edge. The quad
    spans offsets 1..14 inside the 16x16 tile.
    """
    row, col = divmod(t, 4)
    x_lo, y_lo = col * 16 + 1, row * 16 + 1
    x_hi, y_hi = x_lo + 13, y_lo + 13
    top_left, top_right = (x_lo, y_lo), (x_hi, y_lo)
    bottom_left, bottom_right = (x_lo, y_hi), (x_hi, y_hi)
    return [
        [top_left, top_right, bottom_left],       # upper-left half
        [top_right, bottom_left, bottom_right],   # lower-right half
    ]
def scene_shapes(items):                              # items = [(tile,'tri'|'rect'), ...]
    """Expand (tile, kind) shapes into a flat [(verts, z), ...] triangle list.

    A 'tri' contributes one triangle; any other kind contributes the two
    triangles of rect_tris_in_tile(). Z starts at 0x5000 and rises by 0x100
    per emitted triangle (not per shape).
    """
    triangles = []
    for tile, kind in items:
        if kind == 'tri':
            triangles.append(tri_in_tile(tile))
        else:
            triangles.extend(rect_tris_in_tile(tile))
    return [(verts, 0x0000_5000 + n * 0x100) for n, verts in enumerate(triangles)]
# (tile, kind) shape lists for scene_shapes; a 'rect' expands to two triangles.
SHAPE_TRI   = [(0,'tri'),  (5,'tri'),  (10,'tri')]                  # 3 prims — half-tile triangles
SHAPE_RECT  = [(0,'rect'), (5,'rect'), (10,'rect')]                 # 6 prims — same tiles, FILLED quads
SHAPE_MIXED = [(0,'tri'),  (5,'rect'), (10,'rect'), (15,'tri')]     # 6 prims — 2 tris + 2 rects

# Ch333 — colored shapes: unity texture (CAP_TBP1) + TFX=MODULATE, so the staging RGBAQ IS the
# rendered color. items = [(tile, 'tri'|'rect', (r,g,b)), ...]; each shape's tri(s) share its color.
# (r, g, b) triples; the staging builders unpack them into rgbaq_data(r, g, b).
COL_RED, COL_GREEN, COL_BLUE, COL_YELLOW = (0xFF,0,0), (0,0xFF,0), (0,0,0xFF), (0xFF,0xFF,0)
def scene_shapes_colored(items):
    """Expand (tile, kind, rgb) shapes into a flat [(verts, z, rgb), ...] triangle list.

    A 'tri' contributes one triangle; any other kind contributes the two
    triangles of rect_tris_in_tile(), both carrying the shape's colour. Z
    starts at 0x5000 and rises by 0x100 per emitted triangle.
    """
    coloured = []
    for tile, kind, rgb in items:
        if kind == 'tri':
            shape_tris = [tri_in_tile(tile)]
        else:
            shape_tris = rect_tris_in_tile(tile)
        coloured.extend((verts, rgb) for verts in shape_tris)
    return [(verts, 0x0000_5000 + n * 0x100, rgb) for n, (verts, rgb) in enumerate(coloured)]
# (tile, kind, rgb) shape lists for scene_shapes_colored; a 'rect' expands to two triangles.
COLOR_TRI = [(0,'tri',COL_RED),  (5,'tri',COL_GREEN),  (10,'tri',COL_BLUE)]               # 3 prims, 3 colors
COLOR_RECT= [(0,'rect',COL_RED), (5,'rect',COL_GREEN), (10,'rect',COL_BLUE)]              # 6 prims, 3 colors
COLOR_MIX = [(0,'tri',COL_RED),  (5,'rect',COL_GREEN), (10,'tri',COL_BLUE), (15,'rect',COL_YELLOW)]  # 6 prims, shape+color vary

def build_feeder_setup_payload():
    """Return the setup-only GIF payload: two solid texture uploads and no primitives.

    Ch333 — blue goes to CAP_TBP0 (DECAL anchors) and unity to CAP_TBP1
    (MODULATE color scenes). EOP rides ONLY the last image packet; every
    earlier packet has eop=0.
    """
    image_count = (TMP_TEXW * TMP_TEXH) // 4

    def upload_packet(tbp, texel, eop):
        # One solid texel replicated into all four 32-bit lanes of each image qword.
        lane = texel & 0xFFFFFFFF
        solid_qword = lane | (lane << 32) | (lane << 64) | (lane << 96)
        return [
            giftag(1, 0, 0, 4, int('E' * 4, 16)),          # A+D: BITBLTBUF/TRXPOS/TRXREG/TRXDIR
            aplusd(R_BITBLTBUF, bitbltbuf_pack(tbp, TMP_TBW, 0x00)),
            aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
            aplusd(R_TRXREG,    trxreg_pack(TMP_TEXW, TMP_TEXH)),
            aplusd(R_TRXDIR,    trxdir_pack(0)),
            giftag(image_count, eop, 2, 0, 0),             # IMAGE
        ] + [solid_qword] * image_count

    blue_upload = upload_packet(CAP_TBP0, CAP_BLUE, 0)     # blue  — DECAL anchors
    unity_upload = upload_packet(CAP_TBP1, CAP_UNITY, 1)   # unity — MODULATE color scenes (EOP here)
    return blue_upload + unity_upload

def build_feeder_staging_colored(prims):                         # prims = [(verts, z, (r,g,b)), ...]
    """Return the feeder staging words for flat-coloured triangles.

    Layout: 7 header words (count, FRAME, ALPHA, TEST, ZBUF, TEX0, PRIM), then
    RGBAQ / UV / XYZ2 for every vertex of every prim. TEX0 selects the unity
    texture at CAP_TBP1 with tfx=0 (MODULATE), so the staged RGBAQ colour flows
    through the unity texel. Vertex i uses the UV pair TMP_UV[i].
    """
    header = [
        len(prims),                                                        # [0] count
        frame_1_psmct32(CAP_FBW),                                          # [1] FRAME
        alpha_pack(0, 1, 0, 1),                                            # [2] ALPHA
        test1_geq(),                                                       # [3] TEST
        zbuf1_pack(2),                                                     # [4] ZBUF
        tex0_pack(CAP_TBP1, TMP_TBW, psm=0x00, tw=2, th=2, tfx=0),         # [5] TEX0: UNITY tex + MODULATE
        prim_tri_tme_abe(),                                                # [6] PRIM
    ]
    records = []
    for tri, depth, colour in prims:
        red, green, blue = colour
        for corner, (px, py) in enumerate(tri):
            u, v = TMP_UV[corner]
            records += [rgbaq_data(red, green, blue), uv_data(u, v), xyz2_dataz(px, py, depth)]
    return header + records

# Ch334 — NATIVE rectangle record: 3 words (RGBAQ color, corner0 XYZ2, corner1 XYZ2). The feeder
# expands one record into two colored triangles. count word = {rect_count[31:16], tri_count[15:0]}.
def rect_record(tile, rgb, z):
    """Return the 3-word native rectangle record for grid tile *tile*.

    Words: RGBAQ colour, then XYZ2 of the corner at tile offset (1, 1) and
    XYZ2 of the corner at tile offset (14, 14), both at depth *z*.
    """
    row, col = divmod(tile, 4)
    near_x, near_y = col * 16 + 1, row * 16 + 1
    return [
        rgbaq_data(*rgb),
        xyz2_dataz(near_x, near_y, z),
        xyz2_dataz(near_x + 13, near_y + 13, z),
    ]

# Ch335 — GOURAUD: per-VERTEX color, so the GS interpolates a smooth gradient across the triangle.
# prims = [(verts, z, [rgb0, rgb1, rgb2]), ...]  (one rgb per vertex)
def build_feeder_staging_gouraud(prims):
    """Return the feeder staging words for triangles with one colour per vertex.

    prims = [(verts, z, [rgb0, rgb1, rgb2]), ...]. Layout matches the other
    staging builders: 7 header words, then RGBAQ / UV / XYZ2 per vertex, with
    vertex i taking cols[i] and TMP_UV[i]. TEX0 selects the unity texture at
    CAP_TBP1 with tfx=0 (MODULATE).
    """
    staged = [
        len(prims),
        frame_1_psmct32(CAP_FBW),
        alpha_pack(0, 1, 0, 1),
        test1_geq(),
        zbuf1_pack(2),
        tex0_pack(CAP_TBP1, TMP_TBW, psm=0x00, tw=2, th=2, tfx=0),  # unity + MODULATE
        prim_tri_tme_abe(),   # gs_stub ignores PRIM.IIP — interp is driven by per-vertex colors
    ]
    for tri, depth, vertex_colours in prims:
        for corner, (px, py) in enumerate(tri):
            staged.extend((
                rgbaq_data(*vertex_colours[corner]),
                uv_data(*TMP_UV[corner]),
                xyz2_dataz(px, py, depth),
            ))
    return staged
def build_feeder_staging_native(tris, rects):                    # tris=[(verts,z,rgb)], rects=[(tile,rgb)]
    """Return feeder staging words mixing triangle records and native rectangle records.

    Word 0 packs {rect_count[31:16], tri_count[15:0]}. After the 6 state words
    come 9 words per triangle (RGBAQ / UV / XYZ2 per vertex), then one 3-word
    rect_record() per rectangle. Rectangle Z continues the 0x100 ladder from
    0x5000 at index len(tris) + i.
    """
    tri_count, rect_count = len(tris), len(rects)
    staged = [
        ((rect_count & 0xFFFF) << 16) | (tri_count & 0xFFFF),   # [0] {rect_count, tri_count}
        frame_1_psmct32(CAP_FBW),
        alpha_pack(0, 1, 0, 1),
        test1_geq(),
        zbuf1_pack(2),
        tex0_pack(CAP_TBP1, TMP_TBW, psm=0x00, tw=2, th=2, tfx=0),  # unity tex + MODULATE
        prim_tri_tme_abe(),
    ]
    for tri, depth, colour in tris:
        for corner, (px, py) in enumerate(tri):
            staged += [rgbaq_data(*colour), uv_data(*TMP_UV[corner]), xyz2_dataz(px, py, depth)]
    for slot, (tile, colour) in enumerate(rects, start=tri_count):
        staged.extend(rect_record(tile, colour, 0x0000_5000 + slot * 0x100))
    return staged

def build_feeder_staging(tris):
    """Return the feeder staging words for textured triangles; tris = [(verts, z), ...].

    Layout: 7 header words (count, FRAME, ALPHA, TEST, ZBUF, TEX0, PRIM), then
    RGBAQ / UV / XYZ2 for every vertex. TEX0 points at the texture uploaded to
    CAP_TBP0, RGBAQ is zero (color comes from the texel), and vertex i uses
    the UV pair TMP_UV[i].
    """
    header = [
        len(tris),                                               # [0] count
        frame_1_psmct32(CAP_FBW),                                # [1] FRAME
        alpha_pack(0, 1, 0, 1),                                  # [2] ALPHA (source-over)
        test1_geq(),                                             # [3] TEST  (GEQUAL)
        zbuf1_pack(2),                                           # [4] ZBUF
        tex0_pack(CAP_TBP0, TMP_TBW, psm=0x00, tw=2, th=2),      # [5] TEX0  (-> the uploaded texture)
        prim_tri_tme_abe(),                                      # [6] PRIM  (combined: TME+ABE)
    ]
    records = []
    for tri, depth in tris:
        for corner, (px, py) in enumerate(tri):
            u, v = TMP_UV[corner]
            records += [
                rgbaq_data(0x00, 0x00, 0x00),                    # textured -> color from texel
                uv_data(u, v),
                xyz2_dataz(px, py, depth),
            ]
    return header + records

def write_feeder_stg_mem(filename, words, banner, total=256):
    """Write a $readmemh image of the feeder staging memory into the OUT directory.

    filename -- output file name, joined onto OUT.
    words    -- staging words; each is masked to 64 bits and written as 16 hex digits.
    banner   -- text for the first header comment line.
    total    -- declared depth of feeder_stg; the file is zero-padded up to it.

    Raises ValueError when *words* holds more than *total* entries. Writing the
    extra lines would silently exceed the [0:total-1] range named in the
    header, so the overflow is rejected before the file is opened.
    """
    words = list(words)
    if len(words) > total:
        raise ValueError(
            f"{filename}: {len(words)} staging words exceed the feeder_stg depth of {total}")

    lines = [f"// {banner}\n// $readmemh into feeder_stg [0:{total-1}] (64-bit words).\n"]
    lines.extend(f"{x & 0xFFFFFFFFFFFFFFFF:016x}\n" for x in words)
    lines.extend(f"{0:016x}\n" for _ in range(total - len(words)))
    with open(os.path.join(OUT, filename), "w") as f:
        f.writelines(lines)

# Bake the Ch330 feeder fixtures: the setup-only payload/bootlet pair plus two staging lists.
# NOTE(review): build_feeder_setup_payload uploads both the blue (CAP_TBP0) and the unity
# (CAP_TBP1) texture, while the banners below mention only the blue upload.
feeder_setup_payload = build_feeder_setup_payload()
feeder_setup_qwc     = len(feeder_setup_payload)
feeder_setup_bootlet = build_textured_demo_bootlet_disp(feeder_setup_qwc, CAP_DISPLAY1_HI, CAP_FBW)
write_bios_mem("bios_feeder_setup.mem", feeder_setup_bootlet,
    f"Ch330 SETUP-ONLY bootlet ({len(feeder_setup_bootlet)} words; DISPLAY1=64x64; QWC={feeder_setup_qwc}; texture upload, NO tris)")
write_payload_mem("payload_feeder_setup.mem", feeder_setup_payload,
    f"Ch330 SETUP-ONLY GIF payload ({feeder_setup_qwc} qwords): blue-texture upload to CAP_TBP0 + EOP, no primitives")
# Staging lists A and B hold the same 4-prim Z ladder in tile t0 and tile t15 respectively.
write_feeder_stg_mem("feeder_stg_cap4.mem", build_feeder_staging(FEEDER_TRIS),
    "Ch330 feeder staging A: 4 combined-TAZ tris in tile t0 (CAP_T0) — same scene as the baked combined render")
write_feeder_stg_mem("feeder_stg_cap4_B.mem", build_feeder_staging(FEEDER_TRIS_B),
    "Ch330 feeder staging B (runtime swap): 4 combined-TAZ tris in tile t15 — list B for the no-RBF-rebuild retrigger demo")

# ---------------------------------------------------------------------------
# Ch345a — RUNTIME FEEDER SPRITE records: textured + source-over alpha SPRITEs streamed through the feeder
# (the Ch344-proven subset: PSMCT32, affine UV, ABE source-over, TCC texel alpha). The setup bootlet uploads
# the 8x8 alpha-checker texture + draws an opaque blue BG; the feeder (sprite_mode = staging word0[33]) then
# renders 3 textured-alpha sprites over it. Runtime SPRITE ingestion (NOT authentic glyphs — that is Ch345b).
def build_sprite_feeder_setup_payload():
    """Return the sprite-feeder setup payload: alpha texture upload plus a background sprite.

    The upload transfers TEXALPHA_TEXW x TEXALPHA_TEXH texels produced by
    texalpha_texel(x, y), four per qword in row-major order. The closing
    packet (eop=1) draws an untextured sprite from (0, 0) to (63, 63) in the
    TEXALPHA_BG_* colour.
    """
    def image_qword(index):
        # Pack texels 4*index .. 4*index+3 into lanes 0..3 of one qword.
        packed = 0
        for lane in range(4):
            row, col = divmod(index * 4 + lane, TEXALPHA_TEXW)
            packed |= (texalpha_texel(col, row) & 0xFFFFFFFF) << (32 * lane)
        return packed

    image_count = (TEXALPHA_TEXW * TEXALPHA_TEXH) // 4
    upload = [
        giftag(1, 0, 0, 4, 0x0000_0000_0000_EEEE),           # texture upload A+D
        aplusd(R_BITBLTBUF, bitbltbuf_pack(TEXALPHA_TBP, 1, 0)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(TEXALPHA_TEXW, TEXALPHA_TEXH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
        giftag(image_count, 0, 2, 0, 0),                     # IMAGE
    ]
    upload.extend(image_qword(i) for i in range(image_count))

    background = [
        giftag(1, 1, 0, 5, 0x0000_0000_000E_EEEE),           # opaque blue BG sprite, EOP
        aplusd(R_PRIM,    PRIM_SPRITE),
        aplusd(R_FRAME_1, frame_1_psmct32(TEXALPHA_FBW)),
        aplusd(R_RGBAQ,   rgbaq_data(TEXALPHA_BG_R, TEXALPHA_BG_G, TEXALPHA_BG_B)),
        aplusd(R_XYZ2,    xyz2_data(0, 0)),
        aplusd(R_XYZ2,    xyz2_data(63, 63)),
    ]
    return upload + background

def build_feeder_sprite_staging(sprites):
    """Return the feeder staging words for textured-alpha sprites.

    sprites = [(x0, y0, x1, y1, u0, v0, u1, v1), ...]. Word 0 carries the
    sprite count in its low 16 bits plus the sprite_mode flag in bit 33. Each
    sprite then contributes RGBAQ / UV / XYZ2 for corner 0 followed by the
    same triple for corner 1; RGBAQ is 0x80 on every channel.
    """
    staged = [
        (len(sprites) & 0xFFFF) | (1 << 33),                     # word0: sprite_count + sprite_mode (bit33)
        frame_1_psmct32(TEXALPHA_FBW),                           # FRAME
        alpha_pack(0, 1, 0, 1),                                  # ALPHA source-over
        0,                                                       # TEST_1 (ZTE=0, no depth)
        0,                                                       # ZBUF_1
        tex0_pack(TEXALPHA_TBP, 1, psm=0, tw=3, th=3, tfx=0),    # TEX0 8x8 PSMCT32 MODULATE
        prim_sprite_tme_abe(),                                   # PRIM SPRITE+TME+ABE
    ]
    for x0, y0, x1, y1, u0, v0, u1, v1 in sprites:
        for px, py, pu, pv in ((x0, y0, u0, v0), (x1, y1, u1, v1)):
            staged.append(rgbaq_data(0x80,0x80,0x80,0x80))
            staged.append(uv_data(pu, pv))
            staged.append(xyz2_data(px, py))
    return staged

# Each sprite is (x0, y0, x1, y1, u0, v0, u1, v1), the tuple shape build_feeder_sprite_staging
# unpacks: three 16x16 sprites side by side at y 24..40, each mapping UV (0,0)..(8,8).
SPRITE_FEEDER_SET = [( 8,24,24,40, 0,0,8,8), (26,24,42,40, 0,0,8,8), (44,24,60,40, 0,0,8,8)]
_spr_setup = build_sprite_feeder_setup_payload()
# The bootlet is given the setup payload's qword count and the TEXALPHA_* display constants.
write_bios_mem("bios_sprite_setup.mem",
    build_textured_demo_bootlet_disp(len(_spr_setup), TEXALPHA_DISPLAY1_HI, TEXALPHA_FBW),
    f"Ch345a sprite-feeder SETUP bootlet; DISPLAY1=64x64; QWC={len(_spr_setup)}; alpha texture + blue BG")
write_payload_mem("payload_sprite_setup.mem", _spr_setup,
    f"Ch345a sprite-feeder SETUP payload ({len(_spr_setup)} qw): 8x8 alpha texture upload + opaque blue BG sprite")
write_feeder_stg_mem("feeder_sprite.mem", build_feeder_sprite_staging(SPRITE_FEEDER_SET),
    f"Ch345a feeder SPRITE staging (sprite_mode word0[33]): {len(SPRITE_FEEDER_SET)} textured-alpha sprites over the BG")
# Ch331 — variable-size multi-tile scenes (end-of-list flush): 3 / 6 / 8 prims across tiles.
# Each file holds 7 header words + 9 words per prim from build_feeder_staging, zero-padded to
# write_feeder_stg_mem's default depth of 256.
write_feeder_stg_mem("feeder_scene_c1.mem", build_feeder_staging(scene_from_tiles(SCENE_C1_TILES)),
    f"Ch331 scene C1: {len(SCENE_C1_TILES)} prims (<TILE_PRIM_COUNT) in tiles {SCENE_C1_TILES}")
write_feeder_stg_mem("feeder_scene_c2.mem", build_feeder_staging(scene_from_tiles(SCENE_C2_TILES)),
    f"Ch331 scene C2: {len(SCENE_C2_TILES)} prims (>TILE_PRIM_COUNT) in tiles {SCENE_C2_TILES}")
write_feeder_stg_mem("feeder_scene_c3.mem", build_feeder_staging(scene_from_tiles(SCENE_C3_TILES)),
    f"Ch331 scene C3: {len(SCENE_C3_TILES)} prims (==FIFO_DEPTH) in tiles {SCENE_C3_TILES}")
# Ch332 — shape vocabulary: triangle scene, rectangle (quad) scene, mixed scene (all <= FIFO_DEPTH).
# These go through build_feeder_staging, i.e. the blue texture at CAP_TBP0 with zero RGBAQ.
write_feeder_stg_mem("feeder_shape_tri.mem",   build_feeder_staging(scene_shapes(SHAPE_TRI)),
    "Ch332 shape TRI: 3 half-tile triangles in tiles 0/5/10")
write_feeder_stg_mem("feeder_shape_rect.mem",  build_feeder_staging(scene_shapes(SHAPE_RECT)),
    "Ch332 shape RECT: 3 filled quads (2 tris each = 6 prims) in tiles 0/5/10")
write_feeder_stg_mem("feeder_shape_mixed.mem", build_feeder_staging(scene_shapes(SHAPE_MIXED)),
    "Ch332 shape MIXED: triangles in 0/15 + rectangles in 5/10 (6 prims)")
# Ch333 — colored scenes (unity texture + MODULATE; color from staging RGBAQ).
# These go through build_feeder_staging_colored, i.e. TEX0 points at CAP_TBP1 with tfx=0.
write_feeder_stg_mem("feeder_color_tri.mem",  build_feeder_staging_colored(scene_shapes_colored(COLOR_TRI)),
    "Ch333 color TRI: red/green/blue triangles in tiles 0/5/10")
write_feeder_stg_mem("feeder_color_rect.mem", build_feeder_staging_colored(scene_shapes_colored(COLOR_RECT)),
    "Ch333 color RECT: red/green/blue filled quads in tiles 0/5/10 (6 prims)")
write_feeder_stg_mem("feeder_color_mix.mem",  build_feeder_staging_colored(scene_shapes_colored(COLOR_MIX)),
    "Ch333 color MIX: red tri(0) + green rect(5) + blue tri(10) + yellow rect(15) — shape & color vary")
# Ch334 — NATIVE rectangle records (1 record -> 2 tris in the feeder). NATIVE_RECT must match the
# Ch333 COLOR_RECT two-triangle scene visually + by records (3 rects = 6 prims).
# build_feeder_staging_native takes (tris, rects): tris are (verts, z, rgb), rects are (tile, rgb);
# rectangle Z is assigned by the builder, continuing the 0x100 ladder after the triangles.
write_feeder_stg_mem("feeder_native_rect.mem", build_feeder_staging_native([], [(0,COL_RED),(5,COL_GREEN),(10,COL_BLUE)]),
    "Ch334 native RECT: 3 native-rect records (red/green/blue quads 0/5/10) — matches color_rect, records=6")
write_feeder_stg_mem("feeder_native_mix.mem",
    build_feeder_staging_native([(tri_in_tile(0), 0x0000_5000, COL_RED)], [(5,COL_GREEN),(10,COL_BLUE),(15,COL_YELLOW)]),
    "Ch334 native MIX: red triangle(0) + 3 native rects (green5/blue10/yellow15), records=1+6=7")
# Ch335 — GOURAUD per-vertex color (smooth gradients). _g5 = the 2 triangles of tile-5's quad.
COL_WHITE = (0xFF, 0xFF, 0xFF)
_g5 = rect_tris_in_tile(5)   # [UL=(TL,TR,BL), LR=(TR,BL,BR)]
# Each prim is (verts, z, [rgb0, rgb1, rgb2]) with one colour per vertex, in vertex order.
write_feeder_stg_mem("feeder_gouraud_tri.mem",
    build_feeder_staging_gouraud([(tri_in_tile(0), 0x0000_5000, [COL_RED, COL_GREEN, COL_BLUE])]),
    "Ch335 gouraud TRI: tile0 triangle, v0=red v1=green v2=blue -> RGB gradient, records=1")
# The second triangle's colours follow its (TR, BL, BR) vertex order, so the shared TR and BL
# corners keep the green and blue they have in the first triangle; only BR is new (white).
write_feeder_stg_mem("feeder_gouraud_rect.mem",
    build_feeder_staging_gouraud([(_g5[0], 0x0000_5000, [COL_RED, COL_GREEN, COL_BLUE]),
                                  (_g5[1], 0x0000_5100, [COL_GREEN, COL_BLUE, COL_WHITE])]),
    "Ch335 gouraud RECT: tile5 quad (2 tris), corners red/green/blue/white -> gradient quad, records=2")
write_feeder_stg_mem("feeder_gouraud_mix.mem",
    build_feeder_staging_gouraud([(tri_in_tile(0), 0x0000_5000, [COL_RED, COL_RED, COL_RED]),
                                  (tri_in_tile(10), 0x0000_5100, [COL_RED, COL_GREEN, COL_BLUE])]),
    "Ch335 gouraud MIX: flat red tri(0) + RGB gradient tri(10), records=2")
# Ch336 — >FIFO_DEPTH ACCUMULATION: 14 tris across tiles 0-13 (FIFO depth 8 -> 2 batches, 8+6).
# Prims 0-7 (tiles 0-7) RED = batch 0; prims 8-13 (tiles 8-13) BLUE = batch 1. If batches wipe each
# other, the RED batch-0 tiles go green; accumulation keeps RED *and* BLUE simultaneously visible.
# All three 14-prim lists below stage 7 + 14*9 = 133 words, inside the 256-word default depth.
# Prim t sits in tile t at Z = 0x5000 + t*0x100; only the colour split at t < 8 differs per list.
ACCUM_PRIMS = [(tri_in_tile(t), 0x0000_5000 + t*0x100, COL_RED if t < 8 else COL_BLUE) for t in range(14)]
write_feeder_stg_mem("feeder_accum.mem", build_feeder_staging_colored(ACCUM_PRIMS),
    "Ch336 accum: 14 tris (>FIFO_DEPTH); batch0 tiles 0-7 RED, batch1 tiles 8-13 BLUE -> both survive")
# Ch336 board diagnostic — SWAPPED colors: batch0 (tiles 0-7) BLUE, batch1 (tiles 8-13) RED. Localizes
# the board color bug: if the swap shows all-BLUE, the FIRST batch's color is sticking for the scene.
ACCUM_SWAP = [(tri_in_tile(t), 0x0000_5000 + t*0x100, COL_BLUE if t < 8 else COL_RED) for t in range(14)]
write_feeder_stg_mem("feeder_accum_swap.mem", build_feeder_staging_colored(ACCUM_SWAP),
    "Ch336 diag: batch0 tiles 0-7 BLUE, batch1 tiles 8-13 RED (color-swapped accum)")
# Ch336 DEFINITIVE diag — batch0 BLUE, batch1 GREEN. GREEN shares NO channel with RED (the suspected
# default) or BLUE (batch0), so batch1's rendered color is unambiguous:
#   GREEN bottom -> batch1 color tracks its staged value (bug not here)
#   RED   bottom -> batch1 falls back to a constant RED (staged color ignored)
#   BLUE  bottom -> batch1 reuses batch0's color
ACCUM_GREEN = [(tri_in_tile(t), 0x0000_5000 + t*0x100, COL_BLUE if t < 8 else COL_GREEN) for t in range(14)]
write_feeder_stg_mem("feeder_accum_green.mem", build_feeder_staging_colored(ACCUM_GREEN),
    "Ch336 diag: batch0 tiles 0-7 BLUE, batch1 tiles 8-13 GREEN")
# Ch337 — clean-retrigger acceptance scenes. TWO distinct >FIFO_DEPTH scenes (14 prims each = 2
# batches) that occupy DIFFERENT tiles AND colors, so any leftover from a prior scene is visible:
#   A: tiles 0-13 RED       B: tiles 2-15 BLUE
# A then B then A: each scene's first (full-flush) batch wipes the whole FB, so the final FB must be
# EXACTLY the last scene with no residue from the other.
# SCENE_B rebases its Z with (t-2), so both scenes use the same Z ladder 0x5000..0x5D00.
SCENE_A = [(tri_in_tile(t),     0x0000_5000 + t*0x100,     COL_RED)  for t in range(14)]
SCENE_B = [(tri_in_tile(t),     0x0000_5000 + (t-2)*0x100, COL_BLUE) for t in range(2, 16)]
write_feeder_stg_mem("feeder_scene_a.mem", build_feeder_staging_colored(SCENE_A),
    "Ch337 scene A: 14 tris, tiles 0-13, RED (>FIFO_DEPTH, 2 batches)")
write_feeder_stg_mem("feeder_scene_b.mem", build_feeder_staging_colored(SCENE_B),
    "Ch337 scene B: 14 tris, tiles 2-15, BLUE (>FIFO_DEPTH, 2 batches)")

# Ch338 — CROSS-BATCH Z. A NEAR (RED) and a FAR (BLUE) triangle occupy the SAME tile (tile 5) but are
# SPLIT across FIFO batches. ZBUF clear = 0x4000, TEST = GEQUAL (higher Z = nearer = wins). NEAR=0x7000,
# FAR=0x5000, fillers=0x6000 (>= clear so they draw in their own tiles). 14 prims each => batch0 = first
# 8, batch1 = last 6, so the overlap tile is rendered in BOTH batches. With persistent cross-batch Z the
# NEAR (RED) triangle wins the overlap in BOTH orderings; with per-batch Z, NEAR_FIRST wrongly shows the
# later FAR (BLUE) batch on top.
# Z values for the cross-batch test: NEAR > MID > FAR.
ZP_NEAR, ZP_FAR, ZP_MID = 0x0000_7000, 0x0000_5000, 0x0000_6000
_OT = 5; _B0F = [0, 1, 2, 3, 4, 6, 7]; _B1F = [8, 9, 10, 11, 12]   # overlap tile + filler tiles
# Both lists are built as 1 + len(_B0F) = 8 prims followed by 1 + len(_B1F) = 6 prims (14 total),
# so the two tile-5 prims land in different halves of the list.
# NEAR_FIRST: near RED in batch0, far BLUE (same tile) in batch1 -> persistent Z must keep RED on top.
ZPERSIST_NEAR_FIRST = (
    [(tri_in_tile(_OT), ZP_NEAR, COL_RED)] +
    [(tri_in_tile(t),   ZP_MID,  COL_RED)  for t in _B0F] +
    [(tri_in_tile(_OT), ZP_FAR,  COL_BLUE)] +
    [(tri_in_tile(t),   ZP_MID,  COL_BLUE) for t in _B1F])
# FAR_FIRST: far BLUE in batch0, near RED (same tile) in batch1 -> near wins either way (control case).
ZPERSIST_FAR_FIRST = (
    [(tri_in_tile(_OT), ZP_FAR,  COL_BLUE)] +
    [(tri_in_tile(t),   ZP_MID,  COL_BLUE) for t in _B0F] +
    [(tri_in_tile(_OT), ZP_NEAR, COL_RED)] +
    [(tri_in_tile(t),   ZP_MID,  COL_RED)  for t in _B1F])
write_feeder_stg_mem("feeder_zpersist_near_first.mem", build_feeder_staging_colored(ZPERSIST_NEAR_FIRST),
    "Ch338 cross-batch Z: NEAR(RED,b0)+FAR(BLUE,b1) overlap tile 5 -> tile5 must be RED (near wins)")
write_feeder_stg_mem("feeder_zpersist_far_first.mem", build_feeder_staging_colored(ZPERSIST_FAR_FIRST),
    "Ch338 cross-batch Z: FAR(BLUE,b0)+NEAR(RED,b1) overlap tile 5 -> tile5 must be RED (near wins)")
# Ch338 — MIXED colored/gradient cross-batch overlap (Codex ask). The NEAR prim (batch0) is a GOURAUD
# gradient (RED/GREEN/BLUE per vertex); the FAR prim (batch1, same tile 5) is flat WHITE. Fillers are
# flat (batch0 GREEN, batch1 WHITE). With persistent Z the near GRADIENT wins the overlap, so tile 5
# shows interpolated colors and ZERO white (the far prim is Z-rejected). Per-batch Z would paint tile 5
# white. Built with the gouraud staging so per-vertex colors flow (unity tex + MODULATE).
# Entries are (verts, z, [rgb per vertex]), the shape build_feeder_staging_gouraud expects;
# flat prims repeat one colour three times. Same 8 + 6 layout as the other ZPERSIST lists.
ZPERSIST_GRAD = (
    [(tri_in_tile(_OT), ZP_NEAR, [COL_RED, COL_GREEN, COL_BLUE])] +
    [(tri_in_tile(t),   ZP_MID,  [COL_GREEN]*3) for t in _B0F] +
    [(tri_in_tile(_OT), ZP_FAR,  [COL_WHITE]*3)] +
    [(tri_in_tile(t),   ZP_MID,  [COL_WHITE]*3) for t in _B1F])
write_feeder_stg_mem("feeder_zpersist_grad.mem", build_feeder_staging_gouraud(ZPERSIST_GRAD),
    "Ch338 cross-batch Z: NEAR gradient(RGB,b0) + FAR flat WHITE(b1) overlap tile 5 -> gradient wins, no white")

# Ch342 — PERSPECTIVE feeder staging: drive the Ch301 perspective path THROUGH the feeder (word0[32]=1,
# per-vertex RGBAQ(+Q_fp)/ST(S_fp,T_fp)/XYZ2) on the tiled/multiprim profile w/ PERSPECTIVE_CORRECT=1.
# A perspective quad (2 tris): top edge FAR (w=8), bottom NEAR (w=1), checkerboard texture (DECAL) so
# rows compress toward the top under correct perspective. Reuses persp_texel/persp_attrs/st_data.
PERSP_FEEDER_TBP = 100                       # texture base (block 100 = byte 0x6400; clear of the 64x64 FB)
def build_feeder_staging_persp(tris):        # tris = [ [(sx,sy,u,v,w) x3], ... ]  z constant
    """Return the feeder staging words for perspective-format triangles.

    Word 0 is the triangle count with the perspective-format flag in bit 32.
    Each vertex (sx, sy, u, v, w) is converted by persp_attrs(u, v, w) into
    (s_fp, t_fp, q_fp) and staged as RGBAQ (carrying q_fp), ST, XYZ2. Every
    vertex uses the constant depth 0x5000.
    """
    staged = [
        len(tris) | (1 << 32),               # word0: tri_count + perspective format flag (bit 32)
        frame_1_psmct32(CAP_FBW),
        alpha_pack(0, 1, 0, 1),
        test1_geq(),
        zbuf1_pack(2),
        tex0_pack(PERSP_FEEDER_TBP, PERSP_TBW, psm=0x00, tw=4, th=4, tfx=1),   # checkerboard, DECAL
        # Ch342 — PRIM = TRIANGLE + TME, ABE=0 (FST=0) to MATCH the authentic cube (ABE=0). ABE=0 keeps
        # the prim NON-combined -> the S1/legacy perspective path (where gs_persp_uv actually launches),
        # NOT the combined-TAZ tiled path (whose perspective integration is a separate follow-on bug).
        3 | (1 << 4),                        # TRI + TME, ABE clear
    ]
    for tri in tris:
        for (px, py, tex_u, tex_v, w_val) in tri:
            s_fp, t_fp, q_fp = persp_attrs(tex_u, tex_v, w_val)
            staged += [
                rgbaq_with_q(0x00, 0x00, 0x00, q_fp),
                st_data(s_fp, t_fp),
                xyz2_dataz(px, py, 0x0000_5000),
            ]
    return staged
# A 32x40 perspective quad in the 64x64 FB: x 16..48, y 12..52; top FAR (w=8), bottom NEAR (w=1);
# UV over the 16x16 checkerboard (u,v 0..16). Two tris share the TR-BL diagonal.
# Quad extents in framebuffer pixels.
_PX0 = 16
_PX1 = 48
_PY0 = 12
_PY1 = 52
# w for the top (far) and bottom (near) edges.
_PWF = PERSP_W_FAR
_PWN = PERSP_W_NEAR
# Corner vertices as (sx, sy, u, v, w) — the tuple shape build_feeder_staging_persp() unpacks.
_pv_tl = (_PX0, _PY0, 0, 0, _PWF)
_pv_tr = (_PX1, _PY0, 16, 0, _PWF)
_pv_bl = (_PX0, _PY1, 0, 16, _PWN)
_pv_br = (_PX1, _PY1, 16, 16, _PWN)
PERSP_QUAD = [
    [_pv_tl, _pv_tr, _pv_bl],
    [_pv_tr, _pv_bl, _pv_br],
]
write_feeder_stg_mem(
    "feeder_persp.mem",
    build_feeder_staging_persp(PERSP_QUAD),
    "Ch342 perspective quad through the feeder (word0[32]=1, RGBAQ/ST/XYZ2; top FAR w=8, bottom NEAR w=1)",
)

# Checkerboard texture image for backdoor-load into VRAM at PERSP_FEEDER_TBP (16x16 PSMCT32, linear):
# a single comment line, then one 8-hex-digit texel per line in row-major order (v outer, u inner).
with open(os.path.join(OUT, "feeder_persp_tex.mem"), "w") as _f:
    _f.write("// Ch342 16x16 checkerboard (persp_texel) for backdoor VRAM load at PERSP_FEEDER_TBP\n")
    for _v in range(PERSP_TEXH):
        for _u in range(PERSP_TEXW):
            _f.write(f"{persp_texel(_u, _v) & 0xFFFFFFFF:08x}\n")

# Ch342 BOARD profile (GS_FEEDER_PERSP_DEMO) — setup-ONLY bootlet that uploads the 16x16 checkerboard
# to PERSP_FEEDER_TBP (the feeder then draws the perspective floor from feeder_persp.mem). No triangles
# in the payload (the feeder owns them). Small (16x16 = 64 image qwords) -> fits the default 4 KiB EE RAM.
def build_persp_feeder_setup_payload():
    """Build the setup-only GIF payload that uploads the checkerboard texture.

    Returns a list of qword values:
      * an A+D GIFtag for 4 registers, followed by BITBLTBUF / TRXPOS /
        TRXREG / TRXDIR for a PERSP_TEXW x PERSP_TEXH transfer addressed by
        PERSP_FEEDER_TBP / PERSP_TBW at position (0, 0);
      * an IMAGE GIFtag with EOP set, followed by the persp_texel() data in
        row-major order (v outer, u inner), four 32-bit texels per qword
        with the lowest-index texel in bits [31:0].

    No triangles are emitted.

    Raises:
        ValueError: if PERSP_TEXW * PERSP_TEXH is not a multiple of 4. The
            image is packed four texels per qword, so a ragged tail would
            otherwise be dropped silently while TRXREG still declares the
            full W x H transfer.
    """
    n_texels = PERSP_TEXW * PERSP_TEXH
    n_image, leftover = divmod(n_texels, 4)
    if leftover:
        raise ValueError(
            f"persp texture {PERSP_TEXW}x{PERSP_TEXH} = {n_texels} texels is not a multiple of 4; "
            f"{leftover} trailing texel(s) would be missing from the IMAGE transfer"
        )

    qw = [
        giftag(1, 0, 0, 4, int('E' * 4, 16)),                       # A+D: BITBLTBUF/TRXPOS/TRXREG/TRXDIR
        aplusd(R_BITBLTBUF, bitbltbuf_pack(PERSP_FEEDER_TBP, PERSP_TBW, 0x00)),
        aplusd(R_TRXPOS,    trxpos_pack(0, 0)),
        aplusd(R_TRXREG,    trxreg_pack(PERSP_TEXW, PERSP_TEXH)),
        aplusd(R_TRXDIR,    trxdir_pack(0)),
        giftag(n_image, 1, 2, 0, 0),                                # IMAGE, EOP
    ]

    # Mask once up front so each texel occupies exactly one 32-bit lane.
    texels = [persp_texel(u, v) & 0xFFFFFFFF
              for v in range(PERSP_TEXH) for u in range(PERSP_TEXW)]
    for base in range(0, n_texels, 4):
        word = 0
        for lane, texel in enumerate(texels[base:base + 4]):
            word |= texel << (32 * lane)
        qw.append(word)
    return qw
# Bake the GS_FEEDER_PERSP_DEMO setup fixtures: the texture-upload payload and the bootlet that
# kicks it off. The bootlet is given the payload length as its QWC, plus CAP_DISPLAY1_HI / CAP_FBW
# for the display setup (64x64 per the description string below).
_persp_payload = build_persp_feeder_setup_payload()
write_payload_mem("payload_persp_feeder_setup.mem", _persp_payload,
    f"Ch342 GS_FEEDER_PERSP_DEMO setup payload ({len(_persp_payload)} qw): 16x16 checkerboard upload @ TBP={PERSP_FEEDER_TBP}, no tris")
write_bios_mem("bios_persp_feeder_setup.mem",
    build_textured_demo_bootlet_disp(len(_persp_payload), CAP_DISPLAY1_HI, CAP_FBW),
    f"Ch342 GS_FEEDER_PERSP_DEMO setup bootlet (DISPLAY1=64x64; QWC={len(_persp_payload)}; checkerboard upload, no tris)")


# ---------------------------------------------------------------------------
# Ch313 — FULL PSMCT16 FRAMEBUFFER MODE. Same 4x4 (64x64) bin-buffer scene as Ch312,
# but the WHOLE framebuffer is PSMCT16: FRAME_1.PSM=PSMCT16 (relaxed close_combined
# gate accepts it when TILE_COLOR_PSMCT16=1), the on-chip tile RAM is 16-bit, the
# flush writes RGB5A1 halfword lanes, and DISPFB1.PSM=PSMCT16 scans it out. The
# textures stay PSMCT32 DECAL (close_combined still requires tex0_psm==PSMCT32).
# A 64x64 PSMCT16 FB is 64*64*2 = 0x2000 (8 KiB) — HALF the 0x4000 PSMCT32 FB — so
# this demo runs in a 16 KiB VRAM (vs Ch312's 32 KiB), textures @ 0x2000+. This is
# the concrete framebuffer-memory saving that motivates the LPDDR-backed FB phase.
# Framebuffer width and the DISPLAY1 high-word value for the 64x64 PSMCT16 scene.
F16_FBW = 1
F16_DISPLAY1_HI = (63 << 12) | 63                 # DW=63 (64 wide), DH=63 (64 tall)

# Texture base pointers, all above the 8 KiB framebuffer.
F16_TBP0_BG = 32                                  # byte 0x2000
F16_TBP0_MID = 36                                 # byte 0x2400
F16_TBP0_FG = 40                                  # byte 0x2800

# Triangle vertices as (sx, sy, sz); each primitive sits at one constant depth.
F16_P0 = [                                        # blue top-left
    (2, 2, 0x0000_5000),
    (30, 2, 0x0000_5000),
    (2, 30, 0x0000_5000),
]
F16_P1 = [                                        # red mid, crosses seams
    (20, 18, 0x0000_6000),
    (50, 18, 0x0000_6000),
    (35, 40, 0x0000_6000),
]
F16_P2 = [                                        # white corner tile
    (52, 52, 0x0000_5800),
    (60, 52, 0x0000_5800),
    (52, 60, 0x0000_5800),
]


def f16_triangle(tbp0, verts, eop, first, fbw=F16_FBW):
    """Return the A+D GIF packet (list of qwords) for one textured triangle.

    Args:
        tbp0:  texture base pointer handed to tex0_pack().
        verts: sequence of (sx, sy, sz) vertices; vertex i takes its texture
               coordinates from TMP_UV[i]. The GIFtag register count is fixed
               for exactly three vertices (9 vertex registers).
        eop:   EOP value passed to the GIFtag.
        first: when true, FRAME_1 is programmed as well (PSMCT16, Ch313), so
               the packet carries 6 context registers instead of 5.
        fbw:   framebuffer width passed to frame_1_psmct16(); only used when
               `first` is true (Ch321 made this a parameter).
    """
    context_regs = 6 if first else 5
    nreg = context_regs + 9
    tex0_val = tex0_pack(tbp0, TMP_TBW, psm=0x00, tw=2, th=2)

    packet = [giftag(1, eop, 0, nreg, int('E' * nreg, 16))]
    if first:
        packet.append(aplusd(R_FRAME_1, frame_1_psmct16(fbw)))
    packet += [
        aplusd(R_PRIM,    prim_tri_tme_abe()),
        aplusd(R_ALPHA_1, alpha_pack(0, 1, 0, 1)),
        aplusd(R_TEST_1,  test1_geq()),
        aplusd(R_ZBUF_1,  zbuf1_pack(2)),
        aplusd(R_TEX0_1,  tex0_val),
    ]
    for idx, (sx, sy, sz) in enumerate(verts):
        tu, tv = TMP_UV[idx]
        packet += [
            aplusd(R_RGBAQ, rgbaq_data(0x00, 0x00, 0x00)),
            aplusd(R_UV,    uv_data(tu, tv)),
            aplusd(R_XYZ2,  xyz2_dataz(sx, sy, sz)),
        ]
    return packet


def build_tile_psmct16fb_demo_payload():
    """Assemble the Ch313 GIF payload: three texture uploads, then P0/P1/P2.

    The uploads (blue, red, white solid texels) come first, followed by one
    triangle per texture. Only the first triangle programs FRAME_1 and only
    the last one sets EOP.
    """
    uploads = (
        (F16_TBP0_BG,  (0x00, 0x00, 0xFF, 0x80)),   # blue
        (F16_TBP0_MID, (0xFF, 0x00, 0x00, 0x80)),   # red
        (F16_TBP0_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # white
    )
    # (texture base, vertices, eop, first)
    draws = (
        (F16_TBP0_BG,  F16_P0, 0, True),
        (F16_TBP0_MID, F16_P1, 0, False),
        (F16_TBP0_FG,  F16_P2, 1, False),
    )

    packet = []
    for tbp0, rgba in uploads:
        packet.extend(b4_texture_upload(tbp0, solid_texel(*rgba)))
    for tbp0, verts, eop, first in draws:
        packet.extend(f16_triangle(tbp0, verts, eop=eop, first=first))
    return packet


# Bake the Ch313 fixtures: build the payload, size-check it, build the matching bootlet (which is
# handed the payload length as QWC), then write both .mem files.
f16_demo_payload = build_tile_psmct16fb_demo_payload()
f16_demo_qwc     = len(f16_demo_payload)
# Keep the payload short of the heartbeat splicer named in the message.
# NOTE(review): the payload is described below as loading at byte 0x100 (qword 16), so a 95-qword
# payload would end at qword 110 if the splicer's "qword 115" is a RAM index — confirm.
assert f16_demo_qwc <= 95, f"tile_psmct16fb payload {f16_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
f16_demo_bootlet = build_psmct16_demo_bootlet_disp(f16_demo_qwc, F16_DISPLAY1_HI, F16_FBW)

write_bios_mem(
    "bios_tile_psmct16fb.mem", f16_demo_bootlet,
    f"Ch313 FULL-PSMCT16-FB 4x4-GRID BIOS bootlet ({len(f16_demo_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 64x64 PSMCT16; QWC={f16_demo_qwc}"
)
write_payload_mem(
    "payload_tile_psmct16fb.mem", f16_demo_payload,
    f"Ch313 FULL-PSMCT16-FB 4x4-GRID GIF payload ({f16_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); P0/P1/P2 over 64x64 FRAME.PSM=PSMCT16 framebuffer (8 KiB) in 16 KiB VRAM"
)


# ============================================================================
# Ch321 — 128x128 PSMCT16 framebuffer (32 KiB) for LPDDR4B line-buffer scanout.
# Same scene as the 64x64 psmct16fb demo, scaled 2x. DISPLAY1 window 128x128,
# FBW=2 (128 px/row). Textures relocated above the 32 KiB FB. The frame is the
# larger LPDDR4B framebuffer Ch321 scans out.
# ============================================================================
# Framebuffer width and the DISPLAY1 high-word value for the 128x128 PSMCT16 scene.
F128_FBW = 2                                  # 128 px / 64 = 2 pages
F128_DISPLAY1_HI = (127 << 12) | 127          # DW=127 (128 wide), DH=127 (128 tall)

# Texture base pointers, all above the 32 KiB framebuffer.
F128_TBP0_BG = 128                            # byte 0x8000
F128_TBP0_MID = 132                           # byte 0x8400
F128_TBP0_FG = 136                            # byte 0x8800

# Triangle vertices as (sx, sy, sz): the F16_P0/P1/P2 positions doubled, depths unchanged.
F128_P0 = [                                   # blue (2x of F16_P0)
    (4, 4, 0x0000_5000),
    (60, 4, 0x0000_5000),
    (4, 60, 0x0000_5000),
]
F128_P1 = [                                   # red mid (2x F16_P1)
    (40, 36, 0x0000_6000),
    (100, 36, 0x0000_6000),
    (70, 80, 0x0000_6000),
]
F128_P2 = [                                   # white corner (2x F16_P2)
    (104, 104, 0x0000_5800),
    (120, 104, 0x0000_5800),
    (104, 120, 0x0000_5800),
]


def build_tile_lpddr128_demo_payload():
    """Assemble the Ch321 GIF payload: three texture uploads, then P0/P1/P2.

    Same structure as the 64x64 PSMCT16 demo payload, but using the F128_*
    texture bases and vertices, with every triangle built at fbw=F128_FBW.
    Only the first triangle programs FRAME_1 and only the last one sets EOP.
    """
    uploads = (
        (F128_TBP0_BG,  (0x00, 0x00, 0xFF, 0x80)),   # blue
        (F128_TBP0_MID, (0xFF, 0x00, 0x00, 0x80)),   # red
        (F128_TBP0_FG,  (0xFF, 0xFF, 0xFF, 0x40)),   # white
    )
    # (texture base, vertices, eop, first)
    draws = (
        (F128_TBP0_BG,  F128_P0, 0, True),
        (F128_TBP0_MID, F128_P1, 0, False),
        (F128_TBP0_FG,  F128_P2, 1, False),
    )

    packet = []
    for tbp0, rgba in uploads:
        packet.extend(b4_texture_upload(tbp0, solid_texel(*rgba)))
    for tbp0, verts, eop, first in draws:
        packet.extend(f16_triangle(tbp0, verts, eop=eop, first=first, fbw=F128_FBW))
    return packet


# Bake the Ch321 fixtures: build the payload, size-check it, build the matching bootlet (which is
# handed the payload length as QWC), write both .mem files, then log the result.
f128_demo_payload = build_tile_lpddr128_demo_payload()
f128_demo_qwc     = len(f128_demo_payload)
# Keep the payload short of the heartbeat splicer named in the message.
# NOTE(review): the payload is described below as loading at byte 0x100 (qword 16), so a 95-qword
# payload would end at qword 110 if the splicer's "qword 115" is a RAM index — confirm.
assert f128_demo_qwc <= 95, f"tile_lpddr128 payload {f128_demo_qwc} qwords may collide with heartbeat splicer at qword 115"
f128_demo_bootlet = build_psmct16_demo_bootlet_disp(f128_demo_qwc, F128_DISPLAY1_HI, F128_FBW)

write_bios_mem(
    "bios_tile_lpddr128.mem", f128_demo_bootlet,
    f"Ch321 128x128 PSMCT16 FB BIOS bootlet ({len(f128_demo_bootlet)} words active, padded to "
    f"{BIOS_TOTAL_WORDS}); DISPLAY1 = 128x128 PSMCT16; FBW=2; QWC={f128_demo_qwc}"
)
write_payload_mem(
    "payload_tile_lpddr128.mem", f128_demo_payload,
    f"Ch321 128x128 PSMCT16 FB GIF payload ({f128_demo_qwc} qwords active at byte 0x100, padded to "
    f"{RAM_TOTAL_QWORDS}); P0/P1/P2 over 128x128 FRAME.PSM=PSMCT16 framebuffer (32 KiB) in 64 KiB VRAM"
)
print(f"[bake] wrote Ch321 bios_tile_lpddr128.mem ({len(f128_demo_bootlet)} active) + payload_tile_lpddr128.mem ({f128_demo_qwc} active)")

# Bake summary: one "[bake] wrote ..." line per BIOS/payload fixture pair, reporting the bootlet's
# len() and the payload's active qword count. The Ch146 payload count is a literal 24.
print(f"[bake] wrote Ch316 bios_tile_late.mem ({len(late_demo_bootlet)} active) + payload_tile_late.mem ({late_demo_qwc} active)")
print(f"[bake] wrote Ch315 bios_tile_cap.mem ({len(cap_demo_bootlet)} active) + payload_tile_cap.mem ({cap_demo_qwc} active)")
print(f"[bake] wrote Ch314 bios_tile_palbilinear.mem ({len(pb_demo_bootlet)} active) + payload_tile_palbilinear.mem ({pb_demo_qwc} active)")
print(f"[bake] wrote Ch313 bios_tile_psmct16fb.mem ({len(f16_demo_bootlet)} active) + payload_tile_psmct16fb.mem ({f16_demo_qwc} active)")
print(f"[bake] wrote Ch312 bios_tile_bin4x4.mem ({len(b4_demo_bootlet)} active) + payload_tile_bin4x4.mem ({b4_demo_qwc} active)")
print(f"[bake] wrote Ch311 bios_tile_bin.mem ({len(bn_demo_bootlet)} active) + payload_tile_bin.mem ({bn_demo_qwc} active)")
print(f"[bake] wrote Ch310 bios_tile_bilinear.mem ({len(bil_demo_bootlet)} active) + payload_tile_bilinear.mem ({bil_demo_qwc} active)")
print(f"[bake] wrote Ch309 bios_tile_alpha.mem ({len(tal_demo_bootlet)} active) + payload_tile_alpha.mem ({tal_demo_qwc} active)")
print(f"[bake] wrote Ch308 bios_tile_psmct16.mem ({len(t16_demo_bootlet)} active) + payload_tile_psmct16.mem ({t16_demo_qwc} active)")
print(f"[bake] wrote Ch307 bios_tile_wrap.mem ({len(twr_demo_bootlet)} active) + payload_tile_wrap.mem ({twr_demo_qwc} active)")
print(f"[bake] wrote Ch306 bios_tile_scissor.mem ({len(tsc_demo_bootlet)} active) + payload_tile_scissor.mem ({tsc_demo_qwc} active)")
print(f"[bake] wrote Ch305 bios_tile_multiprim.mem ({len(tmp_demo_bootlet)} active) + payload_tile_multiprim.mem ({tmp_demo_qwc} active)")
print(f"[bake] wrote Ch304 bios_tile2x2.mem ({len(tile2_demo_bootlet)} active) + payload_tile2x2.mem ({tile2_demo_qwc} active)")
print(f"[bake] wrote Ch303 bios_tile.mem ({len(tile_demo_bootlet)} active) + payload_tile.mem ({tile_demo_qwc} active)")
print(f"[bake] wrote Ch302 bios_combined.mem ({len(comb_demo_bootlet)} active) + payload_combined.mem ({comb_demo_qwc} active)")
print(f"[bake] wrote Ch301b bios_persp_floor.mem ({len(pfloor_demo_bootlet)} active) + payload_persp_floor.mem ({pfloor_demo_qwc} active)")
print(f"[bake] wrote Ch301 bios_persp.mem ({len(persp_demo_bootlet)} active) + payload_persp.mem ({persp_demo_qwc} active)")
print(f"[bake] wrote Ch300 bios_swz32.mem ({len(swz32_demo_bootlet)} active) + payload_swz32.mem ({swz32_demo_qwc} active)")
print(f"[bake] wrote Ch299 bios_swz8.mem ({len(swz8_demo_bootlet)} active) + payload_swz8.mem ({swz8_demo_qwc} active)")
print(f"[bake] wrote Ch298 bios_swz4.mem ({len(swz4_demo_bootlet)} active) + payload_swz4.mem ({swz4_demo_qwc} active)")
print(f"[bake] wrote Ch297 bios_clut4.mem ({len(clut4_demo_bootlet)} active) + payload_clut4.mem ({clut4_demo_qwc} active)")
print(f"[bake] wrote Ch296 bios_clut.mem ({len(clut8_demo_bootlet)} active) + payload_clut.mem ({clut8_demo_qwc} active)")
print(f"[bake] wrote Ch251 bios.mem ({len(ch251_bootlet)} active) + payload.mem ({CH251_QWC} active)")
print(f"[bake] wrote Ch146 bios_ch146.mem ({len(ch146_bootlet)} active) + payload_ch146.mem (24 active)")
print(f"[bake] wrote Brick1 bios_textured.mem ({len(tex_demo_bootlet)} active) + payload_textured.mem ({tex_demo_qwc} active)")
print(f"[bake] wrote Brick2a bios_alpha.mem ({len(alpha_demo_bootlet)} active) + payload_alpha.mem ({alpha_demo_qwc} active)")
print(f"[bake] wrote Brick2b bios_zbuffer.mem ({len(zbuf_demo_bootlet)} active) + payload_zbuffer.mem ({zbuf_demo_qwc} active)")
print(f"[bake] wrote TexTri bios_tritex.mem ({len(tritex_demo_bootlet)} active) + payload_tritex.mem ({tritex_demo_qwc} active)")
print(f"[bake] wrote Brick3 bios_triangle.mem ({len(tri_demo_bootlet)} active) + payload_triangle.mem ({tri_demo_qwc} active)")