Files
retroDE_ps2/rtl/ee/ee_core_stub.sv
T
thejayman77 ec82764bef Initial commit: retroDE_ps2 — first-of-its-kind PS2 GS FPGA core (DE25-Nano / Agilex 5)
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression
(272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps,
and all dump-derived textures/traces) is excluded via .gitignore and stays local.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 20:10:50 -04:00

2155 lines
113 KiB
Systemverilog

// retroDE_ps2 — ee_core_stub
//
// First EE-side execution primitive. Structural mirror of
// `iop_core_stub` — same multi-cycle FSM, same R3000 subset, same
// delay-slot discipline, same strict-unsupported gate, same COP0
// exception-entry shape. The files are kept separate because the EE
// is fundamentally an R5900 (MIPS III + SCE extensions + VU) and will
// eventually need 64-bit register widths, COP1/COP2, and VU-side
// plumbing the IOP will never grow. For the first EE chapter the core
// is deliberately as narrow as iop_core_stub so the earliest EE
// programs can exercise fetch + decode + branches + halt before any
// of that complexity is needed.
//
// Where the two cores differ right now:
// - Trace subsystem: SUBSYS_EE (iop_core_stub uses SUBSYS_IOP).
// - Default PC_RESET and EXC_VECTOR are the same architectural
// values (0xBFC0_0000 / 0x0000_0080) — real R5900 and real R3000
// both reset to 0xBFC0_0000.
//
// Supported opcodes (MIPS encoding):
// SPECIAL (opcode = 0x00):
// func 0x00 (SLL), 0x02 (SRL), 0x03 (SRA) — shift family
// (destination = rd; operand = rt; count = shamt [10:6].
// SRA uses arithmetic right shift so the MSB sign-extends;
// SRL zero-fills. SLL $0,$0,0 is the canonical NOP encoding
// and flows through this path harmlessly — the rd_idx=0
// guard suppresses the writeback.)
// func 0x08 (JR), func 0x09 (JALR), func 0x0C (SYSCALL),
// (JALR: register-indirect call. Target is rs_val (same
// path as JR), link address pc+8 is written to rd_idx.
// rd_idx==0 suppresses the link write — valid encoding
// for "jump indirect without keeping a return address".
// First real-BIOS trip at 0xBFC5_29E8 after SH.)
// func 0x20 (ADD), 0x21 (ADDU), 0x22 (SUB), 0x23 (SUBU)
// (ADD/SUB implemented as ADDU/SUBU — same pragmatic policy
// as ADDI vs ADDIU; Arithmetic Overflow exception deferred.
// First real-BIOS ADD trip at 0x0000_060C (RAM-resident
// code) after JALR unlocked retired=84112.)
// func 0x24 (AND), 0x25 (OR), 0x26 (XOR), 0x27 (NOR)
// func 0x2A (SLT), 0x2B (SLTU) — R-type compare, register form
// pair of SLTI/SLTIU (first real-BIOS SLTU trip at 0xBFC0_2644)
// (R-type ALU family; destination is rd, not rt.
// OR was the first trip in the logic subset at 0xBFC0_2074;
// the rest landed because they share the exact same R-type
// plumbing. ADDU was the first arith trip at 0xBFC0_2640 after
// JAL landed, paired with SUBU in the same addition.)
// other funcs → NOP.
// 0x02 J 0x03 JAL 0x04 BEQ 0x05 BNE
// (JAL: jump-and-link; target is the same form as J, but writes
// pc+8 into $31. First real-BIOS trip at 0xBFC0_23D0 after LB
// unlocked retired=1714.)
// 0x08 ADDI (behaves as ADDIU — we do not model the Arithmetic
// Overflow exception yet; real BIOS emits ADDI where
// overflow cannot happen in practice)
// 0x09 ADDIU
// 0x0A SLTI 0x0B SLTIU (first evidence-driven growth past IOP's
// subset — real BIOS hit SLTI at
// 0xBFC0_0008 under strict mode, then ADDI
// at 0xBFC0_206C, then ANDI at 0xBFC0_2070
// as each new op unlocked the next)
// 0x0C ANDI 0x0D ORI 0x0F LUI
// 0x10 COP0 (rs=0 MFC0, rs=4 MTC0, rs=0x10/func=0x10 RFE)
// 0x20 LB 0x21 LH 0x23 LW 0x25 LHU 0x28 SB 0x29 SH 0x2B SW
// (SB: byte store; broadcasts rt[7:0] into the addressed lane
// and sets a one-hot byte-enable on the map_wr_be bus. First
// real-BIOS trip at 0xBFC0_20A0 after the R-type logic family
// unlocked retired=180.
// LB: byte load with sign-extension; extracts byte at ea[1:0]
// from the 32-bit word returned by the map and 24-bit
// sign-extends. First real-BIOS trip at 0xBFC0_23A8 after SB
// unlocked retired=1704.
// LH/LHU: halfword load, sign- and zero-extended respectively.
// Halfword addressing uses ea[1] only (ea[0] must be zero for
// aligned access). First real-BIOS LH trip at 0xBFC0_2684 after
// SLT/SLTU unlocked retired=7385; LHU landed in the same
// chapter because it shares the exact same extraction plumbing
// with a zero-fill instead of sign-fill.)
// Anything else → NOP (or strict trap when STRICT_UNSUPPORTED=1).
//
// Intentionally NOT yet (evidence-driven growth will resolve these
// as the real BIOS demands them):
// - 64-bit regs / LD / SD / LQ / SQ (R5900 territory)
// - COP1 (FPU), COP2 (VU0 macro mode), VU1, VIF
// - TLB / caches
// - Signed DIV, MULT, MTHI/MTLO — HI/LO architectural state +
// unsigned DIVU + MFHI/MFLO landed in ch43, and the MULTU
// decode (is_multu, SPECIAL funct 0x19) was added in Ch203;
// the remaining mul/div/signed-div ops and HI/LO *writes*
// defer until BIOS demands them. Divide latency and overflow
// detail are not modelled (DIVU completes in one cycle,
// matching the ALU style; revisit only if a timing-sensitive
// test needs it).
// - Most of COP0 beyond {Count (read-only), Status, Cause, EPC}
// — Compare, Random, EntryHi/Lo, etc. Count write via MTC0 is
// silently dropped; revisit if a BIOS loop depends on reset-
// to-value semantics.
// - Arithmetic Overflow exception on ADD / SUB / ADDI (they
// execute as ADDU / SUBU / ADDIU for now; the exception path
// lights up only when real BIOS code actually overflows)
// - REGIMM link variants (BLTZAL/BGEZAL) — the non-linking
// BLTZ/BGEZ landed in ch41 alongside BLEZ/BGTZ; link variants
// defer until the real BIOS needs them.
// - unaligned load/store (LWL/LWR/SWL/SWR) — rare in early boot
// - BD bit in Cause; nested interrupts; syscall/break dispatch
// beyond SYSCALL-as-halt.
//
// Trace (SUBSYS_EE, EV_IFETCH one-per-retire; flag bits match
// iop_core_stub):
// bit 0 = SW bit 1 = LW bit 2 = branch/jump taken
// bit 3 = SYSCALL halt bit 4 = instruction in delay slot
// bit 5 = exception taken bit 6 = RFE retired
// bit 7 = strict trap
//
// Strict mode (STRICT_UNSUPPORTED parameter) matches iop_core_stub.
`timescale 1ns/1ps
module ee_core_stub
import trace_pkg::*;
#(
parameter logic [31:0] PC_RESET = 32'hBFC0_0000,
parameter logic [31:0] EXC_VECTOR = 32'h0000_0080,
parameter bit STRICT_UNSUPPORTED = 1'b0,
// Ch47: address-error enforcement on word/halfword loads and
// stores. When enabled, any SW/LW with ea[1:0] != 0 or
// SH/LH/LHU with ea[0] != 0 halts the core and sets
// trap_o / trap_pc_o / trap_instr_o (same mechanism as
// STRICT_UNSUPPORTED). Default on — real MIPS raises an
// AdEL/AdES exception here; silently aliasing to the aligned
// word was a simulation artifact that previously made
// unaligned writes look like valid aligned stores.
parameter bit TRAP_ALIGN_ERROR = 1'b1,
// Ch50: Status.BEV (boot exception vector) modeling. Real MIPS
// resets with BEV=1 so exceptions vector to a ROM-resident
// "boot" handler (BEV_EXC_VECTOR). The BIOS writes BEV=0 via
// MTC0 once its RAM-resident handler is installed, at which
// point exceptions vector to EXC_VECTOR. INIT_BEV controls the
// reset state of Status.BEV; default 0 keeps the pre-ch50
// backward-compatible behavior (always vector to EXC_VECTOR).
// tb_ee_core_bios_smoke sets INIT_BEV=1 to match real-BIOS
// semantics.
parameter bit INIT_BEV = 1'b0,
parameter logic [31:0] BEV_EXC_VECTOR = 32'hBFC0_0380,
// Ch162 — strip the synthesized 32-bit hardware divider that
// Quartus infers from the Ch43 DIVU instruction's `/` and `%`
// operators. The auto-generated divider is the Ch159+ STA
// critical path (top-10 worst paths all live in
// `u_demo|u_core|div_0_rtl_0|auto_generated|divider|...`,
// ~32 ns of combinational ripple). Default 0 keeps the
// load-bearing DIVU semantics for every existing Ch43
// integration TB (`tb_ee_core_divu_mflo`, etc.). When set to
// 1, the DIVU writeback path becomes a no-op (HI/LO stay at
// their prior values, just as in the divisor==0 case the
// spec calls undefined). Synthesis builds that target the
// PSMCT32 SPRITE-only raster demo can set this to 1 — the
// bootlet doesn't execute DIVU, so removing the divider is
// behavior-neutral for that demo while freeing the
// critical-path budget for everything else. Sim TBs that
// assert DIVU output (only `tb_ee_core_divu_mflo` today)
// keep the parameter at its default 0.
parameter bit STRIP_HW_DIVIDER = 1'b0,
// Ch215 — labeled sim-only jmp_buf restore for SYSCALL #8 ($v1=8,
// $a0=2). When enabled, the SYSCALL handler enters a 12-step LW
// FSM that loads $ra/$sp/$fp/$s0..$s7/$gp from a hardcoded frame
// at CH215_JMPBUF_BASE (proven by Ch214 to be the BIOS-side
// 0xA000B1E0 jmp_buf assembled via LUI $r,0xA001 + ADDIU $r,$r,
// -0x4E20 at the setjmp call site 0xBFC52340/4C). Also sets $v0=1
// so the post-setjmp `beq $v0,$0` at 0xBFC52350 falls through to
// the longjmp-return path. NOT general syscall #8 semantics — a
// BIOS-bringup shim until the real kernel handler is modeled.
parameter bit CH215_JMPBUF_RESTORE_ENABLE = 1'b0,
parameter logic [31:0] CH215_JMPBUF_BASE = 32'hA000_B1E0,
// Ch273 — minimal EE syscall HLE dispatcher for the ELF runner /
// user-mode flow. When enabled, SYSCALL with a known $v1 ($v1 is
// the PS2-EE syscall-number convention) sets $v0 to a stub return
// and advances PC to PC+4 (normal user-code SYSCALL resume — NOT
// RFE; that's the Ch199 path). Unknown $v1 still halts (with $v1
// and $a0..$a3 readable via hierarchical sim peek from the TB) so
// the next blocker surfaces.
//
// Default 0 to keep every existing TB's "syscall = halt-PASS"
// pattern working (those tests don't set $v1, so $v1 could be
// anything; gating prevents accidental dispatch).
//
// Recognized $v1 values (qbert.elf crt0 prolog):
// 0x3C EndOfHeap -> $v0 = SYSCALL_HEAP_END (default 0x001E0000)
// 0x3D InitMainThread -> $v0 = 0 (stub success)
// 0x64 FlushCache -> $v0 = 0 (no-op success)
parameter bit EE_SYSCALL_HLE_ENABLE = 1'b0,
parameter logic [31:0] SYSCALL_HEAP_END = 32'h001E_0000
) (
input logic clk,
input logic rst_n,
input logic go_i,
output logic map_rd_en,
output logic [31:0] map_rd_addr,
input logic [31:0] map_rd_data,
input logic map_rd_valid,
output logic map_wr_en,
output logic [31:0] map_wr_addr,
output logic [31:0] map_wr_data,
output logic [3:0] map_wr_be,
input logic cpu_irq,
output logic halt_o,
output logic [31:0] pc_o,
output logic trap_o,
output logic [31:0] trap_pc_o,
output logic [31:0] trap_instr_o,
output logic ev_valid,
output subsys_e ev_subsys,
output event_e ev_event,
output logic [63:0] ev_arg0,
output logic [63:0] ev_arg1,
output logic [63:0] ev_arg2,
output logic [63:0] ev_arg3,
output logic [31:0] ev_flags
);
// ------------------------------------------------------------------
// Opcode / func / COP0 rs constants
// ------------------------------------------------------------------
localparam logic [5:0] OP_SPECIAL = 6'h00;
// REGIMM = opcode 0x01. rt-field selects the branch sub-op:
// rt=0x00 -> BLTZ, rt=0x01 -> BGEZ (link variants 0x10/0x11
// intentionally not yet modeled; add when real-BIOS needs them).
localparam logic [5:0] OP_REGIMM = 6'h01;
localparam logic [4:0] REGIMM_BLTZ = 5'h00;
localparam logic [4:0] REGIMM_BGEZ = 5'h01;
localparam logic [5:0] OP_J = 6'h02;
localparam logic [5:0] OP_JAL = 6'h03;
localparam logic [5:0] OP_BEQ = 6'h04;
// Ch274 — MIPS-II BEQL (Branch on Equal Likely), opcode 0x14.
// Same compare as BEQ; differs from BEQ only in the not-taken
// path: the delay slot is SQUASHED (PC jumps directly to PC+8)
// instead of executing. qbert.elf hits this at PC 0x001000C0 in
// its C++ static-constructor walker, where the delay slot at
// PC+4 clobbers $a0 and MUST be squashed when the table is empty.
localparam logic [5:0] OP_BEQL = 6'h14;
// Ch277 — MIPS-II BNEL (Branch on Not Equal Likely), opcode 0x15.
// Mirror of BEQL with the BNE condition. Taken when rs!=rt
// (delay slot executes, branch target reached); not-taken when
// rs==rt (delay slot SQUASHED, PC jumps to PC+8). qbert.elf hits
// this at PC 0x00112C7C inside a function body — predicted as
// the Ch276→Ch277 follow-on at the Ch274 closeout.
localparam logic [5:0] OP_BNEL = 6'h15;
localparam logic [5:0] OP_BNE = 6'h05;
localparam logic [5:0] OP_BLEZ = 6'h06;
localparam logic [5:0] OP_BGTZ = 6'h07;
localparam logic [5:0] OP_ADDI = 6'h08;
localparam logic [5:0] OP_ADDIU = 6'h09;
localparam logic [5:0] OP_SLTI = 6'h0A;
localparam logic [5:0] OP_SLTIU = 6'h0B;
localparam logic [5:0] OP_ANDI = 6'h0C;
localparam logic [5:0] OP_ORI = 6'h0D;
localparam logic [5:0] OP_LUI = 6'h0F;
localparam logic [5:0] OP_COP0 = 6'h10;
localparam logic [5:0] OP_LB = 6'h20;
localparam logic [5:0] OP_LH = 6'h21;
localparam logic [5:0] OP_LW = 6'h23;
// Ch279 — R5900 EE Load Quadword. opcode=0x1E, I-type:
// lq rt, imm(base) → rt[127:0] = mem[base+imm][127:0]
// 128-bit load symmetric to SQ (Ch271). Our regfile is 32-bit
// so we read only the low 32 bits at EA+0 — upper 96 bits are
// unrepresentable and discarded. Requires 16-byte alignment;
// misaligned trips the existing AdEL path. qbert.elf hits
// `lq $t1, 0($a1)` at PC 0x00112C88.
localparam logic [5:0] OP_LQ = 6'h1E;
localparam logic [5:0] OP_LBU = 6'h24;
localparam logic [5:0] OP_LHU = 6'h25;
localparam logic [5:0] OP_SB = 6'h28;
localparam logic [5:0] OP_SH = 6'h29;
localparam logic [5:0] OP_SW = 6'h2B;
// Ch278 — R5900 EE Multimedia Instruction (MMI) prefix.
// Primary opcode 0x1C; the actual sub-instruction is selected
// by a combination of the funct field (bits 5:0) and the sa
// field (bits 10:6), depending on which sub-group the funct
// names (MMI0/MMI1/MMI2/MMI3). For Ch278 we ONLY decode the
// single (funct=0x09 / sa=0x0E) pattern = MMI2/PCPYLD; every
// other MMI sub-instruction continues to fall through to
// strict-trap so the runner surfaces the next concrete blocker.
localparam logic [5:0] OP_MMI = 6'h1C;
localparam logic [5:0] FUNC_MMI2 = 6'h09;
localparam logic [4:0] MMI2_PCPYLD = 5'h0E;
// Ch282 — R5900 EE MMI2/PAND (Parallel AND). Same MMI2 funct
// group as PCPYLD; sa 0x12 selects PAND. Architectural 128-bit
// bitwise AND; in our 32-bit model identical to standard AND
// (SPECIAL funct 0x24). qbert.elf hits `pand $v0, $v0, $v1`
// (instr 0x70431489) at PC 0x00112C98.
localparam logic [4:0] MMI2_PAND = 5'h12;
// Ch280 — R5900 EE MMI0/PSUBB (Parallel Subtract Byte).
// primary opcode 0x1C, funct 0x08 (MMI0), sa 0x09 (PSUBB).
// 16-way parallel byte subtract across 128 bits; in our
// 32-bit-GPR model, 4 parallel byte subs on the low 32 bits
// (each lane modulo 256, no carry between bytes). qbert.elf
// hits `psubb $v0, $t1, $t2` (instr 0x712A1248) at PC 0x00112C90.
localparam logic [5:0] FUNC_MMI0 = 6'h08;
localparam logic [4:0] MMI0_PSUBB = 5'h09;
// Ch281 — R5900 EE MMI3/PNOR (Parallel Not-OR), opcode 0x1C,
// funct 0x29 (MMI3), sa 0x13 (PNOR). Architectural 128-bit
// bitwise NOR; in our 32-bit model, identical to standard NOR
// (SPECIAL funct 0x27). With rs=$zero this is the canonical
// MIPS "NOT" pseudo. qbert.elf hits `pnor $v1, $zero, $t1`
// (instr 0x70091CE9) at PC 0x00112C94.
localparam logic [5:0] FUNC_MMI3 = 6'h29;
localparam logic [4:0] MMI3_PNOR = 5'h13;
// Ch283 — R5900 EE MMI3/PCPYUD (Parallel Copy Upper Doubleword),
// opcode 0x1C, funct 0x29 (MMI3), sa 0x0E (PCPYUD). Architectural:
// $rd[127:64] = $rs[127:64]; $rd[63:0] = $rt[127:64]
// i.e. extract the *upper* doubleword of each source, with $rt's
// upper-D becoming $rd's lower-D. First MMI op that *reads* from
// the upper 64 bits of source registers — drove the introduction
// of the gpr128 shadow in Ch283. qbert.elf hits `pcpyud $a0, $v0,
// $t1` (instr 0x704923A9) at PC 0x00112CA0.
localparam logic [4:0] MMI3_PCPYUD = 5'h0E;
// Ch300 — R5900 EE MMI3/PCPYH (Parallel Copy Halfword), opcode
// 0x1C, funct 0x29 (MMI3), sa 0x1B. Broadcasts the low halfword
// of each 64-bit doubleword of $rt across the four halfword lanes
// of the corresponding doubleword in $rd. rs is ignored.
// h0 = $rt[15:0], h4 = $rt[79:64]
// $rd low 64 = {h0, h0, h0, h0}
// $rd high 64 = {h4, h4, h4, h4}
// qbert.elf hits `pcpyh $v1, $t0` (instr 0x70081EE9) at PC
// 0x00110BB4 — first call that uses MMI3 with sa=0x1B after
// Ch299's library-gate unblock.
localparam logic [4:0] MMI3_PCPYH = 5'h1B;
// Ch271 — R5900 EE Store Quadword. opcode=0x1F, I-type:
// sq rt, imm(base) → store 128 bits ([rt[127:0]]) to mem[base+imm].
// First real-ELF (qbert.elf) hit `sq $zero, 0($v0)` 8 instructions
// into its prolog. Minimal scope: 4-beat 32-bit-stripe write
// through the existing 32-bit map_wr_data port. Upper 96 bits of
// GPRs aren't modelled, so for non-$zero rt the lower 32 land at
// beat 0 and beats 1-3 write zero (degrades gracefully for the
// common "clear a 128-bit slot" use case). Requires 16-byte
// alignment; misaligned trips the existing AdES path.
localparam logic [5:0] OP_SQ = 6'h1F;
// Ch275 — R5900 EE Store Doubleword. MIPS-III opcode 0x3F:
// sd rt, imm(base) → store 64 bits ([rt[63:0]]) to mem[base+imm].
// qbert.elf hits `sd $ra, 0x20($sp)` (0xFFBF0020) in a function
// prologue at PC 0x00112DAC. Modelled as a 2-beat 32-bit-stripe
// write FSM through the existing map_wr port. Upper 32 bits of
// GPRs aren't modelled, so beat 0 lands rt_val[31:0] and beat 1
// writes 0 — matching the SQ approximation. Requires 8-byte
// alignment; misaligned falls into the existing AdES path.
localparam logic [5:0] OP_SD = 6'h3F;
// Ch284 — MIPS-III LD (Load Doubleword), opcode 0x37. The read-
// side of SD. 2-beat 32-bit load FSM (sq_beat counter, terminal
// beat = 1) reusing the LQ map-driver beat addressing. Beat 0
// captures mem[ea+0] into gpr128[rt][31:0] and mirrors low 32 to
// regfile[rt]; beat 1 captures mem[ea+4] into gpr128[rt][63:32].
// gpr128[rt][127:64] is unchanged (architectural LD only loads
// doubleword; upper 64 of $rt are preserved in real R5900). 8-byte
// alignment required; misaligned trips the existing AdEL path.
// qbert.elf hits `ld $ra, 0($sp)` (0xDFBF0000: base field = 29 =
// $sp, rt field = 31 = $ra) at PC 0x00113378 — the
// function-epilogue $ra restore.
localparam logic [5:0] OP_LD = 6'h37;
// Ch178 — CACHE accepted as legal no-op. The EE BIOS issues
// CACHE (often Hit-Invalidate or Index-Invalidate of the
// I-cache, op=0x05) after copying exception handlers / trampolines
// into RAM to flush stale icache lines. The stub has no modeled
// caches, so executing CACHE as a semantic NOP (no register/memory
// side effects, PC advances normally) is the standard pragmatic
// choice — see Codex Ch178 caution: accept the whole CACHE class
// regardless of sub-op (`rt` field). If a later chapter actually
// models cache coherency, this is the natural place to revisit.
localparam logic [5:0] OP_CACHE = 6'h2F;
localparam logic [5:0] FUNC_SLL = 6'h00;
// Ch276 — R5900 DSLL (Doubleword Shift Left Logical), MIPS-III
// SPECIAL funct 0x38. Architecturally a 64-bit left shift by
// `sa` (sa ∈ [0,31]). Our regfile is 32-bit, so for any valid
// sa < 32 the low 32-bit result is identical to SLL. PS2 ELFs
// use DSLL to build 64-bit constants and to do unsigned shifts
// where the implicit 64-bit width matters; qbert.elf hits
// `dsll $t1, $t1, 16` at PC 0x00112C54.
localparam logic [5:0] FUNC_DSLL = 6'h38;
localparam logic [5:0] FUNC_SRL = 6'h02;
localparam logic [5:0] FUNC_SRA = 6'h03;
// Ch67 — variable-shift family. Operand = rt_val, shift amount =
// rs_val[4:0], destination = rd. Surfaced by the real-BIOS
// copied-code region (first hit SRLV at pc=0x001A459C after
// Ch64/Ch65 mirrors); SLLV/SRAV are the direct siblings and tend
// to co-occur in shift-heavy code paths.
localparam logic [5:0] FUNC_SLLV = 6'h04;
localparam logic [5:0] FUNC_SRLV = 6'h06;
localparam logic [5:0] FUNC_SRAV = 6'h07;
localparam logic [5:0] FUNC_JR = 6'h08;
localparam logic [5:0] FUNC_JALR = 6'h09;
localparam logic [5:0] FUNC_SYSCALL = 6'h0C;
// Ch292 — MIPS SYNC (SPECIAL funct 0x0F). Memory-ordering barrier;
// architecturally orders prior loads/stores against subsequent ones.
// In this in-order stub model with no cache coherency, store buffer,
// or out-of-order memory, SYNC has no visible side effect. qbert.elf
// hits the canonical encoding 0x0000000F (rs/rt/rd/sa all zero) at
// PC 0x00112994 — the post-registration memory barrier after the
// Ch290/291 paired Add+Enable handler calls.
localparam logic [5:0] FUNC_SYNC = 6'h0F;
// SPECIAL-encoded HI/LO ops (ch43 addition):
// MFHI=0x10, MFLO=0x12, DIVU=0x1B.
// MFHI shares its numeric value with RFE below (both 6'h10),
// but RFE lives under OP_COP0 while MFHI lives under OP_SPECIAL
// — different opcodes so the collision is harmless.
localparam logic [5:0] FUNC_MFHI = 6'h10;
localparam logic [5:0] FUNC_MFLO = 6'h12;
localparam logic [5:0] FUNC_MULTU = 6'h19;
localparam logic [5:0] FUNC_DIVU = 6'h1B;
localparam logic [5:0] FUNC_RFE = 6'h10;
// Ch286 — R5900 EI (Enable Interrupts), EE-specific extension to
// the MIPS COP0 CO sub-table. qbert.elf hits the exact 32-bit
// encoding 0x42000038 (opcode=COP0, rs=CO, rt/rd/sa=0, funct=0x38)
// at PC 0x001000FC during init. Codex framing: decode the EXACT
// 32-bit instruction, accept it as side-effect-free (no GPR
// writeback, PC += 4, no halt/trap), and do NOT NOP-class all
// COP0/CO instructions. The companion DI (funct 0x39) is left
// trapping until a future ELF surfaces it.
localparam logic [31:0] EI_INSTR_R5900 = 32'h4200_0038;
localparam logic [5:0] FUNC_ADD = 6'h20;
localparam logic [5:0] FUNC_ADDU = 6'h21;
// Ch272 — R5900 DADDU (doubleword add unsigned), MIPS-III SPECIAL
// funct 0x2D. Architecturally a 64-bit unsigned add, with no
// overflow trap. Our regfile is 32-bit, so we model it as ADDU
// (low 32 bits only); upper 32 bits silently dropped as
// elsewhere. PS2 ELFs use DADDU as the canonical 64-bit
// `move rd, rs` (= daddu rd, rs, $zero) pseudo-instruction;
// qbert.elf hits this at PC 0x00100068.
localparam logic [5:0] FUNC_DADDU = 6'h2D;
localparam logic [5:0] FUNC_SUB = 6'h22;
localparam logic [5:0] FUNC_SUBU = 6'h23;
// Ch305 — R5900 DSUBU (doubleword subtract unsigned), MIPS-III
// SPECIAL funct 0x2F. The 64-bit-subtract sibling of DADDU
// (Ch272). Our regfile is 32-bit, so modelled as SUBU on the low
// 32 bits, no overflow trap (matching the DADDU low-32
// approximation). qbert.elf hits `dsubu $v0, $v1, $v0`
// (instr 0x0062102F) at PC 0x00110A60.
localparam logic [5:0] FUNC_DSUBU = 6'h2F;
localparam logic [5:0] FUNC_AND = 6'h24;
localparam logic [5:0] FUNC_OR = 6'h25;
localparam logic [5:0] FUNC_XOR = 6'h26;
localparam logic [5:0] FUNC_NOR = 6'h27;
localparam logic [5:0] FUNC_SLT = 6'h2A;
localparam logic [5:0] FUNC_SLTU = 6'h2B;
localparam logic [4:0] COP0_RS_MF = 5'h00;
localparam logic [4:0] COP0_RS_MT = 5'h04;
localparam logic [4:0] COP0_RS_CO = 5'h10;
localparam logic [4:0] COP0_REG_BADVADDR = 5'd8; // ch49 addition
localparam logic [4:0] COP0_REG_COUNT = 5'd9;
localparam logic [4:0] COP0_REG_STATUS = 5'd12;
localparam logic [4:0] COP0_REG_CAUSE = 5'd13;
localparam logic [4:0] COP0_REG_EPC = 5'd14;
// Ch49: MIPS exception code values for synchronous address
// errors. EXC_CODE_ADEL fires on a misaligned load (LW/LH/LHU);
// EXC_CODE_ADES fires on a misaligned store (SW/SH). Other
// ExcCode values (Int=0, Sys=8, Bp=9, RI=10, CpU=11, Ov=12,
// Tr=13) are not yet modelled — they'll land when real BIOS
// code demands them, same way the opcode batches grew earlier.
localparam logic [4:0] EXC_CODE_INT = 5'd0;
localparam logic [4:0] EXC_CODE_ADEL = 5'd4;
localparam logic [4:0] EXC_CODE_ADES = 5'd5;
// ------------------------------------------------------------------
// FSM state
// ------------------------------------------------------------------
// Core FSM state encoding (4 bits wide). Encodings 0x0-0x7 are the
// base states. 0x8 and 0x9 belong to the Ch215 labeled sim-only
// jmp_buf restore for SYSCALL #8 ($v1=8, $a0=2): a 12-step LW walk
// over the frame at CH215_JMPBUF_BASE (the hardcoded constant
// 0xA000B1E0 proven by Ch214). Every step issues one LW in
// S_CH215_REQ and captures the returned word into the matching
// regfile slot in S_CH215_WAIT.
typedef enum logic [3:0] {
    S_IDLE        = 4'h0,
    S_IFETCH_REQ  = 4'h1,
    S_IFETCH_WAIT = 4'h2,
    S_EXECUTE     = 4'h3,
    S_MEM_REQ     = 4'h4,
    S_MEM_WAIT    = 4'h5,
    S_MEM_WRITE   = 4'h6,
    S_HALT        = 4'h7,
    S_CH215_REQ   = 4'h8,
    S_CH215_WAIT  = 4'h9
} state_e;

// Current FSM state.
state_e state;
// Ch215 — step index of the jmp_buf restore FSM; walks the 12
// slots as 0..11.
logic [3:0]   ch215_count;

// Ch271 — beat index of an in-flight SQ. Counts 0..3; each beat
// writes one 32-bit lane at ea + beat*4.
logic [1:0]   sq_beat;

// pc feeds the branch/jump target arithmetic; instr is the word
// the decode section slices into fields.
logic [31:0]  pc, instr;

// Canonical 32-bit scalar register file.
logic [31:0]  regfile [0:31];

// Ch283 — 128-bit GPR shadow, kept in parallel with regfile. The
// R5900 architectural register is 128 bits wide, while every
// earlier chapter lived inside a "low 32 only" approximation.
// Scalar writes mirror their result into gpr128[X][31:0] and zero
// the upper 96 bits (matching the R5900 rule that scalar ops clear
// the destination's upper bits). Only MMI ops and LQ produce
// non-zero upper bits; PCPYUD (Ch283) and similar upper-half
// readers consume them from here. regfile stays the canonical
// scalar surface, so the existing decode/ALU/load logic is
// untouched.
logic [127:0] gpr128 [0:31];

// Ch43 — architectural HI/LO pair. Written by DIVU (and later
// DIV/MULT/MULTU when added), read back via MFHI/MFLO. Reset
// value is 0. Divide-by-zero policy: HI/LO keep their previous
// values — deterministic and simplest; real HW leaves the result
// of DIVU with rt=0 undefined, so any consistent stub behaviour is
// spec-compatible.
logic [31:0]  hi_reg, lo_reg;

logic         branch_pending;
logic [31:0]  branch_target;
logic         instr_in_delay_slot;

// Individual COP0 Status fields (packed into status_word).
logic         status_iec;
logic         status_iep;
logic         status_ieo;
logic         status_kuc;
logic         status_kup;
logic         status_kuo;
logic [7:0]   status_im;

// Ch50 — Status.BEV (bit 22), selecting the exception vector base.
// Holds 1 after reset when INIT_BEV=1, so exceptions vector to
// BEV_EXC_VECTOR; the BIOS clears it through MTC0 once its RAM
// handler is in place, after which exceptions vector to
// EXC_VECTOR.
logic         status_bev;

// COP0 Cause fields and EPC.
logic [4:0]   cause_exc_code;
logic [7:0]   cause_ip_sw;
logic [31:0]  epc;

// Ch49 — COP0 BadVAddr: the offending effective address of a
// synchronous address-error exception (AdEL/AdES). Read-only via
// MFC0. Real HW also updates it on TLB exceptions, which are not
// modelled yet. Stays 0 until the first fault.
logic [31:0]  badvaddr;

// COP0 Count (reg 9): free-running 32-bit cycle counter that
// increments every clock and resets to 0. Read-only through
// MFC0 $9 — MTC0 writes are silently dropped by the existing
// cop0_write default case. (Real hardware lets software reset
// Count, but no BIOS dependence on that has been seen yet;
// revisit if a real-BIOS wait loop breaks because the counter was
// not reset.)
//
// Rate: one tick per CPU clock. The real R5900 Count ticks at half
// the CPU clock, but any BIOS polling loop shaped like
//     do { x = mfc0 $9; } while (x < target);
// still exits within a bounded number of cycles — the only visible
// effect of the faster rate is that waits end sooner in sim than
// on hardware, which is acceptable for bring-up.
logic [31:0]  cop0_count;
// Cause.IP view: bits 1:0 come from the software-owned
// cause_ip_sw[1:0], bit 2 follows the cpu_irq input, and bits 7:3
// read as zero.
logic [7:0] cause_ip;
always_comb begin
    cause_ip = {5'd0, cpu_irq, cause_ip_sw[1:0]};
end
// 32-bit COP0 Status and Cause images assembled from the individual
// field registers. Every bit not listed below reads as zero.
//
//   status_word: [22] BEV (Ch50), [15:8] IM,
//                [5] KUo, [4] IEo, [3] KUp, [2] IEp, [1] KUc, [0] IEc
//   cause_word:  [15:8] IP, [6:2] ExcCode
logic [31:0] status_word;
logic [31:0] cause_word;
always_comb begin
    status_word = {
        9'd0,            // [31:23]
        status_bev,      // [22]    Ch50
        6'd0,            // [21:16]
        status_im,       // [15:8]
        2'd0,            // [7:6]
        status_kuo,      // [5]
        status_ieo,      // [4]
        status_kup,      // [3]
        status_iep,      // [2]
        status_kuc,      // [1]
        status_iec       // [0]
    };
    cause_word = {
        16'd0,           // [31:16]
        cause_ip,        // [15:8]
        1'b0,            // [7]
        cause_exc_code,  // [6:2]
        2'd0             // [1:0]
    };
end
// ------------------------------------------------------------------
// Decode — combinational extraction from `instr`
// ------------------------------------------------------------------
// Raw instruction fields sliced out of `instr`.
logic [5:0]   opcode, func;
logic [4:0]   rs_idx, rt_idx, rd_idx, shamt;
logic [15:0]  imm16;
logic [25:0]  imm26;

// Immediate extensions, branch/jump targets, 32-bit operand reads
// and the load/store effective address.
logic [31:0]  imm_sx, imm_zx;
logic [31:0]  branch_offset, branch_tgt, j_tgt;
logic [31:0]  rs_val, rt_val;
logic [31:0]  ea;

// Ch283 — full-width operand reads taken from the gpr128 shadow,
// for MMI ops and LQ-consuming downstream code that needs the
// 128-bit architectural view. $0 reads as 128'd0 (architectural).
logic [127:0] rs128_val, rt128_val;
// Instruction field extraction (MIPS R/I/J-type layouts).
assign opcode = instr[31:26];
assign rs_idx = instr[25:21];
assign rt_idx = instr[20:16];
assign rd_idx = instr[15:11];
assign shamt  = instr[10:6];
assign imm16  = instr[15:0];
assign imm26  = instr[25:0];
assign func   = instr[5:0];

// 16-bit immediate, sign- and zero-extended to 32 bits.
assign imm_sx = {{16{imm16[15]}}, imm16};
assign imm_zx = {16'd0, imm16};

// PC-relative branch target: the sign-extended word offset is
// relative to the delay-slot address (pc + 4), where `pc` is the
// address of the branch instruction itself.
assign branch_offset = {{14{imm16[15]}}, imm16, 2'b00};
assign branch_tgt    = pc + 32'd4 + branch_offset;

// J/JAL target. Architecturally the upper four address bits come
// from the address of the delay-slot instruction (pc + 4), not
// from the jump itself. The two differ only when the jump sits in
// the last word of a 256 MB region, i.e. when pc[27:2] is all
// ones and the +4 carries into bit 28. The reduction-AND supplies
// exactly that carry, so the upper nibble equals (pc + 4)[31:28].
// The sum is self-determined to 4 bits inside the concatenation,
// so it wraps at the top of the address space the same way
// pc + 4 does.
assign j_tgt = {pc[31:28] + {3'b000, &pc[27:2]}, imm26, 2'b00};

// Register operand reads. $0 is hard-wired to zero on both the
// 32-bit scalar view and the 128-bit gpr128 shadow view.
assign rs_val    = (rs_idx == 5'd0) ? 32'd0  : regfile[rs_idx];
assign rt_val    = (rt_idx == 5'd0) ? 32'd0  : regfile[rt_idx];
assign rs128_val = (rs_idx == 5'd0) ? 128'd0 : gpr128[rs_idx];
assign rt128_val = (rt_idx == 5'd0) ? 128'd0 : gpr128[rt_idx];

// Load/store effective address: base register + sign-extended
// immediate.
assign ea = rs_val + imm_sx;
// One-bit decode flags, one per recognized instruction or
// instruction class.
// -- SPECIAL-encoded control, logic, arithmetic, compare, shifts
logic is_special;
logic is_syscall, is_sync;
logic is_jr, is_jalr;
logic is_and, is_or, is_xor, is_nor;
logic is_add, is_addu, is_daddu;
logic is_sub, is_subu, is_dsubu;
logic is_slt, is_sltu;
logic is_sll, is_srl, is_sra, is_dsll;
logic is_sllv, is_srlv, is_srav;
logic is_rtype_alu;
// -- HI/LO family
logic is_multu, is_divu, is_mfhi, is_mflo, is_hilo_op;
// -- COP0
logic is_cop0, is_mfc0, is_mtc0, is_rfe, is_ei;
logic is_nop_class;
// -- immediate ALU
logic is_lui, is_ori, is_andi;
logic is_addi, is_addiu, is_slti, is_sltiu;
// -- loads / stores
logic is_lw, is_lb, is_lh, is_lhu, is_lbu, is_lq, is_ld;
logic is_sw, is_sb, is_sh, is_sq, is_sd;
// -- branches / jumps
logic is_beq, is_beql, is_bne, is_bnel;
logic is_j, is_jal;
logic is_blez, is_bgtz;
logic is_regimm, is_bltz, is_bgez;
logic is_branch, is_jump;
logic branch_taken;
logic is_taken_branch_or_jump;
// -- Ch178: opcode 0x2F (CACHE), accepted as a no-op
logic is_cache;

// SPECIAL (opcode 0x00): the funct field selects the operation.
assign is_special = (opcode == OP_SPECIAL);

assign is_syscall = (func == FUNC_SYSCALL) && is_special;
assign is_sync    = (func == FUNC_SYNC)    && is_special;  // Ch292
assign is_jr      = (func == FUNC_JR)      && is_special;
assign is_jalr    = (func == FUNC_JALR)    && is_special;

assign is_and     = (func == FUNC_AND)     && is_special;
assign is_or      = (func == FUNC_OR)      && is_special;
assign is_xor     = (func == FUNC_XOR)     && is_special;
assign is_nor     = (func == FUNC_NOR)     && is_special;

assign is_add     = (func == FUNC_ADD)     && is_special;
assign is_addu    = (func == FUNC_ADDU)    && is_special;
assign is_daddu   = (func == FUNC_DADDU)   && is_special;
assign is_sub     = (func == FUNC_SUB)     && is_special;
assign is_subu    = (func == FUNC_SUBU)    && is_special;
assign is_dsubu   = (func == FUNC_DSUBU)   && is_special;  // Ch305

assign is_slt     = (func == FUNC_SLT)     && is_special;
assign is_sltu    = (func == FUNC_SLTU)    && is_special;

assign is_sll     = (func == FUNC_SLL)     && is_special;
assign is_dsll    = (func == FUNC_DSLL)    && is_special;
// Ch278/Ch280 — narrow MMI decodes. A flag is raised only for the
// exact (opcode, funct, sa) triple of a recognized op; any other
// encoding under opcode 0x1C falls through to strict-trap.
logic is_mmi;
logic is_pcpyld;  // Ch278 — MMI2, sa 0x0E
logic is_psubb;   // Ch280 — MMI0, sa 0x09
logic is_pnor;    // Ch281 — MMI3, sa 0x13
logic is_pand;    // Ch282 — MMI2, sa 0x12
logic is_pcpyud;  // Ch283 — MMI3, sa 0x0E
logic is_pcpyh;   // Ch300 — MMI3, sa 0x1B

assign is_mmi = (opcode == OP_MMI);

// funct and sa are matched together as one {funct, sa} key.
assign is_pcpyld = is_mmi && ({func, shamt} == {FUNC_MMI2, MMI2_PCPYLD});
assign is_psubb  = is_mmi && ({func, shamt} == {FUNC_MMI0, MMI0_PSUBB});
assign is_pnor   = is_mmi && ({func, shamt} == {FUNC_MMI3, MMI3_PNOR});
assign is_pand   = is_mmi && ({func, shamt} == {FUNC_MMI2, MMI2_PAND});
assign is_pcpyud = is_mmi && ({func, shamt} == {FUNC_MMI3, MMI3_PCPYUD});
assign is_pcpyh  = is_mmi && ({func, shamt} == {FUNC_MMI3, MMI3_PCPYH});
// Remaining SPECIAL shift decodes (immediate and variable count).
assign is_srl  = (func == FUNC_SRL)  && is_special;
assign is_sra  = (func == FUNC_SRA)  && is_special;
assign is_sllv = (func == FUNC_SLLV) && is_special;
assign is_srlv = (func == FUNC_SRLV) && is_special;
assign is_srav = (func == FUNC_SRAV) && is_special;

// Union of every decode grouped as an R-type ALU op: set when any
// one of the listed flags is set (reduction-OR over the flag list).
assign is_rtype_alu = |{
    is_and,  is_or,   is_xor,   is_nor,
    is_add,  is_addu, is_daddu,            // Ch272 — DADDU
    is_sub,  is_subu, is_dsubu,            // Ch305 — DSUBU
    is_slt,  is_sltu,
    is_sll,  is_srl,  is_sra,
    is_dsll,                               // Ch276 — DSLL
    is_pcpyld,                             // Ch278 — MMI2/PCPYLD
    is_psubb,                              // Ch280 — MMI0/PSUBB
    is_pnor,                               // Ch281 — MMI3/PNOR
    is_pand,                               // Ch282 — MMI2/PAND
    is_pcpyud,                             // Ch283 — MMI3/PCPYUD
    is_pcpyh,                              // Ch300 — MMI3/PCPYH
    is_sllv, is_srlv, is_srav
};
// Ch43 / Ch203 — HI/LO family (all SPECIAL-class):
//   MULTU, DIVU  — produce HI/LO, no rd writeback
//   MFHI, MFLO   — copy HI / LO back into rd
// These are kept out of is_rtype_alu because the producers have no rd
// destination and the MF* readers use a different writeback mux path.
// is_hilo_op is the family-wide strobe; is_nop_class tests it so none
// of these four encodings is classified as unsupported.
assign is_mfhi    = (func == FUNC_MFHI)  && is_special;
assign is_mflo    = (func == FUNC_MFLO)  && is_special;
assign is_multu   = (func == FUNC_MULTU) && is_special;
assign is_divu    = (func == FUNC_DIVU)  && is_special;
assign is_hilo_op = is_mfhi || is_mflo || is_multu || is_divu;
// COP0 decode. The rs field selects the COP0 sub-format: move-from
// (COP0_RS_MF), move-to (COP0_RS_MT) or coprocessor-operation
// (COP0_RS_CO); within CO only the RFE function code is recognised.
assign is_cop0 = (opcode == OP_COP0);
assign is_mtc0 = (rs_idx == COP0_RS_MT) && is_cop0;
assign is_mfc0 = (rs_idx == COP0_RS_MF) && is_cop0;
assign is_rfe  = (func == FUNC_RFE) && (rs_idx == COP0_RS_CO) && is_cop0;
// Ch286 — R5900 EI is matched on the complete 32-bit instruction word
// (EI_INSTR_R5900), so no other COP0/CO encoding is accepted by it.
assign is_ei   = (instr == EI_INSTR_R5900);
// Primary-opcode decode: one strobe per supported opcode value,
// grouped by instruction family.

// Immediate ALU forms.
assign is_lui    = (opcode == OP_LUI);
assign is_andi   = (opcode == OP_ANDI);
assign is_ori    = (opcode == OP_ORI);
assign is_addi   = (opcode == OP_ADDI);
assign is_addiu  = (opcode == OP_ADDIU);
assign is_slti   = (opcode == OP_SLTI);
assign is_sltiu  = (opcode == OP_SLTIU);

// Loads, narrowest to widest.
assign is_lb     = (opcode == OP_LB);
assign is_lbu    = (opcode == OP_LBU);
assign is_lh     = (opcode == OP_LH);
assign is_lhu    = (opcode == OP_LHU);
assign is_lw     = (opcode == OP_LW);
assign is_ld     = (opcode == OP_LD);     // Ch284
assign is_lq     = (opcode == OP_LQ);

// Stores, narrowest to widest.
assign is_sb     = (opcode == OP_SB);
assign is_sh     = (opcode == OP_SH);
assign is_sw     = (opcode == OP_SW);
assign is_sd     = (opcode == OP_SD);
assign is_sq     = (opcode == OP_SQ);

// Conditional branches (including the two branch-likely forms).
assign is_beq    = (opcode == OP_BEQ);
assign is_bne    = (opcode == OP_BNE);
assign is_beql   = (opcode == OP_BEQL);
assign is_bnel   = (opcode == OP_BNEL);
assign is_blez   = (opcode == OP_BLEZ);
assign is_bgtz   = (opcode == OP_BGTZ);

// REGIMM: the rt field selects the actual branch.
assign is_regimm = (opcode == OP_REGIMM);
assign is_bltz   = (rt_idx == REGIMM_BLTZ) && is_regimm;
assign is_bgez   = (rt_idx == REGIMM_BGEZ) && is_regimm;

// Absolute jumps.
assign is_j      = (opcode == OP_J);
assign is_jal    = (opcode == OP_JAL);

// Ch178 — CACHE. The whole opcode is accepted whatever the sub-op in
// the rt field: the stub keeps no cache state that could tell the
// variants apart. The strobe exists so is_nop_class does not classify
// CACHE as unsupported; it is not a branch, load or store, so it
// retires through the ordinary PC-advance path with no register or
// memory write.
assign is_cache  = (opcode == OP_CACHE);
// Control-transfer classification.
//   is_branch  — any supported conditional branch
//   is_jump    — any unconditional transfer (J / JAL / JR / JALR)
assign is_jump   = is_jr || is_jalr || is_j || is_jal;
assign is_branch = is_bltz || is_bgez
                || is_blez || is_bgtz
                || is_beq  || is_bne
                || is_beql || is_bnel;

// branch_taken — the decoded conditional branch's condition holds.
// BEQ/BEQL share the equality test and BNE/BNEL the inequality test
// (Ch274 — BEQL, Ch277 — BNEL); the remaining four compare rs against
// zero as a signed value.
assign branch_taken =
       ((is_beq || is_beql) && (rs_val == rt_val))
    || ((is_bne || is_bnel) && (rs_val != rt_val))
    || (is_bltz && ($signed(rs_val) <  0))
    || (is_bgez && ($signed(rs_val) >= 0))
    || (is_blez && ($signed(rs_val) <= 0))
    || (is_bgtz && ($signed(rs_val) >  0));

// Any instruction that redirects the PC once its delay slot retires.
assign is_taken_branch_or_jump = is_jump || branch_taken;
// Ch274/Ch277 — branch-likely, condition false: the delay slot must be
// squashed. retire_advance reacts to this strobe by stepping the PC by
// 8 rather than 4, so the instruction at PC+4 is never fetched.
// Only BEQL and BNEL are covered; further branch-likely forms would
// each add one more OR term.
logic is_branch_likely_squash;
assign is_branch_likely_squash =
       (is_bnel && (rs_val == rt_val))    // Ch277 — BNEL falls through
    || (is_beql && (rs_val != rt_val));   // Ch274 — BEQL falls through
// is_nop_class — "no handler decodes this instruction". True when:
//   * SPECIAL, but none of the handled SPECIAL groups matched;
//   * COP0, but not MFC0 / MTC0 / RFE / EI (Ch286);
//   * REGIMM, but not BLTZ / BGEZ (other rt encodings such as the
//     BLTZAL / BGEZAL link variants are deliberately unsupported until
//     a real-BIOS path needs them);
//   * any other primary opcode for which no strobe below is asserted.
// Written in "class && !(any handled member)" form, the De Morgan
// equivalent of a chain of negated strobes.
assign is_nop_class =
       (is_special && !(   is_syscall || is_jr || is_jalr
                        || is_rtype_alu
                        || is_hilo_op
                        || is_sync))                     // Ch292 — narrow SYNC
    || (is_cop0    && !(   is_mfc0 || is_mtc0 || is_rfe
                        || is_ei))                       // Ch286 — narrow EI
    || (is_regimm  && !(is_bltz || is_bgez))
    || !(   is_special || is_cop0 || is_regimm
         // immediate ALU
         || is_lui  || is_ori   || is_andi
         || is_addi || is_addiu
         || is_slti || is_sltiu
         // loads (Ch279 — LQ, Ch284 — LD)
         || is_lw || is_lb || is_lbu || is_lh || is_lhu
         || is_lq || is_ld
         // stores (Ch271 — SQ, Ch275 — SD)
         || is_sw || is_sb || is_sh
         || is_sq || is_sd
         // branches and jumps (Ch274/Ch277 — BEQL/BNEL)
         || is_beq  || is_beql || is_bne || is_bnel
         || is_blez || is_bgtz
         || is_j    || is_jal
         // MMI: only the exact sub-ops decoded so far
         // (Ch278 / Ch280 / Ch281 / Ch282 / Ch283 / Ch300)
         || is_pcpyld || is_psubb || is_pnor
         || is_pand   || is_pcpyud || is_pcpyh
         // Ch178 — CACHE
         || is_cache);

// The all-zero word is the canonical NOP and is never "unsupported".
// Every other is_nop_class instruction is, and requests a trap only
// when STRICT_UNSUPPORTED is set.
logic is_nop_instr, is_unsupported, strict_trap;
assign is_nop_instr   = (instr == 32'd0);
assign is_unsupported = !is_nop_instr && is_nop_class;
assign strict_trap    = is_unsupported && STRICT_UNSUPPORTED;
// Ch47 — address-error (AdEL/AdES) detection on the effective address
// `ea` of the decoded load/store. Required alignment per access size:
//   halfword   (SH / LH / LHU)   ea[0]   == 0
//   word       (SW / LW)         ea[1:0] == 0
//   doubleword (SD / LD)         ea[2:0] == 0   (Ch275 / Ch284)
//   quadword   (SQ / LQ)         ea[3:0] == 0   (Ch271 / Ch279)
// Byte accesses (SB / LB / LBU) are unconstrained and never fault.
// A fault is reported loudly rather than letting the access alias to
// the nearest aligned slot (the ch46 evidence showed that behaviour
// clobbering neighbouring words and confusing post-run analysis).
logic is_half_access, is_word_access, is_dword_access, is_quad_access;
logic is_align_fault, is_align_store;
logic align_trap, align_except;

assign is_half_access  = is_sh || is_lh || is_lhu;
assign is_word_access  = is_sw || is_lw;
assign is_dword_access = is_sd || is_ld;
assign is_quad_access  = is_sq || is_lq;

assign is_align_fault  = (is_half_access  && (ea[0]   != 1'b0))
                      || (is_word_access  && (ea[1:0] != 2'd0))
                      || (is_dword_access && (ea[2:0] != 3'd0))
                      || (is_quad_access  && (ea[3:0] != 4'd0));

// Store vs load: used to pick the AdES or AdEL exception code.
assign is_align_store  = is_sh || is_sw || is_sd || is_sq;

// TRAP_ALIGN_ERROR chooses what a fault does:
//   1 (Ch47) — align_trap: the core halts (debug mode, used by
//              tb_ee_core_align and the default for iteration);
//   0 (Ch49) — align_except: the fault takes the exception path
//              (BadVAddr, Cause ExcCode, EPC, Status IE/KU stack).
assign align_trap      = is_align_fault &&  TRAP_ALIGN_ERROR;
assign align_except    = is_align_fault && !TRAP_ALIGN_ERROR;
// Ch50 — exception vector select, shared by interrupt entry and the
// synchronous AdEL/AdES entry. Status.BEV clear (BIOS has installed
// its RAM handler) selects EXC_VECTOR; Status.BEV set (boot / reset)
// selects the ROM bootstrap vector BEV_EXC_VECTOR.
logic [31:0] exc_target_pc;
assign exc_target_pc = !status_bev ? EXC_VECTOR : BEV_EXC_VECTOR;
// Immediate-form ALU result. Priority select over the immediate
// strobes: the first asserted item wins, and the result is 0 when none
// is asserted.
//   LUI         imm16 placed in the upper halfword
//   ORI / ANDI  zero-extended immediate
//   ADDI/ADDIU  sign-extended immediate, plain wrap-around add
//   SLTI        signed compare against the sign-extended immediate
//   SLTIU       unsigned compare against the sign-extended immediate
logic [31:0] alu_wb;
always_comb begin
    case (1'b1)
        is_lui:                alu_wb = {imm16, 16'd0};
        is_ori:                alu_wb = rs_val | imm_zx;
        is_andi:               alu_wb = rs_val & imm_zx;
        (is_addi || is_addiu): alu_wb = rs_val + imm_sx;
        is_slti:               alu_wb = {31'd0, ($signed(rs_val) < $signed(imm_sx))};
        is_sltiu:              alu_wb = {31'd0, (rs_val < imm_sx)};
        default:               alu_wb = 32'd0;
    endcase
end
// R-type ALU result, low 32 bits (destination = rd). Priority select:
// the first asserted strobe wins, and the result is 0 when none is.
//
//   logic    AND / OR / XOR / NOR    (PAND / PNOR reuse the low-32 form)
//   arith    ADD / ADDU / DADDU, SUB / SUBU / DSUBU
//   compare  SLT (signed), SLTU (unsigned)
//   shifts   SLL / DSLL / SRL / SRA      count = shamt
//            SLLV / SRLV / SRAV          count = rs_val[4:0]  (Ch67)
//   MMI      PCPYLD / PSUBB / PCPYUD / PCPYH, low 32 bits only
//
// ADD/SUB architecturally raise Arithmetic Overflow on signed
// overflow. That exception is not modelled, so ADD behaves as ADDU and
// SUB as SUBU (the same policy as ADDI vs ADDIU). DADDU / DSUBU / DSLL
// yield only the low 32 bits of their result here.
// SRA / SRAV apply >>> to $signed(rt_val) so the sign bit propagates.
// SLL $0,$0,0 (the architectural NOP) still evaluates to 0 here; the
// rd_idx != 0 guard in the writeback path suppresses the register
// write.
logic [31:0] rtype_alu_wb;
always_comb begin
    case (1'b1)
        (is_and || is_pand):             rtype_alu_wb = rs_val & rt_val;
        is_or:                           rtype_alu_wb = rs_val | rt_val;
        is_xor:                          rtype_alu_wb = rs_val ^ rt_val;
        (is_nor || is_pnor):             rtype_alu_wb = ~(rs_val | rt_val);
        (is_add || is_addu || is_daddu): rtype_alu_wb = rs_val + rt_val;
        // Ch305 — DSUBU contributes its low 32 bits.
        (is_sub || is_subu || is_dsubu): rtype_alu_wb = rs_val - rt_val;
        is_slt:   rtype_alu_wb = {31'd0, ($signed(rs_val) < $signed(rt_val))};
        is_sltu:  rtype_alu_wb = {31'd0, (rs_val < rt_val)};
        (is_sll || is_dsll):             rtype_alu_wb = rt_val << shamt;
        // Ch278 — MMI2/PCPYLD: the low doubleword of rd comes from the
        // low doubleword of rt, so the low 32 bits are simply rt_val.
        is_pcpyld:                       rtype_alu_wb = rt_val;
        // Ch280 — MMI0/PSUBB: independent byte lanes, each modulo 256
        // with no borrow crossing a lane boundary. Four lanes here;
        // the 128-bit path covers all sixteen.
        is_psubb: begin
            for (int lane = 0; lane < 4; lane++) begin
                rtype_alu_wb[lane*8 +: 8] =
                    rs_val[lane*8 +: 8] - rt_val[lane*8 +: 8];
            end
        end
        // Ch283 — MMI3/PCPYUD: rd[63:0] = rt[127:64], so the low 32
        // bits of rd are rt[95:64].
        is_pcpyud:                       rtype_alu_wb = rt128_val[95:64];
        // Ch300 — MMI3/PCPYH: rt[15:0] broadcast across the halfword
        // lanes of the low doubleword; low 32 bits = {h0, h0}.
        is_pcpyh:                        rtype_alu_wb = {2{rt128_val[15:0]}};
        is_srl:                          rtype_alu_wb = rt_val >> shamt;
        is_sra:                          rtype_alu_wb = $signed(rt_val) >>> shamt;
        // Ch67 — variable shifts take the count from rs_val[4:0].
        is_sllv:                         rtype_alu_wb = rt_val << rs_val[4:0];
        is_srlv:                         rtype_alu_wb = rt_val >> rs_val[4:0];
        is_srav:                         rtype_alu_wb = $signed(rt_val) >>> rs_val[4:0];
        default:                         rtype_alu_wb = 32'd0;
    endcase
end
// Ch283 — full-width (128-bit) MMI result, evaluated in parallel with
// the low-32 rtype_alu_wb. The writeback stage stores rtype_alu128_wb
// into gpr128[rd] when is_mmi_wb is set, and {96'd0, rtype_alu_wb}
// for every scalar operation.
// Six MMI operations are modelled: PCPYLD, PSUBB, PNOR, PAND, PCPYUD
// and (Ch300) PCPYH. The value is 0 when none of them is decoded.
logic [127:0] rtype_alu128_wb;
logic         is_mmi_wb;
assign is_mmi_wb = is_pcpyld || is_psubb || is_pnor
                || is_pand   || is_pcpyud
                || is_pcpyh;                                  // Ch300
always_comb begin
    rtype_alu128_wb = 128'd0;
    case (1'b1)
        // rd[127:64] = rs[63:0];  rd[63:0] = rt[63:0]
        is_pcpyld: rtype_alu128_wb = {rs128_val[63:0], rt128_val[63:0]};
        // Sixteen independent byte-wide subtractions.
        is_psubb: begin
            for (int lane = 0; lane < 16; lane++) begin
                rtype_alu128_wb[lane*8 +: 8] =
                    rs128_val[lane*8 +: 8] - rt128_val[lane*8 +: 8];
            end
        end
        is_pnor:   rtype_alu128_wb = ~(rs128_val | rt128_val);
        is_pand:   rtype_alu128_wb = rs128_val & rt128_val;
        // rd[127:64] = rs[127:64];  rd[63:0] = rt[127:64]
        is_pcpyud: rtype_alu128_wb = {rs128_val[127:64], rt128_val[127:64]};
        // Ch300 — Parallel Copy Halfword; rs is not used.
        //   rd[63:0]   = four copies of rt[15:0]   (h0)
        //   rd[127:64] = four copies of rt[79:64]  (h4)
        is_pcpyh:  rtype_alu128_wb = { {4{rt128_val[79:64]}},
                                       {4{rt128_val[15:0]}} };
        default: ;   // keep the all-zero value assigned above
    endcase
end
// COP0 read mux: the COP0 register value selected by the instruction's
// rd field. Register numbers without an entry here read as zero.
logic [31:0] cop0_read_val;
always_comb begin
    cop0_read_val = 32'd0;
    unique case (rd_idx)
        COP0_REG_EPC:      cop0_read_val = epc;
        COP0_REG_CAUSE:    cop0_read_val = cause_word;
        COP0_REG_STATUS:   cop0_read_val = status_word;
        COP0_REG_COUNT:    cop0_read_val = cop0_count;
        COP0_REG_BADVADDR: cop0_read_val = badvaddr;
        default: ;         // unlisted register: keep the zero above
    endcase
end
// Count advance. Separate from the main FSM reset block so the
// counter's behaviour is locally self-contained and easy to audit.
// cop0_count is free-running: it is cleared synchronously while rst_n
// is low and otherwise increments by one on every rising clk edge,
// independent of the FSM state. It wraps naturally at 32 bits and is
// readable through the COP0_REG_COUNT entry of the COP0 read mux.
always_ff @(posedge clk) begin
if (!rst_n) cop0_count <= 32'd0;
else cop0_count <= cop0_count + 32'd1;
end
// Destination of the control transfer being decoded. Priority select:
//   JR / JALR      register-indirect, target = rs_val
//   J  / JAL       target = j_tgt
//   anything else  conditional-branch target branch_tgt
logic [31:0] taken_target;
always_comb begin
    case (1'b1)
        (is_jr || is_jalr): taken_target = rs_val;
        (is_j  || is_jal):  taken_target = j_tgt;
        default:            taken_target = branch_tgt;
    endcase
end
// ------------------------------------------------------------------
// Trace book-keeping (captured at retire)
// ------------------------------------------------------------------
// Snapshot of the retiring instruction: its PC and instruction word,
// plus two auxiliary argument words.
logic [31:0] retired_pc, retired_instr;
logic [31:0] retired_arg2, retired_arg3;
// Per-retire classification flags.
logic retired_flag_write,    retired_flag_read;
logic retired_flag_branch,   retired_flag_in_delay;
logic retired_flag_halt,     retired_flag_trap;
logic retired_flag_except,   retired_flag_rfe;
// Retire strobe: cleared by default each cycle in the main FSM and
// set to 1 in the cycle an instruction retires.
logic retire_pulse;
// ------------------------------------------------------------------
// Map-port drive (combinational on state)
// ------------------------------------------------------------------
// Drives the memory-map read and write request ports purely from the
// current FSM state and the decoded instruction. Every output is given
// an idle default first, so any state not listed below issues no
// request. At most one of map_rd_en / map_wr_en is asserted per state.
always_comb begin
map_rd_en = 1'b0;
map_rd_addr = 32'd0;
map_wr_en = 1'b0;
map_wr_addr = 32'd0;
map_wr_data = 32'd0;
map_wr_be = 4'd0;
case (state)
// Instruction fetch: read the word at the current PC.
S_IFETCH_REQ: begin
map_rd_en = 1'b1;
map_rd_addr = pc;
end
S_MEM_REQ: begin
map_rd_en = 1'b1;
// Ch283 — LQ drives a real 4-beat load FSM using
// sq_beat as the counter (same counter SQ uses).
// Ch284 — LD reuses the same beat addressing for 2
// beats. LW/LB/LBU/LH/LHU stay single-beat with
// sq_beat=0.
// Per-beat address = ea + sq_beat*4 (sq_beat is shifted left
// by two via the 2'b00 concatenation).
map_rd_addr = (is_lq || is_ld)
? (ea + {28'd0, sq_beat, 2'b00})
: ea;
end
// Ch215 — drive the read port for the jmp_buf restore
// FSM. Address = base + count*4 (offsets 0..0x2C).
S_CH215_REQ: begin
map_rd_en = 1'b1;
map_rd_addr = CH215_JMPBUF_BASE
+ {26'd0, ch215_count, 2'b00};
end
S_MEM_WRITE: begin
map_wr_en = 1'b1;
// Ch271/Ch275 — multi-beat stores (SQ/SD) use
// `ea + sq_beat*4` for the per-beat address. Single-
// beat stores (SW/SB/SH) ignore sq_beat (which stays
// at 0) and resolve to `ea`.
map_wr_addr = (is_sq || is_sd)
? (ea + {28'd0, sq_beat, 2'b00})
: ea;
if (is_sq || is_sd) begin
// Ch283 — SQ/SD source per-beat from gpr128[rt],
// not the legacy "low 32 only, zeros above" path.
// SQ emits all four 32-bit lanes; SD emits the
// low two lanes (gpr128[rt][63:0]). A store of
// register 0 is forced to zero by the inline
// rt_idx==0 check on every beat below.
case (sq_beat)
2'd0: map_wr_data = (rt_idx == 5'd0) ? 32'd0 : gpr128[rt_idx][31:0];
2'd1: map_wr_data = (rt_idx == 5'd0) ? 32'd0 : gpr128[rt_idx][63:32];
2'd2: map_wr_data = (rt_idx == 5'd0) ? 32'd0 : gpr128[rt_idx][95:64];
2'd3: map_wr_data = (rt_idx == 5'd0) ? 32'd0 : gpr128[rt_idx][127:96];
endcase
map_wr_be = 4'b1111;
end else if (is_sb) begin
// Byte store: place rt_val[7:0] in the addressed byte
// lane (other lanes zero); one-hot byte-enable selects it.
case (ea[1:0])
2'd0: begin
map_wr_data = {24'd0, rt_val[7:0]};
map_wr_be = 4'b0001;
end
2'd1: begin
map_wr_data = {16'd0, rt_val[7:0], 8'd0};
map_wr_be = 4'b0010;
end
2'd2: begin
map_wr_data = {8'd0, rt_val[7:0], 16'd0};
map_wr_be = 4'b0100;
end
2'd3: begin
map_wr_data = {rt_val[7:0], 24'd0};
map_wr_be = 4'b1000;
end
endcase
end else if (is_sh) begin
// Halfword store: place rt_val[15:0] in the addressed
// halfword lane (ea[1] selects). 2-of-4
// byte-enable leaves the other halfword untouched.
// Aligned access assumed (ea[0]==0): a misaligned SH
// raises is_align_fault (Ch47) and is diverted by
// align_trap / align_except in S_EXECUTE before
// S_MEM_WRITE is entered, so ea[0] is not examined here.
case (ea[1])
1'b0: begin
map_wr_data = {16'd0, rt_val[15:0]};
map_wr_be = 4'b0011;
end
1'b1: begin
map_wr_data = {rt_val[15:0], 16'd0};
map_wr_be = 4'b1100;
end
endcase
end else begin
// SW — full word
map_wr_data = rt_val;
map_wr_be = 4'b1111;
end
end
// All other states: keep the idle defaults assigned above.
default: ;
endcase
end
// ------------------------------------------------------------------
// Retire helper — pc advance, branch queuing, exception entry
// ------------------------------------------------------------------
// Invoked by the main FSM when an instruction retires. It
//   1. works out where execution continues (seq_pc),
//   2. notes whether the retiring instruction arms a delayed transfer,
//   3. either enters the interrupt exception or commits the PC and
//      branch-queue update.
// Takes no arguments: it reads decode strobes and COP0 state directly
// and updates pc, branch_pending, branch_target, epc, the Status
// IE/KU stack, cause_exc_code and retired_flag_except with
// non-blocking assignments.
task automatic retire_advance;
    logic [31:0] seq_pc;        // continuation PC if no interrupt is taken
    logic        queue_branch;  // retiring instruction arms a transfer
    logic [31:0] queue_target;  // ...and this is its destination
    logic        irq_ready;     // some pending IP bit is unmasked
    logic        take_irq;      // interrupt exception entered right now

    queue_branch = is_taken_branch_or_jump;
    queue_target = taken_target;
    irq_ready    = |(status_im & cause_ip);
    // An interrupt is taken only with IEc set and an unmasked request
    // pending, and never on an instruction that is arming a branch
    // (presumably so a branch is not separated from its delay slot).
    take_irq     = status_iec && irq_ready && !queue_branch;

    // Ch274/Ch277 — a not-taken BEQL/BNEL squashes its delay slot by
    // skipping straight to PC+8. A not-taken branch-likely is not a
    // taken branch, so queue_branch is 0 in that case and no branch
    // target is queued. Otherwise a previously queued branch resolves
    // now (this instruction was its delay slot), or the PC falls
    // through to PC+4.
    if (is_branch_likely_squash)
        seq_pc = pc + 32'd8;
    else
        seq_pc = branch_pending ? branch_target : (pc + 32'd4);

    if (take_irq) begin
        // Interrupt entry: EPC records the continuation PC, ExcCode is
        // 0, and the KU/IE stack is pushed (c -> p -> o; current
        // bits cleared).
        retired_flag_except <= 1'b1;
        cause_exc_code      <= 5'h00;
        epc                 <= seq_pc;
        status_kuo          <= status_kup;
        status_kup          <= status_kuc;
        status_kuc          <= 1'b0;
        status_ieo          <= status_iep;
        status_iep          <= status_iec;
        status_iec          <= 1'b0;
        branch_pending      <= 1'b0;
        pc                  <= exc_target_pc;   // Ch50: BEV select
    end else begin
        retired_flag_except <= 1'b0;
        pc                  <= seq_pc;
        branch_pending      <= queue_branch;
        if (queue_branch) branch_target <= queue_target;
    end
endtask
// ------------------------------------------------------------------
// Main FSM
// ------------------------------------------------------------------
always_ff @(posedge clk) begin
if (!rst_n) begin
state <= S_IDLE;
pc <= PC_RESET;
instr <= 32'd0;
branch_pending <= 1'b0;
branch_target <= 32'd0;
instr_in_delay_slot <= 1'b0;
ch215_count <= 4'd0;
sq_beat <= 2'd0;
status_iec <= 1'b0;
status_iep <= 1'b0;
status_ieo <= 1'b0;
status_kuc <= 1'b0;
status_kup <= 1'b0;
status_kuo <= 1'b0;
status_im <= 8'd0;
status_bev <= INIT_BEV; // Ch50
cause_exc_code <= 5'd0;
cause_ip_sw <= 8'd0;
epc <= 32'd0;
badvaddr <= 32'd0; // Ch49
retire_pulse <= 1'b0;
retired_pc <= 32'd0;
retired_instr <= 32'd0;
retired_arg2 <= 32'd0;
retired_arg3 <= 32'd0;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b0;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= 1'b0;
retired_flag_except <= 1'b0;
retired_flag_rfe <= 1'b0;
retired_flag_trap <= 1'b0;
trap_o <= 1'b0;
trap_pc_o <= 32'd0;
trap_instr_o <= 32'd0;
for (int i = 0; i < 32; i++) regfile[i] <= 32'd0;
// Ch283 — gpr128 shadow starts at zero everywhere.
for (int i = 0; i < 32; i++) gpr128[i] <= 128'd0;
hi_reg <= 32'd0;
lo_reg <= 32'd0;
end else begin
retire_pulse <= 1'b0;
case (state)
S_IDLE: begin
if (go_i) state <= S_IFETCH_REQ;
end
S_IFETCH_REQ: state <= S_IFETCH_WAIT;
S_IFETCH_WAIT: begin
if (map_rd_valid) begin
instr <= map_rd_data;
instr_in_delay_slot <= branch_pending;
state <= S_EXECUTE;
end
end
S_EXECUTE: begin
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= 32'd0;
retired_arg3 <= 32'd0;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b0;
retired_flag_branch <= is_taken_branch_or_jump;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_except <= 1'b0;
retired_flag_rfe <= 1'b0;
retired_flag_trap <= 1'b0;
if (is_syscall) begin
// Ch199 — narrow _ReturnFromException(2)
// semantics: when SYSCALL fires with the
// exact contract $v1==8 && $a0==2, do NOT
// halt. Ch215 extends this with an optional
// jmp_buf restore (when CH215_JMPBUF_RESTORE_
// ENABLE=1) that walks the BIOS-side jmp_buf
// at CH215_JMPBUF_BASE and loads $ra/$sp/$fp/
// $s0..$s7/$gp from there before resuming.
// Other syscall variants keep the pre-Ch199
// halt path so the ch197 verdict still fires.
if (regfile[3] == 32'd8 && regfile[4] == 32'd2) begin
branch_pending <= 1'b0;
status_iec <= status_iep;
status_iep <= status_ieo;
status_kuc <= status_kup;
status_kup <= status_kuo;
retired_flag_rfe <= 1'b1;
retired_flag_halt <= 1'b0;
retire_pulse <= 1'b1;
if (CH215_JMPBUF_RESTORE_ENABLE) begin
// Ch215 — enter the 12-step jmp_buf
// restore FSM. PC update deferred
// until the FSM has loaded $ra at
// count=0; final transition to
// S_IFETCH_REQ sets pc<-regfile[31].
ch215_count <= 4'd0;
state <= S_CH215_REQ;
end else begin
// Ch199 minimal: PC<-$k0, no GPR restore.
pc <= regfile[26];
state <= S_IFETCH_REQ;
end
end else if (EE_SYSCALL_HLE_ENABLE) begin
// Ch273 — minimal EE syscall HLE for the
// ELF runner. Known $v1 values get a stub
// return; unknown $v1 falls through to
// halt so the next blocker surfaces.
// PC advances to pc+4 (normal user-code
// syscall resume), NOT RFE — that's
// Ch199's path.
case (regfile[3]) // $v1
32'h0000_003C: begin
// EndOfHeap — top-of-usable-RAM.
regfile[2] <= SYSCALL_HEAP_END;
gpr128[2] <= {96'd0, SYSCALL_HEAP_END}; // Ch283
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_003D: begin
// InitMainThread — stub success.
// No scheduler/thread state mutated.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0; // Ch283
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0040: begin
// Ch285 — syscall #64. qbert hits
// this at PC 0x00111D24 with
// $a0=heap-ish, $a1=code-ptr-ish.
// Almost certainly a kernel
// registration / handler-install
// call (the standard PS2 syscall
// table lists names like
// SetVCommonHandler / SetV
// TLBRefillHandler in this slot).
// Per Codex framing, we accept it
// with the least-invasive shape:
// $v0 = 0 ("registered OK") and
// PC += 4. If qbert misbranches
// downstream, revisit and return
// the previous handler pointer
// instead of zero.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0064: begin
// FlushCache — cacheless model, no-op.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0; // Ch283
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0078: begin
// Ch289 — syscall #120. qbert hits
// this at PC 0x00112AA4 with
// $a1=0x00130000, $a2=0x20000000
// (uncached-pointer base), $a3=
// 0x001328C0. Args look like setup/
// registration parameters (likely
// threading/heap/uncached-memory
// related per Codex framing). First
// pass per the Ch285 precedent:
// accept with $v0 = 0 ("kernel
// setup OK") and PC += 4. If qbert
// misbranches downstream, revisit
// and try $a2/$a1 as the return.
// The ELF runner adds a named
// SUMMARY line for this syscall so
// post-run analysis can confirm the
// arg shape without re-reading the
// trace file.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_006B: begin
// Ch304 — syscall #107. qbert hits
// this at PC 0x00111D64 (the 0x6B
// wrapper in Table1 @0x00111D40).
// Ch303 autopsy proved the caller
// at 0x00111B00 IGNORES the return
// value: after `jal 0x00111d60` the
// next instruction (0x111B24) sets
// $a1=0 without reading $v0. So
// $v0=0 is safe. Args at the call:
// $a0=5 (channel), $a1=0,
// $a2=0xFFFFFFFF, $a3=0x00137568.
// Per Codex: add 0x6B ALONE first
// so the next run confirms the
// Ch303-predicted table flow (next
// blocker should be 0x76, 0x44, or
// 0xFFFFFFBD). Ch305 batches the
// rest once the flow is confirmed.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0013: begin
// Ch302 — syscall #19. qbert hits
// this at PC 0x00112A64 right after
// Ch301's 0x17 with LITERALLY
// IDENTICAL args:
// $a0 = 5 (channel id)
// $a1 = 0
// $a2 = 0xFFFFFFFF (sentinel/-1)
// $a3 = 0x00137568 (per-channel
// ctx, same as
// 0x17)
// Second paired-call pattern on the
// syscall track (first was Ch290/291
// 0x12/0x16). 0x17+0x13 are the
// "set + register" pair for channel
// 5's new context. Per Codex: accept
// ($v0 = 0, PC += 4); paired-call
// symmetry makes this well-supported.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0017: begin
// Ch301 — syscall #23. qbert hits
// this at PC 0x00112A84 right after
// Ch300's PCPYH. Args:
// $a0 = 5 (channel id —
// matches the
// Ch290/291
// handler slot)
// $a1 = 0
// $a2 = 0xFFFFFFFF (sentinel/-1)
// $a3 = 0x00137568 (NEW context
// ptr, NOT the
// prior global
// ctx 0x001328C0
// — second
// context-shift
// on syscall
// track)
// PS2 standard table cites syscall
// 23 plausibly as SetVTLBRefill-
// Handler or iWakeupThread. The
// $a0=5 channel-id pattern + $a2=
// -1 sentinel fit a per-channel
// kernel call. Per Codex: accept
// ($v0 = 0, PC += 4); the runner
// observer with distinct-tuple
// tracking will surface whether
// qbert calls 0x17 multiple times
// with varying args.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0077: begin
// Ch297 — syscall #119. qbert hits
// this at PC 0x00111D84 right after
// Ch296's 0x79 acceptance, with a
// NOTICEABLY DIFFERENT arg shape:
// $a0 = 0x001DFD50 (heap address,
// not kseg0 base)
// $a1 = 1
// $a2 = 0
// $a3 = 20 (small int, NOT the
// global ctx
// pointer threaded
// through 0x78/
// 0x12/0x16/0x7A/
// 0x79)
// The $a3 change is the strongest
// "syscall-family boundary" signal
// we've seen — qbert has crossed
// into a different kernel call
// convention. PS2 standard table
// cites syscall 119 as plausibly
// SetVTLBRefillHandler or
// RegisterLibraryEntries. Per Codex:
// accept first-pass ($v0 = 0,
// PC += 4) but treat the runner-side
// observer as the falsifiability
// surface (richer than prior
// observers — first/last args, up
// to 4 distinct tuples).
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0079: begin
// Ch296 — syscall #121. qbert hits
// this at PC 0x00111D94 right after
// Ch295's $a0-aware 0x7A patch
// unblocked the wait loop. Args:
// $a0 = 0x80000000 (kseg0 base —
// same shape as
// Ch293's 0x7A
// init call)
// $a1 = 0
// $a3 = 0x001328C0 (same global
// ctx threaded
// throughout)
// PC sits in the same kernel-
// wrapper neighborhood as the
// Ch289 syscall 0x78 site (PC
// 0x00111D24). Likely an
// adjacent finalize/reset call.
// Per Codex: accept ($v0 = 0,
// PC += 4), keep marked as
// adjacent/experimental like 0x7A.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_007A: begin
// Ch293/Ch295 — syscall #122,
// $a0-aware HLE.
//
// Ch294's autopsy showed qbert calls
// syscall 0x7A with two distinct
// arg shapes and expects different
// return values:
// $a0 = 0x80000000 → $v0 = 0
// (init-style call;
// falls into wait loop)
// $a0 = 0x00000004 → $v0 with
// bit 17 (0x00020000) set
// (poll-style call; bit 17
// is qbert's readiness
// flag, mask in $s0 at
// PC 0x00112418).
// Ch295 is Codex's "Strategy A"
// EXPERIMENTAL unblock — return the
// expected bit shape based on $a0
// alone. Not architectural truth;
// the real PS2 syscall 122 semantics
// are still TBD. If qbert misbranches
// downstream we back this out and
// pursue real-SDK lookup or
// interrupt-delivery wiring.
if (regfile[4] == 32'h0000_0004) begin
regfile[2] <= 32'h0002_0000;
gpr128[2] <= {96'd0, 32'h0002_0000};
end else begin
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
end
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0016: begin
// Ch291 — syscall #22. qbert hits
// this at PC 0x00112A74 with args
// LITERALLY IDENTICAL to the Ch290
// syscall 0x12 call eight
// instructions earlier:
// $a0 = 0x05 (channel/slot)
// $a1 = 0x00112AB0 (fn ptr)
// $a2 = 0x00000000
// $a3 = 0x001328C0 (ctx ptr)
// PS2 standard syscall table cites
// `EnableDmacHandler` (or
// EnableIntcHandler) in this slot —
// the activation companion to Ch290's
// AddDmacHandler. Per Codex framing:
// accept the enable ($v0 = 0, PC +=
// 4); do NOT actually call the
// handler or synthesize a DMAC
// completion. If qbert subsequently
// polls for the handler to fire,
// Ch292 will need to model
// handler-invocation; for now just
// accept the registration+enable
// pair and see what qbert demands.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
32'h0000_0012: begin
// Ch290 — syscall #18. qbert hits
// this at PC 0x00112A54 right after
// Ch289's 0x78 with the classic
// handler-install arg shape:
// $a0 = 0x05 (channel /
// event id /
// handler slot)
// $a1 = 0x00112AB0 (fn pointer
// in code seg)
// $a2 = 0x00000000
// $a3 = 0x001328C0 (ctx ptr,
// same global
// block as
// 0x78's $a3)
// PS2 standard syscall table cites
// names like AddDmacHandler in slot
// 18 ($a0 = DMAC channel; $a0=5 =
// SIF0). Per Codex: accept the
// registration ($v0 = 0, PC += 4),
// do NOT invoke the handler or
// mutate DMAC/INTC state. The ELF
// runner observes args for the
// first occurrence so the SUMMARY
// shows the registration shape.
regfile[2] <= 32'd0;
gpr128[2] <= 128'd0;
pc <= pc + 32'd4;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end
default: begin
// Unhandled — halt; TB reads
// $v1/$a0..$a3 hierarchically for
// the verdict.
retired_flag_halt <= 1'b1;
retire_pulse <= 1'b1;
state <= S_HALT;
end
endcase
end else begin
retired_flag_halt <= 1'b1;
retire_pulse <= 1'b1;
state <= S_HALT;
end
end else if (strict_trap) begin
retired_flag_trap <= 1'b1;
retire_pulse <= 1'b1;
trap_o <= 1'b1;
trap_pc_o <= pc;
trap_instr_o <= instr;
state <= S_HALT;
end else if (align_trap) begin
// Ch47: AdEL/AdES address error, debug-halt
// mode (TRAP_ALIGN_ERROR=1). retired_arg2
// carries the offending EA so traces name
// the byte address that caused the fault.
retired_flag_trap <= 1'b1;
retire_pulse <= 1'b1;
trap_o <= 1'b1;
trap_pc_o <= pc;
trap_instr_o <= instr;
retired_arg2 <= ea;
retired_arg3 <= 32'd0;
state <= S_HALT;
end else if (align_except) begin
// Ch49: AdEL/AdES synchronous exception
// (TRAP_ALIGN_ERROR=0). Take the MIPS
// exception path instead of halting — the
// BIOS presumably has a handler at
// EXC_VECTOR that fixes up the misaligned
// access and RFEs back.
//
// EPC := pc of the FAULTING instruction
// (not next_pc as retire_advance does for
// inter-instruction interrupts). Real MIPS
// additionally sets Cause.BD=1 and writes
// EPC = (branch pc) when the fault fires in
// a delay slot; we don't model Cause.BD yet
// — a delay-slot AdES simply records the
// SW's own pc and the handler is expected
// to cope. Revisit if BIOS emulation
// depends on the BD bit.
epc <= pc;
badvaddr <= ea;
cause_exc_code <= is_align_store ? EXC_CODE_ADES
: EXC_CODE_ADEL;
status_ieo <= status_iep;
status_iep <= status_iec;
status_iec <= 1'b0;
status_kuo <= status_kup;
status_kup <= status_kuc;
status_kuc <= 1'b0;
pc <= exc_target_pc; // Ch50: BEV select
branch_pending <= 1'b0;
// Retire event: treat the fault as an
// exception-retire (flag bit 5), carry ea
// in arg2 so traces name the bad address
// directly, and clear the load/store flags
// since the SW/LW did not actually execute.
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= ea;
retired_arg3 <= 32'd0;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b0;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_except <= 1'b1;
retired_flag_rfe <= 1'b0;
retired_flag_trap <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retire_pulse <= 1'b1;
state <= S_IFETCH_REQ;
end else if (is_lw || is_lb || is_lbu || is_lh || is_lhu) begin
state <= S_MEM_REQ;
end else if (is_lq) begin
// Ch283 — LQ: 4-beat 32-bit load FSM. Beat N
// captures mem[ea + N*4] into the matching
// 32-bit lane of gpr128[rt]. After beat 3, the
// low 32 are mirrored to regfile[rt]. Replaces
// the Ch279 single-beat LW-style approximation.
sq_beat <= 2'd0;
state <= S_MEM_REQ;
end else if (is_ld) begin
// Ch284 — LD: 2-beat 32-bit load FSM mirroring
// SD's beat layout. Beat 0 → gpr128[rt][31:0];
// beat 1 → gpr128[rt][63:32], and on that final
// beat the low 32 bits are mirrored into
// regfile[rt]. gpr128[rt][127:64] is preserved
// (architectural LD only loads a doubleword).
sq_beat <= 2'd0;
state <= S_MEM_REQ;
end else if (is_sw || is_sb || is_sh) begin
state <= S_MEM_WRITE;
end else if (is_sq) begin
// Ch271 — SQ: 4-beat 32-bit write FSM.
// Beat 0 emits the lower 32 bits of rt; beats
// 1-3 emit zero (upper 96 bits of $rt aren't
// modelled). For sq $zero,... all four beats
// are zero — the qbert prolog case.
sq_beat <= 2'd0;
state <= S_MEM_WRITE;
end else if (is_sd) begin
// Ch275 — SD: 2-beat 32-bit write FSM.
// Beat 0 emits rt_val[31:0]; beat 1 emits 0
// (upper 32 bits of $rt aren't modelled). qbert
// does sd $ra, 0x20($sp) in a function prologue.
sq_beat <= 2'd0;
state <= S_MEM_WRITE;
end else begin
if ((is_lui || is_ori || is_andi ||
is_addi || is_addiu ||
is_slti || is_sltiu) && (rt_idx != 5'd0)) begin
regfile[rt_idx] <= alu_wb;
// Ch283 — scalar mirror: zero-extend into the
// 128-bit shadow (R5900 clears upper bits on
// every scalar destination write).
gpr128[rt_idx] <= {96'd0, alu_wb};
end
if (is_rtype_alu && (rd_idx != 5'd0)) begin
regfile[rd_idx] <= rtype_alu_wb;
// Ch283 — MMI ops get the full 128-bit
// result into gpr128; scalar ops just zero-
// extend their 32-bit result. The regfile
// mirror above always lands the low 32
// (rtype_alu_wb is computed accordingly).
if (is_mmi_wb)
gpr128[rd_idx] <= rtype_alu128_wb;
else
gpr128[rd_idx] <= {96'd0, rtype_alu_wb};
end
// Ch203: MULTU writes {HI, LO} = unsigned 64-bit
// product of (rs, rt). No rd writeback (rd bits
// are architecturally ignored). MFLO/MFHI in a
// following instruction observe the result.
if (is_multu) begin
logic [63:0] mu_product;
mu_product = {32'd0, rs_val} * {32'd0, rt_val};
lo_reg <= mu_product[31:0];
hi_reg <= mu_product[63:32];
end
// Ch43: DIVU writes LO=quotient, HI=remainder.
// Divisor==0 is UNDEFINED per MIPS spec; we
// take the deterministic "leave HI/LO
// unchanged" option. Ch162: STRIP_HW_DIVIDER
// gates the `/` and `%` operators away so
// Quartus doesn't infer the 32-bit hardware
// divider on PSMCT32-only hardware builds.
if (!STRIP_HW_DIVIDER && is_divu && (rt_val != 32'd0)) begin
lo_reg <= rs_val / rt_val;
hi_reg <= rs_val % rt_val;
end
// Ch43: MFHI/MFLO move HI/LO into rd. rd==0
// suppresses the write (architectural $0
// protection).
if (is_mfhi && (rd_idx != 5'd0)) begin
regfile[rd_idx] <= hi_reg;
gpr128[rd_idx] <= {96'd0, hi_reg}; // Ch283 mirror
end
if (is_mflo && (rd_idx != 5'd0)) begin
regfile[rd_idx] <= lo_reg;
gpr128[rd_idx] <= {96'd0, lo_reg}; // Ch283 mirror
end
// JAL: link address is pc + 8 (instruction after
// the delay slot). $31 is the architectural $ra.
if (is_jal) begin
regfile[5'd31] <= pc + 32'd8;
gpr128[5'd31] <= {96'd0, pc + 32'd8}; // Ch283 mirror
end
// JALR: same pc+8 link semantics, but the link
// destination is explicit in rd. rd==0 suppresses
// the write (valid JALR encoding for "jump
// indirect without keeping a return address").
if (is_jalr && (rd_idx != 5'd0)) begin
regfile[rd_idx] <= pc + 32'd8;
gpr128[rd_idx] <= {96'd0, pc + 32'd8}; // Ch283 mirror
end
if (is_mfc0 && (rt_idx != 5'd0)) begin
regfile[rt_idx] <= cop0_read_val;
gpr128[rt_idx] <= {96'd0, cop0_read_val}; // Ch283 mirror
end
if (is_mtc0) begin
unique case (rd_idx)
COP0_REG_STATUS: begin
status_iec <= rt_val[0];
status_kuc <= rt_val[1];
status_iep <= rt_val[2];
status_kup <= rt_val[3];
status_ieo <= rt_val[4];
status_kuo <= rt_val[5];
status_im <= rt_val[15:8];
status_bev <= rt_val[22]; // Ch50
end
COP0_REG_CAUSE: begin
cause_exc_code <= rt_val[6:2];
cause_ip_sw[1:0] <= rt_val[9:8];
end
COP0_REG_EPC: epc <= rt_val;
default: ;
endcase
end
if (is_rfe) begin
status_iec <= status_iep;
status_iep <= status_ieo;
status_kuc <= status_kup;
status_kup <= status_kuo;
retired_flag_rfe <= 1'b1;
end
if (is_mfc0) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= cop0_read_val;
end else if (is_mtc0) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= rt_val;
end else if (is_taken_branch_or_jump) begin
retired_arg2 <= taken_target;
retired_arg3 <= 32'd0;
end else if (is_lui || is_ori || is_andi ||
is_addi || is_addiu ||
is_slti || is_sltiu) begin
retired_arg3 <= alu_wb;
end else if (is_rtype_alu) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= rtype_alu_wb;
end else if (is_multu) begin
// Ch203: arg2 = rt_val (so traces show
// both operands together with rs in
// arg0/regfile context), arg3 = LO half
// of the product = the value MFLO would
// read next. The full 64-bit product
// isn't carried in trace events; HI is
// exposed via the MFHI retire below.
retired_arg2 <= rt_val;
retired_arg3 <= ({32'd0, rs_val} * {32'd0, rt_val});
end else if (is_divu) begin
// arg2 = divisor (rt_val) so a div-by-zero
// event is obvious in traces; arg3 =
// quotient, or 0 when the divisor is zero
// (HI/LO are left unchanged in that case, so
// the 0 is a placeholder rather than a value
// that was written).
// Ch162: STRIP_HW_DIVIDER gates the `/`
// here too (arg3 is forced to 0) so Quartus
// doesn't keep an inferred divider just for
// the trace.
retired_arg2 <= rt_val;
retired_arg3 <= (STRIP_HW_DIVIDER || rt_val == 32'd0)
? 32'd0
: (rs_val / rt_val);
end else if (is_mfhi) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= hi_reg;
end else if (is_mflo) begin
retired_arg2 <= {27'd0, rd_idx};
retired_arg3 <= lo_reg;
end
retire_pulse <= 1'b1;
retire_advance();
state <= S_IFETCH_REQ;
end
end
S_MEM_REQ: state <= S_MEM_WAIT;
S_MEM_WAIT: begin
if (map_rd_valid) begin
// Ch283/Ch284 — multi-beat loads. LQ takes 4
// beats (terminal = 2'd3), LD takes 2 (terminal
// = 2'd1). Each beat captures a 32-bit lane
// into gpr128[rt]. On the terminal beat — for
// LQ and LD alike — the low 32 bits (captured
// on beat 0) are mirrored into regfile[rt] and
// the instruction retires, so the value is
// observable immediately on retire.
if (is_lq || is_ld) begin
logic [1:0] terminal_beat;
terminal_beat = is_lq ? 2'd3 : 2'd1;
if (rt_idx != 5'd0) begin
case (sq_beat)
2'd0: gpr128[rt_idx][31:0] <= map_rd_data;
2'd1: gpr128[rt_idx][63:32] <= map_rd_data;
2'd2: gpr128[rt_idx][95:64] <= map_rd_data;
2'd3: gpr128[rt_idx][127:96] <= map_rd_data;
endcase
// Last beat mirrors low 32 to regfile.
// For LQ the low 32 was committed on
// beat 0; for LD the same. By terminal
// beat the NBA has settled, so reading
// gpr128[rt_idx][31:0] is safe.
if (sq_beat == terminal_beat)
regfile[rt_idx] <= gpr128[rt_idx][31:0];
end
if (sq_beat != terminal_beat) begin
sq_beat <= sq_beat + 2'd1;
state <= S_MEM_REQ;
end else begin
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= ea;
retired_arg3 <= map_rd_data;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b1;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_rfe <= 1'b0;
retire_pulse <= 1'b1;
sq_beat <= 2'd0;
retire_advance();
state <= S_IFETCH_REQ;
end
end else begin
// Sub-word loads extract the addressed byte or
// halfword out of the returned 32-bit word.
// LB : byte at ea[1:0], 24-bit sign-extend
// LH : halfword at ea[1], 16-bit sign-extend
// LHU : halfword at ea[1], 16-bit zero-extend
// LW : whole word, as-is
// Halfword access uses only ea[1] — ea[0] must
// be 0 for aligned access (unaligned halfword
// reads are not modelled as an exception yet).
if (rt_idx != 5'd0) begin
// Ch283 — compute the 32-bit load value into
// a temporary, then write both regfile and
// gpr128 (zero-extended into the upper 96).
// LW/LB/LBU/LH/LHU all produce a single
// 32-bit lane; the architectural rule is
// upper bits of $rt clear to 0.
logic [31:0] load_wb;
load_wb = map_rd_data; // LW default
if (is_lb) begin
case (ea[1:0])
2'd0: load_wb = {{24{map_rd_data[7]}}, map_rd_data[7:0]};
2'd1: load_wb = {{24{map_rd_data[15]}}, map_rd_data[15:8]};
2'd2: load_wb = {{24{map_rd_data[23]}}, map_rd_data[23:16]};
2'd3: load_wb = {{24{map_rd_data[31]}}, map_rd_data[31:24]};
endcase
end else if (is_lbu) begin
case (ea[1:0])
2'd0: load_wb = {24'd0, map_rd_data[7:0]};
2'd1: load_wb = {24'd0, map_rd_data[15:8]};
2'd2: load_wb = {24'd0, map_rd_data[23:16]};
2'd3: load_wb = {24'd0, map_rd_data[31:24]};
endcase
end else if (is_lh) begin
case (ea[1])
1'b0: load_wb = {{16{map_rd_data[15]}}, map_rd_data[15:0]};
1'b1: load_wb = {{16{map_rd_data[31]}}, map_rd_data[31:16]};
endcase
end else if (is_lhu) begin
case (ea[1])
1'b0: load_wb = {16'd0, map_rd_data[15:0]};
1'b1: load_wb = {16'd0, map_rd_data[31:16]};
endcase
end
regfile[rt_idx] <= load_wb;
gpr128[rt_idx] <= {96'd0, load_wb};
end
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= ea;
retired_arg3 <= map_rd_data;
retired_flag_write <= 1'b0;
retired_flag_read <= 1'b1;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_rfe <= 1'b0;
retire_pulse <= 1'b1;
retire_advance();
state <= S_IFETCH_REQ;
end
end
end
S_MEM_WRITE: begin
if ((is_sq && sq_beat != 2'd3)
|| (is_sd && sq_beat != 2'd1)) begin
// Ch271/Ch275 — multi-beat store mid-beats:
// drive the next 32-bit lane on the next cycle.
// Don't retire yet — the single architectural
// instruction (SQ=4 beats / SD=2 beats) maps to
// one retire event.
sq_beat <= sq_beat + 2'd1;
// stay in S_MEM_WRITE
end else begin
retired_pc <= pc;
retired_instr <= instr;
retired_arg2 <= ea;
retired_arg3 <= rt_val;
retired_flag_write <= 1'b1;
retired_flag_read <= 1'b0;
retired_flag_branch <= 1'b0;
retired_flag_halt <= 1'b0;
retired_flag_in_delay <= instr_in_delay_slot;
retired_flag_rfe <= 1'b0;
retire_pulse <= 1'b1;
retire_advance();
sq_beat <= 2'd0;
state <= S_IFETCH_REQ;
end
end
S_HALT: state <= S_HALT;
// Ch215 — jmp_buf restore FSM.
S_CH215_REQ: state <= S_CH215_WAIT;
S_CH215_WAIT: begin
if (map_rd_valid) begin
// Store the loaded word into the canonical
// regfile slot per the Ch212 field-map.
case (ch215_count)
4'd0: begin regfile[31] <= map_rd_data; gpr128[31] <= {96'd0, map_rd_data}; end // $ra at +0x00
4'd1: begin regfile[29] <= map_rd_data; gpr128[29] <= {96'd0, map_rd_data}; end // $sp at +0x04
4'd2: begin regfile[30] <= map_rd_data; gpr128[30] <= {96'd0, map_rd_data}; end // $fp at +0x08
4'd3: begin regfile[16] <= map_rd_data; gpr128[16] <= {96'd0, map_rd_data}; end // $s0 at +0x0C
4'd4: begin regfile[17] <= map_rd_data; gpr128[17] <= {96'd0, map_rd_data}; end // $s1 at +0x10
4'd5: begin regfile[18] <= map_rd_data; gpr128[18] <= {96'd0, map_rd_data}; end // $s2 at +0x14
4'd6: begin regfile[19] <= map_rd_data; gpr128[19] <= {96'd0, map_rd_data}; end // $s3 at +0x18
4'd7: begin regfile[20] <= map_rd_data; gpr128[20] <= {96'd0, map_rd_data}; end // $s4 at +0x1C
4'd8: begin regfile[21] <= map_rd_data; gpr128[21] <= {96'd0, map_rd_data}; end // $s5 at +0x20
4'd9: begin regfile[22] <= map_rd_data; gpr128[22] <= {96'd0, map_rd_data}; end // $s6 at +0x24
4'd10: begin regfile[23] <= map_rd_data; gpr128[23] <= {96'd0, map_rd_data}; end // $s7 at +0x28
4'd11: begin regfile[28] <= map_rd_data; gpr128[28] <= {96'd0, map_rd_data}; end // $gp at +0x2C
default: ;
endcase
if (ch215_count == 4'd11) begin
// Done — set $v0=1 (longjmp-style return
// value so post-setjmp `beq $v0,$0` at
// 0xBFC52350 falls through to the
// longjmp-return path), set PC to the
// loaded $ra (committed at count==0).
regfile[2] <= 32'd1;
gpr128[2] <= 128'd1; // Ch283 mirror
pc <= regfile[31];
state <= S_IFETCH_REQ;
end else begin
ch215_count <= ch215_count + 4'd1;
state <= S_CH215_REQ;
end
end
end
default: state <= S_IDLE;
endcase
end
end
// Status outputs driven combinationally from the FSM registers:
//   pc_o   — current value of the program-counter register.
//   halt_o — high while the FSM is in S_HALT (the S_HALT case arm
//            re-assigns S_HALT to itself, so it stays asserted once set).
assign pc_o   = pc;
assign halt_o = (state == S_HALT);
// ------------------------------------------------------------------
// Trace emission — one event per retire, SUBSYS_EE
// ------------------------------------------------------------------
always_ff @(posedge clk) begin
    // ev_valid is a single-cycle strobe. It defaults low every clock and
    // is raised only by the retire branch below; when both assignments
    // are scheduled in the same cycle the later non-blocking one wins.
    ev_valid <= 1'b0;

    if (!rst_n) begin
        // Synchronous, active-low reset: tag the channel as SUBSYS_EE /
        // EV_IFETCH and zero the payload.
        ev_subsys <= SUBSYS_EE;
        ev_event  <= EV_IFETCH;
        ev_arg0   <= 64'd0;
        ev_arg1   <= 64'd0;
        ev_arg2   <= 64'd0;
        ev_arg3   <= 64'd0;
        ev_flags  <= 32'd0;
    end else if (retire_pulse) begin
        // One trace event per retire. Each retired_* value is prefixed
        // with 32 zero bits to form the event argument.
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_EE;
        ev_event  <= EV_IFETCH;
        ev_arg0   <= {32'd0, retired_pc};     // pc of the retired instruction
        ev_arg1   <= {32'd0, retired_instr};  // instruction word
        ev_arg2   <= {32'd0, retired_arg2};   // per-instruction detail
        ev_arg3   <= {32'd0, retired_arg3};   // per-instruction detail
        // Flag packing, MSB first. Bit numbers below assume every
        // retired_flag_* is a single bit — TODO confirm against the
        // declarations.
        ev_flags  <= {24'd0,
                      retired_flag_trap,      // bit 7
                      retired_flag_rfe,       // bit 6
                      retired_flag_except,    // bit 5
                      retired_flag_in_delay,  // bit 4
                      retired_flag_halt,      // bit 3
                      retired_flag_branch,    // bit 2
                      retired_flag_read,      // bit 1
                      retired_flag_write};    // bit 0
    end
    // No retire this cycle: payload registers keep their previous
    // contents; only ev_valid drops (default above).
end
endmodule : ee_core_stub