ec82764bef
RTL (GS rasterizer, EE core stub, platform bridge, LPDDR4B path), sim regression (272 TBs), docs, and tooling. Copyrighted PS2 content (BIOS, game code, GS dumps, and all dump-derived textures/traces) is excluded via .gitignore and stays local. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2155 lines
113 KiB
Systemverilog
2155 lines
113 KiB
Systemverilog
// retroDE_ps2 — ee_core_stub
|
|
//
|
|
// First EE-side execution primitive. Structural mirror of
|
|
// `iop_core_stub` — same multi-cycle FSM, same R3000 subset, same
|
|
// delay-slot discipline, same strict-unsupported gate, same COP0
|
|
// exception-entry shape. The files are kept separate because the EE
|
|
// is fundamentally an R5900 (MIPS III + SCE extensions + VU) and will
|
|
// eventually need 64-bit register widths, COP1/COP2, and VU-side
|
|
// plumbing the IOP will never grow. For the first EE chapter the core
|
|
// is deliberately as narrow as iop_core_stub so the earliest EE
|
|
// programs can exercise fetch + decode + branches + halt before any
|
|
// of that complexity is needed.
|
|
//
|
|
// Where the two cores differ right now:
|
|
// - Trace subsystem: SUBSYS_EE (iop_core_stub uses SUBSYS_IOP).
|
|
// - Default PC_RESET and EXC_VECTOR are the same architectural
|
|
// values (0xBFC0_0000 / 0x0000_0080) — real R5900 and real R3000
|
|
// both reset to 0xBFC0_0000.
|
|
//
|
|
// Supported opcodes (MIPS encoding):
|
|
// SPECIAL (opcode = 0x00):
|
|
// func 0x00 (SLL), 0x02 (SRL), 0x03 (SRA) — shift family
|
|
// (destination = rd; operand = rt; count = shamt [10:6].
|
|
// SRA uses arithmetic right shift so the MSB sign-extends;
|
|
// SRL zero-fills. SLL $0,$0,0 is the canonical NOP encoding
|
|
// and flows through this path harmlessly — the rd_idx=0
|
|
// guard suppresses the writeback.)
|
|
// func 0x08 (JR), func 0x09 (JALR), func 0x0C (SYSCALL),
|
|
// (JALR: register-indirect call. Target is rs_val (same
|
|
// path as JR), link address pc+8 is written to rd_idx.
|
|
// rd_idx==0 suppresses the link write — valid encoding
|
|
// for "jump indirect without keeping a return address".
|
|
// First real-BIOS trip at 0xBFC5_29E8 after SH.)
|
|
// func 0x20 (ADD), 0x21 (ADDU), 0x22 (SUB), 0x23 (SUBU)
|
|
// (ADD/SUB implemented as ADDU/SUBU — same pragmatic policy
|
|
// as ADDI vs ADDIU; Arithmetic Overflow exception deferred.
|
|
// First real-BIOS ADD trip at 0x0000_060C (RAM-resident
|
|
// code) after JALR unlocked retired=84112.)
|
|
// func 0x24 (AND), 0x25 (OR), 0x26 (XOR), 0x27 (NOR)
|
|
// func 0x2A (SLT), 0x2B (SLTU) — R-type compare, register form
|
|
// pair of SLTI/SLTIU (first real-BIOS SLTU trip at 0xBFC0_2644)
|
|
// (R-type ALU family; destination is rd, not rt.
|
|
// OR was the first trip in the logic subset at 0xBFC0_2074;
|
|
// the rest landed because they share the exact same R-type
|
|
// plumbing. ADDU was the first arith trip at 0xBFC0_2640 after
|
|
// JAL landed, paired with SUBU in the same addition.)
|
|
// other funcs → NOP.
|
|
// 0x02 J 0x03 JAL 0x04 BEQ 0x05 BNE
|
|
// (JAL: jump-and-link; target is the same form as J, but writes
|
|
// pc+8 into $31. First real-BIOS trip at 0xBFC0_23D0 after LB
|
|
// unlocked retired=1714.)
|
|
// 0x08 ADDI (behaves as ADDIU — we do not model the Arithmetic
|
|
// Overflow exception yet; real BIOS emits ADDI where
|
|
// overflow cannot happen in practice)
|
|
// 0x09 ADDIU
|
|
// 0x0A SLTI 0x0B SLTIU (first evidence-driven growth past IOP's
|
|
// subset — real BIOS hit SLTI at
|
|
// 0xBFC0_0008 under strict mode, then ADDI
|
|
// at 0xBFC0_206C, then ANDI at 0xBFC0_2070
|
|
// as each new op unlocked the next)
|
|
// 0x0C ANDI 0x0D ORI 0x0F LUI
|
|
// 0x10 COP0 (rs=0 MFC0, rs=4 MTC0, rs=0x10/func=0x10 RFE)
|
|
// 0x20 LB 0x21 LH 0x23 LW 0x25 LHU 0x28 SB 0x29 SH 0x2B SW
|
|
// (SB: byte store; broadcasts rt[7:0] into the addressed lane
|
|
// and sets a one-hot byte-enable on the map_wr_be bus. First
|
|
// real-BIOS trip at 0xBFC0_20A0 after the R-type logic family
|
|
// unlocked retired=180.
|
|
// LB: byte load with sign-extension; extracts byte at ea[1:0]
|
|
// from the 32-bit word returned by the map and 24-bit
|
|
// sign-extends. First real-BIOS trip at 0xBFC0_23A8 after SB
|
|
// unlocked retired=1704.
|
|
// LH/LHU: halfword load, sign- and zero-extended respectively.
|
|
// Halfword addressing uses ea[1] only (ea[0] must be zero for
|
|
// aligned access). First real-BIOS LH trip at 0xBFC0_2684 after
|
|
// SLT/SLTU unlocked retired=7385; LHU landed in the same
|
|
// chapter because it shares the exact same extraction plumbing
|
|
// with a zero-fill instead of sign-fill.)
|
|
// Anything else → NOP (or strict trap when STRICT_UNSUPPORTED=1).
|
|
//
|
|
// Intentionally NOT yet (evidence-driven growth will resolve these
|
|
// as the real BIOS demands them):
|
|
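// - Full 64/128-bit GPR semantics (R5900 territory). LD / SD / LQ / SQ
// themselves are decoded (see OP_LD / OP_SD / OP_LQ / OP_SQ below),
// but only under the approximations documented at those constants.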
|
|
// - COP1 (FPU), COP2 (VU0 macro mode), VU1, VIF
|
|
// - TLB / caches
|
|
// - Signed DIV, MULT, MULTU, MTHI/MTLO — HI/LO architectural
|
|
// state + unsigned DIVU + MFHI/MFLO landed in ch43; the
|
|
// remaining mul/div/signed-div and HI/LO *writes* defer until
|
|
// BIOS demands them. Divide latency and overflow detail are
|
|
// not modelled (DIVU completes in one cycle, matching the
|
|
// ALU style; revisit only if a timing-sensitive test needs
|
|
// it).
|
|
// - Most of COP0 beyond {Count (read-only), Status, Cause, EPC}
|
|
// — Compare, Random, EntryHi/Lo, etc. Count write via MTC0 is
|
|
// silently dropped; revisit if a BIOS loop depends on reset-
|
|
// to-value semantics.
|
|
// - Arithmetic Overflow exception on ADD / SUB / ADDI (they
|
|
// execute as ADDU / SUBU / ADDIU for now; the exception path
|
|
// lights up only when real BIOS code actually overflows)
|
|
// - REGIMM link variants (BLTZAL/BGEZAL) — the non-linking
|
|
// BLTZ/BGEZ landed in ch41 alongside BLEZ/BGTZ; link variants
|
|
// defer until the real BIOS needs them.
|
|
// - unaligned load/store (LWL/LWR/SWL/SWR) — rare in early boot
|
|
// - BD bit in Cause; nested interrupts; syscall/break dispatch
|
|
// beyond SYSCALL-as-halt.
|
|
//
|
|
// Trace (SUBSYS_EE, EV_IFETCH one-per-retire; flag bits match
|
|
// iop_core_stub):
|
|
// bit 0 = SW bit 1 = LW bit 2 = branch/jump taken
|
|
// bit 3 = SYSCALL halt bit 4 = instruction in delay slot
|
|
// bit 5 = exception taken bit 6 = RFE retired
|
|
// bit 7 = strict trap
|
|
//
|
|
// Strict mode (STRICT_UNSUPPORTED parameter) matches iop_core_stub.
|
|
|
|
`timescale 1ns/1ps
|
|
|
|
module ee_core_stub
|
|
import trace_pkg::*;
|
|
#(
|
|
parameter logic [31:0] PC_RESET = 32'hBFC0_0000,
|
|
parameter logic [31:0] EXC_VECTOR = 32'h0000_0080,
|
|
parameter bit STRICT_UNSUPPORTED = 1'b0,
|
|
// Ch47: address-error enforcement on word/halfword loads and
|
|
// stores. When enabled, any SW/LW with ea[1:0] != 0 or
|
|
// SH/LH/LHU with ea[0] != 0 halts the core and sets
|
|
// trap_o / trap_pc_o / trap_instr_o (same mechanism as
|
|
// STRICT_UNSUPPORTED). Default on — real MIPS raises an
|
|
// AdEL/AdES exception here; silently aliasing to the aligned
|
|
// word was a simulation artifact that previously made
|
|
// unaligned writes look like valid aligned stores.
|
|
parameter bit TRAP_ALIGN_ERROR = 1'b1,
|
|
// Ch50: Status.BEV (boot exception vector) modeling. Real MIPS
|
|
// resets with BEV=1 so exceptions vector to a ROM-resident
|
|
// "boot" handler (BEV_EXC_VECTOR). The BIOS writes BEV=0 via
|
|
// MTC0 once its RAM-resident handler is installed, at which
|
|
// point exceptions vector to EXC_VECTOR. INIT_BEV controls the
|
|
// reset state of Status.BEV; default 0 keeps the pre-ch50
|
|
// backward-compatible behavior (always vector to EXC_VECTOR).
|
|
// tb_ee_core_bios_smoke sets INIT_BEV=1 to match real-BIOS
|
|
// semantics.
|
|
parameter bit INIT_BEV = 1'b0,
|
|
parameter logic [31:0] BEV_EXC_VECTOR = 32'hBFC0_0380,
|
|
// Ch162 — strip the synthesized 32-bit hardware divider that
|
|
// Quartus infers from the Ch43 DIVU instruction's `/` and `%`
|
|
// operators. The auto-generated divider is the Ch159+ STA
|
|
// critical path (top-10 worst paths all live in
|
|
// `u_demo|u_core|div_0_rtl_0|auto_generated|divider|...`,
|
|
// ~32 ns of combinational ripple). Default 0 keeps the
|
|
// load-bearing DIVU semantics for every existing Ch43
|
|
// integration TB (`tb_ee_core_divu_mflo`, etc.). When set to
|
|
// 1, the DIVU writeback path becomes a no-op (HI/LO stay at
|
|
// their prior values, just as in the divisor==0 case the
|
|
// spec calls undefined). Synthesis builds that target the
|
|
// PSMCT32 SPRITE-only raster demo can set this to 1 — the
|
|
// bootlet doesn't execute DIVU, so removing the divider is
|
|
// behavior-neutral for that demo while freeing the
|
|
// critical-path budget for everything else. Sim TBs that
|
|
// assert DIVU output (only `tb_ee_core_divu_mflo` today)
|
|
// keep the parameter at its default 0.
|
|
parameter bit STRIP_HW_DIVIDER = 1'b0,
|
|
// Ch215 — labeled sim-only jmp_buf restore for SYSCALL #8 ($v1=8,
|
|
// $a0=2). When enabled, the SYSCALL handler enters a 12-step LW
|
|
// FSM that loads $ra/$sp/$fp/$s0..$s7/$gp from a hardcoded frame
|
|
// at CH215_JMPBUF_BASE (proven by Ch214 to be the BIOS-side
|
|
// 0xA000B1E0 jmp_buf assembled via LUI $r,0xA001 + ADDIU $r,$r,
|
|
// -0x4E20 at the setjmp call site 0xBFC52340/4C). Also sets $v0=1
|
|
// so the post-setjmp `beq $v0,$0` at 0xBFC52350 falls through to
|
|
// the longjmp-return path. NOT general syscall #8 semantics — a
|
|
// BIOS-bringup shim until the real kernel handler is modeled.
|
|
parameter bit CH215_JMPBUF_RESTORE_ENABLE = 1'b0,
|
|
parameter logic [31:0] CH215_JMPBUF_BASE = 32'hA000_B1E0,
|
|
// Ch273 — minimal EE syscall HLE dispatcher for the ELF runner /
|
|
// user-mode flow. When enabled, SYSCALL with a known $v1 ($v1 is
|
|
// the PS2-EE syscall-number convention) sets $v0 to a stub return
|
|
// and advances PC to PC+4 (normal user-code SYSCALL resume — NOT
|
|
// RFE; that's the Ch199 path). Unknown $v1 still halts (with $v1
|
|
// and $a0..$a3 readable via hierarchical sim peek from the TB) so
|
|
// the next blocker surfaces.
|
|
//
|
|
// Default 0 to keep every existing TB's "syscall = halt-PASS"
|
|
// pattern working (those tests don't set $v1, so $v1 could be
|
|
// anything; gating prevents accidental dispatch).
|
|
//
|
|
// Recognized $v1 values (qbert.elf crt0 prolog):
|
|
// 0x3C EndOfHeap -> $v0 = SYSCALL_HEAP_END (default 0x001E0000)
|
|
// 0x3D InitMainThread -> $v0 = 0 (stub success)
|
|
// 0x64 FlushCache -> $v0 = 0 (no-op success)
|
|
parameter bit EE_SYSCALL_HLE_ENABLE = 1'b0,
|
|
parameter logic [31:0] SYSCALL_HEAP_END = 32'h001E_0000
|
|
) (
|
|
input logic clk,
|
|
input logic rst_n,
|
|
|
|
input logic go_i,
|
|
|
|
output logic map_rd_en,
|
|
output logic [31:0] map_rd_addr,
|
|
input logic [31:0] map_rd_data,
|
|
input logic map_rd_valid,
|
|
|
|
output logic map_wr_en,
|
|
output logic [31:0] map_wr_addr,
|
|
output logic [31:0] map_wr_data,
|
|
output logic [3:0] map_wr_be,
|
|
|
|
input logic cpu_irq,
|
|
|
|
output logic halt_o,
|
|
output logic [31:0] pc_o,
|
|
|
|
output logic trap_o,
|
|
output logic [31:0] trap_pc_o,
|
|
output logic [31:0] trap_instr_o,
|
|
|
|
output logic ev_valid,
|
|
output subsys_e ev_subsys,
|
|
output event_e ev_event,
|
|
output logic [63:0] ev_arg0,
|
|
output logic [63:0] ev_arg1,
|
|
output logic [63:0] ev_arg2,
|
|
output logic [63:0] ev_arg3,
|
|
output logic [31:0] ev_flags
|
|
);
|
|
|
|
// ------------------------------------------------------------------
|
|
// Opcode / func / COP0 rs constants
|
|
// ------------------------------------------------------------------
|
|
|
|
localparam logic [5:0] OP_SPECIAL = 6'h00;
|
|
// REGIMM = opcode 0x01. rt-field selects the branch sub-op:
|
|
// rt=0x00 -> BLTZ, rt=0x01 -> BGEZ (link variants 0x10/0x11
|
|
// intentionally not yet modeled; add when real-BIOS needs them).
|
|
localparam logic [5:0] OP_REGIMM = 6'h01;
|
|
localparam logic [4:0] REGIMM_BLTZ = 5'h00;
|
|
localparam logic [4:0] REGIMM_BGEZ = 5'h01;
|
|
localparam logic [5:0] OP_J = 6'h02;
|
|
localparam logic [5:0] OP_JAL = 6'h03;
|
|
localparam logic [5:0] OP_BEQ = 6'h04;
|
|
// Ch274 — MIPS-II BEQL (Branch on Equal Likely), opcode 0x14.
|
|
// Same compare as BEQ; differs from BEQ only in the not-taken
|
|
// path: the delay slot is SQUASHED (PC jumps directly to PC+8)
|
|
// instead of executing. qbert.elf hits this at PC 0x001000C0 in
|
|
// its C++ static-constructor walker, where the delay slot at
|
|
// PC+4 clobbers $a0 and MUST be squashed when the table is empty.
|
|
localparam logic [5:0] OP_BEQL = 6'h14;
|
|
// Ch277 — MIPS-II BNEL (Branch on Not Equal Likely), opcode 0x15.
|
|
// Mirror of BEQL with the BNE condition. Taken when rs!=rt
|
|
// (delay slot executes, branch target reached); not-taken when
|
|
// rs==rt (delay slot SQUASHED, PC jumps to PC+8). qbert.elf hits
|
|
// this at PC 0x00112C7C inside a function body — predicted as
|
|
// the Ch276→Ch277 follow-on at the Ch274 closeout.
|
|
localparam logic [5:0] OP_BNEL = 6'h15;
|
|
localparam logic [5:0] OP_BNE = 6'h05;
|
|
localparam logic [5:0] OP_BLEZ = 6'h06;
|
|
localparam logic [5:0] OP_BGTZ = 6'h07;
|
|
localparam logic [5:0] OP_ADDI = 6'h08;
|
|
localparam logic [5:0] OP_ADDIU = 6'h09;
|
|
localparam logic [5:0] OP_SLTI = 6'h0A;
|
|
localparam logic [5:0] OP_SLTIU = 6'h0B;
|
|
localparam logic [5:0] OP_ANDI = 6'h0C;
|
|
localparam logic [5:0] OP_ORI = 6'h0D;
|
|
localparam logic [5:0] OP_LUI = 6'h0F;
|
|
localparam logic [5:0] OP_COP0 = 6'h10;
|
|
localparam logic [5:0] OP_LB = 6'h20;
|
|
localparam logic [5:0] OP_LH = 6'h21;
|
|
localparam logic [5:0] OP_LW = 6'h23;
|
|
// Ch279 — R5900 EE Load Quadword. opcode=0x1E, I-type:
|
|
// lq rt, imm(base) → rt[127:0] = mem[base+imm][127:0]
|
|
// 128-bit load symmetric to SQ (Ch271). Our regfile is 32-bit
|
|
// so we read only the low 32 bits at EA+0 — upper 96 bits are
|
|
// unrepresentable and discarded. Requires 16-byte alignment;
|
|
// misaligned trips the existing AdEL path. qbert.elf hits
|
|
// `lq $t1, 0($a1)` at PC 0x00112C88.
|
|
localparam logic [5:0] OP_LQ = 6'h1E;
|
|
localparam logic [5:0] OP_LBU = 6'h24;
|
|
localparam logic [5:0] OP_LHU = 6'h25;
|
|
localparam logic [5:0] OP_SB = 6'h28;
|
|
localparam logic [5:0] OP_SH = 6'h29;
|
|
localparam logic [5:0] OP_SW = 6'h2B;
|
|
// Ch278 — R5900 EE Multimedia Instruction (MMI) prefix.
|
|
// Primary opcode 0x1C; the actual sub-instruction is selected
|
|
// by a combination of the funct field (bits 5:0) and the sa
|
|
// field (bits 10:6), depending on which sub-group the funct
|
|
// names (MMI0/MMI1/MMI2/MMI3). For Ch278 we ONLY decode the
|
|
// single (funct=0x09 / sa=0x0E) pattern = MMI2/PCPYLD; every
|
|
// other MMI sub-instruction continues to fall through to
|
|
// strict-trap so the runner surfaces the next concrete blocker.
|
|
localparam logic [5:0] OP_MMI = 6'h1C;
|
|
localparam logic [5:0] FUNC_MMI2 = 6'h09;
|
|
localparam logic [4:0] MMI2_PCPYLD = 5'h0E;
|
|
// Ch282 — R5900 EE MMI2/PAND (Parallel AND). Same MMI2 funct
|
|
// group as PCPYLD; sa 0x12 selects PAND. Architectural 128-bit
|
|
// bitwise AND; in our 32-bit model identical to standard AND
|
|
// (SPECIAL funct 0x24). qbert.elf hits `pand $v0, $v0, $v1`
|
|
// (instr 0x70431489) at PC 0x00112C98.
|
|
localparam logic [4:0] MMI2_PAND = 5'h12;
|
|
// Ch280 — R5900 EE MMI0/PSUBB (Parallel Subtract Byte).
|
|
// primary opcode 0x1C, funct 0x08 (MMI0), sa 0x09 (PSUBB).
|
|
// 16-way parallel byte subtract across 128 bits; in our
|
|
// 32-bit-GPR model, 4 parallel byte subs on the low 32 bits
|
|
// (each lane modulo 256, no carry between bytes). qbert.elf
|
|
// hits `psubb $v0, $t1, $t2` (instr 0x712A1248) at PC 0x00112C90.
|
|
localparam logic [5:0] FUNC_MMI0 = 6'h08;
|
|
localparam logic [4:0] MMI0_PSUBB = 5'h09;
|
|
// Ch281 — R5900 EE MMI3/PNOR (Parallel Not-OR), opcode 0x1C,
|
|
// funct 0x29 (MMI3), sa 0x13 (PNOR). Architectural 128-bit
|
|
// bitwise NOR; in our 32-bit model, identical to standard NOR
|
|
// (SPECIAL funct 0x27). With rs=$zero this is the canonical
|
|
// MIPS "NOT" pseudo. qbert.elf hits `pnor $v1, $zero, $t1`
|
|
// (instr 0x70091CE9) at PC 0x00112C94.
|
|
localparam logic [5:0] FUNC_MMI3 = 6'h29;
|
|
localparam logic [4:0] MMI3_PNOR = 5'h13;
|
|
// Ch283 — R5900 EE MMI3/PCPYUD (Parallel Copy Upper Doubleword),
|
|
// opcode 0x1C, funct 0x29 (MMI3), sa 0x0E (PCPYUD). Architectural:
|
|
// $rd[127:64] = $rs[127:64]; $rd[63:0] = $rt[127:64]
|
|
// i.e. extract the *upper* doubleword of each source, with $rt's
|
|
// upper-D becoming $rd's lower-D. First MMI op that *reads* from
|
|
// the upper 64 bits of source registers — drove the introduction
|
|
// of the gpr128 shadow in Ch283. qbert.elf hits `pcpyud $a0, $v0,
|
|
// $t1` (instr 0x704923A9) at PC 0x00112CA0.
|
|
localparam logic [4:0] MMI3_PCPYUD = 5'h0E;
|
|
// Ch300 — R5900 EE MMI3/PCPYH (Parallel Copy Halfword), opcode
|
|
// 0x1C, funct 0x29 (MMI3), sa 0x1B. Broadcasts the low halfword
|
|
// of each 64-bit doubleword of $rt across the four halfword lanes
|
|
// of the corresponding doubleword in $rd. rs is ignored.
|
|
// h0 = $rt[15:0], h4 = $rt[79:64]
|
|
// $rd low 64 = {h0, h0, h0, h0}
|
|
// $rd high 64 = {h4, h4, h4, h4}
|
|
// qbert.elf hits `pcpyh $v1, $t0` (instr 0x70081EE9) at PC
|
|
// 0x00110BB4 — first call that uses MMI3 with sa=0x1B after
|
|
// Ch299's library-gate unblock.
|
|
localparam logic [4:0] MMI3_PCPYH = 5'h1B;
|
|
// Ch271 — R5900 EE Store Quadword. opcode=0x1F, I-type:
|
|
// sq rt, imm(base) → store 128 bits ([rt[127:0]]) to mem[base+imm].
|
|
// First real-ELF (qbert.elf) hit `sq $zero, 0($v0)` 8 instructions
|
|
// into its prolog. Minimal scope: 4-beat 32-bit-stripe write
|
|
// through the existing 32-bit map_wr_data port. Upper 96 bits of
|
|
// GPRs aren't modelled, so for non-$zero rt the lower 32 land at
|
|
// beat 0 and beats 1-3 write zero (degrades gracefully for the
|
|
// common "clear a 128-bit slot" use case). Requires 16-byte
|
|
// alignment; misaligned trips the existing AdES path.
|
|
localparam logic [5:0] OP_SQ = 6'h1F;
|
|
// Ch275 — R5900 EE Store Doubleword. MIPS-III opcode 0x3F:
|
|
// sd rt, imm(base) → store 64 bits ([rt[63:0]]) to mem[base+imm].
|
|
// qbert.elf hits `sd $ra, 0x20($sp)` (0xFFBF0020) in a function
|
|
// prologue at PC 0x00112DAC. Modelled as a 2-beat 32-bit-stripe
|
|
// write FSM through the existing map_wr port. Upper 32 bits of
|
|
// GPRs aren't modelled, so beat 0 lands rt_val[31:0] and beat 1
|
|
// writes 0 — matching the SQ approximation. Requires 8-byte
|
|
// alignment; misaligned falls into the existing AdES path.
|
|
localparam logic [5:0] OP_SD = 6'h3F;
|
|
// Ch284 — MIPS-III LD (Load Doubleword), opcode 0x37. The read-
|
|
// side of SD. 2-beat 32-bit load FSM (sq_beat counter, terminal
|
|
// beat = 1) reusing the LQ map-driver beat addressing. Beat 0
|
|
// captures mem[ea+0] into gpr128[rt][31:0] and mirrors low 32 to
|
|
// regfile[rt]; beat 1 captures mem[ea+4] into gpr128[rt][63:32].
|
|
// gpr128[rt][127:64] is unchanged (architectural LD only loads
|
|
// doubleword; upper 64 of $rt are preserved in real R5900). 8-byte
|
|
// alignment required; misaligned trips the existing AdEL path.
|
|
// qbert.elf hits `ld $ra, 0($sp)` (0xDFBF0000) at PC 0x00113378
|
|
// — the function-epilogue $ra restore.
|
|
localparam logic [5:0] OP_LD = 6'h37;
|
|
// Ch178 — CACHE accepted as legal no-op. The EE BIOS issues
|
|
// CACHE (often Hit-Invalidate or Index-Invalidate of the
|
|
// I-cache, op=0x05) after copying exception handlers / trampolines
|
|
// into RAM to flush stale icache lines. The stub has no modeled
|
|
// caches, so executing CACHE as a semantic NOP (no register/memory
|
|
// side effects, PC advances normally) is the standard pragmatic
|
|
// choice — see Codex Ch178 caution: accept the whole CACHE class
|
|
// regardless of sub-op (`rt` field). If a later chapter actually
|
|
// models cache coherency, this is the natural place to revisit.
|
|
localparam logic [5:0] OP_CACHE = 6'h2F;
|
|
|
|
localparam logic [5:0] FUNC_SLL = 6'h00;
|
|
// Ch276 — R5900 DSLL (Doubleword Shift Left Logical), MIPS-III
|
|
// SPECIAL funct 0x38. Architecturally a 64-bit left shift by
|
|
// `sa` (sa ∈ [0,31]). Our regfile is 32-bit, so for any valid
|
|
// sa < 32 the low 32-bit result is identical to SLL. PS2 ELFs
|
|
// use DSLL to build 64-bit constants and to do unsigned shifts
|
|
// where the implicit 64-bit width matters; qbert.elf hits
|
|
// `dsll $t1, $t1, 16` at PC 0x00112C54.
|
|
localparam logic [5:0] FUNC_DSLL = 6'h38;
|
|
localparam logic [5:0] FUNC_SRL = 6'h02;
|
|
localparam logic [5:0] FUNC_SRA = 6'h03;
|
|
// Ch67 — variable-shift family. Operand = rt_val, shift amount =
|
|
// rs_val[4:0], destination = rd. Surfaced by the real-BIOS
|
|
// copied-code region (first hit SRLV at pc=0x001A459C after
|
|
// Ch64/Ch65 mirrors); SLLV/SRAV are the direct siblings and tend
|
|
// to co-occur in shift-heavy code paths.
|
|
localparam logic [5:0] FUNC_SLLV = 6'h04;
|
|
localparam logic [5:0] FUNC_SRLV = 6'h06;
|
|
localparam logic [5:0] FUNC_SRAV = 6'h07;
|
|
localparam logic [5:0] FUNC_JR = 6'h08;
|
|
localparam logic [5:0] FUNC_JALR = 6'h09;
|
|
localparam logic [5:0] FUNC_SYSCALL = 6'h0C;
|
|
// Ch292 — MIPS SYNC (SPECIAL funct 0x0F). Memory-ordering barrier;
|
|
// architecturally orders prior loads/stores against subsequent ones.
|
|
// In this in-order stub model with no cache coherency, store buffer,
|
|
// or out-of-order memory, SYNC has no visible side effect. qbert.elf
|
|
// hits the canonical encoding 0x0000000F (rs/rt/rd/sa all zero) at
|
|
// PC 0x00112994 — the post-registration memory barrier after the
|
|
// Ch290/291 paired Add+Enable handler calls.
|
|
localparam logic [5:0] FUNC_SYNC = 6'h0F;
|
|
// SPECIAL-encoded HI/LO ops (ch43 addition):
|
|
// MFHI=0x10, MFLO=0x12, DIVU=0x1B.
|
|
// MFHI shares its numeric value with RFE below (both 6'h10),
|
|
// but RFE lives under OP_COP0 while MFHI lives under OP_SPECIAL
|
|
// — different opcodes so the collision is harmless.
|
|
localparam logic [5:0] FUNC_MFHI = 6'h10;
|
|
localparam logic [5:0] FUNC_MFLO = 6'h12;
|
|
localparam logic [5:0] FUNC_MULTU = 6'h19;
|
|
localparam logic [5:0] FUNC_DIVU = 6'h1B;
|
|
localparam logic [5:0] FUNC_RFE = 6'h10;
|
|
// Ch286 — R5900 EI (Enable Interrupts), EE-specific extension to
|
|
// the MIPS COP0 CO sub-table. qbert.elf hits the exact 32-bit
|
|
// encoding 0x42000038 (opcode=COP0, rs=CO, rt/rd/sa=0, funct=0x38)
|
|
// at PC 0x001000FC during init. Codex framing: decode the EXACT
|
|
// 32-bit instruction, accept it as side-effect-free (no GPR
|
|
// writeback, PC += 4, no halt/trap), and do NOT NOP-class all
|
|
// COP0/CO instructions. The companion DI (funct 0x39) is left
|
|
// trapping until a future ELF surfaces it.
|
|
localparam logic [31:0] EI_INSTR_R5900 = 32'h4200_0038;
|
|
localparam logic [5:0] FUNC_ADD = 6'h20;
|
|
localparam logic [5:0] FUNC_ADDU = 6'h21;
|
|
// Ch272 — R5900 DADDU (doubleword add unsigned), MIPS-III SPECIAL
|
|
// funct 0x2D. Architecturally a 64-bit unsigned add, with no
|
|
// overflow trap. Our regfile is 32-bit, so we model it as ADDU
|
|
// (low 32 bits only); upper 32 bits silently dropped as
|
|
// elsewhere. PS2 ELFs use DADDU as the canonical 64-bit
|
|
// `move rd, rs` (= daddu rd, rs, $zero) pseudo-instruction;
|
|
// qbert.elf hits this at PC 0x00100068.
|
|
localparam logic [5:0] FUNC_DADDU = 6'h2D;
|
|
localparam logic [5:0] FUNC_SUB = 6'h22;
|
|
localparam logic [5:0] FUNC_SUBU = 6'h23;
|
|
// Ch305 — R5900 DSUBU (doubleword subtract unsigned), MIPS-III
|
|
// SPECIAL funct 0x2F. The 64-bit-subtract sibling of DADDU
|
|
// (Ch272). Our regfile is 32-bit, so modelled as SUBU on the low
|
|
// 32 bits, no overflow trap (matching the DADDU low-32
|
|
// approximation). qbert.elf hits `dsubu $v0, $v1, $v0`
|
|
// (instr 0x0062102F) at PC 0x00110A60.
|
|
localparam logic [5:0] FUNC_DSUBU = 6'h2F;
|
|
localparam logic [5:0] FUNC_AND = 6'h24;
|
|
localparam logic [5:0] FUNC_OR = 6'h25;
|
|
localparam logic [5:0] FUNC_XOR = 6'h26;
|
|
localparam logic [5:0] FUNC_NOR = 6'h27;
|
|
localparam logic [5:0] FUNC_SLT = 6'h2A;
|
|
localparam logic [5:0] FUNC_SLTU = 6'h2B;
|
|
|
|
localparam logic [4:0] COP0_RS_MF = 5'h00;
|
|
localparam logic [4:0] COP0_RS_MT = 5'h04;
|
|
localparam logic [4:0] COP0_RS_CO = 5'h10;
|
|
|
|
localparam logic [4:0] COP0_REG_BADVADDR = 5'd8; // ch49 addition
|
|
localparam logic [4:0] COP0_REG_COUNT = 5'd9;
|
|
localparam logic [4:0] COP0_REG_STATUS = 5'd12;
|
|
localparam logic [4:0] COP0_REG_CAUSE = 5'd13;
|
|
localparam logic [4:0] COP0_REG_EPC = 5'd14;
|
|
|
|
// Ch49: MIPS exception code values for synchronous address
|
|
// errors. EXC_CODE_ADEL fires on a misaligned load (LW/LH/LHU);
|
|
// EXC_CODE_ADES fires on a misaligned store (SW/SH). Other
|
|
// ExcCode values (Int=0, Sys=8, Bp=9, RI=10, CpU=11, Ov=12,
|
|
// Tr=13) are not yet modelled — they'll land when real BIOS
|
|
// code demands them, same way the opcode batches grew earlier.
|
|
localparam logic [4:0] EXC_CODE_INT = 5'd0;
|
|
localparam logic [4:0] EXC_CODE_ADEL = 5'd4;
|
|
localparam logic [4:0] EXC_CODE_ADES = 5'd5;
|
|
|
|
// ------------------------------------------------------------------
|
|
// FSM state
|
|
// ------------------------------------------------------------------
|
|
|
|
// Core FSM state encoding. Every state carries an explicit 4-bit value
// so the numbering is fixed rather than left to the enum's auto-increment.
typedef enum logic [3:0] {
    S_IDLE        = 4'h0,
    S_IFETCH_REQ  = 4'h1,
    S_IFETCH_WAIT = 4'h2,
    S_EXECUTE     = 4'h3,
    S_MEM_REQ     = 4'h4,
    S_MEM_WAIT    = 4'h5,
    S_MEM_WRITE   = 4'h6,
    S_HALT        = 4'h7,
    // Ch215 — labeled sim-only jmp_buf restore for SYSCALL #8
    // ($v1=8, $a0=2): a 12-step LW walk over the frame at
    // CH215_JMPBUF_BASE (shown by Ch214 to be the hardcoded constant
    // 0xA000B1E0). S_CH215_REQ issues one LW per step; S_CH215_WAIT
    // captures the returned word into the matching regfile slot.
    S_CH215_REQ   = 4'h8,
    S_CH215_WAIT  = 4'h9
} state_e;
|
|
|
|
state_e state;
|
|
// Ch215 — jmp_buf restore FSM counter (0..11 walking 12 slots).
|
|
logic [3:0] ch215_count;
|
|
// Ch271 — SQ 4-beat write counter. While SQ is in flight, this
|
|
// counts 0..3; each beat writes one 32-bit lane at ea + beat*4.
|
|
logic [1:0] sq_beat;
|
|
|
|
logic [31:0] pc;
|
|
logic [31:0] instr;
|
|
logic [31:0] regfile [0:31];
|
|
|
|
// Ch283 — 128-bit GPR shadow. The R5900 architectural register is
|
|
// 128 bits, but every prior chapter has lived inside a "low 32
|
|
// only" approximation. gpr128 is parallel storage: scalar writes
|
|
// mirror their result into gpr128[X][31:0] with the upper 96 bits
|
|
// zeroed (matches the R5900 rule that scalar ops clear the upper
|
|
// bits of the destination). MMI ops and LQ are the only producers
|
|
// of non-zero upper bits; PCPYUD (Ch283) and similar upper-half
|
|
// readers consume from this shadow. regfile remains the canonical
|
|
// scalar surface so existing decode/ALU/load logic is unchanged.
|
|
logic [127:0] gpr128 [0:31];
|
|
|
|
// Ch43: architectural HI/LO register pair, populated by DIVU
|
|
// (and, when added later, DIV/MULT/MULTU) and read back through
|
|
// MFHI/MFLO. Reset to 0. Divide-by-zero policy: leave HI/LO
|
|
// unchanged (deterministic, simplest; real HW treats the result
|
|
// as undefined on DIVU with rt=0, so any consistent stub is
|
|
// spec-compatible).
|
|
logic [31:0] hi_reg;
|
|
logic [31:0] lo_reg;
|
|
|
|
logic branch_pending;
|
|
logic [31:0] branch_target;
|
|
logic instr_in_delay_slot;
|
|
|
|
logic status_iec, status_iep, status_ieo;
|
|
logic status_kuc, status_kup, status_kuo;
|
|
logic [7:0] status_im;
|
|
// Ch50: BEV bit (bit 22 of Status). Controls the exception
|
|
// vector base. 1 after reset (if INIT_BEV=1) so exceptions go
|
|
// to BEV_EXC_VECTOR; cleared by BIOS via MTC0 once its RAM
|
|
// handler is installed, which switches exceptions to
|
|
// EXC_VECTOR.
|
|
logic status_bev;
|
|
|
|
logic [4:0] cause_exc_code;
|
|
logic [7:0] cause_ip_sw;
|
|
logic [31:0] epc;
|
|
// Ch49: COP0 BadVAddr. Captures the offending effective address
|
|
// on a synchronous address-error exception (AdEL/AdES). Read-
|
|
// only via MFC0; real HW also updates this on TLB exceptions,
|
|
// which we don't model yet. Default 0 until the first fault.
|
|
logic [31:0] badvaddr;
|
|
|
|
// COP0 Count (reg 9): free-running 32-bit cycle counter. Increments
|
|
// every clock, resets to 0. Exposed read-only through MFC0 $9 — MTC0
|
|
// writes are silently dropped by the existing cop0_write default
|
|
// case (real hardware allows software to reset Count, but we haven't
|
|
// seen BIOS depend on that yet; revisit if a real-BIOS wait loop
|
|
// breaks because of the unreset counter).
|
|
//
|
|
// Rate: advances once per CPU clock. Real R5900 Count advances at
|
|
// half CPU clock, but any BIOS polling loop of the form
|
|
// do { x = mfc0 $9; } while (x < target);
|
|
// will exit in a bounded number of cycles regardless — the only
|
|
// observable effect of the faster rate is that waits finish sooner
|
|
// in sim than on hardware, which is fine for bring-up.
|
|
logic [31:0] cop0_count;
|
|
|
|
logic [7:0] cause_ip;
|
|
// Compose the 8-bit Cause.IP field:
//   IP[1:0] = the two software-interrupt bits held in cause_ip_sw[1:0]
//   IP[2]   = live level of the cpu_irq input
//   IP[7:3] = always zero
always_comb begin
    cause_ip = {5'b0_0000, cpu_irq, cause_ip_sw[1:0]};
end
|
|
|
|
logic [31:0] status_word;
|
|
logic [31:0] cause_word;
|
|
// Assemble the 32-bit Status and Cause read-back images from the
// individually stored COP0 fields. Every bit not listed reads as zero.
//
// Status layout:
//   [22]   status_bev (Ch50)
//   [15:8] status_im
//   [5]    status_kuo   [4] status_ieo
//   [3]    status_kup   [2] status_iep
//   [1]    status_kuc   [0] status_iec
//
// Cause layout:
//   [15:8] cause_ip
//   [6:2]  cause_exc_code
always_comb begin
    status_word = {
        9'd0,           // [31:23]
        status_bev,     // [22]
        6'd0,           // [21:16]
        status_im,      // [15:8]
        2'd0,           // [7:6]
        status_kuo,     // [5]
        status_ieo,     // [4]
        status_kup,     // [3]
        status_iep,     // [2]
        status_kuc,     // [1]
        status_iec      // [0]
    };

    cause_word = {
        16'd0,          // [31:16]
        cause_ip,       // [15:8]
        1'b0,           // [7]
        cause_exc_code, // [6:2]
        2'b00           // [1:0]
    };
end
|
|
|
|
// ------------------------------------------------------------------
|
|
// Decode — combinational extraction from `instr`
|
|
// ------------------------------------------------------------------
|
|
|
|
logic [5:0] opcode;
|
|
logic [4:0] rs_idx;
|
|
logic [4:0] rt_idx;
|
|
logic [4:0] rd_idx;
|
|
logic [4:0] shamt;
|
|
logic [5:0] func;
|
|
logic [15:0] imm16;
|
|
logic [25:0] imm26;
|
|
logic [31:0] imm_sx;
|
|
logic [31:0] imm_zx;
|
|
logic [31:0] branch_offset;
|
|
logic [31:0] branch_tgt;
|
|
logic [31:0] j_tgt;
|
|
logic [31:0] rs_val;
|
|
logic [31:0] rt_val;
|
|
logic [31:0] ea;
|
|
// Ch283 — 128-bit reads from the gpr128 shadow. Used by MMI ops
|
|
// and LQ-consuming downstream code that needs the full 128-bit
|
|
// architectural view. $0 reads as 128'd0 (architectural).
|
|
logic [127:0] rs128_val;
|
|
logic [127:0] rt128_val;
|
|
|
|
// Raw instruction fields (MIPS R/I/J-type layout), sliced from `instr`.
assign opcode = instr[31:26];
assign rs_idx = instr[25:21];
assign rt_idx = instr[20:16];
assign rd_idx = instr[15:11];
assign shamt  = instr[10:6];
assign func   = instr[5:0];
assign imm16  = instr[15:0];
assign imm26  = instr[25:0];

// 16-bit immediate, sign- and zero-extended to 32 bits.
assign imm_sx = {{16{imm16[15]}}, imm16};
assign imm_zx = {16'd0, imm16};

// Conditional-branch target: sign-extended word offset (imm16 << 2)
// added to pc + 4.
assign branch_offset = {{14{imm16[15]}}, imm16, 2'b00};
assign branch_tgt    = pc + 32'd4 + branch_offset;

// J/JAL target. The MIPS architecture takes the upper four address bits
// from the delay-slot instruction's address (pc + 4), not from the jump
// itself. The two only differ when the jump sits in the last word of a
// 256 MiB region, i.e. pc[27:2] is all ones and pc + 4 carries into
// bit 28. The carry is added to pc[31:28] directly (4-bit wrap-around,
// same as the 32-bit add would give) so no extra signal is needed.
assign j_tgt = {pc[31:28] + {3'b000, &pc[27:2]}, imm26, 2'b00};

// Register-file reads; index 0 is forced to zero on every read port.
assign rs_val    = (rs_idx == 5'd0) ? 32'd0  : regfile[rs_idx];
assign rt_val    = (rt_idx == 5'd0) ? 32'd0  : regfile[rt_idx];
assign rs128_val = (rs_idx == 5'd0) ? 128'd0 : gpr128[rs_idx];
assign rt128_val = (rt_idx == 5'd0) ? 128'd0 : gpr128[rt_idx];

// Load/store effective address: base register plus sign-extended imm16.
assign ea = rs_val + imm_sx;
|
|
|
|
logic is_special, is_syscall, is_jr, is_jalr, is_sync;
|
|
logic is_and, is_or, is_xor, is_nor;
|
|
logic is_add, is_addu, is_daddu, is_sub, is_subu, is_dsubu;
|
|
logic is_slt, is_sltu;
|
|
logic is_sll, is_srl, is_sra, is_dsll;
|
|
logic is_sllv, is_srlv, is_srav;
|
|
logic is_rtype_alu;
|
|
logic is_multu, is_divu, is_mfhi, is_mflo, is_hilo_op;
|
|
logic is_cop0, is_mfc0, is_mtc0, is_rfe, is_ei;
|
|
logic is_nop_class;
|
|
logic is_lui, is_ori, is_andi;
|
|
logic is_addi, is_addiu, is_slti, is_sltiu;
|
|
logic is_lw, is_lb, is_lh, is_lhu, is_lbu, is_lq, is_ld;
|
|
logic is_sw, is_sb, is_sh, is_sq, is_sd;
|
|
logic is_beq, is_beql, is_bne, is_bnel, is_j, is_jal;
|
|
logic is_blez, is_bgtz, is_regimm, is_bltz, is_bgez;
|
|
logic is_branch, is_jump;
|
|
logic branch_taken;
|
|
logic is_taken_branch_or_jump;
|
|
logic is_cache; // Ch178 — opcode 0x2F, accepted as no-op
|
|
|
|
// ------------------------------------------------------------------
// SPECIAL-opcode decode. Every flag is the SPECIAL opcode match
// qualified by one exact `func` value.
// ------------------------------------------------------------------
assign is_special = (opcode == OP_SPECIAL);

// Control / system.
assign is_syscall = is_special && (func == FUNC_SYSCALL);
assign is_sync    = is_special && (func == FUNC_SYNC);     // Ch292
assign is_jr      = is_special && (func == FUNC_JR);
assign is_jalr    = is_special && (func == FUNC_JALR);

// Bitwise.
assign is_and     = is_special && (func == FUNC_AND);
assign is_or      = is_special && (func == FUNC_OR);
assign is_xor     = is_special && (func == FUNC_XOR);
assign is_nor     = is_special && (func == FUNC_NOR);

// Add / subtract.
assign is_add     = is_special && (func == FUNC_ADD);
assign is_addu    = is_special && (func == FUNC_ADDU);
assign is_daddu   = is_special && (func == FUNC_DADDU);
assign is_sub     = is_special && (func == FUNC_SUB);
assign is_subu    = is_special && (func == FUNC_SUBU);
assign is_dsubu   = is_special && (func == FUNC_DSUBU);    // Ch305

// Compare.
assign is_slt     = is_special && (func == FUNC_SLT);
assign is_sltu    = is_special && (func == FUNC_SLTU);

// Shifts: immediate-count forms, then register-count forms.
assign is_sll     = is_special && (func == FUNC_SLL);
assign is_srl     = is_special && (func == FUNC_SRL);
assign is_sra     = is_special && (func == FUNC_SRA);
assign is_dsll    = is_special && (func == FUNC_DSLL);
assign is_sllv    = is_special && (func == FUNC_SLLV);
assign is_srlv    = is_special && (func == FUNC_SRLV);
assign is_srav    = is_special && (func == FUNC_SRAV);

// ------------------------------------------------------------------
// Ch278/Ch280 — MMI narrow decode. A sub-instruction is recognised
// only when opcode, funct and sa (the shamt field) all match; any
// other word under the MMI opcode is left to the strict-trap path.
// ------------------------------------------------------------------
logic is_mmi;
logic is_pcpyld;   // Ch278 — MMI2 / sa 0x0E
logic is_psubb;    // Ch280 — MMI0 / sa 0x09
logic is_pnor;     // Ch281 — MMI3 / sa 0x13
logic is_pand;     // Ch282 — MMI2 / sa 0x12
logic is_pcpyud;   // Ch283 — MMI3 / sa 0x0E
logic is_pcpyh;    // Ch300 — MMI3 / sa 0x1B

assign is_mmi    = (opcode == OP_MMI);
assign is_psubb  = is_mmi && (func == FUNC_MMI0) && (shamt == MMI0_PSUBB);
assign is_pcpyld = is_mmi && (func == FUNC_MMI2) && (shamt == MMI2_PCPYLD);
assign is_pand   = is_mmi && (func == FUNC_MMI2) && (shamt == MMI2_PAND);
assign is_pnor   = is_mmi && (func == FUNC_MMI3) && (shamt == MMI3_PNOR);
assign is_pcpyud = is_mmi && (func == FUNC_MMI3) && (shamt == MMI3_PCPYUD);
assign is_pcpyh  = is_mmi && (func == FUNC_MMI3) && (shamt == MMI3_PCPYH);

// Union of every instruction whose result is produced by the R-type
// ALU path: the scalar SPECIAL ops plus the MMI ops modelled so far.
assign is_rtype_alu =
       is_and    || is_or     || is_xor   || is_nor
    || is_add    || is_addu   || is_daddu                  // Ch272 — DADDU
    || is_sub    || is_subu   || is_dsubu                  // Ch305 — DSUBU
    || is_slt    || is_sltu
    || is_sll    || is_srl    || is_sra   || is_dsll       // Ch276 — DSLL
    || is_sllv   || is_srlv   || is_srav
    || is_pcpyld || is_psubb  || is_pnor                   // Ch278 / Ch280 / Ch281
    || is_pand   || is_pcpyud || is_pcpyh;                 // Ch282 / Ch283 / Ch300
|
|
// ------------------------------------------------------------------
// HI/LO family (Ch43, Ch203).
//   DIVU / MULTU : write HI/LO, no rd writeback.
//                  MULTU is SPECIAL funct 0x19, $HI:$LO = unsigned rs*rt.
//   MFHI / MFLO  : copy HI or LO back into rd.
// Kept apart from is_rtype_alu because the writers have no rd result
// and the MF* readers go through a different writeback mux path.
// Membership in is_hilo_op is also what keeps these four out of
// is_nop_class.
// ------------------------------------------------------------------
assign is_mfhi  = is_special && (func == FUNC_MFHI);
assign is_mflo  = is_special && (func == FUNC_MFLO);
assign is_multu = is_special && (func == FUNC_MULTU);
assign is_divu  = is_special && (func == FUNC_DIVU);

assign is_hilo_op = is_mfhi || is_mflo || is_multu || is_divu;
|
|
// ------------------------------------------------------------------
// COP0 decode. The rs field selects the move direction (MF / MT) or
// the CO sub-space; RFE is additionally qualified by `func`.
// ------------------------------------------------------------------
assign is_cop0 = (opcode == OP_COP0);
assign is_mtc0 = is_cop0 && (rs_idx == COP0_RS_MT);
assign is_mfc0 = is_cop0 && (rs_idx == COP0_RS_MF);
assign is_rfe  = is_cop0 && (rs_idx == COP0_RS_CO) && (func == FUNC_RFE);

// Ch286 — R5900 EI is matched on the complete 32-bit instruction word
// (see the EI_INSTR_R5900 comment block), so every other COP0/CO
// encoding keeps trapping.
assign is_ei   = (instr == EI_INSTR_R5900);
|
|
|
|
// ------------------------------------------------------------------
// Primary-opcode decode: each flag below is a plain opcode compare
// (REGIMM additionally inspects the rt field).
// ------------------------------------------------------------------

// Immediate ALU.
assign is_lui   = (opcode == OP_LUI);
assign is_ori   = (opcode == OP_ORI);
assign is_andi  = (opcode == OP_ANDI);
assign is_addi  = (opcode == OP_ADDI);
assign is_addiu = (opcode == OP_ADDIU);
assign is_slti  = (opcode == OP_SLTI);
assign is_sltiu = (opcode == OP_SLTIU);

// Loads, narrowest to widest.
assign is_lb    = (opcode == OP_LB);
assign is_lbu   = (opcode == OP_LBU);
assign is_lh    = (opcode == OP_LH);
assign is_lhu   = (opcode == OP_LHU);
assign is_lw    = (opcode == OP_LW);
assign is_ld    = (opcode == OP_LD);     // Ch284
assign is_lq    = (opcode == OP_LQ);

// Stores, narrowest to widest.
assign is_sb    = (opcode == OP_SB);
assign is_sh    = (opcode == OP_SH);
assign is_sw    = (opcode == OP_SW);
assign is_sd    = (opcode == OP_SD);
assign is_sq    = (opcode == OP_SQ);

// Conditional branches selected by opcode alone.
assign is_beq   = (opcode == OP_BEQ);
assign is_bne   = (opcode == OP_BNE);
assign is_beql  = (opcode == OP_BEQL);
assign is_bnel  = (opcode == OP_BNEL);
assign is_blez  = (opcode == OP_BLEZ);
assign is_bgtz  = (opcode == OP_BGTZ);

// REGIMM: the rt field picks the actual branch.
assign is_regimm = (opcode == OP_REGIMM);
assign is_bltz   = is_regimm && (rt_idx == REGIMM_BLTZ);
assign is_bgez   = is_regimm && (rt_idx == REGIMM_BGEZ);

// Absolute jumps.
assign is_j     = (opcode == OP_J);
assign is_jal   = (opcode == OP_JAL);

// Ch178 — CACHE. The whole opcode is accepted whatever the sub-op in
// the rt field: BIOS may issue several cache operations back to back
// (Hit-Invalidate, Index-Invalidate, ...) and this stub holds no cache
// state that could tell them apart. The instruction writes neither a
// register nor memory; the PC moves on through the ordinary
// "not a branch / not a load / not a store" path.
assign is_cache = (opcode == OP_CACHE);
|
|
|
|
// ------------------------------------------------------------------
// Control-flow summaries.
// ------------------------------------------------------------------

// Any conditional branch this core decodes.
assign is_branch = is_beq  || is_bne  || is_beql || is_bnel
                || is_blez || is_bgtz || is_bltz || is_bgez;

// Any unconditional transfer (absolute or register-indirect).
assign is_jump   = is_j || is_jal || is_jr || is_jalr;

// Condition evaluation. The likely variants (Ch274 BEQL, Ch277 BNEL)
// share the comparison of their ordinary counterparts; the four
// compare-with-zero branches treat rs_val as signed.
assign branch_taken =
       ((is_beq  || is_beql) && (rs_val == rt_val))
    || ((is_bne  || is_bnel) && (rs_val != rt_val))
    || (is_blez && ($signed(rs_val) <= 0))
    || (is_bgtz && ($signed(rs_val) >  0))
    || (is_bltz && ($signed(rs_val) <  0))
    || (is_bgez && ($signed(rs_val) >= 0));

// High whenever this instruction queues a new target: jumps always,
// branches only when their condition holds.
assign is_taken_branch_or_jump = is_jump || branch_taken;

// Ch274/Ch277 — a branch-likely whose condition is false squashes its
// delay slot: the PC is stepped by 8 so the instruction at PC+4 is
// neither fetched nor retired. Only BEQL and BNEL are covered here;
// BLEZL / BGTZL / REGIMM-likely would each add one more OR term.
logic is_branch_likely_squash;

assign is_branch_likely_squash =
       (is_bnel && (rs_val == rt_val))     // Ch277 — BNEL not taken
    || (is_beql && (rs_val != rt_val));    // Ch274 — BEQL not taken
|
|
|
|
// ------------------------------------------------------------------
// is_nop_class — "no supported encoding matched". Built from four
// disjoint regions of the opcode space:
//   1. SPECIAL with a func that no SPECIAL decode flag claims
//      (Ch292 adds SYNC to the claimed set).
//   2. COP0 that is none of MFC0 / MTC0 / RFE / EI (Ch286).
//   3. REGIMM with an rt other than BLTZ / BGEZ. The link variants
//      (e.g. BLTZAL=0x10, BGEZAL=0x11) stay unsupported until a
//      real-BIOS path needs them.
//   4. Any other opcode that no primary-opcode flag claims. The MMI
//      terms list only the exact sub-instructions decoded so far, so
//      every other MMI word lands here too.
// ------------------------------------------------------------------
assign is_nop_class =
       (is_special && !(   is_syscall   || is_jr || is_jalr
                        || is_rtype_alu
                        || is_hilo_op
                        || is_sync))
    || (is_cop0    && !(is_mfc0 || is_mtc0 || is_rfe || is_ei))
    || (is_regimm  && !(is_bltz || is_bgez))
    || !(   is_special || is_cop0  || is_regimm
         || is_lui     || is_ori   || is_andi
         || is_addi    || is_addiu
         || is_slti    || is_sltiu
         || is_lw      || is_lb    || is_lbu || is_lh || is_lhu
         || is_lq                                   // Ch279
         || is_ld                                   // Ch284
         || is_sw      || is_sb    || is_sh
         || is_sq                                   // Ch271
         || is_sd                                   // Ch275
         || is_beq     || is_beql  || is_bne || is_bnel   // Ch274/Ch277
         || is_blez    || is_bgtz
         || is_j       || is_jal
         || is_pcpyld                               // Ch278
         || is_psubb                                // Ch280
         || is_pnor                                 // Ch281
         || is_pand                                 // Ch282
         || is_pcpyud                               // Ch283
         || is_pcpyh                                // Ch300
         || is_cache);                              // Ch178
|
|
|
|
// Strict-unsupported gate.
//   is_nop_instr   : the all-zero instruction word.
//   is_unsupported : unmatched encoding, with the all-zero word exempted.
//   strict_trap    : is_unsupported, enabled by STRICT_UNSUPPORTED.
logic is_nop_instr;
logic is_unsupported;
logic strict_trap;

assign is_nop_instr   = (instr == 32'h0000_0000);
assign is_unsupported = !is_nop_instr && is_nop_class;
assign strict_trap    = STRICT_UNSUPPORTED && is_unsupported;
|
|
|
|
// ------------------------------------------------------------------
// Ch47 — AdEL / AdES detection on the pending load/store address `ea`.
//
//   access class           flags              low ea bits that must be 0
//   halfword               SH / LH / LHU      ea[0]
//   word                   SW / LW            ea[1:0]
//   doubleword (Ch275/284) SD / LD            ea[2:0]
//   quadword   (Ch271/279) SQ / LQ            ea[3:0]
//   byte                   SB / LB / LBU      unconstrained
//
// Real hardware raises an address-error exception. This stub sends the
// fault through the same trap machinery as STRICT_UNSUPPORTED so the
// first misaligned access halts loudly instead of silently aliasing to
// the nearest aligned slot (ch46 showed that older behaviour clobbering
// neighbouring words and confusing post-run analysis).
//
// NOTE(review): R5900 references describe LQ/SQ as ignoring ea[3:0]
// rather than faulting — confirm the quadword check is the intended
// debug policy and not meant to model hardware.
// ------------------------------------------------------------------
logic is_half_access;
logic is_word_access;
logic is_dword_access;
logic is_quad_access;
logic is_align_fault;
logic is_align_store;
logic align_trap;
logic align_except;

assign is_half_access  = is_lh || is_lhu || is_sh;
assign is_word_access  = is_lw || is_sw;
assign is_dword_access = is_ld || is_sd;     // Ch284 — LD joins SD
assign is_quad_access  = is_lq || is_sq;     // Ch279 — LQ joins SQ

// A fault is any set bit among the low address bits the access class
// requires to be clear.
assign is_align_fault = (is_half_access  && ea[0])
                     || (is_word_access  && (|ea[1:0]))
                     || (is_dword_access && (|ea[2:0]))
                     || (is_quad_access  && (|ea[3:0]));

// Store-side accesses that are subject to an alignment check.
assign is_align_store = is_sh || is_sw || is_sd || is_sq;

// Ch47: TRAP_ALIGN_ERROR=1 — the fault halts the core (debug mode used
// by tb_ee_core_align and the default for iteration).
assign align_trap   =  TRAP_ALIGN_ERROR && is_align_fault;

// Ch49: TRAP_ALIGN_ERROR=0 — the fault vectors to EXC_VECTOR through
// the standard MIPS exception path (BadVAddr + Cause ExcCode + EPC +
// Status IE/KU stack shift).
assign align_except = !TRAP_ALIGN_ERROR && is_align_fault;
|
|
|
|
// Ch50 — exception vector selection on Status.BEV, shared by
// interrupts and synchronous faults (AdEL/AdES):
//   BEV = 1 (boot / reset)                    -> BEV_EXC_VECTOR, the ROM
//       bootstrap vector (default 0xBFC0_0380, the standard MIPS R4000
//       "common" vector for BEV=1).
//   BEV = 0 (BIOS installed its RAM handler)  -> EXC_VECTOR.
logic [31:0] exc_target_pc;

assign exc_target_pc = (!status_bev) ? EXC_VECTOR : BEV_EXC_VECTOR;
|
|
|
|
// Immediate-ALU result. Reads as zero for any instruction that is not
// one of the immediate ops below. The first matching arm wins, in the
// same priority order as the decode flags are listed.
logic [31:0] alu_wb;

always_comb begin
    alu_wb = 32'd0;
    case (1'b1)
        // LUI: immediate placed in the upper halfword.
        is_lui:              alu_wb = {imm16, 16'd0};
        // ORI / ANDI use the imm_zx form of the immediate.
        is_ori:              alu_wb = rs_val | imm_zx;
        is_andi:             alu_wb = rs_val & imm_zx;
        // ADDI is computed exactly like ADDIU (no overflow trap modelled).
        is_addi || is_addiu: alu_wb = rs_val + imm_sx;
        // SLTI: signed compare; SLTIU: unsigned compare against imm_sx.
        is_slti:             alu_wb = {31'd0, ($signed(rs_val) < $signed(imm_sx))};
        is_sltiu:            alu_wb = {31'd0, (rs_val < imm_sx)};
        default: ;
    endcase
end
|
|
|
|
// ------------------------------------------------------------------
// R-type ALU, low-32 result (destination = rd).
//
//   arith    ADD / ADDU / SUB / SUBU    func 0x20 / 0x21 / 0x22 / 0x23
//   logic    AND / OR / XOR / NOR       func 0x24-0x27
//   compare  SLT / SLTU                 func 0x2A / 0x2B
//   shift    SLL / SRL / SRA            func 0x00 / 0x02 / 0x03
//   plus DADDU / DSUBU / DSLL (computed on the low 32 bits here), the
//   register-count shifts, and the low-32 view of each modelled MMI op.
//
// ADD/SUB architecturally raise Arithmetic Overflow on signed
// overflow. That exception is not modelled, so ADD is computed as ADDU
// and SUB as SUBU — the same pragmatic policy as ADDI vs ADDIU. BIOS
// code uses ADD/SUB where overflow cannot happen in practice, so the
// difference only matters if something overflows deliberately.
//
// Shifts operate on rt_val. The immediate forms take their count from
// `shamt` (instr[10:6]); SRA/SRAV wrap the operand in $signed() so
// `>>>` replicates the MSB.
//
// SLL $0,$0,0 (the architectural NOP) still produces 0 here; the
// rd_idx=0 guard in the writeback path blocks the phantom register
// write.
// ------------------------------------------------------------------
logic [31:0] rtype_alu_wb;

always_comb begin
    rtype_alu_wb = 32'd0;
    case (1'b1)
        // Bitwise; PAND / PNOR reuse the scalar expressions for their
        // low 32 bits.
        is_and || is_pand:
            rtype_alu_wb = rs_val & rt_val;
        is_or:
            rtype_alu_wb = rs_val | rt_val;
        is_xor:
            rtype_alu_wb = rs_val ^ rt_val;
        is_nor || is_pnor:
            rtype_alu_wb = ~(rs_val | rt_val);

        // Add / subtract. Ch305 — DSUBU contributes its low 32 bits.
        is_add || is_addu || is_daddu:
            rtype_alu_wb = rs_val + rt_val;
        is_sub || is_subu || is_dsubu:
            rtype_alu_wb = rs_val - rt_val;

        // Set-on-less-than, signed then unsigned.
        is_slt:
            rtype_alu_wb = {31'd0, ($signed(rs_val) < $signed(rt_val))};
        is_sltu:
            rtype_alu_wb = {31'd0, (rs_val < rt_val)};

        // Left shift by shamt.
        is_sll || is_dsll:
            rtype_alu_wb = rt_val << shamt;

        // Ch278 — MMI2/PCPYLD. Architecturally rd[63:0] = rt[63:0] and
        // rd[127:64] = rs[63:0]; the low-32 view is simply rt's low
        // word. The upper half cannot be expressed in 32 bits.
        is_pcpyld:
            rtype_alu_wb = rt_val;

        // Ch280 — MMI0/PSUBB. Four of the sixteen architectural byte
        // lanes fit in the low 32 bits. Each lane subtracts modulo 256
        // with no borrow crossing into its neighbour.
        is_psubb: begin
            for (int lane = 0; lane < 4; lane++) begin
                rtype_alu_wb[8*lane +: 8] =
                    rs_val[8*lane +: 8] - rt_val[8*lane +: 8];
            end
        end

        // Ch283 — MMI3/PCPYUD: rd[63:0] = rt[127:64], so rd's low word
        // is rt[95:64]. This value lands in the regfile mirror; the
        // 128-bit result is produced by the gpr128 path.
        is_pcpyud:
            rtype_alu_wb = rt128_val[95:64];

        // Ch300 — MMI3/PCPYH: rt[15:0] is replicated into every
        // halfword of rd's low doubleword, so the low word is {h0, h0}.
        is_pcpyh:
            rtype_alu_wb = {rt128_val[15:0], rt128_val[15:0]};

        // Right shifts by shamt.
        is_srl:
            rtype_alu_wb = rt_val >> shamt;
        is_sra:
            rtype_alu_wb = $signed(rt_val) >>> shamt;

        // Ch67 — variable shifts: the count is rs_val[4:0].
        is_sllv:
            rtype_alu_wb = rt_val << rs_val[4:0];
        is_srlv:
            rtype_alu_wb = rt_val >> rs_val[4:0];
        is_srav:
            rtype_alu_wb = $signed(rt_val) >>> rs_val[4:0];

        default: ;
    endcase
end
|
|
|
|
// ------------------------------------------------------------------
// Ch283 — full-width MMI result, computed alongside rtype_alu_wb (the
// legacy low-32 value). The writeback block picks rtype_alu128_wb for
// MMI ops and {96'd0, rtype_alu_wb} for scalar ops when it updates
// gpr128[rd]. Six MMI ops produce a value here (PCPYLD, PSUBB, PNOR,
// PAND, PCPYUD, and PCPYH since Ch300); for anything else the result
// reads as zero and is_mmi_wb is low.
// ------------------------------------------------------------------
logic [127:0] rtype_alu128_wb;
logic         is_mmi_wb;

// High exactly when one of the ops handled below is decoded.
assign is_mmi_wb = is_psubb  || is_pnor   || is_pand
                || is_pcpyld || is_pcpyud || is_pcpyh;

always_comb begin
    rtype_alu128_wb = 128'd0;
    case (1'b1)
        // rd[127:64] = rs[63:0]; rd[63:0] = rt[63:0].
        is_pcpyld:
            rtype_alu128_wb = {rs128_val[63:0], rt128_val[63:0]};

        // Sixteen independent byte-lane subtracts.
        is_psubb: begin
            for (int lane = 0; lane < 16; lane++) begin
                rtype_alu128_wb[8*lane +: 8] =
                    rs128_val[8*lane +: 8] - rt128_val[8*lane +: 8];
            end
        end

        // Bitwise NOR / AND across all 128 bits.
        is_pnor:
            rtype_alu128_wb = ~(rs128_val | rt128_val);
        is_pand:
            rtype_alu128_wb = rs128_val & rt128_val;

        // rd[127:64] = rs[127:64]; rd[63:0] = rt[127:64].
        is_pcpyud:
            rtype_alu128_wb = {rs128_val[127:64], rt128_val[127:64]};

        // Ch300 — Parallel Copy Halfword. $rs is architecturally
        // ignored.
        //   h0 = rt[15:0]   -> replicated over rd[63:0]
        //   h4 = rt[79:64]  -> replicated over rd[127:64]
        is_pcpyh:
            rtype_alu128_wb = { {4{rt128_val[79:64]}},
                                {4{rt128_val[15:0]}} };

        default: ;
    endcase
end
|
|
|
|
// COP0 read mux, indexed by the rd field. Registers that are not listed
// read back as zero.
logic [31:0] cop0_read_val;

always_comb begin
    cop0_read_val = 32'd0;
    unique case (rd_idx)
        COP0_REG_STATUS:   cop0_read_val = status_word;
        COP0_REG_CAUSE:    cop0_read_val = cause_word;
        COP0_REG_EPC:      cop0_read_val = epc;
        COP0_REG_BADVADDR: cop0_read_val = badvaddr;
        COP0_REG_COUNT:    cop0_read_val = cop0_count;
        default: ;
    endcase
end
|
|
|
|
// COP0 Count: cleared while rst_n is low, otherwise incremented by one
// on every rising clock edge regardless of FSM state. It lives in its
// own process, apart from the main FSM reset block, so its behaviour
// can be audited in isolation.
always_ff @(posedge clk) begin
    if (!rst_n) begin
        cop0_count <= 32'd0;
    end else begin
        cop0_count <= cop0_count + 32'd1;
    end
end
|
|
|
|
// Target address queued by a taken branch or jump. Later assignments
// override earlier ones, giving the priority order
//   JR / JALR (register target) > J / JAL (j_tgt) > branch_tgt.
logic [31:0] taken_target;

always_comb begin
    taken_target = branch_tgt;
    if (is_j  || is_jal)  taken_target = j_tgt;
    if (is_jr || is_jalr) taken_target = rs_val;
end
|
|
|
|
// ------------------------------------------------------------------
// Trace book-keeping (captured at retire)
// ------------------------------------------------------------------

// Identity of the retired instruction plus two free-form arguments.
logic [31:0] retired_pc;
logic [31:0] retired_instr;
logic [31:0] retired_arg2;
logic [31:0] retired_arg3;

// Per-retire classification flags.
logic        retired_flag_write;
logic        retired_flag_read;
logic        retired_flag_branch;
logic        retired_flag_halt;
logic        retired_flag_in_delay;
logic        retired_flag_except;
logic        retired_flag_rfe;
logic        retired_flag_trap;

// Retire strobe: the main FSM clears it by default every cycle and
// sets it when an instruction retires.
logic        retire_pulse;
|
|
|
|
// ------------------------------------------------------------------
// Map-port drive (combinational on state)
// ------------------------------------------------------------------
//
// Drives the memory-map read and write request ports purely from the
// current FSM state and the decoded instruction. Every output has an
// inactive default, so a state without an arm below issues no request.
//
//   S_IFETCH_REQ : read at pc.
//   S_MEM_REQ    : read at ea, or ea + 4*sq_beat for LQ / LD.
//   S_CH215_REQ  : read of one jmp_buf word.
//   S_MEM_WRITE  : write with data and byte enables chosen by the
//                  store width.

always_comb begin
    map_rd_en   = 1'b0;
    map_rd_addr = 32'd0;
    map_wr_en   = 1'b0;
    map_wr_addr = 32'd0;
    map_wr_data = 32'd0;
    map_wr_be   = 4'd0;

    case (state)
        S_IFETCH_REQ: begin
            map_rd_en   = 1'b1;
            map_rd_addr = pc;
        end

        S_MEM_REQ: begin
            map_rd_en   = 1'b1;
            // Ch283 — LQ is a real 4-beat load that uses sq_beat as its
            // beat counter (the same counter SQ uses). Ch284 — LD uses
            // the same addressing for 2 beats. LW/LB/LBU/LH/LHU are
            // single-beat and read at ea.
            map_rd_addr = (is_lq || is_ld)
                        ? (ea + {28'd0, sq_beat, 2'b00})
                        : ea;
        end

        // Ch215 — read port for the jmp_buf restore FSM.
        // Address = base + count*4 (offsets 0..0x2C).
        S_CH215_REQ: begin
            map_rd_en   = 1'b1;
            map_rd_addr = CH215_JMPBUF_BASE
                        + {26'd0, ch215_count, 2'b00};
        end

        S_MEM_WRITE: begin
            map_wr_en   = 1'b1;
            // Ch271/Ch275 — multi-beat stores (SQ/SD) write beat N at
            // ea + 4*N. Single-beat stores (SW/SB/SH) write at ea.
            map_wr_addr = (is_sq || is_sd)
                        ? (ea + {28'd0, sq_beat, 2'b00})
                        : ea;

            if (is_sq || is_sd) begin
                // Ch283 — each beat is one 32-bit lane of the 128-bit
                // $rt value. SQ walks all four lanes; SD only reaches
                // the low two. rt128_val is the shared read helper and
                // already returns zero for rt == 0, so no separate
                // $zero guard is needed here.
                case (sq_beat)
                    2'd0: map_wr_data = rt128_val[ 31:  0];
                    2'd1: map_wr_data = rt128_val[ 63: 32];
                    2'd2: map_wr_data = rt128_val[ 95: 64];
                    2'd3: map_wr_data = rt128_val[127: 96];
                endcase
                map_wr_be = 4'b1111;
            end else if (is_sb) begin
                // Byte store: rt_val[7:0] is placed in the lane picked
                // by ea[1:0], with a one-hot byte enable for that lane.
                case (ea[1:0])
                    2'd0: begin
                        map_wr_data = {24'd0, rt_val[7:0]};
                        map_wr_be   = 4'b0001;
                    end
                    2'd1: begin
                        map_wr_data = {16'd0, rt_val[7:0], 8'd0};
                        map_wr_be   = 4'b0010;
                    end
                    2'd2: begin
                        map_wr_data = {8'd0, rt_val[7:0], 16'd0};
                        map_wr_be   = 4'b0100;
                    end
                    2'd3: begin
                        map_wr_data = {rt_val[7:0], 24'd0};
                        map_wr_be   = 4'b1000;
                    end
                endcase
            end else if (is_sh) begin
                // Halfword store: rt_val[15:0] is placed in the half
                // picked by ea[1], with a 2-of-4 byte enable so the
                // other half is untouched. ea[0] is not examined here;
                // a misaligned SH is flagged separately by
                // is_align_fault (is_half_access includes SH).
                case (ea[1])
                    1'b0: begin
                        map_wr_data = {16'd0, rt_val[15:0]};
                        map_wr_be   = 4'b0011;
                    end
                    1'b1: begin
                        map_wr_data = {rt_val[15:0], 16'd0};
                        map_wr_be   = 4'b1100;
                    end
                endcase
            end else begin
                // SW — full word.
                map_wr_data = rt_val;
                map_wr_be   = 4'b1111;
            end
        end

        default: ;
    endcase
end
|
|
|
|
// ------------------------------------------------------------------
// Retire helper — pc advance, branch queuing, exception entry
// ------------------------------------------------------------------
//
// Called from the clocked FSM when an instruction retires. It works
// out where execution resumes and then either commits that PC
// (queuing a new branch if this instruction produced one) or enters
// the interrupt exception path.

task automatic retire_advance;
    logic [31:0] resume_pc;      // PC of the next instruction to run
    logic        queue_branch;   // this instruction queues a target
    logic [31:0] queue_target;   // the target it queues
    logic        irq_ready;      // some unmasked Cause.IP bit is set
    logic        take_irq;       // enter the exception path now

    // Normal resume point: the queued branch target when the retiring
    // instruction sat in a delay slot, otherwise the next word.
    resume_pc = branch_pending ? branch_target : (pc + 32'd4);

    // Ch274/Ch277 — a not-taken BEQL/BNEL squashes its delay slot, so
    // execution resumes at PC+8 and the slot instruction never runs.
    if (is_branch_likely_squash)
        resume_pc = pc + 32'd8;

    queue_branch = is_taken_branch_or_jump;
    queue_target = taken_target;

    // An interrupt is taken only when it is pending and unmasked,
    // interrupts are enabled, and this instruction is not queuing a
    // branch.
    irq_ready = |(cause_ip & status_im);
    take_irq  = irq_ready && status_iec && !queue_branch;

    if (take_irq) begin
        // Exception entry: record the resume point, push the IE/KU
        // stack one level and vector away.
        epc                 <= resume_pc;
        cause_exc_code      <= 5'h00;

        status_ieo          <= status_iep;
        status_iep          <= status_iec;
        status_iec          <= 1'b0;

        status_kuo          <= status_kup;
        status_kup          <= status_kuc;
        status_kuc          <= 1'b0;

        pc                  <= exc_target_pc;    // Ch50: BEV select
        branch_pending      <= 1'b0;
        retired_flag_except <= 1'b1;
    end else begin
        // Ordinary retire: commit the resume PC. branch_target is
        // written only when a new branch is actually queued.
        pc                  <= resume_pc;
        branch_pending      <= queue_branch;
        if (queue_branch)
            branch_target   <= queue_target;
        retired_flag_except <= 1'b0;
    end
endtask
|
|
|
|
// ------------------------------------------------------------------
|
|
// Main FSM
|
|
// ------------------------------------------------------------------
|
|
|
|
always_ff @(posedge clk) begin
|
|
if (!rst_n) begin
|
|
state <= S_IDLE;
|
|
pc <= PC_RESET;
|
|
instr <= 32'd0;
|
|
branch_pending <= 1'b0;
|
|
branch_target <= 32'd0;
|
|
instr_in_delay_slot <= 1'b0;
|
|
ch215_count <= 4'd0;
|
|
sq_beat <= 2'd0;
|
|
|
|
status_iec <= 1'b0;
|
|
status_iep <= 1'b0;
|
|
status_ieo <= 1'b0;
|
|
status_kuc <= 1'b0;
|
|
status_kup <= 1'b0;
|
|
status_kuo <= 1'b0;
|
|
status_im <= 8'd0;
|
|
status_bev <= INIT_BEV; // Ch50
|
|
cause_exc_code <= 5'd0;
|
|
cause_ip_sw <= 8'd0;
|
|
epc <= 32'd0;
|
|
badvaddr <= 32'd0; // Ch49
|
|
|
|
retire_pulse <= 1'b0;
|
|
retired_pc <= 32'd0;
|
|
retired_instr <= 32'd0;
|
|
retired_arg2 <= 32'd0;
|
|
retired_arg3 <= 32'd0;
|
|
retired_flag_write <= 1'b0;
|
|
retired_flag_read <= 1'b0;
|
|
retired_flag_branch <= 1'b0;
|
|
retired_flag_halt <= 1'b0;
|
|
retired_flag_in_delay <= 1'b0;
|
|
retired_flag_except <= 1'b0;
|
|
retired_flag_rfe <= 1'b0;
|
|
retired_flag_trap <= 1'b0;
|
|
|
|
trap_o <= 1'b0;
|
|
trap_pc_o <= 32'd0;
|
|
trap_instr_o <= 32'd0;
|
|
|
|
for (int i = 0; i < 32; i++) regfile[i] <= 32'd0;
|
|
// Ch283 — gpr128 shadow starts at zero everywhere.
|
|
for (int i = 0; i < 32; i++) gpr128[i] <= 128'd0;
|
|
hi_reg <= 32'd0;
|
|
lo_reg <= 32'd0;
|
|
end else begin
|
|
retire_pulse <= 1'b0;
|
|
|
|
case (state)
|
|
S_IDLE: begin
|
|
if (go_i) state <= S_IFETCH_REQ;
|
|
end
|
|
|
|
S_IFETCH_REQ: state <= S_IFETCH_WAIT;
|
|
|
|
S_IFETCH_WAIT: begin
|
|
if (map_rd_valid) begin
|
|
instr <= map_rd_data;
|
|
instr_in_delay_slot <= branch_pending;
|
|
state <= S_EXECUTE;
|
|
end
|
|
end
|
|
|
|
S_EXECUTE: begin
|
|
retired_pc <= pc;
|
|
retired_instr <= instr;
|
|
retired_arg2 <= 32'd0;
|
|
retired_arg3 <= 32'd0;
|
|
retired_flag_write <= 1'b0;
|
|
retired_flag_read <= 1'b0;
|
|
retired_flag_branch <= is_taken_branch_or_jump;
|
|
retired_flag_halt <= 1'b0;
|
|
retired_flag_in_delay <= instr_in_delay_slot;
|
|
retired_flag_except <= 1'b0;
|
|
retired_flag_rfe <= 1'b0;
|
|
retired_flag_trap <= 1'b0;
|
|
|
|
if (is_syscall) begin
|
|
// Ch199 — narrow _ReturnFromException(2)
|
|
// semantics: when SYSCALL fires with the
|
|
// exact contract $v1==8 && $a0==2, do NOT
|
|
// halt. Ch215 extends this with an optional
|
|
// jmp_buf restore (when CH215_JMPBUF_RESTORE_
|
|
// ENABLE=1) that walks the BIOS-side jmp_buf
|
|
// at CH215_JMPBUF_BASE and loads $ra/$sp/$fp/
|
|
// $s0..$s7/$gp from there before resuming.
|
|
// Other syscall variants keep the pre-Ch199
|
|
// halt path so the ch197 verdict still fires.
|
|
if (regfile[3] == 32'd8 && regfile[4] == 32'd2) begin
|
|
branch_pending <= 1'b0;
|
|
status_iec <= status_iep;
|
|
status_iep <= status_ieo;
|
|
status_kuc <= status_kup;
|
|
status_kup <= status_kuo;
|
|
retired_flag_rfe <= 1'b1;
|
|
retired_flag_halt <= 1'b0;
|
|
retire_pulse <= 1'b1;
|
|
if (CH215_JMPBUF_RESTORE_ENABLE) begin
|
|
// Ch215 — enter the 12-step jmp_buf
|
|
// restore FSM. PC update deferred
|
|
// until the FSM has loaded $ra at
|
|
// count=0; final transition to
|
|
// S_IFETCH_REQ sets pc<-regfile[31].
|
|
ch215_count <= 4'd0;
|
|
state <= S_CH215_REQ;
|
|
end else begin
|
|
// Ch199 minimal: PC<-$k0, no GPR restore.
|
|
pc <= regfile[26];
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
end else if (EE_SYSCALL_HLE_ENABLE) begin
|
|
// Ch273 — minimal EE syscall HLE for the
|
|
// ELF runner. Known $v1 values get a stub
|
|
// return; unknown $v1 falls through to
|
|
// halt so the next blocker surfaces.
|
|
// PC advances to pc+4 (normal user-code
|
|
// syscall resume), NOT RFE — that's
|
|
// Ch199's path.
|
|
case (regfile[3]) // $v1
|
|
32'h0000_003C: begin
|
|
// EndOfHeap — top-of-usable-RAM.
|
|
regfile[2] <= SYSCALL_HEAP_END;
|
|
gpr128[2] <= {96'd0, SYSCALL_HEAP_END}; // Ch283
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_003D: begin
|
|
// InitMainThread — stub success.
|
|
// No scheduler/thread state mutated.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0; // Ch283
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0040: begin
|
|
// Ch285 — syscall #64. qbert hits
|
|
// this at PC 0x00111D24 with
|
|
// $a0=heap-ish, $a1=code-ptr-ish.
|
|
// Almost certainly a kernel
|
|
// registration / handler-install
|
|
// call (the standard PS2 syscall
|
|
// table lists names like
|
|
// SetVCommonHandler / SetV
|
|
// TLBRefillHandler in this slot).
|
|
// Per Codex framing, we accept it
|
|
// with the least-invasive shape:
|
|
// $v0 = 0 ("registered OK") and
|
|
// PC += 4. If qbert misbranches
|
|
// downstream, revisit and return
|
|
// the previous handler pointer
|
|
// instead of zero.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0064: begin
|
|
// FlushCache — cacheless model, no-op.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0; // Ch283
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0078: begin
|
|
// Ch289 — syscall #120. qbert hits
|
|
// this at PC 0x00112AA4 with
|
|
// $a1=0x00130000, $a2=0x20000000
|
|
// (uncached-pointer base), $a3=
|
|
// 0x001328C0. Args look like setup/
|
|
// registration parameters (likely
|
|
// threading/heap/uncached-memory
|
|
// related per Codex framing). First
|
|
// pass per the Ch285 precedent:
|
|
// accept with $v0 = 0 ("kernel
|
|
// setup OK") and PC += 4. If qbert
|
|
// misbranches downstream, revisit
|
|
// and try $a2/$a1 as the return.
|
|
// The ELF runner adds a named
|
|
// SUMMARY line for this syscall so
|
|
// post-run analysis can confirm the
|
|
// arg shape without re-reading the
|
|
// trace file.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_006B: begin
|
|
// Ch304 — syscall #107. qbert hits
|
|
// this at PC 0x00111D64 (the 0x6B
|
|
// wrapper in Table1 @0x00111D40).
|
|
// Ch303 autopsy proved the caller
|
|
// at 0x00111B00 IGNORES the return
|
|
// value: after `jal 0x00111d60` the
|
|
// next instruction (0x111B24) sets
|
|
// $a1=0 without reading $v0. So
|
|
// $v0=0 is safe. Args at the call:
|
|
// $a0=5 (channel), $a1=0,
|
|
// $a2=0xFFFFFFFF, $a3=0x00137568.
|
|
// Per Codex: add 0x6B ALONE first
|
|
// so the next run confirms the
|
|
// Ch303-predicted table flow (next
|
|
// blocker should be 0x76, 0x44, or
|
|
// 0xFFFFFFBD). Ch305 batches the
|
|
// rest once the flow is confirmed.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0013: begin
|
|
// Ch302 — syscall #19. qbert hits
|
|
// this at PC 0x00112A64 right after
|
|
// Ch301's 0x17 with LITERALLY
|
|
// IDENTICAL args:
|
|
// $a0 = 5 (channel id)
|
|
// $a1 = 0
|
|
// $a2 = 0xFFFFFFFF (sentinel/-1)
|
|
// $a3 = 0x00137568 (per-channel
|
|
// ctx, same as
|
|
// 0x17)
|
|
// Second paired-call pattern on the
|
|
// syscall track (first was Ch290/291
|
|
// 0x12/0x16). 0x17+0x13 are the
|
|
// "set + register" pair for channel
|
|
// 5's new context. Per Codex: accept
|
|
// ($v0 = 0, PC += 4); paired-call
|
|
// symmetry makes this well-supported.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0017: begin
|
|
// Ch301 — syscall #23. qbert hits
|
|
// this at PC 0x00112A84 right after
|
|
// Ch300's PCPYH. Args:
|
|
// $a0 = 5 (channel id —
|
|
// matches the
|
|
// Ch290/291
|
|
// handler slot)
|
|
// $a1 = 0
|
|
// $a2 = 0xFFFFFFFF (sentinel/-1)
|
|
// $a3 = 0x00137568 (NEW context
|
|
// ptr, NOT the
|
|
// prior global
|
|
// ctx 0x001328C0
|
|
// — second
|
|
// context-shift
|
|
// on syscall
|
|
// track)
|
|
// PS2 standard table cites syscall
|
|
// 23 plausibly as SetVTLBRefill-
|
|
// Handler or iWakeupThread. The
|
|
// $a0=5 channel-id pattern + $a2=
|
|
// -1 sentinel fit a per-channel
|
|
// kernel call. Per Codex: accept
|
|
// ($v0 = 0, PC += 4); the runner
|
|
// observer with distinct-tuple
|
|
// tracking will surface whether
|
|
// qbert calls 0x17 multiple times
|
|
// with varying args.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0077: begin
|
|
// Ch297 — syscall #119. qbert hits
|
|
// this at PC 0x00111D84 right after
|
|
// Ch296's 0x79 acceptance, with a
|
|
// NOTICEABLY DIFFERENT arg shape:
|
|
// $a0 = 0x001DFD50 (heap address,
|
|
// not kseg0 base)
|
|
// $a1 = 1
|
|
// $a2 = 0
|
|
// $a3 = 20 (small int, NOT the
|
|
// global ctx
|
|
// pointer threaded
|
|
// through 0x78/
|
|
// 0x12/0x16/0x7A/
|
|
// 0x79)
|
|
// The $a3 change is the strongest
|
|
// "syscall-family boundary" signal
|
|
// we've seen — qbert has crossed
|
|
// into a different kernel call
|
|
// convention. PS2 standard table
|
|
// cites syscall 119 as plausibly
|
|
// SetVTLBRefillHandler or
|
|
// RegisterLibraryEntries. Per Codex:
|
|
// accept first-pass ($v0 = 0,
|
|
// PC += 4) but treat the runner-side
|
|
// observer as the falsifiability
|
|
// surface (richer than prior
|
|
// observers — first/last args, up
|
|
// to 4 distinct tuples).
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0079: begin
|
|
// Ch296 — syscall #121. qbert hits
|
|
// this at PC 0x00111D94 right after
|
|
// Ch295's $a0-aware 0x7A patch
|
|
// unblocked the wait loop. Args:
|
|
// $a0 = 0x80000000 (kseg0 base —
|
|
// same shape as
|
|
// Ch293's 0x7A
|
|
// init call)
|
|
// $a1 = 0
|
|
// $a3 = 0x001328C0 (same global
|
|
// ctx threaded
|
|
// throughout)
|
|
// PC sits in the same kernel-
|
|
// wrapper neighborhood as the
|
|
// Ch289 syscall 0x78 site (PC
|
|
// 0x00111D24). Likely an
|
|
// adjacent finalize/reset call.
|
|
// Per Codex: accept ($v0 = 0,
|
|
// PC += 4), keep marked as
|
|
// adjacent/experimental like 0x7A.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_007A: begin
|
|
// Ch293/Ch295 — syscall #122,
|
|
// $a0-aware HLE.
|
|
//
|
|
// Ch294's autopsy showed qbert calls
|
|
// syscall 0x7A with two distinct
|
|
// arg shapes and expects different
|
|
// return values:
|
|
// $a0 = 0x80000000 → $v0 = 0
|
|
// (init-style call;
|
|
// falls into wait loop)
|
|
// $a0 = 0x00000004 → $v0 with
|
|
// bit 17 (0x00020000) set
|
|
// (poll-style call; bit 17
|
|
// is qbert's readiness
|
|
// flag, mask in $s0 at
|
|
// PC 0x00112418).
|
|
// Ch295 is Codex's "Strategy A"
|
|
// EXPERIMENTAL unblock — return the
|
|
// expected bit shape based on $a0
|
|
// alone. Not architectural truth;
|
|
// the real PS2 syscall 122 semantics
|
|
// are still TBD. If qbert misbranches
|
|
// downstream we back this out and
|
|
// pursue real-SDK lookup or
|
|
// interrupt-delivery wiring.
|
|
if (regfile[4] == 32'h0000_0004) begin
|
|
regfile[2] <= 32'h0002_0000;
|
|
gpr128[2] <= {96'd0, 32'h0002_0000};
|
|
end else begin
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
end
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0016: begin
|
|
// Ch291 — syscall #22. qbert hits
|
|
// this at PC 0x00112A74 with args
|
|
// LITERALLY IDENTICAL to the Ch290
|
|
// syscall 0x12 call eight
|
|
// instructions earlier:
|
|
// $a0 = 0x05 (channel/slot)
|
|
// $a1 = 0x00112AB0 (fn ptr)
|
|
// $a2 = 0x00000000
|
|
// $a3 = 0x001328C0 (ctx ptr)
|
|
// PS2 standard syscall table cites
|
|
// `EnableDmacHandler` (or
|
|
// EnableIntcHandler) in this slot —
|
|
// the activation companion to Ch290's
|
|
// AddDmacHandler. Per Codex framing:
|
|
// accept the enable ($v0 = 0, PC +=
|
|
// 4); do NOT actually call the
|
|
// handler or synthesize a DMAC
|
|
// completion. If qbert subsequently
|
|
// polls for the handler to fire,
|
|
// Ch292 will need to model
|
|
// handler-invocation; for now just
|
|
// accept the registration+enable
|
|
// pair and see what qbert demands.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
32'h0000_0012: begin
|
|
// Ch290 — syscall #18. qbert hits
|
|
// this at PC 0x00112A54 right after
|
|
// Ch289's 0x78 with the classic
|
|
// handler-install arg shape:
|
|
// $a0 = 0x05 (channel /
|
|
// event id /
|
|
// handler slot)
|
|
// $a1 = 0x00112AB0 (fn pointer
|
|
// in code seg)
|
|
// $a2 = 0x00000000
|
|
// $a3 = 0x001328C0 (ctx ptr,
|
|
// same global
|
|
// block as
|
|
// 0x78's $a3)
|
|
// PS2 standard syscall table cites
|
|
// names like AddDmacHandler in slot
|
|
// 18 ($a0 = DMAC channel; $a0=5 =
|
|
// SIF0). Per Codex: accept the
|
|
// registration ($v0 = 0, PC += 4),
|
|
// do NOT invoke the handler or
|
|
// mutate DMAC/INTC state. The ELF
|
|
// runner observes args for the
|
|
// first occurrence so the SUMMARY
|
|
// shows the registration shape.
|
|
regfile[2] <= 32'd0;
|
|
gpr128[2] <= 128'd0;
|
|
pc <= pc + 32'd4;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
default: begin
|
|
// Unhandled — halt; TB reads
|
|
// $v1/$a0..$a3 hierarchically for
|
|
// the verdict.
|
|
retired_flag_halt <= 1'b1;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_HALT;
|
|
end
|
|
endcase
|
|
end else begin
|
|
retired_flag_halt <= 1'b1;
|
|
retire_pulse <= 1'b1;
|
|
state <= S_HALT;
|
|
end
|
|
end else if (strict_trap) begin
|
|
retired_flag_trap <= 1'b1;
|
|
retire_pulse <= 1'b1;
|
|
trap_o <= 1'b1;
|
|
trap_pc_o <= pc;
|
|
trap_instr_o <= instr;
|
|
state <= S_HALT;
|
|
end else if (align_trap) begin
|
|
// Ch47: AdEL/AdES address error, debug-halt
|
|
// mode (TRAP_ALIGN_ERROR=1). retired_arg2
|
|
// carries the offending EA so traces name
|
|
// the byte address that caused the fault.
|
|
retired_flag_trap <= 1'b1;
|
|
retire_pulse <= 1'b1;
|
|
trap_o <= 1'b1;
|
|
trap_pc_o <= pc;
|
|
trap_instr_o <= instr;
|
|
retired_arg2 <= ea;
|
|
retired_arg3 <= 32'd0;
|
|
state <= S_HALT;
|
|
end else if (align_except) begin
|
|
// Ch49: AdEL/AdES synchronous exception
|
|
// (TRAP_ALIGN_ERROR=0). Take the MIPS
|
|
// exception path instead of halting — the
|
|
// BIOS presumably has a handler at
|
|
// EXC_VECTOR that fixes up the misaligned
|
|
// access and RFEs back.
|
|
//
|
|
// EPC := pc of the FAULTING instruction
|
|
// (not next_pc as retire_advance does for
|
|
// inter-instruction interrupts). Real MIPS
|
|
// additionally sets Cause.BD=1 and writes
|
|
// EPC = (branch pc) when the fault fires in
|
|
// a delay slot; we don't model Cause.BD yet
|
|
// — a delay-slot AdES simply records the
|
|
// SW's own pc and the handler is expected
|
|
// to cope. Revisit if BIOS emulation
|
|
// depends on the BD bit.
|
|
epc <= pc;
|
|
badvaddr <= ea;
|
|
cause_exc_code <= is_align_store ? EXC_CODE_ADES
|
|
: EXC_CODE_ADEL;
|
|
status_ieo <= status_iep;
|
|
status_iep <= status_iec;
|
|
status_iec <= 1'b0;
|
|
status_kuo <= status_kup;
|
|
status_kup <= status_kuc;
|
|
status_kuc <= 1'b0;
|
|
pc <= exc_target_pc; // Ch50: BEV select
|
|
branch_pending <= 1'b0;
|
|
|
|
// Retire event: treat the fault as an
|
|
// exception-retire (flag bit 5), carry ea
|
|
// in arg2 so traces name the bad address
|
|
// directly, and clear the load/store flags
|
|
// since the SW/LW did not actually execute.
|
|
retired_pc <= pc;
|
|
retired_instr <= instr;
|
|
retired_arg2 <= ea;
|
|
retired_arg3 <= 32'd0;
|
|
retired_flag_write <= 1'b0;
|
|
retired_flag_read <= 1'b0;
|
|
retired_flag_branch <= 1'b0;
|
|
retired_flag_halt <= 1'b0;
|
|
retired_flag_except <= 1'b1;
|
|
retired_flag_rfe <= 1'b0;
|
|
retired_flag_trap <= 1'b0;
|
|
retired_flag_in_delay <= instr_in_delay_slot;
|
|
|
|
retire_pulse <= 1'b1;
|
|
state <= S_IFETCH_REQ;
|
|
end else if (is_lw || is_lb || is_lbu || is_lh || is_lhu) begin
|
|
state <= S_MEM_REQ;
|
|
end else if (is_lq) begin
|
|
// Ch283 — LQ: 4-beat 32-bit load FSM. Beat N
|
|
// captures mem[ea + N*4] into the matching
|
|
// 32-bit lane of gpr128[rt]. After beat 3, the
|
|
// low 32 are mirrored to regfile[rt]. Replaces
|
|
// the Ch279 single-beat LW-style approximation.
|
|
sq_beat <= 2'd0;
|
|
state <= S_MEM_REQ;
|
|
end else if (is_ld) begin
|
|
// Ch284 — LD: 2-beat 32-bit load FSM mirroring
|
|
// SD's beat layout. Beat 0 → gpr128[rt][31:0];
|
|
// beat 1 → gpr128[rt][63:32] plus the regfile[rt]
|
|
// mirror of the low 32. gpr128[rt][127:64] is
|
|
// preserved (architectural LD only loads doubleword).
|
|
sq_beat <= 2'd0;
|
|
state <= S_MEM_REQ;
|
|
end else if (is_sw || is_sb || is_sh) begin
|
|
state <= S_MEM_WRITE;
|
|
end else if (is_sq) begin
|
|
// Ch271 — SQ: 4-beat 32-bit write FSM.
|
|
// Beat 0 emits the lower 32 bits of rt; beats
|
|
// 1-3 emit zero (upper 96 bits of $rt aren't
|
|
// modelled). For sq $zero,... all four beats
|
|
// are zero — the qbert prolog case.
|
|
sq_beat <= 2'd0;
|
|
state <= S_MEM_WRITE;
|
|
end else if (is_sd) begin
|
|
// Ch275 — SD: 2-beat 32-bit write FSM.
|
|
// Beat 0 emits rt_val[31:0]; beat 1 emits 0
|
|
// (upper 32 bits of $rt aren't modelled). qbert
|
|
// does sd $ra, 0x20($sp) in a function prologue.
|
|
sq_beat <= 2'd0;
|
|
state <= S_MEM_WRITE;
|
|
end else begin
|
|
if ((is_lui || is_ori || is_andi ||
|
|
is_addi || is_addiu ||
|
|
is_slti || is_sltiu) && (rt_idx != 5'd0)) begin
|
|
regfile[rt_idx] <= alu_wb;
|
|
// Ch283 — scalar mirror: zero-extend into the
|
|
// 128-bit shadow (R5900 clears upper bits on
|
|
// every scalar destination write).
|
|
gpr128[rt_idx] <= {96'd0, alu_wb};
|
|
end
|
|
|
|
if (is_rtype_alu && (rd_idx != 5'd0)) begin
|
|
regfile[rd_idx] <= rtype_alu_wb;
|
|
// Ch283 — MMI ops get the full 128-bit
|
|
// result into gpr128; scalar ops just zero-
|
|
// extend their 32-bit result. The regfile
|
|
// mirror above always lands the low 32
|
|
// (rtype_alu_wb is computed accordingly).
|
|
if (is_mmi_wb)
|
|
gpr128[rd_idx] <= rtype_alu128_wb;
|
|
else
|
|
gpr128[rd_idx] <= {96'd0, rtype_alu_wb};
|
|
end
|
|
|
|
// Ch203: MULTU writes {HI, LO} = unsigned 64-bit
|
|
// product of (rs, rt). No rd writeback (rd bits
|
|
// are architecturally ignored). MFLO/MFHI in a
|
|
// following instruction observe the result.
|
|
if (is_multu) begin
|
|
logic [63:0] mu_product;
|
|
mu_product = {32'd0, rs_val} * {32'd0, rt_val};
|
|
lo_reg <= mu_product[31:0];
|
|
hi_reg <= mu_product[63:32];
|
|
end
|
|
|
|
// Ch43: DIVU writes LO=quotient, HI=remainder.
|
|
// Divisor==0 is UNDEFINED per MIPS spec; we
|
|
// take the deterministic "leave HI/LO
|
|
// unchanged" option. Ch162: STRIP_HW_DIVIDER
|
|
// gates the `/` and `%` operators away so
|
|
// Quartus doesn't infer the 32-bit hardware
|
|
// divider on PSMCT32-only hardware builds.
|
|
if (!STRIP_HW_DIVIDER && is_divu && (rt_val != 32'd0)) begin
|
|
lo_reg <= rs_val / rt_val;
|
|
hi_reg <= rs_val % rt_val;
|
|
end
|
|
|
|
// Ch43: MFHI/MFLO move HI/LO into rd. rd==0
|
|
// suppresses the write (architectural $0
|
|
// protection).
|
|
if (is_mfhi && (rd_idx != 5'd0)) begin
|
|
regfile[rd_idx] <= hi_reg;
|
|
gpr128[rd_idx] <= {96'd0, hi_reg}; // Ch283 mirror
|
|
end
|
|
if (is_mflo && (rd_idx != 5'd0)) begin
|
|
regfile[rd_idx] <= lo_reg;
|
|
gpr128[rd_idx] <= {96'd0, lo_reg}; // Ch283 mirror
|
|
end
|
|
|
|
// JAL: link address is pc + 8 (instruction after
|
|
// the delay slot). $31 is the architectural $ra.
|
|
if (is_jal) begin
|
|
regfile[5'd31] <= pc + 32'd8;
|
|
gpr128[5'd31] <= {96'd0, pc + 32'd8}; // Ch283 mirror
|
|
end
|
|
|
|
// JALR: same pc+8 link semantics, but the link
|
|
// destination is explicit in rd. rd==0 suppresses
|
|
// the write (valid JALR encoding for "jump
|
|
// indirect without keeping a return address").
|
|
if (is_jalr && (rd_idx != 5'd0)) begin
|
|
regfile[rd_idx] <= pc + 32'd8;
|
|
gpr128[rd_idx] <= {96'd0, pc + 32'd8}; // Ch283 mirror
|
|
end
|
|
|
|
if (is_mfc0 && (rt_idx != 5'd0)) begin
|
|
regfile[rt_idx] <= cop0_read_val;
|
|
gpr128[rt_idx] <= {96'd0, cop0_read_val}; // Ch283 mirror
|
|
end
|
|
|
|
if (is_mtc0) begin
|
|
unique case (rd_idx)
|
|
COP0_REG_STATUS: begin
|
|
status_iec <= rt_val[0];
|
|
status_kuc <= rt_val[1];
|
|
status_iep <= rt_val[2];
|
|
status_kup <= rt_val[3];
|
|
status_ieo <= rt_val[4];
|
|
status_kuo <= rt_val[5];
|
|
status_im <= rt_val[15:8];
|
|
status_bev <= rt_val[22]; // Ch50
|
|
end
|
|
COP0_REG_CAUSE: begin
|
|
cause_exc_code <= rt_val[6:2];
|
|
cause_ip_sw[1:0] <= rt_val[9:8];
|
|
end
|
|
COP0_REG_EPC: epc <= rt_val;
|
|
default: ;
|
|
endcase
|
|
end
|
|
|
|
if (is_rfe) begin
|
|
status_iec <= status_iep;
|
|
status_iep <= status_ieo;
|
|
status_kuc <= status_kup;
|
|
status_kup <= status_kuo;
|
|
retired_flag_rfe <= 1'b1;
|
|
end
|
|
|
|
if (is_mfc0) begin
|
|
retired_arg2 <= {27'd0, rd_idx};
|
|
retired_arg3 <= cop0_read_val;
|
|
end else if (is_mtc0) begin
|
|
retired_arg2 <= {27'd0, rd_idx};
|
|
retired_arg3 <= rt_val;
|
|
end else if (is_taken_branch_or_jump) begin
|
|
retired_arg2 <= taken_target;
|
|
retired_arg3 <= 32'd0;
|
|
end else if (is_lui || is_ori || is_andi ||
|
|
is_addi || is_addiu ||
|
|
is_slti || is_sltiu) begin
|
|
retired_arg3 <= alu_wb;
|
|
end else if (is_rtype_alu) begin
|
|
retired_arg2 <= {27'd0, rd_idx};
|
|
retired_arg3 <= rtype_alu_wb;
|
|
end else if (is_multu) begin
|
|
// Ch203: arg2 = rt_val (so traces show
|
|
// both operands together with rs in
|
|
// arg0/regfile context), arg3 = LO half
|
|
// of the product = the value MFLO would
|
|
// read next. The full 64-bit product
|
|
// isn't carried in trace events; HI is
|
|
// exposed via the MFHI retire below.
|
|
retired_arg2 <= rt_val;
|
|
retired_arg3 <= ({32'd0, rs_val} * {32'd0, rt_val});
|
|
end else if (is_divu) begin
|
|
// arg2 = divisor (rt_val) so a div-by-zero
|
|
// event is obvious in traces; arg3 =
|
|
// quotient (or 0 on div-by-zero, since
|
|
// LO is left unchanged — the trace just
|
|
// records what *would* have been written).
|
|
// Ch162: STRIP_HW_DIVIDER gates the `/`
|
|
// here too so Quartus doesn't keep an
|
|
// inferred divider just for the trace.
|
|
retired_arg2 <= rt_val;
|
|
retired_arg3 <= (STRIP_HW_DIVIDER || rt_val == 32'd0)
|
|
? 32'd0
|
|
: (rs_val / rt_val);
|
|
end else if (is_mfhi) begin
|
|
retired_arg2 <= {27'd0, rd_idx};
|
|
retired_arg3 <= hi_reg;
|
|
end else if (is_mflo) begin
|
|
retired_arg2 <= {27'd0, rd_idx};
|
|
retired_arg3 <= lo_reg;
|
|
end
|
|
|
|
retire_pulse <= 1'b1;
|
|
retire_advance();
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
end
|
|
|
|
S_MEM_REQ: state <= S_MEM_WAIT;
|
|
|
|
S_MEM_WAIT: begin
|
|
if (map_rd_valid) begin
|
|
// Ch283/Ch284 — multi-beat loads. LQ takes 4
|
|
// beats (terminal = 2'd3), LD takes 2 (terminal
|
|
// = 2'd1). Each beat captures a 32-bit lane
|
|
// into gpr128[rt]; the last beat mirrors low
|
|
// 32 to regfile[rt] and retires. LD does the
|
|
// same on its terminal beat (beat 1): the low
|
|
// 32 landed in gpr128 on beat 0, so the mirror
|
|
// reads an already-settled value at retire.
|
|
if (is_lq || is_ld) begin
|
|
logic [1:0] terminal_beat;
|
|
terminal_beat = is_lq ? 2'd3 : 2'd1;
|
|
if (rt_idx != 5'd0) begin
|
|
case (sq_beat)
|
|
2'd0: gpr128[rt_idx][31:0] <= map_rd_data;
|
|
2'd1: gpr128[rt_idx][63:32] <= map_rd_data;
|
|
2'd2: gpr128[rt_idx][95:64] <= map_rd_data;
|
|
2'd3: gpr128[rt_idx][127:96] <= map_rd_data;
|
|
endcase
|
|
// Last beat mirrors low 32 to regfile.
|
|
// For LQ the low 32 was committed on
|
|
// beat 0; for LD the same. By terminal
|
|
// beat the NBA has settled, so reading
|
|
// gpr128[rt_idx][31:0] is safe.
|
|
if (sq_beat == terminal_beat)
|
|
regfile[rt_idx] <= gpr128[rt_idx][31:0];
|
|
end
|
|
|
|
if (sq_beat != terminal_beat) begin
|
|
sq_beat <= sq_beat + 2'd1;
|
|
state <= S_MEM_REQ;
|
|
end else begin
|
|
retired_pc <= pc;
|
|
retired_instr <= instr;
|
|
retired_arg2 <= ea;
|
|
retired_arg3 <= map_rd_data;
|
|
retired_flag_write <= 1'b0;
|
|
retired_flag_read <= 1'b1;
|
|
retired_flag_branch <= 1'b0;
|
|
retired_flag_halt <= 1'b0;
|
|
retired_flag_in_delay <= instr_in_delay_slot;
|
|
retired_flag_rfe <= 1'b0;
|
|
retire_pulse <= 1'b1;
|
|
sq_beat <= 2'd0;
|
|
retire_advance();
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
end else begin
|
|
// Sub-word loads extract the addressed byte or
|
|
// halfword out of the returned 32-bit word.
|
|
// LB : byte at ea[1:0], 24-bit sign-extend
|
|
// LH : halfword at ea[1], 16-bit sign-extend
|
|
// LHU : halfword at ea[1], 16-bit zero-extend
|
|
// LW : whole word, as-is
|
|
// Halfword access uses only ea[1] — ea[0] must
|
|
// be 0 for aligned access (unaligned halfword
|
|
// reads are not modelled as an exception yet).
|
|
if (rt_idx != 5'd0) begin
|
|
// Ch283 — compute the 32-bit load value into
|
|
// a temporary, then write both regfile and
|
|
// gpr128 (zero-extended into the upper 96).
|
|
// LW/LB/LBU/LH/LHU all produce a single
|
|
// 32-bit lane; the architectural rule is
|
|
// upper bits of $rt clear to 0.
|
|
logic [31:0] load_wb;
|
|
load_wb = map_rd_data; // LW default
|
|
if (is_lb) begin
|
|
case (ea[1:0])
|
|
2'd0: load_wb = {{24{map_rd_data[7]}}, map_rd_data[7:0]};
|
|
2'd1: load_wb = {{24{map_rd_data[15]}}, map_rd_data[15:8]};
|
|
2'd2: load_wb = {{24{map_rd_data[23]}}, map_rd_data[23:16]};
|
|
2'd3: load_wb = {{24{map_rd_data[31]}}, map_rd_data[31:24]};
|
|
endcase
|
|
end else if (is_lbu) begin
|
|
case (ea[1:0])
|
|
2'd0: load_wb = {24'd0, map_rd_data[7:0]};
|
|
2'd1: load_wb = {24'd0, map_rd_data[15:8]};
|
|
2'd2: load_wb = {24'd0, map_rd_data[23:16]};
|
|
2'd3: load_wb = {24'd0, map_rd_data[31:24]};
|
|
endcase
|
|
end else if (is_lh) begin
|
|
case (ea[1])
|
|
1'b0: load_wb = {{16{map_rd_data[15]}}, map_rd_data[15:0]};
|
|
1'b1: load_wb = {{16{map_rd_data[31]}}, map_rd_data[31:16]};
|
|
endcase
|
|
end else if (is_lhu) begin
|
|
case (ea[1])
|
|
1'b0: load_wb = {16'd0, map_rd_data[15:0]};
|
|
1'b1: load_wb = {16'd0, map_rd_data[31:16]};
|
|
endcase
|
|
end
|
|
regfile[rt_idx] <= load_wb;
|
|
gpr128[rt_idx] <= {96'd0, load_wb};
|
|
end
|
|
|
|
retired_pc <= pc;
|
|
retired_instr <= instr;
|
|
retired_arg2 <= ea;
|
|
retired_arg3 <= map_rd_data;
|
|
retired_flag_write <= 1'b0;
|
|
retired_flag_read <= 1'b1;
|
|
retired_flag_branch <= 1'b0;
|
|
retired_flag_halt <= 1'b0;
|
|
retired_flag_in_delay <= instr_in_delay_slot;
|
|
retired_flag_rfe <= 1'b0;
|
|
retire_pulse <= 1'b1;
|
|
|
|
retire_advance();
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
end
|
|
end
|
|
|
|
S_MEM_WRITE: begin
|
|
if ((is_sq && sq_beat != 2'd3)
|
|
|| (is_sd && sq_beat != 2'd1)) begin
|
|
// Ch271/Ch275 — multi-beat store mid-beats:
|
|
// drive the next 32-bit lane on the next cycle.
|
|
// Don't retire yet — the single architectural
|
|
// instruction (SQ=4 beats / SD=2 beats) maps to
|
|
// one retire event.
|
|
sq_beat <= sq_beat + 2'd1;
|
|
// stay in S_MEM_WRITE
|
|
end else begin
|
|
retired_pc <= pc;
|
|
retired_instr <= instr;
|
|
retired_arg2 <= ea;
|
|
retired_arg3 <= rt_val;
|
|
retired_flag_write <= 1'b1;
|
|
retired_flag_read <= 1'b0;
|
|
retired_flag_branch <= 1'b0;
|
|
retired_flag_halt <= 1'b0;
|
|
retired_flag_in_delay <= instr_in_delay_slot;
|
|
retired_flag_rfe <= 1'b0;
|
|
retire_pulse <= 1'b1;
|
|
|
|
retire_advance();
|
|
sq_beat <= 2'd0;
|
|
state <= S_IFETCH_REQ;
|
|
end
|
|
end
|
|
|
|
S_HALT: state <= S_HALT;
|
|
|
|
// Ch215 — jmp_buf restore FSM.
|
|
S_CH215_REQ: state <= S_CH215_WAIT;
|
|
|
|
S_CH215_WAIT: begin
|
|
if (map_rd_valid) begin
|
|
// Store the loaded word into the canonical
|
|
// regfile slot per the Ch212 field-map.
|
|
case (ch215_count)
|
|
4'd0: begin regfile[31] <= map_rd_data; gpr128[31] <= {96'd0, map_rd_data}; end // $ra at +0x00
|
|
4'd1: begin regfile[29] <= map_rd_data; gpr128[29] <= {96'd0, map_rd_data}; end // $sp at +0x04
|
|
4'd2: begin regfile[30] <= map_rd_data; gpr128[30] <= {96'd0, map_rd_data}; end // $fp at +0x08
|
|
4'd3: begin regfile[16] <= map_rd_data; gpr128[16] <= {96'd0, map_rd_data}; end // $s0 at +0x0C
|
|
4'd4: begin regfile[17] <= map_rd_data; gpr128[17] <= {96'd0, map_rd_data}; end // $s1 at +0x10
|
|
4'd5: begin regfile[18] <= map_rd_data; gpr128[18] <= {96'd0, map_rd_data}; end // $s2 at +0x14
|
|
4'd6: begin regfile[19] <= map_rd_data; gpr128[19] <= {96'd0, map_rd_data}; end // $s3 at +0x18
|
|
4'd7: begin regfile[20] <= map_rd_data; gpr128[20] <= {96'd0, map_rd_data}; end // $s4 at +0x1C
|
|
4'd8: begin regfile[21] <= map_rd_data; gpr128[21] <= {96'd0, map_rd_data}; end // $s5 at +0x20
|
|
4'd9: begin regfile[22] <= map_rd_data; gpr128[22] <= {96'd0, map_rd_data}; end // $s6 at +0x24
|
|
4'd10: begin regfile[23] <= map_rd_data; gpr128[23] <= {96'd0, map_rd_data}; end // $s7 at +0x28
|
|
4'd11: begin regfile[28] <= map_rd_data; gpr128[28] <= {96'd0, map_rd_data}; end // $gp at +0x2C
|
|
default: ;
|
|
endcase
|
|
if (ch215_count == 4'd11) begin
|
|
// Done — set $v0=1 (longjmp-style return
|
|
// value so post-setjmp `beq $v0,$0` at
|
|
// 0xBFC52350 falls through to the
|
|
// longjmp-return path), set PC to the
|
|
// loaded $ra (committed at count==0).
|
|
regfile[2] <= 32'd1;
|
|
gpr128[2] <= 128'd1; // Ch283 mirror
|
|
pc <= regfile[31];
|
|
state <= S_IFETCH_REQ;
|
|
end else begin
|
|
ch215_count <= ch215_count + 4'd1;
|
|
state <= S_CH215_REQ;
|
|
end
|
|
end
|
|
end
|
|
|
|
default: state <= S_IDLE;
|
|
endcase
|
|
end
|
|
end
|
|
|
|
// Status outputs derived directly from FSM registers.
//   pc_o   : the current program-counter register, exposed unmodified.
//   halt_o : asserted for as long as the FSM state equals S_HALT.
assign pc_o   = pc;
assign halt_o = (state == S_HALT);
|
|
|
|
// ------------------------------------------------------------------
|
|
// Trace emission — one event per retire, SUBSYS_EE
|
|
// ------------------------------------------------------------------
|
|
|
|
// Trace-event output register stage.
//
// On every clock edge where retire_pulse is high (and reset is not
// asserted) the latched retired_* fields are packed into one trace
// event; ev_valid is high for that cycle and low otherwise. The
// payload registers (ev_arg*, ev_flags, ev_subsys, ev_event) hold
// their previous values on cycles without a retire.
//
// Reset is synchronous and active-low (rst_n is sampled on posedge clk).
//
// ev_flags layout (bits 31:8 are always zero):
//   [7] trap   [6] rfe     [5] except  [4] in_delay
//   [3] halt   [2] branch  [1] read    [0] write
always_ff @(posedge clk) begin
    // Default: no event this cycle. The retire branch below overrides
    // this (the last nonblocking assignment to a variable wins).
    ev_valid <= 1'b0;

    if (!rst_n) begin
        ev_subsys <= SUBSYS_EE;
        ev_event  <= EV_IFETCH;
        ev_arg0   <= '0;
        ev_arg1   <= '0;
        ev_arg2   <= '0;
        ev_arg3   <= '0;
        ev_flags  <= '0;
    end else if (retire_pulse) begin
        ev_valid  <= 1'b1;
        ev_subsys <= SUBSYS_EE;
        ev_event  <= EV_IFETCH;

        // 32-bit retire fields are zero-extended into the 64-bit args.
        ev_arg0   <= {32'h0000_0000, retired_pc};
        ev_arg1   <= {32'h0000_0000, retired_instr};
        ev_arg2   <= {32'h0000_0000, retired_arg2};
        ev_arg3   <= {32'h0000_0000, retired_arg3};

        // Upper nibble = control-flow / exception class,
        // lower nibble = halt / branch / memory direction.
        ev_flags  <= {24'd0,
                      {retired_flag_trap,
                       retired_flag_rfe,
                       retired_flag_except,
                       retired_flag_in_delay},
                      {retired_flag_halt,
                       retired_flag_branch,
                       retired_flag_read,
                       retired_flag_write}};
    end
end
|
|
|
|
endmodule : ee_core_stub
|