From fe6df042be95c74dfcba8c01ffcb44d33c2dac13 Mon Sep 17 00:00:00 2001 From: nerd73 Date: Sat, 25 Apr 2020 03:24:59 -0600 Subject: [PATCH] Implement a proper P6 timing model --- src/cpu/codegen_timing_p6.c | 1864 ++++++++++++++++--------------- src/cpu_new/codegen_timing_p6.c | 1864 ++++++++++++++++--------------- 2 files changed, 1902 insertions(+), 1826 deletions(-) diff --git a/src/cpu/codegen_timing_p6.c b/src/cpu/codegen_timing_p6.c index 0cddde494..a0da30304 100644 --- a/src/cpu/codegen_timing_p6.c +++ b/src/cpu/codegen_timing_p6.c @@ -1,4 +1,5 @@ -/*Hacky P6 timings based on K6 timings*/ +/*Basic P6 timing model by plant/nerd73. Based on the K6 timing model*/ +/*Some cycle timings come from https://www.agner.org/optimize/instruction_tables.pdf*/ #include #include #include @@ -21,24 +22,26 @@ typedef enum uop_type_t UOP_ALU = 0, /*Executes in Integer X or Y units*/ UOP_ALUX, /*Executes in Integer X unit*/ UOP_LOAD, /*Executes in Load unit*/ - UOP_STORE, /*Executes in Store unit*/ + UOP_STORED, /*Executes in Data Store unit*/ + UOP_STOREA, /*Executes in Address Store unit*/ UOP_FLOAD, /*Executes in Load unit*/ - UOP_FSTORE, /*Executes in Store unit*/ + UOP_FSTORED, /*Executes in Data Store unit*/ + UOP_FSTOREA, /*Executes in Address Store unit*/ UOP_MLOAD, /*Executes in Load unit*/ - UOP_MSTORE, /*Executes in Store unit*/ + UOP_MSTORED, /*Executes in Data Store unit*/ + UOP_MSTOREA, /*Executes in Address Store unit*/ UOP_FLOAT, /*Executes in Floating Point unit*/ - UOP_MEU, /*Executes in Multimedia unit*/ - UOP_MEU_SHIFT, /*Executes in Multimedia unit or ALU X/Y. Uses MMX shifter*/ - UOP_MEU_MUL, /*Executes in Multimedia unit or ALU X/Y. Uses MMX multiplier*/ + UOP_MMX, /*Executes in Integer X or Y units as MMX*/ + UOP_MMX_SHIFT, /*Executes in Integer Y unit. Uses MMX shifter*/ + UOP_MMX_MUL, /*Executes in Integer X unit. Uses MMX multiplier*/ UOP_BRANCH, /*Executes in Branch unit*/ UOP_LIMM /*Does not require an execution unit*/ } uop_type_t; typedef enum decode_type_t { - DECODE_SHORT, - DECODE_LONG, - DECODE_VECTOR + DECODE_SIMPLE, + DECODE_COMPLEX, } decode_type_t; #define MAX_UOPS 10 @@ -46,7 +49,6 @@ typedef enum decode_type_t typedef struct risc86_uop_t { uop_type_t type; - double throughput; double latency; } risc86_uop_t; @@ -60,804 +62,885 @@ typedef struct risc86_instruction_t static const risc86_instruction_t alu_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t alux_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t load_alu_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t load_alux_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t alu_store_op = { - .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} + }; static const risc86_instruction_t alux_store_op = { - .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} }; static const risc86_instruction_t branch_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_BRANCH, .latency = 1} }; static const risc86_instruction_t limm_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LIMM, .throughput = 1, .latency = 1} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_LIMM, .latency = 1} }; static const risc86_instruction_t load_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1} }; static const risc86_instruction_t store_op = { - .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1} }; static const risc86_instruction_t bswap_op = { - .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, }; static const risc86_instruction_t leave_op = { .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t lods_op = { .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t loop_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} }; static const risc86_instruction_t mov_reg_seg_op = { .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, }; static const risc86_instruction_t movs_op = { .nr_uops = 4, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t pop_reg_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t pop_mem_op = { - .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t push_imm_op = { - .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 2}, + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1}, }; static const risc86_instruction_t push_mem_op = { - .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1} }; static const risc86_instruction_t push_seg_op = { - .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t stos_op = { - .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t test_reg_op = { .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t test_reg_b_op = { .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t test_mem_imm_op = { .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t test_mem_imm_b_op = { .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t xchg_op = { .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t mmx_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX, .latency = 1.5} }; static const risc86_instruction_t mmx_mul_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU_MUL, .throughput = 1.5, .latency = 3} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX_MUL, .latency = 1.5} }; static const risc86_instruction_t mmx_shift_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU_SHIFT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX_SHIFT, .latency = 1.5} }; static const risc86_instruction_t load_mmx_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU, .throughput = 1.5, .latency = 3} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_MMX, .latency = 1.5} }; static const risc86_instruction_t load_mmx_mul_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU_MUL, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_MMX_MUL, .latency = 1.5} }; static const risc86_instruction_t load_mmx_shift_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU_SHIFT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_MMX_SHIFT, .latency = 1.5} }; static const risc86_instruction_t mload_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MLOAD, .throughput = 1, .latency = 2} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_MLOAD, .latency = 3}, }; static const risc86_instruction_t mstore_op = { - .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MSTORE, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_MSTORED, .latency = 1}, + .uop[1] = {.type = UOP_MSTOREA, .latency = 1} }; static const risc86_instruction_t pmul_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU_MUL, .throughput = 1, .latency = 2} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX_MUL, .latency = 1.5} }; static const risc86_instruction_t pmul_mem_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU_MUL, .throughput = 1, .latency = 2} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_MMX_MUL, .latency = 1.5} }; - static const risc86_instruction_t float_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_FLOAT, .latency = 1} +}; +static const risc86_instruction_t fadd_op = +{ + .nr_uops = 1, + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_FLOAT, .latency = 3} +}; +static const risc86_instruction_t fmul_op = +{ + .nr_uops = 1, + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_ALU, .latency = 5} +}; +static const risc86_instruction_t float2_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 1} +}; +static const risc86_instruction_t fchs_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 2}, + .uop[1] = {.type = UOP_FLOAT, .latency = 2}, + .uop[2] = {.type = UOP_FLOAT, .latency = 2} }; static const risc86_instruction_t load_float_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_FLOAT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 1} +}; +static const risc86_instruction_t load_fadd_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 3} +}; +static const risc86_instruction_t load_fmul_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 5} }; static const risc86_instruction_t fstore_op = { - .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FSTORED, .latency = 1}, + .uop[1] = {.type = UOP_FSTOREA, .latency = 1}, +}; +static const risc86_instruction_t load_fiadd_op = +{ + .nr_uops = 7, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 1}, + .uop[2] = {.type = UOP_FLOAT, .latency = 1}, + .uop[3] = {.type = UOP_FLOAT, .latency = 1}, + .uop[4] = {.type = UOP_FLOAT, .latency = 1}, + .uop[5] = {.type = UOP_FLOAT, .latency = 1}, + .uop[6] = {.type = UOP_FLOAT, .latency = 1} }; - static const risc86_instruction_t fdiv_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 38, .latency = 38} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 32} }; static const risc86_instruction_t fdiv_mem_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_FLOAT, .throughput = 38, .latency = 38} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 38} }; static const risc86_instruction_t fsin_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 62, .latency = 62} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 60} }; static const risc86_instruction_t fsqrt_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 49, .latency = 49} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 69} }; -static const risc86_instruction_t vector_fldcw_op = +static const risc86_instruction_t complex_fldcw_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 8, .latency = 8} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 10} }; -static const risc86_instruction_t vector_float_op = +static const risc86_instruction_t complex_float_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 1} }; -static const risc86_instruction_t vector_float_l_op = +static const risc86_instruction_t complex_float_l_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 50, .latency = 50} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 50} }; -static const risc86_instruction_t vector_flde_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_FLOAT, .throughput = 2, .latency = 2} -}; -static const risc86_instruction_t vector_fste_op = +static const risc86_instruction_t complex_flde_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 2, .latency = 2}, - .uop[1] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAD, .latency = 1}, + .uop[2] = {.type = UOP_FLOAT, .latency = 2} +}; +static const risc86_instruction_t complex_fste_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 2}, + .uop[1] = {.type = UOP_FSTORED, .latency = 1}, + .uop[2] = {.type = UOP_FSTOREA, .latency = 1} }; -static const risc86_instruction_t vector_alu1_op = +static const risc86_instruction_t complex_alu1_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alu2_op = +static const risc86_instruction_t complex_alu2_op = { .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alu3_op = +static const risc86_instruction_t complex_alu3_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alu6_op = +static const risc86_instruction_t complex_alu6_op = { .nr_uops = 6, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1}, + .uop[4] = {.type = UOP_ALU, .latency = 1}, + .uop[5] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alux1_op = +static const risc86_instruction_t complex_alux1_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1} }; -static const risc86_instruction_t vector_alux3_op = +static const risc86_instruction_t complex_alux3_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1} }; -static const risc86_instruction_t vector_alux6_op = +static const risc86_instruction_t complex_alux6_op = { - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .nr_uops = 6, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1}, + .uop[3] = {.type = UOP_ALUX, .latency = 1}, + .uop[4] = {.type = UOP_ALUX, .latency = 1}, + .uop[5] = {.type = UOP_ALUX, .latency = 1} }; -static const risc86_instruction_t vector_alu_store_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_alux_store_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_arpl_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[1] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_bound_op = +static const risc86_instruction_t complex_alu_store_op = { .nr_uops = 4, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1}, }; -static const risc86_instruction_t vector_bsx_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_call_far_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_cli_sti_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 7, .latency = 7} -}; -static const risc86_instruction_t vector_cmps_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_cmpsb_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_cmpxchg_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, -}; -static const risc86_instruction_t vector_cmpxchg_b_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, -}; -static const risc86_instruction_t vector_cpuid_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 22, .latency = 22} -}; -static const risc86_instruction_t vector_div16_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_div16_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_div32_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 18, .latency = 18} -}; -static const risc86_instruction_t vector_div32_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 18, .latency = 18} -}; -static const risc86_instruction_t vector_emms_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 25, .latency = 25} -}; -static const risc86_instruction_t vector_enter_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_femms_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 6, .latency = 6} -}; -static const risc86_instruction_t vector_in_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 10, .latency = 11} -}; -static const risc86_instruction_t vector_ins_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 10, .latency = 11}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_int_op = -{ - .nr_uops = 5, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 20, .latency = 20}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_iret_op = -{ - .nr_uops = 5, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[3] = {.type = UOP_ALU, .throughput = 20, .latency = 20}, - .uop[4] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_invd_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1000, .latency = 1000} -}; -static const risc86_instruction_t vector_jmp_far_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_load_alu_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_load_alux_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_loop_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_lss_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_mov_mem_seg_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mov_seg_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_mov_seg_reg_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_mul_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mul_mem_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mul64_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mul64_mem_op = +static const risc86_instruction_t complex_alux_store_op = { .nr_uops = 4, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} }; -static const risc86_instruction_t vector_out_op = +static const risc86_instruction_t complex_arpl_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3}, + .uop[1] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_bound_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_bsx_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 10, .latency = 10} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 10} }; -static const risc86_instruction_t vector_outs_op = +static const risc86_instruction_t complex_call_far_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_cli_sti_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 7} +}; +static const risc86_instruction_t complex_cmps_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 10, .latency = 10}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_pusha_op = +static const risc86_instruction_t complex_cmpsb_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_cmpxchg_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_cmpxchg_b_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_cpuid_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 23} +}; +static const risc86_instruction_t complex_div16_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 10} +}; +static const risc86_instruction_t complex_div16_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 10} +}; +static const risc86_instruction_t complex_div32_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 18} +}; +static const risc86_instruction_t complex_div32_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 18} +}; +static const risc86_instruction_t complex_emms_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 25} +}; +static const risc86_instruction_t complex_enter_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 10} +}; +static const risc86_instruction_t complex_femms_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 6} +}; +static const risc86_instruction_t complex_in_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 10} +}; +static const risc86_instruction_t complex_ins_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 10}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_int_op = { .nr_uops = 8, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[6] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[7] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 20}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_STORED, .latency = 1}, + .uop[4] = {.type = UOP_STOREA, .latency = 1}, + .uop[5] = {.type = UOP_STORED, .latency = 1}, + .uop[6] = {.type = UOP_STOREA, .latency = 1}, + .uop[7] = {.type = UOP_BRANCH, .latency = 1} }; -static const risc86_instruction_t vector_popa_op = +static const risc86_instruction_t complex_iret_op = +{ + .nr_uops = 5, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_LOAD, .latency = 3}, + .uop[2] = {.type = UOP_LOAD, .latency = 3}, + .uop[3] = {.type = UOP_ALU, .latency = 20}, + .uop[4] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_invd_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 500} +}; +static const risc86_instruction_t complex_jmp_far_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_load_alu_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_load_alux_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_loop_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_lss_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_mov_mem_seg_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, +}; +static const risc86_instruction_t complex_mov_seg_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_mov_seg_reg_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_mul_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_mul_mem_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_mul64_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_mul64_mem_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1}, + .uop[3] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_out_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 10}, + .uop[1] = {.type = UOP_STOREA, .latency = 10}, +}; +static const risc86_instruction_t complex_outs_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 10}, + .uop[1] = {.type = UOP_STOREA, .latency = 10}, + .uop[2] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_pusha_op = { .nr_uops = 8, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[6] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[7] = {.type = UOP_LOAD, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 2}, + .uop[1] = {.type = UOP_STOREA, .latency = 2}, + .uop[2] = {.type = UOP_STORED, .latency = 2}, + .uop[3] = {.type = UOP_STOREA, .latency = 2}, + .uop[4] = {.type = UOP_STORED, .latency = 2}, + .uop[5] = {.type = UOP_STOREA, .latency = 2}, + .uop[6] = {.type = UOP_STORED, .latency = 2}, + .uop[7] = {.type = UOP_STOREA, .latency = 2} }; -static const risc86_instruction_t vector_popf_op = +static const risc86_instruction_t complex_popa_op = +{ + .nr_uops = 8, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1}, + .uop[2] = {.type = UOP_LOAD, .latency = 1}, + .uop[3] = {.type = UOP_LOAD, .latency = 1}, + .uop[4] = {.type = UOP_LOAD, .latency = 1}, + .uop[5] = {.type = UOP_LOAD, .latency = 1}, + .uop[6] = {.type = UOP_LOAD, .latency = 1}, + .uop[7] = {.type = UOP_LOAD, .latency = 1} +}; +static const risc86_instruction_t complex_popf_op = { .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 17, .latency = 17} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_ALUX, .latency = 17} }; -static const risc86_instruction_t vector_push_mem_op = +static const risc86_instruction_t complex_push_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_pushf_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_ret_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_retf_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 3}, + .uop[2] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_scas_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_scasb_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_setcc_mem_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_FSTORED, .latency = 1}, + .uop[3] = {.type = UOP_FSTOREA, .latency = 1} +}; +static const risc86_instruction_t complex_setcc_reg_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_test_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_test_mem_b_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_xchg_mem_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_xlat_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1} +}; +static const risc86_instruction_t complex_wbinvd_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_pushf_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_ret_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_retf_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[2] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_scas_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_scasb_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_setcc_mem_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_setcc_reg_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_test_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_test_mem_b_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_xchg_mem_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_xlat_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2} -}; -static const risc86_instruction_t vector_wbinvd_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 10000, .latency = 10000} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 10000} }; + #define INVALID NULL static const risc86_instruction_t *opcode_timings[256] = @@ -865,38 +948,38 @@ static const risc86_instruction_t *opcode_timings[256] = /* ADD ADD ADD ADD*/ /*00*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* ADD ADD PUSH ES POP ES*/ - &alux_op, &alu_op, &push_seg_op, &vector_mov_seg_mem_op, + &alux_op, &alu_op, &push_seg_op, &complex_mov_seg_mem_op, /* OR OR OR OR*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* OR OR PUSH CS */ &alux_op, &alu_op, &push_seg_op, INVALID, /* ADC ADC ADC ADC*/ -/*10*/ &vector_alux_store_op, &vector_alu_store_op, &vector_load_alux_op, &vector_load_alu_op, +/*10*/ &complex_alux_store_op,&complex_alu_store_op, &complex_load_alux_op,&complex_load_alu_op, /* ADC ADC PUSH SS POP SS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* SBB SBB SBB SBB*/ -/*10*/ &vector_alux_store_op, &vector_alu_store_op, &vector_load_alux_op, &vector_load_alu_op, +/*10*/ &complex_alux_store_op,&complex_alu_store_op, &complex_load_alux_op,&complex_load_alu_op, /* SBB SBB PUSH DS POP DS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* AND AND AND AND*/ /*20*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* AND AND DAA*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* SUB SUB SUB SUB*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* SUB SUB DAS*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* XOR XOR XOR XOR*/ /*30*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* XOR XOR AAA*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* CMP CMP CMP CMP*/ &load_alux_op, &load_alu_op, &load_alux_op, &load_alu_op, /* CMP CMP AAS*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* INC EAX INC ECX INC EDX INC EBX*/ /*40*/ &alu_op, &alu_op, &alu_op, &alu_op, @@ -917,12 +1000,12 @@ static const risc86_instruction_t *opcode_timings[256] = &pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op, /* PUSHA POPA BOUND ARPL*/ -/*60*/ &vector_pusha_op, &vector_popa_op, &vector_bound_op, &vector_arpl_op, +/*60*/ &complex_pusha_op, &complex_popa_op, &complex_bound_op, &complex_arpl_op, INVALID, INVALID, INVALID, INVALID, /* PUSH imm IMUL PUSH imm IMUL*/ - &push_imm_op, &vector_mul_op, &push_imm_op, &vector_mul_op, + &push_imm_op, &complex_mul_op, &push_imm_op, &complex_mul_op, /* INSB INSW OUTSB OUTSW*/ - &vector_ins_op, &vector_ins_op, &vector_outs_op, &vector_outs_op, + &complex_ins_op, &complex_ins_op, &complex_outs_op, &complex_outs_op, /* Jxx*/ /*70*/ &branch_op, &branch_op, &branch_op, &branch_op, @@ -932,29 +1015,29 @@ static const risc86_instruction_t *opcode_timings[256] = /*80*/ INVALID, INVALID, INVALID, INVALID, /* TEST TEST XCHG XCHG*/ - &vector_test_mem_b_op, &vector_test_mem_op, &vector_xchg_mem_op, &vector_xchg_mem_op, + &complex_test_mem_b_op, &complex_test_mem_op, &complex_xchg_mem_op, &complex_xchg_mem_op, /* MOV MOV MOV MOV*/ &store_op, &store_op, &load_op, &load_op, /* MOV from seg LEA MOV to seg POP*/ - &vector_mov_mem_seg_op, &store_op, &vector_mov_seg_mem_op, &pop_mem_op, + &complex_mov_mem_seg_op, &store_op, &complex_mov_seg_mem_op, &pop_mem_op, /* NOP XCHG XCHG XCHG*/ /*90*/ &limm_op, &xchg_op, &xchg_op, &xchg_op, /* XCHG XCHG XCHG XCHG*/ &xchg_op, &xchg_op, &xchg_op, &xchg_op, /* CBW CWD CALL far WAIT*/ - &vector_alu1_op, &vector_alu1_op, &vector_call_far_op, &limm_op, + &complex_alu1_op, &complex_alu1_op, &complex_call_far_op, &limm_op, /* PUSHF POPF SAHF LAHF*/ - &vector_pushf_op, &vector_popf_op, &vector_alux1_op, &vector_alux1_op, + &complex_pushf_op, &complex_popf_op, &complex_alux1_op, &complex_alux1_op, /* MOV MOV MOV MOV*/ /*a0*/ &load_op, &load_op, &store_op, &store_op, /* MOVSB MOVSW CMPSB CMPSW*/ - &movs_op, &movs_op, &vector_cmpsb_op, &vector_cmps_op, + &movs_op, &movs_op, &complex_cmpsb_op, &complex_cmps_op, /* TEST TEST STOSB STOSW*/ &test_reg_b_op, &test_reg_op, &stos_op, &stos_op, /* LODSB LODSW SCASB SCASW*/ - &lods_op, &lods_op, &vector_scasb_op, &vector_scas_op, + &lods_op, &lods_op, &complex_scasb_op, &complex_scas_op, /* MOV*/ /*b0*/ &limm_op, &limm_op, &limm_op, &limm_op, @@ -963,37 +1046,37 @@ static const risc86_instruction_t *opcode_timings[256] = &limm_op, &limm_op, &limm_op, &limm_op, /* RET imm RET*/ -/*c0*/ INVALID, INVALID, &vector_ret_op, &vector_ret_op, +/*c0*/ INVALID, INVALID, &complex_ret_op, &complex_ret_op, /* LES LDS MOV MOV*/ - &vector_lss_op, &vector_lss_op, &store_op, &store_op, + &complex_lss_op, &complex_lss_op, &store_op, &store_op, /* ENTER LEAVE RETF RETF*/ - &vector_enter_op, &leave_op, &vector_retf_op, &vector_retf_op, + &complex_enter_op, &leave_op, &complex_retf_op, &complex_retf_op, /* INT3 INT INTO IRET*/ - &vector_int_op, &vector_int_op, &vector_int_op, &vector_iret_op, + &complex_int_op, &complex_int_op, &complex_int_op, &complex_iret_op, /*d0*/ INVALID, INVALID, INVALID, INVALID, /* AAM AAD SETALC XLAT*/ - &vector_alux6_op, &vector_alux3_op, &vector_alux1_op, &vector_xlat_op, + &complex_alux6_op, &complex_alux3_op, &complex_alux1_op, &complex_xlat_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* LOOPNE LOOPE LOOP JCXZ*/ -/*e0*/ &vector_loop_op, &vector_loop_op, &loop_op, &vector_loop_op, +/*e0*/ &complex_loop_op, &complex_loop_op, &loop_op, &complex_loop_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* CALL JMP JMP JMP*/ - &store_op, &branch_op, &vector_jmp_far_op, &branch_op, + &store_op, &branch_op, &complex_jmp_far_op, &branch_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* REPNE REPE*/ /*f0*/ INVALID, INVALID, INVALID, INVALID, /* HLT CMC*/ - &vector_alux1_op, &vector_alu2_op, INVALID, INVALID, + &complex_alux1_op, &complex_alu2_op, INVALID, INVALID, /* CLC STC CLI STI*/ - &vector_alu1_op, &vector_alu1_op, &vector_cli_sti_op, &vector_cli_sti_op, + &complex_alu1_op, &complex_alu1_op, &complex_cli_sti_op, &complex_cli_sti_op, /* CLD STD INCDEC*/ - &vector_alu1_op, &vector_alu1_op, &alux_store_op, INVALID + &complex_alu1_op, &complex_alu1_op, &alux_store_op, INVALID }; static const risc86_instruction_t *opcode_timings_mod3[256] = @@ -1001,38 +1084,38 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = /* ADD ADD ADD ADD*/ /*00*/ &alux_op, &alu_op, &alux_op, &alu_op, /* ADD ADD PUSH ES POP ES*/ - &alux_op, &alu_op, &push_seg_op, &vector_mov_seg_mem_op, + &alux_op, &alu_op, &push_seg_op, &complex_mov_seg_mem_op, /* OR OR OR OR*/ &alux_op, &alu_op, &alux_op, &alu_op, /* OR OR PUSH CS */ &alux_op, &alu_op, &push_seg_op, INVALID, /* ADC ADC ADC ADC*/ -/*10*/ &vector_alux1_op, &vector_alu1_op, &vector_alux1_op, &vector_alu1_op, +/*10*/ &complex_alux1_op, &complex_alu1_op, &complex_alux1_op, &complex_alu1_op, /* ADC ADC PUSH SS POP SS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* SBB SBB SBB SBB*/ - &vector_alux1_op, &vector_alu1_op, &vector_alux1_op, &vector_alu1_op, + &complex_alux1_op, &complex_alu1_op, &complex_alux1_op, &complex_alu1_op, /* SBB SBB PUSH DS POP DS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* AND AND AND AND*/ /*20*/ &alux_op, &alu_op, &alux_op, &alu_op, /* AND AND DAA*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* SUB SUB SUB SUB*/ &alux_op, &alu_op, &alux_op, &alu_op, /* SUB SUB DAS*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* XOR XOR XOR XOR*/ /*30*/ &alux_op, &alu_op, &alux_op, &alu_op, /* XOR XOR AAA*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* CMP CMP CMP CMP*/ &alux_op, &alu_op, &alux_op, &alu_op, /* CMP CMP AAS*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* INC EAX INC ECX INC EDX INC EBX*/ /*40*/ &alu_op, &alu_op, &alu_op, &alu_op, @@ -1053,12 +1136,12 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = &pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op, /* PUSHA POPA BOUND ARPL*/ -/*60*/ &vector_pusha_op, &vector_popa_op, &vector_bound_op, &vector_arpl_op, +/*60*/ &complex_pusha_op, &complex_popa_op, &complex_bound_op, &complex_arpl_op, INVALID, INVALID, INVALID, INVALID, /* PUSH imm IMUL PUSH imm IMUL*/ - &push_imm_op, &vector_mul_op, &push_imm_op, &vector_mul_op, + &push_imm_op, &complex_mul_op, &push_imm_op, &complex_mul_op, /* INSB INSW OUTSB OUTSW*/ - &vector_ins_op, &vector_ins_op, &vector_outs_op, &vector_outs_op, + &complex_ins_op, &complex_ins_op, &complex_outs_op, &complex_outs_op, /* Jxx*/ /*70*/ &branch_op, &branch_op, &branch_op, &branch_op, @@ -1068,29 +1151,29 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = /*80*/ INVALID, INVALID, INVALID, INVALID, /* TEST TEST XCHG XCHG*/ - &vector_alu1_op, &vector_alu1_op, &vector_alu3_op, &vector_alu3_op, + &complex_alu1_op, &complex_alu1_op, &complex_alu3_op, &complex_alu3_op, /* MOV MOV MOV MOV*/ &store_op, &store_op, &load_op, &load_op, /* MOV from seg LEA MOV to seg POP*/ - &mov_reg_seg_op, &store_op, &vector_mov_seg_reg_op, &pop_reg_op, + &mov_reg_seg_op, &store_op, &complex_mov_seg_reg_op, &pop_reg_op, /* NOP XCHG XCHG XCHG*/ /*90*/ &limm_op, &xchg_op, &xchg_op, &xchg_op, /* XCHG XCHG XCHG XCHG*/ &xchg_op, &xchg_op, &xchg_op, &xchg_op, /* CBW CWD CALL far WAIT*/ - &vector_alu1_op, &vector_alu1_op, &vector_call_far_op, &limm_op, + &complex_alu1_op, &complex_alu1_op, &complex_call_far_op, &limm_op, /* PUSHF POPF SAHF LAHF*/ - &vector_pushf_op, &vector_popf_op, &vector_alux1_op, &vector_alux1_op, + &complex_pushf_op, &complex_popf_op, &complex_alux1_op, &complex_alux1_op, /* MOV MOV MOV MOV*/ /*a0*/ &load_op, &load_op, &store_op, &store_op, /* MOVSB MOVSW CMPSB CMPSW*/ - &movs_op, &movs_op, &vector_cmpsb_op, &vector_cmps_op, + &movs_op, &movs_op, &complex_cmpsb_op, &complex_cmps_op, /* TEST TEST STOSB STOSW*/ &test_reg_b_op, &test_reg_op, &stos_op, &stos_op, /* LODSB LODSW SCASB SCASW*/ - &lods_op, &lods_op, &vector_scasb_op, &vector_scas_op, + &lods_op, &lods_op, &complex_scasb_op, &complex_scas_op, /* MOV*/ /*b0*/ &limm_op, &limm_op, &limm_op, &limm_op, @@ -1099,57 +1182,57 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = &limm_op, &limm_op, &limm_op, &limm_op, /* RET imm RET*/ -/*c0*/ INVALID, INVALID, &vector_ret_op, &vector_ret_op, +/*c0*/ INVALID, INVALID, &complex_ret_op, &complex_ret_op, /* LES LDS MOV MOV*/ - &vector_lss_op, &vector_lss_op, &store_op, &store_op, + &complex_lss_op, &complex_lss_op, &store_op, &store_op, /* ENTER LEAVE RETF RETF*/ - &vector_enter_op, &leave_op, &vector_retf_op, &vector_retf_op, + &complex_enter_op, &leave_op, &complex_retf_op, &complex_retf_op, /* INT3 INT INTO IRET*/ - &vector_int_op, &vector_int_op, &vector_int_op, &vector_iret_op, + &complex_int_op, &complex_int_op, &complex_int_op, &complex_iret_op, /*d0*/ INVALID, INVALID, INVALID, INVALID, /* AAM AAD SETALC XLAT*/ - &vector_alux6_op, &vector_alux3_op, &vector_alux1_op, &vector_xlat_op, + &complex_alux6_op, &complex_alux3_op, &complex_alux1_op, &complex_xlat_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* LOOPNE LOOPE LOOP JCXZ*/ -/*e0*/ &vector_loop_op, &vector_loop_op, &loop_op, &vector_loop_op, +/*e0*/ &complex_loop_op, &complex_loop_op, &loop_op, &complex_loop_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* CALL JMP JMP JMP*/ - &store_op, &branch_op, &vector_jmp_far_op, &branch_op, + &store_op, &branch_op, &complex_jmp_far_op, &branch_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* REPNE REPE*/ /*f0*/ INVALID, INVALID, INVALID, INVALID, /* HLT CMC*/ - &vector_alux1_op, &vector_alu2_op, INVALID, INVALID, + &complex_alux1_op, &complex_alu2_op, INVALID, INVALID, /* CLC STC CLI STI*/ - &vector_alu1_op, &vector_alu1_op, &vector_cli_sti_op, &vector_cli_sti_op, + &complex_alu1_op, &complex_alu1_op, &complex_cli_sti_op, &complex_cli_sti_op, /* CLD STD INCDEC*/ - &vector_alu1_op, &vector_alu1_op, &vector_alux1_op, INVALID + &complex_alu1_op, &complex_alu1_op, &complex_alux1_op, INVALID }; static const risc86_instruction_t *opcode_timings_0f[256] = { -/*00*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - INVALID, &vector_alu6_op, &vector_alu6_op, INVALID, - &vector_invd_op, &vector_wbinvd_op, INVALID, INVALID, - INVALID, &load_op, &vector_femms_op, INVALID, +/*00*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + INVALID, &complex_alu6_op, &complex_alu6_op, INVALID, + &complex_invd_op, &complex_wbinvd_op, INVALID, INVALID, + INVALID, &load_op, &complex_femms_op, INVALID, /*10*/ INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, -/*20*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - &vector_alu6_op, &vector_alu6_op, INVALID, INVALID, +/*20*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + &complex_alu6_op, &complex_alu6_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, -/*30*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, INVALID, +/*30*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, @@ -1170,7 +1253,7 @@ static const risc86_instruction_t *opcode_timings_0f[256] = INVALID, INVALID, &mload_op, &mload_op, /*70*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, - &load_mmx_op, &load_mmx_op, &load_mmx_op, &vector_emms_op, + &load_mmx_op, &load_mmx_op, &load_mmx_op, &complex_emms_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, &mstore_op, &mstore_op, @@ -1179,122 +1262,122 @@ static const risc86_instruction_t *opcode_timings_0f[256] = &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, -/*90*/ &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, - &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, - &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, - &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, +/*90*/ &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, + &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, + &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, + &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, -/*a0*/ &push_seg_op, &vector_mov_seg_mem_op, &vector_cpuid_op, &vector_load_alu_op, - &vector_alu_store_op, &vector_alu_store_op, INVALID, INVALID, - &push_seg_op, &vector_mov_seg_mem_op, INVALID, &vector_load_alu_op, - &vector_alu_store_op, &vector_alu_store_op, INVALID, &vector_mul_op, +/*a0*/ &push_seg_op, &complex_mov_seg_mem_op,&complex_cpuid_op, &complex_load_alu_op, + &complex_alu_store_op, &complex_alu_store_op, INVALID, INVALID, + &push_seg_op, &complex_mov_seg_mem_op,INVALID, &complex_load_alu_op, + &complex_alu_store_op, &complex_alu_store_op, INVALID, &complex_mul_op, -/*b0*/ &vector_cmpxchg_b_op, &vector_cmpxchg_op, &vector_lss_op, &vector_load_alu_op, - &vector_lss_op, &vector_lss_op, &load_alux_op, &load_alu_op, - INVALID, INVALID, &vector_load_alu_op, &vector_load_alu_op, - &vector_bsx_op, &vector_bsx_op, &load_alux_op, &load_alu_op, +/*b0*/ &complex_cmpxchg_b_op, &complex_cmpxchg_op, &complex_lss_op, &complex_load_alu_op, + &complex_lss_op, &complex_lss_op, &load_alux_op, &load_alu_op, + INVALID, INVALID, &complex_load_alu_op, &complex_load_alu_op, + &complex_bsx_op, &complex_bsx_op, &load_alux_op, &load_alu_op, -/*c0*/ &vector_alux_store_op, &vector_alu_store_op, INVALID, INVALID, - INVALID, INVALID, INVALID, &vector_cmpxchg_op, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, +/*c0*/ &complex_alux_store_op, &complex_alu_store_op, INVALID, INVALID, + INVALID, INVALID, INVALID, &complex_cmpxchg_op, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, -/*d0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, - INVALID, &load_mmx_mul_op, INVALID, INVALID, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, +/*d0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, + INVALID, &load_mmx_mul_op, INVALID, INVALID, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, -/*e0*/ &load_mmx_op, &load_mmx_shift_op, &load_mmx_shift_op, INVALID, - INVALID, &pmul_mem_op, INVALID, INVALID, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, +/*e0*/ &load_mmx_op, &load_mmx_shift_op, &load_mmx_shift_op, INVALID, + INVALID, &pmul_mem_op, INVALID, INVALID, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, -/*f0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, - INVALID, &pmul_mem_op, INVALID, INVALID, - &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, - &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, +/*f0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, + INVALID, &pmul_mem_op, INVALID, INVALID, + &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, + &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, }; static const risc86_instruction_t *opcode_timings_0f_mod3[256] = { -/*00*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - INVALID, &vector_alu6_op, &vector_alu6_op, INVALID, - &vector_invd_op, &vector_wbinvd_op, INVALID, INVALID, - INVALID, INVALID, &vector_femms_op, INVALID, +/*00*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + INVALID, &complex_alu6_op, &complex_alu6_op, INVALID, + &complex_invd_op, &complex_wbinvd_op, INVALID, INVALID, + INVALID, INVALID, &complex_femms_op, INVALID, -/*10*/ INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*10*/ INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*20*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - &vector_alu6_op, &vector_alu6_op, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*20*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + &complex_alu6_op, &complex_alu6_op, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*30*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*30*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*40*/ INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*40*/ INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*50*/ INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*50*/ INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*60*/ &mmx_op, &mmx_op, &mmx_op, &mmx_op, - &mmx_op, &mmx_op, &mmx_op, &mmx_op, - &mmx_op, &mmx_op, &mmx_op, &mmx_op, - INVALID, INVALID, &mmx_op, &mmx_op, +/*60*/ &mmx_op, &mmx_op, &mmx_op, &mmx_op, + &mmx_op, &mmx_op, &mmx_op, &mmx_op, + &mmx_op, &mmx_op, &mmx_op, &mmx_op, + INVALID, INVALID, &mmx_op, &mmx_op, -/*70*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, - &mmx_op, &mmx_op, &mmx_op, &vector_emms_op, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, &mmx_op, &mmx_op, +/*70*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, + &mmx_op, &mmx_op, &mmx_op, &complex_emms_op, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, &mmx_op, &mmx_op, /*80*/ &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, -/*90*/ &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, - &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, - &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, - &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, +/*90*/ &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, + &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, + &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, + &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, -/*a0*/ &push_seg_op, &vector_mov_seg_mem_op, &vector_cpuid_op, &vector_alu1_op, - &vector_alu1_op, &vector_alu1_op, INVALID, INVALID, - &push_seg_op, &vector_mov_seg_mem_op, INVALID, &vector_alu1_op, - &vector_alu1_op, &vector_alu1_op, INVALID, &vector_mul_op, +/*a0*/ &push_seg_op, &complex_mov_seg_mem_op, &complex_cpuid_op, &complex_alu1_op, + &complex_alu1_op, &complex_alu1_op, INVALID, INVALID, + &push_seg_op, &complex_mov_seg_mem_op, INVALID, &complex_alu1_op, + &complex_alu1_op, &complex_alu1_op, INVALID, &complex_mul_op, -/*b0*/ &vector_cmpxchg_b_op, &vector_cmpxchg_op, &vector_lss_op, &vector_alu1_op, - &vector_lss_op, &vector_lss_op, &alux_op, &alu_op, - INVALID, INVALID, &vector_alu1_op, &vector_alu1_op, - &vector_bsx_op, &vector_bsx_op, &alux_op, &alu_op, +/*b0*/ &complex_cmpxchg_b_op, &complex_cmpxchg_op, &complex_lss_op, &complex_alu1_op, + &complex_lss_op, &complex_lss_op, &alux_op, &alu_op, + INVALID, INVALID, &complex_alu1_op, &complex_alu1_op, + &complex_bsx_op, &complex_bsx_op, &alux_op, &alu_op, -/*c0*/ &vector_alux1_op, &vector_alu1_op, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, +/*c0*/ &complex_alux1_op, &complex_alu1_op, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, -/*d0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, - INVALID, &mmx_mul_op, INVALID, INVALID, - &mmx_op, &mmx_op, INVALID, &mmx_op, - &mmx_op, &mmx_op, INVALID, &mmx_op, +/*d0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, + INVALID, &mmx_mul_op, INVALID, INVALID, + &mmx_op, &mmx_op, INVALID, &mmx_op, + &mmx_op, &mmx_op, INVALID, &mmx_op, -/*e0*/ &mmx_op, &mmx_shift_op, &mmx_shift_op, INVALID, - INVALID, &pmul_op, INVALID, INVALID, - &mmx_op, &mmx_op, INVALID, &mmx_op, - &mmx_op, &mmx_op, INVALID, &mmx_op, +/*e0*/ &mmx_op, &mmx_shift_op, &mmx_shift_op, INVALID, + INVALID, &pmul_op, INVALID, INVALID, + &mmx_op, &mmx_op, INVALID, &mmx_op, + &mmx_op, &mmx_op, INVALID, &mmx_op, -/*f0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, - INVALID, &pmul_op, INVALID, INVALID, - &mmx_op, &mmx_op, &mmx_op, INVALID, - &mmx_op, &mmx_op, &mmx_op, INVALID, +/*f0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, + INVALID, &pmul_op, INVALID, INVALID, + &mmx_op, &mmx_op, &mmx_op, INVALID, + &mmx_op, &mmx_op, &mmx_op, INVALID, }; static const risc86_instruction_t *opcode_timings_0f0f[256] = @@ -1466,100 +1549,100 @@ static const risc86_instruction_t *opcode_timings_0f0f_mod3[256] = static const risc86_instruction_t *opcode_timings_shift[8] = { - &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op, - &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op + &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op, + &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op }; static const risc86_instruction_t *opcode_timings_shift_b[8] = { - &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op, - &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op + &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op, + &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op }; static const risc86_instruction_t *opcode_timings_shift_mod3[8] = { - &vector_alu1_op, &vector_alu1_op, &vector_alu1_op, &vector_alu1_op, - &alu_op, &alu_op, &alu_op, &alu_op + &complex_alu1_op, &complex_alu1_op, &complex_alu1_op, &complex_alu1_op, + &alu_op, &alu_op, &alu_op, &alu_op }; static const risc86_instruction_t *opcode_timings_shift_b_mod3[8] = { - &vector_alux1_op, &vector_alux1_op, &vector_alux1_op, &vector_alux1_op, - &alux_op, &alux_op, &alux_op, &alux_op + &complex_alux1_op, &complex_alux1_op, &complex_alux1_op, &complex_alux1_op, + &alux_op, &alux_op, &alux_op, &alux_op }; static const risc86_instruction_t *opcode_timings_80[8] = { - &alux_store_op, &alux_store_op, &vector_alux_store_op, &vector_alux_store_op, - &alux_store_op, &alux_store_op, &alux_store_op, &alux_store_op, + &alux_store_op, &alux_store_op, &complex_alux_store_op, &complex_alux_store_op, + &alux_store_op, &alux_store_op, &alux_store_op, &alux_store_op, }; static const risc86_instruction_t *opcode_timings_80_mod3[8] = { - &alux_op, &alux_op, &alux_store_op, &alux_store_op, - &alux_op, &alux_op, &alux_op, &alux_op, + &alux_op, &alux_op, &alux_store_op, &alux_store_op, + &alux_op, &alux_op, &alux_op, &alux_op, }; static const risc86_instruction_t *opcode_timings_8x[8] = { - &alu_store_op, &alu_store_op, &vector_alu_store_op, &vector_alu_store_op, - &alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op, + &alu_store_op, &alu_store_op, &complex_alu_store_op, &complex_alu_store_op, + &alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op, }; static const risc86_instruction_t *opcode_timings_8x_mod3[8] = { - &alu_op, &alu_op, &alu_store_op, &alu_store_op, - &alu_op, &alu_op, &alu_op, &alu_op, + &alu_op, &alu_op, &alu_store_op, &alu_store_op, + &alu_op, &alu_op, &alu_op, &alu_op, }; static const risc86_instruction_t *opcode_timings_f6[8] = { /* TST NOT NEG*/ - &test_mem_imm_b_op, INVALID, &vector_alux_store_op, &vector_alux_store_op, + &test_mem_imm_b_op, INVALID, &complex_alux_store_op, &complex_alux_store_op, /* MUL IMUL DIV IDIV*/ - &vector_mul_mem_op, &vector_mul_mem_op, &vector_div16_mem_op, &vector_div16_mem_op, + &complex_mul_mem_op, &complex_mul_mem_op, &complex_div16_mem_op, &complex_div16_mem_op, }; static const risc86_instruction_t *opcode_timings_f6_mod3[8] = { /* TST NOT NEG*/ &test_reg_b_op, INVALID, &alux_op, &alux_op, /* MUL IMUL DIV IDIV*/ - &vector_mul_op, &vector_mul_op, &vector_div16_op, &vector_div16_op, + &complex_mul_op, &complex_mul_op, &complex_div16_op, &complex_div16_op, }; static const risc86_instruction_t *opcode_timings_f7[8] = { /* TST NOT NEG*/ - &test_mem_imm_op, INVALID, &vector_alu_store_op, &vector_alu_store_op, + &test_mem_imm_op, INVALID, &complex_alu_store_op, &complex_alu_store_op, /* MUL IMUL DIV IDIV*/ - &vector_mul64_mem_op, &vector_mul64_mem_op, &vector_div32_mem_op, &vector_div32_mem_op, + &complex_mul64_mem_op, &complex_mul64_mem_op, &complex_div32_mem_op, &complex_div32_mem_op, }; static const risc86_instruction_t *opcode_timings_f7_mod3[8] = { /* TST NOT NEG*/ &test_reg_op, INVALID, &alu_op, &alu_op, /* MUL IMUL DIV IDIV*/ - &vector_mul64_op, &vector_mul64_op, &vector_div32_op, &vector_div32_op, + &complex_mul64_op, &complex_mul64_op, &complex_div32_op, &complex_div32_op, }; static const risc86_instruction_t *opcode_timings_ff[8] = { /* INC DEC CALL CALL far*/ - &alu_store_op, &alu_store_op, &store_op, &vector_call_far_op, + &alu_store_op, &alu_store_op, &store_op, &complex_call_far_op, /* JMP JMP far PUSH*/ - &branch_op, &vector_jmp_far_op, &push_mem_op, INVALID + &branch_op, &complex_jmp_far_op, &push_mem_op, INVALID }; static const risc86_instruction_t *opcode_timings_ff_mod3[8] = { /* INC DEC CALL CALL far*/ - &vector_alu1_op, &vector_alu1_op, &store_op, &vector_call_far_op, + &complex_alu1_op, &complex_alu1_op, &store_op, &complex_call_far_op, /* JMP JMP far PUSH*/ - &branch_op, &vector_jmp_far_op, &vector_push_mem_op, INVALID + &branch_op, &complex_jmp_far_op, &complex_push_mem_op, INVALID }; static const risc86_instruction_t *opcode_timings_d8[8] = { /* FADDs FMULs FCOMs FCOMPs*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op, /* FSUBs FSUBRs FDIVs FDIVRs*/ &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, }; static const risc86_instruction_t *opcode_timings_d8_mod3[8] = { /* FADD FMUL FCOM FCOMP*/ - &float_op, &float_op, &float_op, &float_op, + &fadd_op, &fmul_op, &float_op, &float_op, /* FSUB FSUBR FDIV FDIVR*/ &float_op, &float_op, &fdiv_op, &fdiv_op, }; @@ -1569,7 +1652,7 @@ static const risc86_instruction_t *opcode_timings_d9[8] = /* FLDs FSTs FSTPs*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FLDENV FLDCW FSTENV FSTCW*/ - &vector_float_l_op, &vector_fldcw_op, &vector_float_l_op, &vector_float_op + &complex_float_l_op, &complex_fldcw_op, &complex_float_l_op, &complex_float_op }; static const risc86_instruction_t *opcode_timings_d9_mod3[64] = { @@ -1577,16 +1660,16 @@ static const risc86_instruction_t *opcode_timings_d9_mod3[64] = &float_op, &float_op, &float_op, &float_op, &float_op, &float_op, &float_op, &float_op, /*FXCH*/ - &float_op, &float_op, &float_op, &float_op, - &float_op, &float_op, &float_op, &float_op, + &limm_op, &limm_op, &limm_op, &limm_op, + &limm_op, &limm_op, &limm_op, &limm_op, /*FNOP*/ &float_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /*FSTP*/ - &float_op, &float_op, &float_op, &float_op, - &float_op, &float_op, &float_op, &float_op, + &float2_op, &float2_op, &float2_op, &float2_op, + &float2_op, &float2_op, &float2_op, &float2_op, /* opFCHS opFABS*/ - &float_op, &float_op, INVALID, INVALID, + &fchs_op, &float_op, INVALID, INVALID, /* opFTST opFXAM*/ &float_op, &float_op, INVALID, INVALID, /* opFLD1 opFLDL2T opFLDL2E opFLDPI*/ @@ -1606,7 +1689,7 @@ static const risc86_instruction_t *opcode_timings_d9_mod3[64] = static const risc86_instruction_t *opcode_timings_da[8] = { /* FIADDl FIMULl FICOMl FICOMPl*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op, /* FISUBl FISUBRl FIDIVl FIDIVRl*/ &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, }; @@ -1622,7 +1705,7 @@ static const risc86_instruction_t *opcode_timings_db[8] = /* FLDil FSTil FSTPil*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FLDe FSTPe*/ - INVALID, &vector_flde_op, INVALID, &vector_fste_op + INVALID, &complex_flde_op, INVALID, &complex_fste_op }; static const risc86_instruction_t *opcode_timings_db_mod3[64] = { @@ -1656,14 +1739,14 @@ static const risc86_instruction_t *opcode_timings_db_mod3[64] = static const risc86_instruction_t *opcode_timings_dc[8] = { /* FADDd FMULd FCOMd FCOMPd*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op, /* FSUBd FSUBRd FDIVd FDIVRd*/ &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, }; static const risc86_instruction_t *opcode_timings_dc_mod3[8] = { /* opFADDr opFMULr*/ - &float_op, &float_op, INVALID, INVALID, + &fadd_op, &fmul_op, INVALID, INVALID, /* opFSUBRr opFSUBr opFDIVRr opFDIVr*/ &float_op, &float_op, &fdiv_op, &fdiv_op }; @@ -1673,7 +1756,7 @@ static const risc86_instruction_t *opcode_timings_dd[8] = /* FLDd FSTd FSTPd*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FRSTOR FSAVE FSTSW*/ - &vector_float_l_op, INVALID, &vector_float_l_op, &vector_float_l_op + &complex_float_l_op, INVALID, &complex_float_l_op, &complex_float_l_op }; static const risc86_instruction_t *opcode_timings_dd_mod3[8] = { @@ -1686,14 +1769,14 @@ static const risc86_instruction_t *opcode_timings_dd_mod3[8] = static const risc86_instruction_t *opcode_timings_de[8] = { /* FIADDw FIMULw FICOMw FICOMPw*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, /* FISUBw FISUBRw FIDIVw FIDIVRw*/ - &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, + &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, }; static const risc86_instruction_t *opcode_timings_de_mod3[8] = { /* FADDP FMULP FCOMPP*/ - &float_op, &float_op, INVALID, &float_op, + &fadd_op, &fmul_op, INVALID, &float_op, /* FSUBP FSUBRP FDIVP FDIVRP*/ &float_op, &float_op, &fdiv_op, &fdiv_op, }; @@ -1703,7 +1786,7 @@ static const risc86_instruction_t *opcode_timings_df[8] = /* FILDiw FISTiw FISTPiw*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FILDiq FBSTP FISTPiq*/ - INVALID, &load_float_op, &vector_float_l_op, &fstore_op, + INVALID, &load_float_op, &complex_float_l_op, &fstore_op, }; static const risc86_instruction_t *opcode_timings_df_mod3[8] = { @@ -1731,34 +1814,27 @@ static p6_unit_t *units; /*Pentium Pro has no MMX*/ static p6_unit_t ppro_units[] = { - {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX)}, /*Integer X*/ - {.uop_mask = (1 << UOP_ALU)}, /*Integer Y*/ - {.uop_mask = (1 << UOP_FLOAT)}, /*Floating point*/ - {.uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD) | (1 << UOP_MLOAD)}, /*Load*/ - {.uop_mask = (1 << UOP_STORE) | (1 << UOP_FSTORE) | (1 << UOP_MSTORE)}, /*Store*/ - {.uop_mask = (1 << UOP_BRANCH)} /*Branch*/ + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX) | (1 << UOP_FLOAT)}, /*Integer X & Floating point*/ + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_BRANCH)}, /*Integer Y*/ + {.uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD)}, /*Load*/ + {.uop_mask = (1 << UOP_STORED) | (1 << UOP_FSTORED)}, /*Data Store*/ + {.uop_mask = (1 << UOP_STOREA) | (1 << UOP_FSTOREA)}, /*Address Store*/ }; #define NR_PPRO_UNITS (sizeof(ppro_units) / sizeof(p6_unit_t)) -/*Well, it works I guess*/ +/*Pentium II/Celeron assigns the multiplier to port 0, the shifter to port 1, and shares the MMX ALU*/ static p6_unit_t p2_units[] = { - {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX) | (1 << UOP_MEU) | /*Integer X*/ - (1 << UOP_MEU_SHIFT) | (1 << UOP_MEU_MUL)}, - {.uop_mask = (1 << UOP_ALU) | (1 << UOP_MEU) | /*Integer Y*/ - (1 << UOP_MEU_SHIFT) | (1 << UOP_MEU_MUL)}, - {.uop_mask = (1 << UOP_FLOAT)}, /*Floating point*/ + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX) | (1 << UOP_FLOAT) | /*Integer X & Floating point*/ + (1 << UOP_MMX) | (1 << UOP_MMX_MUL)}, + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_BRANCH) | /*Integer Y*/ + (1 << UOP_MMX) | (1 << UOP_MMX_SHIFT)}, {.uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD) | (1 << UOP_MLOAD)}, /*Load*/ - {.uop_mask = (1 << UOP_STORE) | (1 << UOP_FSTORE) | (1 << UOP_MSTORE)}, /*Store*/ - {.uop_mask = (1 << UOP_BRANCH)} /*Branch*/ + {.uop_mask = (1 << UOP_STORED) | (1 << UOP_FSTORED) | (1 << UOP_MSTORED)}, /*Data Store*/ + {.uop_mask = (1 << UOP_STOREA) | (1 << UOP_FSTOREA) | (1 << UOP_MSTOREA)}, /*Address Store*/ }; #define NR_P2_UNITS (sizeof(p2_units) / sizeof(p6_unit_t)) -/*First available cycles of shared execution units. Each of these can be submitted - to by ALU X and Y*/ -static int mul_first_available_cycle; -static int shift_first_available_cycle; - static int uop_run(const risc86_uop_t *uop, int decode_time) { int c; @@ -1767,16 +1843,8 @@ static int uop_run(const risc86_uop_t *uop, int decode_time) /*UOP_LIMM does not require execution*/ if (uop->type == UOP_LIMM) - return decode_time; + return decode_time; - if (units == p2_units) /*More hackyness*/ - { - if (uop->type == UOP_MEU_MUL && decode_time < mul_first_available_cycle) - decode_time = mul_first_available_cycle; - else if (uop->type == UOP_MEU_SHIFT && decode_time < mul_first_available_cycle) - decode_time = shift_first_available_cycle; - } - /*Find execution unit for this uOP*/ for (c = 0; c < nr_units; c++) { @@ -1790,43 +1858,36 @@ static int uop_run(const risc86_uop_t *uop, int decode_time) } } if (!best_unit) - fatal("uop_run: can not find execution unit\n"); + fatal("uop_run: can not find execution unit\n"); if (best_start_cycle < decode_time) best_start_cycle = decode_time; - best_unit->first_available_cycle = best_start_cycle + uop->throughput; + best_unit->first_available_cycle = best_start_cycle + uop->latency; - if (units == p2_units) /*More hackyness*/ - { - if (uop->type == UOP_MEU_MUL) - mul_first_available_cycle = best_start_cycle + uop->throughput; - else if (uop->type == UOP_MEU_SHIFT) - shift_first_available_cycle = best_start_cycle + uop->throughput; - } - return best_start_cycle + uop->throughput; + + return best_start_cycle + uop->latency; } -/*The K6 decoder can decode, per clock : - - 1 or 2 'short' instructions, each up to 2 uOPs and 7 bytes long - - 1 'long' instruction, up to 4 uOPs - - 1 'vector' instruction, up to 4 uOPs per cycle, plus (I think) 1 cycle startup delay) +/*The P6 decoders can decode, per clock : + - 1 to 3 'simple' instructions, each up to 1 uOP and 7 bytes long + - 1 'complex' instruction, up to 4 uOPs or 3 per cycle for instructions longer than 4 uOPs */ static struct { int nr_uops; - const risc86_uop_t *uops[4]; + const risc86_uop_t *uops[6]; /*Earliest time a uop can start. If the timestamp is -1, then the uop is part of a dependency chain and the start time is the completion time of the previous uop*/ - int earliest_start[4]; + int earliest_start[6]; } decode_buffer; -#define NR_OPQUADS 6 -/*Timestamps of when the last six opquads completed. The K6 scheduler retires - opquads in order, so this is needed to determine when the next can be scheduled*/ -static int opquad_completion_timestamp[NR_OPQUADS]; -static int next_opquad = 0; +#define NR_OPSEQS 3 +/*Timestamps of when the last three op sequences completed. Technically this is incorrect, +as the actual size of the opseq buffer is 20 bytes and not 18, but I'm restricted to multiples of 6*/ +static int opseq_completion_timestamp[NR_OPSEQS]; +static int next_opseq = 0; #define NR_REGS 8 /*Timestamp of when last operation on an integer register completed*/ @@ -1841,50 +1902,48 @@ void decode_flush_p6() { int c; int uop_timestamp = 0; - - /*Decoded opquad can not be submitted if there are no free spaces in the - opquad buffer*/ - if (decode_timestamp < opquad_completion_timestamp[next_opquad]) - decode_timestamp = opquad_completion_timestamp[next_opquad]; + + /*Decoded opseq can not be submitted if there are no free spaces in the + opseq buffer*/ + if (decode_timestamp < opseq_completion_timestamp[next_opseq]) + decode_timestamp = opseq_completion_timestamp[next_opseq]; /*Ensure that uops can not be submitted before they have been decoded*/ if (decode_timestamp > last_uop_timestamp) last_uop_timestamp = decode_timestamp; /*Submit uops to execution units, and determine the latest completion time*/ - for (c = 0; c < decode_buffer.nr_uops; c++) + for (c = 0; c < (decode_buffer.nr_uops); c++) { int start_timestamp; - if (decode_buffer.earliest_start[c] == -1) - start_timestamp = last_uop_timestamp; - else - start_timestamp = decode_buffer.earliest_start[c]; + + start_timestamp = decode_buffer.earliest_start[c]; last_uop_timestamp = uop_run(decode_buffer.uops[c], start_timestamp); if (last_uop_timestamp > uop_timestamp) uop_timestamp = last_uop_timestamp; } - /*Calculate opquad completion time. Since opquads complete in order, it + /*Calculate opseq completion time. Since opseqs complete in order, it must be after the last completion.*/ if (uop_timestamp <= last_complete_timestamp) last_complete_timestamp = last_complete_timestamp + 1; else last_complete_timestamp = uop_timestamp; - /*Advance to next opquad in buffer*/ - opquad_completion_timestamp[next_opquad] = last_complete_timestamp; - next_opquad++; - if (next_opquad == NR_OPQUADS) - next_opquad = 0; + /*Advance to next opseq in buffer*/ + opseq_completion_timestamp[next_opseq] = last_complete_timestamp; + next_opseq++; + if (next_opseq == NR_OPSEQS) + next_opseq = 0; decode_timestamp++; decode_buffer.nr_uops = 0; } /*The instruction is only of interest here if it's longer than 7 bytes, as that's the - limit on K6 short decoding*/ + limit on P6 simple decoding*/ static int codegen_timing_instr_length(uint64_t deps, uint32_t fetchdat, int op_32) { int len = prefixes + 1; /*Opcode*/ @@ -1936,8 +1995,9 @@ static int codegen_timing_instr_length(uint64_t deps, uint32_t fetchdat, int op_ static void decode_instruction(const risc86_instruction_t *ins, uint64_t deps, uint32_t fetchdat, int op_32, int bit8) { uint32_t regmask_required; - uint32_t regmask_modified; - int c, d; + uint32_t regmask_modified; + int c; + int d = 0; /*Complex decoder uOPs*/ int earliest_start = 0; decode_type_t decode_type = ins->decode_type; int instr_length = codegen_timing_instr_length(deps, fetchdat, op_32); @@ -1967,87 +2027,68 @@ static void decode_instruction(const risc86_instruction_t *ins, uint64_t deps, u earliest_start = fpu_st_timestamp[reg]; } - /*Short decoders are limited to 7 bytes*/ - if (decode_type == DECODE_SHORT && instr_length > 7) - decode_type = DECODE_LONG; - /*Long decoder is limited to 11 bytes*/ - else if (instr_length > 11) - decode_type = DECODE_VECTOR; + /*Simple decoders are limited to 7 bytes & 1 uOP*/ + if (decode_type == DECODE_SIMPLE && instr_length > 7) + decode_type = DECODE_COMPLEX; + else if (decode_type == DECODE_SIMPLE && ins->nr_uops > 1) + decode_type = DECODE_COMPLEX; switch (decode_type) - { - case DECODE_SHORT: - if (decode_buffer.nr_uops) + { + case DECODE_SIMPLE: + if (decode_buffer.nr_uops - d == 2) + { + decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0]; + decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start; + decode_buffer.nr_uops = 3; + decode_flush_p6(); + } + else if (decode_buffer.nr_uops - d == 1) + { + decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0]; + decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start; + decode_buffer.nr_uops = 2+d; + if (d) + decode_flush_p6(); + } + else if (decode_buffer.nr_uops) { decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0]; decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start; - if (ins->nr_uops > 1) - { - decode_buffer.uops[decode_buffer.nr_uops+1] = &ins->uop[1]; - decode_buffer.earliest_start[decode_buffer.nr_uops+1] = -1; - } - decode_buffer.nr_uops += ins->nr_uops; - - decode_flush_p6(); + decode_buffer.nr_uops = 1+d; } else { - decode_buffer.nr_uops = ins->nr_uops; + decode_buffer.nr_uops = 1; decode_buffer.uops[0] = &ins->uop[0]; decode_buffer.earliest_start[0] = earliest_start; - if (ins->nr_uops > 1) - { - decode_buffer.uops[1] = &ins->uop[1]; - decode_buffer.earliest_start[1] = -1; - } - } + } break; - case DECODE_LONG: + case DECODE_COMPLEX: if (decode_buffer.nr_uops) - decode_flush_p6(); + decode_flush_p6(); /*The 4-1-1 arrangement implies that a complex ins. can't be decoded after a simple one*/ - decode_buffer.nr_uops = ins->nr_uops; - for (c = 0; c < ins->nr_uops; c++) - { - decode_buffer.uops[c] = &ins->uop[c]; - if (c == 0) - decode_buffer.earliest_start[c] = earliest_start; - else - decode_buffer.earliest_start[c] = -1; - } - decode_flush_p6(); - break; - - case DECODE_VECTOR: - if (decode_buffer.nr_uops) - decode_flush_p6(); - - decode_timestamp++; d = 0; - + for (c = 0; c < ins->nr_uops; c++) { decode_buffer.uops[d] = &ins->uop[c]; - if (c == 0) - decode_buffer.earliest_start[d] = earliest_start; - else - decode_buffer.earliest_start[d] = -1; - d++; + decode_buffer.earliest_start[c] = earliest_start; + d++; - if (d == 4) + if (d == 3 && ins->nr_uops > 4) /*Ins. with >4 uOPs require the use of special units only present on 3 translate PLAs*/ { d = 0; - decode_buffer.nr_uops = 4; - decode_flush_p6(); + decode_buffer.nr_uops = 3; + decode_flush_p6(); /*The other two decoders are halted to preserve in-order issue*/ } } - if (d) - { - decode_buffer.nr_uops = d; - decode_flush_p6(); - } - break; + if (d) + { + decode_buffer.nr_uops = d; + } + break; } /*Update write timestamps for any output registers*/ @@ -2098,15 +2139,12 @@ void codegen_timing_p6_block_start() for (c = 0; c < nr_units; c++) units[c].first_available_cycle = 0; - mul_first_available_cycle = 0; - shift_first_available_cycle = 0; - decode_timestamp = 0; last_complete_timestamp = 0; - for (c = 0; c < NR_OPQUADS; c++) - opquad_completion_timestamp[c] = 0; - next_opquad = 0; + for (c = 0; c < NR_OPSEQS; c++) + opseq_completion_timestamp[c] = 0; + next_opseq = 0; for (c = 0; c < NR_REGS; c++) reg_available_timestamp[c] = 0; @@ -2298,7 +2336,7 @@ void codegen_timing_p6_opcode(uint8_t opcode, uint32_t fetchdat, int op_32, uint if (ins_table[opcode]) decode_instruction(ins_table[opcode], deps[opcode], fetchdat, op_32, bit8); else - decode_instruction(&vector_alu1_op, 0, fetchdat, op_32, bit8); + decode_instruction(&complex_alu1_op, 0, fetchdat, op_32, bit8); codegen_block_cycles += (last_complete_timestamp - old_last_complete_timestamp); } @@ -2327,4 +2365,4 @@ codegen_timing_t codegen_timing_p6 = codegen_timing_p6_block_start, codegen_timing_p6_block_end, codegen_timing_p6_jump_cycles -}; +}; \ No newline at end of file diff --git a/src/cpu_new/codegen_timing_p6.c b/src/cpu_new/codegen_timing_p6.c index d84a18904..845f46803 100644 --- a/src/cpu_new/codegen_timing_p6.c +++ b/src/cpu_new/codegen_timing_p6.c @@ -1,4 +1,5 @@ -/*Hacky P6 timings based on K6 timings*/ +/*Basic P6 timing model by plant/nerd73. Based on the K6 timing model*/ +/*Some cycle timings come from https://www.agner.org/optimize/instruction_tables.pdf*/ #include #include <86box/86box.h> #include "cpu.h" @@ -18,24 +19,26 @@ typedef enum uop_type_t UOP_ALU = 0, /*Executes in Integer X or Y units*/ UOP_ALUX, /*Executes in Integer X unit*/ UOP_LOAD, /*Executes in Load unit*/ - UOP_STORE, /*Executes in Store unit*/ + UOP_STORED, /*Executes in Data Store unit*/ + UOP_STOREA, /*Executes in Address Store unit*/ UOP_FLOAD, /*Executes in Load unit*/ - UOP_FSTORE, /*Executes in Store unit*/ + UOP_FSTORED, /*Executes in Data Store unit*/ + UOP_FSTOREA, /*Executes in Address Store unit*/ UOP_MLOAD, /*Executes in Load unit*/ - UOP_MSTORE, /*Executes in Store unit*/ + UOP_MSTORED, /*Executes in Data Store unit*/ + UOP_MSTOREA, /*Executes in Address Store unit*/ UOP_FLOAT, /*Executes in Floating Point unit*/ - UOP_MEU, /*Executes in Multimedia unit*/ - UOP_MEU_SHIFT, /*Executes in Multimedia unit or ALU X/Y. Uses MMX shifter*/ - UOP_MEU_MUL, /*Executes in Multimedia unit or ALU X/Y. Uses MMX multiplier*/ + UOP_MMX, /*Executes in Integer X or Y units as MMX*/ + UOP_MMX_SHIFT, /*Executes in Integer Y unit. Uses MMX shifter*/ + UOP_MMX_MUL, /*Executes in Integer X unit. Uses MMX multiplier*/ UOP_BRANCH, /*Executes in Branch unit*/ UOP_LIMM /*Does not require an execution unit*/ } uop_type_t; typedef enum decode_type_t { - DECODE_SHORT, - DECODE_LONG, - DECODE_VECTOR + DECODE_SIMPLE, + DECODE_COMPLEX, } decode_type_t; #define MAX_UOPS 10 @@ -43,7 +46,6 @@ typedef enum decode_type_t typedef struct risc86_uop_t { uop_type_t type; - double throughput; double latency; } risc86_uop_t; @@ -57,804 +59,885 @@ typedef struct risc86_instruction_t static const risc86_instruction_t alu_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t alux_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t load_alu_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t load_alux_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t alu_store_op = { - .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} + }; static const risc86_instruction_t alux_store_op = { - .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} }; static const risc86_instruction_t branch_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_BRANCH, .latency = 1} }; static const risc86_instruction_t limm_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LIMM, .throughput = 1, .latency = 1} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_LIMM, .latency = 1} }; static const risc86_instruction_t load_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1} }; static const risc86_instruction_t store_op = { - .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1} }; static const risc86_instruction_t bswap_op = { - .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, }; static const risc86_instruction_t leave_op = { .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t lods_op = { .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t loop_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} }; static const risc86_instruction_t mov_reg_seg_op = { .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, }; static const risc86_instruction_t movs_op = { .nr_uops = 4, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t pop_reg_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t pop_mem_op = { - .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t push_imm_op = { - .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 2}, + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1}, }; static const risc86_instruction_t push_mem_op = { - .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1} }; static const risc86_instruction_t push_seg_op = { - .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t stos_op = { - .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t test_reg_op = { .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t test_reg_b_op = { .nr_uops = 1, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t test_mem_imm_op = { .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t test_mem_imm_b_op = { .nr_uops = 2, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} }; static const risc86_instruction_t xchg_op = { .nr_uops = 3, - .decode_type = DECODE_LONG, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; static const risc86_instruction_t mmx_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX, .latency = 1.5} }; static const risc86_instruction_t mmx_mul_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU_MUL, .throughput = 1.5, .latency = 3} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX_MUL, .latency = 1.5} }; static const risc86_instruction_t mmx_shift_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU_SHIFT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX_SHIFT, .latency = 1.5} }; static const risc86_instruction_t load_mmx_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU, .throughput = 1.5, .latency = 3} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_MMX, .latency = 1.5} }; static const risc86_instruction_t load_mmx_mul_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU_MUL, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_MMX_MUL, .latency = 1.5} }; static const risc86_instruction_t load_mmx_shift_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU_SHIFT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_MMX_SHIFT, .latency = 1.5} }; static const risc86_instruction_t mload_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MLOAD, .throughput = 1, .latency = 2} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_MLOAD, .latency = 3}, }; static const risc86_instruction_t mstore_op = { - .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MSTORE, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_MSTORED, .latency = 1}, + .uop[1] = {.type = UOP_MSTOREA, .latency = 1} }; static const risc86_instruction_t pmul_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_MEU_MUL, .throughput = 1, .latency = 2} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_MMX_MUL, .latency = 1.5} }; static const risc86_instruction_t pmul_mem_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_MEU_MUL, .throughput = 1, .latency = 2} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_MMX_MUL, .latency = 1.5} }; - static const risc86_instruction_t float_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_FLOAT, .latency = 1} +}; +static const risc86_instruction_t fadd_op = +{ + .nr_uops = 1, + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_FLOAT, .latency = 3} +}; +static const risc86_instruction_t fmul_op = +{ + .nr_uops = 1, + .decode_type = DECODE_SIMPLE, + .uop[0] = {.type = UOP_ALU, .latency = 5} +}; +static const risc86_instruction_t float2_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 1} +}; +static const risc86_instruction_t fchs_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 2}, + .uop[1] = {.type = UOP_FLOAT, .latency = 2}, + .uop[2] = {.type = UOP_FLOAT, .latency = 2} }; static const risc86_instruction_t load_float_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_FLOAT, .throughput = 1.5, .latency = 1.5} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 1} +}; +static const risc86_instruction_t load_fadd_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 3} +}; +static const risc86_instruction_t load_fmul_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 5} }; static const risc86_instruction_t fstore_op = { - .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1} + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FSTORED, .latency = 1}, + .uop[1] = {.type = UOP_FSTOREA, .latency = 1}, +}; +static const risc86_instruction_t load_fiadd_op = +{ + .nr_uops = 7, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 1}, + .uop[2] = {.type = UOP_FLOAT, .latency = 1}, + .uop[3] = {.type = UOP_FLOAT, .latency = 1}, + .uop[4] = {.type = UOP_FLOAT, .latency = 1}, + .uop[5] = {.type = UOP_FLOAT, .latency = 1}, + .uop[6] = {.type = UOP_FLOAT, .latency = 1} }; - static const risc86_instruction_t fdiv_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 38, .latency = 38} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 32} }; static const risc86_instruction_t fdiv_mem_op = { .nr_uops = 2, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_FLOAT, .throughput = 38, .latency = 38} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAT, .latency = 38} }; static const risc86_instruction_t fsin_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 62, .latency = 62} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 60} }; static const risc86_instruction_t fsqrt_op = { .nr_uops = 1, - .decode_type = DECODE_SHORT, - .uop[0] = {.type = UOP_FLOAT, .throughput = 49, .latency = 49} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 69} }; -static const risc86_instruction_t vector_fldcw_op = +static const risc86_instruction_t complex_fldcw_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 8, .latency = 8} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 10} }; -static const risc86_instruction_t vector_float_op = +static const risc86_instruction_t complex_float_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 1} }; -static const risc86_instruction_t vector_float_l_op = +static const risc86_instruction_t complex_float_l_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 50, .latency = 50} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 50} }; -static const risc86_instruction_t vector_flde_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_FLOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_FLOAT, .throughput = 2, .latency = 2} -}; -static const risc86_instruction_t vector_fste_op = +static const risc86_instruction_t complex_flde_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_FLOAT, .throughput = 2, .latency = 2}, - .uop[1] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAD, .latency = 1}, + .uop[1] = {.type = UOP_FLOAD, .latency = 1}, + .uop[2] = {.type = UOP_FLOAT, .latency = 2} +}; +static const risc86_instruction_t complex_fste_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_FLOAT, .latency = 2}, + .uop[1] = {.type = UOP_FSTORED, .latency = 1}, + .uop[2] = {.type = UOP_FSTOREA, .latency = 1} }; -static const risc86_instruction_t vector_alu1_op = +static const risc86_instruction_t complex_alu1_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alu2_op = +static const risc86_instruction_t complex_alu2_op = { .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alu3_op = +static const risc86_instruction_t complex_alu3_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alu6_op = +static const risc86_instruction_t complex_alu6_op = { .nr_uops = 6, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1}, + .uop[4] = {.type = UOP_ALU, .latency = 1}, + .uop[5] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_alux1_op = +static const risc86_instruction_t complex_alux1_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1} }; -static const risc86_instruction_t vector_alux3_op = +static const risc86_instruction_t complex_alux3_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1} }; -static const risc86_instruction_t vector_alux6_op = +static const risc86_instruction_t complex_alux6_op = { - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .nr_uops = 6, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1}, + .uop[3] = {.type = UOP_ALUX, .latency = 1}, + .uop[4] = {.type = UOP_ALUX, .latency = 1}, + .uop[5] = {.type = UOP_ALUX, .latency = 1} }; -static const risc86_instruction_t vector_alu_store_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_alux_store_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_arpl_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[1] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_bound_op = +static const risc86_instruction_t complex_alu_store_op = { .nr_uops = 4, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1}, }; -static const risc86_instruction_t vector_bsx_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_call_far_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_cli_sti_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 7, .latency = 7} -}; -static const risc86_instruction_t vector_cmps_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_cmpsb_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_cmpxchg_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, -}; -static const risc86_instruction_t vector_cmpxchg_b_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, -}; -static const risc86_instruction_t vector_cpuid_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 22, .latency = 22} -}; -static const risc86_instruction_t vector_div16_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_div16_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_div32_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 18, .latency = 18} -}; -static const risc86_instruction_t vector_div32_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 18, .latency = 18} -}; -static const risc86_instruction_t vector_emms_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 25, .latency = 25} -}; -static const risc86_instruction_t vector_enter_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 10, .latency = 10} -}; -static const risc86_instruction_t vector_femms_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 6, .latency = 6} -}; -static const risc86_instruction_t vector_in_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 10, .latency = 11} -}; -static const risc86_instruction_t vector_ins_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 10, .latency = 11}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_int_op = -{ - .nr_uops = 5, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 20, .latency = 20}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_iret_op = -{ - .nr_uops = 5, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[3] = {.type = UOP_ALU, .throughput = 20, .latency = 20}, - .uop[4] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_invd_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1000, .latency = 1000} -}; -static const risc86_instruction_t vector_jmp_far_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_load_alu_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_load_alux_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_loop_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_lss_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[2] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_mov_mem_seg_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mov_seg_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_mov_seg_reg_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 3, .latency = 3} -}; -static const risc86_instruction_t vector_mul_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mul_mem_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mul64_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_mul64_mem_op = +static const risc86_instruction_t complex_alux_store_op = { .nr_uops = 4, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} }; -static const risc86_instruction_t vector_out_op = +static const risc86_instruction_t complex_arpl_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3}, + .uop[1] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_bound_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_bsx_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 10, .latency = 10} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 10} }; -static const risc86_instruction_t vector_outs_op = +static const risc86_instruction_t complex_call_far_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_cli_sti_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 7} +}; +static const risc86_instruction_t complex_cmps_op = { .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 10, .latency = 10}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} }; -static const risc86_instruction_t vector_pusha_op = +static const risc86_instruction_t complex_cmpsb_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_cmpxchg_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_cmpxchg_b_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_STORED, .latency = 1}, + .uop[3] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_cpuid_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 23} +}; +static const risc86_instruction_t complex_div16_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 10} +}; +static const risc86_instruction_t complex_div16_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 10} +}; +static const risc86_instruction_t complex_div32_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 18} +}; +static const risc86_instruction_t complex_div32_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 18} +}; +static const risc86_instruction_t complex_emms_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 25} +}; +static const risc86_instruction_t complex_enter_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 10} +}; +static const risc86_instruction_t complex_femms_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 6} +}; +static const risc86_instruction_t complex_in_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 10} +}; +static const risc86_instruction_t complex_ins_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 10}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_int_op = { .nr_uops = 8, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[6] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[7] = {.type = UOP_STORE, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 20}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_STORED, .latency = 1}, + .uop[4] = {.type = UOP_STOREA, .latency = 1}, + .uop[5] = {.type = UOP_STORED, .latency = 1}, + .uop[6] = {.type = UOP_STOREA, .latency = 1}, + .uop[7] = {.type = UOP_BRANCH, .latency = 1} }; -static const risc86_instruction_t vector_popa_op = +static const risc86_instruction_t complex_iret_op = +{ + .nr_uops = 5, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_LOAD, .latency = 3}, + .uop[2] = {.type = UOP_LOAD, .latency = 3}, + .uop[3] = {.type = UOP_ALU, .latency = 20}, + .uop[4] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_invd_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 500} +}; +static const risc86_instruction_t complex_jmp_far_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_load_alu_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_load_alux_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_loop_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_lss_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_mov_mem_seg_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, +}; +static const risc86_instruction_t complex_mov_seg_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_mov_seg_reg_op = +{ + .nr_uops = 1, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 3} +}; +static const risc86_instruction_t complex_mul_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_mul_mem_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_mul64_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_mul64_mem_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALUX, .latency = 1}, + .uop[3] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_out_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 10}, + .uop[1] = {.type = UOP_STOREA, .latency = 10}, +}; +static const risc86_instruction_t complex_outs_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 10}, + .uop[1] = {.type = UOP_STOREA, .latency = 10}, + .uop[2] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_pusha_op = { .nr_uops = 8, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[3] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[4] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[5] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[6] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[7] = {.type = UOP_LOAD, .throughput = 1, .latency = 1} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 2}, + .uop[1] = {.type = UOP_STOREA, .latency = 2}, + .uop[2] = {.type = UOP_STORED, .latency = 2}, + .uop[3] = {.type = UOP_STOREA, .latency = 2}, + .uop[4] = {.type = UOP_STORED, .latency = 2}, + .uop[5] = {.type = UOP_STOREA, .latency = 2}, + .uop[6] = {.type = UOP_STORED, .latency = 2}, + .uop[7] = {.type = UOP_STOREA, .latency = 2} }; -static const risc86_instruction_t vector_popf_op = +static const risc86_instruction_t complex_popa_op = +{ + .nr_uops = 8, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1}, + .uop[2] = {.type = UOP_LOAD, .latency = 1}, + .uop[3] = {.type = UOP_LOAD, .latency = 1}, + .uop[4] = {.type = UOP_LOAD, .latency = 1}, + .uop[5] = {.type = UOP_LOAD, .latency = 1}, + .uop[6] = {.type = UOP_LOAD, .latency = 1}, + .uop[7] = {.type = UOP_LOAD, .latency = 1} +}; +static const risc86_instruction_t complex_popf_op = { .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 17, .latency = 17} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 3}, + .uop[1] = {.type = UOP_ALUX, .latency = 17} }; -static const risc86_instruction_t vector_push_mem_op = +static const risc86_instruction_t complex_push_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_STORED, .latency = 1}, + .uop[1] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_pushf_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1} +}; +static const risc86_instruction_t complex_ret_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_retf_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 3}, + .uop[2] = {.type = UOP_BRANCH, .latency = 1} +}; +static const risc86_instruction_t complex_scas_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_scasb_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_setcc_mem_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_FSTORED, .latency = 1}, + .uop[3] = {.type = UOP_FSTOREA, .latency = 1} +}; +static const risc86_instruction_t complex_setcc_reg_op = +{ + .nr_uops = 3, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALUX, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1}, + .uop[2] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_test_mem_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_test_mem_b_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_ALUX, .latency = 1} +}; +static const risc86_instruction_t complex_xchg_mem_op = +{ + .nr_uops = 4, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_LOAD, .latency = 1}, + .uop[1] = {.type = UOP_STORED, .latency = 1}, + .uop[2] = {.type = UOP_STOREA, .latency = 1}, + .uop[3] = {.type = UOP_ALU, .latency = 1} +}; +static const risc86_instruction_t complex_xlat_op = +{ + .nr_uops = 2, + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 1}, + .uop[1] = {.type = UOP_LOAD, .latency = 1} +}; +static const risc86_instruction_t complex_wbinvd_op = { .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_pushf_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_ret_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_retf_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 3, .latency = 3}, - .uop[2] = {.type = UOP_BRANCH, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_scas_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_scasb_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_setcc_mem_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_FSTORE, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_setcc_reg_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_test_mem_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_test_mem_b_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 2}, - .uop[1] = {.type = UOP_ALUX, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_xchg_mem_op = -{ - .nr_uops = 3, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_LOAD, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_STORE, .throughput = 1, .latency = 1}, - .uop[2] = {.type = UOP_ALU, .throughput = 1, .latency = 1} -}; -static const risc86_instruction_t vector_xlat_op = -{ - .nr_uops = 2, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 1, .latency = 1}, - .uop[1] = {.type = UOP_LOAD, .throughput = 1, .latency = 2} -}; -static const risc86_instruction_t vector_wbinvd_op = -{ - .nr_uops = 1, - .decode_type = DECODE_VECTOR, - .uop[0] = {.type = UOP_ALU, .throughput = 10000, .latency = 10000} + .decode_type = DECODE_COMPLEX, + .uop[0] = {.type = UOP_ALU, .latency = 10000} }; + #define INVALID NULL static const risc86_instruction_t *opcode_timings[256] = @@ -862,38 +945,38 @@ static const risc86_instruction_t *opcode_timings[256] = /* ADD ADD ADD ADD*/ /*00*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* ADD ADD PUSH ES POP ES*/ - &alux_op, &alu_op, &push_seg_op, &vector_mov_seg_mem_op, + &alux_op, &alu_op, &push_seg_op, &complex_mov_seg_mem_op, /* OR OR OR OR*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* OR OR PUSH CS */ &alux_op, &alu_op, &push_seg_op, INVALID, /* ADC ADC ADC ADC*/ -/*10*/ &vector_alux_store_op, &vector_alu_store_op, &vector_load_alux_op, &vector_load_alu_op, +/*10*/ &complex_alux_store_op,&complex_alu_store_op, &complex_load_alux_op,&complex_load_alu_op, /* ADC ADC PUSH SS POP SS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* SBB SBB SBB SBB*/ -/*10*/ &vector_alux_store_op, &vector_alu_store_op, &vector_load_alux_op, &vector_load_alu_op, +/*10*/ &complex_alux_store_op,&complex_alu_store_op, &complex_load_alux_op,&complex_load_alu_op, /* SBB SBB PUSH DS POP DS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* AND AND AND AND*/ /*20*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* AND AND DAA*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* SUB SUB SUB SUB*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* SUB SUB DAS*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* XOR XOR XOR XOR*/ /*30*/ &alux_store_op, &alu_store_op, &load_alux_op, &load_alu_op, /* XOR XOR AAA*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* CMP CMP CMP CMP*/ &load_alux_op, &load_alu_op, &load_alux_op, &load_alu_op, /* CMP CMP AAS*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* INC EAX INC ECX INC EDX INC EBX*/ /*40*/ &alu_op, &alu_op, &alu_op, &alu_op, @@ -914,12 +997,12 @@ static const risc86_instruction_t *opcode_timings[256] = &pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op, /* PUSHA POPA BOUND ARPL*/ -/*60*/ &vector_pusha_op, &vector_popa_op, &vector_bound_op, &vector_arpl_op, +/*60*/ &complex_pusha_op, &complex_popa_op, &complex_bound_op, &complex_arpl_op, INVALID, INVALID, INVALID, INVALID, /* PUSH imm IMUL PUSH imm IMUL*/ - &push_imm_op, &vector_mul_op, &push_imm_op, &vector_mul_op, + &push_imm_op, &complex_mul_op, &push_imm_op, &complex_mul_op, /* INSB INSW OUTSB OUTSW*/ - &vector_ins_op, &vector_ins_op, &vector_outs_op, &vector_outs_op, + &complex_ins_op, &complex_ins_op, &complex_outs_op, &complex_outs_op, /* Jxx*/ /*70*/ &branch_op, &branch_op, &branch_op, &branch_op, @@ -929,29 +1012,29 @@ static const risc86_instruction_t *opcode_timings[256] = /*80*/ INVALID, INVALID, INVALID, INVALID, /* TEST TEST XCHG XCHG*/ - &vector_test_mem_b_op, &vector_test_mem_op, &vector_xchg_mem_op, &vector_xchg_mem_op, + &complex_test_mem_b_op, &complex_test_mem_op, &complex_xchg_mem_op, &complex_xchg_mem_op, /* MOV MOV MOV MOV*/ &store_op, &store_op, &load_op, &load_op, /* MOV from seg LEA MOV to seg POP*/ - &vector_mov_mem_seg_op, &store_op, &vector_mov_seg_mem_op, &pop_mem_op, + &complex_mov_mem_seg_op, &store_op, &complex_mov_seg_mem_op, &pop_mem_op, /* NOP XCHG XCHG XCHG*/ /*90*/ &limm_op, &xchg_op, &xchg_op, &xchg_op, /* XCHG XCHG XCHG XCHG*/ &xchg_op, &xchg_op, &xchg_op, &xchg_op, /* CBW CWD CALL far WAIT*/ - &vector_alu1_op, &vector_alu1_op, &vector_call_far_op, &limm_op, + &complex_alu1_op, &complex_alu1_op, &complex_call_far_op, &limm_op, /* PUSHF POPF SAHF LAHF*/ - &vector_pushf_op, &vector_popf_op, &vector_alux1_op, &vector_alux1_op, + &complex_pushf_op, &complex_popf_op, &complex_alux1_op, &complex_alux1_op, /* MOV MOV MOV MOV*/ /*a0*/ &load_op, &load_op, &store_op, &store_op, /* MOVSB MOVSW CMPSB CMPSW*/ - &movs_op, &movs_op, &vector_cmpsb_op, &vector_cmps_op, + &movs_op, &movs_op, &complex_cmpsb_op, &complex_cmps_op, /* TEST TEST STOSB STOSW*/ &test_reg_b_op, &test_reg_op, &stos_op, &stos_op, /* LODSB LODSW SCASB SCASW*/ - &lods_op, &lods_op, &vector_scasb_op, &vector_scas_op, + &lods_op, &lods_op, &complex_scasb_op, &complex_scas_op, /* MOV*/ /*b0*/ &limm_op, &limm_op, &limm_op, &limm_op, @@ -960,37 +1043,37 @@ static const risc86_instruction_t *opcode_timings[256] = &limm_op, &limm_op, &limm_op, &limm_op, /* RET imm RET*/ -/*c0*/ INVALID, INVALID, &vector_ret_op, &vector_ret_op, +/*c0*/ INVALID, INVALID, &complex_ret_op, &complex_ret_op, /* LES LDS MOV MOV*/ - &vector_lss_op, &vector_lss_op, &store_op, &store_op, + &complex_lss_op, &complex_lss_op, &store_op, &store_op, /* ENTER LEAVE RETF RETF*/ - &vector_enter_op, &leave_op, &vector_retf_op, &vector_retf_op, + &complex_enter_op, &leave_op, &complex_retf_op, &complex_retf_op, /* INT3 INT INTO IRET*/ - &vector_int_op, &vector_int_op, &vector_int_op, &vector_iret_op, + &complex_int_op, &complex_int_op, &complex_int_op, &complex_iret_op, /*d0*/ INVALID, INVALID, INVALID, INVALID, /* AAM AAD SETALC XLAT*/ - &vector_alux6_op, &vector_alux3_op, &vector_alux1_op, &vector_xlat_op, + &complex_alux6_op, &complex_alux3_op, &complex_alux1_op, &complex_xlat_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* LOOPNE LOOPE LOOP JCXZ*/ -/*e0*/ &vector_loop_op, &vector_loop_op, &loop_op, &vector_loop_op, +/*e0*/ &complex_loop_op, &complex_loop_op, &loop_op, &complex_loop_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* CALL JMP JMP JMP*/ - &store_op, &branch_op, &vector_jmp_far_op, &branch_op, + &store_op, &branch_op, &complex_jmp_far_op, &branch_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* REPNE REPE*/ /*f0*/ INVALID, INVALID, INVALID, INVALID, /* HLT CMC*/ - &vector_alux1_op, &vector_alu2_op, INVALID, INVALID, + &complex_alux1_op, &complex_alu2_op, INVALID, INVALID, /* CLC STC CLI STI*/ - &vector_alu1_op, &vector_alu1_op, &vector_cli_sti_op, &vector_cli_sti_op, + &complex_alu1_op, &complex_alu1_op, &complex_cli_sti_op, &complex_cli_sti_op, /* CLD STD INCDEC*/ - &vector_alu1_op, &vector_alu1_op, &alux_store_op, INVALID + &complex_alu1_op, &complex_alu1_op, &alux_store_op, INVALID }; static const risc86_instruction_t *opcode_timings_mod3[256] = @@ -998,38 +1081,38 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = /* ADD ADD ADD ADD*/ /*00*/ &alux_op, &alu_op, &alux_op, &alu_op, /* ADD ADD PUSH ES POP ES*/ - &alux_op, &alu_op, &push_seg_op, &vector_mov_seg_mem_op, + &alux_op, &alu_op, &push_seg_op, &complex_mov_seg_mem_op, /* OR OR OR OR*/ &alux_op, &alu_op, &alux_op, &alu_op, /* OR OR PUSH CS */ &alux_op, &alu_op, &push_seg_op, INVALID, /* ADC ADC ADC ADC*/ -/*10*/ &vector_alux1_op, &vector_alu1_op, &vector_alux1_op, &vector_alu1_op, +/*10*/ &complex_alux1_op, &complex_alu1_op, &complex_alux1_op, &complex_alu1_op, /* ADC ADC PUSH SS POP SS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* SBB SBB SBB SBB*/ - &vector_alux1_op, &vector_alu1_op, &vector_alux1_op, &vector_alu1_op, + &complex_alux1_op, &complex_alu1_op, &complex_alux1_op, &complex_alu1_op, /* SBB SBB PUSH DS POP DS*/ - &vector_alux1_op, &vector_alu1_op, &push_seg_op, &vector_mov_seg_mem_op, + &complex_alux1_op, &complex_alu1_op, &push_seg_op, &complex_mov_seg_mem_op, /* AND AND AND AND*/ /*20*/ &alux_op, &alu_op, &alux_op, &alu_op, /* AND AND DAA*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* SUB SUB SUB SUB*/ &alux_op, &alu_op, &alux_op, &alu_op, /* SUB SUB DAS*/ - &alux_op, &alu_op, INVALID, &vector_alux1_op, + &alux_op, &alu_op, INVALID, &complex_alux1_op, /* XOR XOR XOR XOR*/ /*30*/ &alux_op, &alu_op, &alux_op, &alu_op, /* XOR XOR AAA*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* CMP CMP CMP CMP*/ &alux_op, &alu_op, &alux_op, &alu_op, /* CMP CMP AAS*/ - &alux_op, &alu_op, INVALID, &vector_alux6_op, + &alux_op, &alu_op, INVALID, &complex_alux6_op, /* INC EAX INC ECX INC EDX INC EBX*/ /*40*/ &alu_op, &alu_op, &alu_op, &alu_op, @@ -1050,12 +1133,12 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = &pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op, /* PUSHA POPA BOUND ARPL*/ -/*60*/ &vector_pusha_op, &vector_popa_op, &vector_bound_op, &vector_arpl_op, +/*60*/ &complex_pusha_op, &complex_popa_op, &complex_bound_op, &complex_arpl_op, INVALID, INVALID, INVALID, INVALID, /* PUSH imm IMUL PUSH imm IMUL*/ - &push_imm_op, &vector_mul_op, &push_imm_op, &vector_mul_op, + &push_imm_op, &complex_mul_op, &push_imm_op, &complex_mul_op, /* INSB INSW OUTSB OUTSW*/ - &vector_ins_op, &vector_ins_op, &vector_outs_op, &vector_outs_op, + &complex_ins_op, &complex_ins_op, &complex_outs_op, &complex_outs_op, /* Jxx*/ /*70*/ &branch_op, &branch_op, &branch_op, &branch_op, @@ -1065,29 +1148,29 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = /*80*/ INVALID, INVALID, INVALID, INVALID, /* TEST TEST XCHG XCHG*/ - &vector_alu1_op, &vector_alu1_op, &vector_alu3_op, &vector_alu3_op, + &complex_alu1_op, &complex_alu1_op, &complex_alu3_op, &complex_alu3_op, /* MOV MOV MOV MOV*/ &store_op, &store_op, &load_op, &load_op, /* MOV from seg LEA MOV to seg POP*/ - &mov_reg_seg_op, &store_op, &vector_mov_seg_reg_op, &pop_reg_op, + &mov_reg_seg_op, &store_op, &complex_mov_seg_reg_op, &pop_reg_op, /* NOP XCHG XCHG XCHG*/ /*90*/ &limm_op, &xchg_op, &xchg_op, &xchg_op, /* XCHG XCHG XCHG XCHG*/ &xchg_op, &xchg_op, &xchg_op, &xchg_op, /* CBW CWD CALL far WAIT*/ - &vector_alu1_op, &vector_alu1_op, &vector_call_far_op, &limm_op, + &complex_alu1_op, &complex_alu1_op, &complex_call_far_op, &limm_op, /* PUSHF POPF SAHF LAHF*/ - &vector_pushf_op, &vector_popf_op, &vector_alux1_op, &vector_alux1_op, + &complex_pushf_op, &complex_popf_op, &complex_alux1_op, &complex_alux1_op, /* MOV MOV MOV MOV*/ /*a0*/ &load_op, &load_op, &store_op, &store_op, /* MOVSB MOVSW CMPSB CMPSW*/ - &movs_op, &movs_op, &vector_cmpsb_op, &vector_cmps_op, + &movs_op, &movs_op, &complex_cmpsb_op, &complex_cmps_op, /* TEST TEST STOSB STOSW*/ &test_reg_b_op, &test_reg_op, &stos_op, &stos_op, /* LODSB LODSW SCASB SCASW*/ - &lods_op, &lods_op, &vector_scasb_op, &vector_scas_op, + &lods_op, &lods_op, &complex_scasb_op, &complex_scas_op, /* MOV*/ /*b0*/ &limm_op, &limm_op, &limm_op, &limm_op, @@ -1096,57 +1179,57 @@ static const risc86_instruction_t *opcode_timings_mod3[256] = &limm_op, &limm_op, &limm_op, &limm_op, /* RET imm RET*/ -/*c0*/ INVALID, INVALID, &vector_ret_op, &vector_ret_op, +/*c0*/ INVALID, INVALID, &complex_ret_op, &complex_ret_op, /* LES LDS MOV MOV*/ - &vector_lss_op, &vector_lss_op, &store_op, &store_op, + &complex_lss_op, &complex_lss_op, &store_op, &store_op, /* ENTER LEAVE RETF RETF*/ - &vector_enter_op, &leave_op, &vector_retf_op, &vector_retf_op, + &complex_enter_op, &leave_op, &complex_retf_op, &complex_retf_op, /* INT3 INT INTO IRET*/ - &vector_int_op, &vector_int_op, &vector_int_op, &vector_iret_op, + &complex_int_op, &complex_int_op, &complex_int_op, &complex_iret_op, /*d0*/ INVALID, INVALID, INVALID, INVALID, /* AAM AAD SETALC XLAT*/ - &vector_alux6_op, &vector_alux3_op, &vector_alux1_op, &vector_xlat_op, + &complex_alux6_op, &complex_alux3_op, &complex_alux1_op, &complex_xlat_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* LOOPNE LOOPE LOOP JCXZ*/ -/*e0*/ &vector_loop_op, &vector_loop_op, &loop_op, &vector_loop_op, +/*e0*/ &complex_loop_op, &complex_loop_op, &loop_op, &complex_loop_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* CALL JMP JMP JMP*/ - &store_op, &branch_op, &vector_jmp_far_op, &branch_op, + &store_op, &branch_op, &complex_jmp_far_op, &branch_op, /* IN AL IN AX OUT_AL OUT_AX*/ - &vector_in_op, &vector_in_op, &vector_out_op, &vector_out_op, + &complex_in_op, &complex_in_op, &complex_out_op, &complex_out_op, /* REPNE REPE*/ /*f0*/ INVALID, INVALID, INVALID, INVALID, /* HLT CMC*/ - &vector_alux1_op, &vector_alu2_op, INVALID, INVALID, + &complex_alux1_op, &complex_alu2_op, INVALID, INVALID, /* CLC STC CLI STI*/ - &vector_alu1_op, &vector_alu1_op, &vector_cli_sti_op, &vector_cli_sti_op, + &complex_alu1_op, &complex_alu1_op, &complex_cli_sti_op, &complex_cli_sti_op, /* CLD STD INCDEC*/ - &vector_alu1_op, &vector_alu1_op, &vector_alux1_op, INVALID + &complex_alu1_op, &complex_alu1_op, &complex_alux1_op, INVALID }; static const risc86_instruction_t *opcode_timings_0f[256] = { -/*00*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - INVALID, &vector_alu6_op, &vector_alu6_op, INVALID, - &vector_invd_op, &vector_wbinvd_op, INVALID, INVALID, - INVALID, &load_op, &vector_femms_op, INVALID, +/*00*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + INVALID, &complex_alu6_op, &complex_alu6_op, INVALID, + &complex_invd_op, &complex_wbinvd_op, INVALID, INVALID, + INVALID, &load_op, &complex_femms_op, INVALID, /*10*/ INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, -/*20*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - &vector_alu6_op, &vector_alu6_op, INVALID, INVALID, +/*20*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + &complex_alu6_op, &complex_alu6_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, -/*30*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, INVALID, +/*30*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, @@ -1167,7 +1250,7 @@ static const risc86_instruction_t *opcode_timings_0f[256] = INVALID, INVALID, &mload_op, &mload_op, /*70*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, - &load_mmx_op, &load_mmx_op, &load_mmx_op, &vector_emms_op, + &load_mmx_op, &load_mmx_op, &load_mmx_op, &complex_emms_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, &mstore_op, &mstore_op, @@ -1176,122 +1259,122 @@ static const risc86_instruction_t *opcode_timings_0f[256] = &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, -/*90*/ &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, - &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, - &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, - &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, &vector_setcc_reg_op, +/*90*/ &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, + &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, + &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, + &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, &complex_setcc_reg_op, -/*a0*/ &push_seg_op, &vector_mov_seg_mem_op, &vector_cpuid_op, &vector_load_alu_op, - &vector_alu_store_op, &vector_alu_store_op, INVALID, INVALID, - &push_seg_op, &vector_mov_seg_mem_op, INVALID, &vector_load_alu_op, - &vector_alu_store_op, &vector_alu_store_op, INVALID, &vector_mul_op, +/*a0*/ &push_seg_op, &complex_mov_seg_mem_op,&complex_cpuid_op, &complex_load_alu_op, + &complex_alu_store_op, &complex_alu_store_op, INVALID, INVALID, + &push_seg_op, &complex_mov_seg_mem_op,INVALID, &complex_load_alu_op, + &complex_alu_store_op, &complex_alu_store_op, INVALID, &complex_mul_op, -/*b0*/ &vector_cmpxchg_b_op, &vector_cmpxchg_op, &vector_lss_op, &vector_load_alu_op, - &vector_lss_op, &vector_lss_op, &load_alux_op, &load_alu_op, - INVALID, INVALID, &vector_load_alu_op, &vector_load_alu_op, - &vector_bsx_op, &vector_bsx_op, &load_alux_op, &load_alu_op, +/*b0*/ &complex_cmpxchg_b_op, &complex_cmpxchg_op, &complex_lss_op, &complex_load_alu_op, + &complex_lss_op, &complex_lss_op, &load_alux_op, &load_alu_op, + INVALID, INVALID, &complex_load_alu_op, &complex_load_alu_op, + &complex_bsx_op, &complex_bsx_op, &load_alux_op, &load_alu_op, -/*c0*/ &vector_alux_store_op, &vector_alu_store_op, INVALID, INVALID, - INVALID, INVALID, INVALID, &vector_cmpxchg_op, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, +/*c0*/ &complex_alux_store_op, &complex_alu_store_op, INVALID, INVALID, + INVALID, INVALID, INVALID, &complex_cmpxchg_op, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, -/*d0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, - INVALID, &load_mmx_mul_op, INVALID, INVALID, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, +/*d0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, + INVALID, &load_mmx_mul_op, INVALID, INVALID, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, -/*e0*/ &load_mmx_op, &load_mmx_shift_op, &load_mmx_shift_op, INVALID, - INVALID, &pmul_mem_op, INVALID, INVALID, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, - &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, +/*e0*/ &load_mmx_op, &load_mmx_shift_op, &load_mmx_shift_op, INVALID, + INVALID, &pmul_mem_op, INVALID, INVALID, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, + &load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op, -/*f0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, - INVALID, &pmul_mem_op, INVALID, INVALID, - &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, - &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, +/*f0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op, + INVALID, &pmul_mem_op, INVALID, INVALID, + &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, + &load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID, }; static const risc86_instruction_t *opcode_timings_0f_mod3[256] = { -/*00*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - INVALID, &vector_alu6_op, &vector_alu6_op, INVALID, - &vector_invd_op, &vector_wbinvd_op, INVALID, INVALID, - INVALID, INVALID, &vector_femms_op, INVALID, +/*00*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + INVALID, &complex_alu6_op, &complex_alu6_op, INVALID, + &complex_invd_op, &complex_wbinvd_op, INVALID, INVALID, + INVALID, INVALID, &complex_femms_op, INVALID, -/*10*/ INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*10*/ INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*20*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, - &vector_alu6_op, &vector_alu6_op, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*20*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, + &complex_alu6_op, &complex_alu6_op, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*30*/ &vector_alu6_op, &vector_alu6_op, &vector_alu6_op, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*30*/ &complex_alu6_op, &complex_alu6_op, &complex_alu6_op, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*40*/ INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*40*/ INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*50*/ INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, +/*50*/ INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, -/*60*/ &mmx_op, &mmx_op, &mmx_op, &mmx_op, - &mmx_op, &mmx_op, &mmx_op, &mmx_op, - &mmx_op, &mmx_op, &mmx_op, &mmx_op, - INVALID, INVALID, &mmx_op, &mmx_op, +/*60*/ &mmx_op, &mmx_op, &mmx_op, &mmx_op, + &mmx_op, &mmx_op, &mmx_op, &mmx_op, + &mmx_op, &mmx_op, &mmx_op, &mmx_op, + INVALID, INVALID, &mmx_op, &mmx_op, -/*70*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, - &mmx_op, &mmx_op, &mmx_op, &vector_emms_op, - INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, &mmx_op, &mmx_op, +/*70*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, + &mmx_op, &mmx_op, &mmx_op, &complex_emms_op, + INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, &mmx_op, &mmx_op, /*80*/ &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, &branch_op, -/*90*/ &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, - &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, - &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, - &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, &vector_setcc_mem_op, +/*90*/ &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, + &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, + &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, + &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, &complex_setcc_mem_op, -/*a0*/ &push_seg_op, &vector_mov_seg_mem_op, &vector_cpuid_op, &vector_alu1_op, - &vector_alu1_op, &vector_alu1_op, INVALID, INVALID, - &push_seg_op, &vector_mov_seg_mem_op, INVALID, &vector_alu1_op, - &vector_alu1_op, &vector_alu1_op, INVALID, &vector_mul_op, +/*a0*/ &push_seg_op, &complex_mov_seg_mem_op, &complex_cpuid_op, &complex_alu1_op, + &complex_alu1_op, &complex_alu1_op, INVALID, INVALID, + &push_seg_op, &complex_mov_seg_mem_op, INVALID, &complex_alu1_op, + &complex_alu1_op, &complex_alu1_op, INVALID, &complex_mul_op, -/*b0*/ &vector_cmpxchg_b_op, &vector_cmpxchg_op, &vector_lss_op, &vector_alu1_op, - &vector_lss_op, &vector_lss_op, &alux_op, &alu_op, - INVALID, INVALID, &vector_alu1_op, &vector_alu1_op, - &vector_bsx_op, &vector_bsx_op, &alux_op, &alu_op, +/*b0*/ &complex_cmpxchg_b_op, &complex_cmpxchg_op, &complex_lss_op, &complex_alu1_op, + &complex_lss_op, &complex_lss_op, &alux_op, &alu_op, + INVALID, INVALID, &complex_alu1_op, &complex_alu1_op, + &complex_bsx_op, &complex_bsx_op, &alux_op, &alu_op, -/*c0*/ &vector_alux1_op, &vector_alu1_op, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, - &bswap_op, &bswap_op, &bswap_op, &bswap_op, +/*c0*/ &complex_alux1_op, &complex_alu1_op, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, + &bswap_op, &bswap_op, &bswap_op, &bswap_op, -/*d0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, - INVALID, &mmx_mul_op, INVALID, INVALID, - &mmx_op, &mmx_op, INVALID, &mmx_op, - &mmx_op, &mmx_op, INVALID, &mmx_op, +/*d0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, + INVALID, &mmx_mul_op, INVALID, INVALID, + &mmx_op, &mmx_op, INVALID, &mmx_op, + &mmx_op, &mmx_op, INVALID, &mmx_op, -/*e0*/ &mmx_op, &mmx_shift_op, &mmx_shift_op, INVALID, - INVALID, &pmul_op, INVALID, INVALID, - &mmx_op, &mmx_op, INVALID, &mmx_op, - &mmx_op, &mmx_op, INVALID, &mmx_op, +/*e0*/ &mmx_op, &mmx_shift_op, &mmx_shift_op, INVALID, + INVALID, &pmul_op, INVALID, INVALID, + &mmx_op, &mmx_op, INVALID, &mmx_op, + &mmx_op, &mmx_op, INVALID, &mmx_op, -/*f0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, - INVALID, &pmul_op, INVALID, INVALID, - &mmx_op, &mmx_op, &mmx_op, INVALID, - &mmx_op, &mmx_op, &mmx_op, INVALID, +/*f0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op, + INVALID, &pmul_op, INVALID, INVALID, + &mmx_op, &mmx_op, &mmx_op, INVALID, + &mmx_op, &mmx_op, &mmx_op, INVALID, }; static const risc86_instruction_t *opcode_timings_0f0f[256] = @@ -1463,100 +1546,100 @@ static const risc86_instruction_t *opcode_timings_0f0f_mod3[256] = static const risc86_instruction_t *opcode_timings_shift[8] = { - &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op, - &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op, &vector_alu_store_op + &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op, + &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op, &complex_alu_store_op }; static const risc86_instruction_t *opcode_timings_shift_b[8] = { - &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op, - &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op, &vector_alux_store_op + &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op, + &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op, &complex_alux_store_op }; static const risc86_instruction_t *opcode_timings_shift_mod3[8] = { - &vector_alu1_op, &vector_alu1_op, &vector_alu1_op, &vector_alu1_op, - &alu_op, &alu_op, &alu_op, &alu_op + &complex_alu1_op, &complex_alu1_op, &complex_alu1_op, &complex_alu1_op, + &alu_op, &alu_op, &alu_op, &alu_op }; static const risc86_instruction_t *opcode_timings_shift_b_mod3[8] = { - &vector_alux1_op, &vector_alux1_op, &vector_alux1_op, &vector_alux1_op, - &alux_op, &alux_op, &alux_op, &alux_op + &complex_alux1_op, &complex_alux1_op, &complex_alux1_op, &complex_alux1_op, + &alux_op, &alux_op, &alux_op, &alux_op }; static const risc86_instruction_t *opcode_timings_80[8] = { - &alux_store_op, &alux_store_op, &vector_alux_store_op, &vector_alux_store_op, - &alux_store_op, &alux_store_op, &alux_store_op, &alux_store_op, + &alux_store_op, &alux_store_op, &complex_alux_store_op, &complex_alux_store_op, + &alux_store_op, &alux_store_op, &alux_store_op, &alux_store_op, }; static const risc86_instruction_t *opcode_timings_80_mod3[8] = { - &alux_op, &alux_op, &alux_store_op, &alux_store_op, - &alux_op, &alux_op, &alux_op, &alux_op, + &alux_op, &alux_op, &alux_store_op, &alux_store_op, + &alux_op, &alux_op, &alux_op, &alux_op, }; static const risc86_instruction_t *opcode_timings_8x[8] = { - &alu_store_op, &alu_store_op, &vector_alu_store_op, &vector_alu_store_op, - &alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op, + &alu_store_op, &alu_store_op, &complex_alu_store_op, &complex_alu_store_op, + &alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op, }; static const risc86_instruction_t *opcode_timings_8x_mod3[8] = { - &alu_op, &alu_op, &alu_store_op, &alu_store_op, - &alu_op, &alu_op, &alu_op, &alu_op, + &alu_op, &alu_op, &alu_store_op, &alu_store_op, + &alu_op, &alu_op, &alu_op, &alu_op, }; static const risc86_instruction_t *opcode_timings_f6[8] = { /* TST NOT NEG*/ - &test_mem_imm_b_op, INVALID, &vector_alux_store_op, &vector_alux_store_op, + &test_mem_imm_b_op, INVALID, &complex_alux_store_op, &complex_alux_store_op, /* MUL IMUL DIV IDIV*/ - &vector_mul_mem_op, &vector_mul_mem_op, &vector_div16_mem_op, &vector_div16_mem_op, + &complex_mul_mem_op, &complex_mul_mem_op, &complex_div16_mem_op, &complex_div16_mem_op, }; static const risc86_instruction_t *opcode_timings_f6_mod3[8] = { /* TST NOT NEG*/ &test_reg_b_op, INVALID, &alux_op, &alux_op, /* MUL IMUL DIV IDIV*/ - &vector_mul_op, &vector_mul_op, &vector_div16_op, &vector_div16_op, + &complex_mul_op, &complex_mul_op, &complex_div16_op, &complex_div16_op, }; static const risc86_instruction_t *opcode_timings_f7[8] = { /* TST NOT NEG*/ - &test_mem_imm_op, INVALID, &vector_alu_store_op, &vector_alu_store_op, + &test_mem_imm_op, INVALID, &complex_alu_store_op, &complex_alu_store_op, /* MUL IMUL DIV IDIV*/ - &vector_mul64_mem_op, &vector_mul64_mem_op, &vector_div32_mem_op, &vector_div32_mem_op, + &complex_mul64_mem_op, &complex_mul64_mem_op, &complex_div32_mem_op, &complex_div32_mem_op, }; static const risc86_instruction_t *opcode_timings_f7_mod3[8] = { /* TST NOT NEG*/ &test_reg_op, INVALID, &alu_op, &alu_op, /* MUL IMUL DIV IDIV*/ - &vector_mul64_op, &vector_mul64_op, &vector_div32_op, &vector_div32_op, + &complex_mul64_op, &complex_mul64_op, &complex_div32_op, &complex_div32_op, }; static const risc86_instruction_t *opcode_timings_ff[8] = { /* INC DEC CALL CALL far*/ - &alu_store_op, &alu_store_op, &store_op, &vector_call_far_op, + &alu_store_op, &alu_store_op, &store_op, &complex_call_far_op, /* JMP JMP far PUSH*/ - &branch_op, &vector_jmp_far_op, &push_mem_op, INVALID + &branch_op, &complex_jmp_far_op, &push_mem_op, INVALID }; static const risc86_instruction_t *opcode_timings_ff_mod3[8] = { /* INC DEC CALL CALL far*/ - &vector_alu1_op, &vector_alu1_op, &store_op, &vector_call_far_op, + &complex_alu1_op, &complex_alu1_op, &store_op, &complex_call_far_op, /* JMP JMP far PUSH*/ - &branch_op, &vector_jmp_far_op, &vector_push_mem_op, INVALID + &branch_op, &complex_jmp_far_op, &complex_push_mem_op, INVALID }; static const risc86_instruction_t *opcode_timings_d8[8] = { /* FADDs FMULs FCOMs FCOMPs*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op, /* FSUBs FSUBRs FDIVs FDIVRs*/ &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, }; static const risc86_instruction_t *opcode_timings_d8_mod3[8] = { /* FADD FMUL FCOM FCOMP*/ - &float_op, &float_op, &float_op, &float_op, + &fadd_op, &fmul_op, &float_op, &float_op, /* FSUB FSUBR FDIV FDIVR*/ &float_op, &float_op, &fdiv_op, &fdiv_op, }; @@ -1566,7 +1649,7 @@ static const risc86_instruction_t *opcode_timings_d9[8] = /* FLDs FSTs FSTPs*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FLDENV FLDCW FSTENV FSTCW*/ - &vector_float_l_op, &vector_fldcw_op, &vector_float_l_op, &vector_float_op + &complex_float_l_op, &complex_fldcw_op, &complex_float_l_op, &complex_float_op }; static const risc86_instruction_t *opcode_timings_d9_mod3[64] = { @@ -1574,16 +1657,16 @@ static const risc86_instruction_t *opcode_timings_d9_mod3[64] = &float_op, &float_op, &float_op, &float_op, &float_op, &float_op, &float_op, &float_op, /*FXCH*/ - &float_op, &float_op, &float_op, &float_op, - &float_op, &float_op, &float_op, &float_op, + &limm_op, &limm_op, &limm_op, &limm_op, + &limm_op, &limm_op, &limm_op, &limm_op, /*FNOP*/ &float_op, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /*FSTP*/ - &float_op, &float_op, &float_op, &float_op, - &float_op, &float_op, &float_op, &float_op, + &float2_op, &float2_op, &float2_op, &float2_op, + &float2_op, &float2_op, &float2_op, &float2_op, /* opFCHS opFABS*/ - &float_op, &float_op, INVALID, INVALID, + &fchs_op, &float_op, INVALID, INVALID, /* opFTST opFXAM*/ &float_op, &float_op, INVALID, INVALID, /* opFLD1 opFLDL2T opFLDL2E opFLDPI*/ @@ -1603,7 +1686,7 @@ static const risc86_instruction_t *opcode_timings_d9_mod3[64] = static const risc86_instruction_t *opcode_timings_da[8] = { /* FIADDl FIMULl FICOMl FICOMPl*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op, /* FISUBl FISUBRl FIDIVl FIDIVRl*/ &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, }; @@ -1619,7 +1702,7 @@ static const risc86_instruction_t *opcode_timings_db[8] = /* FLDil FSTil FSTPil*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FLDe FSTPe*/ - INVALID, &vector_flde_op, INVALID, &vector_fste_op + INVALID, &complex_flde_op, INVALID, &complex_fste_op }; static const risc86_instruction_t *opcode_timings_db_mod3[64] = { @@ -1653,14 +1736,14 @@ static const risc86_instruction_t *opcode_timings_db_mod3[64] = static const risc86_instruction_t *opcode_timings_dc[8] = { /* FADDd FMULd FCOMd FCOMPd*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op, /* FSUBd FSUBRd FDIVd FDIVRd*/ &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, }; static const risc86_instruction_t *opcode_timings_dc_mod3[8] = { /* opFADDr opFMULr*/ - &float_op, &float_op, INVALID, INVALID, + &fadd_op, &fmul_op, INVALID, INVALID, /* opFSUBRr opFSUBr opFDIVRr opFDIVr*/ &float_op, &float_op, &fdiv_op, &fdiv_op }; @@ -1670,7 +1753,7 @@ static const risc86_instruction_t *opcode_timings_dd[8] = /* FLDd FSTd FSTPd*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FRSTOR FSAVE FSTSW*/ - &vector_float_l_op, INVALID, &vector_float_l_op, &vector_float_l_op + &complex_float_l_op, INVALID, &complex_float_l_op, &complex_float_l_op }; static const risc86_instruction_t *opcode_timings_dd_mod3[8] = { @@ -1683,14 +1766,14 @@ static const risc86_instruction_t *opcode_timings_dd_mod3[8] = static const risc86_instruction_t *opcode_timings_de[8] = { /* FIADDw FIMULw FICOMw FICOMPw*/ - &load_float_op, &load_float_op, &load_float_op, &load_float_op, + &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, /* FISUBw FISUBRw FIDIVw FIDIVRw*/ - &load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op, + &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, &load_fiadd_op, }; static const risc86_instruction_t *opcode_timings_de_mod3[8] = { /* FADDP FMULP FCOMPP*/ - &float_op, &float_op, INVALID, &float_op, + &fadd_op, &fmul_op, INVALID, &float_op, /* FSUBP FSUBRP FDIVP FDIVRP*/ &float_op, &float_op, &fdiv_op, &fdiv_op, }; @@ -1700,7 +1783,7 @@ static const risc86_instruction_t *opcode_timings_df[8] = /* FILDiw FISTiw FISTPiw*/ &load_float_op, INVALID, &fstore_op, &fstore_op, /* FILDiq FBSTP FISTPiq*/ - INVALID, &load_float_op, &vector_float_l_op, &fstore_op, + INVALID, &load_float_op, &complex_float_l_op, &fstore_op, }; static const risc86_instruction_t *opcode_timings_df_mod3[8] = { @@ -1728,34 +1811,27 @@ static p6_unit_t *units; /*Pentium Pro has no MMX*/ static p6_unit_t ppro_units[] = { - {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX)}, /*Integer X*/ - {.uop_mask = (1 << UOP_ALU)}, /*Integer Y*/ - {.uop_mask = (1 << UOP_FLOAT)}, /*Floating point*/ - {.uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD) | (1 << UOP_MLOAD)}, /*Load*/ - {.uop_mask = (1 << UOP_STORE) | (1 << UOP_FSTORE) | (1 << UOP_MSTORE)}, /*Store*/ - {.uop_mask = (1 << UOP_BRANCH)} /*Branch*/ + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX) | (1 << UOP_FLOAT)}, /*Integer X & Floating point*/ + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_BRANCH)}, /*Integer Y*/ + {.uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD)}, /*Load*/ + {.uop_mask = (1 << UOP_STORED) | (1 << UOP_FSTORED)}, /*Data Store*/ + {.uop_mask = (1 << UOP_STOREA) | (1 << UOP_FSTOREA)}, /*Address Store*/ }; #define NR_PPRO_UNITS (sizeof(ppro_units) / sizeof(p6_unit_t)) -/*Well, it works I guess*/ +/*Pentium II/Celeron assigns the multiplier to port 0, the shifter to port 1, and shares the MMX ALU*/ static p6_unit_t p2_units[] = { - {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX) | (1 << UOP_MEU) | /*Integer X*/ - (1 << UOP_MEU_SHIFT) | (1 << UOP_MEU_MUL)}, - {.uop_mask = (1 << UOP_ALU) | (1 << UOP_MEU) | /*Integer Y*/ - (1 << UOP_MEU_SHIFT) | (1 << UOP_MEU_MUL)}, - {.uop_mask = (1 << UOP_FLOAT)}, /*Floating point*/ + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUX) | (1 << UOP_FLOAT) | /*Integer X & Floating point*/ + (1 << UOP_MMX) | (1 << UOP_MMX_MUL)}, + {.uop_mask = (1 << UOP_ALU) | (1 << UOP_BRANCH) | /*Integer Y*/ + (1 << UOP_MMX) | (1 << UOP_MMX_SHIFT)}, {.uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD) | (1 << UOP_MLOAD)}, /*Load*/ - {.uop_mask = (1 << UOP_STORE) | (1 << UOP_FSTORE) | (1 << UOP_MSTORE)}, /*Store*/ - {.uop_mask = (1 << UOP_BRANCH)} /*Branch*/ + {.uop_mask = (1 << UOP_STORED) | (1 << UOP_FSTORED) | (1 << UOP_MSTORED)}, /*Data Store*/ + {.uop_mask = (1 << UOP_STOREA) | (1 << UOP_FSTOREA) | (1 << UOP_MSTOREA)}, /*Address Store*/ }; #define NR_P2_UNITS (sizeof(p2_units) / sizeof(p6_unit_t)) -/*First available cycles of shared execution units. Each of these can be submitted - to by ALU X and Y*/ -static int mul_first_available_cycle; -static int shift_first_available_cycle; - static int uop_run(const risc86_uop_t *uop, int decode_time) { int c; @@ -1764,16 +1840,8 @@ static int uop_run(const risc86_uop_t *uop, int decode_time) /*UOP_LIMM does not require execution*/ if (uop->type == UOP_LIMM) - return decode_time; + return decode_time; - if (units == p2_units) /*More hackyness*/ - { - if (uop->type == UOP_MEU_MUL && decode_time < mul_first_available_cycle) - decode_time = mul_first_available_cycle; - else if (uop->type == UOP_MEU_SHIFT && decode_time < mul_first_available_cycle) - decode_time = shift_first_available_cycle; - } - /*Find execution unit for this uOP*/ for (c = 0; c < nr_units; c++) { @@ -1787,43 +1855,36 @@ static int uop_run(const risc86_uop_t *uop, int decode_time) } } if (!best_unit) - fatal("uop_run: can not find execution unit\n"); + fatal("uop_run: can not find execution unit\n"); if (best_start_cycle < decode_time) best_start_cycle = decode_time; - best_unit->first_available_cycle = best_start_cycle + uop->throughput; + best_unit->first_available_cycle = best_start_cycle + uop->latency; - if (units == p2_units) /*More hackyness*/ - { - if (uop->type == UOP_MEU_MUL) - mul_first_available_cycle = best_start_cycle + uop->throughput; - else if (uop->type == UOP_MEU_SHIFT) - shift_first_available_cycle = best_start_cycle + uop->throughput; - } - return best_start_cycle + uop->throughput; + + return best_start_cycle + uop->latency; } -/*The K6 decoder can decode, per clock : - - 1 or 2 'short' instructions, each up to 2 uOPs and 7 bytes long - - 1 'long' instruction, up to 4 uOPs - - 1 'vector' instruction, up to 4 uOPs per cycle, plus (I think) 1 cycle startup delay) +/*The P6 decoders can decode, per clock : + - 1 to 3 'simple' instructions, each up to 1 uOP and 7 bytes long + - 1 'complex' instruction, up to 4 uOPs or 3 per cycle for instructions longer than 4 uOPs */ static struct { int nr_uops; - const risc86_uop_t *uops[4]; + const risc86_uop_t *uops[6]; /*Earliest time a uop can start. If the timestamp is -1, then the uop is part of a dependency chain and the start time is the completion time of the previous uop*/ - int earliest_start[4]; + int earliest_start[6]; } decode_buffer; -#define NR_OPQUADS 6 -/*Timestamps of when the last six opquads completed. The K6 scheduler retires - opquads in order, so this is needed to determine when the next can be scheduled*/ -static int opquad_completion_timestamp[NR_OPQUADS]; -static int next_opquad = 0; +#define NR_OPSEQS 3 +/*Timestamps of when the last three op sequences completed. Technically this is incorrect, +as the actual size of the opseq buffer is 20 bytes and not 18, but I'm restricted to multiples of 6*/ +static int opseq_completion_timestamp[NR_OPSEQS]; +static int next_opseq = 0; #define NR_REGS 8 /*Timestamp of when last operation on an integer register completed*/ @@ -1838,50 +1899,48 @@ void decode_flush_p6() { int c; int uop_timestamp = 0; - - /*Decoded opquad can not be submitted if there are no free spaces in the - opquad buffer*/ - if (decode_timestamp < opquad_completion_timestamp[next_opquad]) - decode_timestamp = opquad_completion_timestamp[next_opquad]; + + /*Decoded opseq can not be submitted if there are no free spaces in the + opseq buffer*/ + if (decode_timestamp < opseq_completion_timestamp[next_opseq]) + decode_timestamp = opseq_completion_timestamp[next_opseq]; /*Ensure that uops can not be submitted before they have been decoded*/ if (decode_timestamp > last_uop_timestamp) last_uop_timestamp = decode_timestamp; /*Submit uops to execution units, and determine the latest completion time*/ - for (c = 0; c < decode_buffer.nr_uops; c++) + for (c = 0; c < (decode_buffer.nr_uops); c++) { int start_timestamp; - if (decode_buffer.earliest_start[c] == -1) - start_timestamp = last_uop_timestamp; - else - start_timestamp = decode_buffer.earliest_start[c]; + + start_timestamp = decode_buffer.earliest_start[c]; last_uop_timestamp = uop_run(decode_buffer.uops[c], start_timestamp); if (last_uop_timestamp > uop_timestamp) uop_timestamp = last_uop_timestamp; } - /*Calculate opquad completion time. Since opquads complete in order, it + /*Calculate opseq completion time. Since opseqs complete in order, it must be after the last completion.*/ if (uop_timestamp <= last_complete_timestamp) last_complete_timestamp = last_complete_timestamp + 1; else last_complete_timestamp = uop_timestamp; - /*Advance to next opquad in buffer*/ - opquad_completion_timestamp[next_opquad] = last_complete_timestamp; - next_opquad++; - if (next_opquad == NR_OPQUADS) - next_opquad = 0; + /*Advance to next opseq in buffer*/ + opseq_completion_timestamp[next_opseq] = last_complete_timestamp; + next_opseq++; + if (next_opseq == NR_OPSEQS) + next_opseq = 0; decode_timestamp++; decode_buffer.nr_uops = 0; } /*The instruction is only of interest here if it's longer than 7 bytes, as that's the - limit on K6 short decoding*/ + limit on P6 simple decoding*/ static int codegen_timing_instr_length(uint64_t deps, uint32_t fetchdat, int op_32) { int len = prefixes + 1; /*Opcode*/ @@ -1933,8 +1992,9 @@ static int codegen_timing_instr_length(uint64_t deps, uint32_t fetchdat, int op_ static void decode_instruction(const risc86_instruction_t *ins, uint64_t deps, uint32_t fetchdat, int op_32, int bit8) { uint32_t regmask_required; - uint32_t regmask_modified; - int c, d; + uint32_t regmask_modified; + int c; + int d = 0; /*Complex decoder uOPs*/ int earliest_start = 0; decode_type_t decode_type = ins->decode_type; int instr_length = codegen_timing_instr_length(deps, fetchdat, op_32); @@ -1964,87 +2024,68 @@ static void decode_instruction(const risc86_instruction_t *ins, uint64_t deps, u earliest_start = fpu_st_timestamp[reg]; } - /*Short decoders are limited to 7 bytes*/ - if (decode_type == DECODE_SHORT && instr_length > 7) - decode_type = DECODE_LONG; - /*Long decoder is limited to 11 bytes*/ - else if (instr_length > 11) - decode_type = DECODE_VECTOR; + /*Simple decoders are limited to 7 bytes & 1 uOP*/ + if (decode_type == DECODE_SIMPLE && instr_length > 7) + decode_type = DECODE_COMPLEX; + else if (decode_type == DECODE_SIMPLE && ins->nr_uops > 1) + decode_type = DECODE_COMPLEX; switch (decode_type) - { - case DECODE_SHORT: - if (decode_buffer.nr_uops) + { + case DECODE_SIMPLE: + if (decode_buffer.nr_uops - d == 2) + { + decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0]; + decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start; + decode_buffer.nr_uops = 3; + decode_flush_p6(); + } + else if (decode_buffer.nr_uops - d == 1) + { + decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0]; + decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start; + decode_buffer.nr_uops = 2+d; + if (d) + decode_flush_p6(); + } + else if (decode_buffer.nr_uops) { decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0]; decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start; - if (ins->nr_uops > 1) - { - decode_buffer.uops[decode_buffer.nr_uops+1] = &ins->uop[1]; - decode_buffer.earliest_start[decode_buffer.nr_uops+1] = -1; - } - decode_buffer.nr_uops += ins->nr_uops; - - decode_flush_p6(); + decode_buffer.nr_uops = 1+d; } else { - decode_buffer.nr_uops = ins->nr_uops; + decode_buffer.nr_uops = 1; decode_buffer.uops[0] = &ins->uop[0]; decode_buffer.earliest_start[0] = earliest_start; - if (ins->nr_uops > 1) - { - decode_buffer.uops[1] = &ins->uop[1]; - decode_buffer.earliest_start[1] = -1; - } - } + } break; - case DECODE_LONG: + case DECODE_COMPLEX: if (decode_buffer.nr_uops) - decode_flush_p6(); + decode_flush_p6(); /*The 4-1-1 arrangement implies that a complex ins. can't be decoded after a simple one*/ - decode_buffer.nr_uops = ins->nr_uops; - for (c = 0; c < ins->nr_uops; c++) - { - decode_buffer.uops[c] = &ins->uop[c]; - if (c == 0) - decode_buffer.earliest_start[c] = earliest_start; - else - decode_buffer.earliest_start[c] = -1; - } - decode_flush_p6(); - break; - - case DECODE_VECTOR: - if (decode_buffer.nr_uops) - decode_flush_p6(); - - decode_timestamp++; d = 0; - + for (c = 0; c < ins->nr_uops; c++) { decode_buffer.uops[d] = &ins->uop[c]; - if (c == 0) - decode_buffer.earliest_start[d] = earliest_start; - else - decode_buffer.earliest_start[d] = -1; - d++; + decode_buffer.earliest_start[c] = earliest_start; + d++; - if (d == 4) + if (d == 3 && ins->nr_uops > 4) /*Ins. with >4 uOPs require the use of special units only present on 3 translate PLAs*/ { d = 0; - decode_buffer.nr_uops = 4; - decode_flush_p6(); + decode_buffer.nr_uops = 3; + decode_flush_p6(); /*The other two decoders are halted to preserve in-order issue*/ } } - if (d) - { - decode_buffer.nr_uops = d; - decode_flush_p6(); - } - break; + if (d) + { + decode_buffer.nr_uops = d; + } + break; } /*Update write timestamps for any output registers*/ @@ -2095,15 +2136,12 @@ void codegen_timing_p6_block_start() for (c = 0; c < nr_units; c++) units[c].first_available_cycle = 0; - mul_first_available_cycle = 0; - shift_first_available_cycle = 0; - decode_timestamp = 0; last_complete_timestamp = 0; - for (c = 0; c < NR_OPQUADS; c++) - opquad_completion_timestamp[c] = 0; - next_opquad = 0; + for (c = 0; c < NR_OPSEQS; c++) + opseq_completion_timestamp[c] = 0; + next_opseq = 0; for (c = 0; c < NR_REGS; c++) reg_available_timestamp[c] = 0; @@ -2295,7 +2333,7 @@ void codegen_timing_p6_opcode(uint8_t opcode, uint32_t fetchdat, int op_32, uint if (ins_table[opcode]) decode_instruction(ins_table[opcode], deps[opcode], fetchdat, op_32, bit8); else - decode_instruction(&vector_alu1_op, 0, fetchdat, op_32, bit8); + decode_instruction(&complex_alu1_op, 0, fetchdat, op_32, bit8); codegen_block_cycles += (last_complete_timestamp - old_last_complete_timestamp); } @@ -2324,4 +2362,4 @@ codegen_timing_t codegen_timing_p6 = codegen_timing_p6_block_start, codegen_timing_p6_block_end, codegen_timing_p6_jump_cycles -}; +}; \ No newline at end of file