Xen-Devel Archive mirror
* [PATCH v2 00/10] x86: support AVX512-FP16
@ 2023-04-03 14:56 Jan Beulich
  2023-04-03 14:57 ` [PATCH v2 01/10] x86emul: handle AVX512-FP16 insns encoded in 0f3a opcode map Jan Beulich
                   ` (10 more replies)
  0 siblings, 11 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:56 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

While I (quite obviously) don't have any suitable hardware, Intel's
SDE allows testing the implementation. And since there's no new
state (registers) associated with this ISA extension, this should
suffice for integration.

01: handle AVX512-FP16 insns encoded in 0f3a opcode map
02: handle AVX512-FP16 Map5 arithmetic insns
03: handle AVX512-FP16 move insns
04: handle AVX512-FP16 fma-like insns
05: handle AVX512-FP16 Map6 misc insns
06: handle AVX512-FP16 complex multiplication insns
07: handle AVX512-FP16 conversion to/from (packed) int16 insns
08: handle AVX512-FP16 floating point conversion insns
09: handle AVX512-FP16 conversion to/from (packed) int{32,64} insns
10: AVX512-FP16 testing

I've re-based this ahead of the also-pending AMX series (and,
obviously, ahead of the not-yet-submitted KeyLocker one), in the
hope that this series may find its way in sooner than those.

Jan



* [PATCH v2 01/10] x86emul: handle AVX512-FP16 insns encoded in 0f3a opcode map
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
@ 2023-04-03 14:57 ` Jan Beulich
  2023-04-03 14:57 ` [PATCH v2 02/10] x86emul: handle AVX512-FP16 Map5 arithmetic insns Jan Beulich
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:57 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

In order to re-use existing code and tables as much as possible (also in
subsequent patches), simply introduce a new boolean field in the emulator
state indicating whether an insn has a half-precision source operand.
Everything else then follows "naturally".
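
As an illustration only (a sketch of the net effect, not the literal
helpers used in the patch), the element size and scalar Disp8 scaling
derivations then collapse to:

    /* Sketch: element size in bytes for an EVEX-encoded FP insn. */
    static unsigned int elem_bytes(bool fp16, bool evex_w)
    {
        /* FP16 (EVEX.W=0 enforced elsewhere) => 2; else 4 (W0) or 8 (W1). */
        return 2 << (!fp16 + evex_w);
    }

    /* Sketch: log2 of the Disp8 scaling factor for scalar operands. */
    static unsigned int disp8scale_scalar(bool fp16, bool evex_w)
    {
        /* FP16 => 1 (i.e. scale by 2); otherwise 2 or 3. */
        return 1 + !fp16 + evex_w;
    }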

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
SDE: -spr or -future

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -76,6 +76,7 @@ enum esz {
     ESZ_b,
     ESZ_w,
     ESZ_bw,
+    ESZ_fp16,
 };
 
 #ifndef __i386__
@@ -601,6 +602,19 @@ static const struct test avx512_vpopcntd
     INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
 };
 
+static const struct test avx512_fp16_all[] = {
+    INSN(cmpph,           , 0f3a, c2,    vl, fp16, vl),
+    INSN(cmpsh,         f3, 0f3a, c2,    el, fp16, el),
+    INSN(fpclassph,       , 0f3a, 66,    vl, fp16, vl),
+    INSN(fpclasssh,       , 0f3a, 67,    el, fp16, el),
+    INSN(getmantph,       , 0f3a, 26,    vl, fp16, vl),
+    INSN(getmantsh,       , 0f3a, 27,    el, fp16, el),
+    INSN(reduceph,        , 0f3a, 56,    vl, fp16, vl),
+    INSN(reducesh,        , 0f3a, 57,    el, fp16, el),
+    INSN(rndscaleph,      , 0f3a, 08,    vl, fp16, vl),
+    INSN(rndscalesh,      , 0f3a, 0a,    el, fp16, el),
+};
+
 static const struct test gfni_all[] = {
     INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
     INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
@@ -728,8 +742,10 @@ static void test_one(const struct test *
         break;
 
     case ESZ_w:
-        esz = 2;
         evex.w = 1;
+        /* fall through */
+    case ESZ_fp16:
+        esz = 2;
         break;
 
 #ifdef __i386__
@@ -845,7 +861,7 @@ static void test_one(const struct test *
     case ESZ_b: case ESZ_w: case ESZ_bw:
         return;
 
-    case ESZ_d: case ESZ_q:
+    case ESZ_d: case ESZ_q: case ESZ_fp16:
         break;
 
     default:
@@ -1002,6 +1018,7 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512_vnni, all);
     RUN(avx512_vp2intersect, all);
     RUN(avx512_vpopcntdq, all);
+    RUN(avx512_fp16, all);
 
     if ( cpu_has_avx512f )
     {
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -1972,8 +1972,10 @@ static const struct evex {
     { { 0x03 }, 3, T, R, pfx_66, Wn, Ln }, /* valign{d,q} */
     { { 0x04 }, 3, T, R, pfx_66, W0, Ln }, /* vpermilps */
     { { 0x05 }, 3, T, R, pfx_66, W1, Ln }, /* vpermilpd */
+    { { 0x08 }, 3, T, R, pfx_no, W0, Ln }, /* vrndscaleph */
     { { 0x08 }, 3, T, R, pfx_66, W0, Ln }, /* vrndscaleps */
     { { 0x09 }, 3, T, R, pfx_66, W1, Ln }, /* vrndscalepd */
+    { { 0x0a }, 3, T, R, pfx_no, W0, LIG }, /* vrndscalesh */
     { { 0x0a }, 3, T, R, pfx_66, W0, LIG }, /* vrndscaless */
     { { 0x0b }, 3, T, R, pfx_66, W1, LIG }, /* vrndscalesd */
     { { 0x0f }, 3, T, R, pfx_66, WIG, Ln }, /* vpalignr */
@@ -1993,7 +1995,9 @@ static const struct evex {
     { { 0x22 }, 3, T, R, pfx_66, Wn, L0 }, /* vpinsr{d,q} */
     { { 0x23 }, 3, T, R, pfx_66, Wn, L1|L2 }, /* vshuff{32x4,64x2} */
     { { 0x25 }, 3, T, R, pfx_66, Wn, Ln }, /* vpternlog{d,q} */
+    { { 0x26 }, 3, T, R, pfx_no, W0, Ln }, /* vgetmantph */
     { { 0x26 }, 3, T, R, pfx_66, Wn, Ln }, /* vgetmantp{s,d} */
+    { { 0x27 }, 3, T, R, pfx_no, W0, LIG }, /* vgetmantsh */
     { { 0x27 }, 3, T, R, pfx_66, Wn, LIG }, /* vgetmants{s,d} */
     { { 0x38 }, 3, T, R, pfx_66, Wn, L1|L2 }, /* vinserti{32x4,64x2} */
     { { 0x39 }, 3, T, W, pfx_66, Wn, L1|L2 }, /* vextracti{32x4,64x2} */
@@ -2008,14 +2012,20 @@ static const struct evex {
     { { 0x51 }, 3, T, R, pfx_66, Wn, LIG }, /* vranges{s,d} */
     { { 0x54 }, 3, T, R, pfx_66, Wn, Ln }, /* vfixupimmp{s,d} */
     { { 0x55 }, 3, T, R, pfx_66, Wn, LIG }, /* vfixumpimms{s,d} */
+    { { 0x56 }, 3, T, R, pfx_no, W0, Ln }, /* vreduceph */
     { { 0x56 }, 3, T, R, pfx_66, Wn, Ln }, /* vreducep{s,d} */
+    { { 0x57 }, 3, T, R, pfx_no, W0, LIG }, /* vreducesh */
     { { 0x57 }, 3, T, R, pfx_66, Wn, LIG }, /* vreduces{s,d} */
+    { { 0x66 }, 3, T, R, pfx_no, W0, Ln }, /* vfpclassph */
     { { 0x66 }, 3, T, R, pfx_66, Wn, Ln }, /* vfpclassp{s,d} */
+    { { 0x67 }, 3, T, R, pfx_no, W0, LIG }, /* vfpclasssh */
     { { 0x67 }, 3, T, R, pfx_66, Wn, LIG }, /* vfpclasss{s,d} */
     { { 0x70 }, 3, T, R, pfx_66, W1, Ln }, /* vshldw */
     { { 0x71 }, 3, T, R, pfx_66, Wn, Ln }, /* vshld{d,q} */
     { { 0x72 }, 3, T, R, pfx_66, W1, Ln }, /* vshrdw */
     { { 0x73 }, 3, T, R, pfx_66, Wn, Ln }, /* vshrd{d,q} */
+    { { 0xc2 }, 3, T, R, pfx_no, W0, Ln }, /* vcmpph */
+    { { 0xc2 }, 3, T, R, pfx_f3, W0, LIG }, /* vcmpsh */
     { { 0xce }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineqb */
     { { 0xcf }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineinvqb */
 };
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -4677,6 +4677,44 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vfpclassphz $0x46,128(%ecx),%k3...");
+    if ( stack_exec && cpu_has_avx512_fp16 )
+    {
+        decl_insn(vfpclassph);
+
+        asm volatile ( put_insn(vfpclassph,
+                                /* 0x46: check for +/- 0 and neg. */
+                                /* vfpclassphz $0x46, 128(%0), %%k3 */
+                                ".byte 0x62, 0xf3, 0x7c, 0x48\n\t"
+                                ".byte 0x66, 0x59, 0x02, 0x46")
+                       :: "c" (NULL) );
+
+        set_insn(vfpclassph);
+        for ( i = 0; i < 3; ++i )
+        {
+            res[16 + i * 5 + 0] = 0x7fff0000; /* +0 / +NaN */
+            res[16 + i * 5 + 1] = 0xffff8000; /* -0 / -NaN */
+            res[16 + i * 5 + 2] = 0x80010001; /* +DEN / -DEN */
+            res[16 + i * 5 + 3] = 0xfc00f800; /* -FIN / -INF */
+            res[16 + i * 5 + 4] = 0x7c007800; /* +FIN / +INF */
+        }
+        res[31] = 0;
+        regs.ecx = (unsigned long)res - 64;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vfpclassph) )
+            goto fail;
+        asm volatile ( "kmovd %%k3, %0" : "=g" (rc) );
+        /*
+         * 0b11(0001100101)*3
+         * 0b1100_0110_0101_0001_1001_0100_0110_0101
+         */
+        if ( rc != 0xc6519465 )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     /*
      * The following compress/expand tests are not only making sure the
      * accessed data is correct, but they also verify (by placing operands
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -183,6 +183,7 @@ void wrpkru(unsigned int val);
 #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
 #define cpu_has_avx512_vp2intersect (cp.feat.avx512_vp2intersect && xcr0_mask(0xe6))
 #define cpu_has_serialize  cp.feat.serialize
+#define cpu_has_avx512_fp16 (cp.feat.avx512_fp16 && xcr0_mask(0xe6))
 #define cpu_has_avx_vnni   (cp.feat.avx_vnni && xcr0_mask(6))
 #define cpu_has_avx512_bf16 (cp.feat.avx512_bf16 && xcr0_mask(0xe6))
 
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -518,6 +518,7 @@ static const struct ext0f3a_table {
     [0x7a ... 0x7b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
+    [0xc2] = { .simd_size = simd_any_fp, .d8s = d8s_vl },
     [0xcc] = { .simd_size = simd_other },
     [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -579,7 +580,7 @@ static unsigned int decode_disp8scale(en
         if ( s->evex.brs )
         {
     case d8s_dq:
-            return 2 + s->evex.w;
+            return 1 + !s->fp16 + s->evex.w;
         }
         break;
 
@@ -596,7 +597,7 @@ static unsigned int decode_disp8scale(en
         /* fall through */
     case simd_scalar_opc:
     case simd_scalar_vexw:
-        return 2 + s->evex.w;
+        return 1 + !s->fp16 + s->evex.w;
 
     case simd_128:
         /* These should have an explicit size specified. */
@@ -1417,7 +1418,29 @@ int x86emul_decode(struct x86_emulate_st
              */
             s->simd_size = ext0f3a_table[b].simd_size;
             if ( evex_encoded() )
+            {
+                switch ( b )
+                {
+                case 0x08: /* vrndscaleph */
+                case 0x0a: /* vrndscalesh */
+                case 0x26: /* vfpclassph */
+                case 0x27: /* vfpclasssh */
+                case 0x56: /* vgetmantph */
+                case 0x57: /* vgetmantsh */
+                case 0x66: /* vreduceph */
+                case 0x67: /* vreducesh */
+                    if ( !s->evex.pfx )
+                        s->fp16 = true;
+                    break;
+
+                case 0xc2: /* vcmp{p,s}h */
+                    if ( !(s->evex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+                        s->fp16 = true;
+                    break;
+                }
+
                 disp8scale = decode_disp8scale(ext0f3a_table[b].d8s, s);
+            }
             break;
 
         case ext_8f09:
@@ -1712,7 +1735,7 @@ int x86emul_decode(struct x86_emulate_st
             break;
         case vex_f3:
             generate_exception_if(evex_encoded() && s->evex.w, X86_EXC_UD);
-            s->op_bytes = 4;
+            s->op_bytes = 4 >> s->fp16;
             break;
         case vex_f2:
             generate_exception_if(evex_encoded() && !s->evex.w, X86_EXC_UD);
@@ -1722,11 +1745,11 @@ int x86emul_decode(struct x86_emulate_st
         break;
 
     case simd_scalar_opc:
-        s->op_bytes = 4 << (ctxt->opcode & 1);
+        s->op_bytes = 2 << (!s->fp16 + (ctxt->opcode & 1));
         break;
 
     case simd_scalar_vexw:
-        s->op_bytes = 4 << s->vex.w;
+        s->op_bytes = 2 << (!s->fp16 + s->vex.w);
         break;
 
     case simd_128:
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -305,6 +305,7 @@ struct x86_emulate_state {
     bool lock_prefix;
     bool not_64bit; /* Instruction not available in 64bit. */
     bool fpu_ctrl;  /* Instruction is an FPU control one. */
+    bool fp16;      /* Instruction has half-precision FP source operand. */
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
@@ -592,6 +593,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx512_vp2intersect() (ctxt->cpuid->feat.avx512_vp2intersect)
 #define vcpu_has_serialize()   (ctxt->cpuid->feat.serialize)
 #define vcpu_has_tsxldtrk()    (ctxt->cpuid->feat.tsxldtrk)
+#define vcpu_has_avx512_fp16() (ctxt->cpuid->feat.avx512_fp16)
 #define vcpu_has_avx_vnni()    (ctxt->cpuid->feat.avx_vnni)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1300,7 +1300,7 @@ x86_emulate(
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
-    elem_bytes = 4 << evex.w;
+    elem_bytes = 2 << (!state->fp16 + evex.w);
 
     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
 
@@ -7145,6 +7145,15 @@ x86_emulate(
         avx512_vlen_check(b & 2);
         goto simd_imm8_zmm;
 
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x0a): /* vrndscalesh $imm8,xmm/mem,xmm,xmm{k} */
+        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x08): /* vrndscaleph $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        avx512_vlen_check(b & 2);
+        goto simd_imm8_zmm;
+
 #endif /* X86EMUL_NO_SIMD */
 
     CASE_SIMD_PACKED_INT(0x0f3a, 0x0f): /* palignr $imm8,{,x}mm/mem,{,x}mm */
@@ -7455,6 +7464,14 @@ x86_emulate(
             avx512_vlen_check(false);
         goto simd_imm8_zmm;
 
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x26): /* vgetmantph $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x56): /* vreduceph $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x51): /* vranges{s,d} $imm8,xmm/mem,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x57): /* vreduces{s,d} $imm8,xmm/mem,xmm,xmm{k} */
         host_and_vcpu_must_have(avx512dq);
@@ -7467,6 +7484,16 @@ x86_emulate(
             avx512_vlen_check(true);
         goto simd_imm8_zmm;
 
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x27): /* vgetmantsh $imm8,xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x57): /* vreducesh $imm8,xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( !evex.brs )
+            avx512_vlen_check(true);
+        else
+            generate_exception_if(ea.type != OP_REG, EXC_UD);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
         if ( !vex.w )
@@ -7630,6 +7657,16 @@ x86_emulate(
         avx512_vlen_check(true);
         goto simd_imm8_zmm;
 
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x66): /* vfpclassph $imm8,[xyz]mm/mem,k{k} */
+    case X86EMUL_OPC_EVEX(0x0f3a, 0x67): /* vfpclasssh $imm8,xmm/mem,k{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w || !evex.r || !evex.R || evex.z, EXC_UD);
+        if ( !(b & 1) )
+            goto avx512f_imm8_no_sae;
+        generate_exception_if(evex.brs, EXC_UD);
+        avx512_vlen_check(true);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x70): /* vpshldw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x72): /* vpshrdw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(!evex.w, EXC_UD);
@@ -7640,6 +7677,16 @@ x86_emulate(
         host_and_vcpu_must_have(avx512_vbmi2);
         goto avx512f_imm8_no_sae;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f3a, 0xc2): /* vcmpsh $imm8,xmm/mem,xmm,k{k} */
+        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX(0x0f3a, 0xc2): /* vcmpph $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w || !evex.r || !evex.R || evex.z, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC(0x0f3a, 0xcc):     /* sha1rnds4 $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(sha);
         op_bytes = 16;




* [PATCH v2 02/10] x86emul: handle AVX512-FP16 Map5 arithmetic insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
  2023-04-03 14:57 ` [PATCH v2 01/10] x86emul: handle AVX512-FP16 insns encoded in 0f3a opcode map Jan Beulich
@ 2023-04-03 14:57 ` Jan Beulich
  2023-04-03 14:57 ` [PATCH v2 03/10] x86emul: handle AVX512-FP16 move insns Jan Beulich
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:57 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

The Map5 encoding space is a very sparse clone of the "twobyte" (0f) one.
Re-use that table: the entries corresponding to opcodes invalid in Map5
are benign, provided simd_size is forced to something other than
simd_none (which prevents undue memory reads in the SrcMem handling early
in x86_emulate()).
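
For reference (a decoding sketch only): the map is selected by the low
three bits (SDM "mmm") of the first EVEX payload byte, which is why the
opcx bit-field below grows from 2 to 3 bits:

    /* Sketch: opcode map taken from EVEX payload byte P0 (bits 2:0 = mmm). */
    static unsigned int evex_map(uint8_t p0)
    {
        return p0 & 7;   /* 1 = 0f, 2 = 0f38, 3 = 0f3a, 5 = Map5 */
    }

E.g. a P0 byte of 0xf5 yields 0xf5 & 7 == 5, i.e. Map5.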

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add comments.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -6,7 +6,7 @@
 struct test {
     const char *mnemonic;
     unsigned int opc:8;
-    unsigned int spc:2;
+    unsigned int spc:3;
     unsigned int pfx:2;
     unsigned int vsz:3;
     unsigned int esz:4;
@@ -19,6 +19,10 @@ enum spc {
     SPC_0f,
     SPC_0f38,
     SPC_0f3a,
+    SPC_unused4,
+    SPC_map5,
+    SPC_map6,
+    SPC_unused7,
 };
 
 enum pfx {
@@ -603,16 +607,32 @@ static const struct test avx512_vpopcntd
 };
 
 static const struct test avx512_fp16_all[] = {
+    INSN(addph,           , map5, 58,    vl, fp16, vl),
+    INSN(addsh,         f3, map5, 58,    el, fp16, el),
     INSN(cmpph,           , 0f3a, c2,    vl, fp16, vl),
     INSN(cmpsh,         f3, 0f3a, c2,    el, fp16, el),
+    INSN(comish,          , map5, 2f,    el, fp16, el),
+    INSN(divph,           , map5, 5e,    vl, fp16, vl),
+    INSN(divsh,         f3, map5, 5e,    el, fp16, el),
     INSN(fpclassph,       , 0f3a, 66,    vl, fp16, vl),
     INSN(fpclasssh,       , 0f3a, 67,    el, fp16, el),
     INSN(getmantph,       , 0f3a, 26,    vl, fp16, vl),
     INSN(getmantsh,       , 0f3a, 27,    el, fp16, el),
+    INSN(maxph,           , map5, 5f,    vl, fp16, vl),
+    INSN(maxsh,         f3, map5, 5f,    el, fp16, el),
+    INSN(minph,           , map5, 5d,    vl, fp16, vl),
+    INSN(minsh,         f3, map5, 5d,    el, fp16, el),
+    INSN(mulph,           , map5, 59,    vl, fp16, vl),
+    INSN(mulsh,         f3, map5, 59,    el, fp16, el),
     INSN(reduceph,        , 0f3a, 56,    vl, fp16, vl),
     INSN(reducesh,        , 0f3a, 57,    el, fp16, el),
     INSN(rndscaleph,      , 0f3a, 08,    vl, fp16, vl),
     INSN(rndscalesh,      , 0f3a, 0a,    el, fp16, el),
+    INSN(sqrtph,          , map5, 51,    vl, fp16, vl),
+    INSN(sqrtsh,        f3, map5, 51,    el, fp16, el),
+    INSN(subph,           , map5, 5c,    vl, fp16, vl),
+    INSN(subsh,         f3, map5, 5c,    el, fp16, el),
+    INSN(ucomish,         , map5, 2e,    el, fp16, el),
 };
 
 static const struct test gfni_all[] = {
@@ -713,8 +733,8 @@ static void test_one(const struct test *
     union evex {
         uint8_t raw[3];
         struct {
-            uint8_t opcx:2;
-            uint8_t mbz:2;
+            uint8_t opcx:3;
+            uint8_t mbz:1;
             uint8_t R:1;
             uint8_t b:1;
             uint8_t x:1;
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2028,6 +2028,23 @@ static const struct evex {
     { { 0xc2 }, 3, T, R, pfx_f3, W0, LIG }, /* vcmpsh */
     { { 0xce }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineqb */
     { { 0xcf }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineinvqb */
+}, evex_map5[] = {
+    { { 0x2e }, 2, T, R, pfx_no, W0, LIG }, /* vucomish */
+    { { 0x2f }, 2, T, R, pfx_no, W0, LIG }, /* vcomish */
+    { { 0x51 }, 2, T, R, pfx_no, W0, Ln }, /* vsqrtph */
+    { { 0x51 }, 2, T, R, pfx_f3, W0, LIG }, /* vsqrtsh */
+    { { 0x58 }, 2, T, R, pfx_no, W0, Ln }, /* vaddph */
+    { { 0x58 }, 2, T, R, pfx_f3, W0, LIG }, /* vaddsh */
+    { { 0x59 }, 2, T, R, pfx_no, W0, Ln }, /* vmulph */
+    { { 0x59 }, 2, T, R, pfx_f3, W0, LIG }, /* vmulsh */
+    { { 0x5c }, 2, T, R, pfx_no, W0, Ln }, /* vsubph */
+    { { 0x5c }, 2, T, R, pfx_f3, W0, LIG }, /* vsubsh */
+    { { 0x5d }, 2, T, R, pfx_no, W0, Ln }, /* vminph */
+    { { 0x5d }, 2, T, R, pfx_f3, W0, LIG }, /* vminsh */
+    { { 0x5e }, 2, T, R, pfx_no, W0, Ln }, /* vdivph */
+    { { 0x5e }, 2, T, R, pfx_f3, W0, LIG }, /* vdivsh */
+    { { 0x5f }, 2, T, R, pfx_no, W0, Ln }, /* vmaxph */
+    { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */
 };
 
 static const struct {
@@ -2037,6 +2054,8 @@ static const struct {
     { evex_0f,   ARRAY_SIZE(evex_0f) },
     { evex_0f38, ARRAY_SIZE(evex_0f38) },
     { evex_0f3a, ARRAY_SIZE(evex_0f3a) },
+    { NULL,      0 },
+    { evex_map5, ARRAY_SIZE(evex_map5) },
 };
 
 #undef Wn
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -1219,9 +1219,22 @@ int x86emul_decode(struct x86_emulate_st
                         opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
                         d = twobyte_table[0x3a].desc;
                         break;
+
+                    case evex_map5:
+                        if ( !evex_encoded() )
+                        {
                     default:
-                        rc = X86EMUL_UNRECOGNIZED;
-                        goto done;
+                            rc = X86EMUL_UNRECOGNIZED;
+                            goto done;
+                        }
+                        opcode |= MASK_INSR(5, X86EMUL_OPC_EXT_MASK);
+                        /*
+                         * Re-use twobyte_table[] here, for the similarity of
+                         * the entries valid in map 5.
+                         */
+                        d = twobyte_table[b].desc;
+                        s->simd_size = twobyte_table[b].size ?: simd_other;
+                        break;
                     }
                 }
                 else if ( s->ext < ext_8f08 + ARRAY_SIZE(xop_table) )
@@ -1443,6 +1456,25 @@ int x86emul_decode(struct x86_emulate_st
             }
             break;
 
+        case ext_map5:
+            switch ( b )
+            {
+            default:
+                if ( !(s->evex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+                    s->fp16 = true;
+                break;
+
+            case 0x2e: case 0x2f: /* v{,u}comish */
+                if ( !s->evex.pfx )
+                    s->fp16 = true;
+                s->simd_size = simd_none;
+                break;
+            }
+
+            /* Like above re-use twobyte_table[] here. */
+            disp8scale = decode_disp8scale(twobyte_table[b].d8s, s);
+            break;
+
         case ext_8f09:
             if ( ext8f09_table[b].two_op )
                 d |= TwoOp;
@@ -1661,6 +1693,7 @@ int x86emul_decode(struct x86_emulate_st
         s->simd_size = ext8f08_table[b].simd_size;
         break;
 
+    case ext_map5:
     case ext_8f09:
     case ext_8f0a:
         break;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -195,6 +195,7 @@ enum vex_opcx {
     vex_0f = vex_none + 1,
     vex_0f38,
     vex_0f3a,
+    evex_map5 = 5,
 };
 
 enum vex_pfx {
@@ -223,8 +224,8 @@ union vex {
 union evex {
     uint8_t raw[3];
     struct {             /* SDM names */
-        uint8_t opcx:2;  /* mm */
-        uint8_t mbz:2;
+        uint8_t opcx:3;  /* mmm */
+        uint8_t mbz:1;
         uint8_t R:1;     /* R' */
         uint8_t b:1;     /* B */
         uint8_t x:1;     /* X */
@@ -249,6 +250,7 @@ struct x86_emulate_state {
         ext_0f   = vex_0f,
         ext_0f38 = vex_0f38,
         ext_0f3a = vex_0f3a,
+        ext_map5 = evex_map5,
         /*
          * For XOP use values such that the respective instruction field
          * can be used without adjustment.
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3756,6 +3756,13 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
+    case X86EMUL_OPC_EVEX(5, 0x2e): /* vucomish xmm/m16,xmm */
+    case X86EMUL_OPC_EVEX(5, 0x2f): /* vcomish xmm/m16,xmm */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2f): /* vcomis{s,d} xmm/mem,xmm */
         generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk ||
@@ -3768,9 +3775,11 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_zmm);
 
         opc = init_evex(stub);
-        op_bytes = 4 << evex.w;
+        op_bytes = 2 << (!state->fp16 + evex.w);
         goto vcomi;
 
+#endif
+
     case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);
@@ -7736,6 +7745,20 @@ x86_emulate(
 
 #ifndef X86EMUL_NO_SIMD
 
+    case X86EMUL_OPC_EVEX_F3(5, 0x51):   /* vsqrtsh xmm/m16,xmm,xmm{k} */
+        d &= ~TwoOp;
+        /* fall through */
+    case X86EMUL_OPC_EVEX(5, 0x51):      /* vsqrtph [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x58): /* vadd{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x59): /* vmul{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x5c): /* vsub{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x5d): /* vmin{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x5e): /* vdiv{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x5f): /* vmax{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        goto avx512f_all_fp;
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -619,6 +619,7 @@ struct x86_emulate_ctxt
  *    0x0fxxxx for 0f-prefixed opcodes (or their VEX/EVEX equivalents)
  *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
  *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
+ *     0x5xxxx for Map5 opcodes (EVEX only)
  *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
  *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
  *  0x8f0axxxx for 8f/a-prefixed XOP opcodes




* [PATCH v2 03/10] x86emul: handle AVX512-FP16 move insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
  2023-04-03 14:57 ` [PATCH v2 01/10] x86emul: handle AVX512-FP16 insns encoded in 0f3a opcode map Jan Beulich
  2023-04-03 14:57 ` [PATCH v2 02/10] x86emul: handle AVX512-FP16 Map5 arithmetic insns Jan Beulich
@ 2023-04-03 14:57 ` Jan Beulich
  2023-04-03 14:58 ` [PATCH v2 04/10] x86emul: handle AVX512-FP16 fma-like insns Jan Beulich
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:57 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -622,6 +622,8 @@ static const struct test avx512_fp16_all
     INSN(maxsh,         f3, map5, 5f,    el, fp16, el),
     INSN(minph,           , map5, 5d,    vl, fp16, vl),
     INSN(minsh,         f3, map5, 5d,    el, fp16, el),
+    INSN(movsh,         f3, map5, 10,    el, fp16, el),
+    INSN(movsh,         f3, map5, 11,    el, fp16, el),
     INSN(mulph,           , map5, 59,    vl, fp16, vl),
     INSN(mulsh,         f3, map5, 59,    el, fp16, el),
     INSN(reduceph,        , 0f3a, 56,    vl, fp16, vl),
@@ -635,6 +637,11 @@ static const struct test avx512_fp16_all
     INSN(ucomish,         , map5, 2e,    el, fp16, el),
 };
 
+static const struct test avx512_fp16_128[] = {
+    INSN(movw, 66, map5, 6e, el, fp16, el),
+    INSN(movw, 66, map5, 7e, el, fp16, el),
+};
+
 static const struct test gfni_all[] = {
     INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
     INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
@@ -1039,6 +1046,7 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512_vp2intersect, all);
     RUN(avx512_vpopcntdq, all);
     RUN(avx512_fp16, all);
+    RUN(avx512_fp16, 128);
 
     if ( cpu_has_avx512f )
     {
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2029,6 +2029,8 @@ static const struct evex {
     { { 0xce }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineqb */
     { { 0xcf }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineinvqb */
 }, evex_map5[] = {
+    { { 0x10 }, 2, T, R, pfx_f3, W0, LIG }, /* vmovsh */
+    { { 0x11 }, 2, T, W, pfx_f3, W0, LIG }, /* vmovsh */
     { { 0x2e }, 2, T, R, pfx_no, W0, LIG }, /* vucomish */
     { { 0x2f }, 2, T, R, pfx_no, W0, LIG }, /* vcomish */
     { { 0x51 }, 2, T, R, pfx_no, W0, Ln }, /* vsqrtph */
@@ -2045,6 +2047,8 @@ static const struct evex {
     { { 0x5e }, 2, T, R, pfx_f3, W0, LIG }, /* vdivsh */
     { { 0x5f }, 2, T, R, pfx_no, W0, Ln }, /* vmaxph */
     { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */
+    { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */
+    { { 0x7e }, 2, T, W, pfx_66, WIG, L0 }, /* vmovw */
 };
 
 static const struct {
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5140,6 +5140,76 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovsh 8(%ecx),%xmm5...");
+    if ( stack_exec && cpu_has_avx512_fp16 )
+    {
+        decl_insn(vmovsh_from_mem);
+        decl_insn(vmovw_to_gpr);
+
+        asm volatile ( "vpcmpeqw %%ymm5, %%ymm5, %%ymm5\n\t"
+                       put_insn(vmovsh_from_mem,
+                                /* vmovsh 8(%0), %%xmm5 */
+                                ".byte 0x62, 0xf5, 0x7e, 0x08\n\t"
+                                ".byte 0x10, 0x69, 0x04")
+                       :: "c" (NULL) );
+
+        set_insn(vmovsh_from_mem);
+        res[2] = 0x3c00bc00;
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsh_from_mem) )
+            goto fail;
+        asm volatile ( "kmovw     %2, %%k1\n\t"
+                       "vmovdqu16 %1, %%zmm4%{%%k1%}%{z%}\n\t"
+                       "vpcmpeqw  %%zmm4, %%zmm5, %%k0\n\t"
+                       "kmovw     %%k0, %0"
+                       : "=g" (rc)
+                       : "m" (res[2]), "r" (1) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vmovsh %xmm4,2(%eax){%k3}...");
+        memset(res, ~0, 8);
+        res[2] = 0xbc00ffff;
+        memset(res + 3, ~0, 8);
+        regs.eax = (unsigned long)res;
+        regs.ecx = ~0;
+        for ( i = 0; i < 2; ++i )
+        {
+            decl_insn(vmovsh_to_mem);
+
+            asm volatile ( "kmovw %1, %%k3\n\t"
+                           put_insn(vmovsh_to_mem,
+                                    /* vmovsh %%xmm4, 2(%0)%{%%k3%} */
+                                    ".byte 0x62, 0xf5, 0x7e, 0x0b\n\t"
+                                    ".byte 0x11, 0x60, 0x01")
+                           :: "a" (NULL), "r" (i) );
+
+            set_insn(vmovsh_to_mem);
+            rc = x86_emulate(&ctxt, &emulops);
+            if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsh_to_mem) ||
+                 memcmp(res, res + 3 - i, 8) )
+                goto fail;
+        }
+        printf("okay\n");
+
+        printf("%-40s", "Testing vmovw %xmm5,%ecx...");
+        asm volatile ( put_insn(vmovw_to_gpr,
+                                /* vmovw %%xmm5, %0 */
+                                ".byte 0x62, 0xf5, 0x7d, 0x08\n\t"
+                                ".byte 0x7e, 0xe9")
+                       :: "c" (NULL) );
+        set_insn(vmovw_to_gpr);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovw_to_gpr) ||
+             regs.ecx != 0xbc00 )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing invpcid 16(%ecx),%%edx...");
     if ( stack_exec )
     {
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -585,7 +585,7 @@ static unsigned int decode_disp8scale(en
         break;
 
     case d8s_dq64:
-        return 2 + (s->op_bytes == 8);
+        return 1 + !s->fp16 + (s->op_bytes == 8);
     }
 
     switch ( s->simd_size )
@@ -1469,6 +1469,15 @@ int x86emul_decode(struct x86_emulate_st
                     s->fp16 = true;
                 s->simd_size = simd_none;
                 break;
+
+            case 0x6e: /* vmovw r/m16, xmm */
+                d = (d & ~SrcMask) | SrcMem16;
+                /* fall through */
+            case 0x7e: /* vmovw xmm, r/m16 */
+                if ( s->evex.pfx == vex_66 )
+                    s->fp16 = true;
+                s->simd_size = simd_none;
+                break;
             }
 
             /* Like above re-use twobyte_table[] here. */
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4390,6 +4390,15 @@ x86_emulate(
 
 #ifndef X86EMUL_NO_SIMD
 
+    case X86EMUL_OPC_EVEX_66(5, 0x7e): /* vmovw xmm,r/m16 */
+        ASSERT(dst.bytes >= 4);
+        if ( dst.type == OP_MEM )
+            dst.bytes = 2;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(5, 0x6e): /* vmovw r/m16,xmm */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
         generate_exception_if((evex.lr || evex.opmsk || evex.brs ||
@@ -7745,8 +7754,18 @@ x86_emulate(
 
 #ifndef X86EMUL_NO_SIMD
 
+    case X86EMUL_OPC_EVEX_F3(5, 0x10):   /* vmovsh m16,xmm{k} */
+                                         /* vmovsh xmm,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_F3(5, 0x11):   /* vmovsh xmm,m16{k} */
+                                         /* vmovsh xmm,xmm,xmm{k} */
+        generate_exception_if(evex.brs, EXC_UD);
+        if ( ea.type == OP_MEM )
+            d |= TwoOp;
+        else
+        {
     case X86EMUL_OPC_EVEX_F3(5, 0x51):   /* vsqrtsh xmm/m16,xmm,xmm{k} */
-        d &= ~TwoOp;
+            d &= ~TwoOp;
+        }
         /* fall through */
     case X86EMUL_OPC_EVEX(5, 0x51):      /* vsqrtph [xyz]mm/mem,[xyz]mm{k} */
     CASE_SIMD_SINGLE_FP(_EVEX, 5, 0x58): /* vadd{p,s}h [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */




* [PATCH v2 04/10] x86emul: handle AVX512-FP16 fma-like insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (2 preceding siblings ...)
  2023-04-03 14:57 ` [PATCH v2 03/10] x86emul: handle AVX512-FP16 move insns Jan Beulich
@ 2023-04-03 14:58 ` Jan Beulich
  2023-04-03 14:58 ` [PATCH v2 05/10] x86emul: handle AVX512-FP16 Map6 misc insns Jan Beulich
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:58 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

The Map6 encoding space is a very sparse clone of the "0f38" one. Once
again re-use that table: the entries corresponding to opcodes invalid in
Map6 are benign, provided simd_size is forced to something other than
simd_none (which prevents undue memory reads in the SrcMem handling early
in x86_emulate()).
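
To give one concrete pairing (illustration only): Map6 opcode 98 is
vfmadd132ph, mirroring 0f38 opcode 98 (vfmadd132p{s,d}), which is what
makes re-using ext0f38_table[] work out. For reference, the 132/213/231
suffixes encode the operand ordering of the fused operation (Intel
operand order, src3 being the mem/reg operand):

    vfmadd132ph:  dst = dst  * src3 + src2
    vfmadd213ph:  dst = src2 * dst  + src3
    vfmadd231ph:  dst = src2 * src3 + dst

The fmsub* forms subtract the last term instead of adding it, and the
fnm* forms additionally negate the product.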

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add comments.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -614,6 +614,36 @@ static const struct test avx512_fp16_all
     INSN(comish,          , map5, 2f,    el, fp16, el),
     INSN(divph,           , map5, 5e,    vl, fp16, vl),
     INSN(divsh,         f3, map5, 5e,    el, fp16, el),
+    INSN(fmadd132ph,    66, map6, 98,    vl, fp16, vl),
+    INSN(fmadd132sh,    66, map6, 99,    el, fp16, el),
+    INSN(fmadd213ph,    66, map6, a8,    vl, fp16, vl),
+    INSN(fmadd213sh,    66, map6, a9,    el, fp16, el),
+    INSN(fmadd231ph,    66, map6, b8,    vl, fp16, vl),
+    INSN(fmadd231sh,    66, map6, b9,    el, fp16, el),
+    INSN(fmaddsub132ph, 66, map6, 96,    vl, fp16, vl),
+    INSN(fmaddsub213ph, 66, map6, a6,    vl, fp16, vl),
+    INSN(fmaddsub231ph, 66, map6, b6,    vl, fp16, vl),
+    INSN(fmsub132ph,    66, map6, 9a,    vl, fp16, vl),
+    INSN(fmsub132sh,    66, map6, 9b,    el, fp16, el),
+    INSN(fmsub213ph,    66, map6, aa,    vl, fp16, vl),
+    INSN(fmsub213sh,    66, map6, ab,    el, fp16, el),
+    INSN(fmsub231ph,    66, map6, ba,    vl, fp16, vl),
+    INSN(fmsub231sh,    66, map6, bb,    el, fp16, el),
+    INSN(fmsubadd132ph, 66, map6, 97,    vl, fp16, vl),
+    INSN(fmsubadd213ph, 66, map6, a7,    vl, fp16, vl),
+    INSN(fmsubadd231ph, 66, map6, b7,    vl, fp16, vl),
+    INSN(fnmadd132ph,   66, map6, 9c,    vl, fp16, vl),
+    INSN(fnmadd132sh,   66, map6, 9d,    el, fp16, el),
+    INSN(fnmadd213ph,   66, map6, ac,    vl, fp16, vl),
+    INSN(fnmadd213sh,   66, map6, ad,    el, fp16, el),
+    INSN(fnmadd231ph,   66, map6, bc,    vl, fp16, vl),
+    INSN(fnmadd231sh,   66, map6, bd,    el, fp16, el),
+    INSN(fnmsub132ph,   66, map6, 9e,    vl, fp16, vl),
+    INSN(fnmsub132sh,   66, map6, 9f,    el, fp16, el),
+    INSN(fnmsub213ph,   66, map6, ae,    vl, fp16, vl),
+    INSN(fnmsub213sh,   66, map6, af,    el, fp16, el),
+    INSN(fnmsub231ph,   66, map6, be,    vl, fp16, vl),
+    INSN(fnmsub231sh,   66, map6, bf,    el, fp16, el),
     INSN(fpclassph,       , 0f3a, 66,    vl, fp16, vl),
     INSN(fpclasssh,       , 0f3a, 67,    el, fp16, el),
     INSN(getmantph,       , 0f3a, 26,    vl, fp16, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2049,6 +2049,37 @@ static const struct evex {
     { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */
     { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */
     { { 0x7e }, 2, T, W, pfx_66, WIG, L0 }, /* vmovw */
+}, evex_map6[] = {
+    { { 0x96 }, 2, T, R, pfx_66, W0, Ln }, /* vfmaddsub132ph */
+    { { 0x97 }, 2, T, R, pfx_66, W0, Ln }, /* vfmsubadd132ph */
+    { { 0x98 }, 2, T, R, pfx_66, W0, Ln }, /* vfmadd132ph */
+    { { 0x99 }, 2, T, R, pfx_66, W0, LIG }, /* vfmadd132sh */
+    { { 0x9a }, 2, T, R, pfx_66, W0, Ln }, /* vfmsub132ph */
+    { { 0x9b }, 2, T, R, pfx_66, W0, LIG }, /* vfmsub132sh */
+    { { 0x9c }, 2, T, R, pfx_66, W0, Ln }, /* vfnmadd132ph */
+    { { 0x9d }, 2, T, R, pfx_66, W0, LIG }, /* vfnmadd132sh */
+    { { 0x9e }, 2, T, R, pfx_66, W0, Ln }, /* vfnmsub132ph */
+    { { 0x9f }, 2, T, R, pfx_66, W0, LIG }, /* vfnmsub132sh */
+    { { 0xa6 }, 2, T, R, pfx_66, W0, Ln }, /* vfmaddsub213ph */
+    { { 0xa7 }, 2, T, R, pfx_66, W0, Ln }, /* vfmsubadd213ph */
+    { { 0xa8 }, 2, T, R, pfx_66, W0, Ln }, /* vfmadd213ph */
+    { { 0xa9 }, 2, T, R, pfx_66, W0, LIG }, /* vfmadd213sh */
+    { { 0xaa }, 2, T, R, pfx_66, W0, Ln }, /* vfmsub213ph */
+    { { 0xab }, 2, T, R, pfx_66, W0, LIG }, /* vfmsub213sh */
+    { { 0xac }, 2, T, R, pfx_66, W0, Ln }, /* vfnmadd213ph */
+    { { 0xad }, 2, T, R, pfx_66, W0, LIG }, /* vfnmadd213sh */
+    { { 0xae }, 2, T, R, pfx_66, W0, Ln }, /* vfnmsub213ph */
+    { { 0xaf }, 2, T, R, pfx_66, W0, LIG }, /* vfnmsub213sh */
+    { { 0xb6 }, 2, T, R, pfx_66, W0, Ln }, /* vfmaddsub231ph */
+    { { 0xb7 }, 2, T, R, pfx_66, W0, Ln }, /* vfmsubadd231ph */
+    { { 0xb8 }, 2, T, R, pfx_66, W0, Ln }, /* vfmadd231ph */
+    { { 0xb9 }, 2, T, R, pfx_66, W0, LIG }, /* vfmadd231sh */
+    { { 0xba }, 2, T, R, pfx_66, W0, Ln }, /* vfmsub231ph */
+    { { 0xbb }, 2, T, R, pfx_66, W0, LIG }, /* vfmsub231sh */
+    { { 0xbc }, 2, T, R, pfx_66, W0, Ln }, /* vfnmadd231ph */
+    { { 0xbd }, 2, T, R, pfx_66, W0, LIG }, /* vfnmadd231sh */
+    { { 0xbe }, 2, T, R, pfx_66, W0, Ln }, /* vfnmsub231ph */
+    { { 0xbf }, 2, T, R, pfx_66, W0, LIG }, /* vfnmsub231sh */
 };
 
 static const struct {
@@ -2060,6 +2091,7 @@ static const struct {
     { evex_0f3a, ARRAY_SIZE(evex_0f3a) },
     { NULL,      0 },
     { evex_map5, ARRAY_SIZE(evex_map5) },
+    { evex_map6, ARRAY_SIZE(evex_map6) },
 };
 
 #undef Wn
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -1235,6 +1235,20 @@ int x86emul_decode(struct x86_emulate_st
                         d = twobyte_table[b].desc;
                         s->simd_size = twobyte_table[b].size ?: simd_other;
                         break;
+
+                    case evex_map6:
+                        if ( !evex_encoded() )
+                        {
+                            rc = X86EMUL_UNRECOGNIZED;
+                            goto done;
+                        }
+                        opcode |= MASK_INSR(6, X86EMUL_OPC_EXT_MASK);
+                        /*
+                         * Re-use twobyte_table[]'s 0x38 entry here, for the
+                         * similarity of the 0F38 entries with map 6.
+                         */
+                        d = twobyte_table[0x38].desc;
+                        break;
                     }
                 }
                 else if ( s->ext < ext_8f08 + ARRAY_SIZE(xop_table) )
@@ -1484,6 +1498,28 @@ int x86emul_decode(struct x86_emulate_st
             disp8scale = decode_disp8scale(twobyte_table[b].d8s, s);
             break;
 
+        case ext_map6:
+            /*
+             * Re-use ext0f38_table[] here, for the similarity of the entries
+             * valid in map 6.
+             */
+            d = ext0f38_table[b].to_mem ? DstMem | SrcReg
+                                        : DstReg | SrcMem;
+            if ( ext0f38_table[b].two_op )
+                d |= TwoOp;
+            s->simd_size = ext0f38_table[b].simd_size ?: simd_other;
+
+            switch ( b )
+            {
+            default:
+                if ( s->evex.pfx == vex_66 )
+                    s->fp16 = true;
+                break;
+            }
+
+            disp8scale = decode_disp8scale(ext0f38_table[b].d8s, s);
+            break;
+
         case ext_8f09:
             if ( ext8f09_table[b].two_op )
                 d |= TwoOp;
@@ -1703,6 +1739,7 @@ int x86emul_decode(struct x86_emulate_st
         break;
 
     case ext_map5:
+    case ext_map6:
     case ext_8f09:
     case ext_8f0a:
         break;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -196,6 +196,7 @@ enum vex_opcx {
     vex_0f38,
     vex_0f3a,
     evex_map5 = 5,
+    evex_map6,
 };
 
 enum vex_pfx {
@@ -251,6 +252,7 @@ struct x86_emulate_state {
         ext_0f38 = vex_0f38,
         ext_0f3a = vex_0f3a,
         ext_map5 = evex_map5,
+        ext_map6 = evex_map6,
         /*
          * For XOP use values such that the respective instruction field
          * can be used without adjustment.
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7778,6 +7778,49 @@ x86_emulate(
         generate_exception_if(evex.w, EXC_UD);
         goto avx512f_all_fp;
 
+    case X86EMUL_OPC_EVEX_66(6, 0x96): /* vfmaddsub132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x97): /* vfmsubadd132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x98): /* vfmadd132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x9a): /* vfmsub132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x9c): /* vfnmadd132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x9e): /* vfnmsub132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xa6): /* vfmaddsub213ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xa7): /* vfmsubadd213ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xa8): /* vfmadd213ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xaa): /* vfmsub213ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xac): /* vfnmadd213ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xae): /* vfnmsub213ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xb6): /* vfmaddsub231ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xb7): /* vfmsubadd231ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xb8): /* vfmadd231ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xba): /* vfmsub231ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xbc): /* vfnmadd231ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xbe): /* vfnmsub231ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(6, 0x99): /* vfmadd132sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x9b): /* vfmsub132sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x9d): /* vfnmadd132sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x9f): /* vfnmsub132sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xa9): /* vfmadd213sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xab): /* vfmsub213sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xad): /* vfnmadd213sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xaf): /* vfnmsub213sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xb9): /* vfmadd231sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xbb): /* vfmsub231sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xbd): /* vfnmadd231sh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0xbf): /* vfnmsub231sh xmm/m16,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs),
+                              EXC_UD);
+        if ( !evex.brs )
+            avx512_vlen_check(true);
+        goto simd_zmm;
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -620,6 +620,7 @@ struct x86_emulate_ctxt
  *  0x0f38xxxx for 0f38-prefixed opcodes (or their VEX/EVEX equivalents)
  *  0x0f3axxxx for 0f3a-prefixed opcodes (or their VEX/EVEX equivalents)
  *     0x5xxxx for Map5 opcodes (EVEX only)
+ *     0x6xxxx for Map6 opcodes (EVEX only)
  *  0x8f08xxxx for 8f/8-prefixed XOP opcodes
  *  0x8f09xxxx for 8f/9-prefixed XOP opcodes
  *  0x8f0axxxx for 8f/a-prefixed XOP opcodes




* [PATCH v2 05/10] x86emul: handle AVX512-FP16 Map6 misc insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (3 preceding siblings ...)
  2023-04-03 14:58 ` [PATCH v2 04/10] x86emul: handle AVX512-FP16 fma-like insns Jan Beulich
@ 2023-04-03 14:58 ` Jan Beulich
  2023-04-03 14:58 ` [PATCH v2 06/10] x86emul: handle AVX512-FP16 complex multiplication insns Jan Beulich
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:58 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

As before, this leverages the Map6 encoding space being a very sparse
clone of the "0f38" one. However, switch around the simd_size overriding
for opcode 2D, such that fewer separate overrides are needed.
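
Concretely (a sketch of the net effect; see the decode.c hunk below):

    /* Before: table says packed FP; EVEX-66 2D (vscalefs{s,d}) overridden. */
    [0x2d] = { .simd_size = simd_packed_fp, .d8s = d8s_dq },
    /* After: table says scalar, which also fits Map6 2D (vscalefsh);
     * only VEX-66 2D (vmaskmovpd) is overridden back to packed FP. */
    [0x2d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },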

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -646,6 +646,8 @@ static const struct test avx512_fp16_all
     INSN(fnmsub231sh,   66, map6, bf,    el, fp16, el),
     INSN(fpclassph,       , 0f3a, 66,    vl, fp16, vl),
     INSN(fpclasssh,       , 0f3a, 67,    el, fp16, el),
+    INSN(getexpph,      66, map6, 42,    vl, fp16, vl),
+    INSN(getexpsh,      66, map6, 43,    el, fp16, el),
     INSN(getmantph,       , 0f3a, 26,    vl, fp16, vl),
     INSN(getmantsh,       , 0f3a, 27,    el, fp16, el),
     INSN(maxph,           , map5, 5f,    vl, fp16, vl),
@@ -656,10 +658,16 @@ static const struct test avx512_fp16_all
     INSN(movsh,         f3, map5, 11,    el, fp16, el),
     INSN(mulph,           , map5, 59,    vl, fp16, vl),
     INSN(mulsh,         f3, map5, 59,    el, fp16, el),
+    INSN(rcpph,         66, map6, 4c,    vl, fp16, vl),
+    INSN(rcpsh,         66, map6, 4d,    el, fp16, el),
     INSN(reduceph,        , 0f3a, 56,    vl, fp16, vl),
     INSN(reducesh,        , 0f3a, 57,    el, fp16, el),
     INSN(rndscaleph,      , 0f3a, 08,    vl, fp16, vl),
     INSN(rndscalesh,      , 0f3a, 0a,    el, fp16, el),
+    INSN(rsqrtph,       66, map6, 4e,    vl, fp16, vl),
+    INSN(rsqrtsh,       66, map6, 4f,    el, fp16, el),
+    INSN(scalefph,      66, map6, 2c,    vl, fp16, vl),
+    INSN(scalefsh,      66, map6, 2d,    el, fp16, el),
     INSN(sqrtph,          , map5, 51,    vl, fp16, vl),
     INSN(sqrtsh,        f3, map5, 51,    el, fp16, el),
     INSN(subph,           , map5, 5c,    vl, fp16, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2050,6 +2050,14 @@ static const struct evex {
     { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */
     { { 0x7e }, 2, T, W, pfx_66, WIG, L0 }, /* vmovw */
 }, evex_map6[] = {
+    { { 0x2c }, 2, T, R, pfx_66, W0, Ln }, /* vscalefph */
+    { { 0x2d }, 2, T, R, pfx_66, W0, LIG }, /* vscalefsh */
+    { { 0x42 }, 2, T, R, pfx_66, W0, Ln }, /* vgetexpph */
+    { { 0x43 }, 2, T, R, pfx_66, W0, LIG }, /* vgetexpsh */
+    { { 0x4c }, 2, T, R, pfx_66, W0, Ln }, /* vrcpph */
+    { { 0x4d }, 2, T, R, pfx_66, W0, LIG }, /* vrcpsh */
+    { { 0x4e }, 2, T, R, pfx_66, W0, Ln }, /* vrsqrtph */
+    { { 0x4f }, 2, T, R, pfx_66, W0, LIG }, /* vrsqrtsh */
     { { 0x96 }, 2, T, R, pfx_66, W0, Ln }, /* vfmaddsub132ph */
     { { 0x97 }, 2, T, R, pfx_66, W0, Ln }, /* vfmsubadd132ph */
     { { 0x98 }, 2, T, R, pfx_66, W0, Ln }, /* vfmadd132ph */
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -358,7 +358,7 @@ static const struct ext0f38_table {
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2c] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
-    [0x2d] = { .simd_size = simd_packed_fp, .d8s = d8s_dq },
+    [0x2d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x31] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
@@ -909,8 +909,8 @@ decode_0f38(struct x86_emulate_state *s,
         ctxt->opcode |= MASK_INSR(s->vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
-    case X86EMUL_OPC_EVEX_66(0, 0x2d): /* vscalefs{s,d} */
-        s->simd_size = simd_scalar_vexw;
+    case X86EMUL_OPC_VEX_66(0, 0x2d): /* vmaskmovpd */
+        s->simd_size = simd_packed_fp;
         break;
 
     case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7778,6 +7778,8 @@ x86_emulate(
         generate_exception_if(evex.w, EXC_UD);
         goto avx512f_all_fp;
 
+    case X86EMUL_OPC_EVEX_66(6, 0x2c): /* vscalefph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x42): /* vgetexpph [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x96): /* vfmaddsub132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x97): /* vfmsubadd132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x98): /* vfmadd132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -7802,6 +7804,8 @@ x86_emulate(
             avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(6, 0x2d): /* vscalefsh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x43): /* vgetexpsh xmm/m16,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x99): /* vfmadd132sh xmm/m16,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x9b): /* vfmsub132sh xmm/m16,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x9d): /* vfnmadd132sh xmm/m16,xmm,xmm{k} */
@@ -7821,6 +7825,19 @@ x86_emulate(
             avx512_vlen_check(true);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(6, 0x4c): /* vrcpph [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x4e): /* vrsqrtph [xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(6, 0x4d): /* vrcpsh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(6, 0x4f): /* vrsqrtsh xmm/m16,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w || evex.brs, EXC_UD);
+        avx512_vlen_check(true);
+        goto simd_zmm;
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */




* [PATCH v2 06/10] x86emul: handle AVX512-FP16 complex multiplication insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (4 preceding siblings ...)
  2023-04-03 14:58 ` [PATCH v2 05/10] x86emul: handle AVX512-FP16 Map6 misc insns Jan Beulich
@ 2023-04-03 14:58 ` Jan Beulich
  2023-04-03 14:59 ` [PATCH v2 07/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int16 insns Jan Beulich
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:58 UTC
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Aspects to consider are that these insns use a 32-bit element size
(pairs of FP16 values forming complex numbers), and that there are
restrictions on which registers are valid to use.
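
For reference (per-element semantics, illustration only): each 32-bit
element holds a complex number as an FP16 pair, and the multiplication
follows the usual rule, with the "c" forms conjugating one of the
sources:

    vfmulcph:  (a + b*i) * (c + d*i) = (a*c - b*d) + (a*d + b*c)*i
    vfcmulcph: (a + b*i) * (c - d*i) = (a*c + b*d) + (b*c - a*d)*i

The vf{,c}maddc{p,s}h forms additionally accumulate the product into
the destination.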

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -614,12 +614,18 @@ static const struct test avx512_fp16_all
     INSN(comish,          , map5, 2f,    el, fp16, el),
     INSN(divph,           , map5, 5e,    vl, fp16, vl),
     INSN(divsh,         f3, map5, 5e,    el, fp16, el),
+    INSNX(fcmaddcph,    f2, map6, 56, 1, vl,    d, vl),
+    INSNX(fcmaddcsh,    f2, map6, 57, 1, el,    d, el),
+    INSNX(fcmulcph,     f2, map6, d6, 1, vl,    d, vl),
+    INSNX(fcmulcsh,     f2, map6, d7, 1, el,    d, el),
     INSN(fmadd132ph,    66, map6, 98,    vl, fp16, vl),
     INSN(fmadd132sh,    66, map6, 99,    el, fp16, el),
     INSN(fmadd213ph,    66, map6, a8,    vl, fp16, vl),
     INSN(fmadd213sh,    66, map6, a9,    el, fp16, el),
     INSN(fmadd231ph,    66, map6, b8,    vl, fp16, vl),
     INSN(fmadd231sh,    66, map6, b9,    el, fp16, el),
+    INSNX(fmaddcph,     f3, map6, 56, 1, vl,    d, vl),
+    INSNX(fmaddcsh,     f3, map6, 57, 1, el,    d, el),
     INSN(fmaddsub132ph, 66, map6, 96,    vl, fp16, vl),
     INSN(fmaddsub213ph, 66, map6, a6,    vl, fp16, vl),
     INSN(fmaddsub231ph, 66, map6, b6,    vl, fp16, vl),
@@ -632,6 +638,8 @@ static const struct test avx512_fp16_all
     INSN(fmsubadd132ph, 66, map6, 97,    vl, fp16, vl),
     INSN(fmsubadd213ph, 66, map6, a7,    vl, fp16, vl),
     INSN(fmsubadd231ph, 66, map6, b7,    vl, fp16, vl),
+    INSNX(fmulcph,      f3, map6, d6, 1, vl,    d, vl),
+    INSNX(fmulcsh,      f3, map6, d7, 1, el,    d, el),
     INSN(fnmadd132ph,   66, map6, 9c,    vl, fp16, vl),
     INSN(fnmadd132sh,   66, map6, 9d,    el, fp16, el),
     INSN(fnmadd213ph,   66, map6, ac,    vl, fp16, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2058,6 +2058,10 @@ static const struct evex {
     { { 0x4d }, 2, T, R, pfx_66, W0, LIG }, /* vrcpsh */
     { { 0x4e }, 2, T, R, pfx_66, W0, Ln }, /* vrsqrtph */
     { { 0x4f }, 2, T, R, pfx_66, W0, LIG }, /* vrsqrtsh */
+    { { 0x56 }, 2, T, R, pfx_f3, W0, Ln }, /* vfmaddcph */
+    { { 0x56 }, 2, T, R, pfx_f2, W0, Ln }, /* vfcmaddcph */
+    { { 0x57 }, 2, T, R, pfx_f3, W0, LIG }, /* vfmaddcsh */
+    { { 0x57 }, 2, T, R, pfx_f2, W0, LIG }, /* vfcmaddcsh */
     { { 0x96 }, 2, T, R, pfx_66, W0, Ln }, /* vfmaddsub132ph */
     { { 0x97 }, 2, T, R, pfx_66, W0, Ln }, /* vfmsubadd132ph */
     { { 0x98 }, 2, T, R, pfx_66, W0, Ln }, /* vfmadd132ph */
@@ -2088,6 +2092,10 @@ static const struct evex {
     { { 0xbd }, 2, T, R, pfx_66, W0, LIG }, /* vfnmadd231sh */
     { { 0xbe }, 2, T, R, pfx_66, W0, Ln }, /* vfnmsub231ph */
     { { 0xbf }, 2, T, R, pfx_66, W0, LIG }, /* vfnmsub231sh */
+    { { 0xd6 }, 2, T, R, pfx_f3, W0, Ln }, /* vfmulcph */
+    { { 0xd6 }, 2, T, R, pfx_f2, W0, Ln }, /* vfcmulcph */
+    { { 0xd7 }, 2, T, R, pfx_f3, W0, LIG }, /* vfmulcsh */
+    { { 0xd7 }, 2, T, R, pfx_f2, W0, LIG }, /* vfcmulcsh */
 };
 
 static const struct {
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -379,6 +379,8 @@ static const struct ext0f38_table {
     [0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x50 ... 0x53] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
+    [0x56] = { .simd_size = simd_other, .d8s = d8s_vl },
+    [0x57] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
     [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
@@ -441,6 +443,8 @@ static const struct ext0f38_table {
     [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0xd6] = { .simd_size = simd_other, .d8s = d8s_vl },
+    [0xd7] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xf0] = { .two_op = 1 },
@@ -1515,6 +1519,10 @@ int x86emul_decode(struct x86_emulate_st
                 if ( s->evex.pfx == vex_66 )
                     s->fp16 = true;
                 break;
+
+            case 0x56: case 0x57: /* vf{,c}maddc{p,s}h */
+            case 0xd6: case 0xd7: /* vf{,c}mulc{p,s}h */
+                break;
             }
 
             disp8scale = decode_disp8scale(ext0f38_table[b].d8s, s);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7838,6 +7838,34 @@ x86_emulate(
         avx512_vlen_check(true);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_F3(6, 0x56): /* vfmaddcph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(6, 0x56): /* vfcmaddcph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(6, 0xd6): /* vfmulcph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(6, 0xd6): /* vfcmulcph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_F3(6, 0x57): /* vfmaddcsh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_F2(6, 0x57): /* vfcmaddcsh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_F3(6, 0xd7): /* vfmulcsh xmm/m16,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_F2(6, 0xd7): /* vfcmulcsh xmm/m16,xmm,xmm{k} */
+    {
+        unsigned int src1 = ~evex.reg;
+
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w || ((b & 1) && ea.type != OP_REG && evex.brs),
+                              EXC_UD);
+        if ( mode_64bit() )
+            src1 = (src1 & 0xf) | (!evex.RX << 4);
+        else
+            src1 &= 7;
+        generate_exception_if(modrm_reg == src1 ||
+                              (ea.type != OP_MEM && modrm_reg == modrm_rm),
+                              EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(b & 1);
+        goto simd_zmm;
+    }
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 07/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int16 insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (5 preceding siblings ...)
  2023-04-03 14:58 ` [PATCH v2 06/10] x86emul: handle AVX512-FP16 complex multiplication insns Jan Beulich
@ 2023-04-03 14:59 ` Jan Beulich
  2023-04-03 14:59 ` [PATCH v2 08/10] x86emul: handle AVX512-FP16 floating point conversion insns Jan Beulich
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:59 UTC (permalink / raw
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

These are easiest in that they have same-size source and destination
vectors, yet they're different from other conversion insns in that they
use opcodes which have different meaning in the 0F encoding space
({,V}H{ADD,SUB}P{S,D}), hence requiring a little bit of overriding.
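
Purely as an illustration (not part of the patch), a plain-C sketch of
one 128-bit lane's worth of vcvtw2ph / vcvt{,t}ph2w is below. It assumes
_Float16 support in the compiler and glosses over the MXCSR-rounding vs
truncation distinction between the non-"t" and "t" forms:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int16_t w[8] = { -3, -2, -1, 0, 1, 2, 3, 4 };
    _Float16 h[8];
    int16_t back[8];
    unsigned int i;

    for ( i = 0; i < 8; ++i )
        h[i] = w[i];                /* vcvtw2ph: int16 -> FP16 */
    for ( i = 0; i < 8; ++i )
        back[i] = (int16_t)h[i];    /* vcvt{,t}ph2w: FP16 -> int16 */

    for ( i = 0; i < 8; ++i )
        printf("%d -> %d\n", w[i], back[i]);

    return 0;
}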

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -612,6 +612,12 @@ static const struct test avx512_fp16_all
     INSN(cmpph,           , 0f3a, c2,    vl, fp16, vl),
     INSN(cmpsh,         f3, 0f3a, c2,    el, fp16, el),
     INSN(comish,          , map5, 2f,    el, fp16, el),
+    INSN(cvtph2uw,        , map5, 7d,    vl, fp16, vl),
+    INSN(cvtph2w,       66, map5, 7d,    vl, fp16, vl),
+    INSN(cvttph2uw,       , map5, 7c,    vl, fp16, vl),
+    INSN(cvttph2w,      66, map5, 7c,    vl, fp16, vl),
+    INSN(cvtuw2ph,      f2, map5, 7d,    vl, fp16, vl),
+    INSN(cvtw2ph,       f3, map5, 7d,    vl, fp16, vl),
     INSN(divph,           , map5, 5e,    vl, fp16, vl),
     INSN(divsh,         f3, map5, 5e,    el, fp16, el),
     INSNX(fcmaddcph,    f2, map6, 56, 1, vl,    d, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2048,6 +2048,12 @@ static const struct evex {
     { { 0x5f }, 2, T, R, pfx_no, W0, Ln }, /* vmaxph */
     { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */
     { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */
+    { { 0x7c }, 2, T, R, pfx_no, W0, Ln }, /* vcvttph2uw */
+    { { 0x7c }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2w */
+    { { 0x7d }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2uw */
+    { { 0x7d }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2w */
+    { { 0x7d }, 2, T, R, pfx_f3, W0, Ln }, /* vcvtw2ph */
+    { { 0x7d }, 2, T, R, pfx_f2, W0, Ln }, /* vcvtuw2ph */
     { { 0x7e }, 2, T, W, pfx_66, WIG, L0 }, /* vmovw */
 }, evex_map6[] = {
     { { 0x2c }, 2, T, R, pfx_66, W0, Ln }, /* vscalefph */
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -259,7 +259,7 @@ static const struct twobyte_table {
     [0x78 ... 0x79] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl },
     [0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
     [0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_dq64 },
-    [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
     [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 },
     [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x80 ... 0x8f] = { DstImplicit|SrcImm },
@@ -1496,6 +1496,12 @@ int x86emul_decode(struct x86_emulate_st
                     s->fp16 = true;
                 s->simd_size = simd_none;
                 break;
+
+            case 0x7c: /* vcvttph2{,u}w */
+            case 0x7d: /* vcvtph2{,u}w / vcvt{,u}w2ph */
+                d = DstReg | SrcMem | TwoOp;
+                s->fp16 = true;
+                break;
             }
 
             /* Like above re-use twobyte_table[] here. */
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7778,6 +7778,14 @@ x86_emulate(
         generate_exception_if(evex.w, EXC_UD);
         goto avx512f_all_fp;
 
+    case X86EMUL_OPC_EVEX   (5, 0x7c): /* vcvttph2uw [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x7c): /* vcvttph2w [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX   (5, 0x7d): /* vcvtph2uw [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x7d): /* vcvtph2w [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(5, 0x7d): /* vcvtw2ph [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(5, 0x7d): /* vcvtuw2ph [xyz]mm/mem,[xyz]mm{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(6, 0x2c): /* vscalefph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x42): /* vgetexpph [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x96): /* vfmaddsub132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 08/10] x86emul: handle AVX512-FP16 floating point conversion insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (6 preceding siblings ...)
  2023-04-03 14:59 ` [PATCH v2 07/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int16 insns Jan Beulich
@ 2023-04-03 14:59 ` Jan Beulich
  2023-04-03 15:00 ` [PATCH v2 09/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int{32,64} insns Jan Beulich
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 14:59 UTC (permalink / raw
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -612,8 +612,16 @@ static const struct test avx512_fp16_all
     INSN(cmpph,           , 0f3a, c2,    vl, fp16, vl),
     INSN(cmpsh,         f3, 0f3a, c2,    el, fp16, el),
     INSN(comish,          , map5, 2f,    el, fp16, el),
+    INSN(cvtpd2ph,      66, map5, 5a,    vl,    q, vl),
+    INSN(cvtph2pd,        , map5, 5a,  vl_4, fp16, vl),
+    INSN(cvtph2psx,     66, map6, 13,  vl_2, fp16, vl),
     INSN(cvtph2uw,        , map5, 7d,    vl, fp16, vl),
     INSN(cvtph2w,       66, map5, 7d,    vl, fp16, vl),
+    INSN(cvtps2phx,     66, map5, 1d,    vl,    d, vl),
+    INSN(cvtsd2sh,      f2, map5, 5a,    el,    q, el),
+    INSN(cvtsh2sd,      f3, map5, 5a,    el, fp16, el),
+    INSN(cvtsh2ss,        , map6, 13,    el, fp16, el),
+    INSN(cvtss2sh,        , map5, 1d,    el,    d, el),
     INSN(cvttph2uw,       , map5, 7c,    vl, fp16, vl),
     INSN(cvttph2w,      66, map5, 7c,    vl, fp16, vl),
     INSN(cvtuw2ph,      f2, map5, 7d,    vl, fp16, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2031,6 +2031,8 @@ static const struct evex {
 }, evex_map5[] = {
     { { 0x10 }, 2, T, R, pfx_f3, W0, LIG }, /* vmovsh */
     { { 0x11 }, 2, T, W, pfx_f3, W0, LIG }, /* vmovsh */
+    { { 0x1d }, 2, T, R, pfx_66, W0, Ln }, /* vcvtps2phx */
+    { { 0x1d }, 2, T, R, pfx_no, W0, LIG }, /* vcvtss2sh */
     { { 0x2e }, 2, T, R, pfx_no, W0, LIG }, /* vucomish */
     { { 0x2f }, 2, T, R, pfx_no, W0, LIG }, /* vcomish */
     { { 0x51 }, 2, T, R, pfx_no, W0, Ln }, /* vsqrtph */
@@ -2039,6 +2041,10 @@ static const struct evex {
     { { 0x58 }, 2, T, R, pfx_f3, W0, LIG }, /* vaddsh */
     { { 0x59 }, 2, T, R, pfx_no, W0, Ln }, /* vmulph */
     { { 0x59 }, 2, T, R, pfx_f3, W0, LIG }, /* vmulsh */
+    { { 0x5a }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2pd */
+    { { 0x5a }, 2, T, R, pfx_66, W1, Ln }, /* vcvtpd2ph */
+    { { 0x5a }, 2, T, R, pfx_f3, W0, LIG }, /* vcvtsh2sd */
+    { { 0x5a }, 2, T, R, pfx_f2, W1, LIG }, /* vcvtsd2sh */
     { { 0x5c }, 2, T, R, pfx_no, W0, Ln }, /* vsubph */
     { { 0x5c }, 2, T, R, pfx_f3, W0, LIG }, /* vsubsh */
     { { 0x5d }, 2, T, R, pfx_no, W0, Ln }, /* vminph */
@@ -2056,6 +2062,8 @@ static const struct evex {
     { { 0x7d }, 2, T, R, pfx_f2, W0, Ln }, /* vcvtuw2ph */
     { { 0x7e }, 2, T, W, pfx_66, WIG, L0 }, /* vmovw */
 }, evex_map6[] = {
+    { { 0x13 }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2psx */
+    { { 0x13 }, 2, T, R, pfx_no, W0, LIG }, /* vcvtsh2ss */
     { { 0x2c }, 2, T, R, pfx_66, W0, Ln }, /* vscalefph */
     { { 0x2d }, 2, T, R, pfx_66, W0, LIG }, /* vscalefsh */
     { { 0x42 }, 2, T, R, pfx_66, W0, Ln }, /* vgetexpph */
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -224,7 +224,9 @@ static const struct twobyte_table {
     [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, 3 },
     [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
-    [0x18 ... 0x1f] = { ImplicitOps|ModRM },
+    [0x18 ... 0x1c] = { ImplicitOps|ModRM },
+    [0x1d] = { ImplicitOps|ModRM, simd_none, d8s_vl },
+    [0x1e ... 0x1f] = { ImplicitOps|ModRM },
     [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
     [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
     [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
@@ -1482,6 +1484,19 @@ int x86emul_decode(struct x86_emulate_st
                     s->fp16 = true;
                 break;
 
+            case 0x1d: /* vcvtps2phx / vcvtss2sh */
+                if ( s->evex.pfx & VEX_PREFIX_SCALAR_MASK )
+                    break;
+                d = DstReg | SrcMem;
+                if ( s->evex.pfx & VEX_PREFIX_DOUBLE_MASK )
+                {
+                    s->simd_size = simd_packed_fp;
+                    d |= TwoOp;
+                }
+                else
+                    s->simd_size = simd_scalar_vexw;
+                break;
+
             case 0x2e: case 0x2f: /* v{,u}comish */
                 if ( !s->evex.pfx )
                     s->fp16 = true;
@@ -1506,6 +1521,15 @@ int x86emul_decode(struct x86_emulate_st
 
             /* Like above re-use twobyte_table[] here. */
             disp8scale = decode_disp8scale(twobyte_table[b].d8s, s);
+
+            switch ( b )
+            {
+            case 0x5a: /* vcvtph2pd needs special casing */
+                if ( !s->evex.pfx && !s->evex.brs )
+                    disp8scale -= 2;
+                break;
+            }
+
             break;
 
         case ext_map6:
@@ -1526,6 +1550,17 @@ int x86emul_decode(struct x86_emulate_st
                     s->fp16 = true;
                 break;
 
+            case 0x13: /* vcvtph2psx / vcvtsh2ss */
+                if ( s->evex.pfx & VEX_PREFIX_SCALAR_MASK )
+                    break;
+                s->fp16 = true;
+                if ( !(s->evex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+                {
+                    s->simd_size = simd_scalar_vexw;
+                    d &= ~TwoOp;
+                }
+                break;
+
             case 0x56: case 0x57: /* vf{,c}maddc{p,s}h */
             case 0xd6: case 0xd7: /* vf{,c}mulc{p,s}h */
                 break;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7778,14 +7778,25 @@ x86_emulate(
         generate_exception_if(evex.w, EXC_UD);
         goto avx512f_all_fp;
 
+    CASE_SIMD_ALL_FP(_EVEX, 5, 0x5a):  /* vcvtp{h,d}2p{d,h} [xyz]mm/mem,[xyz]mm{k} */
+                                       /* vcvts{h,d}2s{d,h} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+            d &= ~TwoOp;
+        op_bytes = 2 << (((evex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + evex.lr) +
+                         2 * evex.w);
+        goto avx512f_all_fp;
+
     case X86EMUL_OPC_EVEX   (5, 0x7c): /* vcvttph2uw [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(5, 0x7c): /* vcvttph2w [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX   (5, 0x7d): /* vcvtph2uw [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(5, 0x7d): /* vcvtph2w [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F3(5, 0x7d): /* vcvtw2ph [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F2(5, 0x7d): /* vcvtuw2ph [xyz]mm/mem,[xyz]mm{k} */
-        op_bytes = 16 << evex.lr;
+    case X86EMUL_OPC_EVEX_66(6, 0x13): /* vcvtph2psx [xy]mm/mem,[xyz]mm{k} */
+        op_bytes = 8 << ((ext == ext_map5) + evex.lr);
         /* fall through */
+    case X86EMUL_OPC_EVEX_66(5, 0x1d): /* vcvtps2phx [xyz]mm/mem,[xy]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x2c): /* vscalefph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x42): /* vgetexpph [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x96): /* vfmaddsub132ph [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -7812,6 +7823,8 @@ x86_emulate(
             avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX(5, 0x1d):    /* vcvtss2sh xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX(6, 0x13):    /* vcvtsh2ss xmm/mem,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x2d): /* vscalefsh xmm/m16,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x43): /* vgetexpsh xmm/m16,xmm,xmm{k} */
     case X86EMUL_OPC_EVEX_66(6, 0x99): /* vfmadd132sh xmm/m16,xmm,xmm{k} */



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 09/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int{32,64} insns
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (7 preceding siblings ...)
  2023-04-03 14:59 ` [PATCH v2 08/10] x86emul: handle AVX512-FP16 floating point conversion insns Jan Beulich
@ 2023-04-03 15:00 ` Jan Beulich
  2023-04-03 15:00 ` [PATCH v2 10/10] x86emul: AVX512-FP16 testing Jan Beulich
  2023-05-22 16:25 ` [PATCH v2 00/10] x86: support AVX512-FP16 Andrew Cooper
  10 siblings, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 15:00 UTC (permalink / raw
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -612,18 +612,36 @@ static const struct test avx512_fp16_all
     INSN(cmpph,           , 0f3a, c2,    vl, fp16, vl),
     INSN(cmpsh,         f3, 0f3a, c2,    el, fp16, el),
     INSN(comish,          , map5, 2f,    el, fp16, el),
+    INSN(cvtdq2ph,        , map5, 5b,    vl,    d, vl),
     INSN(cvtpd2ph,      66, map5, 5a,    vl,    q, vl),
+    INSN(cvtph2dq,      66, map5, 5b,  vl_2, fp16, vl),
     INSN(cvtph2pd,        , map5, 5a,  vl_4, fp16, vl),
     INSN(cvtph2psx,     66, map6, 13,  vl_2, fp16, vl),
+    INSN(cvtph2qq,      66, map5, 7b,  vl_4, fp16, vl),
+    INSN(cvtph2udq,       , map5, 79,  vl_2, fp16, vl),
+    INSN(cvtph2uqq,     66, map5, 79,  vl_4, fp16, vl),
     INSN(cvtph2uw,        , map5, 7d,    vl, fp16, vl),
     INSN(cvtph2w,       66, map5, 7d,    vl, fp16, vl),
     INSN(cvtps2phx,     66, map5, 1d,    vl,    d, vl),
+    INSN(cvtqq2ph,        , map5, 5b,    vl,    q, vl),
     INSN(cvtsd2sh,      f2, map5, 5a,    el,    q, el),
     INSN(cvtsh2sd,      f3, map5, 5a,    el, fp16, el),
+    INSN(cvtsh2si,      f3, map5, 2d,    el, fp16, el),
     INSN(cvtsh2ss,        , map6, 13,    el, fp16, el),
+    INSN(cvtsh2usi,     f3, map5, 79,    el, fp16, el),
+    INSN(cvtsi2sh,      f3, map5, 2a,    el, dq64, el),
     INSN(cvtss2sh,        , map5, 1d,    el,    d, el),
+    INSN(cvttph2dq,     f3, map5, 5b,  vl_2, fp16, vl),
+    INSN(cvttph2qq,     66, map5, 7a,  vl_4, fp16, vl),
+    INSN(cvttph2udq,      , map5, 78,  vl_2, fp16, vl),
+    INSN(cvttph2uqq,    66, map5, 78,  vl_4, fp16, vl),
     INSN(cvttph2uw,       , map5, 7c,    vl, fp16, vl),
     INSN(cvttph2w,      66, map5, 7c,    vl, fp16, vl),
+    INSN(cvttsh2si,     f3, map5, 2c,    el, fp16, el),
+    INSN(cvttsh2usi,    f3, map5, 78,    el, fp16, el),
+    INSN(cvtudq2ph,     f2, map5, 7a,    vl,    d, vl),
+    INSN(cvtuqq2ph,     f2, map5, 7a,    vl,    q, vl),
+    INSN(cvtusi2sh,     f3, map5, 7b,    el, dq64, el),
     INSN(cvtuw2ph,      f2, map5, 7d,    vl, fp16, vl),
     INSN(cvtw2ph,       f3, map5, 7d,    vl, fp16, vl),
     INSN(divph,           , map5, 5e,    vl, fp16, vl),
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2033,6 +2033,9 @@ static const struct evex {
     { { 0x11 }, 2, T, W, pfx_f3, W0, LIG }, /* vmovsh */
     { { 0x1d }, 2, T, R, pfx_66, W0, Ln }, /* vcvtps2phx */
     { { 0x1d }, 2, T, R, pfx_no, W0, LIG }, /* vcvtss2sh */
+    { { 0x2a }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtsi2sh */
+    { { 0x2c }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvttsh2si */
+    { { 0x2d }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtsh2si */
     { { 0x2e }, 2, T, R, pfx_no, W0, LIG }, /* vucomish */
     { { 0x2f }, 2, T, R, pfx_no, W0, LIG }, /* vcomish */
     { { 0x51 }, 2, T, R, pfx_no, W0, Ln }, /* vsqrtph */
@@ -2045,6 +2048,10 @@ static const struct evex {
     { { 0x5a }, 2, T, R, pfx_66, W1, Ln }, /* vcvtpd2ph */
     { { 0x5a }, 2, T, R, pfx_f3, W0, LIG }, /* vcvtsh2sd */
     { { 0x5a }, 2, T, R, pfx_f2, W1, LIG }, /* vcvtsd2sh */
+    { { 0x5b }, 2, T, R, pfx_no, W0, Ln }, /* vcvtdq2ph */
+    { { 0x5b }, 2, T, R, pfx_no, W1, Ln }, /* vcvtqq2ph */
+    { { 0x5b }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2dq */
+    { { 0x5b }, 2, T, R, pfx_f3, W0, Ln }, /* vcvttph2dq */
     { { 0x5c }, 2, T, R, pfx_no, W0, Ln }, /* vsubph */
     { { 0x5c }, 2, T, R, pfx_f3, W0, LIG }, /* vsubsh */
     { { 0x5d }, 2, T, R, pfx_no, W0, Ln }, /* vminph */
@@ -2054,6 +2061,17 @@ static const struct evex {
     { { 0x5f }, 2, T, R, pfx_no, W0, Ln }, /* vmaxph */
     { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */
     { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */
+    { { 0x78 }, 2, T, R, pfx_no, W0, Ln }, /* vcvttph2udq */
+    { { 0x78 }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2uqq */
+    { { 0x78 }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvttsh2usi */
+    { { 0x79 }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2udq */
+    { { 0x79 }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2uqq */
+    { { 0x79 }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtsh2usi */
+    { { 0x7a }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2qq */
+    { { 0x7a }, 2, T, R, pfx_f2, W0, Ln }, /* vcvtudq2ph */
+    { { 0x7a }, 2, T, R, pfx_f2, W1, Ln }, /* vcvtuqq2ph */
+    { { 0x7b }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2qq */
+    { { 0x7b }, 2, T, R, pfx_f3, Wn, LIG }, /* vcvtusi2sh */
     { { 0x7c }, 2, T, R, pfx_no, W0, Ln }, /* vcvttph2uw */
     { { 0x7c }, 2, T, R, pfx_66, W0, Ln }, /* vcvttph2w */
     { { 0x7d }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2uw */
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -1497,12 +1497,25 @@ int x86emul_decode(struct x86_emulate_st
                     s->simd_size = simd_scalar_vexw;
                 break;
 
+            case 0x2a: /* vcvtsi2sh */
+                break;
+
+            case 0x2c: case 0x2d: /* vcvt{,t}sh2si */
+                if ( s->evex.pfx == vex_f3 )
+                    s->fp16 = true;
+                break;
+
             case 0x2e: case 0x2f: /* v{,u}comish */
                 if ( !s->evex.pfx )
                     s->fp16 = true;
                 s->simd_size = simd_none;
                 break;
 
+            case 0x5b: /* vcvt{d,q}q2ph, vcvt{,t}ph2dq */
+                if ( s->evex.pfx && s->evex.pfx != vex_f2 )
+                    s->fp16 = true;
+                break;
+
             case 0x6e: /* vmovw r/m16, xmm */
                 d = (d & ~SrcMask) | SrcMem16;
                 /* fall through */
@@ -1512,6 +1525,17 @@ int x86emul_decode(struct x86_emulate_st
                 s->simd_size = simd_none;
                 break;
 
+            case 0x78: case 0x79: /* vcvt{,t}ph2u{d,q}q, vcvt{,t}sh2usi */
+                if ( s->evex.pfx != vex_f2 )
+                    s->fp16 = true;
+                break;
+
+            case 0x7a: /* vcvttph2qq, vcvtu{d,q}q2ph */
+            case 0x7b: /* vcvtph2qq, vcvtusi2sh */
+                if ( s->evex.pfx == vex_66 )
+                    s->fp16 = true;
+                break;
+
             case 0x7c: /* vcvttph2{,u}w */
             case 0x7d: /* vcvtph2{,u}w / vcvt{,u}w2ph */
                 d = DstReg | SrcMem | TwoOp;
@@ -1524,10 +1548,34 @@ int x86emul_decode(struct x86_emulate_st
 
             switch ( b )
             {
+            case 0x78:
+            case 0x79:
+                /* vcvt{,t}ph2u{d,q}q need special casing */
+                if ( s->evex.pfx <= vex_66 )
+                {
+                    if ( !s->evex.brs )
+                        disp8scale -= 1 + (s->evex.pfx == vex_66);
+                    break;
+                }
+                /* vcvt{,t}sh2usi needs special casing: fall through */
+            case 0x2c: case 0x2d: /* vcvt{,t}sh2si need special casing */
+                disp8scale = 1;
+                break;
+
             case 0x5a: /* vcvtph2pd needs special casing */
                 if ( !s->evex.pfx && !s->evex.brs )
                     disp8scale -= 2;
                 break;
+
+            case 0x5b: /* vcvt{,t}ph2dq need special casing */
+                if ( s->evex.pfx && !s->evex.brs )
+                    --disp8scale;
+                break;
+
+            case 0x7a: case 0x7b: /* vcvt{,t}ph2qq need special casing */
+                if ( s->evex.pfx == vex_66 && !s->evex.brs )
+                    disp8scale = 2 + s->evex.lr;
+                break;
             }
 
             break;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3577,6 +3577,12 @@ x86_emulate(
         state->simd_size = simd_none;
         goto simd_0f_rm;
 
+#ifndef X86EMUL_NO_SIMD
+
+    case X86EMUL_OPC_EVEX_F3(5, 0x2a):      /* vcvtsi2sh r/m,xmm,xmm */
+    case X86EMUL_OPC_EVEX_F3(5, 0x7b):      /* vcvtusi2sh r/m,xmm,xmm */
+        host_and_vcpu_must_have(avx512_fp16);
+        /* fall through */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2a): /* vcvtsi2s{s,d} r/m,xmm,xmm */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x7b): /* vcvtusi2s{s,d} r/m,xmm,xmm */
         generate_exception_if(evex.opmsk || (ea.type != OP_REG && evex.brs),
@@ -3655,7 +3661,9 @@ x86_emulate(
             opc[1] = 0x01;
 
             rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
-                           vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4, ctxt);
+                           vex.pfx & VEX_PREFIX_DOUBLE_MASK
+                           ? 8 : 2 << !state->fp16,
+                           ctxt);
             if ( rc != X86EMUL_OKAY )
                 goto done;
         }
@@ -3685,6 +3693,12 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
+    case X86EMUL_OPC_EVEX_F3(5, 0x2c):      /* vcvttsh2si xmm/mem,reg */
+    case X86EMUL_OPC_EVEX_F3(5, 0x2d):      /* vcvtsh2si xmm/mem,reg */
+    case X86EMUL_OPC_EVEX_F3(5, 0x78):      /* vcvttsh2usi xmm/mem,reg */
+    case X86EMUL_OPC_EVEX_F3(5, 0x79):      /* vcvtsh2usi xmm/mem,reg */
+        host_and_vcpu_must_have(avx512_fp16);
+        /* fall through */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */
@@ -3756,8 +3770,6 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
-#ifndef X86EMUL_NO_SIMD
-
     case X86EMUL_OPC_EVEX(5, 0x2e): /* vucomish xmm/m16,xmm */
     case X86EMUL_OPC_EVEX(5, 0x2f): /* vcomish xmm/m16,xmm */
         host_and_vcpu_must_have(avx512_fp16);
@@ -7787,6 +7799,38 @@ x86_emulate(
                          2 * evex.w);
         goto avx512f_all_fp;
 
+    case X86EMUL_OPC_EVEX   (5, 0x5b): /* vcvtdq2ph [xyz]mm/mem,[xy]mm{k} */
+                                       /* vcvtqq2ph [xyz]mm/mem,xmm{k} */
+    case X86EMUL_OPC_EVEX_F2(5, 0x7a): /* vcvtudq2ph [xyz]mm/mem,[xy]mm{k} */
+                                       /* vcvtuqq2ph [xyz]mm/mem,xmm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 16 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(5, 0x5b): /* vcvtph2dq [xy]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(5, 0x5b): /* vcvttph2dq [xy]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX   (5, 0x78): /* vcvttph2udq [xy]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX   (5, 0x79): /* vcvtph2udq [xy]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 8 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(5, 0x78): /* vcvttph2uqq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x79): /* vcvtph2uqq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x7a): /* vcvttph2qq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(5, 0x7b): /* vcvtph2qq xmm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_fp16);
+        generate_exception_if(evex.w, EXC_UD);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 4 << (evex.w + evex.lr);
+        goto simd_zmm;
+
     case X86EMUL_OPC_EVEX   (5, 0x7c): /* vcvttph2uw [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(5, 0x7c): /* vcvttph2w [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX   (5, 0x7d): /* vcvtph2uw [xyz]mm/mem,[xyz]mm{k} */



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 10/10] x86emul: AVX512-FP16 testing
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (8 preceding siblings ...)
  2023-04-03 15:00 ` [PATCH v2 09/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int{32,64} insns Jan Beulich
@ 2023-04-03 15:00 ` Jan Beulich
  2023-06-05 13:08   ` Jan Beulich
  2023-05-22 16:25 ` [PATCH v2 00/10] x86: support AVX512-FP16 Andrew Cooper
  10 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2023-04-03 15:00 UTC (permalink / raw
  To: xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Naming of some of the builtins isn't fully consistent with that of pre-
existing ones, so there's a need for a new BR2() wrapper macro.

With the tests providing some proof of proper functioning of the
emulator code, also enable use of the feature by guests, as there's no
other infrastructure involved in enabling this ISA extension.
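
To make the naming difference concrete, here is a toy, self-contained
illustration; the demo512_* helpers are invented purely so the snippet
compiles on its own, while the real builtins of course keep their
__builtin_ia32_ prefix and signatures:

#include <stdio.h>

/* Shapes mirror the simd.h definitions: BR() appends the rounding
 * argument to a "<name>512<suffix>" builtin, while BR2() additionally
 * needs a "_round" tail in the builtin's name. */
#define BR(n, s, a...)  n ## 512 ## s(a, 4)
#define BR2(n, s, a...) n ## 512 ## s ## _round(a, 4)

static int demo512_mask(int x, int y, int rc)       { return x + y + rc; }
static int demo512_mask_round(int x, int y, int rc) { return x * y + rc; }

int main(void)
{
    printf("%d %d\n", BR(demo, _mask, 1, 2), BR2(demo, _mask, 1, 2));
    return 0;
}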

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add CHANGELOG.md entry.
---
This is Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> under the
condition that public/arch-x86/cpufeatureset.h use 'a', not 'A'. But I
was putting this under question, so far without further response.
---
SDE: -spr or -future
---
In the course of putting together the FMA part of the test I had noticed
that we no longer tested scalar FMA insns (FMA, FMA4, AVX512F), due to
gcc (then) no longer recognizing the pattern in version 9 or later. See
gcc bug 105965, which apparently has already gained a fix for version
13. (Using intrinsics for scalar operations is prohibitive, as they have
full-vector parameters.) I'm taking this as one of several reasons why
here I'm not even trying to make the compiler spot the complex FMA
patterns, using a mixture of intrinsics and inline assembly instead.
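
For completeness, a minimal stand-alone sketch of that inline-assembly
approach (outside of the harness' scalar_1op() machinery) is below. It
assumes gcc 12 or newer with -mavx512fp16 and a new enough assembler;
actually running it of course requires AVX512-FP16 hardware or SDE:

#include <stdio.h>

static _Float16 fp16_sqrt(_Float16 x)
{
    _Float16 __attribute__((vector_size(16))) t = { x };

    /* Same operand pattern as the harness' scalar_1op() uses. */
    asm ( "vsqrtsh %1, %0, %0" : "+v" (t) : "m" (x) );

    return t[0];
}

int main(void)
{
    printf("%g\n", (double)fp16_sqrt(2));
    return 0;
}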

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog
    - Bus-lock detection, used by Xen to mitigate (by rate-limiting) the system
      wide impact of a guest misusing atomic instructions.
  - xl/libxl can customize SMBIOS strings for HVM guests.
+ - x86 AVX512-FP16
 
 ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12
 
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi avx512fp16
 FMA := fma4 fma
 SG := avx2-sg avx512f-sg avx512vl-sg
 AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
@@ -91,6 +91,9 @@ avx512vbmi-vecs := $(avx512bw-vecs)
 avx512vbmi-ints := $(avx512bw-ints)
 avx512vbmi-flts := $(avx512bw-flts)
 avx512vbmi2-vecs := $(avx512bw-vecs)
+avx512fp16-vecs := $(avx512bw-vecs)
+avx512fp16-ints :=
+avx512fp16-flts := 2
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1 2
@@ -246,7 +249,7 @@ $(addsuffix .c,$(GF)):
 
 $(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)): simd.h
 
-xop.h avx512f.h: simd-fma.c
+xop.h avx512f.h avx512fp16.h: simd-fma.c
 
 endif # 32-bit override
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -20,6 +20,14 @@ ENTRY(simd_test);
     asm ( "vcmpsd $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
     r_ == 1; \
 })
+# elif VEC_SIZE == 2
+#  define eq(x, y) ({ \
+    _Float16 x_ = (x)[0]; \
+    _Float16 __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned int r_; \
+    asm ( "vcmpsh $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
 # elif FLOAT_SIZE == 4
 /*
  * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
@@ -31,6 +39,8 @@ ENTRY(simd_test);
 #  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
 # elif FLOAT_SIZE == 8
 #  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif FLOAT_SIZE == 2
+#  define eq(x, y) (B(cmpph, _mask, x, y, 0, -1) == ALL_TRUE)
 # elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
 #  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
 # elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
@@ -116,6 +126,14 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
     (vec_t){ t_[0] }; \
 })
+#  elif FLOAT_SIZE == 2
+#   define to_u_int(type, x) ({ \
+    unsigned type u_; \
+    _Float16 __attribute__((vector_size(16))) t_; \
+    asm ( "vcvtsh2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
+    asm ( "vcvtusi2sh%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
+    (vec_t){ t_[0] }; \
+})
 #  endif
 #  define to_uint(x) to_u_int(int, x)
 #  ifdef __x86_64__
@@ -153,6 +171,43 @@ static inline bool _to_bool(byte_vec_t b
 #   define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
 #   define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
 #  endif
+# elif FLOAT_SIZE == 2
+#  define to_int(x) BR2(vcvtw2ph, _mask, BR2(vcvtph2w, _mask, x, (vhi_t)undef(), ~0), undef(), ~0)
+#  define to_uint(x) BR2(vcvtuw2ph, _mask, BR2(vcvtph2uw, _mask, x, (vhi_t)undef(), ~0), undef(), ~0)
+#  if VEC_SIZE == 16
+#   define low_half(x) (x)
+#   define high_half(x) ((vec_t)B_(movhlps, , (vsf_t)undef(), (vsf_t)(x)))
+#   define insert_half(x, y, p) ((vec_t)((p) ? B_(movlhps, , (vsf_t)(x), (vsf_t)(y)) \
+                                             : B_(shufps, , (vsf_t)(y), (vsf_t)(x), 0b11100100)))
+#  elif VEC_SIZE == 32
+#   define _half(x, lh) ((vhf_half_t)B(extracti32x4_, _mask, (vsi_t)(x), lh, (vsi_half_t){}, ~0))
+#   define low_half(x)  _half(x, 0)
+#   define high_half(x) _half(x, 1)
+#   define insert_half(x, y, p) \
+    ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_half_t)(y), p, (vsi_t)undef(), ~0))
+#  elif VEC_SIZE == 64
+#   define _half(x, lh) \
+    ((vhf_half_t)__builtin_ia32_extracti64x4_mask((vdi_t)(x), lh, (vdi_half_t){}, ~0))
+#   define low_half(x)  _half(x, 0)
+#   define high_half(x) _half(x, 1)
+#   define insert_half(x, y, p) \
+    ((vec_t)__builtin_ia32_inserti64x4_mask((vdi_t)(x), (vdi_half_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
+#  define to_w_int(x, s) ({ \
+    vhf_half_t t_ = low_half(x); \
+    vsi_t lo_, hi_; \
+    touch(t_); \
+    lo_ = BR2(vcvtph2 ## s ## dq, _mask, t_, (vsi_t)undef(), ~0); \
+    t_ = high_half(x); \
+    touch(t_); \
+    hi_ = BR2(vcvtph2 ## s ## dq, _mask, t_, (vsi_t)undef(), ~0); \
+    touch(lo_); touch(hi_); \
+    insert_half(insert_half(undef(), \
+                            BR2(vcvt ## s ## dq2ph, _mask, lo_, (vhf_half_t){}, ~0), 0), \
+                BR2(vcvt ## s ## dq2ph, _mask, hi_, (vhf_half_t){}, ~0), 1); \
+})
+#  define to_wint(x) to_w_int(x, )
+#  define to_uwint(x) to_w_int(x, u)
 # endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if FLOAT_SIZE == 4
@@ -240,10 +295,18 @@ static inline vec_t movlhps(vec_t x, vec
 #  define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]")
 #  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
 #  define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
+# elif FLOAT_SIZE == 2
+#  define getexp(x) scalar_1op(x, "vgetexpsh %[in], %[out], %[out]")
+#  define getmant(x) scalar_1op(x, "vgetmantsh $0, %[in], %[out], %[out]")
+#  define recip(x) scalar_1op(x, "vrcpsh %[in], %[out], %[out]")
+#  define rsqrt(x) scalar_1op(x, "vrsqrtsh %[in], %[out], %[out]")
+#  define scale(x, y) scalar_2op(x, y, "vscalefsh %[in2], %[in1], %[out]")
+#  define sqrt(x) scalar_1op(x, "vsqrtsh %[in], %[out], %[out]")
+#  define trunc(x) scalar_1op(x, "vrndscalesh $0b1011, %[in], %[out], %[out]")
 # endif
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
-# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+# if (ELEM_COUNT == 8 && ELEM_SIZE >= 4) /* vextractf{32,64}x4 */ || \
      (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
      (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
 #  define _half(x, lh) ({ \
@@ -398,6 +461,21 @@ static inline vec_t movlhps(vec_t x, vec
                          VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
                        0b01010101, undef(), ~0)
 #  endif
+# elif FLOAT_SIZE == 2
+#  define frac(x) BR2(reduceph, _mask, x, 0b00001011, undef(), ~0)
+#  define getexp(x) BR(getexpph, _mask, x, undef(), ~0)
+#  define getmant(x) BR(getmantph, _mask, x, 0, undef(), ~0)
+#  define max(x, y) BR2(maxph, _mask, x, y, undef(), ~0)
+#  define min(x, y) BR2(minph, _mask, x, y, undef(), ~0)
+#  define scale(x, y) BR2(scalefph, _mask, x, y, undef(), ~0)
+#  define recip(x) B(rcpph, _mask, x, undef(), ~0)
+#  define rsqrt(x) B(rsqrtph, _mask, x, undef(), ~0)
+#  define shrink1(x) BR2(vcvtps2phx, _mask, (vsf_t)(x), (vhf_half_t){}, ~0)
+#  define shrink2(x) BR2(vcvtpd2ph, _mask, (vdf_t)(x), (vhf_quarter_t){}, ~0)
+#  define sqrt(x) BR2(sqrtph, _mask, x, undef(), ~0)
+#  define trunc(x) BR2(rndscaleph, _mask, x, 0b1011, undef(), ~0)
+#  define widen1(x) ((vec_t)BR2(vcvtph2psx, _mask, x, (vsf_t)undef(), ~0))
+#  define widen2(x) ((vec_t)BR2(vcvtph2pd, _mask, x, (vdf_t)undef(), ~0))
 # endif
 #elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
@@ -920,6 +998,16 @@ static inline vec_t movlhps(vec_t x, vec
 #  define dup_lo(x) B(movddup, _mask, x, undef(), ~0)
 # endif
 #endif
+#if FLOAT_SIZE == 2 && ELEM_COUNT > 1
+# define dup_hi(x) ((vec_t)B(pshufhw, _mask, \
+                             B(pshuflw, _mask, (vhi_t)(x), 0b11110101, \
+                               (vhi_t)undef(), ~0), \
+                             0b11110101, (vhi_t)undef(), ~0))
+# define dup_lo(x) ((vec_t)B(pshufhw, _mask, \
+                             B(pshuflw, _mask, (vhi_t)(x), 0b10100000, \
+                               (vhi_t)undef(), ~0), \
+                             0b10100000, (vhi_t)undef(), ~0))
+#endif
 #if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -53,6 +53,9 @@ float
 # elif FLOAT_SIZE == 8
 #  define MODE DF
 #  define ELEM_SFX "d"
+# elif FLOAT_SIZE == 2
+#  define MODE HF
+#  define ELEM_SFX "h"
 # endif
 #endif
 #ifndef VEC_SIZE
@@ -67,7 +70,10 @@ typedef unsigned int __attribute__((mode
 /* Various builtins want plain char / int / long long vector types ... */
 typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
 typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+#if VEC_SIZE >= 4
 typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+typedef float __attribute__((vector_size(VEC_SIZE))) vsf_t;
+#endif
 #if VEC_SIZE >= 8
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
@@ -96,6 +102,9 @@ typedef char __attribute__((vector_size(
 typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
 typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+#ifdef __AVX512FP16__
+typedef _Float16 __attribute__((vector_size(HALF_SIZE))) vhf_half_t;
+#endif
 typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
 # endif
 
@@ -110,6 +119,9 @@ typedef char __attribute__((vector_size(
 typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
 typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
 typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
+#ifdef __AVX512FP16__
+typedef _Float16 __attribute__((vector_size(QUARTER_SIZE))) vhf_quarter_t;
+#endif
 # endif
 
 # if ELEM_COUNT >= 8
@@ -163,6 +175,7 @@ DECL_OCTET(half);
 #elif VEC_SIZE == 64
 # define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
 # define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
+# define BR2(n, s, a...) __builtin_ia32_ ## n ## 512 ## s ## _round(a, 4)
 #endif
 #ifndef B_
 # define B_ B
@@ -171,6 +184,9 @@ DECL_OCTET(half);
 # define BR B
 # define BR_ B_
 #endif
+#ifndef BR2
+# define BR2 BR
+#endif
 #ifndef BR_
 # define BR_ BR
 #endif
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -28,6 +28,8 @@ ENTRY(fma_test);
 #  define fmaddsub(x, y, z) BR(vfmaddsubps, _mask, x, y, z, ~0)
 # elif FLOAT_SIZE == 8
 #  define fmaddsub(x, y, z) BR(vfmaddsubpd, _mask, x, y, z, ~0)
+# elif FLOAT_SIZE == 2
+#  define fmaddsub(x, y, z) BR(vfmaddsubph, _mask, x, y, z, ~0)
 # endif
 #elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
@@ -70,6 +72,75 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#ifdef __AVX512FP16__
+# define I (1.if16)
+# if VEC_SIZE > FLOAT_SIZE
+#  define CELEM_COUNT (ELEM_COUNT / 2)
+static const unsigned int conj_mask = 0x80000000;
+#  define conj(z) ({ \
+    vec_t r_; \
+    asm ( "vpxord %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (z), "m" (conj_mask), "i" (CELEM_COUNT) ); \
+    r_; \
+})
+#  define _cmul_vv(a, b, c)  BR2(vf##c##mulcph, , a, b)
+#  define _cmul_vs(a, b, c) ({ \
+    vec_t r_; \
+    _Complex _Float16 b_ = (b); \
+    asm ( "vf"#c"mulcph %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (a), "m" (b_), "i" (CELEM_COUNT) ); \
+    r_; \
+})
+#  define cmadd_vv(a, b, c) BR2(vfmaddcph, , a, b, c)
+#  define cmadd_vs(a, b, c) ({ \
+    _Complex _Float16 b_ = (b); \
+    vec_t r_; \
+    asm ( "vfmaddcph %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (a), "m" (b_), "i" (CELEM_COUNT), "0" (c) ); \
+    r_; \
+})
+# else
+#  define CELEM_COUNT 1
+typedef _Float16 __attribute__((vector_size(4))) cvec_t;
+#  define conj(z) ({ \
+    cvec_t r_; \
+    asm ( "xor $0x80000000, %0" : "=rm" (r_) : "0" (z) ); \
+    r_; \
+})
+#  define _cmul_vv(a, b, c) ({ \
+    cvec_t r_; \
+    /* "=&x" to force destination to be different from both sources */ \
+    asm ( "vf"#c"mulcsh %2, %1, %0" : "=&x" (r_) : "x" (a), "m" (b) ); \
+    r_; \
+})
+#  define _cmul_vs(a, b, c) ({ \
+    _Complex _Float16 b_ = (b); \
+    cvec_t r_; \
+    /* "=&x" to force destination to be different from both sources */ \
+    asm ( "vf"#c"mulcsh %2, %1, %0" : "=&x" (r_) : "x" (a), "m" (b_) ); \
+    r_; \
+})
+#  define cmadd_vv(a, b, c) ({ \
+    cvec_t r_ = (c); \
+    asm ( "vfmaddcsh %2, %1, %0" : "+x" (r_) : "x" (a), "m" (b) ); \
+    r_; \
+})
+#  define cmadd_vs(a, b, c) ({ \
+    _Complex _Float16 b_ = (b); \
+    cvec_t r_ = (c); \
+    asm ( "vfmaddcsh %2, %1, %0" : "+x" (r_) : "x" (a), "m" (b_) ); \
+    r_; \
+})
+# endif
+# define cmul_vv(a, b) _cmul_vv(a, b, )
+# define cmulc_vv(a, b) _cmul_vv(a, b, c)
+# define cmul_vs(a, b) _cmul_vs(a, b, )
+# define cmulc_vs(a, b) _cmul_vs(a, b, c)
+#endif
+
 int fma_test(void)
 {
     unsigned int i;
@@ -156,5 +227,99 @@ int fma_test(void)
     touch(inv);
 #endif
 
+#ifdef CELEM_COUNT
+
+# if VEC_SIZE > FLOAT_SIZE
+#  define cvec_t vec_t
+#  define ceq eq
+# else
+  {
+    /* Cannot re-use the function-scope variables (for being too small). */
+    cvec_t x, y, z, src = { 1, 2 }, inv = { 2, 1 }, one = { 1, 1 };
+#  define ceq(x, y) ({ \
+    unsigned int r_; \
+    asm ( "vcmpph $0, %1, %2, %0"  : "=k" (r_) : "x" (x), "x" (y) ); \
+    (r_ & 3) == 3; \
+})
+# endif
+
+    /* (a * i)² == -a² */
+    x = cmul_vs(src, I);
+    y = cmul_vv(x, x);
+    x = -src;
+    touch(src);
+    z = cmul_vv(x, src);
+    if ( !ceq(y, z) ) return __LINE__;
+
+    /* conj(a * b) == conj(a) * conj(b) */
+    touch(src);
+    x = conj(src);
+    touch(inv);
+    y = cmulc_vv(x, inv);
+    touch(src);
+    touch(inv);
+    z = conj(cmul_vv(src, inv));
+    if ( !ceq(y, z) ) return __LINE__;
+
+    /* a * conj(a) == |a|² */
+    touch(src);
+    y = src;
+    touch(src);
+    x = cmulc_vv(y, src);
+    y *= y;
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        if ( x[i] != y[i] + y[i + 1] ) return __LINE__;
+        if ( x[i + 1] ) return __LINE__;
+    }
+
+    /* a * b == b * a + 0 */
+    touch(src);
+    touch(inv);
+    x = cmul_vv(src, inv);
+    touch(src);
+    touch(inv);
+    y = cmadd_vv(inv, src, (cvec_t){});
+    if ( !ceq(x, y) ) return __LINE__;
+
+    /* a * 1 + b == b * 1 + a */
+    touch(src);
+    touch(inv);
+    x = cmadd_vs(src, 1, inv);
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        z[i] = 1;
+        z[i + 1] = 0;
+    }
+    touch(z);
+    y = cmadd_vv(inv, z, src);
+    if ( !ceq(x, y) ) return __LINE__;
+
+    /* (a + b) * c == a * c + b * c */
+    touch(one);
+    touch(inv);
+    x = cmul_vv(src + one, inv);
+    touch(inv);
+    y = cmul_vv(one, inv);
+    touch(inv);
+    z = cmadd_vv(src, inv, y);
+    if ( !ceq(x, z) ) return __LINE__;
+
+    /* a * i + conj(a) == (Re(a) - Im(a)) * (1 + i) */
+    x = cmadd_vs(src, I, conj(src));
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        typeof(x[0]) val = src[i] - src[i + 1];
+
+        if ( x[i] != val ) return __LINE__;
+        if ( x[i + 1] != val ) return __LINE__;
+    }
+
+# if VEC_SIZE == FLOAT_SIZE
+  }
+# endif
+
+#endif /* CELEM_COUNT */
+
     return 0;
 }
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -43,6 +43,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512er.h"
 #include "avx512vbmi.h"
 #include "avx512vbmi2-vpclmulqdq.h"
+#include "avx512fp16.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -249,6 +250,16 @@ static bool simd_check_avx512bw_gf_vl(vo
     return cpu_has_gfni && cpu_has_avx512vl;
 }
 
+static bool simd_check_avx512fp16(void)
+{
+    return cpu_has_avx512_fp16;
+}
+
+static bool simd_check_avx512fp16_vl(void)
+{
+    return cpu_has_avx512_fp16 && cpu_has_avx512vl;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -513,6 +524,10 @@ static const struct {
     AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
     AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
     AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(AVX512_FP16 f16 scal,avx512fp16,     f2),
+    SIMD(AVX512_FP16 f16x32, avx512fp16,    64f2),
+    AVX512VL(_FP16+VL f16x8, avx512fp16,    16f2),
+    AVX512VL(_FP16+VL f16x16,avx512fp16,    32f2),
     SIMD(SHA,                sse4_sha,        16),
     SIMD(AVX+SHA,             avx_sha,        16),
     AVX512VL(VL+SHA,      avx512f_sha,        16),
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -267,7 +267,7 @@ XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13)
 XEN_CPUFEATURE(SERIALIZE,     9*32+14) /*A  SERIALIZE insn */
 XEN_CPUFEATURE(TSXLDTRK,      9*32+16) /*a  TSX load tracking suspend/resume insns */
 XEN_CPUFEATURE(CET_IBT,       9*32+20) /*   CET - Indirect Branch Tracking */
-XEN_CPUFEATURE(AVX512_FP16,   9*32+23) /*   AVX512 FP16 instructions */
+XEN_CPUFEATURE(AVX512_FP16,   9*32+23) /*A  AVX512 FP16 instructions */
 XEN_CPUFEATURE(IBRSB,         9*32+26) /*A  IBRS and IBPB support (used by Intel) */
 XEN_CPUFEATURE(STIBP,         9*32+27) /*A  STIBP */
 XEN_CPUFEATURE(L1D_FLUSH,     9*32+28) /*S  MSR_FLUSH_CMD and L1D flush. */



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 00/10] x86: support AVX512-FP16
  2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
                   ` (9 preceding siblings ...)
  2023-04-03 15:00 ` [PATCH v2 10/10] x86emul: AVX512-FP16 testing Jan Beulich
@ 2023-05-22 16:25 ` Andrew Cooper
  2023-05-23  6:35   ` Jan Beulich
  10 siblings, 1 reply; 16+ messages in thread
From: Andrew Cooper @ 2023-05-22 16:25 UTC (permalink / raw
  To: Jan Beulich, xen-devel@lists.xenproject.org; +Cc: Wei Liu, Roger Pau Monné

On 03/04/2023 3:56 pm, Jan Beulich wrote:
> While I (quite obviously) don't have any suitable hardware, Intel's
> SDE allows testing the implementation. And since there's no new
> state (registers) associated with this ISA extension, this should
> suffice for integration.

I've given this a spin on a Sapphire Rapids system.

Relevant (AFAICT) bits of the log:

Testing vfpclasspsz $0x46,64(%edx),%k2...okay
Testing vfpclassphz $0x46,128(%ecx),%k3...okay
...
Testing avx512_fp16/all disp8 handling...okay
Testing avx512_fp16/128 disp8 handling...okay
...
Testing AVX512_FP16 f16 scal native execution...okay
Testing AVX512_FP16 f16 scal 64-bit code sequence...okay
Testing AVX512_FP16 f16 scal 32-bit code sequence...okay
Testing AVX512_FP16 f16x32 native execution...okay
Testing AVX512_FP16 f16x32 64-bit code sequence...okay
Testing AVX512_FP16 f16x32 32-bit code sequence...okay
Testing AVX512_FP16+VL f16x8 native execution...okay
Testing AVX512_FP16+VL f16x8 64-bit code sequence...okay
Testing AVX512_FP16+VL f16x8 32-bit code sequence...okay
Testing AVX512_FP16+VL f16x16 native execution...okay
Testing AVX512_FP16+VL f16x16 64-bit code sequence...okay
Testing AVX512_FP16+VL f16x16 32-bit code sequence...okay

and it exits zero, so everything seems fine.


One thing however, this series ups the minimum GCC version required to
build the emulator at all:

make: Entering directory '/local/xen.git/tools/tests/x86_emulator'
gcc: error: unrecognized command-line option ‘-mavx512fp16’; did you
mean ‘-mavx512bf16’?
Makefile:121: Test harness not built, use newer compiler than "gcc"
(version 10) and an "{evex}" capable assembler

and I'm not sure we want to do this.  Upping the version of GCC while
leaving binutils as-was does lead to a build of the harness without
AVX512-FP16 active, which is the preferred behaviour here.

~Andrew


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 00/10] x86: support AVX512-FP16
  2023-05-22 16:25 ` [PATCH v2 00/10] x86: support AVX512-FP16 Andrew Cooper
@ 2023-05-23  6:35   ` Jan Beulich
  2023-06-05 12:10     ` Andrew Cooper
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2023-05-23  6:35 UTC (permalink / raw
  To: Andrew Cooper, xen-devel@lists.xenproject.org
  Cc: Wei Liu, Roger Pau Monné

On 22.05.2023 18:25, Andrew Cooper wrote:
> On 03/04/2023 3:56 pm, Jan Beulich wrote:
>> While I (quite obviously) don't have any suitable hardware, Intel's
>> SDE allows testing the implementation. And since there's no new
>> state (registers) associated with this ISA extension, this should
>> suffice for integration.
> 
> I've given this a spin on a Sapphire Rapids system.
> 
> Relevant (AFAICT) bits of the log:
> 
> Testing vfpclasspsz $0x46,64(%edx),%k2...okay
> Testing vfpclassphz $0x46,128(%ecx),%k3...okay
> ...
> Testing avx512_fp16/all disp8 handling...okay
> Testing avx512_fp16/128 disp8 handling...okay
> ...
> Testing AVX512_FP16 f16 scal native execution...okay
> Testing AVX512_FP16 f16 scal 64-bit code sequence...okay
> Testing AVX512_FP16 f16 scal 32-bit code sequence...okay
> Testing AVX512_FP16 f16x32 native execution...okay
> Testing AVX512_FP16 f16x32 64-bit code sequence...okay
> Testing AVX512_FP16 f16x32 32-bit code sequence...okay
> Testing AVX512_FP16+VL f16x8 native execution...okay
> Testing AVX512_FP16+VL f16x8 64-bit code sequence...okay
> Testing AVX512_FP16+VL f16x8 32-bit code sequence...okay
> Testing AVX512_FP16+VL f16x16 native execution...okay
> Testing AVX512_FP16+VL f16x16 64-bit code sequence...okay
> Testing AVX512_FP16+VL f16x16 32-bit code sequence...okay
> 
> and it exits zero, so everything seems fine.
> 
> 
> One thing however, this series ups the minimum GCC version required to
> build the emulator at all:
> 
> make: Entering directory '/local/xen.git/tools/tests/x86_emulator'
> gcc: error: unrecognized command-line option ‘-mavx512fp16’; did you
> mean ‘-mavx512bf16’?
> Makefile:121: Test harness not built, use newer compiler than "gcc"
> (version 10) and an "{evex}" capable assembler
> 
> and I'm not sure we want to do this.  Upping the version of GCC while
> leaving binutils as-was does lead to a build of the harness without
> AVX512-FP16 active, which is the preferred behaviour here.

Well, this series on its own does, but I did notice the issue already.
Hence "x86emul: rework compiler probing in the test harness" [1].

Jan

[1] https://lists.xen.org/archives/html/xen-devel/2023-03/msg00123.html


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 00/10] x86: support AVX512-FP16
  2023-05-23  6:35   ` Jan Beulich
@ 2023-06-05 12:10     ` Andrew Cooper
  0 siblings, 0 replies; 16+ messages in thread
From: Andrew Cooper @ 2023-06-05 12:10 UTC (permalink / raw
  To: Jan Beulich, xen-devel@lists.xenproject.org; +Cc: Wei Liu, Roger Pau Monné

On 23/05/2023 7:35 am, Jan Beulich wrote:
> On 22.05.2023 18:25, Andrew Cooper wrote:
>> On 03/04/2023 3:56 pm, Jan Beulich wrote:
>>> While I (quite obviously) don't have any suitable hardware, Intel's
>>> SDE allows testing the implementation. And since there's no new
>>> state (registers) associated with this ISA extension, this should
>>> suffice for integration.
>> I've given this a spin on a Sapphire Rapids system.
>>
>> Relevant (AFAICT) bits of the log:
>>
>> Testing vfpclasspsz $0x46,64(%edx),%k2...okay
>> Testing vfpclassphz $0x46,128(%ecx),%k3...okay
>> ...
>> Testing avx512_fp16/all disp8 handling...okay
>> Testing avx512_fp16/128 disp8 handling...okay
>> ...
>> Testing AVX512_FP16 f16 scal native execution...okay
>> Testing AVX512_FP16 f16 scal 64-bit code sequence...okay
>> Testing AVX512_FP16 f16 scal 32-bit code sequence...okay
>> Testing AVX512_FP16 f16x32 native execution...okay
>> Testing AVX512_FP16 f16x32 64-bit code sequence...okay
>> Testing AVX512_FP16 f16x32 32-bit code sequence...okay
>> Testing AVX512_FP16+VL f16x8 native execution...okay
>> Testing AVX512_FP16+VL f16x8 64-bit code sequence...okay
>> Testing AVX512_FP16+VL f16x8 32-bit code sequence...okay
>> Testing AVX512_FP16+VL f16x16 native execution...okay
>> Testing AVX512_FP16+VL f16x16 64-bit code sequence...okay
>> Testing AVX512_FP16+VL f16x16 32-bit code sequence...okay
>>
>> and it exits zero, so everything seems fine.
>>
>>
>> One thing however, this series ups the minimum GCC version required to
>> build the emulator at all:
>>
>> make: Entering directory '/local/xen.git/tools/tests/x86_emulator'
>> gcc: error: unrecognized command-line option ‘-mavx512fp16’; did you
>> mean ‘-mavx512bf16’?
>> Makefile:121: Test harness not built, use newer compiler than "gcc"
>> (version 10) and an "{evex}" capable assembler
>>
>> and I'm not sure we want to do this.  Upping the version of GCC but
>> leaving binutils as-was does lead to a build of the harness without
>> AVX512-FP16 active, which is the preferred behaviour here.
> Well, this series on its own does raise that requirement, but I did
> notice the issue already.
> Hence "x86emul: rework compiler probing in the test harness" [1].
>
> Jan
>
> [1] https://lists.xen.org/archives/html/xen-devel/2023-03/msg00123.html

Ok.  Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 10/10] x86emul: AVX512-FP16 testing
  2023-04-03 15:00 ` [PATCH v2 10/10] x86emul: AVX512-FP16 testing Jan Beulich
@ 2023-06-05 13:08   ` Jan Beulich
  2023-06-05 13:43     ` Henry Wang
  0 siblings, 1 reply; 16+ messages in thread
From: Jan Beulich @ 2023-06-05 13:08 UTC (permalink / raw
  To: Henry Wang
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	xen-devel@lists.xenproject.org

Henry,

On 03.04.2023 17:00, Jan Beulich wrote:
> Naming of some of the builtins isn't fully consistent with that of pre-
> existing ones, so there's a need for a new BR2() wrapper macro.
> 
> With the tests providing some proof of proper functioning of the
> emulator code, also enable use of the feature by guests, as there's no
> other infrastructure involved in enabling this ISA extension.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: Add CHANGELOG.md entry.

I notice I forgot to Cc you on this submission, with said addition.
May I ask for your ack (or otherwise), please?

Thanks, Jan

> --- a/CHANGELOG.md
> +++ b/CHANGELOG.md
> @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog
>     - Bus-lock detection, used by Xen to mitigate (by rate-limiting) the system
>       wide impact of a guest misusing atomic instructions.
>   - xl/libxl can customize SMBIOS strings for HVM guests.
> + - x86 AVX512-FP16
>  
>  ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12
>  
>[...]
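
To illustrate the kind of wrapper the commit message refers to, here is a
purely hypothetical sketch (the real BR()/BR2() macros in the harness may
look quite different, and the stand-in functions below replace actual
compiler builtins): a second token-pasting wrapper lets call sites stay
uniform even when a builtin's name deviates from the established scheme.

#include <stdio.h>

/*
 * Hypothetical sketch only; not the macros from the patch.  BR() pastes
 * builtin names following the established scheme, BR2() follows a
 * deviating scheme, so tests can keep a uniform spelling at call sites.
 */
#define BR(insn, ...)  builtin_ ## insn ## _round(__VA_ARGS__)
#define BR2(insn, ...) builtin_ ## insn ## round_(__VA_ARGS__)

/* Stand-ins for compiler builtins, using two different naming schemes. */
static double builtin_addsd_round(double x, double y) { return x + y; }
static double builtin_addshround_(double x, double y) { return x + y; }

int main(void)
{
    printf("%f %f\n", BR(addsd, 1.0, 2.0), BR2(addsh, 1.0, 2.0));
    return 0;
}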



^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v2 10/10] x86emul: AVX512-FP16 testing
  2023-06-05 13:08   ` Jan Beulich
@ 2023-06-05 13:43     ` Henry Wang
  0 siblings, 0 replies; 16+ messages in thread
From: Henry Wang @ 2023-06-05 13:43 UTC (permalink / raw
  To: Jan Beulich
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	xen-devel@lists.xenproject.org

Hi Jan,

> -----Original Message-----
> Subject: Re: [PATCH v2 10/10] x86emul: AVX512-FP16 testing
> 
> Henry,
> 
> On 03.04.2023 17:00, Jan Beulich wrote:
> > Naming of some of the builtins isn't fully consistent with that of pre-
> > existing ones, so there's a need for a new BR2() wrapper macro.
> >
> > With the tests providing some proof of proper functioning of the
> > emulator code, also enable use of the feature by guests, as there's no
> > other infrastructure involved in enabling this ISA extension.
> >
> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> > ---
> > v2: Add CHANGELOG.md entry.
> 
> I notice I forgot to Cc you on this submission, with said addition.
> May I ask for your ack (or otherwise), please?

Thanks for letting me know this (and also for remembering the changelog
entry)! My suggestion would be to make the entry read as a complete
sentence instead of its current form.

But I also understand this is quite nitpicky, so I will let you decide:
either making it more readable or keeping its current form is fine with
me, and if you want to improve the wording, you can do it on commit.

So:
Acked-by: Henry Wang <Henry.Wang@arm.com> # CHANGELOG

Kind regards,
Henry

> 
> Thanks, Jan
> 
> > --- a/CHANGELOG.md
> > +++ b/CHANGELOG.md
> > @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog
> >     - Bus-lock detection, used by Xen to mitigate (by rate-limiting) the system
> >       wide impact of a guest misusing atomic instructions.
> >   - xl/libxl can customize SMBIOS strings for HVM guests.
> > + - x86 AVX512-FP16
> >
> >  ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12
> >
> >[...]


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2023-06-05 13:44 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-03 14:56 [PATCH v2 00/10] x86: support AVX512-FP16 Jan Beulich
2023-04-03 14:57 ` [PATCH v2 01/10] x86emul: handle AVX512-FP16 insns encoded in 0f3a opcode map Jan Beulich
2023-04-03 14:57 ` [PATCH v2 02/10] x86emul: handle AVX512-FP16 Map5 arithmetic insns Jan Beulich
2023-04-03 14:57 ` [PATCH v2 03/10] x86emul: handle AVX512-FP16 move insns Jan Beulich
2023-04-03 14:58 ` [PATCH v2 04/10] x86emul: handle AVX512-FP16 fma-like insns Jan Beulich
2023-04-03 14:58 ` [PATCH v2 05/10] x86emul: handle AVX512-FP16 Map6 misc insns Jan Beulich
2023-04-03 14:58 ` [PATCH v2 06/10] x86emul: handle AVX512-FP16 complex multiplication insns Jan Beulich
2023-04-03 14:59 ` [PATCH v2 07/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int16 insns Jan Beulich
2023-04-03 14:59 ` [PATCH v2 08/10] x86emul: handle AVX512-FP16 floating point conversion insns Jan Beulich
2023-04-03 15:00 ` [PATCH v2 09/10] x86emul: handle AVX512-FP16 conversion to/from (packed) int{32,64} insns Jan Beulich
2023-04-03 15:00 ` [PATCH v2 10/10] x86emul: AVX512-FP16 testing Jan Beulich
2023-06-05 13:08   ` Jan Beulich
2023-06-05 13:43     ` Henry Wang
2023-05-22 16:25 ` [PATCH v2 00/10] x86: support AVX512-FP16 Andrew Cooper
2023-05-23  6:35   ` Jan Beulich
2023-06-05 12:10     ` Andrew Cooper
