|
From: Eyal S. <eya...@gm...> - 2021-03-04 21:58:00
|
From: eyal0 <109...@us...> This fixes https://bugs.kde.org/show_bug.cgi?id=432801 To test: ```sh make && perl tests/vg_regtest memcheck/tests/x86/pcmpgtd ``` --- .gitignore | 1 + memcheck/mc_translate.c | 101 +++++++++++++++++- memcheck/tests/amd64/Makefile.am | 6 +- memcheck/tests/amd64/pcmpgtd.c | 134 ++++++++++++++++++++++++ memcheck/tests/amd64/pcmpgtd.stderr.exp | 44 ++++++++ memcheck/tests/amd64/pcmpgtd.vgtest | 2 + 6 files changed, 285 insertions(+), 3 deletions(-) create mode 100644 memcheck/tests/amd64/pcmpgtd.c create mode 100644 memcheck/tests/amd64/pcmpgtd.stderr.exp create mode 100644 memcheck/tests/amd64/pcmpgtd.vgtest diff --git a/.gitignore b/.gitignore index b9fca3de3..fd1cf9ae5 100644 --- a/.gitignore +++ b/.gitignore @@ -974,6 +974,7 @@ /memcheck/tests/amd64/insn-bsfl /memcheck/tests/amd64/insn-pmovmskb /memcheck/tests/amd64/insn-pcmpistri +/memcheck/tests/amd64/pcmpgtd /memcheck/tests/amd64/sh-mem-vec128 /memcheck/tests/amd64/sh-mem-vec256 /memcheck/tests/amd64/xsave-avx diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index 516988bdd..51e94427e 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -1287,6 +1287,101 @@ static IRAtom* expensiveCmpEQorNE ( MCEnv* mce, return final_cast; } +/* Check if we can know, despite the uncertain bits, that xx is greater than yy. + Notice that it's xx > yy and not the other way around. This is Intel syntax + with destination first. It will appear reversed in gdb disassembly (AT&T + syntax). 
+ */ +static IRAtom* expensiveCmpGT ( MCEnv* mce, + unsigned int word_size, + Bool is_signed, + unsigned int count, + IRAtom* vxx, IRAtom* vyy, + IRAtom* xx, IRAtom* yy ) +{ + IROp opAND, opOR, opXOR, opNOT, opEQ, opSHL, opGT; + IRType ty; + + tl_assert(isShadowAtom(mce,vxx)); + tl_assert(isShadowAtom(mce,vyy)); + tl_assert(isOriginalAtom(mce,xx)); + tl_assert(isOriginalAtom(mce,yy)); + tl_assert(sameKindedAtoms(vxx,xx)); + tl_assert(sameKindedAtoms(vyy,yy)); + + switch (word_size * count) { + case 128: + ty = Ity_V128; + opAND = Iop_AndV128; + opOR = Iop_OrV128; + opXOR = Iop_XorV128; + opNOT = Iop_NotV128; + break; + default: + VG_(tool_panic)("expensiveCmpGT"); + } + if (word_size == 32 && count == 4) { + opEQ = Iop_CmpEQ32x4; + opSHL = Iop_ShlN32x4; + if (is_signed) { + opGT = Iop_CmpGT32Sx4; + } else { + opGT = Iop_CmpGT32Ux4; + } + } else { + VG_(tool_panic)("expensiveCmpGT"); + } + IRAtom *MSBs; + if (is_signed) { + // For unsigned it's easy to make the min and max: Just set the unknown + // bits all to 0s or 1s. For signed it's harder because having a 1 in the + // MSB makes a number smaller, not larger! We can work around this by + // flipping the MSB before and after computing the min and max values. + IRAtom *const0 = mkV128(0); + IRAtom *all_ones = assignNew('V', mce, ty, binop(opEQ, const0, const0)); + MSBs = assignNew('V', mce, ty, binop(opSHL, all_ones, mkU8(31))); + xx = assignNew('V', mce, ty, binop(opXOR, xx, MSBs)); + yy = assignNew('V', mce, ty, binop(opXOR, yy, MSBs)); + // From here on out, we're dealing with MSB-flipped integers. + } + // We can combine xx and vxx to create two values: the largest that xx could + // possibly be and the smallest that xx could possibly be. Likewise, we can + // do the same for yy. We'll call those max_xx and min_xx and max_yy and + // min_yy. 
+ IRAtom *not_vxx = assignNew('V', mce, ty, unop(opNOT, vxx)); + IRAtom *not_vyy = assignNew('V', mce, ty, unop(opNOT, vyy)); + IRAtom *max_xx = assignNew('V', mce, ty, binop(opOR, xx, vxx)); + IRAtom *min_xx = assignNew('V', mce, ty, binop(opAND, xx, not_vxx)); + IRAtom *max_yy = assignNew('V', mce, ty, binop(opOR, yy, vyy)); + IRAtom *min_yy = assignNew('V', mce, ty, binop(opAND, yy, not_vyy)); + if (is_signed) { + // Unflip the MSBs. + max_xx = assignNew('V', mce, ty, binop(opXOR, max_xx, MSBs)); + min_xx = assignNew('V', mce, ty, binop(opXOR, min_xx, MSBs)); + max_yy = assignNew('V', mce, ty, binop(opXOR, max_yy, MSBs)); + min_yy = assignNew('V', mce, ty, binop(opXOR, min_yy, MSBs)); + } + IRAtom *min_xx_gt_max_yy = assignNew('V', mce, ty, binop(opGT, min_xx, max_yy)); + IRAtom *max_xx_gt_min_yy = assignNew('V', mce, ty, binop(opGT, max_xx, min_yy)); + // If min_xx is greater than max_yy then xx is surely greater than yy so we know + // our answer for sure. If max_xx is not greater than min_yy then xx can't + // possibly be greater than yy so again we know the answer for sure. For all + // other cases, we can't know. + // + // So the result is defined if: + // + // min_xx_gt_max_yy | ~max_xx_gt_min_yy + // + // Because defined in vbits is 0s and not 1s, we need to invert that: + // + // ~(min_xx_gt_max_yy | ~max_xx_gt_min_yy) + // + // We can use DeMorgan's Law to simplify the above: + // + // ~min_xx_gt_max_yy & max_xx_gt_min_yy + IRAtom *not_min_xx_gt_max_yy = assignNew('V', mce, ty, unop(opNOT, min_xx_gt_max_yy)); + return assignNew('V', mce, ty, binop(opAND, not_min_xx_gt_max_yy, max_xx_gt_min_yy)); +} /* --------- Semi-accurate interpretation of CmpORD. 
--------- */ @@ -3947,9 +4042,13 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_PwExtUSMulQAdd8x16: return binary16Ix8(mce, vatom1, vatom2); - case Iop_Sub32x4: case Iop_CmpGT32Sx4: + return expensiveCmpGT(mce, 32, True /* signed */, + 4, vatom1, vatom2, atom1, atom2); case Iop_CmpGT32Ux4: + return expensiveCmpGT(mce, 32, False /* unsigned */, + 4, vatom1, vatom2, atom1, atom2); + case Iop_Sub32x4: case Iop_CmpEQ32x4: case Iop_QAdd32Sx4: case Iop_QAdd32Ux4: diff --git a/memcheck/tests/amd64/Makefile.am b/memcheck/tests/amd64/Makefile.am index da15cf797..e81ea74da 100644 --- a/memcheck/tests/amd64/Makefile.am +++ b/memcheck/tests/amd64/Makefile.am @@ -18,6 +18,7 @@ EXTRA_DIST = \ insn-pcmpistri.vgtest insn-pcmpistri.stdout.exp insn-pcmpistri.stderr.exp \ insn-pmovmskb.vgtest insn-pmovmskb.stdout.exp insn-pmovmskb.stderr.exp \ more_x87_fp.stderr.exp more_x87_fp.stdout.exp more_x87_fp.vgtest \ + pcmpgtd.stderr.exp pcmpgtd.vgtest \ sh-mem-vec128-plo-no.vgtest \ sh-mem-vec128-plo-no.stderr.exp \ sh-mem-vec128-plo-no.stdout.exp \ @@ -43,6 +44,7 @@ check_PROGRAMS = \ fxsave-amd64 \ insn-bsfl \ insn-pmovmskb \ + pcmpgtd \ sh-mem-vec128 \ sse_memory \ xor-undef-amd64 @@ -55,8 +57,8 @@ endif # clang 3.5.0 barfs about -mfancy-math-387 if !COMPILER_IS_CLANG check_PROGRAMS += \ - more_x87_fp \ - shr_edx + more_x87_fp \ + shr_edx endif AM_CFLAGS += @FLAG_M64@ diff --git a/memcheck/tests/amd64/pcmpgtd.c b/memcheck/tests/amd64/pcmpgtd.c new file mode 100644 index 000000000..891ebad35 --- /dev/null +++ b/memcheck/tests/amd64/pcmpgtd.c @@ -0,0 +1,134 @@ +/* https://bugs.kde.org/show_bug.cgi?id=432801 */ + +#include <signal.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <ctype.h> + +#include "../../memcheck.h" + +// This function fails when compiled on clang version 10 or greater with -O2. 
+// It's unused by the test but left here as a copy of the error in the bug +// report https://bugs.kde.org/show_bug.cgi?id=432801 +void standalone() { + struct sigaction act; + if (sigaction(SIGTERM, 0, &act) == 1) { + return; + } + if (sigaction(SIGTERM, 0, &act) == 1) { + return; + } + + char pattern[] = "\x1\x2\x3\x4\x5\x6\x7\x8\x9"; + const unsigned long plen = strlen(pattern); + pattern[1] = 0; + size_t hp=0; + for (size_t i = 0; i < plen; ++i) + hp += pattern[i]; + volatile int j = 0; + if (j == hp % 10) { + j++; + } + printf("%ld\n", hp); +} + +typedef unsigned long ULong; + +typedef struct { + ULong w64[2]; /* Note: little-endian */ +} V128; + +static int cmpGT32Sx4(V128 x, V128 y) +{ + int result; + __asm__("movups %1,%%xmm6\n" + "\tmovups %2,%%xmm7\n" + // order swapped for AT&T style which has destination second. + "\tpcmpgtd %%xmm7,%%xmm6\n" + "\tmovd %%xmm6, %0" + : "=r" (result) : "m" (x), "m" (y) : "xmm6"); + return result; +} + +/* Set the V bits on the data at "addr". Note the convention: A zero + bit means "defined"; 1 means "undefined". */ +static void set_vbits(V128 *addr, V128 vbits) +{ + int i; + for (i=0 ; i<2 ; ++i) { + (void)VALGRIND_SET_VBITS(&addr->w64[i], &vbits.w64[i], sizeof(vbits.w64[i])); + } +} + +/* Use a value that we know is invalid. */ +static void use(char *x, char* y, int invalid) +{ + /* Convince GCC it does not know what is in "invalid" so it cannot + possibly optimize away the conditional branch below. */ + __asm__ ("" : "=r" (invalid) : "0" (invalid)); + + /* Create a conditional branch on which our output depends, so that + memcheck cannot possibly optimize it away, either. */ + if (invalid) { + fprintf(stderr, "%s > %s == true\n", x, y); + } else { + fprintf(stderr, "%s > %s == false\n", x, y); + } +} + +// Convert a string like "123XXX45" to a value and vbits. 
+V128 string_to_v128(char *s) { + ULong x = 0; + ULong vx = 0; + + for (; *s; s++) { + int lowered_c = tolower(*s); + x <<= 4; + vx <<= 4; + if (lowered_c == 'x') { + vx |= 0xf; + } else if (isdigit(lowered_c)) { + x |= lowered_c - '0'; + } else if (lowered_c >= 'a' && lowered_c <= 'f') { + x |= lowered_c - 'a' + 0xa; + } else { + fprintf(stderr, "Not a hex digit: %c\n", *s); + exit(1); + } + } + + V128 vx128 = { { vx, 0 } }; + V128 x128 = { { x, 0 } }; + set_vbits(&x128, vx128); + return x128; +} + +static void doit(char *x, char *y) { + int result = cmpGT32Sx4(string_to_v128(x), string_to_v128(y)); + use(x, y, result); +} + +int main() { + // completely undefined + doit("xxxxxxxx", "xxxxxxxx"); + + // completely defined + doit("00000000", "00000000"); + doit("00000000", "f0000000"); + doit("f0000000", "00000000"); + + doit("00000000", "fxxxxxxx"); // defined: 0 > all negatives + doit("0xxxxxxx", "fxxxxxxx"); // defined: non-negatives > all negatives + doit("xxxxxxx0", "f0000000"); // undefined + doit("xxxxxxx1", "80000000"); // defined: ends with 1 > MIN_INT + doit("5xxxxxxx", "6xxxxxxx"); // defined + doit("8xxxxxxx", "9xxxxxxx"); // defined + + doit("1234567x", "12345678"); // undefined + doit("1234567x", "1234567f"); // defined: x can't be more than f + doit("1234567x", "1234567e"); // undefined: x can be more than e + + return 0; +} diff --git a/memcheck/tests/amd64/pcmpgtd.stderr.exp b/memcheck/tests/amd64/pcmpgtd.stderr.exp new file mode 100644 index 000000000..bb248c60c --- /dev/null +++ b/memcheck/tests/amd64/pcmpgtd.stderr.exp @@ -0,0 +1,44 @@ + +Conditional jump or move depends on uninitialised value(s) + at 0x........: use (pcmpgtd.c:74) + by 0x........: doit (pcmpgtd.c:110) + by 0x........: main (pcmpgtd.c:115) + +xxxxxxxx > xxxxxxxx == false +00000000 > 00000000 == false +00000000 > f0000000 == true +f0000000 > 00000000 == false +00000000 > fxxxxxxx == true +0xxxxxxx > fxxxxxxx == true +Conditional jump or move depends on uninitialised value(s) + at 
0x........: use (pcmpgtd.c:74) + by 0x........: doit (pcmpgtd.c:110) + by 0x........: main (pcmpgtd.c:124) + +xxxxxxx0 > f0000000 == true +xxxxxxx1 > 80000000 == true +5xxxxxxx > 6xxxxxxx == false +8xxxxxxx > 9xxxxxxx == false +Conditional jump or move depends on uninitialised value(s) + at 0x........: use (pcmpgtd.c:74) + by 0x........: doit (pcmpgtd.c:110) + by 0x........: main (pcmpgtd.c:129) + +1234567x > 12345678 == false +1234567x > 1234567f == false +Conditional jump or move depends on uninitialised value(s) + at 0x........: use (pcmpgtd.c:74) + by 0x........: doit (pcmpgtd.c:110) + by 0x........: main (pcmpgtd.c:131) + +1234567x > 1234567e == false + +HEAP SUMMARY: + in use at exit: 0 bytes in 0 blocks + total heap usage: 0 allocs, 0 frees, 0 bytes allocated + +For a detailed leak analysis, rerun with: --leak-check=full + +Use --track-origins=yes to see where uninitialised values come from +For lists of detected and suppressed errors, rerun with: -s +ERROR SUMMARY: 4 errors from 4 contexts (suppressed: 0 from 0) diff --git a/memcheck/tests/amd64/pcmpgtd.vgtest b/memcheck/tests/amd64/pcmpgtd.vgtest new file mode 100644 index 000000000..08fabeb76 --- /dev/null +++ b/memcheck/tests/amd64/pcmpgtd.vgtest @@ -0,0 +1,2 @@ +prog: pcmpgtd -q +prereq: test -e pcmpgtd -- 2.20.1 |