From: Wu, F. <fe...@in...> - 2023-05-26 14:09:08
|
On 5/26/2023 9:59 PM, Fei Wu wrote: > I'm from the Intel RISC-V team and am working on a RISC-V International > development partner project to add RISC-V vector (RVV) support to > Valgrind; the target tool is memcheck. My work is based on commit > 71272b252977 of Petr's riscv64-linux branch; many thanks to Petr for his > great work. > https://github.com/petrpavlu/valgrind-riscv64 > > This RFC is a starting point for RVV support in Valgrind. It's far from > complete, and finishing it will take a lot of time, but I do think it's more effective > to have some real code for discussion, so this series adds the RVV > support to run memcpy/strcmp/strcpy/strlen/strncpy in: > https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/examples > In case the intrinsic version is built with extra RVV instructions which are not supported yet, here is an assembly version. All C code is from the above link with a small tweak, and the asm code is copied from: https://github.com/riscv/riscv-v-spec/tree/master/example diff --git a/rvv-examples/Makefile b/rvv-examples/Makefile new file mode 100644 index 000000000..dfae4ac31 --- /dev/null +++ b/rvv-examples/Makefile @@ -0,0 +1,23 @@ +CC := clang +CFLAGS := -g -march=rv64gcv -mllvm -riscv-v-vector-bits-min=128 -O2 +ASFLAGS := -g -march=rv64gcv -mllvm -riscv-v-vector-bits-min=128 -O2 + +BINARY = rvv_strcmp rvv_memcpy rvv_strcpy rvv_strlen rvv_strncpy + +.PHONY: all clean test + +all: $(BINARY) + +clean: + rm -f $(BINARY) + +test: $(BINARY) + for t in $(BINARY); do \ + valgrind ./$$t; \ + done + +rvv_strcmp: rvv_strcmp.c strcmp.s +rvv_memcpy: rvv_memcpy.c memcpy.s +rvv_strcpy: rvv_strcpy.c strcpy.s +rvv_strlen: rvv_strlen.c strlen.s +rvv_strncpy: rvv_strncpy.c strncpy.s diff --git a/rvv-examples/common.h b/rvv-examples/common.h new file mode 100644 index 000000000..cec96ed2b --- /dev/null +++ b/rvv-examples/common.h @@ -0,0 +1,112 @@ +// common.h +// common utilities for the test code under examples/ + +#include <math.h> +#include <stdbool.h> +#include 
<stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +extern void *memcpy_vec(void *dst, void *src, size_t n); +extern int strcmp_vec(const char *src1, const char *src2); +extern char *strcpy_vec(char *dst, const char *src); +extern size_t strlen_vec(char *src); +extern char *strncpy_vec(char *dst, char *src, size_t count); + +void gen_rand_1d(double *a, int n) { + for (int i = 0; i < n; ++i) + a[i] = (double)rand() / (double)RAND_MAX + (double)(rand() % 1000); +} + +void gen_string(char *s, int n) { + // char value range: -128 ~ 127 + for (int i = 0; i < n - 1; ++i) + s[i] = (char)(rand() % 127) + 1; + s[n - 1] = '\0'; +} + +void gen_rand_2d(double **ar, int n, int m) { + for (int i = 0; i < n; ++i) + for (int j = 0; j < m; ++j) + ar[i][j] = (double)rand() / (double)RAND_MAX + (double)(rand() % 1000); +} + +void print_string(const char *a, const char *name) { + printf("const char *%s = \"", name); + int i = 0; + while (a[i] != 0) + putchar(a[i++]); + printf("\"\n"); + puts(""); +} + +void print_array_1d(double *a, int n, const char *type, const char *name) { + printf("%s %s[%d] = {\n", type, name, n); + for (int i = 0; i < n; ++i) { + printf("%06.2f%s", a[i], i != n - 1 ? "," : "};\n"); + if (i % 10 == 9) + puts(""); + } + puts(""); +} + +void print_array_2d(double **a, int n, int m, const char *type, + const char *name) { + printf("%s %s[%d][%d] = {\n", type, name, n, m); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < m; ++j) { + printf("%06.2f", a[i][j]); + if (j == m - 1) + puts(i == n - 1 ? 
"};" : ","); + else + putchar(','); + } + } + puts(""); +} + +bool double_eq(double golden, double actual, double relErr) { + return (fabs(actual - golden) < relErr); +} + +bool compare_1d(double *golden, double *actual, int n) { + for (int i = 0; i < n; ++i) + if (!double_eq(golden[i], actual[i], 1e-6)) + return false; + return true; +} + +bool compare_string(const char *golden, const char *actual, int n) { + for (int i = 0; i < n; ++i) + if (golden[i] != actual[i]) + return false; + return true; +} + +bool compare_2d(double **golden, double **actual, int n, int m) { + for (int i = 0; i < n; ++i) + for (int j = 0; j < m; ++j) + if (!double_eq(golden[i][j], actual[i][j], 1e-6)) + return false; + return true; +} + +double **alloc_array_2d(int n, int m) { + double **ret; + ret = (double **)malloc(sizeof(double *) * n); + for (int i = 0; i < n; ++i) + ret[i] = (double *)malloc(sizeof(double) * m); + return ret; +} + +void init_array_one_1d(double *ar, int n) { + for (int i = 0; i < n; ++i) + ar[i] = 1; +} + +void init_array_one_2d(double **ar, int n, int m) { + for (int i = 0; i < n; ++i) + for (int j = 0; j < m; ++j) + ar[i][j] = 1; +} diff --git a/rvv-examples/memcpy.s b/rvv-examples/memcpy.s new file mode 100644 index 000000000..1b50ab670 --- /dev/null +++ b/rvv-examples/memcpy.s @@ -0,0 +1,17 @@ + .text + .balign 4 + .global memcpy_vec + # void *memcpy_vec(void* dest, const void* src, size_t n) + # a0=dest, a1=src, a2=n + # + memcpy_vec: + mv a3, a0 # Copy destination + loop: + vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b + vle8.v v0, (a1) # Load bytes + add a1, a1, t0 # Bump pointer + sub a2, a2, t0 # Decrement count + vse8.v v0, (a3) # Store bytes + add a3, a3, t0 # Bump pointer + bnez a2, loop # Any more? 
+ ret # Return diff --git a/rvv-examples/rvv_memcpy.c b/rvv-examples/rvv_memcpy.c new file mode 100644 index 000000000..d78b9b604 --- /dev/null +++ b/rvv-examples/rvv_memcpy.c @@ -0,0 +1,21 @@ +#include "common.h" +#include <riscv_vector.h> +#include <string.h> + +int main() { + const int N = 127; + const uint32_t seed = 0xdeadbeef; + srand(seed); + + // data gen + double A[N]; + gen_rand_1d(A, N); + + // compute + double golden[N], actual[N]; + memcpy(golden, A, sizeof(A)); + memcpy_vec(actual, A, sizeof(A)); + + // compare + puts(compare_1d(golden, actual, N) ? "pass" : "fail"); +} diff --git a/rvv-examples/rvv_strcmp.c b/rvv-examples/rvv_strcmp.c new file mode 100644 index 000000000..d10cac133 --- /dev/null +++ b/rvv-examples/rvv_strcmp.c @@ -0,0 +1,25 @@ +#include "common.h" +#include <riscv_vector.h> +#include <string.h> + +int main() { + const int N = 1023; + const uint32_t seed = 0xdeadbeef; + srand(seed); + + // data gen + char s0[N], s1[N]; + gen_string(s0, N); + gen_string(s1, N); + + // compute + int golden, actual; + golden = strcmp(s0, s1); + actual = strcmp_vec(s0, s1); + // strcmp only guarantees the sign of its result, so reduce each value to -1/0/1 + golden = (golden == 0) ? 0 : (golden > 0) ? 1 : -1; + actual = (actual == 0) ? 0 : (actual > 0) ? 1 : -1; + + // compare + puts(golden == actual ? "pass" : "fail"); +} diff --git a/rvv-examples/rvv_strcpy.c b/rvv-examples/rvv_strcpy.c new file mode 100644 index 000000000..7e5af8673 --- /dev/null +++ b/rvv-examples/rvv_strcpy.c @@ -0,0 +1,22 @@ +#include "common.h" +#include <assert.h> +#include <riscv_vector.h> +#include <string.h> + +int main() { + const int N = 2000; + const uint32_t seed = 0xdeadbeef; + srand(seed); + + // data gen + char s0[N]; + gen_string(s0, N); + + // compute + char golden[N], actual[N]; + strcpy(golden, s0); + strcpy_vec(actual, s0); + + // compare + puts(strcmp(golden, actual) == 0 ? 
"pass" : "fail"); +} diff --git a/rvv-examples/rvv_strlen.c b/rvv-examples/rvv_strlen.c new file mode 100644 index 000000000..e1142f883 --- /dev/null +++ b/rvv-examples/rvv_strlen.c @@ -0,0 +1,22 @@ +#include "common.h" +#include <riscv_vector.h> +#include <string.h> + +int main() { + const uint32_t seed = 0xdeadbeef; + srand(seed); + + int N = rand() % 2000; + + // data gen + char s0[N]; + gen_string(s0, N); + + // compute + size_t golden, actual; + golden = strlen(s0); + actual = strlen_vec(s0); + + // compare + puts(golden == actual ? "pass" : "fail"); +} diff --git a/rvv-examples/rvv_strncpy.c b/rvv-examples/rvv_strncpy.c new file mode 100644 index 000000000..f1d14ac52 --- /dev/null +++ b/rvv-examples/rvv_strncpy.c @@ -0,0 +1,25 @@ +#include "common.h" +#include <riscv_vector.h> +#include <string.h> + +int main() { + const int N = 1320; + const uint32_t seed = 0xdeadbeef; + srand(seed); + + // data gen + char s0[N]; + gen_string(s0, N); + char s1[] = "the quick brown fox jumps over the lazy dog"; + size_t count = strlen(s1) + rand() % 500; + + // compute + char golden[N], actual[N]; + strcpy(golden, s0); + strcpy(actual, s0); + strncpy(golden, s1, count); + strncpy_vec(actual, s1, count); + + // compare + puts(compare_string(golden, actual, N) ? 
"pass" : "fail"); +} diff --git a/rvv-examples/strcmp.s b/rvv-examples/strcmp.s new file mode 100644 index 000000000..85d32c96d --- /dev/null +++ b/rvv-examples/strcmp.s @@ -0,0 +1,34 @@ + .text + .balign 4 + .global strcmp_vec + # int strcmp_vec(const char *src1, const char* src2) +strcmp_vec: + ## Using LMUL=2, but same register names work for larger LMULs + li t1, 0 # Initial pointer bump +loop: + vsetvli t0, x0, e8, m2, ta, ma # Max length vectors of bytes + add a0, a0, t1 # Bump src1 pointer + vle8ff.v v8, (a0) # Get src1 bytes + add a1, a1, t1 # Bump src2 pointer + vle8ff.v v16, (a1) # Get src2 bytes + + vmseq.vi v0, v8, 0 # Flag zero bytes in src1 + vmsne.vv v1, v8, v16 # Flag if src1 != src2 + vmor.mm v0, v0, v1 # Combine exit conditions + + vfirst.m a2, v0 # ==0 or != ? + csrr t1, vl # Get number of bytes fetched + + bltz a2, loop # Loop if all same and no zero byte + + add a0, a0, a2 # Get src1 element address + lbu a3, (a0) # Get src1 byte from memory + + add a1, a1, a2 # Get src2 element address + lbu a4, (a1) # Get src2 byte from memory + + sub a0, a3, a4 # Return value. + + ret + + diff --git a/rvv-examples/strcpy.s b/rvv-examples/strcpy.s new file mode 100644 index 000000000..292df25ac --- /dev/null +++ b/rvv-examples/strcpy.s @@ -0,0 +1,20 @@ + .text + .balign 4 + .global strcpy_vec + # char* strcpy_vec(char *dst, const char* src) +strcpy_vec: + mv a2, a0 # Copy dst + li t0, -1 # Infinite AVL +loop: + vsetvli x0, t0, e8, m8, ta, ma # Max length vectors of bytes + vle8ff.v v8, (a1) # Get src bytes + csrr t1, vl # Get number of bytes fetched + vmseq.vi v1, v8, 0 # Flag zero bytes + vfirst.m a3, v1 # Zero found? + add a1, a1, t1 # Bump pointer + vmsif.m v0, v1 # Set mask up to and including zero byte. 
+ vse8.v v8, (a2), v0.t # Write out bytes + add a2, a2, t1 # Bump pointer + bltz a3, loop # Zero byte not found, so loop + + ret diff --git a/rvv-examples/strlen.s b/rvv-examples/strlen.s new file mode 100644 index 000000000..721c0257e --- /dev/null +++ b/rvv-examples/strlen.s @@ -0,0 +1,22 @@ + .text + .balign 4 + .global strlen_vec +# size_t strlen_vec(const char *str) +# a0 holds *str + +strlen_vec: + mv a3, a0 # Save start +loop: + vsetvli a1, x0, e8, m8, ta, ma # Vector of bytes of maximum length + vle8ff.v v8, (a3) # Load bytes + csrr a1, vl # Get bytes read + vmseq.vi v0, v8, 0 # Set v0[i] where v8[i] = 0 + vfirst.m a2, v0 # Find first set bit + add a3, a3, a1 # Bump pointer + bltz a2, loop # Not found? + + add a0, a0, a1 # Sum start + bump + add a3, a3, a2 # Add index + sub a0, a3, a0 # Subtract start address+bump + + ret diff --git a/rvv-examples/strncpy.s b/rvv-examples/strncpy.s new file mode 100644 index 000000000..f7114c5ca --- /dev/null +++ b/rvv-examples/strncpy.s @@ -0,0 +1,36 @@ + .text + .balign 4 + .global strncpy_vec + # char* strncpy_vec(char *dst, const char* src, size_t n) +strncpy_vec: + mv a3, a0 # Copy dst +loop: + vsetvli x0, a2, e8, m8, ta, ma # Vectors of bytes. + vle8ff.v v8, (a1) # Get src bytes + vmseq.vi v1, v8, 0 # Flag zero bytes + csrr t1, vl # Get number of bytes fetched + vfirst.m a4, v1 # Zero found? + vmsbf.m v0, v1 # Set mask up to before zero byte. + vse8.v v8, (a3), v0.t # Write out non-zero bytes + bgez a4, zero_tail # Zero remaining bytes. + sub a2, a2, t1 # Decrement count. + add a3, a3, t1 # Bump dest pointer + add a1, a1, t1 # Bump src pointer + bnez a2, loop # Anymore? + + ret + +zero_tail: + sub a2, a2, a4 # Subtract count on non-zero bytes. + add a3, a3, a4 # Advance past non-zero bytes. + vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes. + vmv.v.i v0, 0 # Splat zero. + +zero_loop: + vse8.v v0, (a3) # Store zero. + sub a2, a2, t1 # Decrement count. 
+ add a3, a3, t1 # Bump pointer + vsetvli t1, a2, e8, m8, ta, ma # Vectors of bytes. + bnez a2, zero_loop # Anymore? + + ret Thanks, Fei. > The whole idea is to split the vector instructions into scalar > instructions, which are already well supported on Petr's branch. > The correctness of binary translation (tool=none) is simple to ensure, > but the logic of tool=memcheck must not be broken; one of the keys is > to deal with the instructions with mask: > > * for load/store with mask, LoadG/StoreG are enabled, with the same semantics > as on other architectures > > * for other instructions such as vadd, if the vector mask agnostic (vma) > is set to undisturbed, the masked original value is read first and then > written back; the V bit won't change even after the write-back, so it's not > necessary to have another guard type like LoadG/StoreG. > > Pros > ---- > * by leveraging the existing scalar instruction support in Valgrind, > adding a new instruction usually involves only the frontend in > guest_riscv64_toIR; other parts are rarely touched, so the effort > to enable new instructions is much reduced. > > * As the backend only sees the scalar IRs and generates scalar > instructions, it's possible to run valgrind ./vec-test on a non-RVV host. > > Cons > ---- > * as this method splits RVV instructions at the frontend, there is less > chance to optimize at other stages, e.g. the vbits tracking. > > * with a larger vlen such as 1K, a single RVV instruction can split into up to > 1K ops; besides the performance penalty, this puts pressure on other > components such as tmp space too. Some of this can be relieved by > grouping multiple elements together. > > > There are some alternatives, but none seems perfect: > * helper function. It's much easier to make tool=none work, but how well > does it handle the V+A tracking and other tools? Generally speaking, it > should not be used as a general solution for too many instructions. > > * define and pass the RVV IR to the backend, instead of splitting it too > early. 
This requires much more effort; we should evaluate how much > benefit can be gained. > > Finally, if the performance is tolerable, is this the right way to go? > > > Fei Wu (12): > riscv64: Starting Vector support, registers added > riscv64: Pass riscv guest_state for translation > riscv64: Add SyncupEnv & TooManyIR jump kinds > riscv64: Add LoadG/StoreG support > riscv64: Shift guest_state -2048 on calling helper > riscv64: Add cpu_state to TB > riscv64: Introduce dis_RV64V and add vsetvl > riscv64: Add load/store > riscv64: Add csrr vl > riscv64: add vfirst > riscv64: Add vmsgtu/vmseq/vmsne/vmsbf/vmsif/vmor/vmv/vid > riscv64: Add vadd > > VEX/priv/guest_riscv64_toIR.c | 974 +++++++++++++++++++++++++++++- > VEX/priv/host_riscv64_defs.c | 133 ++++ > VEX/priv/host_riscv64_defs.h | 23 + > VEX/priv/host_riscv64_isel.c | 89 ++- > VEX/priv/ir_defs.c | 8 + > VEX/priv/ir_opt.c | 4 +- > VEX/pub/libvex.h | 4 + > VEX/pub/libvex_guest_riscv64.h | 47 +- > VEX/pub/libvex_ir.h | 9 +- > coregrind/m_scheduler/scheduler.c | 17 +- > coregrind/m_translate.c | 5 + > coregrind/m_transtab.c | 26 +- > coregrind/pub_core_transtab.h | 5 + > memcheck/mc_machine.c | 35 ++ > memcheck/mc_translate.c | 4 + > 15 files changed, 1368 insertions(+), 15 deletions(-) > |