|
From: Stas B. <sta...@gm...> - 2026-06-10 03:21:45
|
Not checking for the 8th bit is a question. A custom SIMD routine
would provide ascii-checking basically for free (and even for :utf8),
although it's harder to keep up with well optimized memcpy (but
they're all in assembly anyway, not impossible to copy the same
approaches).
The buffering external formats do have SB-VM::SIMD-COPY-UTF8-TO-BASE-STRING.
On Wed, Jun 10, 2026 at 6:09 AM snuglas via Sbcl-commits
<sbc...@li...> wrote:
>
> The branch "master" has been updated in SBCL:
> via 007463a97af9e11b4ea2c41917bf2838b371e9c0 (commit)
> from 2df3f8071bfcf85721104e58efad6d38e9c2b912 (commit)
>
> - Log -----------------------------------------------------------------
> commit 007463a97af9e11b4ea2c41917bf2838b371e9c0
> Author: Douglas Katzman <do...@go...>
> Date: Tue Jun 9 22:57:57 2026 -0400
>
> Improve receiving some C strings from foreign function on 64-bit Unicode
>
> If the string's EF is :ASCII and the desired element-type is BASE-CHAR then just memcpy.
> ---
> src/code/c-call.lisp | 33 ++++++++++++++++++++++++---------
> src/code/misc-aliens.lisp | 8 --------
> src/code/target-alieneval.lisp | 9 +++++++++
> src/code/target-c-call.lisp | 8 ++++++++
> src/cold/exports.lisp | 2 +-
> src/compiler/fndb.lisp | 2 ++
> tests/aliencall.pure.lisp | 16 ++++++++++++++++
> 7 files changed, 60 insertions(+), 18 deletions(-)
>
> diff --git a/src/code/c-call.lisp b/src/code/c-call.lisp
> index 2d343d1ac..67c130576 100644
> --- a/src/code/c-call.lisp
> +++ b/src/code/c-call.lisp
> @@ -83,6 +83,8 @@
> :datum nil))
>
> (define-alien-type-method (c-string :naturalize-gen) (type alien)
> + ;; Potentially the SAFETY policy could influence whether to elide
> + ;; the null check on strings whose alien type says non-nullable.
> `(if (zerop (sap-int ,alien))
> ,(if (alien-c-string-type-not-null type)
> `(null-error ',type)
> @@ -91,19 +93,32 @@
> ;; conversion, or whether we can just do a cheap byte-by-byte
> ;; copy of the c-string data.
> ;;
> - ;; On SB-UNICODE we can never do the cheap copy, even if the
> - ;; external format and element-type are suitable, since
> + ;; On SB-UNICODE the cheap copy is possible for external-format :ASCII
> + ;; and copying to a base-string. Otherwise it isn't since
> ;; simple-base-strings may not contain ISO-8859-1 characters.
> ;; If we need to check for non-ascii data in the input, we
> ;; might as well go through the usual external-format machinery
> ;; instead of rewriting another version of it.
> - ,(if #+sb-unicode t
> - #-sb-unicode (c-string-needs-conversion-p type)
> - `(c-string-to-string ,alien
> - (c-string-external-format ,type)
> - (alien-c-string-type-element-type
> - ,type))
> - `(%naturalize-c-string ,alien))))
> + ,(let ((conv `(c-string-to-string
> + ,alien
> + (c-string-external-format ,type)
> + ',(alien-c-string-type-element-type type))))
> + #-sb-unicode
> + (if (c-string-needs-conversion-p type) conv `(%naturalize-c-string ,alien))
> + #+sb-unicode
> + (if (or (neq (alien-c-string-type-external-format type) :ascii)
> + (neq (alien-c-string-type-element-type type) 'base-char)
> + ;; this test might be unnecessary but if you're asking for maximum
> + ;; safety then we should check for non-ASCII characters
> + (sb-c::policy sb-c::*policy* (= safety 3)))
> + conv
> + ;; else "cheap byte-by-byte copy"
> + #-64-bit `(%naturalize-c-string ,alien)
> + ;; even better: avoid SAP consing. The CPU and OS assure that us that
> + ;; userspace can't have bit 63 on for two popular architectures.
> + #+64-bit
> + `(%naturalize-base-string/word #+(or arm64 x86-64) (the fixnum (sap-int ,alien))
> + #-(or arm64 x86-64) (sap-int ,alien))))))
>
> (define-alien-type-method (c-string :deport-gen) (type value)
> ;; This SAP taking is safe as DEPORT callers pin the VALUE when
> diff --git a/src/code/misc-aliens.lisp b/src/code/misc-aliens.lisp
> index 49620026e..6c8d73139 100644
> --- a/src/code/misc-aliens.lisp
> +++ b/src/code/misc-aliens.lisp
> @@ -69,14 +69,6 @@
> (dest system-area-pointer)
> (src system-area-pointer)
> (n sb-unix::size-t))
> -;;; The overhead of Lisp may make the distinction between memmove() and memcpy()
> -;;; irrelevant, but we may as well promise that the ranges don't overlap when one
> -;;; of them is a freshly consed string, for example.
> -(declaim (inline memcpy))
> -(define-alien-routine ("memcpy" memcpy) system-area-pointer
> - (dest system-area-pointer)
> - (src system-area-pointer)
> - (n sb-unix::size-t))
>
> (defun copy-ub8-to-system-area (src src-offset dst dst-offset length)
> (with-pinned-objects (src)
> diff --git a/src/code/target-alieneval.lisp b/src/code/target-alieneval.lisp
> index dce96d26b..b2fa2593e 100644
> --- a/src/code/target-alieneval.lisp
> +++ b/src/code/target-alieneval.lisp
> @@ -963,3 +963,12 @@ specifies the way that the argument is passed.
> (error "(STRUCT ~S) has unexpected size" tag)))))
> (check-size 'sb-unix::timespec)
> (check-size 'sb-unix::timeval)))
> +
> +;;; The overhead of Lisp may make the distinction between memmove() and memcpy()
> +;;; irrelevant, but we may as well promise that the ranges don't overlap when one
> +;;; of them is a freshly consed string, for example.
> +(declaim (inline sb-impl::memcpy))
> +(define-alien-routine ("memcpy" sb-impl::memcpy) system-area-pointer
> + (dest system-area-pointer)
> + (src system-area-pointer)
> + (n sb-unix::size-t))
> diff --git a/src/code/target-c-call.lisp b/src/code/target-c-call.lisp
> index c24355191..c517663ed 100644
> --- a/src/code/target-c-call.lisp
> +++ b/src/code/target-c-call.lisp
> @@ -70,3 +70,11 @@
> ;; COPY-UB8 pins the lisp string, no need to do it here
> (sb-kernel:copy-ub8-from-system-area sap 0 result 0 length)
> result))
> +
> +(defun %naturalize-base-string/word (word)
> + (declare (type sb-vm:word word))
> + (let* ((length (alien-funcall (extern-alien "strlen" (function size-t unsigned)) word))
> + (result (make-string length :element-type 'base-char)))
> + (with-pinned-objects (result)
> + (sb-impl::memcpy (vector-sap result) (int-sap word) length))
> + result))
> diff --git a/src/cold/exports.lisp b/src/cold/exports.lisp
> index cf0f8b564..34770006b 100644
> --- a/src/cold/exports.lisp
> +++ b/src/cold/exports.lisp
> @@ -1012,7 +1012,7 @@ Lisp extension proposal by David N. Gray")
> "%CAST"
> "%DEREF-ADDR" "%HEAP-ALIEN" "%HEAP-ALIEN-ADDR"
> "%LOCAL-ALIEN-ADDR" "%LOCAL-ALIEN-FORCED-TO-MEMORY-P" "%SAP-ALIEN"
> - "%NATURALIZE-C-STRING"
> + "%NATURALIZE-C-STRING" "%NATURALIZE-BASE-STRING/WORD"
> "%SET-DEREF" "%SET-HEAP-ALIEN" "%SET-LOCAL-ALIEN" "%SET-SLOT"
> "%SLOT-ADDR" "*SAVED-FP*" "*VALUES-TYPE-OKAY*"
> "*ALIEN-TYPE-HASHSETS*"
> diff --git a/src/compiler/fndb.lisp b/src/compiler/fndb.lisp
> index 69a7f4be3..622d27a08 100644
> --- a/src/compiler/fndb.lisp
> +++ b/src/compiler/fndb.lisp
> @@ -2397,6 +2397,8 @@
> ;;;; ALIEN and call-out-to-C stuff
>
> (defknown %alien-funcall ((or string system-area-pointer) alien-type &rest t) *)
> +#+64-bit
> +(defknown sb-alien-internals:%naturalize-base-string/word (sb-vm:word) simple-base-string)
>
> ;; Used by WITH-PINNED-OBJECTS
> (defknown sb-vm::touch-object (t) (values)
> diff --git a/tests/aliencall.pure.lisp b/tests/aliencall.pure.lisp
> index 68d651671..84d3bf19f 100644
> --- a/tests/aliencall.pure.lisp
> +++ b/tests/aliencall.pure.lisp
> @@ -39,3 +39,19 @@
> ;; 53F9FF58 LDR R9, #x1001A10048 ; printf
> ;; 2) 60023FD6 BLR R9
> #+arm64 (assert (= (loop for line in lines count (search "BLR" line)) 2))))
> +
> +(locally
> +(declare (optimize (sb-c::alien-funcall-saves-fp-and-pc 0)))
> +(define-alien-routine strerror (c-string :external-format :ascii :element-type base-char)
> + (e int)))
> +
> +(with-test (:name :return-c-string-optimizer :skipped-on (:not :x86-64))
> + ;; check that the thing actually works
> + (assert (plusp (length (strerror sb-unix:ebadf))))
> + (let ((lines (ctu:disassembly-lines #'strerror)))
> + ;; should tail-call the naturalize function
> + (assert (loop for line in lines
> + thereis (and (search "JMP" line) (search "%NATURALIZE-BASE-STRING/WORD" line))))
> + ;; no alloc-tramp (no SAP consing), nor BIGNUM consing
> + (assert (loop for line in lines
> + never (or (search "ALLOC" line) (search "BIGNUM" line))))))
>
> -----------------------------------------------------------------------
>
>
> hooks/post-receive
> --
> SBCL
>
>
> _______________________________________________
> Sbcl-commits mailing list
> Sbc...@li...
> https://lists.sourceforge.net/lists/listinfo/sbcl-commits
|