Re: [Ocaml-lib-devel] write_double

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

> > Have you decided to make the extlib with only ocaml code,
> > without any external C ?
>
> That's the policy - to make it easier to port extlib to any platform
> which supports OCaml.

Maybe write_double_opt could be shipped alongside extlib as an optional
patch, with a readme explaining the difference.

> Is it possible to match the speed-ups using pure OCaml code?  eg.  by
> carefully looking at the generated assembler (ocamlopt -S) and
> studying why it might be slow?

Yes, I have compared the gas code of the extlib version with that of the
mixed OCaml/C version, and even without it, just reading the original code
makes it easy to understand where the difference comes from:

* the mixed OCaml/C version copies the double into an OCaml string buffer
  (always the same one, so there is no allocation at each call) with the C
  function memcpy, then the string buffer is written once. This is minimal,
  with no extra operations:

(* Scratch buffer shared by every call: [double_cast] overwrites its bytes in
   place, so nothing is allocated per write. The initial contents are only a
   placeholder giving the buffer a length of 8 characters (assumed to equal
   sizeof(double) on the C side -- TODO confirm for the target platform). *)
let buf_str = "01234567"

(* C stub: copies the raw bytes of the float argument into [buf_str]. *)
external double_cast: buf_str:string -> float -> unit = "double_cast"

(* [write_double_opt_native ch d] stores the in-memory representation of [d]
   into the shared buffer, then emits that buffer on [ch] with one [nwrite].
   The byte order is whatever the C-side memcpy produces, i.e. presumably the
   host's native order. *)
let write_double_opt_native ch d =
  let () = double_cast ~buf_str d in
  nwrite ch buf_str

/*
 * double_cast(str, d): copy the raw bytes of the OCaml float `d` into the
 * OCaml string `str`, in place, and return unit.
 *
 * Both `value`s are used directly as pointers to their payload. No length
 * check is performed: `str` must be at least sizeof(double) bytes long.
 */
CAMLprim value double_cast( value str, value d ) {
        char *dst = (char *)str;          /* bytes of the string buffer */
        const double *src = (double *)d;  /* payload of the float value */

        memcpy(dst, src, sizeof(double));
        return Val_unit;
}

// as you see there is only one line of C code
// we have "Int64.bits_of_float" but not any "String.buf_of_float" :,(

// I've tried the Marshal module: the binary part is at the end,
// but for some particular floats the total number of bytes is not
// the expected one.

    ; the assembly code of the C part:
# Compiled form of the C stub double_cast(str, d): the memcpy of one double
# has been expanded inline into two 32-bit load/store pairs.
double_cast:
        # standard frame setup
        pushl   %ebp
        movl    %esp, %ebp
        # %edx = second argument (d), %ecx = first argument (str)
        movl    12(%ebp), %edx
        movl    8(%ebp), %ecx
        # copy bytes 0..3
        movl    (%edx), %eax
        movl    %eax, (%ecx)
        # copy bytes 4..7
        movl    4(%edx), %eax
        movl    %eax, 4(%ecx)
        # return value 1, which is what the C source returns as Val_unit
        movl    $1, %eax
        popl    %ebp
        ret

    ; and the gas code of the ocaml part:
# ocamlopt output for write_double_opt_native ch d:
# one call to the C stub through caml_c_call, then a tail call to nwrite.
# On entry %eax holds ch and %ebx holds d.
camlIO__write_double_opt_native_422:
        subl    $4, %esp
.L557:
        # save ch across the C call
        movl    %eax, 0(%esp)
        # push the C arguments right to left: d, then the word at camlIO + 296
        # (presumably the global slot holding buf_str)
        pushl   %ebx
        pushl   camlIO + 296
        movl    $double_cast, %eax
        call    caml_c_call
.L558:
        # drop the two C arguments
        addl    $8, %esp
        # set up nwrite's arguments: %eax = ch, %ebx = the same global word
        movl    camlIO + 296, %ebx
        movl    0(%esp), %eax
        addl    $4, %esp
        # tail call: nwrite returns directly to our caller
        jmp     camlIO__nwrite_140

* the original write_double performs a lot of shifts and conversions;
  here is the list of the extra operations it does:

    - 1 x (Int64.bits_of_float)
    - 3 x (Int64.to_int32)
    - 1 x (Int64.shift_right_logical)
    - 4 x (Int32.to_int)
    - 2 x (Int32.shift_right_logical)
    - 4 x (lsr)
    - 8 x (write_byte)

(and there is no surprise in the related gas code, which simply lists all
these operations; if you are interested, I've put it at the end of this
email, because it's quite long)

* even without additional C, it would be possible to make the implementation
a bit more concise, but this doesn't improve the speed very much (only very
slightly, even with 1 million calls):

(* [write_double_ext_native ch f] writes the 64-bit pattern of [f] to [ch]
   with eight [write_byte] calls, starting from the least-significant bits.

   Only three Int64 extractions are used: the pattern is viewed through three
   native-int windows that start at bits 0, 24 and 48, and the other bytes
   come from plain [lsr] on those windows. The values handed to [write_byte]
   are not masked here; this relies on [write_byte] keeping only the low
   8 bits of its argument -- TODO confirm against its definition. *)
let write_double_ext_native ch f =
  let bits = Int64.bits_of_float f in
  let window_at shift = Int64.to_int (Int64.shift_right_logical bits shift) in
  let low = Int64.to_int bits in
  let mid = window_at 24 in
  let high = window_at 48 in
  (* [emit w s] outputs the byte of window [w] that starts at bit [s]. *)
  let emit window shift = write_byte ch (window lsr shift) in
  emit low 0;
  emit low 8;
  emit low 16;
  emit mid 0;
  emit mid 8;
  emit mid 16;
  emit high 0;
  emit high 8

this version is also included in the test tarball that I provided in my
previous email, and you can easily compare it with all the other
implementations by adding this line to the test script 'test_write_double.sh':
   time ./test.opt /dev/null  -ext

--
With Regards
Florent
-- 

; gas code of the current write_double:

# ocamlopt output for the current write_double: convert the float to its
# int64 bit pattern through a C call, then tail-call write_i64.
# On entry %eax holds the first argument and %ebx the float.
camlIO__write_double_391:
        subl    $4, %esp
.L527:  
        # save the first argument across the C call
        movl    %eax, 0(%esp)
        # single C argument: the float
        pushl   %ebx 
        movl    $caml_int64_bits_of_float, %eax
        call    caml_c_call
.L528:  
        addl    $4, %esp
        # the int64 result becomes the second argument of write_i64
        movl    %eax, %ebx
        movl    0(%esp), %eax
        addl    $4, %esp
        # tail call
        jmp     camlIO__write_i64_385

; ....

# ocamlopt output for write_i64: the int64 is split into two int32 halves
# (three C calls in total) and each half is passed to write_real_i32,
# low half first. On entry %eax holds the first argument, %ebx the int64.
camlIO__write_i64_385:
        subl    $8, %esp
.L520:  
        # keep both arguments on the stack: first argument at 4(%esp),
        # int64 at 0(%esp)
        movl    %eax, 4(%esp)
        movl    %ebx, 0(%esp)
        # C call 1: Int64.to_int32 on the full value (low half)
        pushl   %ebx
        movl    $caml_int64_to_int32, %eax
        call    caml_c_call
.L521:  
        addl    $4, %esp
        movl    %eax, %ebx
        movl    4(%esp), %eax
        # write the low half
        call    camlIO__write_real_i32_380
.L522:
        # C call 2: logical shift right of the saved int64.
        # 65 is the tagged OCaml integer 32 (2*32 + 1); after this push the
        # saved int64 sits at 4(%esp).
        pushl   $65
        movl    4(%esp), %eax
        pushl   %eax
        movl    $caml_int64_shift_right_unsigned, %eax
        call    caml_c_call
.L523:
        addl    $8, %esp
        # C call 3: Int64.to_int32 on the shifted value (high half)
        pushl   %eax
        movl    $caml_int64_to_int32, %eax
        call    caml_c_call
.L524:  
        addl    $4, %esp
        movl    %eax, %ebx
        movl    4(%esp), %eax
        addl    $8, %esp
        # tail call: write the high half
        jmp     camlIO__write_real_i32_380

; ...

# ocamlopt output for write_real_i32: emits the four bytes of a boxed int32,
# lowest byte first, through four indirect calls.
# On entry %eax holds the first argument and %ebx the boxed int32.
# Integers are in OCaml's tagged form (2*n + 1), so the mask 511 below is the
# tagged equivalent of "land 0xFF" (2*255 + 1).
camlIO__write_real_i32_380:
        subl    $12, %esp
.L516:  
        # keep the first argument at 8(%esp) for the later calls
        movl    %eax, %ecx
        movl    %ecx, 8(%esp)
        # load the raw 32-bit payload (word at offset 4 of the boxed value,
        # presumably just after the custom-operations pointer) and tag it
        movl    4(%ebx), %eax
        sall    $1, %eax
        orl     $1, %eax
        # 0(%esp) = tagged payload, reused for bytes 1 and 2
        movl    %eax, 0(%esp)
        # top byte: logical shift of the raw payload by 24, then tag
        movl    4(%ebx), %ebx
        shrl    $24, %ebx
        sall    $1, %ebx
        orl     $1, %ebx
        # 4(%esp) = tagged top byte, used for the last call
        movl    %ebx, 4(%esp)
        # byte 0: mask the tagged payload to its low byte
        andl    $511, %eax
        # fetch field 0 of the first argument (a closure), then its code
        # pointer, and call it with the byte in %eax
        movl    (%ecx), %ebx
        movl    (%ebx), %ecx
        call    *%ecx
.L517:  
        # byte 1: shift the tagged value by 8, restore the tag bit, mask
        movl    0(%esp), %eax
        shrl    $8, %eax
        orl     $1, %eax
        andl    $511, %eax
        movl    8(%esp), %ebx
        movl    (%ebx), %ebx
        movl    (%ebx), %ecx
        call    *%ecx
.L518:
        # byte 2: same sequence with a shift of 16
        movl    0(%esp), %eax
        shrl    $16, %eax
        orl     $1, %eax
        andl    $511, %eax
        movl    8(%esp), %ebx
        movl    (%ebx), %ebx
        movl    (%ebx), %ecx
        call    *%ecx
.L519:
        # byte 3: the top byte computed at the start
        movl    4(%esp), %eax
        andl    $511, %eax
        movl    8(%esp), %ebx
        movl    (%ebx), %ebx
        movl    (%ebx), %ecx
        # release the frame and tail-call the same closure
        addl    $12, %esp
        jmp     *%ecx