Menu

#912 Z80: comparing are two bytes equal -- heavy stack use

None
open
nobody
None
5
2024-05-02
2024-04-23
No

The following function could use only HL DE BC and A throughout:

int memcmp (const void * vbuf1, const void * vbuf2, unsigned count)
{
    unsigned char* buf1 = (unsigned char*)vbuf1; 
    unsigned char* buf2 = (unsigned char*)vbuf2; 
    if ( count ) { 
        do {
            if ( *buf1 != *buf2 )  
                break;
             buf2++; buf1++;    
        } while ( --count );    

        return( *buf1 - *buf2 );
    } else
        return 0;
}

annotated:

int memcmp (const void * vbuf1, const void * vbuf2, unsigned count)
{
    unsigned char* buf1 = (unsigned char*)vbuf1; // HL
    unsigned char* buf2 = (unsigned char*)vbuf2; // DE
    if ( count ) {  // BC
        do {
            if ( *buf1 != *buf2 )  // ld A,(DE) : cp (HL) : jr NZ, break
                break;
             buf2++; buf1++;    // inc DE : inc HL
        } while ( --count );    // dec BC : ld A, B : or C : jr NZ, loop

        return( *buf1 - *buf2 );
    } else
        return 0;
}

but currently (#14815 (Linux)) what I get is:

sdcc   -mz80  --opt-code-speed  --max-allocs-per-node20000 --Werror --peep-return  -c

_memcmp::
    push    ix
    ld  ix,#0
    add ix,sp
    push    af
    ld  c, l
    ld  b, h
;check.c:3: unsigned char* buf1 = (unsigned char*)vbuf1; 
;check.c:4: unsigned char* buf2 = (unsigned char*)vbuf2; 
;check.c:5: if ( count ) { 
    ld  a, 5 (ix)
    or  a, 4 (ix)
    jr  Z, 00107$
;check.c:6: do {
    ld  l, 4 (ix)
    ld  h, 5 (ix)
00103$:
;check.c:7: if ( *buf1 != *buf2 )  
    ld  a, (bc)
    ld  -2 (ix), a
    ld  a, (de)
    ld  -1 (ix), a
    ld  a, -2 (ix)
    sub a, -1 (ix)
    jr  NZ, 00105$
;check.c:9: buf2++; buf1++;    
;check.c:10: } while ( --count );    
    dec hl
    inc de
    inc bc
    ld  a, h
    or  a, l
    jr  NZ, 00103$
00105$:
;check.c:12: return( *buf1 - *buf2 );
    ld  a, (bc)
    ld  l, a
    ld  h, #0x00
    ld  a, (de)
    ld  c, a
    ld  b, #0x00
    cp  a, a
    sbc hl, bc
    ex  de, hl
    jp  00109$
00107$:
;check.c:14: return 0;
    ld  de, #0x0000
00109$:
;check.c:15: }
    ld  sp, ix
    pop ix
    pop hl
    pop af
    jp  (hl)

The biggest problem is in the loop:

;check.c:7: if ( *buf1 != *buf2 )  
    ld  a, (bc)
    ld  -2 (ix), a
    ld  a, (de)
    ld  -1 (ix), a
    ld  a, -2 (ix)
    sub a, -1 (ix)
    jr  NZ, 00105$

Ideally, that could be

   1A        ld a,(de)    [7]
   BE        cp (hl)      [7]
   20 ..     jr NZ, ..

Then the registry allocator would "know" that HL is the one used in CP, and then the codegen would just use all that? I am aware that the icode has a lot of variables now, and that maybe it could be too complicated to achieve that, if the levels can't plan like that, but maybe this could be considered as a goal.

I am also aware that some modifications of how that compare is written produce different code.

E.g.

        unsigned char c2 = *buf2;
        if ( *buf1 != c2 )
            break;

gives:

;check.c:7: unsigned char c2 = *buf2;
    ld  a, (de)
    ld  -2 (ix), a
;check.c:8: if ( *buf1 != c2 )
    ld  a, (bc)
    ld  -1 (ix), a
    ld  a, -2 (ix)
    sub a, -1 (ix)
    jr  NZ, 00105$

and this almost manages to produce something better:

        unsigned char c2 = *buf2;
        if ( (unsigned char)(*buf1 - c2) )
            break;

(note without the cast it would use 16 bits for sub there, which would also be unnecessary and is a symptom of some issue, I'd think)

;check.c:7: unsigned char c2 = *buf2;
    ld  a, (de)
    ld  -1 (ix), a
;check.c:8: if ( (unsigned char)(*buf1 - c2) )
    ld  a, (bc)
    sub a, -1 (ix)
    or  a, a
    jr  NZ, 00105$

but it still uses stack for one of the two.

I think that ideally all the forms would generate

   1A        ld a,(de)    [7]
   BE        cp (hl)      [7]
   20 ..     jr NZ, ..

but I don't know how possible that would be.

Discussion

  • Janko Stamenović

    sub a, -1 (ix) or a, a jr NZ is also unexpected in that last case.

     
  • Janko Stamenović

    The problem with temporary variables, demonstrated in the minimal function:

    _Bool two_bytes_equal( unsigned char* buf1, unsigned char* buf2 )
    {
        return *buf1 == *buf2;
    }
    

    produces:

    ;check.c:2: _Bool two_bytes_equal( unsigned char* buf1, unsigned char* buf2 )
    _two_bytes_equal::
    ;check.c:4: return *buf1 == *buf2;
        ld  c, (hl)
        ld  a, (de)
        ld  b, a
        ld  a, c
        sub a, b
        ld  a, #0x01
        ret Z
        xor a, a
    ;check.c:5: }
        ret
    

    verbose:

    ;   ---------------------------------
    ; Function two_bytes_equal
    ; ---------------------------------
    ;   Register assignment is optimal.
    ; Stack space usage: 0 bytes.
    _two_bytes_equal::
    ;ic:2: iTemp0 [k2 lr3:5 so:0]{ ia0 a2p0 re1 rm0 nos0 ru0 dp0}{unsigned-char generic* fixed}{ sir@ _two_bytes_equal_buf1_10000_1}[l h ] = recv _two_bytes_equal [k1 lr0:0 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{_Bool function ( unsigned-char generic* auto, unsigned-char generic* auto) fixed}
    ;   genReceive
    ;   genMove_o size 2 result type 2 source type 2 hl_dead 1
    ;ic:3: iTemp1 [k4 lr4:6 so:0]{ ia0 a2p0 re1 rm0 nos0 ru0 dp0}{unsigned-char generic* fixed}{ sir@ _two_bytes_equal_buf2_10000_1}[e d ] = recv _two_bytes_equal [k1 lr0:0 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{_Bool function ( unsigned-char generic* auto, unsigned-char generic* auto) fixed}
    ;   genReceive
    ;   genMove_o size 2 result type 2 source type 2 hl_dead 0
    ;check.c:4: return *buf1 == *buf2;
    ;ic:5:  iTemp3 [k7 lr5:7 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}[c ] = @[iTemp0 [k2 lr3:5 so:0]{ ia1 a2p0 re1 rm0 nos0 ru0 dp0}{unsigned-char generic* fixed}{ sir@ _two_bytes_equal_buf1_10000_1}[l h ] + 0x0 {const-unsigned-char literal}]
    ;   genPointerGet
        ld  c, (hl)
    ;ic:7:  iTemp5 [k9 lr6:7 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}[b ] = @[iTemp1 [k4 lr4:6 so:0]{ ia1 a2p0 re1 rm0 nos0 ru0 dp0}{unsigned-char generic* fixed}{ sir@ _two_bytes_equal_buf2_10000_1}[e d ] + 0x0 {const-unsigned-char literal}]
    ;   genPointerGet
        ld  a, (de)
    ;   genMove_o size 1 result type 2 source type 2 hl_dead 1
    ;   cheapMove
        ld  b, a
    ;   genMove_o size 0 result type 2 source type 1 hl_dead 1
    ;ic:8:  iTemp6 [k10 lr7:8 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{_Bool fixed}[a ] = iTemp3 [k7 lr5:7 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}[c ] == iTemp5 [k9 lr6:7 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}[b ]
    ;   genCmpEq
    ;   cheapMove
        ld  a, c
        sub a, b
        ld  a, #0x01
        ret Z
    ; common peephole 161 replaced jump by return.
    ; common peephole 169xnz used double assignment in case of NZ condition.
    ; common peephole 158 removed unused label 00103$.
        xor a, a
    ; common peephole 158 removed unused label 00104$.
    ;   genMove_o size 1 result type 2 source type 2 hl_dead 1
    ;ic:9:  ret iTemp6 [k10 lr7:8 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{_Bool fixed}[a ]
    ;   genRet
    ;   genMove_o size 1 result type 2 source type 2 hl_dead 1
    ;ic:10:  _return($1) :
    ;   genLabel
    ; common peephole 158 removed unused label 00101$.
    ;check.c:5: }
    ;ic:11:     eproc _two_bytes_equal [k1 lr0:0 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{_Bool function ( unsigned-char generic* auto, unsigned-char generic* auto) fixed}
    ;   genEndFunction
        ret
    
     

    Last edit: Janko Stamenović 2024-04-23
  • Maarten Brock

    Maarten Brock - 2024-04-29

    Should the optimal solution not use this?

    LD A,(DE)
    INC DE
    CPI
    JR NP, break
    JR NZ, loop
    
     
    • Janko Stamenović

      i still haven't learned enough what is realistic to expect from the existing infrastructure, and I understand that compilers hve its limitations about how far some optimization can go, so I'm more expecting some relatively more general to be possible than some "absolute best". Sooner or later I will spend more time and even if these entries remain unsolved I'll try to nderstand on them where the limits are. Thanks for that example.

       
      👍
      1
  • Ragozini Arturo

    Ragozini Arturo - 2024-04-30

    I am starting to think that the code generator of sdcc is too targeted to generic 8 bit processors to produce good z80 code.

     
    • Philipp Klaus Krause

      I don't think there's an easy answer to this.

      On one hand, the structure of SDCC makes it hard to make good use of indirect and indexed indirect addressing modes (outside of plain loads), as well as complex instructions such as ldi, ldd and cpi. In particular for z80, there is are clear weaknesses in SDCC's use of (hl), (de), (bc), d (iy): When there is a pointer or array access in the C code, SDCC typically loads the result into a temporary (in registers or on the stack), then does the operations on these temporaries. GCC's or LLVM's instruction selection can do much better here.
      But SDCC has better register allocation. And I don't think combining SDCC's register allocator with an GCC-like instruction selection would be easy (though it isn't impossible either).

      There are many still cases where hand-written asm can do better SDCC.

      But AFAIK, SDCC is still the best compiler for the Z80 (and for STM8, Rabbit, etc, which also have complex addressing modes that SDCC won't fully use) so far.

       
      👍
      1
  • Philipp Klaus Krause

    Let's start slow here. For the comparison, I get this code (with --no-peep --fverbose-asm --icode-in-asm --max-allocs-per-node 2000000):

    00103$:
    ;test.c:7: if ( *buf1 != *buf2 )  
    ;ic:13:     iTemp7 [k14 lr13:15 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}{ sir@ _memcmp_sloc0_1_0}[_memcmp_sloc0_1_0] = @[iTemp14 [k21 lr9:22 so:0]{ ia1 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char generic* fixed}[l h ] + 0x0 {const-unsigned-char literal}]
    ;   genPointerGet
    ;   AOP_STK for _memcmp_sloc0_1_0
        ld  a, (hl)
    ;   genMove_o size 1 result type 5 source type 2 hl_dead 0
        ld  -2 (ix), a
    ;   genMove_o size 0 result type 5 source type 1 hl_dead 0
    ;ic:15:     iTemp9 [k16 lr14:15 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}{ sir@ _memcmp_sloc1_1_0}[_memcmp_sloc1_1_0] = @[iTemp12 [k19 lr8:24 so:0]{ ia1 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char generic* fixed}[e d ] + 0x0 {const-unsigned-char literal}]
    ;   genPointerGet
    ;   AOP_STK for _memcmp_sloc1_1_0
        ld  a, (de)
    ;   genMove_o size 1 result type 5 source type 2 hl_dead 0
        ld  -1 (ix), a
    ;   genMove_o size 0 result type 5 source type 1 hl_dead 0
    ;ic:16:     iTemp10 [k17 lr15:16 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{_Bool fixed} = iTemp7 [k14 lr13:15 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}{ sir@ _memcmp_sloc0_1_0}[_memcmp_sloc0_1_0] == iTemp9 [k16 lr14:15 so:0]{ ia0 a2p0 re0 rm0 nos0 ru0 dp0}{unsigned-char register}{ sir@ _memcmp_sloc1_1_0}[_memcmp_sloc1_1_0]
    ;   genCmpEq
    ;   AOP_STK for _memcmp_sloc0_1_0
    ;   AOP_STK for _memcmp_sloc1_1_0
    ;   genMove_o size 1 result type 2 source type 5 hl_dead 0
        ld  a, -2 (ix)
    ;   genMove_o size 0 result type 2 source type 1 hl_dead 0
        sub a, -1 (ix)
        jp  NZ, 00132$
        jp  00133$
    00132$:
        jp  00105$
    00133$:
    

    Here, we have the pointer read results in iTemp7 and iTemp9. bc, de and hl are all in use. So iTemp7 has to go on the stack. But it should be possible to put iTemp9 in a, i.e. get something like this after register allocation and code generation:

    00103$:
        ld  a, (hl)
        ld  -1 (ix), a
        ld  a, (de)
        sub a, -1 (ix)
        jp  NZ, 00132$
        jp  00133$
    00132$:
        jp  00105$
    00133$:
    

    Stack use of the function would be down to a single byte.

    P.S.: This now works in [r14835], which also made some peephole optimizer rule improvements that can optimize two_bytes_equal above even a bit further (since there the other temporary is in a register, not on the stack).

     

    Related

    Commit: [r14835]


    Last edit: Philipp Klaus Krause 2024-04-30
    • Janko Stamenović

      Shouldn't n that commit the rule 115 say "if notUsed %1 "?

       
      • Philipp Klaus Krause

        Yes. Thanks. fixed in [r14836].

         

        Related

        Commit: [r14836]

        • Janko Stamenović

          Many thanks to you!

           
  • Philipp Klaus Krause

    • Group: -->
     
  • Philipp Klaus Krause

    A more generic solution would be [feature-requests:#914].

     

    Related

    Feature Requests: #914


Log in to post a comment.