Menu

#905 Ineffcient asm code

None
open
nobody
None
5
2024-04-16
2024-04-11
No

I was compiling this function

u8         COL0[32];
u8         PAT0[32];
u8         COL1[32];
u8         PAT1[32];

#define MaxNumObj 16
Object MyObject[MaxNumObj];

#define YScreenSplit 106

extern u8 SprtColDef[];
extern u8 SprtPatDef[];

u8 n0,n1;

Object* MyObj;
u8 ii,jj;

u16 y;
i16 x;
u8  p;
u8  c;  

void AddSat0(void) {

    if (n0<32) {

        if (x<0) {
            x += 32;        // mange EC bit
            c |= 1;
        }           

        if (PAT0[n0] != p) {
            PAT0[n0] = p;
            VDP_WriteVRAM(&SprtPatDef[p*32],SptAddr0 + n0*32,0,32);
        }
        if (COL0[n0] != c) {
            COL0[n0] = c;
            VDP_WriteVRAM(&SprtColDef[c*16],SctAddr0 + n0*16,0,16);
        }

        {
            g_VDP_Sprite.Y = y;             // Y coordinate on screen (all lower priority sprite will be disable if equal to 216 or 0xD0)
            g_VDP_Sprite.X = x;             // X coordinate of the sprite
            g_VDP_Sprite.Pattern = n0*4;
            u16 low = SatAddr0;
            low += (n0 * 4);
            VDP_WriteVRAM((u8*)&g_VDP_Sprite, low, 0, 3);
        }
        n0++;
    }   
}

Inspecting the ASM I've found plenty of possible optimisations that the compiler is ignoring
This is the generated asm:

    .area _CODE
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:56: void AddSat0(void) {
;   ---------------------------------
; Function AddSat0
; ---------------------------------
_AddSat0::
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:59: if (n0<32) {
    ld  a, (_n0+0)
    sub a, #0x20
    ret NC
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:61: if (x<0) {
    ld  bc, (_x)
    bit 7, b
    jr  Z, 00102$
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:62: x += 32;        // mange EC bit
    ld  hl, #0x0020
    add hl, bc
    ld  (_x), hl
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:63: c |= 1;
    ld  a, (_c+0)
    or  a, #0x01
    ld  (_c+0), a
00102$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:66: if (PAT0[n0] != p) {
    ld  a, #<(_PAT0)
    ld  hl, #_n0
    add a, (hl)
    ld  c, a
    ld  a, #>(_PAT0)
    adc a, #0x00
    ld  b, a
    ld  a, (bc)
    ld  d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:67: PAT0[n0] = p;
    ld  a,(_p+0)
    cp  a,d
    jr  Z, 00104$
    ld  (bc), a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:68: VDP_WriteVRAM(&SprtPatDef[p*32],SptAddr0 + n0*32,0,32);
    ld  a, (_n0+0)
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    ld  l, a
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, hl
    ex  de, hl
    ld  bc, #_SprtPatDef+0
    ld  a, (_p+0)
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    ld  l, a
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, bc
    ld  bc, #0x0020
    push    bc
    xor a, a
    push    af
    inc sp
    call    _VDP_WriteVRAM_128K
00104$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:70: if (COL0[n0] != c) {
    ld  a, #<(_COL0)
    ld  hl, #_n0
    add a, (hl)
    ld  c, a
    ld  a, #>(_COL0)
    adc a, #0x00
    ld  b, a
    ld  a, (bc)
    ld  d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:71: COL0[n0] = c;
    ld  a,(_c+0)
    cp  a,d
    jr  Z, 00106$
    ld  (bc), a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:72: VDP_WriteVRAM(&SprtColDef[c*16],SctAddr0 + n0*16,0,16);
    ld  a, (_n0+0)
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    ld  l, a
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, hl
    ld  e, l
    ld  a, h
    add a, #0x10
    ld  d, a
    ld  bc, #_SprtColDef+0
    ld  a, (_c+0)
    ld  l, a
;   spillPairReg hl
;   spillPairReg hl
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, hl
    add hl, bc
    ld  bc, #0x0010
    push    bc
    xor a, a
    push    af
    inc sp
    call    _VDP_WriteVRAM_128K
00106$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:76: g_VDP_Sprite.Y = y;             // Y coordinate on screen (all lower priority sprite will be disable if equal to 216 or 0xD0)
    ld  a, (_y+0)
    ld  (#_g_VDP_Sprite),a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:77: g_VDP_Sprite.X = x;             // X coordinate of the sprite
    ld  a, (_x+0)
    ld  (#(_g_VDP_Sprite + 1)),a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:78: g_VDP_Sprite.Pattern = n0*4;
    ld  a, (_n0+0)
    add a, a
    add a, a
    ld  (#(_g_VDP_Sprite + 2)),a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:80: low += (n0 * 4);
    ld  a, (_n0+0)
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    ld  l, a
    add hl, hl
    add hl, hl
    ld  e, l
    ld  a, h
    add a, #0x12
    ld  d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:81: VDP_WriteVRAM((u8*)&g_VDP_Sprite, low, 0, 3);
    ld  hl, #0x0003
    push    hl
    xor a, a
    push    af
    inc sp
    ld  hl, #_g_VDP_Sprite
    call    _VDP_WriteVRAM_128K
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:83: n0++;
    ld  hl, #_n0
    inc (hl)
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:85: }
    ret

The generated code is ignoring 16 bit loading instructions and the fact CP can have (HL) as operand.
If we focus on this section:

;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:66: if (PAT0[n0] != p) {
    ld  a, #<(_PAT0)
    ld  hl, #_n0
    add a, (hl)
    ld  c, a
    ld  a, #>(_PAT0)
    adc a, #0x00
    ld  b, a
    ld  a, (bc)
    ld  d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:67: PAT0[n0] = p;
    ld  a,(_p+0)
    cp  a,d
    jr  Z, 00104$
    ld  (bc), a

it can be rearranged in

    ld  hl,#_PAT0
    ld  bc,(#_n0)
    ld  b,0
    add hl,bc
    ld a,(_p+0)
    cp (hl)
    jr z,00104$
    ld (hl),a

Moreover it tends to use SUB in place of CP, where the latter preserved the content of A.
e.g. here:

;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:59: if (n0<32) {
    ld  a, (_n0+0)
    sub a, #0x20
    ret NC

It could have used CP #0x20 in order to keep in A the correct value on n0 and be able to reuse it later
Any hint on why it happens ?

1 Attachments

Discussion

  • Philipp Klaus Krause

    Ticket moved from /p/sdcc/bugs/3726/

    Can't be converted:

    • _category: Z80
     
  • Philipp Klaus Krause

    • labels: Bad code -->
    • summary: Bad asm code --> Ineffcient asm code
    • Group: -->
     
  • Philipp Klaus Krause

    Since the generated code is correct, I've moved the ticket to feature requests (interpreting it as a request for better optimization), and changed the title (since "bad code" is often used to refer to incorrect code).

     
  • Philipp Klaus Krause

     
  • Philipp Klaus Krause

    Can you give a compileable C code example (preferably contained in a single file), that shows the issue?

     
  • Philipp Klaus Krause

    Using bits from the reproducer for [bugs:#3727], I was able to compile your code here.
    It looks like it would indeed be better to use hl instead of bc here, and use some 16-bit instructions. I checked that codegen can use hl (though it still uses 8-bit instructions then), and that already improves the ld a, (bc) ld d, a to ld a, (hl), so it should have happened. Apparently something prevents the register allocator from using hl here.
    So to improve the code from SDCC here, we need to
    1) Ensure that the register allocator considers using hl here.
    2) Make code generation use 16-bit instructions here.

     

    Related

    Bugs: #3727

    • Philipp Klaus Krause

      2) Should be working now in [r14792].
      1) Is still missing.

       
      👍
      1

      Related

      Commit: [r14792]


      Last edit: Philipp Klaus Krause 2024-04-13
      • Philipp Klaus Krause

        In [r14793], there were further improvements. For huge arguments to --max-allocs-per-node, I now see the use of hl here (works for me at --max-allocs-per-node 60000000, but not --max-allocs-per-node 40000000).
        Maybe with a bit of fine-tuning the heuristics (possibly rough_cost_estimate in ralloc2.cc), the register allocator will use hl here at lower --max-allocs-per-node parameter.

        Philipp

         

        Related

        Commit: [r14793]


        Last edit: Philipp Klaus Krause 2024-04-16
  • Philipp Klaus Krause

    This simplified code still reproduces the issue:

    typedef unsigned char u8;
    typedef signed char i8;
    typedef unsigned int u16;
    typedef signed int i16;
    
    #define SptAddr0    0x0000
    #define SctAddr0    0x1000
    
    u8         COL0[32];
    u8         PAT0[32];
    u8         COL1[32];
    u8         PAT1[32];
    
    extern u8 SprtColDef[];
    extern u8 SprtPatDef[];
    
    u8 n0;
    
    i16 x;
    u8  p;
    u8  c;  
    
    void VDP_WriteVRAM_16K(const u8* src, u16 dest, u16 count);
    
    #define VDP_WriteVRAM(src, destLow, destHigh, count)    VDP_WriteVRAM_16K(src, destLow, count)
    
    void AddSat0(void) {
    
        if (n0<32) {
    
            if (x<0) {
                x += 32;        // mange EC bit
                c |= 1;
            }           
    
            if (PAT0[n0] != p) {
                PAT0[n0] = p;
                VDP_WriteVRAM(&SprtPatDef[p*32],SptAddr0 + n0*32,0,32);
            }
            if (COL0[n0] != c) {
                COL0[n0] = c;
                VDP_WriteVRAM(&SprtColDef[c*16],SctAddr0 + n0*16,0,16);
            }
    
            n0++;
        }   
    }
    
     

Log in to post a comment.

MongoDB Logo MongoDB