I was compiling this function
u8 COL0[32];
u8 PAT0[32];
u8 COL1[32];
u8 PAT1[32];
#define MaxNumObj 16
Object MyObject[MaxNumObj];
#define YScreenSplit 106
extern u8 SprtColDef[];
extern u8 SprtPatDef[];
u8 n0,n1;
Object* MyObj;
u8 ii,jj;
u16 y;
i16 x;
u8 p;
u8 c;
void AddSat0(void) {
if (n0<32) {
if (x<0) {
x += 32; // mange EC bit
c |= 1;
}
if (PAT0[n0] != p) {
PAT0[n0] = p;
VDP_WriteVRAM(&SprtPatDef[p*32],SptAddr0 + n0*32,0,32);
}
if (COL0[n0] != c) {
COL0[n0] = c;
VDP_WriteVRAM(&SprtColDef[c*16],SctAddr0 + n0*16,0,16);
}
{
g_VDP_Sprite.Y = y; // Y coordinate on screen (all lower priority sprite will be disable if equal to 216 or 0xD0)
g_VDP_Sprite.X = x; // X coordinate of the sprite
g_VDP_Sprite.Pattern = n0*4;
u16 low = SatAddr0;
low += (n0 * 4);
VDP_WriteVRAM((u8*)&g_VDP_Sprite, low, 0, 3);
}
n0++;
}
}
Inspecting the ASM I've found plenty of possible optimisations that the compiler is ignoring
This is the generated asm:
.area _CODE
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:56: void AddSat0(void) {
; ---------------------------------
; Function AddSat0
; ---------------------------------
_AddSat0::
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:59: if (n0<32) {
ld a, (_n0+0)
sub a, #0x20
ret NC
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:61: if (x<0) {
ld bc, (_x)
bit 7, b
jr Z, 00102$
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:62: x += 32; // mange EC bit
ld hl, #0x0020
add hl, bc
ld (_x), hl
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:63: c |= 1;
ld a, (_c+0)
or a, #0x01
ld (_c+0), a
00102$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:66: if (PAT0[n0] != p) {
ld a, #<(_PAT0)
ld hl, #_n0
add a, (hl)
ld c, a
ld a, #>(_PAT0)
adc a, #0x00
ld b, a
ld a, (bc)
ld d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:67: PAT0[n0] = p;
ld a,(_p+0)
cp a,d
jr Z, 00104$
ld (bc), a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:68: VDP_WriteVRAM(&SprtPatDef[p*32],SptAddr0 + n0*32,0,32);
ld a, (_n0+0)
ld h, #0x00
; spillPairReg hl
; spillPairReg hl
ld l, a
add hl, hl
add hl, hl
add hl, hl
add hl, hl
add hl, hl
ex de, hl
ld bc, #_SprtPatDef+0
ld a, (_p+0)
ld h, #0x00
; spillPairReg hl
; spillPairReg hl
ld l, a
add hl, hl
add hl, hl
add hl, hl
add hl, hl
add hl, hl
add hl, bc
ld bc, #0x0020
push bc
xor a, a
push af
inc sp
call _VDP_WriteVRAM_128K
00104$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:70: if (COL0[n0] != c) {
ld a, #<(_COL0)
ld hl, #_n0
add a, (hl)
ld c, a
ld a, #>(_COL0)
adc a, #0x00
ld b, a
ld a, (bc)
ld d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:71: COL0[n0] = c;
ld a,(_c+0)
cp a,d
jr Z, 00106$
ld (bc), a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:72: VDP_WriteVRAM(&SprtColDef[c*16],SctAddr0 + n0*16,0,16);
ld a, (_n0+0)
ld h, #0x00
; spillPairReg hl
; spillPairReg hl
ld l, a
add hl, hl
add hl, hl
add hl, hl
add hl, hl
ld e, l
ld a, h
add a, #0x10
ld d, a
ld bc, #_SprtColDef+0
ld a, (_c+0)
ld l, a
; spillPairReg hl
; spillPairReg hl
ld h, #0x00
; spillPairReg hl
; spillPairReg hl
add hl, hl
add hl, hl
add hl, hl
add hl, hl
add hl, bc
ld bc, #0x0010
push bc
xor a, a
push af
inc sp
call _VDP_WriteVRAM_128K
00106$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:76: g_VDP_Sprite.Y = y; // Y coordinate on screen (all lower priority sprite will be disable if equal to 216 or 0xD0)
ld a, (_y+0)
ld (#_g_VDP_Sprite),a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:77: g_VDP_Sprite.X = x; // X coordinate of the sprite
ld a, (_x+0)
ld (#(_g_VDP_Sprite + 1)),a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:78: g_VDP_Sprite.Pattern = n0*4;
ld a, (_n0+0)
add a, a
add a, a
ld (#(_g_VDP_Sprite + 2)),a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:80: low += (n0 * 4);
ld a, (_n0+0)
ld h, #0x00
; spillPairReg hl
; spillPairReg hl
ld l, a
add hl, hl
add hl, hl
ld e, l
ld a, h
add a, #0x12
ld d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:81: VDP_WriteVRAM((u8*)&g_VDP_Sprite, low, 0, 3);
ld hl, #0x0003
push hl
xor a, a
push af
inc sp
ld hl, #_g_VDP_Sprite
call _VDP_WriteVRAM_128K
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:83: n0++;
ld hl, #_n0
inc (hl)
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:85: }
ret
The generated code is ignoring 16 bit loading instructions and the fact CP can have (HL) as operand.
If we focus on this section:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:66: if (PAT0[n0] != p) {
ld a, #<(_PAT0)
ld hl, #_n0
add a, (hl)
ld c, a
ld a, #>(_PAT0)
adc a, #0x00
ld b, a
ld a, (bc)
ld d, a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:67: PAT0[n0] = p;
ld a,(_p+0)
cp a,d
jr Z, 00104$
ld (bc), a
it can be rearranged in
ld hl,#_PAT0
ld bc,(#_n0)
ld b,0
add hl,bc
ld a,(_p+0)
cp (hl)
jr z,00104$
ld (hl),a
Moreover it tends to use SUB in place of CP, where the latter preserved the content of A.
e.g. here:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:59: if (n0<32) {
ld a, (_n0+0)
sub a, #0x20
ret NC
It could have used CP #0x20 in order to keep in A the correct value on n0 and be able to reuse it later
Any hint on why it happens ?
Ticket moved from /p/sdcc/bugs/3726/
Can't be converted:
Since the generated code is correct, I've moved the ticket to feature requests (interpreting it as a request for better optimization), and changed the title (since "bad code" is often used to refer to incorrect code).
Can you give a compileable C code example (preferably contained in a single file), that shows the issue?
Using bits from the reproducer for [bugs:#3727], I was able to compile your code here.
It looks like it would indeed be better to use hl instead of bc here, and use some 16-bit instructions. I checked that codegen can use hl (though it still uses 8-bit instructions then), and that already improves the
ld a, (bc) ld d, atold a, (hl), so it should have happened. Apparently something prevents the register allocator from using hl here.So to improve the code from SDCC here, we need to
1) Ensure that the register allocator considers using hl here.
2) Make code generation use 16-bit instructions here.
Related
Bugs: #3727
2) Should be working now in [r14792].
1) Is still missing.
Related
Commit: [r14792]
Last edit: Philipp Klaus Krause 2024-04-13
In [r14793], there were further improvements. For huge arguments to --max-allocs-per-node, I now see the use of hl here (works for me at --max-allocs-per-node 60000000, but not --max-allocs-per-node 40000000).
Maybe with a bit of fine-tuning the heuristics (possibly rough_cost_estimate in ralloc2.cc), the register allocator will use hl here at lower --max-allocs-per-node parameter.
Philipp
Related
Commit: [r14793]
Last edit: Philipp Klaus Krause 2024-04-16
This simplified code still reproduces the issue: