Menu

#491 mcs51: fold extra +1/-1 into genPlus/genMinus

open
None
5
3 days ago
2026-03-02
No

This tries to optimize the pattern "(a + b) + 1" and "(a - b) - 1".

In the case of genMinus we get the extra "-1" for free in many
cases by flipping the carry bit.

In the case of genPlus it requires switching the first add to
already use addc instead of just add, and adding a "SETB C" operation.

Since "SETB C" is a single byte while "ADD A, #1" is two bytes,
this is still a win (except when the register tracker has found an
existing #1 immediate so that "ADD A, Rn" can be used, then it
is a draw).

In the best case (multi-byte addition) we'll save several of the
instructions the extra addition would have generated.

Further optimization would be supporting "(a + b) + bitvar" and
"(a - b) - bitvar", as well as supporting iCode variants like
"a + (b + 1)" or "1 + (a + b)".

1 Attachments

Discussion

  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    A bit more reference on what I was trying to do:

     uint16_t strlen(register __code const char *s)
     {
        __code const char *x = s;
        while (*x)
            x++;
        return x - s;
    }
    

    generates

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    00101$:
        mov dpl,r4
        mov dph,r5
        clr a
        movc    a,@a+dptr
        jz  00103$
        inc r4
        cjne    r4,#0x00,00101$
        inc r5
        sjmp    00101$
    00103$:
        mov a,r4
        clr c
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
        mov dpl,r6
        mov dph,a
        ret
    

    at first I tried modifying rtrack.c to keep track of value equality, and in genPlusIncr for a 16bit addition to use "rtrackRegEq(...)" to prefer "inc dptr/mov/mov" over "inc/cjne/inc", but this doesn't work because register tracking runs before peephole optimizations and any jumps clear the tracker, so it has its state cleared just before the genPlusIncr.

    But then there already is special handling for (*(ptr++)), so we can write:

     uint16_t strlen(register __code const char *s)
     {
        __code const char *x = s;
        while (*(x++));
        return x - s - 1;
    }
    

    but this generates a separate 16bit subtraction for the "- 1":

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    00101$:
        mov dpl,r4
        mov dph,r5
        clr a
        movc    a,@a+dptr
        mov r3,a
        inc dptr
        mov r4,dpl
        mov r5,dph
        mov a,r3
        jnz 00101$
        mov a,r4
        clr c
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
        mov r7,a
        dec r6
        cjne    r6,#0xff,00120$
        dec r7
    00120$:
        mov dpl,r6
        mov dph,r7
        ret
    

    with the proposed change this becomes the more optimized:

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    00101$:
        mov dpl,r4
        mov dph,r5
        clr a
        movc    a,@a+dptr
        mov r3,a
        inc dptr
        mov r4,dpl
        mov r5,dph
        mov a,r3
        jnz 00101$
        mov a,r4
        setb    c
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
        mov dpl,r6
        mov dph,a
        ret
    

    I should try if this can be handled by a peephole rule instead, speaking of which it would be nice if the temporary to dptr and dptr to temporary copies could be hoisted out of the loop...

     
  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    I should try if this can be handled by a peephole rule instead, speaking of which it would be nice if the temporary to dptr and dptr to temporary copies could be hoisted out of the loop...

    And yes, both can with --peep-file containing:

    replace {
        clr c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        dec %1
        cjne    %1,#0xff,%4
        dec %3
    %4:
    } by {
        setb    c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        ;   Peephole optimized decrement, merged into previous
    %4:
    }
    
    replace {
    %1:
        mov dpl,%2
        mov dph,%3
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        mov %2,dpl
        mov %3,dph
        mov a,%4
        jnz %1
    } by {
        mov dpl,%2
        mov dph,%3
    %1:
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        jnz %1
        ;   Peephole hoisted temporary update out of loop
        mov %2,dpl
        mov %3,dph
        mov a,%4
    } if labelRefCount(%1 1)
    

    the generated code becomes:

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    ;   Peephole 108.b  removed ljmp by inverse jump logic
        mov dpl,r4 ;  <----
        mov dph,r5 ;  <----
    00101$:
        clr a
        movc    a,@a+dptr
    ;   Peephole 302 removed dead mov r3, a
        inc dptr
        jnz 00101$
    ;   Peephole hoisted temporary update out of loop <----
        mov r4,dpl ;  <----
        mov r5,dph ;  <----
    ;   Peephole 500    removed redundant label 00119$
    ;   Peephole 177.c  removed redundant mov
        mov a,r4
        setb    c  ; <----
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
    ;   Peephole optimized decrement, merged into previous <----
    00120$:
        mov dpl,r6
        mov dph,r7
    ;   Peephole 206    removed redundant mov dpl,dpl
    ;   Peephole 206    removed redundant mov dph,dph
    ;   Peephole 500    removed redundant label 00104$
        ret
    
     

    Last edit: Tobias Diedrich 2026-03-03
  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    On the note of peephole rules, I think rule 131 is unsafe?

    replace {
        mov a,%1
        subb    a,#0x01
        mov %2,a
        mov %1,%2
    } by {
        ;   Peephole 131    optimized decrement (not caring for c)
        dec %1
        mov %2,%1
    }
    

    AFAICS these are only identical if carry is unset at the beginning, and differ when carry is set.
    Maybe this was intended to be:

    replace {
        clr C
        mov a,%1
        subb    a,#0x01
        mov %2,a
        mov %1,%2
    } by {
        ;   Peephole 131    optimized decrement (not caring for c)
        dec %1
        mov %2,%1
    }
    
     
    • Philipp Klaus Krause

      Do you have a C code sample that can reproduce incorrect optimizations with this rule?

       
      • Tobias Diedrich

        Tobias Diedrich - 6 days ago

        Do you have a C code sample that can reproduce incorrect optimizations with this rule?

        No, I just randomly noticed this while looking for existing add/sub-related peephole rules. I had a quick look at how old this rule is and it's been there for over 20 years, so it's probably safe. Possibly the compiler just never generates instruction sequences that would trigger the false-positive case.

        [edit]Rule 131 goes back all the way to https://sourceforge.net/p/sdcc/code/3/[/edit]

         
        👍
        1

        Last edit: Tobias Diedrich 5 days ago
        • Tobias Diedrich

          Tobias Diedrich - 5 days ago

          Looking through mcs51/gen.c I don't think this pattern is generated anymore.
          Grepping through the regression test results I don't see a single hit on "Peephole 131" either, so this is probably an unused rule nowadays...

           
          👍
          1
  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    Optimized strlen peephole rules:

     uint16_t strlen(register __code const char *s)
     {
        __code const char *x = s;
        while (*(x++));
        return x - s - 1;
    }
    
    // optimized strlen
    replace {
        clr c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        dec %1
        cjne    %1,#0xff,%4
        dec %3
    %4:
    } by {
        setb    c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        ;   Peephole optimized decrement, merged into previous
    } if labelRefCount(%4 1)
    
    replace restart {
    %1:
        mov dpl,%2
        mov dph,%3
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        mov %2,dpl
        mov %3,dph
        mov a,%4
        jnz %1
    } by {
        mov dpl,%2
        mov dph,%3
    %1:
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        jnz %1
        ;   Peephole hoisted temporary update out of loop
        mov %2,dpl
        mov %3,dph
        mov a,%4
    } if labelRefCount(%1 1)
    
    replace restart {
    %1:
        mov dpl,%2
        mov dph,%3
        movx    a,@dptr
        mov %4,a
        inc dptr
        mov %2,dpl
        mov %3,dph
        mov a,%4
        jnz %1
    } by {
        mov dpl,%2
        mov dph,%3
    %1:
        movx    a,@dptr
        mov %4,a
        inc dptr
        jnz %1
        ;   Peephole hoisted temporary update out of loop
        mov %2,dpl
        mov %3,dph
        mov a,%4
    } if labelRefCount(%1 1)
    
    replace {
        mov %1, dpl
        mov %2, dph
        mov a%3,%1
        mov a%4,%2
    } by {
        mov %1, dpl
        mov %2, dph
        mov %3, dpl
        mov %4, dph
        ;   Peephole modified moves to enable further optimizations
    }
    
    replace {
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        mov dpl,%1
        mov dph,%3
        ret
    } by {
        mov dpl,a
        ;   Peephole removed redundant move before return
        mov a,%2
        subb    a,%3
        mov dph,a
        ret
    }
    

    generates:

    _strlen:
        mov r6, dpl
        mov r7, dph
    ;   Peephole modified moves to enable further optimizations
    ;   Peephole 108.b  removed ljmp by inverse jump logic
    ;   Peephole 302 removed dead mov r4, dpl
    ;   Peephole 177.d  removed redundant mov
    ;   Peephole 302 removed dead mov r5, dph
    ;   Peephole 177.a  removed redundant mov
    00101$:
        clr a
        movc    a,@a+dptr
    ;   Peephole 302 removed dead mov r3, a
        inc dptr
        jnz 00101$
    ;   Peephole hoisted temporary update out of loop
        mov r4,dpl
        mov r5,dph
    ;   Peephole 500    removed redundant label 00119$
    ;   Peephole 177.c  removed redundant mov
        mov a,r4
        setb    c
        subb    a,r6
    ;   Peephole optimized decrement, merged into previous
    ;   Peephole 206    removed redundant mov dpl,dpl
    ;   Peephole 206    removed redundant mov dph,dph
    ;   Peephole 500    removed redundant label 00104$
        mov dpl,a
    ;   Peephole removed redundant move before return
        mov a,r5
        subb    a,r7
        mov dph,a
        ret
    
     

    Last edit: Tobias Diedrich 2026-03-03
    • Philipp Klaus Krause

      Which of these rules are general enough to apply to code other than your own strlen variant? Did you try them on the regression tests?

       
      • Tobias Diedrich

        Tobias Diedrich - 5 days ago

        Grepping over the regression test:

        • 273 hits for "Peephole 4242.a"
        • 120 hits for "Peephole 4242.b"
        • 288 hits for "Peephole 4242.c"
        • 0 hits for "Peephole hoisted temporary update out of loop"
        • 0 hits for "Peephole optimized decrement, merged into previous"
        • 0 hits for "Peephole removed redundant move before return"

        [edit]Some regressions found compared to unmodified baseline, need to check where the mov , dpl corruption is coming from...

        Unfortunately notSame(%1, '') is not accepted, so for now I fixed it by duplicating the rule, so the degenerate case is consumed first by the earlier more specific rule:

        replace {
            mov %1, dpl
            mov %2, dph
            mov a,  %1
            mov a%4,%2
        } by {
            mov %1, dpl
            mov %2, dph
            mov a,  dpl
            mov %4, dph
            ;   Peephole 4242.a modified moves to enable further optimizations
        }
        
        replace {
            mov %1, dpl
            mov %2, dph
            mov a%3,%1
            mov a,  %2
        } by {
            mov %1, dpl
            mov %2, dph
            mov %3, dpl
            mov a,  dph
            ;   Peephole 4242.b modified moves to enable further optimizations
        }
        
        replace {
            mov %1, dpl
            mov %2, dph
            mov a%3,%1
            mov a%4,%2
        } by {
            mov %1, dpl
            mov %2, dph
            mov %3, dpl
            mov %4, dph
            ;   Peephole 4242.c modified moves to enable further optimizations
        } if notSame(%3 %4)
        

        [/edit]

         

        Last edit: Tobias Diedrich 5 days ago
        • Tobias Diedrich

          Tobias Diedrich - 5 days ago

          550 hits for "Peephole modified moves to enable further optimizations"

          That said, sampling the hits it is often only a change with no further effect (no change in size or speed).
          But it has some successes, for example in support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm:

          $ diff -Naru build-{unmodified,subcarry-peeph}/support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm
          --- build-unmodified/support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm    2026-03-05 17:04:16.859657082 +0100
          +++ build-subcarry-peeph/support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm        2026-03-05 19:22:11.554234670 +0100
          @@ -134,12 +134,13 @@
                  ar1 = 0x01
                  ar0 = 0x00
           ;      genReceive
          
          -       mov     r6, dpl
          -       mov     r7, dph
           ;      cases/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.c:111: return ((value << SHIFT_L) | (value >> SHIFT_R)) AND_OPERATION;
           ;      genSwap
          -       mov     a,r6
          -       mov     ar6,r7
          +;      Peephole 302 removed dead mov r6, dpl
          +;      Peephole 302 removed dead mov r7, dph
          +       mov     a,  dpl
          +       mov     r6, dph
          +;      Peephole 4242.a modified moves to enable further optimizations
                  mov     r7,a
           ;      genCast
           ;      genAnd
          @@ -225,12 +226,13 @@
           ;      -----------------------------------------
           _rotate_test_2:
           ;      genReceive
          -       mov     r6, dpl
          -       mov     r7, dph
           ;      cases/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.c:130: return ((value2 << SHIFT_L) | (value2 >> SHIFT_R)) AND_OPERATION;
           ;      genSwap
          -       mov     a,r6
          -       mov     ar6,r7
          +;      Peephole 302 removed dead mov r6, dpl
          +;      Peephole 302 removed dead mov r7, dph
          +       mov     a,  dpl
          +       mov     r6, dph
          +;      Peephole 4242.a modified moves to enable further optimizations
                  mov     r7,a
           ;      genCast
           ;      genAnd
          @@ -500,12 +502,13 @@
           ;      -----------------------------------------
           _rotate_test_4:
           ;      genReceive
          -       mov     r6, dpl
          -       mov     r7, dph
           ;      cases/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.c:171: rotate_test_value = ((rotate_test_value << SHIFT_L) | (rotate_test_value >> SHIFT_R)) AND_OPERATION;
           ;      genSwap
          -       mov     a,r6
          -       mov     ar6,r7
          +;      Peephole 302 removed dead mov r6, dpl
          +;      Peephole 302 removed dead mov r7, dph
          +       mov     a,  dpl
          +       mov     r6, dph
          +;      Peephole 4242.a modified moves to enable further optimizations
                  mov     r7,a
           ;      genCast
           ;      genAnd
          
           

          Last edit: Tobias Diedrich 5 days ago
          • Tobias Diedrich

            Tobias Diedrich - 5 days ago
            $ grep "(f:" regtest-before.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
            b: 76390042 T: 10330159128
            $ grep "(f:" regtest-after.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
            b: 76389574 T: 10330144896
            

            So 468 bytes saved over all testcases?

             

            Last edit: Tobias Diedrich 5 days ago
  • Tobias Diedrich

    Tobias Diedrich - 6 days ago

    Did you try them on the regression tests?

    Not yet, will try to do that on the weekend. Any tips? Presumably all I need to run them is make -C support/regression test-mcs51? How long is the run expected to take on a gen2 ryzen?

     
  • Tobias Diedrich

    Tobias Diedrich - 3 days ago

    Of course there were some bugs, but the regression test is passing now.

    $ grep "(f:" regtest-before.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
    b: 76397798 T: 10330885752
    $ grep "(f:" regtest-after.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
    b: 76397020 T: 10330860444
    $ grep "failures" regtest-after.log 
    Summary for 'mcs51-small-stack-auto': 0 failures, 25183 tests, 6199 test cases, 11629012 bytes, 1132641312 ticks
    Summary for 'mcs51-small': 0 failures, 24574 tests, 6193 test cases, 10249507 bytes, 888038904 ticks
    Summary for 'mcs51-medium': 0 failures, 24221 tests, 6199 test cases, 11940616 bytes, 1696195944 ticks
    Summary for 'mcs51-large-stack-auto': 0 failures, 26891 tests, 6201 test cases, 13190547 bytes, 1756580028 ticks
    Summary for 'mcs51-huge': 0 failures, 26912 tests, 6204 test cases, 15596236 bytes, 2567890536 ticks
    Summary for 'mcs51-large': 0 failures, 26912 tests, 6204 test cases, 13791102 bytes, 2289513720 ticks
    

    76397798-76397020 => 778 bytes saved across (duplicated per model) test code.

    Hits for "folding post-decrement":

    tst_bug-2866.asm
    tst_gcc-torture-execute-20021204-1.asm
    

    Hits for "folding post-increment":

    tst_bug-2756.asm
    tst_bug-2931.asm
    tst_bug3403429.asm
    tst_bug3440327.asm
    tst_bug3521024.asm
    tst_bug-3547.asm
    tst_bugs-1596270-1736867.asm
    tst_gcc-torture-execute-20100430-1.asm
    tst_gcc-torture-execute-memchr-1.asm
    tst_gcc-torture-execute-strlen-2.asm
    tst_gcc-torture-execute-strlen-4.asm
    tst_structidx.asm
    

    Additionally the "increment by 0" optimization path triggers in some array access cases even without folding.

     
  • Tobias Diedrich

    Tobias Diedrich - 3 days ago

    I also have a patch for the "need pointerCode" assert, as attached.

     

Log in to post a comment.

MongoDB Logo MongoDB