Menu

#491 mcs51: fold extra +1/-1 into genPlus/genMinus

open
None
5
3 days ago
2026-03-02
No

This tries to optimize the pattern "(a + b) + 1" and "(a - b) - 1".

In the case of genMinus we get the extra "-1" for free in many
cases by flipping the carry bit.

In the case of genPlus it requires switching the first add to
already use addc instead of just add, and adding a "SETB C" operation.

Since "SETB C" is a single byte while "ADD A, #1" is two bytes,
this is still a win (except when the register tracker has found an
existing #1 immediate so that "ADD A, Rn" can be used, then it
is a draw).

In the best case (multi-byte addition) we'll save several of the
instructions the extra addition would have generated.

Further optimization would be supporting "(a + b) + bitvar" and
"(a - b) - bitvar", as well as supporting iCode variants like
"a + (b + 1)" or "1 + (a + b)".

1 Attachments

Discussion

  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    A bit more reference on what I was trying to do:

     uint16_t strlen(register __code const char *s)
     {
        __code const char *x = s;
        while (*x)
            x++;
        return x - s;
    }
    

    generates

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    00101$:
        mov dpl,r4
        mov dph,r5
        clr a
        movc    a,@a+dptr
        jz  00103$
        inc r4
        cjne    r4,#0x00,00101$
        inc r5
        sjmp    00101$
    00103$:
        mov a,r4
        clr c
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
        mov dpl,r6
        mov dph,a
        ret
    

    at first I tried modifying rtrack.c to keep track of value equality, and in genPlusIncr for a 16bit addition to use "rtrackRegEq(...)" to prefer "inc dptr/mov/mov" over "inc/cjne/inc", but this doesn't work because register tracking runs before peephole optimizations and any jumps clear the tracker, so it has its state cleared just before the genPlusIncr.

    But then there already is special handling for (*(ptr++)), so we can write:

     uint16_t strlen(register __code const char *s)
     {
        __code const char *x = s;
        while (*(x++));
        return x - s - 1;
    }
    

    but this generates a separate 16bit subtraction for the "- 1":

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    00101$:
        mov dpl,r4
        mov dph,r5
        clr a
        movc    a,@a+dptr
        mov r3,a
        inc dptr
        mov r4,dpl
        mov r5,dph
        mov a,r3
        jnz 00101$
        mov a,r4
        clr c
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
        mov r7,a
        dec r6
        cjne    r6,#0xff,00120$
        dec r7
    00120$:
        mov dpl,r6
        mov dph,r7
        ret
    

    with the proposed change this becomes the more optimized:

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    00101$:
        mov dpl,r4
        mov dph,r5
        clr a
        movc    a,@a+dptr
        mov r3,a
        inc dptr
        mov r4,dpl
        mov r5,dph
        mov a,r3
        jnz 00101$
        mov a,r4
        setb    c
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
        mov dpl,r6
        mov dph,a
        ret
    

    I should try if this can be handled by a peephole rule instead, speaking of which it would be nice if the temporary to dptr and dptr to temporary copies could be hoisted out of the loop...

     
  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    I should try if this can be handled by a peephole rule instead, speaking of which it would be nice if the temporary to dptr and dptr to temporary copies could be hoisted out of the loop...

    And yes, both can with --peep-file containing:

    replace {
        clr c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        dec %1
        cjne    %1,#0xff,%4
        dec %3
    %4:
    } by {
        setb    c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        ;   Peephole optimized decrement, merged into previous
    %4:
    }
    
    replace {
    %1:
        mov dpl,%2
        mov dph,%3
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        mov %2,dpl
        mov %3,dph
        mov a,%4
        jnz %1
    } by {
        mov dpl,%2
        mov dph,%3
    %1:
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        jnz %1
        ;   Peephole hoisted temporary update out of loop
        mov %2,dpl
        mov %3,dph
        mov a,%4
    } if labelRefCount(%1 1)
    

    the generated code becomes:

    _strlen:
        mov r6, dpl
        mov r7, dph
        mov ar4,r6
        mov ar5,r7
    ;   Peephole 108.b  removed ljmp by inverse jump logic
        mov dpl,r4 ;  <----
        mov dph,r5 ;  <----
    00101$:
        clr a
        movc    a,@a+dptr
    ;   Peephole 302 removed dead mov r3, a
        inc dptr
        jnz 00101$
    ;   Peephole hoisted temporary update out of loop <----
        mov r4,dpl ;  <----
        mov r5,dph ;  <----
    ;   Peephole 500    removed redundant label 00119$
    ;   Peephole 177.c  removed redundant mov
        mov a,r4
        setb    c  ; <----
        subb    a,r6
        mov r6,a
        mov a,r5
        subb    a,r7
    ;   Peephole optimized decrement, merged into previous <----
    00120$:
        mov dpl,r6
        mov dph,r7
    ;   Peephole 206    removed redundant mov dpl,dpl
    ;   Peephole 206    removed redundant mov dph,dph
    ;   Peephole 500    removed redundant label 00104$
        ret
    
     

    Last edit: Tobias Diedrich 2026-03-03
  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    On the note of peephole rules, I think rule 131 is unsafe?

    replace {
        mov a,%1
        subb    a,#0x01
        mov %2,a
        mov %1,%2
    } by {
        ;   Peephole 131    optimized decrement (not caring for c)
        dec %1
        mov %2,%1
    }
    

    AFAICS these are only identical if carry is unset at the beginning, and differ when carry is set.
    Maybe this was intended to be:

    replace {
        clr C
        mov a,%1
        subb    a,#0x01
        mov %2,a
        mov %1,%2
    } by {
        ;   Peephole 131    optimized decrement (not caring for c)
        dec %1
        mov %2,%1
    }
    
     
    • Philipp Klaus Krause

      Do you have a C code sample that can reproduce incorrect optimizations with this rule?

       
      • Tobias Diedrich

        Tobias Diedrich - 6 days ago

        Do you have a C code sample that can reproduce incorrect optimizations with this rule?

        No, I just randomly noticed this while looking for existing add/sub-related peephole rules. I had a quick look at how old this rule is and it's been there for over 20 years, so it's probably safe. Possibly the compiler just never generates instruction sequences that would trigger the false-positive case.

        [edit]Rule 131 goes back all the way to https://sourceforge.net/p/sdcc/code/3/[/edit]

         
        👍
        1

        Last edit: Tobias Diedrich 5 days ago
        • Tobias Diedrich

          Tobias Diedrich - 5 days ago

          Looking through mcs51/gen.c I don't think this pattern is generated anymore.
          Grepping through the regression test results I don't see a single hit on "Peephole 131" either, so this is probably an unused rule nowadays...

           
          👍
          1
  • Tobias Diedrich

    Tobias Diedrich - 2026-03-03

    Optimized strlen peephole rules:

     uint16_t strlen(register __code const char *s)
     {
        __code const char *x = s;
        while (*(x++));
        return x - s - 1;
    }
    
    // optimized strlen
    replace {
        clr c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        dec %1
        cjne    %1,#0xff,%4
        dec %3
    %4:
    } by {
        setb    c
        subb    a,%1
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        ;   Peephole optimized decrement, merged into previous
    } if labelRefCount(%4 1)
    
    replace restart {
    %1:
        mov dpl,%2
        mov dph,%3
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        mov %2,dpl
        mov %3,dph
        mov a,%4
        jnz %1
    } by {
        mov dpl,%2
        mov dph,%3
    %1:
        clr a
        movc    a,@a+dptr
        mov %4,a
        inc dptr
        jnz %1
        ;   Peephole hoisted temporary update out of loop
        mov %2,dpl
        mov %3,dph
        mov a,%4
    } if labelRefCount(%1 1)
    
    replace restart {
    %1:
        mov dpl,%2
        mov dph,%3
        movx    a,@dptr
        mov %4,a
        inc dptr
        mov %2,dpl
        mov %3,dph
        mov a,%4
        jnz %1
    } by {
        mov dpl,%2
        mov dph,%3
    %1:
        movx    a,@dptr
        mov %4,a
        inc dptr
        jnz %1
        ;   Peephole hoisted temporary update out of loop
        mov %2,dpl
        mov %3,dph
        mov a,%4
    } if labelRefCount(%1 1)
    
    replace {
        mov %1, dpl
        mov %2, dph
        mov a%3,%1
        mov a%4,%2
    } by {
        mov %1, dpl
        mov %2, dph
        mov %3, dpl
        mov %4, dph
        ;   Peephole modified moves to enable further optimizations
    }
    
    replace {
        mov %1,a
        mov a,%2
        subb    a,%3
        mov %3,a
        mov dpl,%1
        mov dph,%3
        ret
    } by {
        mov dpl,a
        ;   Peephole removed redundant move before return
        mov a,%2
        subb    a,%3
        mov dph,a
        ret
    }
    

    generates:

    _strlen:
        mov r6, dpl
        mov r7, dph
    ;   Peephole modified moves to enable further optimizations
    ;   Peephole 108.b  removed ljmp by inverse jump logic
    ;   Peephole 302 removed dead mov r4, dpl
    ;   Peephole 177.d  removed redundant mov
    ;   Peephole 302 removed dead mov r5, dph
    ;   Peephole 177.a  removed redundant mov
    00101$:
        clr a
        movc    a,@a+dptr
    ;   Peephole 302 removed dead mov r3, a
        inc dptr
        jnz 00101$
    ;   Peephole hoisted temporary update out of loop
        mov r4,dpl
        mov r5,dph
    ;   Peephole 500    removed redundant label 00119$
    ;   Peephole 177.c  removed redundant mov
        mov a,r4
        setb    c
        subb    a,r6
    ;   Peephole optimized decrement, merged into previous
    ;   Peephole 206    removed redundant mov dpl,dpl
    ;   Peephole 206    removed redundant mov dph,dph
    ;   Peephole 500    removed redundant label 00104$
        mov dpl,a
    ;   Peephole removed redundant move before return
        mov a,r5
        subb    a,r7
        mov dph,a
        ret
    
     

    Last edit: Tobias Diedrich 2026-03-03
    • Philipp Klaus Krause

      Which of these rules are general enough to apply to code other than your own strlen variant? Did you try them on the regression tests?

       
      • Tobias Diedrich

        Tobias Diedrich - 5 days ago

        Grepping over the regression test:

        • 273 hits for "Peephole 4242.a"
        • 120 hits for "Peephole 4242.b"
        • 288 hits for "Peephole 4242.c"
        • 0 hits for "Peephole hoisted temporary update out of loop"
        • 0 hits for "Peephole optimized decrement, merged into previous"
        • 0 hits for "Peephole removed redundant move before return"

        [edit]Some regressions found compared to unmodified baseline, need to check where the mov , dpl corruption is coming from...

        Unfortunately notSame(%1, '') is not accepted, so for now I fixed it by duplicating the rule, so the degenerate case is consumed first by the earlier more specific rule:

        replace {
            mov %1, dpl
            mov %2, dph
            mov a,  %1
            mov a%4,%2
        } by {
            mov %1, dpl
            mov %2, dph
            mov a,  dpl
            mov %4, dph
            ;   Peephole 4242.a modified moves to enable further optimizations
        }
        
        replace {
            mov %1, dpl
            mov %2, dph
            mov a%3,%1
            mov a,  %2
        } by {
            mov %1, dpl
            mov %2, dph
            mov %3, dpl
            mov a,  dph
            ;   Peephole 4242.b modified moves to enable further optimizations
        }
        
        replace {
            mov %1, dpl
            mov %2, dph
            mov a%3,%1
            mov a%4,%2
        } by {
            mov %1, dpl
            mov %2, dph
            mov %3, dpl
            mov %4, dph
            ;   Peephole 4242.c modified moves to enable further optimizations
        } if notSame(%3 %4)
        

        [/edit]

         

        Last edit: Tobias Diedrich 5 days ago
        • Tobias Diedrich

          Tobias Diedrich - 5 days ago

          550 hits for "Peephole modified moves to enable further optimizations"

          That said, sampling the hits it is often only a change with no further effect (no change in size or speed).
          But it has some successes, for example in support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm:

          $ diff -Naru build-{unmodified,subcarry-peeph}/support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm
          --- build-unmodified/support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm    2026-03-05 17:04:16.859657082 +0100
          +++ build-subcarry-peeph/support/regression/gen/mcs51-medium/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.asm        2026-03-05 19:22:11.554234670 +0100
          @@ -134,12 +134,13 @@
                  ar1 = 0x01
                  ar0 = 0x00
           ;      genReceive
          
          -       mov     r6, dpl
          -       mov     r7, dph
           ;      cases/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.c:111: return ((value << SHIFT_L) | (value >> SHIFT_R)) AND_OPERATION;
           ;      genSwap
          -       mov     a,r6
          -       mov     ar6,r7
          +;      Peephole 302 removed dead mov r6, dpl
          +;      Peephole 302 removed dead mov r7, dph
          +       mov     a,  dpl
          +       mov     r6, dph
          +;      Peephole 4242.a modified moves to enable further optimizations
                  mov     r7,a
           ;      genCast
           ;      genAnd
          @@ -225,12 +226,13 @@
           ;      -----------------------------------------
           _rotate_test_2:
           ;      genReceive
          -       mov     r6, dpl
          -       mov     r7, dph
           ;      cases/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.c:130: return ((value2 << SHIFT_L) | (value2 >> SHIFT_R)) AND_OPERATION;
           ;      genSwap
          -       mov     a,r6
          -       mov     ar6,r7
          +;      Peephole 302 removed dead mov r6, dpl
          +;      Peephole 302 removed dead mov r7, dph
          +       mov     a,  dpl
          +       mov     r6, dph
          +;      Peephole 4242.a modified moves to enable further optimizations
                  mov     r7,a
           ;      genCast
           ;      genAnd
          @@ -500,12 +502,13 @@
           ;      -----------------------------------------
           _rotate_test_4:
           ;      genReceive
          -       mov     r6, dpl
          -       mov     r7, dph
           ;      cases/rotate2/rotate2_size_16_andCase_1_xorLiteral_1_rotateLeft_0_structVar_0.c:171: rotate_test_value = ((rotate_test_value << SHIFT_L) | (rotate_test_value >> SHIFT_R)) AND_OPERATION;
           ;      genSwap
          -       mov     a,r6
          -       mov     ar6,r7
          +;      Peephole 302 removed dead mov r6, dpl
          +;      Peephole 302 removed dead mov r7, dph
          +       mov     a,  dpl
          +       mov     r6, dph
          +;      Peephole 4242.a modified moves to enable further optimizations
                  mov     r7,a
           ;      genCast
           ;      genAnd
          
           

          Last edit: Tobias Diedrich 5 days ago
          • Tobias Diedrich

            Tobias Diedrich - 5 days ago
            $ grep "(f:" regtest-before.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
            b: 76390042 T: 10330159128
            $ grep "(f:" regtest-after.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
            b: 76389574 T: 10330144896
            

            So 468 bytes saved over all testcases?

             

            Last edit: Tobias Diedrich 5 days ago
  • Tobias Diedrich

    Tobias Diedrich - 6 days ago

    Did you try them on the regression tests?

    Not yet, will try to do that on the weekend. Any tips? Presumably all I need to run them is make -C support/regression test-mcs51? How long is the run expected to take on a gen2 ryzen?

     
  • Tobias Diedrich

    Tobias Diedrich - 3 days ago

    Of course there were some bugs, but the regression test is passing now.

    $ grep "(f:" regtest-before.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
    b: 76397798 T: 10330885752
    $ grep "(f:" regtest-after.log | sed -e 's@.*b:\([^,]*\), T:\([^)]*\))@b: \1 T: \2@' | awk 'BEGIN { b = 0; t = 0 } { b += $2; t += $4 } END { print "b:", b, "T:", t }'
    b: 76397020 T: 10330860444
    $ grep "failures" regtest-after.log 
    Summary for 'mcs51-small-stack-auto': 0 failures, 25183 tests, 6199 test cases, 11629012 bytes, 1132641312 ticks
    Summary for 'mcs51-small': 0 failures, 24574 tests, 6193 test cases, 10249507 bytes, 888038904 ticks
    Summary for 'mcs51-medium': 0 failures, 24221 tests, 6199 test cases, 11940616 bytes, 1696195944 ticks
    Summary for 'mcs51-large-stack-auto': 0 failures, 26891 tests, 6201 test cases, 13190547 bytes, 1756580028 ticks
    Summary for 'mcs51-huge': 0 failures, 26912 tests, 6204 test cases, 15596236 bytes, 2567890536 ticks
    Summary for 'mcs51-large': 0 failures, 26912 tests, 6204 test cases, 13791102 bytes, 2289513720 ticks
    

    76397798-76397020 => 778 bytes saved across (duplicated per model) test code.

    Hits for "folding post-decrement":

    tst_bug-2866.asm
    tst_gcc-torture-execute-20021204-1.asm
    

    Hits for "folding post-increment":

    tst_bug-2756.asm
    tst_bug-2931.asm
    tst_bug3403429.asm
    tst_bug3440327.asm
    tst_bug3521024.asm
    tst_bug-3547.asm
    tst_bugs-1596270-1736867.asm
    tst_gcc-torture-execute-20100430-1.asm
    tst_gcc-torture-execute-memchr-1.asm
    tst_gcc-torture-execute-strlen-2.asm
    tst_gcc-torture-execute-strlen-4.asm
    tst_structidx.asm
    

    Additionally the "increment by 0" optimization path triggers in some array access cases even without folding.

     
  • Tobias Diedrich

    Tobias Diedrich - 3 days ago

    I also have a patch for the "need pointerCode" assert, as attached.

     

Log in to post a comment.

MongoDB Logo MongoDB