Menu

#3221 not-optimal asm output for single bit operations

open
nobody
STM8
5
2021-05-25
2021-04-30
No

SDCC some times produce not-optimal code for single bit operations

Example

#include <stdint.h>

#define PORTB_ODR (*((uint8_t*) 0x5005))

// sdcc -mstm8 --std-c99 main.c
void main() {
   // you should uncomment only ONE below
   // line to produce correct optimal assembler output
   // PORTB_ODR ^= 0x01;  // -> bcpl
   // PORTB_ODR &= ~0x01; // -> bres
   // PORTB_ODR |= 0x01;  // -> bset
}

but if several bit operations goes sequentially sdcc produce next

#include <stdint.h>

#define PORTB_ODR (*((uint8_t*) 0x5005))

// sdcc -mstm8 --std-c99 main.c
void main() {
    PORTB_ODR |= 0x01;
    PORTB_ODR &= ~0x01;
}
_main:
;   main.c: 6: PORTB_ODR |= 0x01;
    ld  a, 0x5005
    or  a, #0x01
;   main.c: 7: PORTB_ODR &= ~0x01;
    ld  0x5005, a
    and a, #0xfe
    ld  0x5005, a
;   main.c: 8: }
    ret

and

#include <stdint.h>

#define PORTB_ODR (*((uint8_t*) 0x5005))

// sdcc -mstm8 --std-c99 main.c
void main() {
    PORTB_ODR ^= 0x01;
    PORTB_ODR |= 0x02;
}
_main:
;   main.c: 6: PORTB_ODR ^= 0x01;
    ld  a, 0x5005
    xor a, #0x01
;   main.c: 7: PORTB_ODR |= 0x02;
    ld  0x5005, a
    or  a, #0x02
    ld  0x5005, a
;   main.c: 8: }
    ret

as you can see single-bit operations now is not atomiс and not optimal

Options --opt-code-size / --opt-code-speed have no any effects for current issue.

SDCC : mcs51/z80/z180/r2k/r2ka/r3ka/gbz80/tlcs90/ez80_z80/z80n/ds390/pic16/pic14/TININative/ds400/hc08/s08/stm8/pdk13/pdk14/pdk15 4.1.0 #12072 (Linux)
published under GNU General Public License (GPL)

Discussion

  • Visenri

    Visenri - 2021-05-05

    This should fix your problem:

    #define PORTB_ODR (*((volatile uint8_t*) 0x5005))
    

    The Keyword volatile forces the compiler to read and write the same variable at each statement (Any mcu register or interrupt shared variable should be qualified as volatile).

    Resulting in:

    ld  a, 0x5005
    xor a, #0x01
    ld  0x5005, a
    

    Later the optimizer converts it to:

    bcpl 20485, #0
    

    Equivalent to:

    bcpl 0x5005, #0
    
     
  • Oleksandr Dorogyh

    thanks a lot !!!

     
  • Oleksandr Dorogyh

    I found case when volatile does not work

        #include "stm8s.h"
        #include "stm8s_it.h"
    
        #define PB5 0b00100000
        #define LED_PIN_PB5 PB5
    
        #define FAST_PERIOD 40
        #define SLOW_PERIOD (FAST_PERIOD >> 1)
    
        volatile uint16_t subCnt = FAST_PERIOD;
        volatile uint8_t mode = 0;
    
        //#undef GPIOB
        //#define GPIOB ((volatile GPIO_TypeDef *)GPIOB_BaseAddress)
    
        void main()
        {
           // should be converted to bcpl
            GPIOB->ODR ^= LED_PIN_PB5;
    
            while (1)
            {
                CFG->GCR = CFG_GCR_AL;
                wfi();
                // TIM4->ARR = 0XFF; // with set AL bit this code is unreachable
    
                disableInterrupts();
    
               // should be converted to bcpl
                GPIOB->ODR ^= LED_PIN_PB5; // LED pin toogle
    
                if (mode == 0)
                {
                    TIM4->ARR = 100;
                    mode = 1;
                    subCnt = SLOW_PERIOD;
                }
                else
                {
                    TIM4->ARR = 1;
                    mode = 0;
                    subCnt = FAST_PERIOD;
                }
                enableInterrupts();
            }
        }
    

    ODR register declared as volatile

    #define     __I     volatile const   /*!< defines 'read only' permissions     */
    #define     __O     volatile         /*!< defines 'write only' permissions    */
    #define     __IO    volatile         /*!< defines 'read / write' permissions  */
    
    typedef struct GPIO_struct
    {
      __IO uint8_t ODR; /*!< Output Data Register */
      __IO uint8_t IDR; /*!< Input Data Register */
      __IO uint8_t DDR; /*!< Data Direction Register */
      __IO uint8_t CR1; /*!< Configuration Register 1 */
      __IO uint8_t CR2; /*!< Configuration Register 2 */
    }
    GPIO_TypeDef;
    
    _main:
    ; **********************************************
    ; ************* Expected bcpl ******************
    ; **********************************************
    ;   src\main.c: 19: GPIOB->ODR ^= LED_PIN_PB5;
        ld  a, 0x5005
        clrw    x
        xor a, #0x20
        ld  0x5005, a
    ;   src\main.c: 21: while (1)
    00105$:
    ;   src\main.c: 23: CFG->GCR = CFG_GCR_AL;
        mov 0x7f60+0, #0x02
    ;   src\main.c: 24: wfi();
        wfi
    ;   src\main.c: 27: disableInterrupts();
        sim
    ; **********************************************
    ; ************* Expected bcpl ******************
    ; **********************************************
    ;   src\main.c: 30: GPIOB->ODR ^= LED_PIN_PB5;
        ld  a, 0x5005
        xor a, #0x20
        ld  0x5005, a
    ;   src\main.c: 32: if (mode == 0)
        tnz _mode+0
        jrne    00102$
    ;   src\main.c: 34: TIM4->ARR = 100;
        mov 0x5348+0, #0x64
    ;   src\main.c: 35: mode = 1;
        mov _mode+0, #0x01
    ;   src\main.c: 36: subCnt = SLOW_PERIOD;
        ldw x, #0x0014
        ldw _subCnt+0, x
        jra 00103$
    00102$:
    ;   src\main.c: 40: TIM4->ARR = 1;
        mov 0x5348+0, #0x01
    ;   src\main.c: 41: mode = 0;
        clr _mode+0
    ;   src\main.c: 42: subCnt = FAST_PERIOD;
        ldw x, #0x0028
        ldw _subCnt+0, x
    00103$:
    ;   src\main.c: 44: enableInterrupts();
        rim
        jra 00105$
    ;   src\main.c: 46: }
        ret
    
     

    Last edit: Oleksandr Dorogyh 2021-05-06
    • Philipp Klaus Krause

      Without having compiled your code:
      AFAIK the use of bit instructions is done by the peephole optimizer. The peephole optimizer will only do the optimization if a is not used subsequently. enableInterrupts() is implemented as a macro using inline asm. When the peephole optimizer sees inline asm it decides to err on the safe side and assume that no optimization is possible.
      We can already see this in a minimal example:

       void f(void)
      {
          {__asm__("sim\n");}
          *((volatile unsigned char *)3) ^= 2;
          {__asm__("rim\n");}
      }
      
      void g(void)
      {
          *((volatile unsigned char *)3) ^= 2;
      }
      

      As a workaround, you could use a critical section for disabling interrupts:

      void f(void)
      {
          __critical {
              *((volatile unsigned char *)3) ^= 2;
          }
      }
      

      But I suspect that is SDCC-specific code that won't compile on Raisonance / IAR / Cosmic.

       

      Last edit: Maarten Brock 2021-05-22
      • Arseny Vakhrushev

        Well, the following code shows that the optimizer is somewhat "picky" about such optimizations:

        #include <stdint.h>
        
        #define ADDR (*(volatile uint8_t *)(0x5555))
        
        void main(void) {
            ADDR |= 0x01;
            ADDR |= 0x02;
            ADDR |= 0x04;
            __asm__("wfi");
        }
        
              000000                         89 _main:
                                             90 ;   main.c: 6: ADDR |= 0x01;
              000000 72 10 55 55      [ 1]   91     bset    21845, #0
                                             92 ;   main.c: 7: ADDR |= 0x02;
              000004 72 12 55 55      [ 1]   93     bset    21845, #1
                                             94 ;   main.c: 8: ADDR |= 0x04;
              000008 C6 55 55         [ 1]   95     ld  a, 0x5555
              00000B AA 04            [ 1]   96     or  a, #0x04
              00000D C7 55 55         [ 1]   97     ld  0x5555, a
                                             98 ;   main.c: 9: __asm__("wfi");
              000010 8F               [10]   99     wfi
                                            100 ;   main.c: 10: }
              000011 81               [ 4]  101     ret
        

        Three equal statements. Yet the optimizer correctly optimized only the first two, but failed the third. Weird... :-(

        Removing __asm__ makes all three be optimized fine, of course.

         

        Last edit: Arseny Vakhrushev 2021-05-24
        • Philipp Klaus Krause

          Code generation generates something like

          ld  a, 0x5555
          or  a, #0x04
          ld  0x5555, a
          

          for each of them. Then the peephole optimizer sees that for the first two, the value in a is overwritten later, and optimizes it into bset. For the third, it sees that there is inline asm, and decides to not do anything. Without the inline asm, it arrives at ret, checks that the function returns void, so the value in a is not used, and also optimizes.

           
          • Arseny Vakhrushev

            The peephole optimizer is an interesting beast. The workaround to the case above is this:

            #include <stdint.h>
            
            #define ADDR (*(volatile uint8_t *)(0x5555))
            
            static void wfi() {
                __asm__("wfi");
            }
            
            void main(void) {
                ADDR |= 0x01;
                ADDR |= 0x02;
                ADDR |= 0x04;
                for (;;) wfi();
            }
            

            If wfi() is inlined, no optimization for the third line. So it looks like the peephole optmizer sees the code after compiler optimizations (inlining, returns, etc.) and is akin of a "last wipe of dust". I guess I wish the compiler would work better at optimizing bit operations one day. :-) Anyways, thanks for the SDCC! It's a beautiful tool!

             
            • Philipp Klaus Krause

              Yes, the peephole optimizer is the last stage before asm output. To see what it does, options --no-peep and --fverbose-asm can be helpful.
              The use of __sfr __at can be a workaround for the case discussed here, as then code generation might already generate the bit instructions.
              But it is clearly less elegant and not portable.

               
              • Oleksandr Dorogyh

                by specification for STM8
                sometime reading whole register leads to reset some internal state
                and code doesnt work as expected at all.

                Because of the C part does not generate predictive code for single-bit instruction I should write some parts on assembler and this yet more not elegant and not portable.

                For example

                while( a & 0x20) {} // in different variants dont produce BTJF/BTJT instruction
                
                 
              • Arseny Vakhrushev

                I believe __sfr and __at are not available for STM8.

                 
  • Arseny Vakhrushev

    I'm observing a similar behaviour:

    #define TIM2_CCER1 (*(volatile uint8_t *)(0x530a))
    

    whereas:

    ;   /home/seny/devel/rcmixer/src/main.c: 197: TIM2_CCER1 = 0x11;
        mov 0x530a+0, #0x11
    ;   /home/seny/devel/rcmixer/src/main.c: 199: TIM2_CCER1 |= 0x22;
        ld  a, 0x530a
        or  a, #0x22
        ld  0x530a, a
    
     
    • Philipp Klaus Krause

      Can you provide a minimal compileable example? Which compiler version do you use with which options?

       
      • Arseny Vakhrushev

        Well... sure! Please take a look:

        #include <stdint.h>
        
        #define TIM2_CCER1 (*(volatile uint8_t *)0x530a)
        
        void main(void) {
            TIM2_CCER1 = 0x11;
            TIM2_CCER1 |= 0x22;
        }
        
        $ sdcc -mstm8 test.c
        $ cat test.lst | tail -n16
                                             86 ;   -----------------------------------------
                                             87 ;    function main
                                             88 ;   -----------------------------------------
              000000                         89 _main:
                                             90 ;   test.c: 6: TIM2_CCER1 = 0x11;
              000000 35 11 53 0A      [ 1]   91     mov 0x530a+0, #0x11
                                             92 ;   test.c: 7: TIM2_CCER1 |= 0x22;
              000004 C6 53 0A         [ 1]   93     ld  a, 0x530a
              000007 AA 22            [ 1]   94     or  a, #0x22
              000009 C7 53 0A         [ 1]   95     ld  0x530a, a
                                             96 ;   test.c: 8: }
              00000C 81               [ 4]   97     ret
                                             98     .area CODE
                                             99     .area CONST
                                            100     .area INITIALIZER
                                            101     .area CABS (ABS)
        
         
      • Arseny Vakhrushev

        Phillip, please disregard my input! In my case, it's not a single bit. Sorry.

         
  • Oleksandr Dorogyh

    Arseny, you have not a single-bit operation, using OR in your case is correct

    0x22 = 0b00100010 - two bits

     
    • Arseny Vakhrushev

      Oh, right! My bad. Please disregard my five cents.

       

Log in to post a comment.

MongoDB Logo MongoDB