Menu

#3727 Incorrect loop optimization (probably in GCSE)

open
nobody
None
redundancy elimination
5
2024-04-16
2024-04-12
No

The compiler assumes that global variables cannot be modified by functions, thus "optimise" the code resolving tests and expressions on the basis of the value assigned before calling the function.

In this code, n0 is global and can be modified by AddSat1()

    {
        n1=0;

        MyObj = &MyObject[0];
        for (ii=0;ii<MaxNumObj;ii++)
        {
            if (MyObj->Status != 0) 
            {
                for (jj=0;jj<MyObj->NumSprt;jj++) 
                {
                    y = MyObj->y + MyObj->dy[jj];
                    x = MyObj->x + MyObj->dx[jj];
                    p = MyObj->pat[jj];
                    c = MyObj->col[jj];

                    AddSat1();          // SAT1
                }
            }
            MyObj++;                    
        }
        if (n1<32) {
            VDP_Poke_16K(216, SatAddr1+4*n1);
        }
    }

The compiler assume that n1 stays 0 when evaluating the last IF (n1<32) and generates this ASM

_SatUpdate::
    push    ix
    ld  ix,#0
    add ix,sp
    dec sp
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:130: if (Sprt_flip == 1) 
    ld  a, (_Sprt_flip+0)
    dec a
    jp  NZ,00114$
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:132: n1=0;
    ld  hl, #_n1
    ld  (hl), #0x00
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:134: MyObj = &MyObject[0];
    ld  iy, #_MyObj
    ld  0 (iy), #<(_MyObject)
    ld  1 (iy), #>(_MyObject)
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:135: for (ii=0;ii<MaxNumObj;ii++)
    ld  hl, #_ii
    ld  (hl), #0x00
00119$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:137: if (MyObj->Status != 0) 
    ld  hl, (_MyObj)
    ld  a, (hl)
    or  a, a
    jr  Z, 00103$
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:139: for (jj=0;jj<MyObj->NumSprt;jj++) 
    ld  hl, #_jj
    ld  (hl), #0x00
00117$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:137: if (MyObj->Status != 0) 
    ld  bc, (_MyObj)
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:139: for (jj=0;jj<MyObj->NumSprt;jj++) 
    ld  e, c
    ld  d, b
    ld  hl, #7
    add hl, de
    ld  e, (hl)
    ld  a, (_jj+0)
    sub a, e
    jr  NC, 00103$
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:141: y = MyObj->y + MyObj->dy[jj];
    ld  l, c
;   spillPairReg hl
;   spillPairReg hl
    ld  h, b
;   spillPairReg hl
;   spillPairReg hl
    inc hl
    inc hl
    inc hl
    ld  e, (hl)
    inc hl
    ld  d, (hl)
    ld  hl, #0x0014
    add hl, bc
    ld  a, (_jj+0)
    ld  -1 (ix), a
    ld  a, l
    add a, -1 (ix)
    ld  l, a
;   spillPairReg hl
;   spillPairReg hl
    jr  NC, 00209$
    inc h
00209$:
    ld  l, (hl)
;   spillPairReg hl
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    add hl, de
    ld  (_y), hl
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:142: x = MyObj->x + MyObj->dx[jj];
    ld  l, c
;   spillPairReg hl
;   spillPairReg hl
    ld  h, b
;   spillPairReg hl
;   spillPairReg hl
    inc hl
    ld  e, (hl)
    inc hl
    ld  d, (hl)
    ld  hl, #0x0008
    add hl, bc
    ld  a, l
    add a, -1 (ix)
    ld  l, a
;   spillPairReg hl
;   spillPairReg hl
    jr  NC, 00210$
    inc h
00210$:
    ld  l, (hl)
;   spillPairReg hl
    ld  h, #0x00
;   spillPairReg hl
;   spillPairReg hl
    add hl, de
    ld  (_x), hl
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:143: p = MyObj->pat[jj];
    ld  hl, #0x0020
    add hl, bc
    ld  e, -1 (ix)
    ld  d, #0x00
    add hl, de
    ld  a, (hl)
    ld  (_p+0), a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:144: c = MyObj->col[jj];
    ld  hl, #0x002c
    add hl, bc
    ld  e, -1 (ix)
    ld  d, #0x00
    add hl, de
    ld  a, (hl)
    ld  (_c+0), a
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:146: AddSat1();         // SAT1
    call    _AddSat1
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:139: for (jj=0;jj<MyObj->NumSprt;jj++) 
    ld  hl, #_jj
    inc (hl)
    jp  00117$
00103$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:149: MyObj++;                   
    ld  hl, #_MyObj
    ld  a, (hl)
    add a, #0x38
    ld  (hl), a
    jr  NC, 00211$
    inc hl
    inc (hl)
00211$:
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:135: for (ii=0;ii<MaxNumObj;ii++)
    ld  hl, #_ii
    inc (hl)
    ld  a, (_ii+0)
    sub a, #0x10
    jp  C, 00119$
;./after_burner\AFTER_BURNER_SPRITE_SPLIT.C:152: VDP_Poke_16K(216, SatAddr1+4*n1);
    ld  de, #0x1e00
    ld  a, #0xd8
    call    _VDP_Poke_16K
    jp  00126$
00114$:

As you can see n1 has been evaluated a 0 both in the comparison and in the expression 4*n1

Related

Feature Requests: #905

Discussion

  • Philipp Klaus Krause

    On which version of SDCC did you see the bug (sdcc --version)? On which host OS? Which command line options were used to compile?

    Can you provide a compileable code sample to reproduce the issue?

    Even if not, maybe there are still a few things to try (also helpful if you can provide code, but with code I could try them myself):

    • Post the relevant asm part when compiled with --fverbose-asm and --fverbose-asm --no-peep.
    • Check lif any of the following options is a workaround: --no-peep, --nolospre, --nogenconstprop, --noinduction, --nogcse.
     
    • Ragozini Arturo

      Ragozini Arturo - 2024-04-12

      The workaround is to remove the function and inject its code directly, in order to make SDCC see that it is modifying n1

       
  • Ragozini Arturo

    Ragozini Arturo - 2024-04-12

    sdcc --version
    SDCC : mcs51/z80/z180/r2k/r2ka/r3ka/sm83/tlcs90/ez80_z80/z80n/r800/ds390/pic16/pic14/TININative/ds400/hc08/s08/stm8/pdk13/pdk14/pdk15/mos6502/mos65c02 TD- 4.4.0 #14620 (MINGW64)
    published under GNU General Public License (GPL)

    I'm using Windows 11

    This should be the command line (I'm using a script from msxgl)
    Compiling ./template.c using SDCC C compiler...
    Execute: C:\Users\PC\OneDrive\Documenti\MSXgl-1.0.0\tools/sdcc/bin/sdcc -c -mz80 -DTARGET=TARGET_ROM_ASCII16_4096K -DMSX_VERSION=MSX_TR -IC:\Users\PC\OneDrive\Documenti\MSXgl-1.0.0\projects\template_msx2/ -IC:\Users\PC\OneDrive\Documenti\MSXgl-1.0.0\engine/src -IC:\Users\PC\OneDrive\Documenti\MSXgl-1.0.0\engine/content -IC:\Users\PC\OneDrive\Documenti\MSXgl-1.0.0\tools/ --opt-code-speed -DAPPSIGN ./template.c -o C:\Users\PC\OneDrive\Documenti\MSXgl-1.0.0\projects\template_msx2/out/
    Exit code: 0

     
  • Ragozini Arturo

    Ragozini Arturo - 2024-04-12

    This is the C file that includes the file where the bug happens

     

    Last edit: Ragozini Arturo 2024-04-12
  • Ragozini Arturo

    Ragozini Arturo - 2024-04-12

    Look at the void SatUpdate(void) in this file
    The "broken" n1 is just after the call the the function FAKE()

     
  • Philipp Klaus Krause

    • Attachments has changed:

    Diff:

    --- old
    +++ new
    @@ -0,0 +1 @@
    +bug-3727.c (2.1 kB; text/x-csrc)
    
    • Category: Z80 --> redundancy elimination
     
  • Philipp Klaus Krause

    I was able to reproduce the issue using SDCC 4.4.1 #14789 on Debian GNU/Linux on amd64.

    From your code, I created the attached regression test. It looks like all SDCC ports are affected (I tried z80, stm8, mcs51). --nogcse is a workaround.

    P.S.: An improved version of the attached test is in [r14790] (disabled until someone fixes the bug).

     

    Related

    Commit: [r14790]


    Last edit: Philipp Klaus Krause 2024-04-12
  • Philipp Klaus Krause

    • Attachments has changed:

    Diff:

    --- old
    +++ new
    @@ -1 +0,0 @@
    -bug-3727.c (2.1 kB; text/x-csrc)
    
     
    • Janko Stamenović

      Playing with that, I get the minimal code (using just u8) which still passes 0 to the minimal VDP_Poke_16K this:

      typedef unsigned char u8;
      
      void VDP_Poke_16K( u8 dest );
      
      int bad = 1;
      
      u8 n1;
      
      void fake (void)
      {
          n1 = 32;
      }
      
      void SatUpdate(void)
      {
          n1=0;
      
          if ( bad )
          {
              for (u8 j=0; j<1; j++)
              {
                  fake();
              }
          }
          VDP_Poke_16K( 2*n1 );
      }
      
      void testBug(void)
      {
          SatUpdate();
      }
      

      Clearly passes 0 to _VDP_Poke_16K:

      _SatUpdate::
          ld  hl, #_n1
          ld  (hl), #0x00
          ld  a, (_bad+1)
          ld  iy, #_bad
          or  a, 0 (iy)
          jr  Z, 00103$
          xor a, a
      00105$:
          sub a, #0x01
          jr  NC, 00103$
          call    _fake
          ld  a, #0x01
          jp  00105$
      00103$:
          xor a, a   ; <====  passes 0 here
          jp  _VDP_Poke_16K
      

      The call to fake() has to be in a block inside of a block, and one of the these blocks has to be a for loop: I can't get if inside of an if to generate a problem, and removing the if ( bad ) line while keeping the for loop would also produce the correct code.

       
      • Philipp Klaus Krause

        Thanks. That simplification is now in [r14795].

         

        Related

        Commit: [r14795]

        • Janko Stamenović

          Welcome, I'm glad when it helps.

          And, unsurprisingly, what I immediately see as the additional improvement point in that generated code: IY is used there "for no reason" IMO, even if these IX and IY registers should always be the last to be used as they produce both bigger and slower code (worse in both dimensions, so clear-cut).

           
  • Philipp Klaus Krause

    • summary: BUGGED Z80 CODE --> Incorrect loop optimization (probably in GCSE)
     

Log in to post a comment.

MongoDB Logo MongoDB