Small Device C Compiler (SDCC) / Discussion / [z80] long long bloated code

I no longer have that original code fragment -- I thought sdcc was doing that all the time but it isn't. I added peephole rules to change those to 16-bit pushes so stopped seeing the problem or lack of problem. I must have got lucky on the first try and got that weirdness but it still shows up in small amounts (see below).

This is the test code I am using now - you can prune as needed. Everything is working with sdcc so there is no problem with the correctness of the 64-bit code generation for the z80.

/* Result record for a 64-bit division: one remainder and one quotient.
 * NOTE(review): the remainder is declared before the quotient; the external
 * _lldiv_ routine presumably depends on this member order -- confirm before
 * reordering. */
typedef struct {
   long long rem;    /* remainder of the division */
   long long quot;   /* quotient of the division */
} lldiv_t;

/* Prototypes for the library routines the test program calls.
 * __z88dk_callee and __z88dk_fastcall are calling-convention keywords of
 * SDCC's z80 port; see the SDCC manual for their exact semantics. */
extern int printf(char *fmt, ...);
extern int scanf(char *fmt, ...);
/* Presumably writes the text form of i in the given radix into buf; the
 * test prints the buffers with %s afterwards. */
extern char *lltoa(long long i, char *buf, int radix) __z88dk_callee;
extern int ffsll(long long i) __z88dk_callee;
extern long long llabs(long long i) __z88dk_callee;
/* NOTE(review): the parameters are named (denom, num) in that order, while
 * main() passes (x.a, y.a) -- confirm which argument is the dividend. */
extern void _lldiv_(lldiv_t *ld, long long denom, long long num) __z88dk_callee;

/* Parameter is named ms; presumably a delay in milliseconds -- TODO confirm. */
extern void in_pause(unsigned int ms) __z88dk_fastcall;

/* Two unsigned long members used to access a 64-bit value in two pieces.
 * NOTE(review): the names suggest ls32 = least-significant and
 * ms32 = most-significant 32 bits; that only matches a long long on a
 * little-endian target with 32-bit unsigned long -- confirm for the target. */
struct long_s
{
   unsigned long ls32;   /* first member: lowest addresses of the pair */
   unsigned long ms32;   /* second member: follows ls32 in memory */
};

/* Overlays a long long with a struct long_s so the same storage can be
 * accessed either as one 64-bit value (a) or as two unsigned long
 * members (b). */
union u64_s
{
   long long a;          /* whole-value view, used for the arithmetic */
   struct long_s b;      /* piecewise view, filled in by scanf in main() */
};

//union u64_s x, y, z;

/* Text buffers filled by lltoa() and printed with %s in main():
 * buffer_0 and buffer_1 hold the two operands, buffer_2 holds each result.
 * After the _lldiv_ call, buffer_0/buffer_1 are reused for quotient and
 * remainder. */
char buffer_0[100];
char buffer_1[100];
char buffer_2[100];

/* Destination record passed to _lldiv_() in main(). */
lldiv_t ld;

/*
 * Interactive exerciser for 64-bit (long long) code generation.
 *
 * Each pass of the endless loop reads two 64-bit operands, each entered as
 * two hexadecimal 32-bit words, then prints the result of +, -, *, /, %,
 * >> and << on them, followed by ffsll(), llabs() and _lldiv_().
 *
 * NOTE(review): the operands are used as typed in. A second operand of 0
 * makes the / and % expressions undefined, and a shift count outside 0..63
 * makes the >> and << expressions undefined in ISO C. The statements are
 * deliberately left unguarded so the generated code stays the same as the
 * test case was written.
 */
void main(void)
{
   /* Locals, so the 64-bit operands are accessed on the stack. */
   union u64_s x, y, z;

   while(1)
   {
      /* Read each operand through the union's two-word view. */
      printf("\n\n[#1] MS32 <space> LS32 in hex\n");
      scanf("%lx%lx", &x.b.ms32, &x.b.ls32);

      printf("\n[#2] MS32 <space> LS32 in hex\n");
      scanf("%lx%lx", &y.b.ms32, &y.b.ls32);

      /* Operand strings are produced once and reused in every report line. */
      lltoa(x.a, buffer_0, 10);
      lltoa(y.a, buffer_1, 10);

      z.a = x.a + y.a;
      lltoa(z.a, buffer_2, 10);
      printf("\n%s + %s = %s\n", buffer_0, buffer_1, buffer_2);

      z.a = x.a - y.a;
      lltoa(z.a, buffer_2, 10);
      printf("%s - %s = %s\n", buffer_0, buffer_1, buffer_2);

      z.a = x.a * y.a;
      lltoa(z.a, buffer_2, 10);
      printf("%s * %s = %s\n", buffer_0, buffer_1, buffer_2);

      z.a = x.a / y.a;
      lltoa(z.a, buffer_2, 10);
      printf("%s / %s = %s\n", buffer_0, buffer_1, buffer_2);

      z.a = x.a % y.a;
      lltoa(z.a, buffer_2, 10);
      printf("%s %% %s = %s\n", buffer_0, buffer_1, buffer_2);

      z.a = x.a >> y.a;
      lltoa(z.a, buffer_2, 10);
      printf("%s >> %s = %s\n", buffer_0, buffer_1, buffer_2);

      z.a = x.a << y.a;
      lltoa(z.a, buffer_2, 10);
      printf("%s << %s = %s\n", buffer_0, buffer_1, buffer_2);

      /* ffsll() is declared to return int, so it is printed with %d. */
      printf("ffsll(%s) = %d\n", buffer_0, ffsll(x.a));

      z.a = llabs(x.a);
      in_pause(200);  // avoid sdcc bug already reported
      lltoa(z.a, buffer_2, 10);
      printf("llabs(%s) = %s\n", buffer_0, buffer_2);

      /* buffer_0 and buffer_1 are overwritten here with the division results. */
      _lldiv_(&ld, x.a, y.a);
      lltoa(ld.quot, buffer_0, 10);
      lltoa(ld.rem, buffer_1, 10);
      printf("lldiv: quotient = %s, remainder = %s\n\n", buffer_0, buffer_1);
   }
}

Compile with:
sdcc -mz80 -S --max-allocs-per-node200000 test.c

It's not as egregious but you can still find small examples of that sort of code in the output:

;tcode.c:68: lltoa(z.a, buffer_2, 10);
......
    ld  h,-13 (ix)
    ld  l,-14 (ix)
    push    hl
    ld  a,-15 (ix)   ;;; here doing it 8-bits instead of 16
    push    af
    inc sp
    ld  a,-16 (ix)
    push    af
    inc sp
    call    _lltoa

Even with the 16-bit pushes, pushing one longlong param allocated on the stack is taking up 28 bytes.

Adds and subs of two longlong allocated on the stack take up 72 bytes (they are inlined in the output of the above).

I can't seem to generate a case where sdcc makes use of its own 64-bit shift library functions (__rrulonglong and family). Instead it's inlining the shifts. One example here:

;tcode.c:71: z.a = x.a >> y.a;
    ld  e,-18 (ix)
    ld  d,-17 (ix)
    ld  hl, #0x001e
    add hl, sp
    ex  de, hl
    ld  bc, #0x0008
    ldir
    ld  e,-22 (ix)
    ld  d,-21 (ix)
    ld  hl, #0x0026
    add hl, sp
    ex  de, hl
    ld  bc, #0x0008
    ldir
    ld  b,-8 (ix)
    push    af
    pop af
    inc b
    jr  00111$
00110$:
    sra -9 (ix)
    rr  -10 (ix)
    rr  -11 (ix)
    rr  -12 (ix)
    rr  -13 (ix)
    rr  -14 (ix)
    rr  -15 (ix)
    rr  -16 (ix)
00111$:
    djnz    00110$

That's 79 bytes for one shift compared to a small set up and call of sdcc's function. The implementation we have is also much faster than this inlined code so it would be preferable if sdcc used the library routine.

I think for 64-bit types, using library functions for everything will lead to practical code sizes and speed improvement.

I see the same sort of thing with 32-bit types btw. The other C compiler we have (a much enhanced small C derivative) generates library calls for all 32-bit operations and in programs with a lot of 32-bit code, this leads to program sizes that are 10% smaller than sdcc because sdcc is inlining 32-bit ops. With 32-bit code the situation is a little bit different in that the code sdcc inlines is sometimes faster than a call to a library routine would be when set up and call overhead is accounted for. But if someone is looking at this, I would suggest that maybe the "--opt-code-size" flag could be used to indicate 32-bit code should use library functions while for 64-bit functions the library should always be used.

Bugs: ~~#3428~~

Small Device C Compiler (SDCC) Discussion

The Small Device C Compiler (SDCC), targeting 8-bit architectures

Forums

Help

[z80] long long bloated code

Related