From: Vladimir D. <vdo...@wa...> - 2001-12-06 21:57:16
|
Hi,

Can anyone tell me what is going on with performance here? Is this a processor issue?

When I copy data from one place to another, then at each megabyte of distance (the delta between src and dst) there is a kind of "black hole", 32 bytes long, in which the processor works 15 times slower. It happens after 1M, 2M and 3M; I have not tested beyond that.

0..................1M|32bytes.............2M|32bytes.............3M|32bytes
|--------------------|xxx-------------------|xxx-------------------|xxx

N = 1, 2, 3

Delta between src and dst: 0M <= DELTA < 1M - fast
Delta between src and dst: 1M*N <= DELTA <= 1M*N+31 - 15 times slower
Delta between src and dst: 1M*N+31 < DELTA < 1M*(N+1) - fast

It doesn't matter whether the memory area is PAGE_SIZE aligned or not. I even tested with mmap'ed physical memory allocated at boot time with "alloc_bootmem_pages", to be sure that it is contiguous and not swappable - the same result ...

Here is simplified demo code, its output, and the disassembly.

Thanks,
Vova

Output:
u8copy1mm1 3145728 chars = 0.174000 sec.
u8copy1m 3145728 chars = 2.392500 sec.
u8copy1mp31 3145728 chars = 2.391000 sec.
u8copy1mp32 3145728 chars = 0.174000 sec.
Code: //1024*1024*3 #define THREEMEG 3145728 //1M #define ONEMEGMINUS1 1048575 #define ONEMEG 1048576 #define ONEMEGPLUS31 1048607 #define ONEMEGPLUS32 1048608 void u8copy1mm1(void) { for(ii = 0; ii < THREEMEG; ii++) mem8_1[0]=mem8_1[ONEMEGMINUS1]; } void u8copy1m(void) { for(ii = 0; ii < THREEMEG; ii++) mem8_1[0]=mem8_1[ONEMEG]; } void u8copy1mp31(void) { for(ii = 0; ii < THREEMEG; ii++) mem8_1[0]=mem8_1[ONEMEGPLUS31]; } void u8copy1mp32(void) { for(ii = 0; ii < THREEMEG; ii++) mem8_1[0]=mem8_1[ONEMEGPLUS32]; } start = clock(); for( k = 0; k < NUMBEROFLOOPS; k++ ) u8copy1mm1(); end = clock(); cpu_time_used = (((double)(end-start))/CLOCKS_PER_SEC)/k; printf("u8copy1mm1 %i chars = %f sec.\n",THREEMEG,cpu_time_used); start = clock(); for( k = 0; k < NUMBEROFLOOPS; k++ ) u8copy1m(); end = clock(); cpu_time_used = (((double)(end-start))/CLOCKS_PER_SEC)/k; printf("u8copy1m %i chars = %f sec.\n",THREEMEG,cpu_time_used); start = clock(); for( k = 0; k < NUMBEROFLOOPS; k++ ) u8copy1mp31(); end = clock(); cpu_time_used = (((double)(end-start))/CLOCKS_PER_SEC)/k; printf("u8copy1mp31 %i chars = %f sec.\n",THREEMEG,cpu_time_used); start = clock(); for( k = 0; k < NUMBEROFLOOPS; k++ ) u8copy1mp32(); end = clock(); cpu_time_used = (((double)(end-start))/CLOCKS_PER_SEC)/k; printf("u8copy1mp32 %i chars = %f sec.\n",THREEMEG,cpu_time_used); Output: mem1 = 0x29711000 THREEMEG = 3145728 NUMBEROFLOOPS = 10 u8copy1mm1 3145728 chars = 0.174000 sec. u8copy1m 3145728 chars = 2.392500 sec. u8copy1mp31 3145728 chars = 2.391000 sec. u8copy1mp32 3145728 chars = 0.174000 sec. 
Disassembly: 00402240 <u8copy1mm1>: 402240: 09 d2 mov.l 402268<u8copy1mm1+0x28>,r2!0x413940 402242: 00 e1 mov #0,r1 402244: 09 d6 mov.l 40226c<u8copy1mm1+0x2c>,r6!0x2fffff 402246: 0a d7 mov.l 402270<u8copy1mm1+0x30>,r7!0x413900 402248: 23 63 mov r2,r3 40224a: 0a d0 mov.l 402274<u8copy1mm1+0x34>,r0!0xfffff 40224c: e6 2f mov.l r14,@-r15 40224e: 12 22 mov.l r1,@r2 402250: f3 6e mov r15,r14 402252: 72 62 mov.l @r7,r2 402254: 2c 04 mov.b @(r0,r2),r4 402256: 40 22 mov.b r4,@r2 402258: 32 61 mov.l @r3,r1 40225a: 01 71 add #1,r1 40225c: 67 31 cmp/gt r6,r1 40225e: f8 8f bf.s 402252 <u8copy1mm1+0x12> 402260: 12 23 mov.l r1,@r3 402262: e3 6f mov r14,r15 402264: 0b 00 rts 402266: f6 6e mov.l @r15+,r14 402268: 40 39 cmp/eq r4,r9 40226a: 41 00 .word 0x0041 40226c: ff ff .word 0xffff 40226e: 2f 00 mac.l @r2+,@r0+ 402270: 00 39 cmp/eq r0,r9 402272: 41 00 .word 0x0041 402274: ff ff .word 0xffff 402276: 0f 00 mac.l @r0+,@r0+ 402278: 09 00 nop 40227a: 09 00 nop 40227c: 09 00 nop 40227e: 09 00 nop 00402280 <u8copy1m>: 402280: 09 d2 mov.l 4022a8<u8copy1m+0x28>,r2!0x413940 402282: 00 e1 mov #0,r1 402284: 09 d6 mov.l 4022ac<u8copy1m+0x2c>,r6!0x2fffff 402286: 0a d7 mov.l 4022b0<u8copy1m+0x30>,r7!0x413900 402288: 23 63 mov r2,r3 40228a: 0a d0 mov.l 4022b4<u8copy1m+0x34>,r0!0x100000 40228c: e6 2f mov.l r14,@-r15 40228e: 12 22 mov.l r1,@r2 402290: f3 6e mov r15,r14 402292: 72 62 mov.l @r7,r2 402294: 2c 04 mov.b @(r0,r2),r4 402296: 40 22 mov.b r4,@r2 402298: 32 61 mov.l @r3,r1 40229a: 01 71 add #1,r1 40229c: 67 31 cmp/gt r6,r1 40229e: f8 8f bf.s 402292 <u8copy1m+0x12> 4022a0: 12 23 mov.l r1,@r3 4022a2: e3 6f mov r14,r15 4022a4: 0b 00 rts 4022a6: f6 6e mov.l @r15+,r14 4022a8: 40 39 cmp/eq r4,r9 4022aa: 41 00 .word 0x0041 4022ac: ff ff .word 0xffff 4022ae: 2f 00 mac.l @r2+,@r0+ 4022b0: 00 39 cmp/eq r0,r9 4022b2: 41 00 .word 0x0041 4022b4: 00 00 .word 0x0000 4022b6: 10 00 .word 0x0010 4022b8: 09 00 nop 4022ba: 09 00 nop 4022bc: 09 00 nop 4022be: 09 00 nop 004022c0 <u8copy1mp31>: 4022c0: 09 d2 
mov.l 4022e8<u8copy1mp31+0x28>,r2!0x413940 4022c2: 00 e1 mov #0,r1 4022c4: 09 d6 mov.l 4022ec<u8copy1mp31+0x2c>,r6!0x2fffff 4022c6: 0a d7 mov.l 4022f0<u8copy1mp31+0x30>,r7!0x413900 4022c8: 23 63 mov r2,r3 4022ca: 0a d0 mov.l 4022f4<u8copy1mp31+0x34>,r0!0x10001f 4022cc: e6 2f mov.l r14,@-r15 4022ce: 12 22 mov.l r1,@r2 4022d0: f3 6e mov r15,r14 4022d2: 72 62 mov.l @r7,r2 4022d4: 2c 04 mov.b @(r0,r2),r4 4022d6: 40 22 mov.b r4,@r2 4022d8: 32 61 mov.l @r3,r1 4022da: 01 71 add #1,r1 4022dc: 67 31 cmp/gt r6,r1 4022de: f8 8f bf.s 4022d2 <u8copy1mp31+0x12> 4022e0: 12 23 mov.l r1,@r3 4022e2: e3 6f mov r14,r15 4022e4: 0b 00 rts 4022e6: f6 6e mov.l @r15+,r14 4022e8: 40 39 cmp/eq r4,r9 4022ea: 41 00 .word 0x0041 4022ec: ff ff .word 0xffff 4022ee: 2f 00 mac.l @r2+,@r0+ 4022f0: 00 39 cmp/eq r0,r9 4022f2: 41 00 .word 0x0041 4022f4: 1f 00 mac.l @r1+,@r0+ 4022f6: 10 00 .word 0x0010 4022f8: 09 00 nop 4022fa: 09 00 nop 4022fc: 09 00 nop 4022fe: 09 00 nop 00402300 <u8copy1mp32>: 402300: 09 d2 mov.l 402328<u8copy1mp32+0x28>,r2!0x413940 402302: 00 e1 mov #0,r1 402304: 09 d6 mov.l 40232c<u8copy1mp32+0x2c>,r6!0x2fffff 402306: 0a d7 mov.l 402330<u8copy1mp32+0x30>,r7!0x413900 402308: 23 63 mov r2,r3 40230a: 0a d0 mov.l 402334<u8copy1mp32+0x34>,r0!0x100020 40230c: e6 2f mov.l r14,@-r15 40230e: 12 22 mov.l r1,@r2 402310: f3 6e mov r15,r14 402312: 72 62 mov.l @r7,r2 402314: 2c 04 mov.b @(r0,r2),r4 402316: 40 22 mov.b r4,@r2 402318: 32 61 mov.l @r3,r1 40231a: 01 71 add #1,r1 40231c: 67 31 cmp/gt r6,r1 40231e: f8 8f bf.s 402312 <u8copy1mp32+0x12> 402320: 12 23 mov.l r1,@r3 402322: e3 6f mov r14,r15 402324: 0b 00 rts 402326: f6 6e mov.l @r15+,r14 402328: 40 39 cmp/eq r4,r9 40232a: 41 00 .word 0x0041 40232c: ff ff .word 0xffff 40232e: 2f 00 mac.l @r2+,@r0+ 402330: 00 39 cmp/eq r0,r9 402332: 41 00 .word 0x0041 402334: 20 00 .word 0x0020 402336: 10 00 .word 0x0010 402338: 09 00 nop 40233a: 09 00 nop 40233c: 09 00 nop 40233e: 09 00 nop |