Thread: [xtensa-cvscommit] linux/arch/xtensa/lib/hal memcopy.S,1.1.1.1,1.2
From: <joe...@us...> - 2002-11-21 17:18:55
Update of /cvsroot/xtensa/linux/arch/xtensa/lib/hal
In directory sc8-pr-cvs1:/tmp/cvs-serv26438/arch/xtensa/lib/hal

Modified Files:
	memcopy.S
Log Message:
Thanks to Marc Gauthier for these changes:
1. Bug fix to non-LOOP version of memcpy().
2. Removes all unaligned accesses (needed for upcoming unaligned exceptions).
3. Optimized for common case (dst and src are aligned).

Index: memcopy.S
===================================================================
RCS file: /cvsroot/xtensa/linux/arch/xtensa/lib/hal/memcopy.S,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -C2 -d -r1.1.1.1 -r1.2
*** memcopy.S	28 Aug 2002 16:10:14 -0000	1.1.1.1
--- memcopy.S	21 Nov 2002 17:18:52 -0000	1.2
***************
*** 12,15 ****
--- 12,32 ----
   */
  
+ #include <xtensa/coreasm.h>
+ 
+ 	.macro	src_b	r, w0, w1
+ #ifdef __XTENSA_EB__
+ 	src	\r, \w0, \w1
+ #else
+ 	src	\r, \w1, \w0
+ #endif
+ 	.endm
+ 
+ 	.macro	ssa8	r
+ #ifdef __XTENSA_EB__
+ 	ssa8b	\r
+ #else
+ 	ssa8l	\r
+ #endif
+ 	.endm
  
  /*
***************
*** 25,46 ****
   * types of devices).
   *
!  * !!!!!!! FIXME:
!  * !!!!!!! Handling of IRAM/IROM/DRAM/DROM has not yet
!  * !!!!!!! been implemented, and should be added here.
   *
   * The bcopy version is provided here to avoid the overhead
   * of an extra call, for callers that require this convention.
   *
!  * The general case algorithm is as follows:
!  *   If the destination and source are both aligned,
!  *     do 16B chunks with a loop, and then finish up with
!  *     8B, 4B, 2B, and 1B copies conditional on the length.
!  *   If destination is aligned and source unaligned,
!  *     do the same, but use SRC to align the source data.
   *   If destination is unaligned, align it by conditionally
!  *     copying 1B and 2B and then retest.
!  * This code tries to use fall-through braches for the common
!  *   case of aligned destinations (except for the branches to
!  *   the alignment label).
   *
   * Register use:
--- 42,63 ----
   * types of devices).
   *
!  * !!!!!!! XTFIXME:
!  * !!!!!!! Handling of IRAM/IROM has not yet
!  * !!!!!!! been implemented.
   *
   * The bcopy version is provided here to avoid the overhead
   * of an extra call, for callers that require this convention.
   *
!  * The (general case) algorithm is as follows:
   *   If destination is unaligned, align it by conditionally
!  *     copying 1 and 2 bytes.
!  *   If source is aligned,
!  *     do 16 bytes with a loop, and then finish up with
!  *     8, 4, 2, and 1 byte copies conditional on the length;
!  *   else (if source is unaligned),
!  *     do the same, but use SRC to align the source data.
!  * This code tries to use fall-through branches for the common
!  *   case of aligned source and destination and multiple
!  *   of 4 (or 8) length.
   *
   * Register use:
***************
*** 55,70 ****
   *	a8/ tmp
   *	a9/ tmp
   */
- #include <xtensa/coreasm.h>
- 
- #ifdef __XTENSA_EB__
- #define ALIGN(R, W0, W1)	src	R, W0, W1
- #define SSA8(R)	ssa8b	R
- #else
- #define ALIGN(R, W0, W1)	src	R, W1, W0
- #define SSA8(R)	ssa8l	R
- #endif
- 
  	.text
  	.align	4
--- 72,79 ----
   *	a8/ tmp
   *	a9/ tmp
+  *	a10/ tmp
+  *	a11/ tmp
   */
  
  	.text
  	.align	4
***************
*** 80,129 ****
  
- 	.align	4
- 	.global	xthal_memcpy
- 	.type	xthal_memcpy,@function
- xthal_memcpy:
- 	entry	sp, 16			# minimal stack frame
- 	# a2/ dst, a3/ src, a4/ len
- 	mov	a5, a2			# copy dst so that a2 is return value
- .Lcommon:
- 	bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
- 	bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
- .Ldstaligned:	# return here from .Ldstunaligned when dst is aligned
- 	srli	a7, a4, 4		# number of loop iterations with 16B
- 					# per iteration
- 	movi	a8, 3			# if source is also aligned,
- 	bnone	a3, a8, .Laligned	# then use word copy
- 	SSA8(	a3)			# set shift amount from byte offset
- 	bnez	a4, .Lsrcunaligned
- 	retw
- 
- /*
-  * Destination is unaligned
-  */
- 
- .Ldst1mod2:	# dst is only byte aligned
- 	bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
- 
- 	# copy 1 byte
- 	l8ui	a6, a3, 0
- 	addi	a3, a3, 1
- 	s8i	a6, a5, 0
- 	addi	a5, a5, 1
- 	addi	a4, a4, -1
- 	bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
- 					# return to main algorithm
- .Ldst2mod4:	# dst 16-bit aligned
- 	# copy 2 bytes
- 	bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
- 	l8ui	a6, a3, 0
- 	l8ui	a7, a3, 1
- 	addi	a3, a3, 2
- 	s8i	a6, a5, 0
- 	s8i	a7, a5, 1
- 	addi	a5, a5, 2
- 	addi	a4, a4, -2
- 	j	.Ldstaligned	# dst is now aligned, return to main algorithm
- 
  /*
   * Byte by byte copy
--- 89,92 ----
***************
*** 151,160 ****
  
  /*
!  * Destination and source are word-aligned.
   */
  	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
- 	.align	4		# 1 mod 4 alignment for LOOPNEZ
- 	.byte	0		# (0 mod 4 alignment for LBEG)
- .Laligned:
  #if XCHAL_HAVE_LOOPS
  	loopnez	a7, .Loop1done
--- 114,164 ----
  
  /*
!  * Destination is unaligned
   */
+ 
+ 	.align	4
+ .Ldst1mod2:	# dst is only byte aligned
+ 	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
+ 
+ 	# copy 1 byte
+ 	l8ui	a6, a3, 0
+ 	addi	a3, a3, 1
+ 	addi	a4, a4, -1
+ 	s8i	a6, a5, 0
+ 	addi	a5, a5, 1
+ 	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
+ 					# return to main algorithm
+ .Ldst2mod4:	# dst 16-bit aligned
+ 	# copy 2 bytes
+ 	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
+ 	l8ui	a6, a3, 0
+ 	l8ui	a7, a3, 1
+ 	addi	a3, a3, 2
+ 	addi	a4, a4, -2
+ 	s8i	a6, a5, 0
+ 	s8i	a7, a5, 1
+ 	addi	a5, a5, 2
+ 	j	.Ldstaligned	# dst is now aligned, return to main algorithm
+ 
+ 	.align	4
+ 	.global	xthal_memcpy
+ 	.type	xthal_memcpy,@function
+ xthal_memcpy:
+ 	entry	sp, 16			# minimal stack frame
+ 	# a2/ dst, a3/ src, a4/ len
+ 	mov	a5, a2			# copy dst so that a2 is return value
+ .Lcommon:
+ 	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
+ 	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
+ .Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
+ 	srli	a7, a4, 4		# number of loop iterations with 16B
+ 					# per iteration
+ 	movi	a8, 3			# if source is not aligned,
+ 	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
+ 	/*
+ 	 * Destination and source are word-aligned, use word copy.
+ 	 */
  	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
  #if XCHAL_HAVE_LOOPS
  	loopnez	a7, .Loop1done
***************
*** 188,192 ****
  	addi	a5, a5, 8
  .L2:
! 	bbci.l	a4, 2, .L3
  	# copy 4 bytes
  	l32i	a6, a3, 0
--- 192,200 ----
  	addi	a5, a5, 8
  .L2:
! 	bbsi.l	a4, 2, .L3
! 	bbsi.l	a4, 1, .L4
! 	bbsi.l	a4, 0, .L5
! 	retw
! .L3:
  	# copy 4 bytes
  	l32i	a6, a3, 0
***************
*** 194,199 ****
  	s32i	a6, a5, 0
  	addi	a5, a5, 4
! .L3:
! 	bbci.l	a4, 1, .L4
  	# copy 2 bytes
  	l16ui	a6, a3, 0
--- 202,209 ----
  	s32i	a6, a5, 0
  	addi	a5, a5, 4
! 	bbsi.l	a4, 1, .L4
! 	bbsi.l	a4, 0, .L5
! 	retw
! .L4:
  	# copy 2 bytes
  	l16ui	a6, a3, 0
***************
*** 201,211 ****
  	s16i	a6, a5, 0
  	addi	a5, a5, 2
! .L4:
! 	bbci.l	a4, 0, .L5
  	# copy 1 byte
  	l8ui	a6, a3, 0
  	s8i	a6, a5, 0
- .L5:
- .Lret1:
  	retw
--- 211,220 ----
  	s16i	a6, a5, 0
  	addi	a5, a5, 2
! 	bbsi.l	a4, 0, .L5
! 	retw
! .L5:
  	# copy 1 byte
  	l8ui	a6, a3, 0
  	s8i	a6, a5, 0
  	retw
***************
*** 215,224 ****
  
  	.align	4
- 	.byte	0		# 1 mod 4 alignement for LOOPNEZ
- 				# (0 mod 4 alignment for LBEG)
  .Lsrcunaligned:
  	# copy 16 bytes per iteration for word-aligned dst and unaligned src
! 	and	a10, a3, a8	# save unalignment offset for below
! 	sub	a3, a3, a10	# align a3 (to avoid sim warnings only; not needed for hardware)
  	l32i	a6, a3, 0	# load first word
  #if XCHAL_HAVE_LOOPS
--- 224,237 ----
  
  	.align	4
  .Lsrcunaligned:
+ 	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
  	# copy 16 bytes per iteration for word-aligned dst and unaligned src
! 	ssa8	a3		# set shift amount from byte offset
! #define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator) with the
! 					   lint or ferret client, or 0 to save a few cycles */
! #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
! 	and	a11, a3, a8	# save unalignment offset for below
! 	sub	a3, a3, a11	# align a3
! #endif
  	l32i	a6, a3, 0	# load first word
  #if XCHAL_HAVE_LOOPS
***************
*** 232,245 ****
  	l32i	a7, a3, 4
  	l32i	a8, a3, 8
! 	ALIGN(	a6, a6, a7)
  	s32i	a6, a5, 0
  	l32i	a9, a3, 12
! 	ALIGN(	a7, a7, a8)
  	s32i	a7, a5, 4
  	l32i	a6, a3, 16
! 	ALIGN(	a8, a8, a9)
  	s32i	a8, a5, 8
  	addi	a3, a3, 16
! 	ALIGN(	a9, a9, a6)
  	s32i	a9, a5, 12
  	addi	a5, a5, 16
--- 245,258 ----
  	l32i	a7, a3, 4
  	l32i	a8, a3, 8
! 	src_b	a6, a6, a7
  	s32i	a6, a5, 0
  	l32i	a9, a3, 12
! 	src_b	a7, a7, a8
  	s32i	a7, a5, 4
  	l32i	a6, a3, 16
! 	src_b	a8, a8, a9
  	s32i	a8, a5, 8
  	addi	a3, a3, 16
! 	src_b	a9, a9, a6
  	s32i	a9, a5, 12
  	addi	a5, a5, 16
***************
*** 252,259 ****
  	l32i	a7, a3, 4
  	l32i	a8, a3, 8
! 	ALIGN(	a6, a6, a7)
  	s32i	a6, a5, 0
  	addi	a3, a3, 8
! 	ALIGN(	a7, a7, a8)
  	s32i	a7, a5, 4
  	addi	a5, a5, 8
--- 265,272 ----
  	l32i	a7, a3, 4
  	l32i	a8, a3, 8
! 	src_b	a6, a6, a7
  	s32i	a6, a5, 0
  	addi	a3, a3, 8
! 	src_b	a7, a7, a8
  	s32i	a7, a5, 4
  	addi	a5, a5, 8
***************
*** 264,274 ****
  	l32i	a7, a3, 4
  	addi	a3, a3, 4
! 	ALIGN(	a6, a6, a7)
  	s32i	a6, a5, 0
  	addi	a5, a5, 4
  	mov	a6, a7
  .L13:
! 	add	a3, a3, a10	# readjust a3 with correct misalignment
! 	bbci.l	a4, 1, .L14
  	# copy 2 bytes
  	l8ui	a6, a3, 0
--- 277,292 ----
  	l32i	a7, a3, 4
  	addi	a3, a3, 4
! 	src_b	a6, a6, a7
  	s32i	a6, a5, 0
  	addi	a5, a5, 4
  	mov	a6, a7
  .L13:
! #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
! 	add	a3, a3, a11	# readjust a3 with correct misalignment
! #endif
! 	bbsi.l	a4, 1, .L14
! 	bbsi.l	a4, 0, .L15
! .Ldone:	retw
! .L14:
  	# copy 2 bytes
  	l8ui	a6, a3, 0
***************
*** 278,287 ****
  	s8i	a7, a5, 1
  	addi	a5, a5, 2
! .L14:
! 	bbci.l	a4, 0, .L15
  	# copy 1 byte
  	l8ui	a6, a3, 0
  	s8i	a6, a5, 0
- .L15:
  	retw
--- 296,305 ----
  	s8i	a7, a5, 1
  	addi	a5, a5, 2
! 	bbsi.l	a4, 0, .L15
! 	retw
! .L15:
  	# copy 1 byte
  	l8ui	a6, a3, 0
  	s8i	a6, a5, 0
  	retw
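
For readers who don't follow Xtensa assembly, the strategy described in the new header comment can be summarized in C. The sketch below is only illustrative and is not part of the commit: the function name memcpy_sketch is hypothetical, it models the ssa8/src_b funnel shift with explicit shifts in a little-endian view only, and it ignores the 16-byte unrolling and the LOOP instructions.

#include <stddef.h>
#include <stdint.h>

/* Rough C model of the copy strategy above (hypothetical, little-endian view). */
void *memcpy_sketch(void *dst, const void *src, size_t len)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	/* Align the destination first with byte copies (the .Ldst?mod? cases). */
	while (((uintptr_t)d & 3) && len) {
		*d++ = *s++;
		len--;
	}

	if (((uintptr_t)s & 3) == 0) {
		/* Common case: both pointers now word-aligned, plain word copy. */
		while (len >= 4) {
			*(uint32_t *)d = *(const uint32_t *)s;
			d += 4; s += 4; len -= 4;
		}
	} else if (len >= 4) {
		/* Unaligned source: load aligned words and funnel-shift them,
		 * which is the role ssa8 + src_b play; like the assembly, the
		 * first load reads the aligned word containing the first
		 * source byte, so no unaligned loads are ever issued. */
		unsigned shift = ((uintptr_t)s & 3) * 8;
		const uint32_t *ws = (const uint32_t *)((uintptr_t)s & ~(uintptr_t)3);
		uint32_t lo = *ws++;
		while (len >= 4) {
			uint32_t hi = *ws++;
			*(uint32_t *)d = (lo >> shift) | (hi << (32 - shift));
			d += 4; s += 4; len -= 4;
			lo = hi;
		}
	}

	/* At most 3 tail bytes remain; copy them byte by byte. */
	while (len--)
		*d++ = *s++;
	return dst;
}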