diff -r a1e347446159 -r 28ccaba883f4 genericopenlibs/liboil/src/motovec/vec_memset.s --- a/genericopenlibs/liboil/src/motovec/vec_memset.s Fri Sep 17 19:25:42 2010 +0530 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,553 +0,0 @@ -//------------------------------------------------------------------ -// file: vec_memset.S -// AltiVec enabled version of memset and bzero and cacheable_memzero -//------------------------------------------------------------------ - -//------------------------------------------------------------------ -// Copyright Motorola, Inc. 2002 -// ALL RIGHTS RESERVED -// -// You are hereby granted a copyright license to use, modify, and -// distribute the SOFTWARE so long as this entire notice is retained -// without alteration in any modified and/or redistributed versions, -// and that such modified versions are clearly identified as such. -// No licenses are granted by implication, estoppel or otherwise under -// any patents or trademarks of Motorola, Inc. -// -// The SOFTWARE is provided on an "AS IS" basis and without warranty. -// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS -// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED -// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR -// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH -// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS -// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. -// -// To the maximum extent permitted by applicable law, IN NO EVENT SHALL -// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER -// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF -// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS -// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR -// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility -// for the maintenance and support of the SOFTWARE. 
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// extern void *memset( void *ptr, int val, size_t len );
-// Copies val into each of len characters beginning at ptr.
-// - Harbison&Steele 4th ed
-// (despite val being an int, this memset assumes it is never
-// more than a byte. That seems to be correct from all the
-// memset functions I've seen but I don't know if ANSI allows
-// anything longer. Chuck Corley 12/21/02)
-// Returns:
-// void * ptr
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// extern void * bzero( char *ptr, int len);
-// Copies 0 into each of len characters at ptr.
-// - Harbison&Steele 4th ed
-// Returns:
-// void * ptr
-//------------------------------------------------------------------
-
-// Revision History:
-// Rev 0.0 Original Chuck Corley 02/09/03
-// Could benefit from changes added to memcpy
-// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
-//
-// This is beta quality code; users are encouraged to make it faster.
-// ASSUMPTIONS:
-// Code is highly likely to be in the cache; data is not (streaming data)
-// Zero fill could be quite likely.
-// Moving fill byte from GPR to VR as below faster than stw->lvebx via stack
-
-#define VRSV 256 // VRSAVE spr
-// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
-#define MIN_VEC 16
-
-// Register usage
-// (r9 and r10 each carry several aliases; the aliases are used in
-// different phases of the routines and must not be live simultaneously.)
-#define Rt r0 // r0 when used as a temporary register
-
-#define DST r3 // entering: dest pointer; exiting: same dest pointer
-
-#define FILL r4 // entering: fill char then fill word
-
-#define BC r5 // entering: Byte_Count then remaining Byte_Count
-
-#define DBC r6// dst + byte count
-
-#define BK r7 // BC - 1 +/- (n*16)
-
-#define Fsh r8 // fill byte shifted right one nibble
-
-#define DM1 r9// dst -1 for byte-by-byte backwards initially
-#define D r9 // (dst+16)[0:27] - dst[28:31]
-#define DNX r9 // (dst+n*16)[28:31]
-#define BL r9 // second byte_kount index pointer
-
-#define DR r10 // (dst+16)[0:27]
-#define QW r10 // number of quad words (16-byte vectors) remaining
-
-#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
-
-#define RSV r12 // storage for VRSAVE register if used
-
-// Condition register use (not including temporary cr0)
-// cr0[2] = (FILL==0)?
-// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
-// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
-// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
-// cr6[2] = (QW == 0)? 1 : 0;
-// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
-// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
-// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
-// cr7[1] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors; tested with bgt)
-// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
-// then cr7[0:3] = (DST+BC)[28:31] (How many bytes (iff <16) in last vector?)
-
-// Conditionalize the use of dcba. It will help if the data is
-// not in cache and hurt if it is. Generally, except for small
-// benchmarks repeated many times, we assume data is not in cache
-// (data streaming) and using dcba is a performance boost.
-// We use dcba which will noop to non-cacheable memory rather than
-// dcbz which will cause an alignment exception.
-#ifndef NO_DCBA
-#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
-    // gcc and codewarrior and diab don't assemble dcba
-#define DCBK .long 0x7c033dec
-// dcba r3,r7 or dcba DST,BK
-#else
-#ifdef __ghs__
-.macro DCBK
-.long 0x7c033dec
-.endm
-#else
-#define DCBK dcba DST,BK
-#endif // __ghs__
-#endif // __GNUC__ or __MWERKS__
-#else
-#define DCBK nop
-#endif // NO_DCBA
-
-    .text
-#ifdef __MWERKS__
-    .align 32
-#else
-    .align 5
-#endif
-
-// memset entry point. On entry DST (r3) = destination, FILL (r4) = fill
-// value, BC (r5) = byte count. DST is only ever used as a base address and
-// is never written, so r3 still holds the destination pointer on return.
-#ifdef LIBMOTOVEC
-    .globl memset
-memset:
-#else
-    .globl _vec_memset
-_vec_memset:
-#endif
-
-    cmpi    cr7,0,BC,MIN_VEC    // IU1 Check for minimum byte count
-    cmpi    cr1,0,BC,0          // IU1 Eliminate zero byte count
-    rlwinm. Fsh,FILL,28,28,3    // IU1 Is fill byte zero? and shift (record form: sets cr0 for beq+ below)
-
-    addi    DM1,DST,-1          // IU1 Pre-bias and duplicate destination
-    addi    DR,DST,16           // IU1 Address of second dst vector
-    add     DBC,DST,BC          // IU1 Address of last dst byte + 1
-    bgt     cr7,v_memset        // b if BC>MIN_VEC
-
-    mtctr   BC                  // for (i=1;i<=BC;i++)
-    beqlr   cr1                 // return if BC = 0
-Byte_set:
-    stbu    FILL,1(DM1)         // LSU * ++(DST-1) = FILL
-    bdnz    Byte_set
-
-    blr
-
-v_memset:
-// Byte counts <= MIN_VEC bytes will have been set by scalar code above,
-// so this will not deal with small block sets <= MIN_VEC.
-
-// For systems using VRSAVE, define VRSAVE when compiling. For systems
-// that don't, make sure VRSAVE is undefined.
-#ifdef VRSAVE
-    mfspr   RSV,VRSV            // IU2 Get current VRSAVE contents
-#endif
-    rlwinm  DR,DR,0,0,27        // IU1 (DST+16)[0:27]
-    addi    DBK,DBC,-1          // IU1 Address of last dst byte
-
-#ifdef VRSAVE
-    oris    Rt,RSV,0xe000       // IU1 Or in registers used by this routine (v0-v2)
-#endif
-    subf    D,DST,DR            // IU1 How many bytes in first destination?
-    li      BK,0                // IU1 Initialize byte kount index
-
-#ifdef VRSAVE
-    mtspr   VRSV,Rt             // IU2 Save in VRSAVE before first vec op
-#endif
-    vxor    v0,v0,v0            // VIU Clear v0
-    subf    QW,DR,DBK           // IU1 Bytes of full vectors to move (-16)
-    cmpi    cr1,0,D,16          // IU1 Is D0 left justified?
-    beq+    enter_bzero         // b if FILL==0 (v0 is already all zero)
-
-    lvsl    v0,0,Fsh            // LSU Move upper nibble to byte 0 of VR
-    vspltisb v1,4               // VPU Splat 0x4 to every byte
-
-    lvsl    v2,0,FILL           // LSU Move lower nibble to byte 0 of VR
-
-    vslb    v0,v0,v1            // VIU Move upper nibble to VR[0:3]
-
-    vor     v0,v0,v2            // VIU Form FILL byte in VR[0:7]
-
-    vspltb  v0,v0,0             // VPU Splat the fill byte to all bytes
-enter_bzero:
-    mtcrf   0x01,D              // IU2 Put bytes in 1st dst in cr7
-    rlwinm  QW,QW,28,4,31       // IU1 Quad words remaining
-    beq     cr1,Left_just       // b if D0 is left justified
-
-    bns     cr7,No_B_fwd        // b if only even number of bytes to store
-
-    stvebx  v0,DST,BK           // LSU store first byte at DST+0
-    addi    BK,BK,1             // IU1 increment index
-No_B_fwd:
-    bne     cr7,No_H_fwd        // b if only words to store
-
-    stvehx  v0,DST,BK           // LSU store halfword at DST+0/1
-    addi    BK,BK,2             // IU1 increment index
-No_H_fwd:
-    bng     cr7,No_W1_fwd       // b if exactly zero or two words to store
-
-    stvewx  v0,DST,BK           // LSU store word 1 of one or three
-    addi    BK,BK,4             // IU1 increment index
-
-No_W1_fwd:
-    bnl     cr7,No_W2_fwd       // b if there was only one word to store
-    stvewx  v0,DST,BK           // LSU store word 1 of two or 2 of three
-    addi    BK,BK,4             // IU1 increment index
-
-    stvewx  v0,DST,BK           // LSU store word 2 of two or 3 of three
-    b       No_W2_fwd
-
-Left_just:
-    stvx    v0,0,DST            // LSU Store 16 bytes at D0
-No_W2_fwd:
-    rlwinm  Rt,DBK,0,28,31      // IU1 (DBK = DST+BC-1)[28:31]
-    cmpi    cr6,0,QW,0          // IU1 Any full vectors to move?
-
-    li      BK,16               // IU1 Re-initialize byte kount index
-    cmpi    cr1,0,Rt,0xF        // IU1 Is DN right justified?
-    ble     cr6,Last_QW         // b if no Quad words to do
-
-    mtctr   QW                  // IU2 for (i=0;i<=QW;i++)
-    cmpi    cr6,0,QW,4          // IU1 Check QW>4
-
-QW_loop:
-    stvx    v0,DST,BK           // LSU Store 16 fill bytes
-    addi    BK,BK,16            // IU1 Increment byte kount index
-    bdnzf   25,QW_loop          // b if 4 or less quad words to do
-
-    add     DNX,DST,BK          // IU1 address of next store (DST+32 if QW>4)
-    addi    QW,QW,-1            // IU1 One more QW stored by now
-    bgt     cr6,GT_4QW_fwd      // b if >4 quad words left
-
-Last_QW: // Next vector is the last; we're done.
-    mtcrf   0x01,DBC            // IU2 Put final vector byte count in cr7
-
-    beq     cr1,Rt_just_fwd     // b if last destination is right justified
-
-    rlwinm  DBK,DBK,0,0,27      // IU1 Round to QW addr of last byte
-    li      BL,0                // IU1 Initialize index pointer
-    bnl     cr7,Only_1W_fwd     // b if there was only one or zero words to store
-
-    stvewx  v0,DBK,BL           // LSU store word 1 of two or three
-    addi    BL,BL,4             // IU1 increment index
-
-    stvewx  v0,DBK,BL           // LSU store word 2 of two or three
-    addi    BL,BL,4             // IU1 increment index
-Only_1W_fwd:
-    bng     cr7,Only_2W_fwd     // b if there were only two or zero words to store
-
-    stvewx  v0,DBK,BL           // LSU store word 3 of three if necessary
-    addi    BL,BL,4             // IU1 increment index
-Only_2W_fwd:
-    bne     cr7,Only_B_fwd      // b if there are no half words to store
-
-    stvehx  v0,DBK,BL           // LSU store one halfword if necessary
-    addi    BL,BL,2             // IU1 increment index
-Only_B_fwd:
-    bns     cr7,All_done_fwd    // b if there are no bytes to store
-
-    stvebx  v0,DBK,BL           // LSU store one byte if necessary
-    b       All_done_fwd
-
-Rt_just_fwd:
-
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D14
-All_done_fwd:
-#ifdef VRSAVE
-    mtspr   VRSV,RSV            // IU1 Restore VRSAVE
-#endif
-    blr                         // Return destination address from entry
-
-#ifdef __MWERKS__
-    .align 16
-#else
-    .align 4
-#endif
-GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
-
-    addi    QW,QW,-1            // IU1 Keeping track of QWs stored
-    mtcrf   0x02,DNX            // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
-    addi    DNX,DNX,16          // IU1 Update cr6 for next loop
-
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D2
-    addi    BK,BK,16            // IU1 Increment byte count by 16
-    bdnzf   27,GT_4QW_fwd       // b if next store is to lower (even) half of CL
-
-    mtcrf   0x02,DBK            // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
-
-    bns     cr6,B32_fwd         // b if DST[27] == 0; i.e, final store is even
-
-// We need the ctr register to reflect an even byte count before entering
-// the next block - faster to decrement than to reload.
-    bdnz    B32_fwd             // decrement counter for last QW store odd
-
-B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
-    DCBK                        // LSU then Kill instead of RWITM
-
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D11
-    addi    BK,BK,16            // IU1 Increment byte count
-    bdz     Nxt_loc_fwd         // always decrement and branch to next instr
-
-Nxt_loc_fwd:
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D12
-    addi    BK,BK,16            // IU1 Increment byte count
-    bdnz    B32_fwd             // b if there are at least two more QWs to do
-
-    bso     cr6,One_even_QW     // b if there is one even and one odd QW to store
-    b       Last_QW             // b if last store is to even address
-
-// Come here with one more full (even) QW to store before the last vector
-One_even_QW:
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D13
-    addi    BK,BK,16            // IU1 Increment byte count
-
-    b       Last_QW
-
-// End of memset in AltiVec
-
-#define BCz r4 // in bzero r4 enters with byte count
-
-#ifdef __MWERKS__
-    .align 32
-#else
-    .align 5
-#endif
-
-// bzero entry point: moves the byte count from r4 into BC, forces FILL to
-// zero, then branches into the memset code above (which returns to the caller).
-#ifdef LIBMOTOVEC
-    .globl bzero
-bzero:
-#else
-    .globl vec_bzero
-vec_bzero:
-#endif
-
-    mr      BC,BCz              // IU1 arg[2] is BC here, not FILL
-    li      FILL,0              // IU1 for bzero FILL=0
-#ifdef LIBMOTOVEC
-    b       memset
-#else
-    b       _vec_memset
-#endif
-
-// cacheable_memzero will employ dcbz to clear 32 bytes at a time
-// of cacheable memory. Like bzero, second entering argument will be BC.
-// Using this for non-cacheable memory will generate an alignment exception.
-
-    .text
-#ifdef __MWERKS__
-    .align 32
-#else
-    .align 5
-#endif
-
-// cacheable_memzero entry point. On entry DST (r3) = destination and
-// r4 (BCz) = byte count. Zeroes BC bytes starting at DST; DST is never
-// written, so r3 still holds the destination pointer on return.
-// Short counts (BC <= MIN_VEC) are cleared byte-by-byte; longer counts use
-// AltiVec stores for the unaligned head/tail and dcbz for the aligned
-// 32-byte blocks in between.
-// All branch targets in this routine use the c_ prefix so that control
-// never leaves it for the identically-structured memset code.
-// NOTE(review): the dcbz loop advances BK by 32 per dcbz, i.e. it assumes a
-// 32-byte cache block - confirm for the target CPU.
-#ifdef LIBMOTOVEC
-    .globl cacheable_memzero
-cacheable_memzero:
-#else
-    .globl vec_cacheable_memzero
-vec_cacheable_memzero:
-#endif
-
-    mr      BC,BCz              // IU1 arg[2] is BC here, not FILL
-    li      FILL,0              // IU1 for bzero FILL=0
-    cmpi    cr7,0,BC,MIN_VEC    // IU1 Check for minimum byte count
-
-    cmpi    cr1,0,BC,0          // IU1 Eliminate zero byte count
-
-    addi    DM1,DST,-1          // IU1 Pre-bias and duplicate destination
-    addi    DR,DST,16           // IU1 Address of second dst vector
-    add     DBC,DST,BC          // IU1 Address of last dst byte + 1
-    bgt     cr7,c_v_memset      // b if BC>MIN_VEC
-
-    mtctr   BC                  // for (i=1;i<=BC;i++)
-    beqlr   cr1                 // return if BC = 0
-c_Byte_set:
-    stbu    FILL,1(DM1)         // LSU * ++(DST-1) = FILL
-    bdnz    c_Byte_set
-
-    blr
-
-c_v_memset:
-// Byte counts <= MIN_VEC bytes will have been set by scalar code above,
-// so this will not deal with small block sets <= MIN_VEC.
-
-// For systems using VRSAVE, define VRSAVE when compiling. For systems
-// that don't, make sure VRSAVE is undefined.
-#ifdef VRSAVE
-    mfspr   RSV,VRSV            // IU2 Get current VRSAVE contents
-#endif
-    rlwinm  DR,DR,0,0,27        // IU1 (DST+16)[0:27]
-    addi    DBK,DBC,-1          // IU1 Address of last dst byte
-
-#ifdef VRSAVE
-    oris    Rt,RSV,0x8000       // IU1 Or in registers used by this routine (v0 only)
-#endif
-    subf    D,DST,DR            // IU1 How many bytes in first destination?
-    li      BK,0                // IU1 Initialize byte kount index
-
-#ifdef VRSAVE
-    mtspr   VRSV,Rt             // IU2 Save in VRSAVE before first vec op
-#endif
-    vxor    v0,v0,v0            // VIU Clear v0
-    subf    QW,DR,DBK           // IU1 Bytes of full vectors to move (-16)
-    cmpi    cr1,0,D,16          // IU1 Is D0 left justified?
-
-    mtcrf   0x01,D              // IU2 Put bytes in 1st dst in cr7
-    rlwinm  QW,QW,28,4,31       // IU1 Quad words remaining
-    beq     cr1,c_Left_just     // b if D0 is left justified
-
-    bns     cr7,c_No_B_fwd      // b if only even number of bytes to store
-
-    stvebx  v0,DST,BK           // LSU store first byte at DST+0
-    addi    BK,BK,1             // IU1 increment index
-c_No_B_fwd:
-    bne     cr7,c_No_H_fwd      // b if only words to store
-
-    stvehx  v0,DST,BK           // LSU store halfword at DST+0/1
-    addi    BK,BK,2             // IU1 increment index
-c_No_H_fwd:
-    bng     cr7,c_No_W1_fwd     // b if exactly zero or two words to store
-
-    stvewx  v0,DST,BK           // LSU store word 1 of one or three
-    addi    BK,BK,4             // IU1 increment index
-
-c_No_W1_fwd:
-    bnl     cr7,c_No_W2_fwd     // b if there was only one word to store
-    stvewx  v0,DST,BK           // LSU store word 1 of two or 2 of three
-    addi    BK,BK,4             // IU1 increment index
-
-    stvewx  v0,DST,BK           // LSU store word 2 of two or 3 of three
-    b       c_No_W2_fwd
-
-c_Left_just:
-    stvx    v0,0,DST            // LSU Store 16 bytes at D0
-c_No_W2_fwd:
-    rlwinm  Rt,DBK,0,28,31      // IU1 (DBK = DST+BC-1)[28:31]
-    cmpi    cr6,0,QW,0          // IU1 Any full vectors to move?
-
-    li      BK,16               // IU1 Re-initialize byte kount index
-    cmpi    cr1,0,Rt,0xF        // IU1 Is DN right justified?
-    ble     cr6,c_Last_QW       // b if no Quad words to do
-
-    mtctr   QW                  // IU2 for (i=0;i<=QW;i++)
-    cmpi    cr6,0,QW,4          // IU1 Check QW>4
-
-c_QW_loop:
-    stvx    v0,DST,BK           // LSU Store 16 fill bytes
-    addi    BK,BK,16            // IU1 Increment byte kount index
-    bdnzf   25,c_QW_loop        // b if 4 or less quad words to do
-
-    add     DNX,DST,BK          // IU1 address of next store (DST+32 if QW>4)
-    addi    QW,QW,-1            // IU1 One more QW stored by now
-    bgt     cr6,c_GT_4QW_fwd    // b if >4 quad words left
-
-c_Last_QW: // Next vector is the last; we're done.
-    mtcrf   0x01,DBC            // IU2 Put final vector byte count in cr7
-
-    beq     cr1,c_Rt_just_fwd   // b if last destination is right justified
-
-    rlwinm  DBK,DBK,0,0,27      // IU1 Round to QW addr of last byte
-    li      BL,0                // IU1 Initialize index pointer
-    bnl     cr7,c_Only_1W_fwd   // b if there was only one or zero words to store
-
-    stvewx  v0,DBK,BL           // LSU store word 1 of two or three
-    addi    BL,BL,4             // IU1 increment index
-
-    stvewx  v0,DBK,BL           // LSU store word 2 of two or three
-    addi    BL,BL,4             // IU1 increment index
-c_Only_1W_fwd:
-    bng     cr7,c_Only_2W_fwd   // b if there were only two or zero words to store
-
-    stvewx  v0,DBK,BL           // LSU store word 3 of three if necessary
-    addi    BL,BL,4             // IU1 increment index
-c_Only_2W_fwd:
-    bne     cr7,c_Only_B_fwd    // b if there are no half words to store
-
-    stvehx  v0,DBK,BL           // LSU store one halfword if necessary
-    addi    BL,BL,2             // IU1 increment index
-c_Only_B_fwd:
-    bns     cr7,c_All_done_fwd  // b if there are no bytes to store
-
-    stvebx  v0,DBK,BL           // LSU store one byte if necessary
-    b       c_All_done_fwd
-
-c_Rt_just_fwd:
-
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D14
-c_All_done_fwd:
-#ifdef VRSAVE
-    mtspr   VRSV,RSV            // IU1 Restore VRSAVE
-#endif
-    blr                         // Return destination address from entry
-
-#ifdef __MWERKS__
-    .align 16
-#else
-    .align 4
-#endif
-c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
-
-    addi    QW,QW,-1            // IU1 Keeping track of QWs stored
-    mtcrf   0x02,DNX            // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
-    addi    DNX,DNX,16          // IU1 Update cr6 for next loop
-
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D2
-    addi    BK,BK,16            // IU1 Increment byte count by 16
-    bdnzf   27,c_GT_4QW_fwd     // b if next store is to lower (even) half of CL
-
-    mtcrf   0x02,DBK            // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
-
-    bns     cr6,c_B32_fwd       // b if DST[27] == 0; i.e, final store is even
-
-// We need the ctr register to reflect an even byte count before entering
-// the next block - faster to decrement than to reload.
-    bdnz    c_B32_fwd           // decrement counter for last QW store odd (target is the next instr)
-
-c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
-    dcbz    DST,BK              // LSU zero whole cache line
-    bdz     c_Nxt_loc_fwd       // always decrement and branch to next instr
-
-c_Nxt_loc_fwd:
-    addi    BK,BK,32            // IU1 Increment byte count
-    bdnz    c_B32_fwd           // b if there are at least two more QWs to do
-
-    bso     cr6,c_One_even_QW   // b if there is one even and one odd QW to store
-    b       c_Last_QW           // b if last store is to even address
-
-// Come here with one more full (even) QW to store before the last vector
-c_One_even_QW:
-    stvx    v0,DST,BK           // LSU Store 16 bytes at D13
-    addi    BK,BK,16            // IU1 Increment byte count
-
-    b       c_Last_QW
-
-// End of cacheable_memzero in AltiVec