--- a/genericopenlibs/liboil/src/motovec/vec_memset.s Fri Sep 17 19:25:42 2010 +0530
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,553 +0,0 @@
-//------------------------------------------------------------------
-// file: vec_memset.S
-// AltiVec enabled version of memset and bzero and cacheable_memzero
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// Copyright Motorola, Inc. 2002
-// ALL RIGHTS RESERVED
-//
-// You are hereby granted a copyright license to use, modify, and
-// distribute the SOFTWARE so long as this entire notice is retained
-// without alteration in any modified and/or redistributed versions,
-// and that such modified versions are clearly identified as such.
-// No licenses are granted by implication, estoppel or otherwise under
-// any patents or trademarks of Motorola, Inc.
-//
-// The SOFTWARE is provided on an "AS IS" basis and without warranty.
-// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
-// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
-// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
-// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
-// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
-// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
-//
-// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
-// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
-// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
-// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
-// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
-// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
-// for the maintenance and support of the SOFTWARE.
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// extern void *memset( void *ptr, int val, size_t len );
-// Copies val into each of len characters beginning at ptr.
-// - Harbison&Steele 4th ed
-// (despite val being an int, this memset assumes it is never
-// more than a byte. That seems to be correct from all the
-// memset functions I've seen but I don't know if ANSI allows
-// anything longer. Chuck Corley 12/21/02)
-// Returns:
-// void * ptr
-//------------------------------------------------------------------
-
-//------------------------------------------------------------------
-// extern void * bzero( char *ptr, int len);
-// Copies 0 into each of len characters at ptr.
-// - Harbison&Steele 4th ed
-// Returns:
-// void * ptr
-//------------------------------------------------------------------
-
-// Revision History:
-// Rev 0.0 Original Chuck Corley 02/09/03
-// Could benefit from changes added to memcpy
-// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
-//
-// This is beta quality code; users are encouraged to make it faster.
-// ASSUMPTIONS:
-// Code is highly likely to be in the cache; data is not (streaming data)
-// Zero fill could be quite likely.
-// Moving fill byte from GPR to VR as below faster than stw->lvebx via stack
-
-#define VRSV 256 // VRSAVE spr
-// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
-#define MIN_VEC 16
-
-// Register usage (several names alias the same GPR at different times)
-#define Rt r0 // r0 when used as a temporary register
-
-#define DST r3 // entering: dest pointer; exiting: same dest pointer
-
-#define FILL r4 // entering: fill char (only the low byte is used)
-
-#define BC r5 // entering: Byte_Count
-
-#define DBC r6// dst + byte count
-
-#define BK r7 // byte kount index from dst (0, then 16, 32, ...)
-
-#define Fsh r8 // fill byte shifted right one nibble
-
-#define DM1 r9// dst - 1, pre-biased for the stbu byte loop
-#define D r9 // (dst+16)[0:27] - dst: bytes in first dst vector
-#define DNX r9 // dst + BK: address of the next vector store
-#define BL r9 // second byte_kount index pointer
-
-#define DR r10 // (dst+16)[0:27]
-#define QW r10 // number of quad words (16-byte vectors)
-
-#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[0:27]
-
-#define RSV r12 // storage for VRSAVE register if used
-
-// Condition register use (not including temporary cr0)
-// cr0[2] = (FILL==0)?
-// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
-// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
-// then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0; (DN right justified)
-// cr6[2] = (QW == 0)? 1 : 0;
-// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
-// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
-// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
-// cr7[1] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
-// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
-// then cr7[0:3] = (DST+BC)[28:31] (How many bytes (iff <16) in last vector?)
-
-// Conditionalize the use of dcba. It will help if the data is
-// not in cache and hurt if it is. Generally, except for small
-// benchmarks repeated many times, we assume data is not in cache
-// (data streaming) and using dcba is a performance boost.
-// We use dcba which will noop to non-cacheable memory rather than
-// dcbz which will cause an alignment exception.
-#ifndef NO_DCBA
-#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
- // gcc and codewarrior and diab don't assemble dcba
-#define DCBK .long 0x7c033dec
-// dcba r3,r7 or dcba DST,BK
-#else
-#ifdef __ghs__
-.macro DCBK
-.long 0x7c033dec
-.endm
-#else
-#define DCBK dcba DST,BK
-#endif // __ghs__
-#endif // __GNUC__ or __MWERKS__
-#else
-#define DCBK nop
-#endif // NO_DCBA
-
- .text
-#ifdef __MWERKS__
- .align 32
-#else
- .align 5
-#endif
-
-#ifdef LIBMOTOVEC
- .globl memset
-memset:
-#else
- .globl _vec_memset
-_vec_memset:
-#endif
-// In: DST = ptr, FILL = val (low byte used), BC = len. DST is never written, so it is returned as-is.
- cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
- cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
- rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift
-
- addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
- addi DR,DST,16 // IU1 Address of second dst vector
- add DBC,DST,BC // IU1 Address of last dst byte + 1
- bgt cr7,v_memset // b if BC>MIN_VEC
-
- mtctr BC // for (i=1;i<=BC;i++)
- beqlr cr1 // return if BC = 0
-Byte_set:
- stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
- bdnz Byte_set
-
- blr
-
-v_memset:
-// Byte count <= MIN_VEC bytes will have been set by scalar code above,
-// so this will not deal with small block sets <= MIN_VEC.
-
-// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
-// that don't, make sure VRSAVE is undefined.
-#ifdef VRSAVE
- mfspr RSV,VRSV // IU2 Get current VRSAVE contents
-#endif
- rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
- addi DBK,DBC,-1 // IU1 Address of last dst byte
-
-#ifdef VRSAVE
- oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine
-#endif
- subf D,DST,DR // IU1 How many bytes in first destination?
- li BK,0 // IU1 Initialize byte kount index
-
-#ifdef VRSAVE
- mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
-#endif
- vxor v0,v0,v0 // VIU Clear v0
- subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
- cmpi cr1,0,D,16 // IU1 Is D0 left justified?
- beq+ enter_bzero // b if FILL==0 (cr0 from the rlwinm. at entry)
-
- lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
- vspltisb v1,4 // VPU Splat 0x4 to every byte
-
- lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR
-
- vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]
-
- vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]
-
- vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
-enter_bzero:
- mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
- rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
- beq cr1,Left_just // b if D0 is left justified
-
- bns cr7,No_B_fwd // b if only even number of bytes to store
-
- stvebx v0,DST,BK // LSU store first byte at DST+0
- addi BK,BK,1 // IU1 increment index
-No_B_fwd:
- bne cr7,No_H_fwd // b if only words to store
-
- stvehx v0,DST,BK // LSU store halfword at DST+0/1
- addi BK,BK,2 // IU1 increment index
-No_H_fwd:
- bng cr7,No_W1_fwd // b if exactly zero or two words to store
-
- stvewx v0,DST,BK // LSU store word 1 of one or three
- addi BK,BK,4 // IU1 increment index
-
-No_W1_fwd:
- bnl cr7,No_W2_fwd // b if there was only one word to store
- stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
- addi BK,BK,4 // IU1 increment index
-
- stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
- b No_W2_fwd
-
-Left_just:
- stvx v0,0,DST // LSU Store 16 bytes at D0
-No_W2_fwd:
- rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
- cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
-
- li BK,16 // IU1 Re-initialize byte kount index
- cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
- ble cr6,Last_QW // b if no Quad words to do
-
- mtctr QW // IU2 for (i=0;i<=QW;i++)
- cmpi cr6,0,QW,4 // IU1 Check QW>4
-
-QW_loop:
- stvx v0,DST,BK // LSU Store 16 fill bytes
- addi BK,BK,16 // IU1 Increment byte kount index
- bdnzf 25,QW_loop // b if 4 or less quad words to do
-
- add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
- addi QW,QW,-1 // IU1 One more QW stored by now
- bgt cr6,GT_4QW_fwd // b if >4 quad words left
-
-Last_QW: // Next vector is the last; we're done.
- mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
-
- beq cr1,Rt_just_fwd // b if last destination is right justified
-
- rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
- li BL,0 // IU1 Initialize index pointer
- bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
-
- stvewx v0,DBK,BL // LSU store word 1 of two or three
- addi BL,BL,4 // IU1 increment index
-
- stvewx v0,DBK,BL // LSU store word 2 of two or three
- addi BL,BL,4 // IU1 increment index
-Only_1W_fwd:
- bng cr7,Only_2W_fwd // b if there were only two or zero words to store
-
- stvewx v0,DBK,BL // LSU store word 3 of three if necessary
- addi BL,BL,4 // IU1 increment index
-Only_2W_fwd:
- bne cr7,Only_B_fwd // b if there are no half words to store
-
- stvehx v0,DBK,BL // LSU store one halfword if necessary
- addi BL,BL,2 // IU1 increment index
-Only_B_fwd:
- bns cr7,All_done_fwd // b if there are no bytes to store
-
- stvebx v0,DBK,BL // LSU store one byte if necessary
- b All_done_fwd
-
-Rt_just_fwd:
-
- stvx v0,DST,BK // LSU Store 16 bytes at D14
-All_done_fwd:
-#ifdef VRSAVE
- mtspr VRSV,RSV // IU1 Restore VRSAVE
-#endif
- blr // Return destination address from entry
-
-#ifdef __MWERKS__
- .align 16
-#else
- .align 4
-#endif
-GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
-
- addi QW,QW,-1 // IU1 Keeping track of QWs stored
- mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
- addi DNX,DNX,16 // IU1 Update cr6 for next loop
-
- stvx v0,DST,BK // LSU Store 16 bytes at D2
- addi BK,BK,16 // IU1 Increment byte count by 16
- bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
-
- mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
-
- bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even
-
-// We need the ctr register to reflect an even byte count before entering
-// the next block - faster to decrement than to reload.
- bdnz B32_fwd // decrement counter for last QW store odd
-
-B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
- DCBK // LSU then Kill instead of RWITM
-
- stvx v0,DST,BK // LSU Store 16 bytes at D11
- addi BK,BK,16 // IU1 Increment byte count
- bdz Nxt_loc_fwd // always decrement and branch to next instr
-
-Nxt_loc_fwd:
- stvx v0,DST,BK // LSU Store 16 bytes at D12
- addi BK,BK,16 // IU1 Increment byte count
- bdnz B32_fwd // b if there are at least two more QWs to do
-
- bso cr6,One_even_QW // b if there is one even and one odd QW to store
- b Last_QW // b if last store is to even address
-
-// Come here with one full (even) QW still to store before the last vector
-One_even_QW:
- stvx v0,DST,BK // LSU Store 16 bytes at D13
- addi BK,BK,16 // IU1 Increment byte count
-
- b Last_QW
-
-// End of memset in AltiVec
-
-#define BCz r4 // in bzero r4 enters with byte count
-
-#ifdef __MWERKS__
- .align 32
-#else
- .align 5
-#endif
-
-#ifdef LIBMOTOVEC
- .globl bzero
-bzero:
-#else
- .globl vec_bzero
-vec_bzero:
-#endif
-// In: DST = ptr, BCz = len. Moves len into BC, zeroes FILL, then branches into memset.
- mr BC,BCz // IU1 second arg is the byte count here, not FILL; copy before FILL is cleared
- li FILL,0 // IU1 for bzero FILL=0
-#ifdef LIBMOTOVEC
- b memset
-#else
- b _vec_memset
-#endif
-
-// cacheable_memzero will employ dcbz to clear 32 bytes at a time
-// of cacheable memory. Like bzero, second entering argument will be BC.
-// Using this for non-cacheable memory will generate an alignment exception.
-// The dcbz loop advances BK by 32 per dcbz, i.e. it assumes 32-byte cache lines.
- .text
-#ifdef __MWERKS__
- .align 32
-#else
- .align 5
-#endif
-
-#ifdef LIBMOTOVEC
- .globl cacheable_memzero
-cacheable_memzero:
-#else
- .globl vec_cacheable_memzero
-vec_cacheable_memzero:
-#endif
-// In: DST = ptr, BCz = len. DST is never written, so it is returned as-is. All branches stay on c_ labels.
- mr BC,BCz // IU1 arg[2] is BC here, not FILL
- li FILL,0 // IU1 for bzero FILL=0
- cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
-
- cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
-
- addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
- addi DR,DST,16 // IU1 Address of second dst vector
- add DBC,DST,BC // IU1 Address of last dst byte + 1
- bgt cr7,c_v_memset // b if BC>MIN_VEC
-
- mtctr BC // for (i=1;i<=BC;i++)
- beqlr cr1 // return if BC = 0
-c_Byte_set:
- stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
- bdnz c_Byte_set
-
- blr
-
-c_v_memset:
-// Byte count <= MIN_VEC bytes will have been set by scalar code above,
-// so this will not deal with small block sets <= MIN_VEC.
-
-// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
-// that don't, make sure VRSAVE is undefined.
-#ifdef VRSAVE
- mfspr RSV,VRSV // IU2 Get current VRSAVE contents
-#endif
- rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
- addi DBK,DBC,-1 // IU1 Address of last dst byte
-
-#ifdef VRSAVE
- oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
-#endif
- subf D,DST,DR // IU1 How many bytes in first destination?
- li BK,0 // IU1 Initialize byte kount index
-
-#ifdef VRSAVE
- mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
-#endif
- vxor v0,v0,v0 // VIU Clear v0
- subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
- cmpi cr1,0,D,16 // IU1 Is D0 left justified?
-
- mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
- rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
- beq cr1,c_Left_just // b if D0 is left justified
-
- bns cr7,c_No_B_fwd // b if only even number of bytes to store
-
- stvebx v0,DST,BK // LSU store first byte at DST+0
- addi BK,BK,1 // IU1 increment index
-c_No_B_fwd:
- bne cr7,c_No_H_fwd // b if only words to store
-
- stvehx v0,DST,BK // LSU store halfword at DST+0/1
- addi BK,BK,2 // IU1 increment index
-c_No_H_fwd:
- bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
-
- stvewx v0,DST,BK // LSU store word 1 of one or three
- addi BK,BK,4 // IU1 increment index
-
-c_No_W1_fwd:
- bnl cr7,c_No_W2_fwd // b if there was only one word to store
- stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
- addi BK,BK,4 // IU1 increment index
-
- stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
- b c_No_W2_fwd
-
-c_Left_just:
- stvx v0,0,DST // LSU Store 16 bytes at D0
-c_No_W2_fwd:
- rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
- cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
-
- li BK,16 // IU1 Re-initialize byte kount index
- cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
- ble cr6,c_Last_QW // b if no Quad words to do
-
- mtctr QW // IU2 for (i=0;i<=QW;i++)
- cmpi cr6,0,QW,4 // IU1 Check QW>4
-
-c_QW_loop:
- stvx v0,DST,BK // LSU Store 16 fill bytes
- addi BK,BK,16 // IU1 Increment byte kount index
- bdnzf 25,c_QW_loop // b if 4 or less quad words to do
-
- add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
- addi QW,QW,-1 // IU1 One more QW stored by now
- bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
-
-c_Last_QW: // Next vector is the last; we're done.
- mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
-
- beq cr1,c_Rt_just_fwd // b if last destination is right justified
-
- rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
- li BL,0 // IU1 Initialize index pointer
- bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
-
- stvewx v0,DBK,BL // LSU store word 1 of two or three
- addi BL,BL,4 // IU1 increment index
-
- stvewx v0,DBK,BL // LSU store word 2 of two or three
- addi BL,BL,4 // IU1 increment index
-c_Only_1W_fwd:
- bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store
-
- stvewx v0,DBK,BL // LSU store word 3 of three if necessary
- addi BL,BL,4 // IU1 increment index
-c_Only_2W_fwd:
- bne cr7,c_Only_B_fwd // b if there are no half words to store
-
- stvehx v0,DBK,BL // LSU store one halfword if necessary
- addi BL,BL,2 // IU1 increment index
-c_Only_B_fwd:
- bns cr7,c_All_done_fwd // b if there are no bytes to store
-
- stvebx v0,DBK,BL // LSU store one byte if necessary
- b c_All_done_fwd
-
-c_Rt_just_fwd:
-
- stvx v0,DST,BK // LSU Store 16 bytes at D14
-c_All_done_fwd:
-#ifdef VRSAVE
- mtspr VRSV,RSV // IU1 Restore VRSAVE
-#endif
- blr // Return destination address from entry
-
-#ifdef __MWERKS__
- .align 16
-#else
- .align 4
-#endif
-c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
-
- addi QW,QW,-1 // IU1 Keeping track of QWs stored
- mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
- addi DNX,DNX,16 // IU1 Update cr6 for next loop
-
- stvx v0,DST,BK // LSU Store 16 bytes at D2
- addi BK,BK,16 // IU1 Increment byte count by 16
- bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
-
- mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
-
- bns cr6,c_B32_fwd // b if DST[27] == 0; i.e, final store is even
-
-// We need the ctr register to reflect an even byte count before entering
-// the next block - faster to decrement than to reload.
- bdnz c_B32_fwd // decrement counter for last QW store odd
-
-c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
- dcbz DST,BK // LSU zero whole cache line
- bdz c_Nxt_loc_fwd // always decrement and branch to next instr
-
-c_Nxt_loc_fwd:
- addi BK,BK,32 // IU1 Increment byte count
- bdnz c_B32_fwd // b if there are at least two more QWs to do
-
- bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
- b c_Last_QW // b if last store is to even address
-
-// Come here with one full (even) QW still to store before the last vector
-c_One_even_QW:
- stvx v0,DST,BK // LSU Store 16 bytes at D13
- addi BK,BK,16 // IU1 Increment byte count
-
- b c_Last_QW
-
-// End of cacheable_memzero in AltiVec