 b6cbf7203b
			
		
	
	
		b6cbf7203b
		
	
	
	
	
		
			
			This patch imports the unmodified current version of NetBSD libc. The NetBSD includes are in /nbsd_include, while the libc code itself is split between lib/nbsd_libc and common/lib/libc.
		
			
				
	
	
		
			1959 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			1959 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*	$NetBSD: memcpy.S,v 1.2 2001/08/01 05:52:12 eeh Exp $	*/
 | |
| 
 | |
| /*
 | |
|  * Copyright (c) 2001	Eduardo E. Horvath
 | |
|  *
 | |
|  * This software was developed by the Computer Systems Engineering group
 | |
|  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 | |
|  * contributed to Berkeley.
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions
 | |
|  * are met:
 | |
|  * 1. Redistributions of source code must retain the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer.
 | |
|  * 2. Redistributions in binary form must reproduce the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer in the
 | |
|  *    documentation and/or other materials provided with the distribution.
 | |
|  * 3. All advertising materials mentioning features or use of this software
 | |
|  *    must display the following acknowledgement:
 | |
|  *	This product includes software developed by the University of
 | |
|  *	California, Berkeley and its contributors.
 | |
|  * 4. Neither the name of the University nor the names of its contributors
 | |
|  *    may be used to endorse or promote products derived from this software
 | |
|  *    without specific prior written permission.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 | |
|  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
|  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 | |
|  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 | |
|  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 | |
|  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 | |
|  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 | |
|  * SUCH DAMAGE.
 | |
|  *
 | |
|  */
 | |
| 
 | |
| #include <machine/asm.h>
 | |
| #ifndef _LOCORE
 | |
| #define _LOCORE
 | |
| #endif
 | |
| #include <machine/ctlreg.h>
 | |
| #include <machine/frame.h>
 | |
| #include <machine/psl.h>
 | |
| 
 | |
| #if defined(LIBC_SCCS) && !defined(lint)
 | |
| 	RCSID("$NetBSD: memcpy.S,v 1.2 2001/08/01 05:52:12 eeh Exp $")
 | |
| #endif  /* LIBC_SCCS and not lint */
 | |
| 
 | |
| #define	EMPTY	nop
 | |
| #define	NOTREACHED	ta	1
 | |
| 
 | |
| #define	BCOPY_SMALL	16
 | |
| #define	BLOCK_SIZE	64
 | |
| 
 | |
| #if 0
 | |
| #define ASI_STORE	ASI_BLK_COMMIT_P
 | |
| #else
 | |
| #define ASI_STORE	ASI_BLK_P
 | |
| #endif
 | |
| 
 | |
| #if 1
 | |
| /*
 | |
|  * kernel bcopy/memcpy
 | |
|  * Assumes regions do not overlap; has no useful return value.
 | |
|  *
 | |
|  * Must not use %g7 (see copyin/copyout above).
 | |
|  */
 | |
| ENTRY(memcpy) /* dest, src, size */
 | |
| 	/*
 | |
| 	 * Swap args for bcopy.  Gcc generates calls to memcpy for
 | |
| 	 * structure assignments.
 | |
| 	 */
 | |
| 	mov	%o0, %o3
 | |
| 	mov	%o1, %o0
 | |
| 	mov	%o3, %o1
 | |
| #endif
 | |
| ENTRY(bcopy) /* src, dest, size */
 | |
| #ifdef DEBUG
 | |
| 	set	pmapdebug, %o4
 | |
| 	ld	[%o4], %o4
 | |
| 	btst	0x80, %o4	! PDB_COPY
 | |
| 	bz,pt	%icc, 3f
 | |
| 	 nop
 | |
| 	save	%sp, -CC64FSZ, %sp
 | |
| 	mov	%i0, %o1
 | |
| 	set	2f, %o0
 | |
| 	mov	%i1, %o2
 | |
| 	call	printf
 | |
| 	 mov	%i2, %o3
 | |
| !	ta	1; nop
 | |
| 	restore
 | |
| 	.data
 | |
| 2:	.asciz	"bcopy(%p->%p,%x)\n"
 | |
| 	_ALIGN
 | |
| 	.text
 | |
| 3:
 | |
| #endif
 | |
| 	/*
 | |
| 	 * Check for overlaps and punt.
 | |
| 	 *
 | |
| 	 * If src <= dest <= src+len we have a problem.
 | |
| 	 */
 | |
| 
 | |
| 	sub	%o1, %o0, %o3
 | |
| 
 | |
| 	cmp	%o3, %o2
 | |
| 	blu,pn	%xcc, Lovbcopy
 | |
| 	 cmp	%o2, BCOPY_SMALL
 | |
| Lbcopy_start:
 | |
| 	bge,pt	%xcc, 2f	! if >= this many, go be fancy.
 | |
| 	 cmp	%o2, 256
 | |
| 
 | |
| 	mov	%o1, %o5	! Save memcpy return value
 | |
| 	/*
 | |
| 	 * Not much to copy, just do it a byte at a time.
 | |
| 	 */
 | |
| 	deccc	%o2		! while (--len >= 0)
 | |
| 	bl	1f
 | |
| 	 EMPTY
 | |
| 0:
 | |
| 	inc	%o0
 | |
| 	ldsb	[%o0 - 1], %o4	!	(++dst)[-1] = *src++;
 | |
| 	stb	%o4, [%o1]
 | |
| 	deccc	%o2
 | |
| 	bge	0b
 | |
| 	 inc	%o1
 | |
| 1:
 | |
| 	retl
 | |
| 	 mov	%o5, %o0
 | |
| 	NOTREACHED
 | |
| 
 | |
| 	/*
 | |
| 	 * Overlapping bcopies -- punt.
 | |
| 	 */
 | |
| Lovbcopy:
 | |
| 
 | |
| 	/*
 | |
| 	 * Backward, byte-at-a-time copy for overlapping regions.
 | |
| 	 *
 | |
| 	 * Since src comes before dst, and the regions might overlap,
 | |
| 	 * we have to do the copy starting at the end and working backwards.
 | |
| 	 *
 | |
| 	 * We could optimize this, but it almost never happens.
 | |
| 	 *
 | |
| 	 * In:      %o0 = src, %o1 = dst, %o2 = len
 | |
| 	 * Out:     %o0 = original dst (kept in %o5 while the loop runs)
 | |
| 	 * Scratch: %o4 (the byte in flight)
 | |
| 	 */
 | |
| 	mov	%o1, %o5	! Retval
 | |
| 	add	%o2, %o0, %o0	! src += len
 | |
| 	add	%o2, %o1, %o1	! dst += len
 | |
| 	
 | |
| 	deccc	%o2		! --len; negative result means len was 0
 | |
| 	bl,pn	%xcc, 1f	! nothing to copy
 | |
| 	 dec	%o0		! (delay slot) src now points at the last source byte
 | |
| 0:
 | |
| 	dec	%o1		! --dst
 | |
| 	ldsb	[%o0], %o4	! fetch the byte at src
 | |
| 	dec	%o0		! --src
 | |
| 	
 | |
| 	deccc	%o2		! --len
 | |
| 	bge,pt	%xcc, 0b	! keep going while len >= 0
 | |
| 	 stb	%o4, [%o1]	! (delay slot) store the byte at dst
 | |
| 1:
 | |
| 	retl
 | |
| 	 mov	%o5, %o0	! (delay slot) return the original dst
 | |
| 
 | |
| 	/*
 | |
| 	 * Plenty of data to copy, so try to do it optimally.
 | |
| 	 */
 | |
| 2:
 | |
| #if 1
 | |
| 	! If it is big enough, use VIS instructions
 | |
| 	bge	Lbcopy_block
 | |
| 	 nop
 | |
| #endif
 | |
| Lbcopy_fancy:
 | |
| 
 | |
| 	!!
 | |
| 	!! First align the output to a 8-byte entity
 | |
| 	!! 
 | |
| 
 | |
| 	save	%sp, -CC64FSZ, %sp
 | |
| 	
 | |
| 	mov	%i0, %o0
 | |
| 	mov	%i1, %o1
 | |
| 	
 | |
| 	mov	%i2, %o2
 | |
| 	btst	1, %o1
 | |
| 	
 | |
| 	bz,pt	%icc, 4f
 | |
| 	 btst	2, %o1
 | |
| 	ldub	[%o0], %o4				! Load 1st byte
 | |
| 	
 | |
| 	deccc	1, %o2
 | |
| 	ble,pn	%xcc, Lbcopy_finish			! XXXX
 | |
| 	 inc	1, %o0
 | |
| 	
 | |
| 	stb	%o4, [%o1]				! Store 1st byte
 | |
| 	inc	1, %o1					! Update address
 | |
| 	btst	2, %o1
 | |
| 4:	
 | |
| 	bz,pt	%icc, 4f
 | |
| 	
 | |
| 	 btst	1, %o0
 | |
| 	bz,a	1f
 | |
| 	 lduh	[%o0], %o4				! Load short
 | |
| 
 | |
| 	ldub	[%o0], %o4				! Load bytes
 | |
| 	
 | |
| 	ldub	[%o0+1], %o3
 | |
| 	sllx	%o4, 8, %o4
 | |
| 	or	%o3, %o4, %o4
 | |
| 	
 | |
| 1:	
 | |
| 	deccc	2, %o2
 | |
| 	ble,pn	%xcc, Lbcopy_finish			! XXXX
 | |
| 	 inc	2, %o0
 | |
| 	sth	%o4, [%o1]				! Store 1st short
 | |
| 	
 | |
| 	inc	2, %o1
 | |
| 4:
 | |
| 	btst	4, %o1
 | |
| 	bz,pt	%xcc, 4f
 | |
| 	
 | |
| 	 btst	3, %o0
 | |
| 	bz,a,pt	%xcc, 1f
 | |
| 	 lduw	[%o0], %o4				! Load word -1
 | |
| 
 | |
| 	btst	1, %o0
 | |
| 	bz,a,pt	%icc, 2f
 | |
| 	 lduh	[%o0], %o4
 | |
| 	
 | |
| 	ldub	[%o0], %o4
 | |
| 	
 | |
| 	lduh	[%o0+1], %o3
 | |
| 	sllx	%o4, 16, %o4
 | |
| 	or	%o4, %o3, %o4
 | |
| 	
 | |
| 	ldub	[%o0+3], %o3
 | |
| 	sllx	%o4, 8, %o4
 | |
| 	ba,pt	%icc, 1f
 | |
| 	 or	%o4, %o3, %o4
 | |
| 	
 | |
| 2:
 | |
| 	lduh	[%o0+2], %o3
 | |
| 	sllx	%o4, 16, %o4
 | |
| 	or	%o4, %o3, %o4
 | |
| 	
 | |
| 1:	
 | |
| 	deccc	4, %o2
 | |
| 	ble,pn	%xcc, Lbcopy_finish		! XXXX
 | |
| 	 inc	4, %o0
 | |
| 	
 | |
| 	st	%o4, [%o1]				! Store word
 | |
| 	inc	4, %o1
 | |
| 4:
 | |
| 	!!
 | |
| 	!! We are now 64-bit aligned in the dest.
 | |
| 	!!
 | |
| Lbcopy__common:	
 | |
| 
 | |
| 	and	%o0, 7, %o4				! Shift amount
 | |
| 	andn	%o0, 7, %o0				! Source addr
 | |
| 	
 | |
| 	brz,pt	%o4, Lbcopy_noshift8			! No shift version...
 | |
| 
 | |
| 	 sllx	%o4, 3, %o4				! In bits
 | |
| 	mov	8<<3, %o3
 | |
| 	
 | |
| 	ldx	[%o0], %l0				! Load word -1
 | |
| 	sub	%o3, %o4, %o3				! Reverse shift
 | |
| 	deccc	16*8, %o2				! Have enough room?
 | |
| 	
 | |
| 	sllx	%l0, %o4, %l0
 | |
| 	bl,pn	%xcc, 2f
 | |
| 	 and	%o3, 0x38, %o3
 | |
| Lbcopy_unrolled8:
 | |
| 
 | |
| 	/*
 | |
| 	 * This is about as close to optimal as you can get, since
 | |
| 	 * the shifts require EU0 and cannot be paired, and you have
 | |
| 	 * 3 dependent operations on the data.
 | |
| 	 */ 
 | |
| 
 | |
| !	ldx	[%o0+0*8], %l0				! Already done
 | |
| !	sllx	%l0, %o4, %l0				! Already done
 | |
| 	ldx	[%o0+1*8], %l1
 | |
| 	ldx	[%o0+2*8], %l2
 | |
| 	ldx	[%o0+3*8], %l3
 | |
| 	ldx	[%o0+4*8], %l4
 | |
| 	ldx	[%o0+5*8], %l5
 | |
| 	ldx	[%o0+6*8], %l6
 | |
| #if 1
 | |
| 	ba,pt	%icc, 1f
 | |
| 	 ldx	[%o0+7*8], %l7
 | |
| 	.align	8
 | |
| 1:
 | |
| 	srlx	%l1, %o3, %g1
 | |
| 	inc	8*8, %o0
 | |
| 	
 | |
| 	sllx	%l1, %o4, %l1
 | |
| 	or	%g1, %l0, %o5
 | |
| 	ldx	[%o0+0*8], %l0
 | |
| 	
 | |
| 	stx	%o5, [%o1+0*8]
 | |
| 	srlx	%l2, %o3, %g1
 | |
| 
 | |
| 	sllx	%l2, %o4, %l2
 | |
| 	or	%g1, %l1, %o5
 | |
| 	ldx	[%o0+1*8], %l1
 | |
| 	
 | |
| 	stx	%o5, [%o1+1*8]
 | |
| 	srlx	%l3, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l3, %o4, %l3
 | |
| 	or	%g1, %l2, %o5
 | |
| 	ldx	[%o0+2*8], %l2
 | |
| 	
 | |
| 	stx	%o5, [%o1+2*8]
 | |
| 	srlx	%l4, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l4, %o4, %l4	
 | |
| 	or	%g1, %l3, %o5
 | |
| 	ldx	[%o0+3*8], %l3
 | |
| 	
 | |
| 	stx	%o5, [%o1+3*8]
 | |
| 	srlx	%l5, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l5, %o4, %l5
 | |
| 	or	%g1, %l4, %o5
 | |
| 	ldx	[%o0+4*8], %l4
 | |
| 	
 | |
| 	stx	%o5, [%o1+4*8]
 | |
| 	srlx	%l6, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l6, %o4, %l6
 | |
| 	or	%g1, %l5, %o5
 | |
| 	ldx	[%o0+5*8], %l5
 | |
| 	
 | |
| 	stx	%o5, [%o1+5*8]
 | |
| 	srlx	%l7, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l7, %o4, %l7
 | |
| 	or	%g1, %l6, %o5
 | |
| 	ldx	[%o0+6*8], %l6
 | |
| 	
 | |
| 	stx	%o5, [%o1+6*8]
 | |
| 	srlx	%l0, %o3, %g1
 | |
| 	deccc	8*8, %o2				! Have enough room?
 | |
| 	
 | |
| 	sllx	%l0, %o4, %l0				! Next loop
 | |
| 	or	%g1, %l7, %o5
 | |
| 	ldx	[%o0+7*8], %l7
 | |
| 	
 | |
| 	stx	%o5, [%o1+7*8]
 | |
| 	bge,pt	%xcc, 1b
 | |
| 	 inc	8*8, %o1
 | |
| 
 | |
| Lbcopy_unrolled8_cleanup:	
 | |
| 	!!
 | |
| 	!! Finished 8 byte block, unload the regs.
 | |
| 	!! 
 | |
| 	srlx	%l1, %o3, %g1
 | |
| 	inc	7*8, %o0
 | |
| 	
 | |
| 	sllx	%l1, %o4, %l1
 | |
| 	or	%g1, %l0, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+0*8]
 | |
| 	srlx	%l2, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l2, %o4, %l2
 | |
| 	or	%g1, %l1, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+1*8]
 | |
| 	srlx	%l3, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l3, %o4, %l3
 | |
| 	or	%g1, %l2, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+2*8]
 | |
| 	srlx	%l4, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l4, %o4, %l4	
 | |
| 	or	%g1, %l3, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+3*8]
 | |
| 	srlx	%l5, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l5, %o4, %l5
 | |
| 	or	%g1, %l4, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+4*8]
 | |
| 	srlx	%l6, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l6, %o4, %l6
 | |
| 	or	%g1, %l5, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+5*8]
 | |
| 	srlx	%l7, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l7, %o4, %l7
 | |
| 	or	%g1, %l6, %o5
 | |
| 		
 | |
| 	stx	%o5, [%o1+6*8]
 | |
| 	inc	7*8, %o1
 | |
| 	
 | |
| 	mov	%l7, %l0				! Save our unused data
 | |
| 	dec	7*8, %o2
 | |
| #else
 | |
| 	/*
 | |
| 	 * This version also handles aligned copies at almost the
 | |
| 	 * same speed.  It should take the same number of cycles
 | |
| 	 * as the previous version, but is slightly slower, probably
 | |
| 	 * due to i$ issues.
 | |
| 	 */
 | |
| 	ldx	[%o0+7*8], %l7
 | |
| 	ba,pt	%icc, 1f
 | |
| 	 clr	%g1
 | |
| 	.align 32
 | |
| 1:
 | |
| 	srlx	%l1, %o3, %g1
 | |
| 	bz,pn	%xcc, 3f
 | |
| 	 inc	8*8, %o0
 | |
| 
 | |
| 	sllx	%l1, %o4, %l1
 | |
| 	or	%g1, %l0, %o5
 | |
| 	ba,pt	%icc, 4f
 | |
| 	ldx	[%o0+0*8], %l0
 | |
| 	
 | |
| 	nop
 | |
| 3:
 | |
| 	mov	%l0, %o5
 | |
| 	ldx	[%o0+0*8], %l0
 | |
| 	
 | |
| 4:	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+0*8]
 | |
| 	srlx	%l2, %o3, %g1
 | |
| 
 | |
| 	sllx	%l2, %o4, %l2
 | |
| 3:	
 | |
| 	or	%g1, %l1, %o5
 | |
| 	ldx	[%o0+1*8], %l1
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+1*8]
 | |
| 	srlx	%l3, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l3, %o4, %l3
 | |
| 3:	
 | |
| 	or	%g1, %l2, %o5
 | |
| 	ldx	[%o0+2*8], %l2
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+2*8]
 | |
| 	srlx	%l4, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l4, %o4, %l4
 | |
| 3:	
 | |
| 	or	%g1, %l3, %o5
 | |
| 	ldx	[%o0+3*8], %l3
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+3*8]
 | |
| 	srlx	%l5, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l5, %o4, %l5
 | |
| 3:	
 | |
| 	or	%g1, %l4, %o5
 | |
| 	ldx	[%o0+4*8], %l4
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+4*8]
 | |
| 	srlx	%l6, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l6, %o4, %l6
 | |
| 3:	
 | |
| 	or	%g1, %l5, %o5
 | |
| 	ldx	[%o0+5*8], %l5
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+5*8]
 | |
| 	srlx	%l7, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l7, %o4, %l7
 | |
| 3:	
 | |
| 	or	%g1, %l6, %o5
 | |
| 	ldx	[%o0+6*8], %l6
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+6*8]
 | |
| 	srlx	%l0, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l0, %o4, %l0				! Next loop
 | |
| 3:	
 | |
| 	or	%g1, %l7, %o5
 | |
| 	ldx	[%o0+7*8], %l7
 | |
| 	deccc	8*8, %o2				! Have enough room?
 | |
| 	
 | |
| 	stx	%o5, [%o1+7*8]
 | |
| 	inc	8*8, %o1
 | |
| 	bge,pt	%xcc, 1b
 | |
| 	 tst	%o4
 | |
| 
 | |
| 
 | |
| 	!!
 | |
| 	!! Now unload all those regs
 | |
| 	!! 
 | |
| Lbcopy_unrolled8_cleanup:	
 | |
| 	srlx	%l1, %o3, %g1
 | |
| 	bz,pn	%xcc, 3f
 | |
| 	 inc	7*8, %o0				! Point at the last load
 | |
| 
 | |
| 	sllx	%l1, %o4, %l1
 | |
| 	ba,pt	%icc, 4f
 | |
| 	 or	%g1, %l0, %o5
 | |
| 	
 | |
| 3:
 | |
| 	mov	%l0, %o5
 | |
| 	
 | |
| 4:	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+0*8]
 | |
| 	srlx	%l2, %o3, %g1
 | |
| 
 | |
| 	sllx	%l2, %o4, %l2
 | |
| 3:	
 | |
| 	or	%g1, %l1, %o5
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+1*8]
 | |
| 	srlx	%l3, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l3, %o4, %l3
 | |
| 3:	
 | |
| 	or	%g1, %l2, %o5
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+2*8]
 | |
| 	srlx	%l4, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l4, %o4, %l4
 | |
| 3:	
 | |
| 	or	%g1, %l3, %o5
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+3*8]
 | |
| 	srlx	%l5, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l5, %o4, %l5
 | |
| 3:	
 | |
| 	or	%g1, %l4, %o5
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+4*8]
 | |
| 	srlx	%l6, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l6, %o4, %l6
 | |
| 3:	
 | |
| 	or	%g1, %l5, %o5
 | |
| 	
 | |
| 	bz,pn	%icc, 3f
 | |
| 	stx	%o5, [%o1+5*8]
 | |
| 	srlx	%l7, %o3, %g1
 | |
| 	
 | |
| 	sllx	%l7, %o4, %l7
 | |
| 3:	
 | |
| 	or	%g1, %l6, %o5
 | |
| 	mov	%l7, %l0				! Shuffle to %l0
 | |
| 	
 | |
| 	stx	%o5, [%o1+6*8]
 | |
| 	or	%g1, %l7, %o5
 | |
| 	dec	7*8, %o2
 | |
| 	
 | |
| 	inc	7*8, %o1				! Point at last store
 | |
| #endif
 | |
| 2:
 | |
| 	inccc	16*8, %o2
 | |
| 	bz,pn	%icc, Lbcopy_complete
 | |
| 	
 | |
| 	!! Unrolled 8 times
 | |
| Lbcopy_aligned8:	
 | |
| !	ldx	[%o0], %l0				! Already done
 | |
| !	sllx	%l0, %o4, %l0				! Shift high word
 | |
| 	
 | |
| 	 deccc	8, %o2					! Pre-decrement
 | |
| 	bl,pn	%xcc, Lbcopy_finish
 | |
| 1:
 | |
| 	ldx	[%o0+8], %l1				! Load word 0
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	srlx	%l1, %o3, %o5
 | |
| 	or	%o5, %l0, %o5				! Combine
 | |
| 	
 | |
| 	stx	%o5, [%o1]				! Store result
 | |
| 	 inc	8, %o1
 | |
| 	
 | |
| 	deccc	8, %o2
 | |
| 	bge,pn	%xcc, 1b
 | |
| 	 sllx	%l1, %o4, %l0	
 | |
| 
 | |
| 	btst	7, %o2					! Done?
 | |
| 	bz,pt	%xcc, Lbcopy_complete
 | |
| 
 | |
| 	!!
 | |
| 	!! Loadup the last dregs into %l0 and shift it into place
 | |
| 	!! 
 | |
| 	 srlx	%o3, 3, %o5				! # bytes in %l0
 | |
| 	dec	8, %o5					!  - 8
 | |
| 	!! n-8 - (by - 8) -> n - by
 | |
| 	subcc	%o2, %o5, %g0				! # bytes we need
 | |
| 	ble,pt	%icc, Lbcopy_finish
 | |
| 	 nop
 | |
| 	ldx	[%o0+8], %l1				! Need another word
 | |
| 	srlx	%l1, %o3, %l1
 | |
| 	ba,pt	%icc, Lbcopy_finish
 | |
| 	 or	%l0, %l1, %l0				! All loaded up.
 | |
| 	
 | |
| Lbcopy_noshift8:
 | |
| 	deccc	8*8, %o2				! Have enough room?
 | |
| 	bl,pn	%xcc, 2f
 | |
| 	 nop
 | |
| 	ba,pt	%icc, 1f
 | |
| 	 nop
 | |
| 	.align	32
 | |
| 1:	
 | |
| 	ldx	[%o0+0*8], %l0
 | |
| 	ldx	[%o0+1*8], %l1
 | |
| 	ldx	[%o0+2*8], %l2
 | |
| 	ldx	[%o0+3*8], %l3
 | |
| 	stx	%l0, [%o1+0*8]
 | |
| 	stx	%l1, [%o1+1*8]
 | |
| 	stx	%l2, [%o1+2*8]
 | |
| 	stx	%l3, [%o1+3*8]
 | |
| 
 | |
| 	
 | |
| 	ldx	[%o0+4*8], %l4
 | |
| 	ldx	[%o0+5*8], %l5
 | |
| 	ldx	[%o0+6*8], %l6
 | |
| 	ldx	[%o0+7*8], %l7
 | |
| 	inc	8*8, %o0
 | |
| 	stx	%l4, [%o1+4*8]
 | |
| 	stx	%l5, [%o1+5*8]
 | |
| 	deccc	8*8, %o2
 | |
| 	stx	%l6, [%o1+6*8]
 | |
| 	stx	%l7, [%o1+7*8]
 | |
| 	stx	%l2, [%o1+2*8]
 | |
| 	bge,pt	%xcc, 1b
 | |
| 	 inc	8*8, %o1
 | |
| 2:
 | |
| 	inc	8*8, %o2
 | |
| 1:	
 | |
| 	deccc	8, %o2
 | |
| 	bl,pn	%icc, 1f				! < 0 --> sub word
 | |
| 	 nop
 | |
| 	ldx	[%o0], %o5
 | |
| 	inc	8, %o0
 | |
| 	stx	%o5, [%o1]
 | |
| 	bg,pt	%icc, 1b				! Exactly 0 --> done
 | |
| 	 inc	8, %o1
 | |
| 1:
 | |
| 	btst	7, %o2					! Done?
 | |
| 	bz,pt	%xcc, Lbcopy_complete
 | |
| 	 clr	%o4
 | |
| 	ldx	[%o0], %l0
 | |
| Lbcopy_finish:
 | |
| 	! Store the tail: the data is in %l0, most-significant bytes first; %o2 says how much is left; %o1 = dst.
 | |
| 	brz,pn	%o2, 2f					! 100% complete?
 | |
| 	 cmp	%o2, 8					! Exactly 8 bytes?
 | |
| 	bz,a,pn	%xcc, 2f				! Yes: annulled delay slot runs only when taken
 | |
| 	 stx	%l0, [%o1]				! Store all of %l0 and finish
 | |
| 
 | |
| 	btst	4, %o2					! Word store?
 | |
| 	bz	%xcc, 1f				! (count & 4) == 0: skip the word store
 | |
| 	 srlx	%l0, 32, %o5				! Shift high word down
 | |
| 	stw	%o5, [%o1]				! Store word
 | |
| 	inc	4, %o1
 | |
| 	mov	%l0, %o5				! Operate on the low bits
 | |
| 1:
 | |
| 	btst	2, %o2					! Short store?
 | |
| 	mov	%o5, %l0				! Bytes still to store now sit in the low 32 bits of %l0
 | |
| 	bz	1f
 | |
| 	 srlx	%l0, 16, %o5				! (delay slot) bits 31..16 -> low short
 | |
| 	
 | |
| 	sth	%o5, [%o1]				! Store short
 | |
| 	inc	2, %o1
 | |
| 	mov	%l0, %o5				! Operate on low bytes
 | |
| 1:
 | |
| 	mov	%o5, %l0				! Next byte to store is now bits 15..8 of %l0
 | |
| 	btst	1, %o2					! Byte aligned?
 | |
| 	bz	2f
 | |
| 	 srlx	%l0, 8, %o5				! (delay slot) bits 15..8 -> low byte
 | |
| 
 | |
| 	stb	%o5, [%o1]				! Store last byte
 | |
| 	inc	1, %o1					! Update address
 | |
| 2:	
 | |
| Lbcopy_complete:
 | |
| #if 0
 | |
| 	!!
 | |
| 	!! verify copy success.
 | |
| 	!! 
 | |
| 
 | |
| 	mov	%i0, %o2
 | |
| 	mov	%i1, %o4
 | |
| 	mov	%i2, %l4
 | |
| 0:	
 | |
| 	ldub	[%o2], %o1
 | |
| 	inc	%o2
 | |
| 	ldub	[%o4], %o3
 | |
| 	inc	%o4
 | |
| 	cmp	%o3, %o1
 | |
| 	bnz	1f
 | |
| 	 dec	%l4
 | |
| 	brnz	%l4, 0b
 | |
| 	 nop
 | |
| 	ba	2f
 | |
| 	 nop
 | |
| 
 | |
| 1:
 | |
| 	set	0f, %o0
 | |
| 	call	printf
 | |
| 	 sub	%i2, %l4, %o5
 | |
| 	set	1f, %o0
 | |
| 	mov	%i0, %o1
 | |
| 	mov	%i1, %o2
 | |
| 	call	printf
 | |
| 	 mov	%i2, %o3
 | |
| 	ta	1
 | |
| 	.data
 | |
| 0:	.asciz	"bcopy failed: %x@%p != %x@%p byte %d\n"
 | |
| 1:	.asciz	"bcopy(%p, %p, %lx)\n"
 | |
| 	.align 8
 | |
| 	.text
 | |
| 2:	
 | |
| #endif
 | |
| 	ret
 | |
| 	 restore %i1, %g0, %o0
 | |
| 
 | |
| #if 1
 | |
| 
 | |
| /*
 | |
|  * Block copy.  Useful for >256 byte copies.
 | |
|  *
 | |
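|  * Benchmarking has shown this always seems to be slower than
 | |
|  * the integer version, so this is disabled.  Maybe someone will
 | |
|  * figure out why sometime.
 | |
|  *
 | |
|  * NOTE: in this file both the guard around this code and the guard
 | |
|  * around the "bge Lbcopy_block" branch are "#if 1", so the block-copy
 | |
|  * path is in fact compiled in and used for copies of 256 bytes or more.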
 | |
|  */
 | |
| 	
 | |
| Lbcopy_block:
 | |
| #ifdef _KERNEL
 | |
| /*
 | |
|  * Kernel:
 | |
|  *
 | |
|  * Here we use VIS instructions to do a block copy.
 | |
|  * But before we can do that we need to save and enable the FPU.
 | |
|  * The last owner of the FPU registers is fpproc, and
 | |
|  * fpproc->p_md.md_fpstate is the current fpstate.  If that's not
 | |
|  * null, call savefpstate() with it to store our current fp state.
 | |
|  *
 | |
|  * Next, allocate an aligned fpstate on the stack.  We will properly
 | |
|  * nest calls on a particular stack so this should not be a problem.
 | |
|  *
 | |
|  * Now we grab either curproc (or if we're on the interrupt stack
 | |
|  * proc0).  We stash its existing fpstate in a local register and
 | |
|  * put our new fpstate in curproc->p_md.md_fpstate.  We point
 | |
|  * fpproc at curproc (or proc0) and enable the FPU.
 | |
|  *
 | |
|  * If we are ever preempted, our FPU state will be saved in our
 | |
|  * fpstate.  Then, when we're resumed and we take an FPDISABLED
 | |
|  * trap, the trap handler will be able to fish our FPU state out
 | |
|  * of curproc (or proc0).
 | |
|  *
 | |
|  * On exiting this routine we undo the damage: restore the original
 | |
|  * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
 | |
|  * the FPU.
 | |
|  *
 | |
|  *
 | |
|  * Register usage, Kernel only (after save):
 | |
|  *
 | |
|  * %i0		src
 | |
|  * %i1		dest
 | |
|  * %i2		size
 | |
|  *
 | |
|  * %l0		XXXX DEBUG old fpstate
 | |
|  * %l1		fpproc (hi bits only)
 | |
|  * %l2		orig fpproc
 | |
|  * %l3		orig fpstate
 | |
|  * %l5		curproc
 | |
|  * %l6		old fpstate
 | |
|  *
 | |
|  * Register usage, Kernel and user:
 | |
|  *
 | |
|  * %g1		dest (retval for memcpy)
 | |
|  *
 | |
|  * %o0		src
 | |
|  * %o1		dest
 | |
|  * %o2		remaining length (bytes)
 | |
|  * %o5		last safe fetchable address
 | |
|  */
 | |
| 
 | |
| #if 1
 | |
| 	ENABLE_FPU(0)
 | |
| #else
 | |
| 	save	%sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp	! Allocate an fpstate
 | |
| 	sethi	%hi(FPPROC), %l1
 | |
| 	LDPTR	[%l1 + %lo(FPPROC)], %l2		! Load fpproc
 | |
| 	add	%sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0	! Calculate pointer to fpstate
 | |
| 	brz,pt	%l2, 1f					! fpproc == NULL?
 | |
| 	 andn	%l0, BLOCK_ALIGN, %l0			! And make it block aligned
 | |
| 	LDPTR	[%l2 + P_FPSTATE], %l3
 | |
| 	brz,pn	%l3, 1f					! Make sure we have an fpstate
 | |
| 	 mov	%l3, %o0
 | |
| 	call	_C_LABEL(savefpstate)			! Save the old fpstate
 | |
| 	 set	EINTSTACK-STKB, %l4			! Are we on intr stack?
 | |
| 	cmp	%sp, %l4
 | |
| 	bgu,pt	%xcc, 1f
 | |
| 	 set	INTSTACK-STKB, %l4
 | |
| 	cmp	%sp, %l4
 | |
| 	blu	%xcc, 1f
 | |
| 0:
 | |
| 	 sethi	%hi(_C_LABEL(proc0)), %l4		! Yes, use proc0
 | |
| 	ba,pt	%xcc, 2f				! XXXX needs to change to CPUs idle proc
 | |
| 	 or	%l4, %lo(_C_LABEL(proc0)), %l5
 | |
| 1:
 | |
| 	sethi	%hi(CURPROC), %l4			! Use curproc
 | |
| 	LDPTR	[%l4 + %lo(CURPROC)], %l5
 | |
| 	brz,pn	%l5, 0b					! If curproc is NULL need to use proc0
 | |
| 	 nop
 | |
| 2:
 | |
| 	LDPTR	[%l5 + P_FPSTATE], %l6			! Save old fpstate
 | |
| 	STPTR	%l0, [%l5 + P_FPSTATE]			! Insert new fpstate
 | |
| 	STPTR	%l5, [%l1 + %lo(FPPROC)]		! Set new fpproc
 | |
| 	wr	%g0, FPRS_FEF, %fprs			! Enable FPU
 | |
| #endif
 | |
| 	mov	%i0, %o0				! Src addr.
 | |
| 	mov	%i1, %o1				! Store our dest ptr here.
 | |
| 	mov	%i2, %o2				! Len counter
 | |
| #endif
 | |
| 
 | |
| 	!!
 | |
| 	!! First align the output to a 64-bit entity
 | |
| 	!! 
 | |
| 
 | |
| 	mov	%o1, %g1				! memcpy retval
 | |
| 	add	%o0, %o2, %o5				! End of source block
 | |
| 
 | |
| 	andn	%o0, 7, %o3				! Start of block
 | |
| 	dec	%o5
 | |
| 	fzero	%f0
 | |
| 
 | |
| 	andn	%o5, BLOCK_ALIGN, %o5			! Last safe addr.
 | |
| 	ldd	[%o3], %f2				! Load 1st word
 | |
| 
 | |
| 	dec	8, %o3					! Move %o3 1 word back
 | |
| 	btst	1, %o1
 | |
| 	bz	4f
 | |
| 	
 | |
| 	 mov	-7, %o4					! Lowest src addr possible
 | |
| 	alignaddr %o0, %o4, %o4				! Base addr for load.
 | |
| 
 | |
| 	cmp	%o3, %o4
 | |
| 	be,pt	%xcc, 1f				! Already loaded?
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! No. Shift
 | |
| 	ldd	[%o3+8], %f2				! And load
 | |
| 1:	
 | |
| 
 | |
| 	faligndata	%f0, %f2, %f4			! Isolate 1st byte
 | |
| 
 | |
| 	stda	%f4, [%o1] ASI_FL8_P			! Store 1st byte
 | |
| 	inc	1, %o1					! Update address
 | |
| 	inc	1, %o0
 | |
| 	dec	1, %o2
 | |
| 4:	
 | |
| 	btst	2, %o1
 | |
| 	bz	4f
 | |
| 
 | |
| 	 mov	-6, %o4					! Calculate src - 6
 | |
| 	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
 | |
| 
 | |
| 	cmp	%o3, %o4				! Addresses same?
 | |
| 	be,pt	%xcc, 1f
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! Shuffle data
 | |
| 	ldd	[%o3+8], %f2				! Load word 0
 | |
| 1:	
 | |
| 	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
 | |
| 
 | |
| 	stda	%f4, [%o1] ASI_FL16_P			! Store 1st short
 | |
| 	dec	2, %o2
 | |
| 	inc	2, %o1
 | |
| 	inc	2, %o0
 | |
| 4:
 | |
| 	brz,pn	%o2, Lbcopy_blockfinish			! XXXX
 | |
| 
 | |
| 	 btst	4, %o1
 | |
| 	bz	4f
 | |
| 
 | |
| 	mov	-4, %o4
 | |
| 	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
 | |
| 
 | |
| 	cmp	%o3, %o4				! Addresses same?
 | |
| 	beq,pt	%xcc, 1f
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! Shuffle data
 | |
| 	ldd	[%o3+8], %f2				! Load word 0
 | |
| 1:	
 | |
| 	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
 | |
| 
 | |
| 	st	%f5, [%o1]				! Store word
 | |
| 	dec	4, %o2
 | |
| 	inc	4, %o1
 | |
| 	inc	4, %o0
 | |
| 4:
 | |
| 	brz,pn	%o2, Lbcopy_blockfinish			! XXXX
 | |
| 	!!
 | |
| 	!! We are now 64-bit aligned in the dest.
 | |
| 	!!
 | |
| Lbcopy_block_common:	
 | |
| 
 | |
| 	 mov	-0, %o4
 | |
| 	alignaddr %o0, %o4, %o4				! base - shift
 | |
| 
 | |
| 	cmp	%o3, %o4				! Addresses same?
 | |
| 	beq,pt	%xcc, 1f
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! Shuffle data
 | |
| 	ldd	[%o3+8], %f2				! Load word 0
 | |
| 1:	
 | |
| 	add	%o3, 8, %o0				! now use %o0 for src
 | |
| 	
 | |
| 	!!
 | |
| 	!! Continue until our dest is block aligned
 | |
| 	!! 
 | |
| Lbcopy_block_aligned8:	
 | |
| 1:
 | |
| 	brz	%o2, Lbcopy_blockfinish
 | |
| 	 btst	BLOCK_ALIGN, %o1			! Block aligned?
 | |
| 	bz	1f
 | |
| 	
 | |
| 	 faligndata %f0, %f2, %f4			! Generate result
 | |
| 	deccc	8, %o2
 | |
| 	ble,pn	%icc, Lbcopy_blockfinish		! Should never happen
 | |
| 	 fmovd	%f4, %f48
 | |
| 	
 | |
| 	std	%f4, [%o1]				! Store result
 | |
| 	inc	8, %o1
 | |
| 	
 | |
| 	fmovd	%f2, %f0
 | |
| 	inc	8, %o0
 | |
| 	ba,pt	%xcc, 1b				! Not yet.
 | |
| 	 ldd	[%o0], %f2				! Load next part
 | |
| Lbcopy_block_aligned64:	
 | |
| 1:
 | |
| 
 | |
| /*
 | |
|  * 64-byte aligned -- ready for block operations.
 | |
|  *
 | |
|  * Here we have the destination block aligned, but the
 | |
|  * source pointer may not be.  Sub-word alignment will
 | |
|  * be handled by faligndata instructions.  But the source
 | |
|  * can still be potentially aligned to 8 different words
 | |
|  * in our 64-byte block, so we have 8 different copy routines.
 | |
|  *
 | |
|  * Once we figure out our source alignment, we branch
 | |
|  * to the appropriate copy routine, which sets up the
 | |
|  * alignment for faligndata and loads (sets) the values
 | |
|  * into the source registers and does the copy loop.
 | |
|  *
 | |
|  * When we're down to less than 1 block to store, we
 | |
|  * exit the copy loop and execute cleanup code.
 | |
|  *
 | |
|  * Block loads and stores are not properly interlocked.
 | |
|  * Stores save one reg/cycle, so you can start overwriting
 | |
|  * registers the cycle after the store is issued.  
 | |
|  * 
 | |
|  * Block loads require a block load to a different register
 | |
|  * block or a membar #Sync before accessing the loaded
 | |
|  * data.
 | |
|  *	
 | |
|  * Since the faligndata instructions may be offset as far
 | |
|  * as 7 registers into a block (if you are shifting source 
 | |
|  * 7 -> dest 0), you need 3 source register blocks for full 
 | |
|  * performance: one you are copying, one you are loading, 
 | |
|  * and one for interlocking.  Otherwise, we would need to
 | |
|  * sprinkle the code with membar #Sync and lose the advantage
 | |
|  * of running faligndata in parallel with block stores.  This 
 | |
|  * means we are fetching a full 128 bytes ahead of the stores.  
 | |
|  * We need to make sure the prefetch does not inadvertently 
 | |
|  * cross a page boundary and fault on data that we will never 
 | |
|  * store.
 | |
|  *
 | |
|  */
 | |
| #if 1
 | |
| 	and	%o0, BLOCK_ALIGN, %o3
 | |
| 	srax	%o3, 3, %o3				! Isolate the offset
 | |
| 
 | |
| 	brz	%o3, L100				! 0->0
 | |
| 	 btst	4, %o3
 | |
| 	bnz	%xcc, 4f
 | |
| 	 btst	2, %o3
 | |
| 	bnz	%xcc, 2f
 | |
| 	 btst	1, %o3
 | |
| 	ba,pt	%xcc, L101				! 0->1
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| 2:
 | |
| 	bz	%xcc, L102				! 0->2
 | |
| 	 nop
 | |
| 	ba,pt	%xcc, L103				! 0->3
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| 4:	
 | |
| 	bnz	%xcc, 2f
 | |
| 	 btst	1, %o3
 | |
| 	bz	%xcc, L104				! 0->4
 | |
| 	 nop
 | |
| 	ba,pt	%xcc, L105				! 0->5
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| 2:
 | |
| 	bz	%xcc, L106				! 0->6
 | |
| 	 nop
 | |
| 	ba,pt	%xcc, L107				! 0->7
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| #else
 | |
| 
 | |
| 	!!
 | |
| 	!! Isolate the word offset, which just happens to be
 | |
| 	!! the slot in our jump table.
 | |
| 	!!
 | |
| 	!! This is 6 insns, most of which cannot be paired,
 | |
| 	!! which is about the same as the above version.
 | |
| 	!!
 | |
| 	rd	%pc, %o4
 | |
| 1:	
 | |
| 	and	%o0, 0x31, %o3
 | |
| 	add	%o3, (Lbcopy_block_jmp - 1b), %o3
 | |
| 	jmpl	%o4 + %o3, %g0
 | |
| 	 nop
 | |
| 
 | |
| 	!!
 | |
| 	!! Jump table
 | |
| 	!!
 | |
| 	
 | |
| Lbcopy_block_jmp:
 | |
| 	ba,a,pt	%xcc, L100
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L101
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L102
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L103
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L104
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L105
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L106
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L107
 | |
| 	 nop
 | |
| #endif
 | |
| 
 | |
| 	!!
 | |
| 	!! Source is block aligned.
 | |
| 	!!
 | |
| 	!! Just load a block and go.
 | |
| 	!!
 | |
| L100:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L100"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0 , %f62
 | |
| 	ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 3f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	ba,pt	%icc, 3f
 | |
| 	 membar #Sync
 | |
| 	
 | |
| 	.align	32					! ICache align.
 | |
| 3:
 | |
| 	faligndata	%f62, %f0, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f0, %f2, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f2, %f4, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f4, %f6, %f38
 | |
| 	faligndata	%f6, %f8, %f40
 | |
| 	faligndata	%f8, %f10, %f42
 | |
| 	faligndata	%f10, %f12, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f12, %f14, %f46
 | |
| 	
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	faligndata	%f14, %f16, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f16, %f18, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f18, %f20, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f20, %f22, %f38
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f22, %f24, %f40
 | |
| 	faligndata	%f24, %f26, %f42
 | |
| 	faligndata	%f26, %f28, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f28, %f30, %f46
 | |
| 	
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	faligndata	%f30, %f48, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f48, %f50, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f50, %f52, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f52, %f54, %f38
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f54, %f56, %f40
 | |
| 	faligndata	%f56, %f58, %f42
 | |
| 	faligndata	%f58, %f60, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f60, %f62, %f46
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16			! Increment is at top
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 	
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+8
 | |
| 	!!
 | |
| 	!! We need to load almost 1 complete block by hand: 7 doubles go
 | |
| 	!! into %f2-%f14.  %f0 is consumed below without being loaded here,
 | |
| 	!! so it must already be valid on entry.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L101:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L101"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| !	fmovd	%f0, %f0				! Hoist fmovd
 | |
| 	ldd	[%o0], %f2
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f4
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f6
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 3f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f0-%f16, start the load into %f48
 | |
| 	faligndata	%f0, %f2, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f2, %f4, %f34
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f4, %f6, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f6, %f8, %f38
 | |
| 	faligndata	%f8, %f10, %f40
 | |
| 	faligndata	%f10, %f12, %f42
 | |
| 	faligndata	%f12, %f14, %f44
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f14, %f16, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	
 | |
| 	!! Stage 2: consume %f16-%f48, start the load into %f0
 | |
| 	faligndata	%f16, %f18, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f20, %f22, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f22, %f24, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f24, %f26, %f40
 | |
| 	faligndata	%f26, %f28, %f42
 | |
| 	faligndata	%f28, %f30, %f44
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f30, %f48, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f48-%f62 and %f0, start the load into %f16
 | |
| 	faligndata	%f48, %f50, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f50, %f52, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f52, %f54, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f54, %f56, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f56, %f58, %f40
 | |
| 	faligndata	%f58, %f60, %f42
 | |
| 	faligndata	%f60, %f62, %f44
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f62, %f0, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+16
 | |
| 	!!
 | |
| 	!! We need to load 6 doubles by hand (into %f4-%f14).  The double
 | |
| 	!! already held in %f0 on entry is moved to %f2 so that it sits
 | |
| 	!! directly below the hand-loaded data.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L102:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L102"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	ldd	[%o0], %f4
 | |
| 	inc	8, %o0
 | |
| 	fmovd	%f0, %f2				! Hoist fmovd
 | |
| 	ldd	[%o0], %f6
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 3f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f2-%f18, start the load into %f48
 | |
| 	faligndata	%f2, %f4, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f4, %f6, %f34
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f6, %f8, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f8, %f10, %f38
 | |
| 	faligndata	%f10, %f12, %f40
 | |
| 	faligndata	%f12, %f14, %f42
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f44
 | |
| 
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f16, %f18, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 2: consume %f18-%f50, start the load into %f0
 | |
| 	faligndata	%f18, %f20, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f20, %f22, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f22, %f24, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f24, %f26, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f26, %f28, %f40
 | |
| 	faligndata	%f28, %f30, %f42
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f30, %f48, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f48, %f50, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f50-%f62 and %f0-%f2, start the load into %f16
 | |
| 	faligndata	%f50, %f52, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f52, %f54, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f54, %f56, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f56, %f58, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f58, %f60, %f40
 | |
| 	faligndata	%f60, %f62, %f42
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f62, %f0, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f0, %f2, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 	
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+24
 | |
| 	!!
 | |
| 	!! We need to load 5 doubles by hand (into %f6-%f14).  The double
 | |
| 	!! already held in %f0 on entry is moved to %f4 so that it sits
 | |
| 	!! directly below the hand-loaded data.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The bound check must be unsigned because
 | |
| 	!! %o0 is an address.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L103:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L103"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f4
 | |
| 	ldd	[%o0], %f6
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f4-%f20, start the load into %f48
 | |
| 	faligndata	%f4, %f6, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f6, %f8, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f8, %f10, %f36
 | |
| 	faligndata	%f10, %f12, %f38
 | |
| 	faligndata	%f12, %f14, %f40
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f16, %f18, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f18, %f20, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 2: consume %f20-%f52, start the load into %f0
 | |
| 	faligndata	%f20, %f22, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f22, %f24, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f24, %f26, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f26, %f28, %f38
 | |
| 	faligndata	%f28, %f30, %f40
 | |
| 	bleu,a,pn	%icc, 2f			! Unsigned, as at every other bound check
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f30, %f48, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f48, %f50, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f50, %f52, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f52-%f62 and %f0-%f4, start the load into %f16
 | |
| 	faligndata	%f52, %f54, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f54, %f56, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f56, %f58, %f36
 | |
| 	faligndata	%f58, %f60, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f60, %f62, %f40
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f62, %f0, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f0, %f2, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f2, %f4, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+32
 | |
| 	!!
 | |
| 	!! We need to load 4 doubles by hand (into %f8-%f14).  The double
 | |
| 	!! already held in %f0 on entry is moved to %f6 so that it sits
 | |
| 	!! directly below the hand-loaded data.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L104:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L104"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f6
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f6-%f22, start the load into %f48
 | |
| 	faligndata	%f6, %f8, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f8, %f10, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f10, %f12, %f36
 | |
| 	faligndata	%f12, %f14, %f38
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f40
 | |
| 	faligndata	%f16, %f18, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f20, %f22, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 2: consume %f22-%f54, start the load into %f0
 | |
| 	faligndata	%f22, %f24, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f24, %f26, %f34
 | |
| 	faligndata	%f26, %f28, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f28, %f30, %f38
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f30, %f48, %f40
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f48, %f50, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f50, %f52, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f52, %f54, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f54-%f62 and %f0-%f6, start the load into %f16
 | |
| 	faligndata	%f54, %f56, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f56, %f58, %f34
 | |
| 	faligndata	%f58, %f60, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f60, %f62, %f38
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f62, %f0, %f40
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f0, %f2, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f2, %f4, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f4, %f6, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+40
 | |
| 	!!
 | |
| 	!! We need to load 3 doubles by hand (into %f10-%f14).  The double
 | |
| 	!! already held in %f0 on entry is moved to %f8 so that it sits
 | |
| 	!! directly below the hand-loaded data.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L105:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L105"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f8
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f8-%f24, start the load into %f48
 | |
| 	faligndata	%f8, %f10, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f10, %f12, %f34
 | |
| 	faligndata	%f12, %f14, %f36
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f16, %f18, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f42
 | |
| 	faligndata	%f20, %f22, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f22, %f24, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 2: consume %f24-%f56, start the load into %f0
 | |
| 	faligndata	%f24, %f26, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f26, %f28, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f28, %f30, %f36
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f30, %f48, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f48, %f50, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f50, %f52, %f42
 | |
| 	faligndata	%f52, %f54, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f54, %f56, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f56-%f62 and %f0-%f8, start the load into %f16
 | |
| 	faligndata	%f56, %f58, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f58, %f60, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f60, %f62, %f36
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f62, %f0, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f0, %f2, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f2, %f4, %f42
 | |
| 	faligndata	%f4, %f6, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f6, %f8, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+48
 | |
| 	!!
 | |
| 	!! We need to load 2 doubles by hand (into %f12 and %f14).  The
 | |
| 	!! double already held in %f0 on entry is moved to %f10 so that it
 | |
| 	!! sits directly below the hand-loaded data.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L106:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L106"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f10
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f10-%f26, start the load into %f48
 | |
| 	faligndata	%f10, %f12, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f12, %f14, %f34
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f16, %f18, %f38
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f40
 | |
| 	faligndata	%f20, %f22, %f42
 | |
| 	faligndata	%f22, %f24, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f24, %f26, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 2: consume %f26-%f58, start the load into %f0
 | |
| 	faligndata	%f26, %f28, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f28, %f30, %f34
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f30, %f48, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f48, %f50, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f50, %f52, %f40
 | |
| 	faligndata	%f52, %f54, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f54, %f56, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f56, %f58, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f58-%f62 and %f0-%f10, start the load into %f16
 | |
| 	faligndata	%f58, %f60, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f60, %f62, %f34
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f62, %f0, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f0, %f2, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f2, %f4, %f40
 | |
| 	faligndata	%f4, %f6, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f6, %f8, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f8, %f10, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+56
 | |
| 	!!
 | |
| 	!! We need to load 1 double by hand (into %f14).  The double
 | |
| 	!! already held in %f0 on entry is moved to %f12 so that it sits
 | |
| 	!! directly below the hand-loaded data.
 | |
| 	!!
 | |
| 	!! Register use visible in this routine:
 | |
| 	!!   %o0  source address for the loads, advanced as data is fetched
 | |
| 	!!   %o1  destination address for the block stores
 | |
| 	!!   %o2  byte count, reduced by one block per stage; the loop
 | |
| 	!!        leaves for Lbcopy_blockdone once it is <= 0
 | |
| 	!!   %o5  bound compared with %o0 before every block load (set
 | |
| 	!!        outside this routine; presumably the last source block
 | |
| 	!!        that may be loaded -- TODO confirm)
 | |
| 	!!
 | |
| 	!! Each bleu,a branch runs its delay-slot ldda only when taken
 | |
| 	!! (%o0 <= %o5); otherwise the ldda is annulled and the membar
 | |
| 	!! executes instead.  The loop at 3: is unrolled three times and
 | |
| 	!! rotates the block loads through %f48, %f0 and %f16 while
 | |
| 	!! %f32-%f46 collect the faligndata output for each block store.
 | |
| 	!!
 | |
| L107:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L107"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f12
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	!! Stage 1: consume %f12-%f28, start the load into %f48
 | |
| 	faligndata	%f12, %f14, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f16, %f18, %f36
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f38
 | |
| 	faligndata	%f20, %f22, %f40
 | |
| 	faligndata	%f22, %f24, %f42
 | |
| 	faligndata	%f24, %f26, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f26, %f28, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 2: consume %f28-%f60, start the load into %f0
 | |
| 	faligndata	%f28, %f30, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f30, %f48, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f48, %f50, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f50, %f52, %f38
 | |
| 	faligndata	%f52, %f54, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f54, %f56, %f42
 | |
| 	faligndata	%f56, %f58, %f44
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f58, %f60, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	!! Stage 3: consume %f60-%f62 and %f0-%f12, start the load into %f16
 | |
| 	faligndata	%f60, %f62, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f62, %f0, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f0, %f2, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f2, %f4, %f38
 | |
| 	faligndata	%f4, %f6, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f6, %f8, %f42
 | |
| 	faligndata	%f8, %f10, %f44
 | |
| 
 | |
| 	brlez,pn	%o2, Lbcopy_blockdone
 | |
| 	 faligndata	%f10, %f12, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 	
 | |
| 	!!
 | |
| 	!! Common exit of the unrolled block loops.  Every loop branches
 | |
| 	!! here with a full set of faligndata results in %f32-%f46 that
 | |
| 	!! has not been stored, and with %o2 already reduced by one block
 | |
| 	!! for that set, so one block is added back to %o2 first.
 | |
| 	!!
 | |
| 	!! FINISH_REG(f) subtracts 8 from %o2.  If the result is negative
 | |
| 	!! it branches to Lbcopy_blockfinish, copying f into %f48 in the
 | |
| 	!! annulled delay slot (executed only when the branch is taken).
 | |
| 	!! Otherwise it stores the double f at [%o1] and advances %o1 by 8.
 | |
| 	!! After the last FINISH_REG the code falls through into
 | |
| 	!! Lbcopy_blockfinish.
 | |
| 	!!
 | |
| Lbcopy_blockdone:
 | |
| 	inc	BLOCK_SIZE, %o2				! Fixup our overcommit
 | |
| 	membar	#Sync					! Finish any pending loads
 | |
| #define	FINISH_REG(f)				\
 | |
| 	deccc	8, %o2;				\
 | |
| 	bl,a	Lbcopy_blockfinish;		\
 | |
| 	 fmovd	f, %f48;			\
 | |
| 	std	f, [%o1];			\
 | |
| 	inc	8, %o1
 | |
| 
 | |
| 	FINISH_REG(%f32)
 | |
| 	FINISH_REG(%f34)
 | |
| 	FINISH_REG(%f36)
 | |
| 	FINISH_REG(%f38)
 | |
| 	FINISH_REG(%f40)
 | |
| 	FINISH_REG(%f42)
 | |
| 	FINISH_REG(%f44)
 | |
| 	FINISH_REG(%f46)
 | |
| 	FINISH_REG(%f48)
 | |
| #undef FINISH_REG
 | |
| 	!! 
 | |
| 	!! Store the final partial double.  %f48 holds the data and %o1
 | |
| 	!! the destination address.
 | |
| 	!!
 | |
| 	!! The low 3 bits have the sub-word bits needed to be
 | |
| 	!! stored [because (x-8)&0x7 == x for 0 <= x < 8], so bits 2, 1
 | |
| 	!! and 0 of %o2 select a word, a short and a byte store in turn.
 | |
| 	!!
 | |
| 	!! The short and byte stores use alignaddr on %o1 plus a negative
 | |
| 	!! offset so that faligndata shifts bytes of %f4 into the low end
 | |
| 	!! of %f8, which is where the ASI_FL16_P and ASI_FL8_P stores take
 | |
| 	!! their data from (presumably relying on the low bits of %o1
 | |
| 	!! matching the byte position inside %f4 -- TODO confirm).
 | |
| 	!!
 | |
| Lbcopy_blockfinish:
 | |
| 	brz,pn	%o2, 2f					! 100% complete?
 | |
| 	 fmovd	%f48, %f4				! Delay slot: always executed
 | |
| 	cmp	%o2, 8					! Exactly 8 bytes?
 | |
| 	bz,a,pn	%xcc, 2f
 | |
| 	 std	%f4, [%o1]				! Annulled unless exactly 8
 | |
| 
 | |
| 	btst	4, %o2					! Word store?
 | |
| 	bz	%xcc, 1f
 | |
| 	 nop
 | |
| 	st	%f4, [%o1]
 | |
| 	inc	4, %o1
 | |
| 1:
 | |
| 	btst	2, %o2					! Short store?
 | |
| 	fzero	%f0					! Zero fill for the faligndata below
 | |
| 	bz	1f
 | |
| 
 | |
| 	 mov	-6, %o4					! Calculate dest - 6
 | |
| 	alignaddr %o1, %o4, %g0				! Only the alignment is kept, result discarded
 | |
| 
 | |
| 	faligndata %f0, %f4, %f8			! Move short to low part of f8
 | |
| 	
 | |
| 	stda	%f8, [%o1] ASI_FL16_P			! Store short
 | |
| 	inc	2, %o1
 | |
| 1:
 | |
| 	btst	1, %o2					! Odd byte left over?
 | |
| 	bz	2f
 | |
| 
 | |
| 	 mov	-7, %o0					! Calculate dest - 7
 | |
| 	alignaddr %o1, %o0, %g0				! Calculate shift mask and dest.
 | |
| 
 | |
| 	faligndata %f0, %f4, %f8			! Move 1st byte to low part of f8
 | |
| 
 | |
| 	stda	%f8, [%o1] ASI_FL8_P			! Store 1st byte
 | |
| 	inc	1, %o1					! Update address
 | |
| 2:
 | |
| 	membar	#Sync
 | |
| #if 0
 | |
| 	!!
 | |
| 	!! verify copy success.
 | |
| 	!! 
 | |
| 
 | |
| 	mov	%i0, %o2
 | |
| 	mov	%i1, %o4
 | |
| 	mov	%i2, %l4
 | |
| 0:	
 | |
| 	ldub	[%o2], %o1
 | |
| 	inc	%o2
 | |
| 	ldub	[%o4], %o3
 | |
| 	inc	%o4
 | |
| 	cmp	%o3, %o1
 | |
| 	bnz	1f
 | |
| 	 dec	%l4
 | |
| 	brnz	%l4, 0b
 | |
| 	 nop
 | |
| 	ba	2f
 | |
| 	 nop
 | |
| 
 | |
| 1:
 | |
| 	set	block_disable, %o0
 | |
| 	stx	%o0, [%o0]
 | |
| 	
 | |
| 	set	0f, %o0
 | |
| 	call	prom_printf
 | |
| 	 sub	%i2, %l4, %o5
 | |
| 	set	1f, %o0
 | |
| 	mov	%i0, %o1
 | |
| 	mov	%i1, %o2
 | |
| 	call	prom_printf
 | |
| 	 mov	%i2, %o3
 | |
| 	ta	1
 | |
| 	.data
 | |
| 	_ALIGN
 | |
| block_disable:	.xword	0
 | |
| 0:	.asciz	"bcopy failed: %x@%p != %x@%p byte %d\r\n"
 | |
| 1:	.asciz	"bcopy(%p, %p, %lx)\r\n"
 | |
| 	_ALIGN
 | |
| 	.text
 | |
| 2:	
 | |
| #endif
 | |
| #ifdef _KERNEL		
 | |
| 
 | |
| 	set 1f, %o0
 | |
| 	mov	%i0, %o1
 | |
| 	mov	%i1, %o2
 | |
| 	call	printf
 | |
| 	mov	%i2, %o3
 | |
| 	
 | |
| 	.data
 | |
| 	_ALIGN
 | |
| 1:	.asciz "block exit (%p, %p, %d)\n"
 | |
| 	_ALIGN
 | |
| 	.text
 | |
| /*
 | |
|  * We have saved our possible fpstate, now disable the fpu
 | |
|  * and continue with life.
 | |
|  */
 | |
| #if 1
 | |
| 	RESTORE_FPU
 | |
| #else
 | |
| #ifdef DEBUG
 | |
| 	LDPTR	[%l1 + %lo(FPPROC)], %l7
 | |
| 	cmp	%l7, %l5
 | |
| !	tnz	1		! fpproc has changed!
 | |
| 	LDPTR	[%l5 + P_FPSTATE], %l7
 | |
| 	cmp	%l7, %l0
 | |
| 	tnz	1		! fpstate has changed!
 | |
| #endif
 | |
| 	andcc	%l2, %l3, %g0				! If (fpproc && fpstate)
 | |
| 	STPTR	%l2, [%l1 + %lo(FPPROC)]		! Restore old fproc
 | |
| 	bz,pt	%xcc, 1f				! Skip if no fpstate
 | |
| 	 STPTR	%l6, [%l5 + P_FPSTATE]			! Restore old fpstate
 | |
| 	
 | |
| 	call	_C_LABEL(loadfpstate)			! Re-load orig fpstate
 | |
| 	 mov	%l3, %o0
 | |
| 1:
 | |
| #endif
 | |
| 	set 1f, %o0
 | |
| 	mov	%i0, %o1
 | |
| 	mov	%i1, %o2
 | |
| 	call	printf
 | |
| 	mov	%i2, %o3
 | |
| 	
 | |
| 	.data
 | |
| 	_ALIGN
 | |
| 1:	.asciz "block done (%p, %p, %d)\n"
 | |
| 	_ALIGN
 | |
| 	.text
 | |
| 
 | |
| 	
 | |
| 	ret
 | |
| 	 restore	%g1, 0, %o0			! Return DEST for memcpy
 | |
| #endif
 | |
| 	retl
 | |
| 	 mov	%g1, %o0
 | |
| #endif
 | |
| 
 | |
| 
 |