 84d9c625bf
			
		
	
	
		84d9c625bf
		
	
	
	
	
		
			
			- Fix for possible unset uid/gid in toproto
 - Fix for default mtree style
 - Update libelf
 - Importing libexecinfo
 - Resynchronize GCC, mpc, gmp, mpfr
 - build.sh: Replace params with show-params.
     This has been done as the make target has been renamed in the same
     way, while a new target named params has been added. This new
     target generates a file containing all the parameters, instead of
     printing it on the console.
 - Update test48 with new etc/services (Fix by Ben Gras <ben@minix3.org>)
     get getservbyport() out of the inner loop
Change-Id: Ie6ad5226fa2621ff9f0dee8782ea48f9443d2091
		
	
			
		
			
				
	
	
		
			1628 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			1628 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/
 | |
| 
 | |
| /*
 | |
|  * Copyright (c) 1996-2002 Eduardo Horvath
 | |
|  * All rights reserved.
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions
 | |
|  * are met:
 | |
|  * 1. Redistributions of source code must retain the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR  ``AS IS'' AND
 | |
|  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
|  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR  BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 | |
|  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 | |
|  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 | |
|  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 | |
|  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 | |
|  * SUCH DAMAGE.
 | |
|  *
 | |
|  */
 | |
| #include "strmacros.h"
 | |
| #if defined(LIBC_SCCS) && !defined(lint)
 | |
| RCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
 | |
| #endif  /* LIBC_SCCS and not lint */
 | |
| 
 | |
| /*
 | |
|  * memcpy
 | |
|  * Assumes regions do not overlap;
 | |
|  *
 | |
|  * Must not use %g7 (see copyin/copyout above).
 | |
|  */
 | |
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 * void  bcopy(const void *src, void *dst, size_t len)
 *
 * SPARC V9: arguments arrive in %o0-%o2.  memcpy swaps %o0/%o1 so the
 * shared body below always runs in bcopy order: %o0 = src, %o1 = dst,
 * %o2 = len.  Throughout this file an instruction indented one extra
 * space sits in a branch delay slot and executes with the branch
 * (annulled only on ",a" branches when the branch is not taken).
 */
ENTRY(memcpy) /* dest, src, size */
	/*
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
	 * structure assignments.
	 */
	mov	%o0, %o3		! %o3 = dest (temporary)
	mov	%o1, %o0		! %o0 = src
	mov	%o3, %o1		! %o1 = dest
#if !defined(_KERNEL) || defined(_RUMPKERNEL)
ENTRY(bcopy) /* src, dest, size */
#endif
#ifdef DEBUG
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	! Kernel debug build: trace the copy only when PDB_COPY is set
	! in pmapdebug; otherwise skip straight to the real work at 3f.
	set	pmapdebug, %o4
	ld	[%o4], %o4
	btst	0x80, %o4	! PDB_COPY
	bz,pt	%icc, 3f
	 nop
#endif
	! Print "memcpy(dst<-src,len)".  A register window is saved so
	! printf's argument registers don't clobber ours.
	save	%sp, -CC64FSZ, %sp
	mov	%i0, %o1
	set	2f, %o0
	mov	%i1, %o2
	call	printf
	 mov	%i2, %o3
!	ta	1; nop
	restore
	.data
2:	.asciz	"memcpy(%p<-%p,%x)\n"
	_ALIGN
	.text
3:
#endif

	cmp	%o2, BCOPY_SMALL

Lmemcpy_start:
	bge,pt	CCCR, 2f	! if >= this many, go be fancy.
	 cmp	%o2, 256	! (delay slot: pre-compare for the 2f path)

	mov	%o1, %o5	! Save memcpy return value
	/*
	 * Not much to copy, just do it a byte at a time.
	 */
	deccc	%o2		! while (--len >= 0)
	bl	1f
	 .empty
0:
	inc	%o0
	ldsb	[%o0 - 1], %o4	!	(++dst)[-1] = *src++;
	stb	%o4, [%o1]
	deccc	%o2
	bge	0b
	 inc	%o1		! (delay slot: advance dst every iteration)
1:
	retl			! leaf return; %o5 holds original dst
	 mov	%o5, %o0
	NOTREACHED
 | |
| 
 | |
	/*
	 * Plenty of data to copy, so try to do it optimally.
	 */
2:
#ifdef USE_BLOCK_STORE_LOAD
	! If it is big enough, use VIS instructions
	! (condition codes still hold the "cmp %o2, 256" result).
	bge	Lmemcpy_block
	 nop
#endif /* USE_BLOCK_STORE_LOAD */
Lmemcpy_fancy:

	!!
	!! First align the output to a 8-byte entity
	!!
	!! After the register-window save: %l0 = src, %l1 = dst,
	!! %l2 = remaining length.  %l4 accumulates source bytes when
	!! the source is less aligned than the destination; on every
	!! early exit to Lmemcpy_finish, %l2 has been pre-decremented
	!! by the size of the pending store and %o0 (set later) or %l4
	!! carries the data still to be flushed.
	!!

	save	%sp, -CC64FSZ, %sp

	mov	%i0, %l0
	mov	%i1, %l1
	mov	%i2, %l2

	! Destination on an odd address?  Copy one byte.
	btst	1, %l1
	bz,pt	%icc, 4f
	 btst	2, %l1
	ldub	[%l0], %l4				! Load 1st byte

	deccc	1, %l2
	ble,pn	CCCR, Lmemcpy_finish			! XXXX
	 inc	1, %l0

	stb	%l4, [%l1]				! Store 1st byte
	inc	1, %l1					! Update address
	btst	2, %l1
4:
	! Destination 2-byte but not 4-byte aligned?  Copy one halfword,
	! assembling it from bytes if the source is odd.
	bz,pt	%icc, 4f

	 btst	1, %l0
	bz,a	1f
	 lduh	[%l0], %l4				! Load short

	ldub	[%l0], %l4				! Load bytes

	ldub	[%l0+1], %l3
	sllx	%l4, 8, %l4
	or	%l3, %l4, %l4

1:
	deccc	2, %l2
	ble,pn	CCCR, Lmemcpy_finish			! XXXX
	 inc	2, %l0
	sth	%l4, [%l1]				! Store 1st short

	inc	2, %l1
4:
	! Destination 4-byte but not 8-byte aligned?  Copy one word,
	! assembled from the source's byte/halfword pieces as needed.
	btst	4, %l1
	bz,pt	CCCR, 4f

	 btst	3, %l0
	bz,a,pt	CCCR, 1f
	 lduw	[%l0], %l4				! Load word -1

	btst	1, %l0
	bz,a,pt	%icc, 2f
	 lduh	[%l0], %l4

	ldub	[%l0], %l4				! src odd: byte+half+byte

	lduh	[%l0+1], %l3
	sllx	%l4, 16, %l4
	or	%l4, %l3, %l4

	ldub	[%l0+3], %l3
	sllx	%l4, 8, %l4
	ba,pt	%icc, 1f
	 or	%l4, %l3, %l4

2:
	lduh	[%l0+2], %l3				! src half-aligned: two halves
	sllx	%l4, 16, %l4
	or	%l4, %l3, %l4

1:
	deccc	4, %l2
	ble,pn	CCCR, Lmemcpy_finish		! XXXX
	 inc	4, %l0

	st	%l4, [%l1]				! Store word
	inc	4, %l1
4:
 | |
| 	!!
 | |
| 	!! We are now 32-bit aligned in the dest.
 | |
| 	!!
 | |
	!!
	!! We are now 32-bit aligned in the dest.
	!!
Lmemcpy_common:

	!! Round the source down to an 8-byte boundary and remember the
	!! misalignment: %l4 = left-shift (bits), %l3 = complementary
	!! right-shift, so each output doubleword is
	!!   (prev << %l4) | (next >> %l3).
	and	%l0, 7, %l4				! Shift amount
	andn	%l0, 7, %l0				! Source addr

	brz,pt	%l4, Lmemcpy_noshift8			! No shift version...

	 sllx	%l4, 3, %l4				! In bits
	mov	8<<3, %l3

	ldx	[%l0], %o0				! Load word -1
	sub	%l3, %l4, %l3				! Reverse shift
	deccc	12*8, %l2				! Have enough room?

	sllx	%o0, %l4, %o0
	bl,pn	CCCR, 2f
	 and	%l3, 0x38, %l3
Lmemcpy_unrolled8:

	/*
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
	 *
	 * Software-pipelined 6-doubleword loop: %o0-%o5 hold the
	 * pre-shifted source stream; %g1/%g6 are scratch for the
	 * merge of each adjacent pair.  Loads for the next iteration
	 * overlap stores of the current one.
	 */

!	ldx	[%l0+0*8], %o0				! Already done
!	sllx	%o0, %l4, %o0				! Already done
	ldx	[%l0+1*8], %o1
	ldx	[%l0+2*8], %o2
	ldx	[%l0+3*8], %o3
	ldx	[%l0+4*8], %o4
	ba,pt	%icc, 1f
	 ldx	[%l0+5*8], %o5
	.align	8
1:
	srlx	%o1, %l3, %g1
	inc	6*8, %l0

	sllx	%o1, %l4, %o1
	or	%g1, %o0, %g6
	ldx	[%l0+0*8], %o0

	stx	%g6, [%l1+0*8]
	srlx	%o2, %l3, %g1

	sllx	%o2, %l4, %o2
	or	%g1, %o1, %g6
	ldx	[%l0+1*8], %o1

	stx	%g6, [%l1+1*8]
	srlx	%o3, %l3, %g1

	sllx	%o3, %l4, %o3
	or	%g1, %o2, %g6
	ldx	[%l0+2*8], %o2

	stx	%g6, [%l1+2*8]
	srlx	%o4, %l3, %g1

	sllx	%o4, %l4, %o4
	or	%g1, %o3, %g6
	ldx	[%l0+3*8], %o3

	stx	%g6, [%l1+3*8]
	srlx	%o5, %l3, %g1

	sllx	%o5, %l4, %o5
	or	%g1, %o4, %g6
	ldx	[%l0+4*8], %o4

	stx	%g6, [%l1+4*8]
	srlx	%o0, %l3, %g1
	deccc	6*8, %l2				! Have enough room?

	sllx	%o0, %l4, %o0				! Next loop
	or	%g1, %o5, %g6
	ldx	[%l0+5*8], %o5

	stx	%g6, [%l1+5*8]
	bge,pt	CCCR, 1b
	 inc	6*8, %l1

Lmemcpy_unrolled8_cleanup:
	!!
	!! Finished 8 byte block, unload the regs.
	!! (Flush the 5 doublewords still pipelined in %o1-%o5.)
	!!
	srlx	%o1, %l3, %g1
	inc	5*8, %l0

	sllx	%o1, %l4, %o1
	or	%g1, %o0, %g6

	stx	%g6, [%l1+0*8]
	srlx	%o2, %l3, %g1

	sllx	%o2, %l4, %o2
	or	%g1, %o1, %g6

	stx	%g6, [%l1+1*8]
	srlx	%o3, %l3, %g1

	sllx	%o3, %l4, %o3
	or	%g1, %o2, %g6

	stx	%g6, [%l1+2*8]
	srlx	%o4, %l3, %g1

	sllx	%o4, %l4, %o4
	or	%g1, %o3, %g6

	stx	%g6, [%l1+3*8]
	srlx	%o5, %l3, %g1

	sllx	%o5, %l4, %o5
	or	%g1, %o4, %g6

	stx	%g6, [%l1+4*8]
	inc	5*8, %l1

	mov	%o5, %o0				! Save our unused data
	dec	5*8, %l2
2:
	! Undo the 12*8 pre-decrement; if the length is now exactly 0
	! there is nothing left at all.
	inccc	12*8, %l2
	bz,pn	%icc, Lmemcpy_complete

	!! Unrolled 8 times
Lmemcpy_aligned8:
!	ldx	[%l0], %o0				! Already done
!	sllx	%o0, %l4, %o0				! Shift high word

	 deccc	8, %l2					! Pre-decrement
	bl,pn	CCCR, Lmemcpy_finish
1:
	! Simple one-doubleword-at-a-time shifted copy for the tail.
	ldx	[%l0+8], %o1				! Load word 0
	inc	8, %l0

	srlx	%o1, %l3, %g6
	or	%g6, %o0, %g6				! Combine

	stx	%g6, [%l1]				! Store result
	 inc	8, %l1

	deccc	8, %l2
	bge,pn	CCCR, 1b
	 sllx	%o1, %l4, %o0

	btst	7, %l2					! Done?
	bz,pt	CCCR, Lmemcpy_complete

	!!
	!! Loadup the last dregs into %o0 and shift it into place
	!!
	 srlx	%l3, 3, %g6				! # bytes in %o0
	dec	8, %g6					!  - 8
	!! n-8 - (by - 8) -> n - by
	subcc	%l2, %g6, %g0				! # bytes we need
	ble,pt	%icc, Lmemcpy_finish
	 nop
	ldx	[%l0+8], %o1				! Need another word
	srlx	%o1, %l3, %o1
	ba,pt	%icc, Lmemcpy_finish
	 or	%o0, %o1, %o0				! All loaded up.
 | |
| 	
 | |
Lmemcpy_noshift8:
	!! Source and destination share 8-byte alignment: copy 48 bytes
	!! per iteration with no shifting, then mop up doubleword by
	!! doubleword, and finally fall into Lmemcpy_finish for the
	!! sub-doubleword tail.
	deccc	6*8, %l2				! Have enough room?
	bl,pn	CCCR, 2f
	 nop
	ba,pt	%icc, 1f
	 nop
	.align	32
1:
	ldx	[%l0+0*8], %o0
	ldx	[%l0+1*8], %o1
	ldx	[%l0+2*8], %o2
	stx	%o0, [%l1+0*8]
	stx	%o1, [%l1+1*8]
	stx	%o2, [%l1+2*8]

	ldx	[%l0+3*8], %o3
	ldx	[%l0+4*8], %o4
	ldx	[%l0+5*8], %o5
	inc	6*8, %l0
	stx	%o3, [%l1+3*8]
	deccc	6*8, %l2
	stx	%o4, [%l1+4*8]
	stx	%o5, [%l1+5*8]
	bge,pt	CCCR, 1b
	 inc	6*8, %l1
2:
	inc	6*8, %l2				! Undo the pre-decrement
1:
	deccc	8, %l2
	bl,pn	%icc, 1f				! < 0 --> sub word
	 nop
	ldx	[%l0], %g6
	inc	8, %l0
	stx	%g6, [%l1]
	bg,pt	%icc, 1b				! Exactly 0 --> done
	 inc	8, %l1
1:
	btst	7, %l2					! Done?
	bz,pt	CCCR, Lmemcpy_complete
	 clr	%l4					! No shift for finish code
	ldx	[%l0], %o0				! Final partial doubleword
 | |
Lmemcpy_finish:
	!! Store the final 1-8 bytes.  %o0 holds the remaining data
	!! left-justified (SPARC is big-endian, so the high-order bytes
	!! go out first); %l2 is the remaining byte count.  Peel off a
	!! word, a halfword, and a byte as the bits of %l2 dictate.

	brz,pn	%l2, 2f					! 100% complete?
	 cmp	%l2, 8					! Exactly 8 bytes?
	bz,a,pn	CCCR, 2f
	 stx	%o0, [%l1]

	btst	4, %l2					! Word store?
	bz	CCCR, 1f
	 srlx	%o0, 32, %g6				! Shift high word down
	stw	%g6, [%l1]
	inc	4, %l1
	mov	%o0, %g6				! Operate on the low bits
1:
	btst	2, %l2
	mov	%g6, %o0
	bz	1f
	 srlx	%o0, 16, %g6

	sth	%g6, [%l1]				! Store short
	inc	2, %l1
	mov	%o0, %g6				! Operate on low bytes
1:
	mov	%g6, %o0
	btst	1, %l2					! Byte aligned?
	bz	2f
	 srlx	%o0, 8, %g6

	stb	%g6, [%l1]				! Store last byte
	inc	1, %l1					! Update address
2:
Lmemcpy_complete:
#if 0
	!!
	!! verify copy success.
	!! (Debug-only byte-by-byte comparison of src and dst; traps
	!! with a diagnostic printf on the first mismatch.)
	!!

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	 dec	%l4
	brnz	%l4, 0b
	 nop
	ba	2f
	 nop

1:
	set	0f, %o0
	call	printf
	 sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o2
	mov	%i1, %o1
	call	printf
	 mov	%i2, %o3
	ta	1
	.data
0:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz	"memcpy(%p, %p, %lx)\n"
	.align 8
	.text
2:
#endif
	ret						! restore window and
	 restore %i1, %g0, %o0				!  return original dst
 | |
| 
 | |
| #ifdef USE_BLOCK_STORE_LOAD
 | |
| 
 | |
| /*
 | |
|  * Block copy.  Useful for >256 byte copies.
 | |
|  *
 | |
|  * Benchmarking has shown this always seems to be slower than
 | |
|  * the integer version, so this is disabled.  Maybe someone will
 | |
|  * figure out why sometime.
 | |
|  */
 | |
| 	
 | |
| Lmemcpy_block:
 | |
| 	sethi	%hi(block_disable), %o3
 | |
| 	ldx	[ %o3 + %lo(block_disable) ], %o3
 | |
| 	brnz,pn	%o3, Lmemcpy_fancy
 | |
| 	!! Make sure our trap table is installed
 | |
| 	set	_C_LABEL(trapbase), %o5
 | |
| 	rdpr	%tba, %o3
 | |
| 	sub	%o3, %o5, %o3
 | |
| 	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
 | |
| 	 nop
 | |
| #if defined(_KERNEL) && !defined(_RUMPKERNEL)
 | |
| /*
 | |
|  * Kernel:
 | |
|  *
 | |
|  * Here we use VIS instructions to do a block clear of a page.
 | |
|  * But before we can do that we need to save and enable the FPU.
 | |
|  * The last owner of the FPU registers is fplwp, and
 | |
|  * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
 | |
|  * null, call savefpstate() with it to store our current fp state.
 | |
|  *
 | |
|  * Next, allocate an aligned fpstate on the stack.  We will properly
 | |
|  * nest calls on a particular stack so this should not be a problem.
 | |
|  *
 | |
|  * Now we grab either curlwp (or if we're on the interrupt stack
 | |
|  * lwp0).  We stash its existing fpstate in a local register and
 | |
|  * put our new fpstate in curlwp->p_md.md_fpstate.  We point
 | |
|  * fplwp at curlwp (or lwp0) and enable the FPU.
 | |
|  *
 | |
|  * If we are ever preempted, our FPU state will be saved in our
 | |
|  * fpstate.  Then, when we're resumed and we take an FPDISABLED
 | |
|  * trap, the trap handler will be able to fish our FPU state out
 | |
|  * of curlwp (or lwp0).
 | |
|  *
 | |
|  * On exiting this routine we undo the damage: restore the original
 | |
|  * pointer to curlwp->p_md.md_fpstate, clear our fplwp, and disable
 | |
|  * the MMU.
 | |
|  *
 | |
|  *
 | |
|  * Register usage, Kernel only (after save):
 | |
|  *
 | |
|  * %i0		src
 | |
|  * %i1		dest
 | |
|  * %i2		size
 | |
|  *
 | |
|  * %l0		XXXX DEBUG old fpstate
 | |
|  * %l1		fplwp (hi bits only)
 | |
|  * %l2		orig fplwp
 | |
|  * %l3		orig fpstate
 | |
|  * %l5		curlwp
 | |
|  * %l6		old fpstate
 | |
|  *
 | |
|  * Register usage, Kernel and user:
 | |
|  *
 | |
|  * %g1		src (retval for memcpy)
 | |
|  *
 | |
|  * %o0		src
 | |
|  * %o1		dest
 | |
|  * %o2		end dest
 | |
|  * %o5		last safe fetchable address
 | |
|  */
 | |
| 
 | |
| 	ENABLE_FPU(0)
 | |
| 
 | |
| 	mov	%i0, %o0				! Src addr.
 | |
| 	mov	%i1, %o1				! Store our dest ptr here.
 | |
| 	mov	%i2, %o2				! Len counter
 | |
| #endif	/* _KERNEL */
 | |
| 
 | |
| 	!!
 | |
| 	!! First align the output to a 64-bit entity
 | |
| 	!! 
 | |
| 
 | |
| 	mov	%o1, %g1				! memcpy retval
 | |
| 	add	%o0, %o2, %o5				! End of source block
 | |
| 
 | |
| 	andn	%o0, 7, %o3				! Start of block
 | |
| 	dec	%o5
 | |
| 	fzero	%f0
 | |
| 
 | |
| 	andn	%o5, BLOCK_ALIGN, %o5			! Last safe addr.
 | |
| 	ldd	[%o3], %f2				! Load 1st word
 | |
| 
 | |
| 	dec	8, %o3					! Move %o3 1 word back
 | |
| 	btst	1, %o1
 | |
| 	bz	4f
 | |
| 	
 | |
| 	 mov	-7, %o4					! Lowest src addr possible
 | |
| 	alignaddr %o0, %o4, %o4				! Base addr for load.
 | |
| 
 | |
| 	cmp	%o3, %o4
 | |
| 	be,pt	CCCR, 1f				! Already loaded?
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! No. Shift
 | |
| 	ldd	[%o3+8], %f2				! And load
 | |
| 1:	
 | |
| 
 | |
| 	faligndata	%f0, %f2, %f4			! Isolate 1st byte
 | |
| 
 | |
| 	stda	%f4, [%o1] ASI_FL8_P			! Store 1st byte
 | |
| 	inc	1, %o1					! Update address
 | |
| 	inc	1, %o0
 | |
| 	dec	1, %o2
 | |
| 4:	
 | |
| 	btst	2, %o1
 | |
| 	bz	4f
 | |
| 
 | |
| 	 mov	-6, %o4					! Calculate src - 6
 | |
| 	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
 | |
| 
 | |
| 	cmp	%o3, %o4				! Addresses same?
 | |
| 	be,pt	CCCR, 1f
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! Shuffle data
 | |
| 	ldd	[%o3+8], %f2				! Load word 0
 | |
| 1:	
 | |
| 	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
 | |
| 
 | |
| 	stda	%f4, [%o1] ASI_FL16_P			! Store 1st short
 | |
| 	dec	2, %o2
 | |
| 	inc	2, %o1
 | |
| 	inc	2, %o0
 | |
| 4:
 | |
| 	brz,pn	%o2, Lmemcpy_blockfinish			! XXXX
 | |
| 
 | |
| 	 btst	4, %o1
 | |
| 	bz	4f
 | |
| 
 | |
| 	mov	-4, %o4
 | |
| 	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
 | |
| 
 | |
| 	cmp	%o3, %o4				! Addresses same?
 | |
| 	beq,pt	CCCR, 1f
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! Shuffle data
 | |
| 	ldd	[%o3+8], %f2				! Load word 0
 | |
| 1:	
 | |
| 	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
 | |
| 
 | |
| 	st	%f5, [%o1]				! Store word
 | |
| 	dec	4, %o2
 | |
| 	inc	4, %o1
 | |
| 	inc	4, %o0
 | |
| 4:
 | |
| 	brz,pn	%o2, Lmemcpy_blockfinish			! XXXX
 | |
| 	!!
 | |
| 	!! We are now 32-bit aligned in the dest.
 | |
| 	!!
 | |
| Lmemcpy_block_common:	
 | |
| 
 | |
| 	 mov	-0, %o4
 | |
| 	alignaddr %o0, %o4, %o4				! base - shift
 | |
| 
 | |
| 	cmp	%o3, %o4				! Addresses same?
 | |
| 	beq,pt	CCCR, 1f
 | |
| 	 mov	%o4, %o3
 | |
| 	fmovd	%f2, %f0				! Shuffle data
 | |
| 	ldd	[%o3+8], %f2				! Load word 0
 | |
| 1:	
 | |
| 	add	%o3, 8, %o0				! now use %o0 for src
 | |
| 	
 | |
| 	!!
 | |
| 	!! Continue until our dest is block aligned
 | |
| 	!! 
 | |
| Lmemcpy_block_aligned8:	
 | |
| 1:
 | |
| 	brz	%o2, Lmemcpy_blockfinish
 | |
| 	 btst	BLOCK_ALIGN, %o1			! Block aligned?
 | |
| 	bz	1f
 | |
| 	
 | |
| 	 faligndata %f0, %f2, %f4			! Generate result
 | |
| 	deccc	8, %o2
 | |
| 	ble,pn	%icc, Lmemcpy_blockfinish		! Should never happen
 | |
| 	 fmovd	%f4, %f48
 | |
| 	
 | |
| 	std	%f4, [%o1]				! Store result
 | |
| 	inc	8, %o1
 | |
| 	
 | |
| 	fmovd	%f2, %f0
 | |
| 	inc	8, %o0
 | |
| 	ba,pt	%xcc, 1b				! Not yet.
 | |
| 	 ldd	[%o0], %f2				! Load next part
 | |
| Lmemcpy_block_aligned64:	
 | |
| 1:
 | |
| 
 | |
| /*
 | |
|  * 64-byte aligned -- ready for block operations.
 | |
|  *
 | |
|  * Here we have the destination block aligned, but the
 | |
|  * source pointer may not be.  Sub-word alignment will
 | |
|  * be handled by faligndata instructions.  But the source
 | |
|  * can still be potentially aligned to 8 different words
 | |
|  * in our 64-bit block, so we have 8 different copy routines.
 | |
|  *
 | |
|  * Once we figure out our source alignment, we branch
 | |
|  * to the appropriate copy routine, which sets up the
 | |
|  * alignment for faligndata and loads (sets) the values
 | |
|  * into the source registers and does the copy loop.
 | |
|  *
 | |
|  * When we're down to less than 1 block to store, we
 | |
|  * exit the copy loop and execute cleanup code.
 | |
|  *
 | |
|  * Block loads and stores are not properly interlocked.
 | |
|  * Stores save one reg/cycle, so you can start overwriting
 | |
|  * registers the cycle after the store is issued.  
 | |
|  * 
 | |
|  * Block loads require a block load to a different register
 | |
|  * block or a membar #Sync before accessing the loaded
 | |
|  * data.
 | |
|  *	
 | |
|  * Since the faligndata instructions may be offset as far
 | |
|  * as 7 registers into a block (if you are shifting source 
 | |
|  * 7 -> dest 0), you need 3 source register blocks for full 
 | |
|  * performance: one you are copying, one you are loading, 
 | |
|  * and one for interlocking.  Otherwise, we would need to
 | |
|  * sprinkle the code with membar #Sync and lose the advantage
 | |
|  * of running faligndata in parallel with block stores.  This 
 | |
|  * means we are fetching a full 128 bytes ahead of the stores.  
 | |
|  * We need to make sure the prefetch does not inadvertently 
 | |
|  * cross a page boundary and fault on data that we will never 
 | |
|  * store.
 | |
|  *
 | |
|  */
 | |
| #if 1
 | |
| 	and	%o0, BLOCK_ALIGN, %o3
 | |
| 	srax	%o3, 3, %o3				! Isolate the offset
 | |
| 
 | |
| 	brz	%o3, L100				! 0->0
 | |
| 	 btst	4, %o3
 | |
| 	bnz	%xcc, 4f
 | |
| 	 btst	2, %o3
 | |
| 	bnz	%xcc, 2f
 | |
| 	 btst	1, %o3
 | |
| 	ba,pt	%xcc, L101				! 0->1
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| 2:
 | |
| 	bz	%xcc, L102				! 0->2
 | |
| 	 nop
 | |
| 	ba,pt	%xcc, L103				! 0->3
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| 4:	
 | |
| 	bnz	%xcc, 2f
 | |
| 	 btst	1, %o3
 | |
| 	bz	%xcc, L104				! 0->4
 | |
| 	 nop
 | |
| 	ba,pt	%xcc, L105				! 0->5
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| 2:
 | |
| 	bz	%xcc, L106				! 0->6
 | |
| 	 nop
 | |
| 	ba,pt	%xcc, L107				! 0->7
 | |
| 	 nop	/* XXX spitfire bug */
 | |
| #else
 | |
| 
 | |
| 	!!
 | |
| 	!! Isolate the word offset, which just happens to be
 | |
| 	!! the slot in our jump table.
 | |
| 	!!
 | |
| 	!! This is 6 insns, most of which cannot be paired,
 | |
| 	!! which is about the same as the above version.
 | |
| 	!!
 | |
| 	rd	%pc, %o4
 | |
| 1:	
 | |
| 	and	%o0, 0x31, %o3
 | |
| 	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
 | |
| 	jmpl	%o4 + %o3, %g0
 | |
| 	 nop
 | |
| 
 | |
| 	!!
 | |
| 	!! Jump table
 | |
| 	!!
 | |
| 	
 | |
| Lmemcpy_block_jmp:
 | |
| 	ba,a,pt	%xcc, L100
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L101
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L102
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L103
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L104
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L105
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L106
 | |
| 	 nop
 | |
| 	ba,a,pt	%xcc, L107
 | |
| 	 nop
 | |
| #endif
 | |
| 
 | |
| 	!!
 | |
| 	!! Source is block aligned.
 | |
| 	!!
 | |
| 	!! Just load a block and go.
 | |
| 	!!
 | |
| L100:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L100"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0 , %f62
 | |
| 	ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 3f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	ba,pt	%icc, 3f
 | |
| 	 membar #Sync
 | |
| 	
 | |
| 	.align	32					! ICache align.
 | |
| 3:
 | |
| 	faligndata	%f62, %f0, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f0, %f2, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f2, %f4, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f4, %f6, %f38
 | |
| 	faligndata	%f6, %f8, %f40
 | |
| 	faligndata	%f8, %f10, %f42
 | |
| 	faligndata	%f10, %f12, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f12, %f14, %f46
 | |
| 	
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	faligndata	%f14, %f16, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f16, %f18, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f18, %f20, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f20, %f22, %f38
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f22, %f24, %f40
 | |
| 	faligndata	%f24, %f26, %f42
 | |
| 	faligndata	%f26, %f28, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f28, %f30, %f46
 | |
| 	
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	faligndata	%f30, %f48, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f48, %f50, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f50, %f52, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f52, %f54, %f38
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f54, %f56, %f40
 | |
| 	faligndata	%f56, %f58, %f42
 | |
| 	faligndata	%f58, %f60, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f60, %f62, %f46
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16			! Increment is at top
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 	
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+8
 | |
| 	!!
 | |
| 	!! We need to load almost 1 complete block by hand.
 | |
| 	!! 
 | |
| L101:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L101"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| !	fmovd	%f0, %f0				! Hoist fmovd
 | |
| 	ldd	[%o0], %f2
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f4
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f6
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 3f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 3:	
 | |
| 	faligndata	%f0, %f2, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f2, %f4, %f34
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f4, %f6, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f6, %f8, %f38
 | |
| 	faligndata	%f8, %f10, %f40
 | |
| 	faligndata	%f10, %f12, %f42
 | |
| 	faligndata	%f12, %f14, %f44
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f14, %f16, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	
 | |
| 	faligndata	%f16, %f18, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f20, %f22, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f22, %f24, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f24, %f26, %f40
 | |
| 	faligndata	%f26, %f28, %f42
 | |
| 	faligndata	%f28, %f30, %f44
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f30, %f48, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f48, %f50, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f50, %f52, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f52, %f54, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f54, %f56, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f56, %f58, %f40
 | |
| 	faligndata	%f58, %f60, %f42
 | |
| 	faligndata	%f60, %f62, %f44
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f62, %f0, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+16
 | |
| 	!!
 | |
| 	!! We need to load 6 doubles by hand.
 | |
| 	!! 
 | |
| L102:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L102"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	ldd	[%o0], %f4
 | |
| 	inc	8, %o0
 | |
| 	fmovd	%f0, %f2				! Hoist fmovd
 | |
| 	ldd	[%o0], %f6
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 3f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 3:	
 | |
| 	faligndata	%f2, %f4, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f4, %f6, %f34
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f6, %f8, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f8, %f10, %f38
 | |
| 	faligndata	%f10, %f12, %f40
 | |
| 	faligndata	%f12, %f14, %f42
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f44
 | |
| 
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f16, %f18, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f18, %f20, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f20, %f22, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f22, %f24, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f24, %f26, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f26, %f28, %f40
 | |
| 	faligndata	%f28, %f30, %f42
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f30, %f48, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f48, %f50, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f50, %f52, %f32
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f52, %f54, %f34
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f54, %f56, %f36
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f56, %f58, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f58, %f60, %f40
 | |
| 	faligndata	%f60, %f62, %f42
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f62, %f0, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f0, %f2, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 	
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+24
 | |
| 	!!
 | |
| 	!! We need to load 5 doubles by hand.
 | |
| 	!! 
 | |
| L103:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L103"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f4
 | |
| 	ldd	[%o0], %f6
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	faligndata	%f4, %f6, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f6, %f8, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f8, %f10, %f36
 | |
| 	faligndata	%f10, %f12, %f38
 | |
| 	faligndata	%f12, %f14, %f40
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f16, %f18, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f18, %f20, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f20, %f22, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f22, %f24, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f24, %f26, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f26, %f28, %f38
 | |
| 	faligndata	%f28, %f30, %f40
 | |
| 	ble,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f30, %f48, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f48, %f50, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f50, %f52, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f52, %f54, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f54, %f56, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f56, %f58, %f36
 | |
| 	faligndata	%f58, %f60, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f60, %f62, %f40
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f62, %f0, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f0, %f2, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f2, %f4, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+32
 | |
| 	!!
 | |
| 	!! We need to load 4 doubles by hand.
 | |
| 	!! 
 | |
| L104:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L104"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f6
 | |
| 	ldd	[%o0], %f8
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	faligndata	%f6, %f8, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f8, %f10, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f10, %f12, %f36
 | |
| 	faligndata	%f12, %f14, %f38
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f40
 | |
| 	faligndata	%f16, %f18, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f20, %f22, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f22, %f24, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f24, %f26, %f34
 | |
| 	faligndata	%f26, %f28, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f28, %f30, %f38
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f30, %f48, %f40
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f48, %f50, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f50, %f52, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f52, %f54, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f54, %f56, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f56, %f58, %f34
 | |
| 	faligndata	%f58, %f60, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f60, %f62, %f38
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:	
 | |
| 	faligndata	%f62, %f0, %f40
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f0, %f2, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f2, %f4, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f4, %f6, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+40
 | |
| 	!!
 | |
| 	!! We need to load 3 doubles by hand.
 | |
| 	!! 
 | |
| L105:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L105"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f8
 | |
| 	ldd	[%o0], %f10
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	faligndata	%f8, %f10, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f10, %f12, %f34
 | |
| 	faligndata	%f12, %f14, %f36
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f38
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f16, %f18, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f42
 | |
| 	faligndata	%f20, %f22, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f22, %f24, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f24, %f26, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f26, %f28, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f28, %f30, %f36
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f30, %f48, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f48, %f50, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f50, %f52, %f42
 | |
| 	faligndata	%f52, %f54, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f54, %f56, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f56, %f58, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f58, %f60, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f60, %f62, %f36
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f62, %f0, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f0, %f2, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f2, %f4, %f42
 | |
| 	faligndata	%f4, %f6, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f6, %f8, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+48
 | |
| 	!!
 | |
| 	!! We need to load 2 doubles by hand.
 | |
| 	!! 
 | |
| L106:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L106"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f10
 | |
| 	ldd	[%o0], %f12
 | |
| 	inc	8, %o0
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 	
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	faligndata	%f10, %f12, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f12, %f14, %f34
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f16, %f18, %f38
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f40
 | |
| 	faligndata	%f20, %f22, %f42
 | |
| 	faligndata	%f22, %f24, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f24, %f26, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f26, %f28, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f28, %f30, %f34
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f30, %f48, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f48, %f50, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f50, %f52, %f40
 | |
| 	faligndata	%f52, %f54, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f54, %f56, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f56, %f58, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f58, %f60, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	faligndata	%f60, %f62, %f34
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f62, %f0, %f36
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f0, %f2, %f38
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f2, %f4, %f40
 | |
| 	faligndata	%f4, %f6, %f42
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f6, %f8, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f8, %f10, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 
 | |
| 
 | |
| 	!!
 | |
| 	!! Source at BLOCK_ALIGN+56
 | |
| 	!!
 | |
| 	!! We need to load 1 double by hand.
 | |
| 	!! 
 | |
| L107:
 | |
| #ifdef RETURN_NAME
 | |
| 	sethi	%hi(1f), %g1
 | |
| 	ba,pt	%icc, 2f
 | |
| 	 or	%g1, %lo(1f), %g1
 | |
| 1:	
 | |
| 	.asciz	"L107"
 | |
| 	.align	8
 | |
| 2:	
 | |
| #endif
 | |
| 	fmovd	%f0, %f12
 | |
| 	ldd	[%o0], %f14
 | |
| 	inc	8, %o0
 | |
| 
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar #Sync
 | |
| 2:	
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 3:	
 | |
| 	faligndata	%f12, %f14, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f48
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f14, %f16, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f16, %f18, %f36
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f18, %f20, %f38
 | |
| 	faligndata	%f20, %f22, %f40
 | |
| 	faligndata	%f22, %f24, %f42
 | |
| 	faligndata	%f24, %f26, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f26, %f28, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f28, %f30, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f0
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f30, %f48, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f48, %f50, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f50, %f52, %f38
 | |
| 	faligndata	%f52, %f54, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f54, %f56, %f42
 | |
| 	faligndata	%f56, %f58, %f44
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f58, %f60, %f46
 | |
| 	
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 
 | |
| 	faligndata	%f60, %f62, %f32
 | |
| 	cmp	%o0, %o5
 | |
| 	bleu,a,pn	%icc, 2f
 | |
| 	 ldda	[%o0] ASI_BLK_P, %f16
 | |
| 	membar	#Sync
 | |
| 2:
 | |
| 	faligndata	%f62, %f0, %f34
 | |
| 	dec	BLOCK_SIZE, %o2
 | |
| 	faligndata	%f0, %f2, %f36
 | |
| 	inc	BLOCK_SIZE, %o1
 | |
| 	faligndata	%f2, %f4, %f38
 | |
| 	faligndata	%f4, %f6, %f40
 | |
| 	inc	BLOCK_SIZE, %o0
 | |
| 	faligndata	%f6, %f8, %f42
 | |
| 	faligndata	%f8, %f10, %f44
 | |
| 
 | |
| 	brlez,pn	%o2, Lmemcpy_blockdone
 | |
| 	 faligndata	%f10, %f12, %f46
 | |
| 
 | |
| 	stda	%f32, [%o1] ASI_STORE
 | |
| 	ba	3b
 | |
| 	 inc	BLOCK_SIZE, %o1
 | |
| 	
 | |
| Lmemcpy_blockdone:
 | |
| 	inc	BLOCK_SIZE, %o2				! Fixup our overcommit
 | |
| 	membar	#Sync					! Finish any pending loads
 | |
| #define	FINISH_REG(f)				\
 | |
| 	deccc	8, %o2;				\
 | |
| 	bl,a	Lmemcpy_blockfinish;		\
 | |
| 	 fmovd	f, %f48;			\
 | |
| 	std	f, [%o1];			\
 | |
| 	inc	8, %o1
 | |
| 
 | |
| 	FINISH_REG(%f32)
 | |
| 	FINISH_REG(%f34)
 | |
| 	FINISH_REG(%f36)
 | |
| 	FINISH_REG(%f38)
 | |
| 	FINISH_REG(%f40)
 | |
| 	FINISH_REG(%f42)
 | |
| 	FINISH_REG(%f44)
 | |
| 	FINISH_REG(%f46)
 | |
| 	FINISH_REG(%f48)
 | |
| #undef FINISH_REG
 | |
| 	!! 
 | |
| 	!! The low 3 bits have the sub-word bits needed to be
 | |
| 	!! stored [because (x-8)&0x7 == x].
 | |
| 	!!
 | |
| Lmemcpy_blockfinish:
 | |
| 	brz,pn	%o2, 2f					! 100% complete?
 | |
| 	 fmovd	%f48, %f4
 | |
| 	cmp	%o2, 8					! Exactly 8 bytes?
 | |
| 	bz,a,pn	CCCR, 2f
 | |
| 	 std	%f4, [%o1]
 | |
| 
 | |
| 	btst	4, %o2					! Word store?
 | |
| 	bz	CCCR, 1f
 | |
| 	 nop
 | |
| 	st	%f4, [%o1]
 | |
| 	inc	4, %o1
 | |
| 1:
 | |
| 	btst	2, %o2
 | |
| 	fzero	%f0
 | |
| 	bz	1f
 | |
| 
 | |
| 	 mov	-6, %o4
 | |
| 	alignaddr %o1, %o4, %g0
 | |
| 
 | |
| 	faligndata %f0, %f4, %f8
 | |
| 	
 | |
| 	stda	%f8, [%o1] ASI_FL16_P			! Store short
 | |
| 	inc	2, %o1
 | |
| 1:
 | |
| 	btst	1, %o2					! Byte aligned?
 | |
| 	bz	2f
 | |
| 
 | |
| 	 mov	-7, %o0					! Calculate dest - 7
 | |
| 	alignaddr %o1, %o0, %g0				! Calculate shift mask and dest.
 | |
| 
 | |
| 	faligndata %f0, %f4, %f8			! Move 1st byte to low part of f8
 | |
| 
 | |
| 	stda	%f8, [%o1] ASI_FL8_P			! Store 1st byte
 | |
| 	inc	1, %o1					! Update address
 | |
| 2:
 | |
| 	membar	#Sync
 | |
#if 0
	!!
	!! verify copy success.
	!!
	!! Debug-only (compiled out): byte-compare src (%i0) against dst
	!! (%i1) for %i2 bytes; on mismatch, disable block copies, print
	!! diagnostics via prom_printf, and trap.

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f					! bytes differ -> report
	 dec	%l4
	brnz	%l4, 0b
	 nop
	ba	2f					! all bytes matched
	 nop

1:
	set	block_disable, %o0			! non-zero => no more block insns
	stx	%o0, [%o0]

	set	0f, %o0
	call	prom_printf
	 sub	%i2, %l4, %o5				! byte offset of the mismatch
	set	1f, %o0
	mov	%i0, %o2
	mov	%i1, %o1
	call	prom_printf
	 mov	%i2, %o3
	ta	1					! drop into the debugger
	.data
	_ALIGN
0:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"memcpy(%p, %p, %lx)\r\n"
	_ALIGN
	.text
2:
#endif
 | |
#if defined(_KERNEL) && !defined(_RUMPKERNEL)

/*
 * We've saved our possible fpstate, now disable the fpu
 * and continue with life.
 */
	RESTORE_FPU
	ret
	 restore	%g1, 0, %o0			! Return DEST for memcpy
#endif
	retl
	 mov	%g1, %o0				! Userland: plain return, DEST in %o0
 | |
/*
 * Use block_disable to turn off block insns for
 * memcpy/memset
 */
	.data
	.align	8
	.globl	block_disable
block_disable:	.xword	1			! non-zero disables block insns
	.text
#endif	/* USE_BLOCK_STORE_LOAD */
 |