This patch imports the unmodified current version of NetBSD libc. The NetBSD includes are in /nbsd_include, while the libc code itself is split between lib/nbsd_libc and common/lib/libc.
		
			
				
	
	
		
			1959 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			1959 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/*	$NetBSD: memcpy.S,v 1.2 2001/08/01 05:52:12 eeh Exp $	*/
 | 
						|
 | 
						|
/*
 | 
						|
 * Copyright (c) 2001	Eduardo E. Horvath
 | 
						|
 *
 | 
						|
 * This software was developed by the Computer Systems Engineering group
 | 
						|
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 | 
						|
 * contributed to Berkeley.
 | 
						|
 *
 | 
						|
 * Redistribution and use in source and binary forms, with or without
 | 
						|
 * modification, are permitted provided that the following conditions
 | 
						|
 * are met:
 | 
						|
 * 1. Redistributions of source code must retain the above copyright
 | 
						|
 *    notice, this list of conditions and the following disclaimer.
 | 
						|
 * 2. Redistributions in binary form must reproduce the above copyright
 | 
						|
 *    notice, this list of conditions and the following disclaimer in the
 | 
						|
 *    documentation and/or other materials provided with the distribution.
 | 
						|
 * 3. All advertising materials mentioning features or use of this software
 | 
						|
 *    must display the following acknowledgement:
 | 
						|
 *	This product includes software developed by the University of
 | 
						|
 *	California, Berkeley and its contributors.
 | 
						|
 * 4. Neither the name of the University nor the names of its contributors
 | 
						|
 *    may be used to endorse or promote products derived from this software
 | 
						|
 *    without specific prior written permission.
 | 
						|
 *
 | 
						|
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 | 
						|
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
						|
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
						|
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 | 
						|
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
						|
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 | 
						|
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 | 
						|
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 | 
						|
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 | 
						|
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 | 
						|
 * SUCH DAMAGE.
 | 
						|
 *
 | 
						|
 */
 | 
						|
 | 
						|
#include <machine/asm.h>
 | 
						|
#ifndef _LOCORE
 | 
						|
#define _LOCORE
 | 
						|
#endif
 | 
						|
#include <machine/ctlreg.h>
 | 
						|
#include <machine/frame.h>
 | 
						|
#include <machine/psl.h>
 | 
						|
 | 
						|
#if defined(LIBC_SCCS) && !defined(lint)
 | 
						|
	RCSID("$NetBSD: memcpy.S,v 1.2 2001/08/01 05:52:12 eeh Exp $")
 | 
						|
#endif  /* LIBC_SCCS and not lint */
 | 
						|
 | 
						|
#define	EMPTY	nop
 | 
						|
#define	NOTREACHED	ta	1
 | 
						|
 | 
						|
#define	BCOPY_SMALL	16
 | 
						|
#define	BLOCK_SIZE	64
 | 
						|
 | 
						|
#if 0
 | 
						|
#define ASI_STORE	ASI_BLK_COMMIT_P
 | 
						|
#else
 | 
						|
#define ASI_STORE	ASI_BLK_P
 | 
						|
#endif
 | 
						|
 | 
						|
#if 1
 | 
						|
/*
 | 
						|
 * kernel bcopy/memcpy
 | 
						|
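 * Regions with src <= dest < src+len are copied backwards, a byte at a
 * time (Lovbcopy).  The small, overlapping and integer-copy exit paths
 * return the destination pointer in %o0 (the memcpy return value).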
 | 
						|
 *
 | 
						|
 * Must not use %g7 (see copyin/copyout above).
 | 
						|
 */
 | 
						|
ENTRY(memcpy) /* dest, src, size */
 | 
						|
	/*
 | 
						|
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
 | 
						|
	 * structure assignments.
 | 
						|
	 */
 | 
						|
	mov	%o0, %o3
 | 
						|
	mov	%o1, %o0
 | 
						|
	mov	%o3, %o1
 | 
						|
#endif
 | 
						|
ENTRY(bcopy) /* src, dest, size */
 | 
						|
#ifdef DEBUG
 | 
						|
	set	pmapdebug, %o4
 | 
						|
	ld	[%o4], %o4
 | 
						|
	btst	0x80, %o4	! PDB_COPY
 | 
						|
	bz,pt	%icc, 3f
 | 
						|
	 nop
 | 
						|
	save	%sp, -CC64FSZ, %sp
 | 
						|
	mov	%i0, %o1
 | 
						|
	set	2f, %o0
 | 
						|
	mov	%i1, %o2
 | 
						|
	call	printf
 | 
						|
	 mov	%i2, %o3
 | 
						|
!	ta	1; nop
 | 
						|
	restore
 | 
						|
	.data
 | 
						|
2:	.asciz	"bcopy(%p->%p,%x)\n"
 | 
						|
	_ALIGN
 | 
						|
	.text
 | 
						|
3:
 | 
						|
#endif
 | 
						|
	/*
 | 
						|
	 * Check for overlaps and punt.
 | 
						|
	 *
 | 
						|
	 * If src <= dest < src+len we have a problem.
 | 
						|
	 */
 | 
						|
 | 
						|
	sub	%o1, %o0, %o3
 | 
						|
 | 
						|
	cmp	%o3, %o2
 | 
						|
	blu,pn	%xcc, Lovbcopy
 | 
						|
	 cmp	%o2, BCOPY_SMALL
 | 
						|
Lbcopy_start:
 | 
						|
	bge,pt	%xcc, 2f	! if >= this many, go be fancy.
 | 
						|
	 cmp	%o2, 256
 | 
						|
 | 
						|
	mov	%o1, %o5	! Save memcpy return value
 | 
						|
	/*
 | 
						|
	 * Not much to copy, just do it a byte at a time.
 | 
						|
	 */
 | 
						|
	deccc	%o2		! while (--len >= 0)
 | 
						|
	bl	1f
 | 
						|
	 EMPTY
 | 
						|
0:
 | 
						|
	inc	%o0
 | 
						|
	ldsb	[%o0 - 1], %o4	!	(++dst)[-1] = *src++;
 | 
						|
	stb	%o4, [%o1]
 | 
						|
	deccc	%o2
 | 
						|
	bge	0b
 | 
						|
	 inc	%o1
 | 
						|
1:
 | 
						|
	retl
 | 
						|
	 mov	%o5, %o0
 | 
						|
	NOTREACHED
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Overlapping bcopies -- punt.
 | 
						|
	 */
 | 
						|
Lovbcopy:
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Since src comes before dst, and the regions might overlap,
 | 
						|
	 * we have to do the copy starting at the end and working backwards.
 | 
						|
	 *
 | 
						|
	 * We could optimize this, but it almost never happens.
 | 
						|
	 */
 | 
						|
	mov	%o1, %o5	! Retval
 | 
						|
	add	%o2, %o0, %o0	! src += len
 | 
						|
	add	%o2, %o1, %o1	! dst += len
 | 
						|
	
 | 
						|
	deccc	%o2
 | 
						|
	bl,pn	%xcc, 1f
 | 
						|
	 dec	%o0
 | 
						|
0:
 | 
						|
	dec	%o1
 | 
						|
	ldsb	[%o0], %o4
 | 
						|
	dec	%o0
 | 
						|
	
 | 
						|
	deccc	%o2
 | 
						|
	bge,pt	%xcc, 0b
 | 
						|
	 stb	%o4, [%o1]
 | 
						|
1:
 | 
						|
	retl
 | 
						|
	 mov	%o5, %o0
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Plenty of data to copy, so try to do it optimally.
 | 
						|
	 */
 | 
						|
2:
 | 
						|
#if 1
 | 
						|
	! If it is big enough, use VIS instructions
 | 
						|
	bge	Lbcopy_block
 | 
						|
	 nop
 | 
						|
#endif
 | 
						|
Lbcopy_fancy:
 | 
						|
 | 
						|
	!!
 | 
						|
	!! First align the output to an 8-byte entity
 | 
						|
	!! 
 | 
						|
 | 
						|
	save	%sp, -CC64FSZ, %sp
 | 
						|
	
 | 
						|
	mov	%i0, %o0
 | 
						|
	mov	%i1, %o1
 | 
						|
	
 | 
						|
	mov	%i2, %o2
 | 
						|
	btst	1, %o1
 | 
						|
	
 | 
						|
	bz,pt	%icc, 4f
 | 
						|
	 btst	2, %o1
 | 
						|
	ldub	[%o0], %o4				! Load 1st byte
 | 
						|
	
 | 
						|
	deccc	1, %o2
 | 
						|
	ble,pn	%xcc, Lbcopy_finish			! XXXX
 | 
						|
	 inc	1, %o0
 | 
						|
	
 | 
						|
	stb	%o4, [%o1]				! Store 1st byte
 | 
						|
	inc	1, %o1					! Update address
 | 
						|
	btst	2, %o1
 | 
						|
4:	
 | 
						|
	bz,pt	%icc, 4f
 | 
						|
	
 | 
						|
	 btst	1, %o0
 | 
						|
	bz,a	1f
 | 
						|
	 lduh	[%o0], %o4				! Load short
 | 
						|
 | 
						|
	ldub	[%o0], %o4				! Load bytes
 | 
						|
	
 | 
						|
	ldub	[%o0+1], %o3
 | 
						|
	sllx	%o4, 8, %o4
 | 
						|
	or	%o3, %o4, %o4
 | 
						|
	
 | 
						|
1:	
 | 
						|
	deccc	2, %o2
 | 
						|
	ble,pn	%xcc, Lbcopy_finish			! XXXX
 | 
						|
	 inc	2, %o0
 | 
						|
	sth	%o4, [%o1]				! Store 1st short
 | 
						|
	
 | 
						|
	inc	2, %o1
 | 
						|
4:
 | 
						|
	btst	4, %o1
 | 
						|
	bz,pt	%xcc, 4f
 | 
						|
	
 | 
						|
	 btst	3, %o0
 | 
						|
	bz,a,pt	%xcc, 1f
 | 
						|
	 lduw	[%o0], %o4				! Load word -1
 | 
						|
 | 
						|
	btst	1, %o0
 | 
						|
	bz,a,pt	%icc, 2f
 | 
						|
	 lduh	[%o0], %o4
 | 
						|
	
 | 
						|
	ldub	[%o0], %o4
 | 
						|
	
 | 
						|
	lduh	[%o0+1], %o3
 | 
						|
	sllx	%o4, 16, %o4
 | 
						|
	or	%o4, %o3, %o4
 | 
						|
	
 | 
						|
	ldub	[%o0+3], %o3
 | 
						|
	sllx	%o4, 8, %o4
 | 
						|
	ba,pt	%icc, 1f
 | 
						|
	 or	%o4, %o3, %o4
 | 
						|
	
 | 
						|
2:
 | 
						|
	lduh	[%o0+2], %o3
 | 
						|
	sllx	%o4, 16, %o4
 | 
						|
	or	%o4, %o3, %o4
 | 
						|
	
 | 
						|
1:	
 | 
						|
	deccc	4, %o2
 | 
						|
	ble,pn	%xcc, Lbcopy_finish		! XXXX
 | 
						|
	 inc	4, %o0
 | 
						|
	
 | 
						|
	st	%o4, [%o1]				! Store word
 | 
						|
	inc	4, %o1
 | 
						|
4:
 | 
						|
	!!
 | 
						|
	!! We are now 64-bit aligned in the dest.
 | 
						|
	!!
 | 
						|
Lbcopy__common:	
 | 
						|
 | 
						|
	and	%o0, 7, %o4				! Shift amount
 | 
						|
	andn	%o0, 7, %o0				! Source addr
 | 
						|
	
 | 
						|
	brz,pt	%o4, Lbcopy_noshift8			! No shift version...
 | 
						|
 | 
						|
	 sllx	%o4, 3, %o4				! In bits
 | 
						|
	mov	8<<3, %o3
 | 
						|
	
 | 
						|
	ldx	[%o0], %l0				! Load word -1
 | 
						|
	sub	%o3, %o4, %o3				! Reverse shift
 | 
						|
	deccc	16*8, %o2				! Have enough room?
 | 
						|
	
 | 
						|
	sllx	%l0, %o4, %l0
 | 
						|
	bl,pn	%xcc, 2f
 | 
						|
	 and	%o3, 0x38, %o3
 | 
						|
Lbcopy_unrolled8:
 | 
						|
 | 
						|
	/*
 | 
						|
	 * This is about as close to optimal as you can get, since
 | 
						|
	 * the shifts require EU0 and cannot be paired, and you have
 | 
						|
	 * 3 dependent operations on the data.
 | 
						|
	 */ 
 | 
						|
 | 
						|
!	ldx	[%o0+0*8], %l0				! Already done
 | 
						|
!	sllx	%l0, %o4, %l0				! Already done
 | 
						|
	ldx	[%o0+1*8], %l1
 | 
						|
	ldx	[%o0+2*8], %l2
 | 
						|
	ldx	[%o0+3*8], %l3
 | 
						|
	ldx	[%o0+4*8], %l4
 | 
						|
	ldx	[%o0+5*8], %l5
 | 
						|
	ldx	[%o0+6*8], %l6
 | 
						|
#if 1
 | 
						|
	ba,pt	%icc, 1f
 | 
						|
	 ldx	[%o0+7*8], %l7
 | 
						|
	.align	8
 | 
						|
1:
 | 
						|
	srlx	%l1, %o3, %g1
 | 
						|
	inc	8*8, %o0
 | 
						|
	
 | 
						|
	sllx	%l1, %o4, %l1
 | 
						|
	or	%g1, %l0, %o5
 | 
						|
	ldx	[%o0+0*8], %l0
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+0*8]
 | 
						|
	srlx	%l2, %o3, %g1
 | 
						|
 | 
						|
	sllx	%l2, %o4, %l2
 | 
						|
	or	%g1, %l1, %o5
 | 
						|
	ldx	[%o0+1*8], %l1
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+1*8]
 | 
						|
	srlx	%l3, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l3, %o4, %l3
 | 
						|
	or	%g1, %l2, %o5
 | 
						|
	ldx	[%o0+2*8], %l2
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+2*8]
 | 
						|
	srlx	%l4, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l4, %o4, %l4	
 | 
						|
	or	%g1, %l3, %o5
 | 
						|
	ldx	[%o0+3*8], %l3
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+3*8]
 | 
						|
	srlx	%l5, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l5, %o4, %l5
 | 
						|
	or	%g1, %l4, %o5
 | 
						|
	ldx	[%o0+4*8], %l4
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+4*8]
 | 
						|
	srlx	%l6, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l6, %o4, %l6
 | 
						|
	or	%g1, %l5, %o5
 | 
						|
	ldx	[%o0+5*8], %l5
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+5*8]
 | 
						|
	srlx	%l7, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l7, %o4, %l7
 | 
						|
	or	%g1, %l6, %o5
 | 
						|
	ldx	[%o0+6*8], %l6
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+6*8]
 | 
						|
	srlx	%l0, %o3, %g1
 | 
						|
	deccc	8*8, %o2				! Have enough room?
 | 
						|
	
 | 
						|
	sllx	%l0, %o4, %l0				! Next loop
 | 
						|
	or	%g1, %l7, %o5
 | 
						|
	ldx	[%o0+7*8], %l7
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+7*8]
 | 
						|
	bge,pt	%xcc, 1b
 | 
						|
	 inc	8*8, %o1
 | 
						|
 | 
						|
Lbcopy_unrolled8_cleanup:	
 | 
						|
	!!
 | 
						|
	!! Finished 8 byte block, unload the regs.
 | 
						|
	!! 
 | 
						|
	srlx	%l1, %o3, %g1
 | 
						|
	inc	7*8, %o0
 | 
						|
	
 | 
						|
	sllx	%l1, %o4, %l1
 | 
						|
	or	%g1, %l0, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+0*8]
 | 
						|
	srlx	%l2, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l2, %o4, %l2
 | 
						|
	or	%g1, %l1, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+1*8]
 | 
						|
	srlx	%l3, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l3, %o4, %l3
 | 
						|
	or	%g1, %l2, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+2*8]
 | 
						|
	srlx	%l4, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l4, %o4, %l4	
 | 
						|
	or	%g1, %l3, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+3*8]
 | 
						|
	srlx	%l5, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l5, %o4, %l5
 | 
						|
	or	%g1, %l4, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+4*8]
 | 
						|
	srlx	%l6, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l6, %o4, %l6
 | 
						|
	or	%g1, %l5, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+5*8]
 | 
						|
	srlx	%l7, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l7, %o4, %l7
 | 
						|
	or	%g1, %l6, %o5
 | 
						|
		
 | 
						|
	stx	%o5, [%o1+6*8]
 | 
						|
	inc	7*8, %o1
 | 
						|
	
 | 
						|
	mov	%l7, %l0				! Save our unused data
 | 
						|
	dec	7*8, %o2
 | 
						|
#else
 | 
						|
	/*
 | 
						|
	 * This version also handles aligned copies at almost the
 | 
						|
	 * same speed.  It should take the same number of cycles
 | 
						|
	 * as the previous version, but is slightly slower, probably
 | 
						|
	 * due to i$ issues.
 | 
						|
	 */
 | 
						|
	ldx	[%o0+7*8], %l7
 | 
						|
	ba,pt	%icc, 1f
 | 
						|
	 clr	%g1
 | 
						|
	.align 32
 | 
						|
1:
 | 
						|
	srlx	%l1, %o3, %g1
 | 
						|
	bz,pn	%xcc, 3f
 | 
						|
	 inc	8*8, %o0
 | 
						|
 | 
						|
	sllx	%l1, %o4, %l1
 | 
						|
	or	%g1, %l0, %o5
 | 
						|
	ba,pt	%icc, 4f
 | 
						|
	ldx	[%o0+0*8], %l0
 | 
						|
	
 | 
						|
	nop
 | 
						|
3:
 | 
						|
	mov	%l0, %o5
 | 
						|
	ldx	[%o0+0*8], %l0
 | 
						|
	
 | 
						|
4:	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+0*8]
 | 
						|
	srlx	%l2, %o3, %g1
 | 
						|
 | 
						|
	sllx	%l2, %o4, %l2
 | 
						|
3:	
 | 
						|
	or	%g1, %l1, %o5
 | 
						|
	ldx	[%o0+1*8], %l1
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+1*8]
 | 
						|
	srlx	%l3, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l3, %o4, %l3
 | 
						|
3:	
 | 
						|
	or	%g1, %l2, %o5
 | 
						|
	ldx	[%o0+2*8], %l2
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+2*8]
 | 
						|
	srlx	%l4, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l4, %o4, %l4
 | 
						|
3:	
 | 
						|
	or	%g1, %l3, %o5
 | 
						|
	ldx	[%o0+3*8], %l3
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+3*8]
 | 
						|
	srlx	%l5, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l5, %o4, %l5
 | 
						|
3:	
 | 
						|
	or	%g1, %l4, %o5
 | 
						|
	ldx	[%o0+4*8], %l4
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+4*8]
 | 
						|
	srlx	%l6, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l6, %o4, %l6
 | 
						|
3:	
 | 
						|
	or	%g1, %l5, %o5
 | 
						|
	ldx	[%o0+5*8], %l5
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+5*8]
 | 
						|
	srlx	%l7, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l7, %o4, %l7
 | 
						|
3:	
 | 
						|
	or	%g1, %l6, %o5
 | 
						|
	ldx	[%o0+6*8], %l6
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+6*8]
 | 
						|
	srlx	%l0, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l0, %o4, %l0				! Next loop
 | 
						|
3:	
 | 
						|
	or	%g1, %l7, %o5
 | 
						|
	ldx	[%o0+7*8], %l7
 | 
						|
	deccc	8*8, %o2				! Have enough room?
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+7*8]
 | 
						|
	inc	8*8, %o1
 | 
						|
	bge,pt	%xcc, 1b
 | 
						|
	 tst	%o4
 | 
						|
 | 
						|
 | 
						|
	!!
 | 
						|
	!! Now unload all those regs
 | 
						|
	!! 
 | 
						|
Lbcopy_unrolled8_cleanup:	
 | 
						|
	srlx	%l1, %o3, %g1
 | 
						|
	bz,pn	%xcc, 3f
 | 
						|
	 inc	7*8, %o0				! Point at the last load
 | 
						|
 | 
						|
	sllx	%l1, %o4, %l1
 | 
						|
	ba,pt	%icc, 4f
 | 
						|
	 or	%g1, %l0, %o5
 | 
						|
	
 | 
						|
3:
 | 
						|
	mov	%l0, %o5
 | 
						|
	
 | 
						|
4:	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+0*8]
 | 
						|
	srlx	%l2, %o3, %g1
 | 
						|
 | 
						|
	sllx	%l2, %o4, %l2
 | 
						|
3:	
 | 
						|
	or	%g1, %l1, %o5
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+1*8]
 | 
						|
	srlx	%l3, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l3, %o4, %l3
 | 
						|
3:	
 | 
						|
	or	%g1, %l2, %o5
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+2*8]
 | 
						|
	srlx	%l4, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l4, %o4, %l4
 | 
						|
3:	
 | 
						|
	or	%g1, %l3, %o5
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+3*8]
 | 
						|
	srlx	%l5, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l5, %o4, %l5
 | 
						|
3:	
 | 
						|
	or	%g1, %l4, %o5
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+4*8]
 | 
						|
	srlx	%l6, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l6, %o4, %l6
 | 
						|
3:	
 | 
						|
	or	%g1, %l5, %o5
 | 
						|
	
 | 
						|
	bz,pn	%icc, 3f
 | 
						|
	stx	%o5, [%o1+5*8]
 | 
						|
	srlx	%l7, %o3, %g1
 | 
						|
	
 | 
						|
	sllx	%l7, %o4, %l7
 | 
						|
3:	
 | 
						|
	or	%g1, %l6, %o5
 | 
						|
	mov	%l7, %l0				! Shuffle to %l0
 | 
						|
	
 | 
						|
	stx	%o5, [%o1+6*8]
 | 
						|
	or	%g1, %l7, %o5
 | 
						|
	dec	7*8, %o2
 | 
						|
	
 | 
						|
	inc	7*8, %o1				! Point at last store
 | 
						|
#endif
 | 
						|
2:
 | 
						|
	inccc	16*8, %o2
 | 
						|
	bz,pn	%icc, Lbcopy_complete
 | 
						|
	
 | 
						|
	!! Unrolled 8 times
 | 
						|
Lbcopy_aligned8:	
 | 
						|
!	ldx	[%o0], %l0				! Already done
 | 
						|
!	sllx	%l0, %o4, %l0				! Shift high word
 | 
						|
	
 | 
						|
	 deccc	8, %o2					! Pre-decrement
 | 
						|
	bl,pn	%xcc, Lbcopy_finish
 | 
						|
1:
 | 
						|
	ldx	[%o0+8], %l1				! Load word 0
 | 
						|
	inc	8, %o0
 | 
						|
	
 | 
						|
	srlx	%l1, %o3, %o5
 | 
						|
	or	%o5, %l0, %o5				! Combine
 | 
						|
	
 | 
						|
	stx	%o5, [%o1]				! Store result
 | 
						|
	 inc	8, %o1
 | 
						|
	
 | 
						|
	deccc	8, %o2
 | 
						|
	bge,pn	%xcc, 1b
 | 
						|
	 sllx	%l1, %o4, %l0	
 | 
						|
 | 
						|
	btst	7, %o2					! Done?
 | 
						|
	bz,pt	%xcc, Lbcopy_complete
 | 
						|
 | 
						|
	!!
 | 
						|
	!! Loadup the last dregs into %l0 and shift it into place
 | 
						|
	!! 
 | 
						|
	 srlx	%o3, 3, %o5				! # bytes in %l0
 | 
						|
	dec	8, %o5					!  - 8
 | 
						|
	!! n-8 - (by - 8) -> n - by
 | 
						|
	subcc	%o2, %o5, %g0				! # bytes we need
 | 
						|
	ble,pt	%icc, Lbcopy_finish
 | 
						|
	 nop
 | 
						|
	ldx	[%o0+8], %l1				! Need another word
 | 
						|
	srlx	%l1, %o3, %l1
 | 
						|
	ba,pt	%icc, Lbcopy_finish
 | 
						|
	 or	%l0, %l1, %l0				! All loaded up.
 | 
						|
	
 | 
						|
Lbcopy_noshift8:
 | 
						|
	deccc	8*8, %o2				! Have enough room?
 | 
						|
	bl,pn	%xcc, 2f
 | 
						|
	 nop
 | 
						|
	ba,pt	%icc, 1f
 | 
						|
	 nop
 | 
						|
	.align	32
 | 
						|
1:	
 | 
						|
	ldx	[%o0+0*8], %l0
 | 
						|
	ldx	[%o0+1*8], %l1
 | 
						|
	ldx	[%o0+2*8], %l2
 | 
						|
	ldx	[%o0+3*8], %l3
 | 
						|
	stx	%l0, [%o1+0*8]
 | 
						|
	stx	%l1, [%o1+1*8]
 | 
						|
	stx	%l2, [%o1+2*8]
 | 
						|
	stx	%l3, [%o1+3*8]
 | 
						|
 | 
						|
	
 | 
						|
	ldx	[%o0+4*8], %l4
 | 
						|
	ldx	[%o0+5*8], %l5
 | 
						|
	ldx	[%o0+6*8], %l6
 | 
						|
	ldx	[%o0+7*8], %l7
 | 
						|
	inc	8*8, %o0
 | 
						|
	stx	%l4, [%o1+4*8]
 | 
						|
	stx	%l5, [%o1+5*8]
 | 
						|
	deccc	8*8, %o2
 | 
						|
	stx	%l6, [%o1+6*8]
 | 
						|
	stx	%l7, [%o1+7*8]
 | 
						|
	stx	%l2, [%o1+2*8]
 | 
						|
	bge,pt	%xcc, 1b
 | 
						|
	 inc	8*8, %o1
 | 
						|
2:
 | 
						|
	inc	8*8, %o2
 | 
						|
1:	
 | 
						|
	deccc	8, %o2
 | 
						|
	bl,pn	%icc, 1f				! < 0 --> sub word
 | 
						|
	 nop
 | 
						|
	ldx	[%o0], %o5
 | 
						|
	inc	8, %o0
 | 
						|
	stx	%o5, [%o1]
 | 
						|
	bg,pt	%icc, 1b				! Exactly 0 --> done
 | 
						|
	 inc	8, %o1
 | 
						|
1:
 | 
						|
	btst	7, %o2					! Done?
 | 
						|
	bz,pt	%xcc, Lbcopy_complete
 | 
						|
	 clr	%o4
 | 
						|
	ldx	[%o0], %l0
 | 
						|
Lbcopy_finish:
 | 
						|
	
 | 
						|
	brz,pn	%o2, 2f					! 100% complete?
 | 
						|
	 cmp	%o2, 8					! Exactly 8 bytes?
 | 
						|
	bz,a,pn	%xcc, 2f
 | 
						|
	 stx	%l0, [%o1]
 | 
						|
 | 
						|
	btst	4, %o2					! Word store?
 | 
						|
	bz	%xcc, 1f
 | 
						|
	 srlx	%l0, 32, %o5				! Shift high word down
 | 
						|
	stw	%o5, [%o1]
 | 
						|
	inc	4, %o1
 | 
						|
	mov	%l0, %o5				! Operate on the low bits
 | 
						|
1:
 | 
						|
	btst	2, %o2
 | 
						|
	mov	%o5, %l0
 | 
						|
	bz	1f
 | 
						|
	 srlx	%l0, 16, %o5
 | 
						|
	
 | 
						|
	sth	%o5, [%o1]				! Store short
 | 
						|
	inc	2, %o1
 | 
						|
	mov	%l0, %o5				! Operate on low bytes
 | 
						|
1:
 | 
						|
	mov	%o5, %l0
 | 
						|
	btst	1, %o2					! Byte aligned?
 | 
						|
	bz	2f
 | 
						|
	 srlx	%l0, 8, %o5
 | 
						|
 | 
						|
	stb	%o5, [%o1]				! Store last byte
 | 
						|
	inc	1, %o1					! Update address
 | 
						|
2:	
 | 
						|
Lbcopy_complete:
 | 
						|
#if 0
 | 
						|
	!!
 | 
						|
	!! verify copy success.
 | 
						|
	!! 
 | 
						|
 | 
						|
	mov	%i0, %o2
 | 
						|
	mov	%i1, %o4
 | 
						|
	mov	%i2, %l4
 | 
						|
0:	
 | 
						|
	ldub	[%o2], %o1
 | 
						|
	inc	%o2
 | 
						|
	ldub	[%o4], %o3
 | 
						|
	inc	%o4
 | 
						|
	cmp	%o3, %o1
 | 
						|
	bnz	1f
 | 
						|
	 dec	%l4
 | 
						|
	brnz	%l4, 0b
 | 
						|
	 nop
 | 
						|
	ba	2f
 | 
						|
	 nop
 | 
						|
 | 
						|
1:
 | 
						|
	set	0f, %o0
 | 
						|
	call	printf
 | 
						|
	 sub	%i2, %l4, %o5
 | 
						|
	set	1f, %o0
 | 
						|
	mov	%i0, %o1
 | 
						|
	mov	%i1, %o2
 | 
						|
	call	printf
 | 
						|
	 mov	%i2, %o3
 | 
						|
	ta	1
 | 
						|
	.data
 | 
						|
0:	.asciz	"bcopy failed: %x@%p != %x@%p byte %d\n"
 | 
						|
1:	.asciz	"bcopy(%p, %p, %lx)\n"
 | 
						|
	.align 8
 | 
						|
	.text
 | 
						|
2:	
 | 
						|
#endif
 | 
						|
	ret
 | 
						|
	 restore %i1, %g0, %o0
 | 
						|
 | 
						|
#if 1
 | 
						|
 | 
						|
/*
 | 
						|
 * Block copy.  Useful for >256 byte copies.
 | 
						|
 *
 | 
						|
 * Benchmarking has shown this always seems to be slower than
 | 
						|
 * the integer version, so this is disabled.  Maybe someone will
 | 
						|
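 * figure out why sometime.
 * NOTE(review): the enclosing `#if 1' guards leave this path enabled;
 * confirm whether it is meant to be compiled in.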
 | 
						|
 */
 | 
						|
	
 | 
						|
Lbcopy_block:
 | 
						|
#ifdef _KERNEL
 | 
						|
/*
 | 
						|
 * Kernel:
 | 
						|
 *
 | 
						|
 * Here we use VIS instructions to do a block copy.
 | 
						|
 * But before we can do that we need to save and enable the FPU.
 | 
						|
 * The last owner of the FPU registers is fpproc, and
 | 
						|
 * fpproc->p_md.md_fpstate is the current fpstate.  If that's not
 | 
						|
 * null, call savefpstate() with it to store our current fp state.
 | 
						|
 *
 | 
						|
 * Next, allocate an aligned fpstate on the stack.  We will properly
 | 
						|
 * nest calls on a particular stack so this should not be a problem.
 | 
						|
 *
 | 
						|
 * Now we grab either curproc (or if we're on the interrupt stack
 | 
						|
 * proc0).  We stash its existing fpstate in a local register and
 | 
						|
 * put our new fpstate in curproc->p_md.md_fpstate.  We point
 | 
						|
 * fpproc at curproc (or proc0) and enable the FPU.
 | 
						|
 *
 | 
						|
 * If we are ever preempted, our FPU state will be saved in our
 | 
						|
 * fpstate.  Then, when we're resumed and we take an FPDISABLED
 | 
						|
 * trap, the trap handler will be able to fish our FPU state out
 | 
						|
 * of curproc (or proc0).
 | 
						|
 *
 | 
						|
 * On exiting this routine we undo the damage: restore the original
 | 
						|
 * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
 | 
						|
 * the FPU.
 | 
						|
 *
 | 
						|
 *
 | 
						|
 * Register usage, Kernel only (after save):
 | 
						|
 *
 | 
						|
 * %i0		src
 | 
						|
 * %i1		dest
 | 
						|
 * %i2		size
 | 
						|
 *
 | 
						|
 * %l0		XXXX DEBUG old fpstate
 | 
						|
 * %l1		fpproc (hi bits only)
 | 
						|
 * %l2		orig fpproc
 | 
						|
 * %l3		orig fpstate
 | 
						|
 * %l5		curproc
 | 
						|
 * %l6		old fpstate
 | 
						|
 *
 | 
						|
 * Register usage, Kernel and user:
 | 
						|
 *
 | 
						|
 * %g1		dest (retval for memcpy)
 | 
						|
 *
 | 
						|
 * %o0		src
 | 
						|
 * %o1		dest
 | 
						|
 * %o2		end dest
 | 
						|
 * %o5		last safe fetchable address
 | 
						|
 */
 | 
						|
 | 
						|
#if 1
 | 
						|
	ENABLE_FPU(0)
 | 
						|
#else
 | 
						|
	save	%sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp	! Allocate an fpstate
 | 
						|
	sethi	%hi(FPPROC), %l1
 | 
						|
	LDPTR	[%l1 + %lo(FPPROC)], %l2		! Load fpproc
 | 
						|
	add	%sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0	! Calculate pointer to fpstate
 | 
						|
	brz,pt	%l2, 1f					! fpproc == NULL?
 | 
						|
	 andn	%l0, BLOCK_ALIGN, %l0			! And make it block aligned
 | 
						|
	LDPTR	[%l2 + P_FPSTATE], %l3
 | 
						|
	brz,pn	%l3, 1f					! Make sure we have an fpstate
 | 
						|
	 mov	%l3, %o0
 | 
						|
	call	_C_LABEL(savefpstate)			! Save the old fpstate
 | 
						|
	 set	EINTSTACK-STKB, %l4			! Are we on intr stack?
 | 
						|
	cmp	%sp, %l4
 | 
						|
	bgu,pt	%xcc, 1f
 | 
						|
	 set	INTSTACK-STKB, %l4
 | 
						|
	cmp	%sp, %l4
 | 
						|
	blu	%xcc, 1f
 | 
						|
0:
 | 
						|
	 sethi	%hi(_C_LABEL(proc0)), %l4		! Yes, use proc0
 | 
						|
	ba,pt	%xcc, 2f				! XXXX needs to change to CPUs idle proc
 | 
						|
	 or	%l4, %lo(_C_LABEL(proc0)), %l5
 | 
						|
1:
 | 
						|
	sethi	%hi(CURPROC), %l4			! Use curproc
 | 
						|
	LDPTR	[%l4 + %lo(CURPROC)], %l5
 | 
						|
	brz,pn	%l5, 0b					! If curproc is NULL need to use proc0
 | 
						|
	 nop
 | 
						|
2:
 | 
						|
	LDPTR	[%l5 + P_FPSTATE], %l6			! Save old fpstate
 | 
						|
	STPTR	%l0, [%l5 + P_FPSTATE]			! Insert new fpstate
 | 
						|
	STPTR	%l5, [%l1 + %lo(FPPROC)]		! Set new fpproc
 | 
						|
	wr	%g0, FPRS_FEF, %fprs			! Enable FPU
 | 
						|
#endif
 | 
						|
	mov	%i0, %o0				! Src addr.
 | 
						|
	mov	%i1, %o1				! Store our dest ptr here.
 | 
						|
	mov	%i2, %o2				! Len counter
 | 
						|
#endif
 | 
						|
 | 
						|
	!!
 | 
						|
	!! First align the output to a 64-bit entity
 | 
						|
	!! 
 | 
						|
 | 
						|
	mov	%o1, %g1				! memcpy retval
 | 
						|
	add	%o0, %o2, %o5				! End of source block
 | 
						|
 | 
						|
	andn	%o0, 7, %o3				! Start of block
 | 
						|
	dec	%o5
 | 
						|
	fzero	%f0
 | 
						|
 | 
						|
	andn	%o5, BLOCK_ALIGN, %o5			! Last safe addr.
 | 
						|
	ldd	[%o3], %f2				! Load 1st word
 | 
						|
 | 
						|
	dec	8, %o3					! Move %o3 1 word back
 | 
						|
	btst	1, %o1
 | 
						|
	bz	4f
 | 
						|
	
 | 
						|
	 mov	-7, %o4					! Lowest src addr possible
 | 
						|
	alignaddr %o0, %o4, %o4				! Base addr for load.
 | 
						|
 | 
						|
	cmp	%o3, %o4
 | 
						|
	be,pt	%xcc, 1f				! Already loaded?
 | 
						|
	 mov	%o4, %o3
 | 
						|
	fmovd	%f2, %f0				! No. Shift
 | 
						|
	ldd	[%o3+8], %f2				! And load
 | 
						|
1:	
 | 
						|
 | 
						|
	faligndata	%f0, %f2, %f4			! Isolate 1st byte
 | 
						|
 | 
						|
	stda	%f4, [%o1] ASI_FL8_P			! Store 1st byte
 | 
						|
	inc	1, %o1					! Update address
 | 
						|
	inc	1, %o0
 | 
						|
	dec	1, %o2
 | 
						|
4:	
 | 
						|
	btst	2, %o1
 | 
						|
	bz	4f
 | 
						|
 | 
						|
	 mov	-6, %o4					! Calculate src - 6
 | 
						|
	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
 | 
						|
 | 
						|
	cmp	%o3, %o4				! Addresses same?
 | 
						|
	be,pt	%xcc, 1f
 | 
						|
	 mov	%o4, %o3
 | 
						|
	fmovd	%f2, %f0				! Shuffle data
 | 
						|
	ldd	[%o3+8], %f2				! Load word 0
 | 
						|
1:	
 | 
						|
	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
 | 
						|
 | 
						|
	stda	%f4, [%o1] ASI_FL16_P			! Store 1st short
 | 
						|
	dec	2, %o2
 | 
						|
	inc	2, %o1
 | 
						|
	inc	2, %o0
 | 
						|
4:
 | 
						|
	brz,pn	%o2, Lbcopy_blockfinish			! XXXX
 | 
						|
 | 
						|
	 btst	4, %o1
 | 
						|
	bz	4f
 | 
						|
 | 
						|
	mov	-4, %o4
 | 
						|
	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
 | 
						|
 | 
						|
	cmp	%o3, %o4				! Addresses same?
 | 
						|
	beq,pt	%xcc, 1f
 | 
						|
	 mov	%o4, %o3
 | 
						|
	fmovd	%f2, %f0				! Shuffle data
 | 
						|
	ldd	[%o3+8], %f2				! Load word 0
 | 
						|
1:	
 | 
						|
	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
 | 
						|
 | 
						|
	st	%f5, [%o1]				! Store word
 | 
						|
	dec	4, %o2
 | 
						|
	inc	4, %o1
 | 
						|
	inc	4, %o0
 | 
						|
4:
 | 
						|
	brz,pn	%o2, Lbcopy_blockfinish			! XXXX
 | 
						|
	!!
 | 
						|
	!! We are now 64-bit aligned in the dest.
 | 
						|
	!!
 | 
						|
Lbcopy_block_common:	
 | 
						|
 | 
						|
	 mov	-0, %o4
 | 
						|
	alignaddr %o0, %o4, %o4				! base - shift
 | 
						|
 | 
						|
	cmp	%o3, %o4				! Addresses same?
 | 
						|
	beq,pt	%xcc, 1f
 | 
						|
	 mov	%o4, %o3
 | 
						|
	fmovd	%f2, %f0				! Shuffle data
 | 
						|
	ldd	[%o3+8], %f2				! Load word 0
 | 
						|
1:	
 | 
						|
	add	%o3, 8, %o0				! now use %o0 for src
 | 
						|
	
 | 
						|
	!!
 | 
						|
	!! Continue until our dest is block aligned
 | 
						|
	!! 
 | 
						|
Lbcopy_block_aligned8:	
 | 
						|
1:
 | 
						|
	brz	%o2, Lbcopy_blockfinish
 | 
						|
	 btst	BLOCK_ALIGN, %o1			! Block aligned?
 | 
						|
	bz	1f
 | 
						|
	
 | 
						|
	 faligndata %f0, %f2, %f4			! Generate result
 | 
						|
	deccc	8, %o2
 | 
						|
	ble,pn	%icc, Lbcopy_blockfinish		! Should never happen
 | 
						|
	 fmovd	%f4, %f48
 | 
						|
	
 | 
						|
	std	%f4, [%o1]				! Store result
 | 
						|
	inc	8, %o1
 | 
						|
	
 | 
						|
	fmovd	%f2, %f0
 | 
						|
	inc	8, %o0
 | 
						|
	ba,pt	%xcc, 1b				! Not yet.
 | 
						|
	 ldd	[%o0], %f2				! Load next part
 | 
						|
Lbcopy_block_aligned64:	
 | 
						|
1:
 | 
						|
 | 
						|
/*
 | 
						|
 * 64-byte aligned -- ready for block operations.
 | 
						|
 *
 | 
						|
 * Here we have the destination block aligned, but the
 | 
						|
 * source pointer may not be.  Sub-word alignment will
 | 
						|
 * be handled by faligndata instructions.  But the source
 | 
						|
 * can still be potentially aligned to 8 different words
 | 
						|
 * in our 64-byte block, so we have 8 different copy routines.
 | 
						|
 *
 | 
						|
 * Once we figure out our source alignment, we branch
 | 
						|
 * to the appropriate copy routine, which sets up the
 | 
						|
 * alignment for faligndata and loads (sets) the values
 | 
						|
 * into the source registers and does the copy loop.
 | 
						|
 *
 | 
						|
 * When we're down to less than 1 block to store, we
 | 
						|
 * exit the copy loop and execute cleanup code.
 | 
						|
 *
 | 
						|
 * Block loads and stores are not properly interlocked.
 | 
						|
 * Stores save one reg/cycle, so you can start overwriting
 | 
						|
 * registers the cycle after the store is issued.  
 | 
						|
 * 
 | 
						|
 * Block loads require a block load to a different register
 | 
						|
 * block or a membar #Sync before accessing the loaded
 | 
						|
 * data.
 | 
						|
 *	
 | 
						|
 * Since the faligndata instructions may be offset as far
 | 
						|
 * as 7 registers into a block (if you are shifting source 
 | 
						|
 * 7 -> dest 0), you need 3 source register blocks for full 
 | 
						|
 * performance: one you are copying, one you are loading, 
 | 
						|
 * and one for interlocking.  Otherwise, we would need to
 | 
						|
 * sprinkle the code with membar #Sync and lose the advantage
 | 
						|
 * of running faligndata in parallel with block stores.  This 
 | 
						|
 * means we are fetching a full 128 bytes ahead of the stores.  
 | 
						|
 * We need to make sure the prefetch does not inadvertently 
 | 
						|
 * cross a page boundary and fault on data that we will never 
 | 
						|
 * store.
 | 
						|
 *
 | 
						|
 */
 | 
						|
#if 1
	/*
	 * Pick the copy routine that matches the position of the source
	 * pointer inside its block.  %o3 becomes the doubleword slot
	 * (0..7) and a three level bit test on it selects L100..L107.
	 * Every btst sits in the delay slot of the previous branch, so
	 * the condition codes for the next decision are ready on both
	 * the taken and the not taken path.
	 */
	and	%o0, BLOCK_ALIGN, %o3
	srax	%o3, 3, %o3				! Isolate the offset

	brz	%o3, L100				! 0->0
	 btst	4, %o3					! slot >= 4 ?
	bnz	%xcc, 4f
	 btst	2, %o3					! slot is 2/3 (or 6/7) ?
	bnz	%xcc, 2f
	 btst	1, %o3					! odd slot ?
	ba,pt	%xcc, L101				! 0->1
	 nop	/* XXX spitfire bug */
2:
	bz	%xcc, L102				! 0->2
	 nop
	ba,pt	%xcc, L103				! 0->3
	 nop	/* XXX spitfire bug */
4:
	bnz	%xcc, 2f				! tests the btst 2 result
	 btst	1, %o3					! odd slot ?
	bz	%xcc, L104				! 0->4
	 nop
	ba,pt	%xcc, L105				! 0->5
	 nop	/* XXX spitfire bug */
2:
	bz	%xcc, L106				! 0->6
	 nop
	ba,pt	%xcc, L107				! 0->7
	 nop	/* XXX spitfire bug */
#else

	!!
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!!
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	!!
	/*
	 * Each table entry is 8 bytes (an annulled branch plus a pad
	 * nop), so the byte offset of the source doubleword inside its
	 * 64-byte block (address bits 3..5, mask 0x38) is also the byte
	 * offset of the table entry.
	 *
	 * %o4 holds the address of the rd instruction, which is 4 bytes
	 * below the 1: label used in the displacement.  The computed
	 * target therefore lands on the nop just in front of the chosen
	 * entry and falls through into it.
	 */
	rd	%pc, %o4
1:
	and	%o0, 0x38, %o3				! address bits 3..5
	add	%o3, (Lbcopy_block_jmp - 1b), %o3
	jmpl	%o4 + %o3, %g0
	 nop

	!!
	!! Jump table
	!!

Lbcopy_block_jmp:
	ba,a,pt	%xcc, L100
	 nop
	ba,a,pt	%xcc, L101
	 nop
	ba,a,pt	%xcc, L102
	 nop
	ba,a,pt	%xcc, L103
	 nop
	ba,a,pt	%xcc, L104
	 nop
	ba,a,pt	%xcc, L105
	 nop
	ba,a,pt	%xcc, L106
	 nop
	ba,a,pt	%xcc, L107
	 nop
#endif
 | 
						|
 | 
						|
	/*
	 * L100: the source is block aligned, so nothing has to be loaded
	 * by hand; just load a block and go.
	 *
	 * Registers, as used by the code below:
	 *	%o0	source pointer
	 *	%o1	destination pointer (written with block stores)
	 *	%o2	byte count, reduced by BLOCK_SIZE per block
	 *	%o5	upper limit for block loads (set before this code;
	 *		presumably the last address that is safe to fetch)
	 *	%f0	on entry presumably the previously fetched source
	 *		doubleword; it is kept in %f62 -- TODO confirm
	 *
	 * The loop is unrolled three times.  Each pass aligns eight
	 * doubles into %f32-%f46, issues the block load for the pass
	 * after the next one and stores the result.  When a block load
	 * would start above %o5 it is annulled and a membar #Sync is
	 * executed instead.
	 */
L100:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L100"
	.align	8
2:
#endif
	fmovd	%f0, %f62				! keep old %f0 for pass 1
	ldda	[%o0] ASI_BLK_P, %f0			! first block -> %f0-%f14
	add	%o0, BLOCK_SIZE, %o0
	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16			! second block -> %f16-%f30
	ba,pt	%icc, 3f
	 membar #Sync

	.align	32					! ICache align.
3:
	/* Pass 1: consume %f62 and %f0-%f14, prefetch into %f48. */
	faligndata	%f62, %f0, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f2, %f4, %f36
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f38
	faligndata	%f6, %f8, %f40
	faligndata	%f8, %f10, %f42
	faligndata	%f10, %f12, %f44
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f12, %f14, %f46

	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f14-%f30, prefetch into %f0. */
	faligndata	%f14, %f16, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f34
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f18, %f20, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f20, %f22, %f38
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f40
	faligndata	%f24, %f26, %f42
	faligndata	%f26, %f28, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f28, %f30, %f46

	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f30 and %f48-%f62, prefetch into %f16. */
	faligndata	%f30, %f48, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f34
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f52, %f54, %f38
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f40
	faligndata	%f56, %f58, %f42
	faligndata	%f58, %f60, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f60, %f62, %f46
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16			! Increment is at top
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
	
 | 
						|
	/*
	 * L101: source at BLOCK_ALIGN+8.
	 *
	 * The seven doubles up to the end of the current source block
	 * are loaded by hand into %f2-%f14; from then on the source
	 * pointer is block aligned and block loads take over.  %f0 is
	 * used as it is on entry (presumably the previously fetched
	 * doubleword -- TODO confirm), so no fmovd is needed here.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48; an annulled
	 * block load is replaced by a membar #Sync.
	 */
L101:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L101"
	.align	8
2:
#endif
!	fmovd	%f0, %f0				! Hoist fmovd
	ldd	[%o0], %f2
	add	%o0, 8, %o0
	ldd	[%o0], %f4
	add	%o0, 8, %o0
	ldd	[%o0], %f6
	add	%o0, 8, %o0
	ldd	[%o0], %f8
	add	%o0, 8, %o0
	ldd	[%o0], %f10
	add	%o0, 8, %o0
	ldd	[%o0], %f12
	add	%o0, 8, %o0
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
3:
	/* Pass 1: consume %f0-%f16, prefetch into %f48. */
	faligndata	%f0, %f2, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f34
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f6, %f8, %f38
	faligndata	%f8, %f10, %f40
	faligndata	%f10, %f12, %f42
	faligndata	%f12, %f14, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f14, %f16, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f16-%f30 and %f48, prefetch into %f0. */
	faligndata	%f16, %f18, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f34
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f20, %f22, %f36
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f38
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f40
	faligndata	%f26, %f28, %f42
	faligndata	%f28, %f30, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f30, %f48, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f48-%f62 and %f0, prefetch into %f16. */
	faligndata	%f48, %f50, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f34
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f52, %f54, %f36
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f38
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f40
	faligndata	%f58, %f60, %f42
	faligndata	%f60, %f62, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f62, %f0, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
 | 
						|
	/*
	 * L102: source at BLOCK_ALIGN+16.
	 *
	 * Six doubles are loaded by hand into %f4-%f14 to reach the next
	 * block boundary.  The entry value of %f0 (presumably the
	 * previously fetched doubleword -- TODO confirm) is moved to
	 * %f2, the register just below the first hand loaded one.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48; an annulled
	 * block load is replaced by a membar #Sync.
	 */
L102:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L102"
	.align	8
2:
#endif
	ldd	[%o0], %f4
	add	%o0, 8, %o0
	fmovd	%f0, %f2				! Hoist fmovd
	ldd	[%o0], %f6
	add	%o0, 8, %o0

	ldd	[%o0], %f8
	add	%o0, 8, %o0
	ldd	[%o0], %f10
	add	%o0, 8, %o0
	ldd	[%o0], %f12
	add	%o0, 8, %o0
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
3:
	/* Pass 1: consume %f2-%f18, prefetch into %f48. */
	faligndata	%f2, %f4, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f4, %f6, %f34
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f38
	faligndata	%f10, %f12, %f40
	faligndata	%f12, %f14, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f44

	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f16, %f18, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f18-%f30 and %f48-%f50, prefetch into %f0. */
	faligndata	%f18, %f20, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f20, %f22, %f34
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f22, %f24, %f36
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f38
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f26, %f28, %f40
	faligndata	%f28, %f30, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f48, %f50, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f50-%f62 and %f0-%f2, prefetch into %f16. */
	faligndata	%f50, %f52, %f32
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f52, %f54, %f34
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f54, %f56, %f36
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f38
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f58, %f60, %f40
	faligndata	%f60, %f62, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f0, %f2, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
	
 | 
						|
	/*
	 * L103: source at BLOCK_ALIGN+24.
	 *
	 * Five doubles are loaded by hand into %f6-%f14 to reach the
	 * next block boundary.  The entry value of %f0 (presumably the
	 * previously fetched doubleword -- TODO confirm) is moved to
	 * %f4, the register just below the first hand loaded one.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48.
	 *
	 * Every block load is guarded by an unsigned compare of %o0
	 * against %o5: when %o0 is above the limit the load in the delay
	 * slot is annulled and a membar #Sync is executed instead.  The
	 * guard must be unsigned (bleu) because it compares addresses;
	 * the second guard used the signed ble, which gives the wrong
	 * answer as soon as the two values differ in bit 31.
	 */
L103:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L103"
	.align	8
2:
#endif
	fmovd	%f0, %f4
	ldd	[%o0], %f6
	add	%o0, 8, %o0
	ldd	[%o0], %f8
	add	%o0, 8, %o0
	ldd	[%o0], %f10
	add	%o0, 8, %o0
	ldd	[%o0], %f12
	add	%o0, 8, %o0
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	add	%o0, BLOCK_SIZE, %o0
3:
	/* Pass 1: consume %f4-%f20, prefetch into %f48. */
	faligndata	%f4, %f6, %f32
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f36
	faligndata	%f10, %f12, %f38
	faligndata	%f12, %f14, %f40
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f44
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f18, %f20, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f20-%f30 and %f48-%f52, prefetch into %f0. */
	faligndata	%f20, %f22, %f32
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f36
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f26, %f28, %f38
	faligndata	%f28, %f30, %f40
	bleu,a,pn	%icc, 2f			! unsigned, like all other guards
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f50, %f52, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f52-%f62 and %f0-%f4, prefetch into %f16. */
	faligndata	%f52, %f54, %f32
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f36
	faligndata	%f58, %f60, %f38
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f40
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f2, %f4, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
 | 
						|
	/*
	 * L104: source at BLOCK_ALIGN+32.
	 *
	 * Four doubles are loaded by hand into %f8-%f14 to reach the
	 * next block boundary.  The entry value of %f0 (presumably the
	 * previously fetched doubleword -- TODO confirm) is moved to
	 * %f6, the register just below the first hand loaded one.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48; an annulled
	 * block load is replaced by a membar #Sync.
	 */
L104:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L104"
	.align	8
2:
#endif
	fmovd	%f0, %f6
	ldd	[%o0], %f8
	add	%o0, 8, %o0
	ldd	[%o0], %f10
	add	%o0, 8, %o0
	ldd	[%o0], %f12
	add	%o0, 8, %o0
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	add	%o0, BLOCK_SIZE, %o0
3:
	/* Pass 1: consume %f6-%f22, prefetch into %f48. */
	faligndata	%f6, %f8, %f32
	cmp	%o0, %o5
	faligndata	%f8, %f10, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f10, %f12, %f36
	faligndata	%f12, %f14, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f40
	faligndata	%f16, %f18, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f44
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f20, %f22, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f22-%f30 and %f48-%f54, prefetch into %f0. */
	faligndata	%f22, %f24, %f32
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f34
	faligndata	%f26, %f28, %f36
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f28, %f30, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f40
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f52, %f54, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f54-%f62 and %f0-%f6, prefetch into %f16. */
	faligndata	%f54, %f56, %f32
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f34
	faligndata	%f58, %f60, %f36
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f40
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f4, %f6, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
 | 
						|
	/*
	 * L105: source at BLOCK_ALIGN+40.
	 *
	 * Three doubles are loaded by hand into %f10-%f14 to reach the
	 * next block boundary.  The entry value of %f0 (presumably the
	 * previously fetched doubleword -- TODO confirm) is moved to
	 * %f8, the register just below the first hand loaded one.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48; an annulled
	 * block load is replaced by a membar #Sync.
	 */
L105:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L105"
	.align	8
2:
#endif
	fmovd	%f0, %f8
	ldd	[%o0], %f10
	add	%o0, 8, %o0
	ldd	[%o0], %f12
	add	%o0, 8, %o0
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	add	%o0, BLOCK_SIZE, %o0
3:
	/* Pass 1: consume %f8-%f24, prefetch into %f48. */
	faligndata	%f8, %f10, %f32
	cmp	%o0, %o5
	faligndata	%f10, %f12, %f34
	faligndata	%f12, %f14, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f38
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f40
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f42
	faligndata	%f20, %f22, %f44
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f22, %f24, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f24-%f30 and %f48-%f56, prefetch into %f0. */
	faligndata	%f24, %f26, %f32
	cmp	%o0, %o5
	faligndata	%f26, %f28, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f28, %f30, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f38
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f48, %f50, %f40
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f42
	faligndata	%f52, %f54, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f54, %f56, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f56-%f62 and %f0-%f8, prefetch into %f16. */
	faligndata	%f56, %f58, %f32
	cmp	%o0, %o5
	faligndata	%f58, %f60, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f60, %f62, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f38
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f0, %f2, %f40
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f42
	faligndata	%f4, %f6, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f6, %f8, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
 | 
						|
 | 
						|
	/*
	 * L106: source at BLOCK_ALIGN+48.
	 *
	 * Two doubles are loaded by hand into %f12-%f14 to reach the
	 * next block boundary.  The entry value of %f0 (presumably the
	 * previously fetched doubleword -- TODO confirm) is moved to
	 * %f10, the register just below the first hand loaded one.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48; an annulled
	 * block load is replaced by a membar #Sync.
	 */
L106:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L106"
	.align	8
2:
#endif
	fmovd	%f0, %f10
	ldd	[%o0], %f12
	add	%o0, 8, %o0
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	add	%o0, BLOCK_SIZE, %o0
3:
	/* Pass 1: consume %f10-%f26, prefetch into %f48. */
	faligndata	%f10, %f12, %f32
	cmp	%o0, %o5
	faligndata	%f12, %f14, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f38
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f40
	faligndata	%f20, %f22, %f42
	faligndata	%f22, %f24, %f44
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f24, %f26, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f26-%f30 and %f48-%f58, prefetch into %f0. */
	faligndata	%f26, %f28, %f32
	cmp	%o0, %o5
	faligndata	%f28, %f30, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f38
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f40
	faligndata	%f52, %f54, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f56, %f58, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f58-%f62 and %f0-%f10, prefetch into %f16. */
	faligndata	%f58, %f60, %f32
	cmp	%o0, %o5
	faligndata	%f60, %f62, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f36
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f38
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f40
	faligndata	%f4, %f6, %f42
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f8, %f10, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
 | 
						|
 | 
						|
	/*
	 * L107: source at BLOCK_ALIGN+56.
	 *
	 * Only one double is loaded by hand (into %f14) to reach the
	 * next block boundary.  The entry value of %f0 (presumably the
	 * previously fetched doubleword -- TODO confirm) is moved to
	 * %f12, the register just below it.
	 *
	 * %o0 source, %o1 destination, %o2 byte count, %o5 block load
	 * limit (set before this code).  The loop is unrolled three
	 * times over the register blocks %f0, %f16 and %f48; an annulled
	 * block load is replaced by a membar #Sync.
	 */
L107:
#ifdef RETURN_NAME
	/* Debug aid: leave the address of the routine name in %g1. */
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L107"
	.align	8
2:
#endif
	fmovd	%f0, %f12
	ldd	[%o0], %f14
	add	%o0, 8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f			! within limit: fetch next
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	add	%o0, BLOCK_SIZE, %o0
3:
	/* Pass 1: consume %f12-%f28, prefetch into %f48. */
	faligndata	%f12, %f14, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f36
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f38
	faligndata	%f20, %f22, %f40
	faligndata	%f22, %f24, %f42
	faligndata	%f24, %f26, %f44
	brlez,pn	%o2, Lbcopy_blockdone		! less than a block left
	 faligndata	%f26, %f28, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 2: consume %f28-%f30 and %f48-%f60, prefetch into %f0. */
	faligndata	%f28, %f30, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f36
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f38
	faligndata	%f52, %f54, %f40
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f42
	faligndata	%f56, %f58, %f44
	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f58, %f60, %f46

	stda	%f32, [%o1] ASI_STORE

	/* Pass 3: consume %f60-%f62 and %f0-%f12, prefetch into %f16. */
	faligndata	%f60, %f62, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f34
	sub	%o2, BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f36
	add	%o1, BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f38
	faligndata	%f4, %f6, %f40
	add	%o0, BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f42
	faligndata	%f8, %f10, %f44

	brlez,pn	%o2, Lbcopy_blockdone
	 faligndata	%f10, %f12, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 add	%o1, BLOCK_SIZE, %o1
 | 
						|
	
 | 
						|
Lbcopy_blockdone:
	/*
	 * The copy loops arrive here once less than a full block is left
	 * to store, with the next eight aligned doubles already sitting
	 * in %f32-%f46.  %o2 went one block too far in the loop, so it is
	 * corrected first; %o1 is the destination pointer.
	 */
	add	%o2, BLOCK_SIZE, %o2			! Fixup our overcommit
	membar	#Sync					! Finish any pending loads
	/*
	 * FINISH_REG(reg): take 8 off the count.  If that makes it
	 * negative there is less than a doubleword left: hand reg to
	 * Lbcopy_blockfinish in %f48 (the fmovd sits in the annulled
	 * delay slot and only runs when the branch is taken).  Otherwise
	 * store reg and step the destination.  No comments may be put
	 * inside the macro, it expands to a single line.
	 */
#define	FINISH_REG(reg)				\
	subcc	%o2, 8, %o2;			\
	bl,a	Lbcopy_blockfinish;		\
	 fmovd	reg, %f48;			\
	std	reg, [%o1];			\
	add	%o1, 8, %o1

	FINISH_REG(%f32)
	FINISH_REG(%f34)
	FINISH_REG(%f36)
	FINISH_REG(%f38)
	FINISH_REG(%f40)
	FINISH_REG(%f42)
	FINISH_REG(%f44)
	FINISH_REG(%f46)
	FINISH_REG(%f48)
#undef FINISH_REG
 | 
						|
	/*
	 * Store the last partial doubleword.
	 *
	 * On entry %f48 holds the data and %o2 the count after the final
	 * subtraction of 8.  Its low 3 bits still give the number of
	 * bytes left, because (x - 8) & 7 == x & 7.  The data is copied
	 * to %f4 and written as an optional word, halfword and byte,
	 * selected by bits 2, 1 and 0 of %o2.
	 *
	 * The halfword and the byte are stored with the partial store
	 * ASIs, which take their data from the low end of the register;
	 * alignaddr/faligndata with a zeroed %f0 shift the wanted bytes
	 * down there.  NOTE(review): the shift depends on the low bits
	 * of %o1 at that point -- confirm against the VIS manual before
	 * touching this sequence.
	 */
Lbcopy_blockfinish:
	brz,pn	%o2, 2f					! 100% complete?
	 fmovd	%f48, %f4
	cmp	%o2, 8					! Exactly 8 bytes?
	bz,a,pn	%xcc, 2f
	 std	%f4, [%o1]

	btst	4, %o2					! Word store?
	bz	%xcc, 1f
	 nop
	st	%f4, [%o1]
	add	%o1, 4, %o1
1:
	btst	2, %o2					! Halfword store?
	fzero	%f0
	bz	1f

	 mov	-6, %o4					! delay slot, always executed
	alignaddr %o1, %o4, %g0				! only the alignment is wanted

	faligndata %f0, %f4, %f8

	stda	%f8, [%o1] ASI_FL16_P			! Store short
	add	%o1, 2, %o1
1:
	btst	1, %o2					! Byte aligned?
	bz	2f

	 mov	-7, %o0					! Calculate dest - 7
	alignaddr %o1, %o0, %g0				! Calculate shift mask and dest.

	faligndata %f0, %f4, %f8			! Move 1st byte to low part of f8

	stda	%f8, [%o1] ASI_FL8_P			! Store 1st byte
	add	%o1, 1, %o1				! Update address
2:
	membar	#Sync
 | 
						|
#if 0
	/*
	 * Disabled self check: compare source and destination byte by
	 * byte using the arguments still held in %i0, %i1 and %i2.  On a
	 * mismatch it writes to block_disable, prints two messages via
	 * prom_printf and traps.
	 */

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4				! bytes left to compare
0:
	ldub	[%o2], %o1
	add	%o2, 1, %o2
	ldub	[%o4], %o3
	add	%o4, 1, %o4
	cmp	%o3, %o1
	bnz	1f
	 sub	%l4, 1, %l4
	brnz	%l4, 0b
	 nop
	ba	2f
	 nop

1:
	set	block_disable, %o0
	stx	%o0, [%o0]

	set	0f, %o0
	call	prom_printf
	 sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	prom_printf
	 mov	%i2, %o3
	ta	1
	.data
	_ALIGN
block_disable:	.xword	0
0:	.asciz	"bcopy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"bcopy(%p, %p, %lx)\r\n"
	_ALIGN
	.text
2:
#endif
#ifdef _KERNEL

	/*
	 * Kernel exit path.  The arguments are read from %i0-%i2 and the
	 * routine leaves with ret/restore, so a register window was
	 * presumably opened before this code -- TODO confirm.
	 */
	set 1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	printf
	 mov	%i2, %o3

	.data
	_ALIGN
1:	.asciz "block exit (%p, %p, %d)\n"
	_ALIGN
	.text
/*
 * We have saved our possible fpstate, now disable the fpu
 * and continue with life.
 */
#if 1
	RESTORE_FPU
#else
#ifdef DEBUG
	LDPTR	[%l1 + %lo(FPPROC)], %l7
	cmp	%l7, %l5
!	tnz	1		! fpproc has changed!
	LDPTR	[%l5 + P_FPSTATE], %l7
	cmp	%l7, %l0
	tnz	1		! fpstate has changed!
#endif
	andcc	%l2, %l3, %g0				! If (fpproc && fpstate)
	STPTR	%l2, [%l1 + %lo(FPPROC)]		! Restore old fproc
	bz,pt	%xcc, 1f				! Skip if no fpstate
	 STPTR	%l6, [%l5 + P_FPSTATE]			! Restore old fpstate

	call	_C_LABEL(loadfpstate)			! Re-load orig fpstate
	 mov	%l3, %o0
1:
#endif
	set 1f, %o0
	mov	%i0, %o1
	mov	%i1, %o2
	call	printf
	 mov	%i2, %o3

	.data
	_ALIGN
1:	.asciz "block done (%p, %p, %d)\n"
	_ALIGN
	.text


	ret
	 restore	%g1, 0, %o0			! Return DEST for memcpy
#endif
	/* Leaf return: the value kept in %g1 becomes the result. */
	retl
	 mov	%g1, %o0
 | 
						|
#endif
 | 
						|
 | 
						|
 |