/*
 * ========== Copyright Header Begin ==========================================
 *
 * OpenSPARC T2 Processor File: sparcv9native.S
 * Copyright (c) 2006 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
 *
 * The above named program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * The above named program is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this work; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * ========== Copyright Header End ============================================
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"@(#)sparcv9native.S	1.25	07/02/20 SMI"

#include
#include "assembly.h"

/*
 * Assembly support functions required by the simulator
 */

	.section	".text"

	ENTRY(sim_atomic_add_32)
	ALTENTRY(sim_atomic_add_32_nv)
	ALTENTRY(sim_atomic_add_long)
	ALTENTRY(sim_atomic_add_long_nv)
	ld	[%o0], %o2
1:
	add	%o2, %o1, %o3
	cas	[%o0], %o2, %o3
	cmp	%o2, %o3
	bne,a,pn %icc, 1b
	  ld	[%o0], %o2
	retl
	add	%o2, %o1, %o0		! return new value
	SET_SIZE(sim_atomic_add_32_nv)
	SET_SIZE(sim_atomic_add_32)
	SET_SIZE(sim_atomic_add_long_nv)
	SET_SIZE(sim_atomic_add_long)

/*
 * o0 = pointer to memory location (byte)
 * returns:
 *	o0 = original contents of memory location (0xff is stored into it)
 */
	.global host_ldstub
host_ldstub:
	ldstub	[%o0], %o2
	retl
	mov	%o2, %o0

/*
 * o0 = pointer to memory location
 * o1 = value to compare with
 * o2 = value to swap in if equal
 * returns:
 *	o0 = original contents of memory location
 */
	.global host_cas32
host_cas32:
	cas	[%o0], %o1, %o2
	retl
	mov	%o2, %o0

	.global host_cas64
host_cas64:
	casx	[%o0], %o1, %o2
	retl
	mov	%o2, %o0

	.global host_swap
host_swap:
	swap	[%o0], %o1
	retl
	mov	%o1, %o0

/*
 * Atomic load of 128 bits, big endian, into two 64-bit registers.
 * Have to do this kludge because SPARC doesn't provide a 128-bit atomic
 * fetch that executes at user level.
 * I just hope the 64-byte block load is atomic on all architectures.
 * %o0 points to memory location (128-bit aligned).
 * %o1 points to high 64 bits of result register (big endian)
 * %o2 points to low 64 bits of result register (big endian)
 */

#define	ASI_BLK_P	0xf0	/* VIS 1.0 block load from primary AS */

	.global host_atomic_get128be
host_atomic_get128be:
	/* align the memory address for a block load */
	andn	%o0, 0x3f, %o3
	membar	#Sync
	ldda	[%o3]ASI_BLK_P, %f0
	membar	#Sync			/* ensure the data is present */
	and	%o0, 0x30, %o4		/* figure out which of the 4 128-bit blocks we want */
_base:
	rd	%pc, %o0
	add	%o4, %o0, %o0
	jmpl	%o0 + (_xxword0 - _base), %g0
_xxword0:
	std	%f0, [ %o1 ]
	retl
	std	%f2, [ %o2 ]
	nop
_xxword1:
	std	%f4, [ %o1 ]
	retl
	std	%f6, [ %o2 ]
	nop
_xxword2:
	std	%f8, [ %o1 ]
	retl
	std	%f10, [ %o2 ]
	nop
_xxword3:
	std	%f12, [ %o1 ]
	retl
	std	%f14, [ %o2 ]
	nop
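
/*
 * In C terms a call amounts to the sketch below.  The prototype and the
 * caller shown are illustrative assumptions, not part of the simulator API;
 * only the register contract described above is taken from the code.
 *
 *	extern void host_atomic_get128be(void *src, uint64_t *hi, uint64_t *lo);
 *
 *	uint64_t hi, lo;
 *	host_atomic_get128be(paddr, &hi, &lo);	// paddr is 16-byte aligned
 *	// hi = bytes 0-7 and lo = bytes 8-15 of *paddr, both captured from
 *	// the same (hopefully atomic) 64-byte block load.
 */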
/*
 * Assembly version of certain simulator instruction implementations.
 */

#define	ld_simm16( _Rxip, _offset, _Rdest ) \
	ldsh	[ _Rxip + _offset ], _Rdest	/* 1 instn only !! */

#define	ld_simm32( _Rxip, _offset, _Rdest ) \
	ldsw	[ _Rxip + _offset ], _Rdest	/* 1 instn only !! */

#define	ldx_ireg( _Rcpup, _Rxip, _offset, _Rdest, _Rscratch ) \
	lduh	[ _Rxip + _offset ], _Rscratch NL\
	ldx	[ _Rcpup + _Rscratch ], _Rdest

#define	stx_ireg( _Rcpup, _Rxip, _offset, _Rval, _Rscratch ) \
	lduh	[ _Rxip + _offset ], _Rscratch NL\
	stx	_Rval, [ _Rcpup + _Rscratch ]

#define	ld_fpreg( _ldt, _Rcpup, _Rxip, _Rfprp, _offset, _Rdest, _Rscratch ) \
	lduh	[ _Rxip + _offset ], _Rscratch NL\
	_ldt	[ _Rfprp + _Rscratch ], _Rdest

#define	st_fpreg( _stt, _Rcpup, _Rxip, _Rfprp, _offset, _Rdest, _Rscratch ) \
	lduh	[ _Rxip + _offset ], _Rscratch NL\
	_stt	_Rdest, [ _Rfprp + _Rscratch ]

/*
 * %o0 = simcpu_t *
 * %o1 = xicache_instn_t *
 * %g1 is used as a scratch register by these macros
 */

#define	ldx_Rsrc1( _r )		ldx_ireg( %o0, %o1, XIC_INTREG_SRC1_OFFSET, _r, %g1 )
#define	ldx_Rsrc2( _r )		ldx_ireg( %o0, %o1, XIC_INTREG_SRC2_OFFSET, _r, %g1 )
#define	ld_Simm16( _r )		ld_simm16( %o1, XIC_SIMM16_OFFSET, _r )
#define	ld_BrOff32( _r )	ld_simm32( %o1, XIC_BROFF32_OFFSET, _r )
#define	ld_BrRegOff32( _r )	ld_simm32( %o1, XIC_BREGOFF32_OFFSET, _r )
#define	stx_Rdest( _r )		stx_ireg( %o0, %o1, XIC_INTREG_DEST_OFFSET, _r, %g1 )

#define	ldx_Rccr( _r )		ldx	[ %o0 + SIMCPU_v9CCR_OFFSET ], _r
#define	stx_Rccr( _r )		stx	_r, [ %o0 + SIMCPU_v9CCR_OFFSET ]

#define	ldx_Rpc( _r )		ldx	[ %o0 + SIMCPU_PC_OFFSET ], _r
#define	stx_Rpc( _r )		stx	_r, [ %o0 + SIMCPU_PC_OFFSET ]
#define	ldx_Rnpc( _r )		ldx	[ %o0 + SIMCPU_NPC_OFFSET ], _r
#define	stx_Rnpc( _r )		stx	_r, [ %o0 + SIMCPU_NPC_OFFSET ]

/* FP support */

#define	ld_FPsrc1( _ldt, _r )	ld_fpreg( _ldt, %o0, %o1, %o0, XIC_FPREG_SRC1_OFFSET, _r, %g1 )
#define	ld_FPsrc2( _ldt, _r )	ld_fpreg( _ldt, %o0, %o1, %o0, XIC_FPREG_SRC2_OFFSET, _r, %g1 )

#define	IMPL( _name ) \
NL\
	.global	decoded_impl_##_name NL\
	.align	8 NL\
decoded_impl_##_name:

#define	ENDI \
	ldx	[ %o0 + SIMCPU_NPC_OFFSET ], %o1 NL\
	add	%o1, 4, %o2 NL\
	stx	%o1, [ %o0 + SIMCPU_PC_OFFSET ] NL\
	retl NL\
	stx	%o2, [ %o0 + SIMCPU_NPC_OFFSET ] NL\
	ENDINSTN

#define	ENDINSTN	/* nada */

/*
 * For executing floating point operations on SPARC ..
 * .. specifically SPARC on SPARC, we use the FSR for the
 * cpu being emulated, but disable all floating point traps.
 * Then we test whether an exception has occurred after the
 * instruction execution, and update the simulated FSR
 * accordingly, then finally signal a trap if the simulated
 * machine actually desired one.
 *
 * The simcpu_t simulated registers are used as follows:
 * v9_fsr_ctrl holds the SPARC fsr control bits ..
 * for condition codes, rounding etc.  The execution results
 * (errors and accumulated errors) are held in the
 * v9_fsr_exc pseudo register, and the trap enable bits
 * (TEM) are held in the v9_fsr_tem pseudo reg.
 *
 * Note: we have to use the simcpu_t scratch64 value because we
 * can't get the fsr value out of the cpu without using a
 * store instruction.  This probably should be per exec_thread, but
 * it's harder to get at than per simcpu in here.
 */
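
/*
 * Roughly, in C terms, each FP op below does the following (a sketch only;
 * the v9_fsr_* names refer to the simcpu_t pseudo registers described above,
 * and AEXC_MASK / FTT() are shorthand for the aexc and ftt FSR fields):
 *
 *	host_fsr = v9_fsr_ctrl & (RD | fcc fields);	// TEM bits left at 0
 *	<execute the FP instruction on the host>
 *	cexc = host_fsr & 0x1f;
 *	v9_fsr_exc &= AEXC_MASK;			// drop stale cexc
 *	if (cexc != 0) {
 *		trap = (cexc & v9_fsr_tem) != 0 && FTT(v9_fsr_ctrl) == 0;
 *		v9_fsr_exc |= cexc | (trap ? 0 : cexc << 5);
 *		if (trap)
 *			sparcv9_deliver_ieee_exception(...);
 *	}
 */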
#define	FPOP_setup_fsr \
	ldx	[ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o4 NL\
	set	0xff, %o3 NL\
	sllx	%o3, 30, %o3		/* FCC[321] + RD mask */ NL\
	or	%o3, (3 << 10), %o3	/* | FCC0 mask */ NL\
	and	%o4, %o3, %o4 NL\
	stx	%o4, [ %o0 + SIMCPU_SCRATCH64_OFFSET ] NL\
	ldx	[ %o0 + SIMCPU_SCRATCH64_OFFSET ], %fsr NL\
	ldx	[ %o0 + SIMCPU_v9GSR_CTRL_OFFSET ], %o4 NL\
	wr	%o4, %gsr

#ifdef FP_DECODE_DISABLED
#define	FPOP_fpu_on_check
#else	/* FP_DECODE_DISABLED */
#define	FPOP_fpu_on_check \
	ldx	[ %o0 + SIMCPU_SPECIFICP_OFFSET ], %o4 NL\
	ld	[ %o4 + SPARCV9_FPU_ON_OFFSET ], %o4 NL\
	brz	%o4, sparcv9_deliver_fp_disabled_exception NL\
	nop
#endif	/* FP_DECODE_DISABLED */

#define	FPOP_setup \
	FPOP_fpu_on_check NL\
	FPOP_setup_fsr

#define	FPOP_cmp( _ldt, _fpop, _fcc ) \
	FPOP_setup NL\
	ld_FPsrc1( _ldt, %f0 ) NL\
	ld_FPsrc2( _ldt, %f4 ) NL\
	_fpop	%_fcc, %f0, %f4 NL\
	FPOP_cleanup NL\
	FPOP_save_fcc NL\
	FPOP_ENDI

/* FPOP_save_fcc assumes FPOP_cleanup stored %fsr in scratch */
#define	FPOP_save_fcc \
	ldx	[ %o0 + SIMCPU_SCRATCH64_OFFSET ], %o4 NL\
	set	0x3f, %o3 NL\
	sllx	%o3, 32, %o3		/* FCC[321] mask */ NL\
	or	%o3, (3 << 10), %o3	/* | FCC0 mask */ NL\
	ldx	[ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o2 NL\
	andn	%o2, %o3, %o2 NL\
	and	%o4, %o3, %o4 NL\
	or	%o2, %o4, %o2 NL\
	stx	%o2, [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ]

/*
 * Since we're running as a user process, we're not
 * going to see anything here other than IEEE 754 exceptions.
 *
 * But these have to be handled carefully, since the simulated FSR
 * configuration may require that a proper exception is generated ...
 * Annoyingly we have to save the fsr somewhere in order to get access to
 * the execution results - we use a per-cpu scratch area so we avoid MT conflicts.
 *
 * So retrieve the FSR, stash it back into ctrl sans error bits
 * (tem bits should still be zero).
 * Then, look for errors from the last executed instruction .. if none, then
 * do nothing.  If some, then accumulate or generate a trap as necessary.
 */

/* Must not modify %o0 or %o1 */
#define	FPOP_cleanup \
	stx	%fsr, [ %o0 + SIMCPU_SCRATCH64_OFFSET ] NL\
	ldx	[ %o0 + SIMCPU_SCRATCH64_OFFSET ], %o4 NL\
	/* must clear cexc field if no exceptions */ NL\
	ldx	[ %o0 + SIMCPU_v9FSR_EXC_OFFSET ], %o3 NL\
	and	%o3, 0x1f<<5, %o3 NL\
	stx	%o3, [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ] NL\
	andcc	%o4, 0x1f, %o2 NL\
	bne,a,pt %xcc, sparcv9_fsr_exception_update NL\
	rd	%pc, %o5 NL\
	/* fall through to the update part of the instruction */

#define	FPOP_ENDI \
	ENDI

/*
 * Hand off routine for floating point closure.
 * If any IEEE exception occurred, we need now to check and see if the simulated
 * FSR required a trap to be generated, or the error to be accumulated.
 * NOTE: error is not accumulated if a trap is to be delivered.
 */

	.section	".text"
	.align	8

sparcv9_fsr_exception_update:
	ldx	[ %o0 + SIMCPU_v9FSR_TEM_OFFSET ], %o3
	ldx	[ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o4
	srlx	%o4, 14, %o4		/* FTT field - no trap if non-zero */
	and	%o4, 7, %o4
	movrnz	%o4, %g0, %o3
	andcc	%o2, %o3, %g0
	/* OK build the EXC group ... */
	sllx	%o2, 5, %o3
	/* clear the accumulation if trap to be delivered */
	movne	%xcc, %g0, %o3
	ldx	[ %o0 + SIMCPU_v9FSR_EXC_OFFSET ], %o4
	/* build error and accum bits */
	or	%o2, %o3, %o2
	/* mask out previous accum bits */
	and	%o4, 0x1f<<5, %o4
	/* combine new error and old accum bits */
	or	%o2, %o4, %o2
	/* update the execution FSR state */
	stx	%o2, [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ]
	/*
	 * now that the status is updated, branch into the
	 * C function to deliver the IEEE trap if appropriate
	 */
	bne,pn	%xcc, sparcv9_deliver_ieee_exception
	nop
	jmp	%o5 + 4		/* finish instruction */
	nop

/*
 * instruction targets
 * implemented in assembly language to improve performance on
 * certain host machines.
 *
 * This file is for a sparcv9 host.
 */

	.section	".text"
	.align	8

/* Args are: %o0 = simcpu_t*, %o1 = xicache_instn_t * */

/*
 * Sparc v9 add and subtract instructions
 */

IMPL( sparcv9_add_co_imm )
	ba	internal_add_co; ld_Simm16(%o3)
IMPL( sparcv9_add_co_rrr )
	ldx_Rsrc2(%o3)
internal_add_co:
	ldx_Rsrc1(%o2)
	addcc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_add_co_imm_rd0 )
	ba	internal_add_co_rd0; ld_Simm16(%o3)
IMPL( sparcv9_add_co_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_add_co_rd0:
	ldx_Rsrc1(%o2)
	addcc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_add_ci_imm )
	ba	internal_add_ci; ld_Simm16(%o3)
IMPL( sparcv9_add_ci_rrr )
	ldx_Rsrc2(%o3)
internal_add_ci:
	ldx_Rccr(%o4)
	wr	%o4, %ccr
	ldx_Rsrc1(%o2)
	addc	%o2, %o3, %o2
	stx_Rdest(%o2)
	ENDI

IMPL( sparcv9_add_cico_imm )
	ba	internal_add_cico; ld_Simm16(%o3)
IMPL( sparcv9_add_cico_rrr )
	ldx_Rsrc2(%o3)
internal_add_cico:
	ldx_Rccr(%o4)
	wr	%o4, %ccr
	ldx_Rsrc1(%o2)
	addccc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_add_cico_imm_rd0 )
	ba	internal_add_cico_rd0; ld_Simm16(%o3)
IMPL( sparcv9_add_cico_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_add_cico_rd0:
	ldx_Rccr(%o4)
	wr	%o4, %ccr
	ldx_Rsrc1(%o2)
	addccc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_sub_co_imm )
	ba	internal_sub_co; ld_Simm16(%o3)
IMPL( sparcv9_sub_co_rrr )
	ldx_Rsrc2(%o3)
internal_sub_co:
	ldx_Rsrc1(%o2)
	subcc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_sub_co_imm_rd0 )
	ba	internal_sub_co_rd0; ld_Simm16(%o3)
IMPL( sparcv9_sub_co_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_sub_co_rd0:
	ldx_Rsrc1(%o2)
	subcc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_sub_ci_imm )
	ba	internal_sub_ci; ld_Simm16(%o3)
IMPL( sparcv9_sub_ci_rrr )
	ldx_Rsrc2(%o3)
internal_sub_ci:
	ldx_Rccr(%o4)
	wr	%o4, %ccr
	ldx_Rsrc1(%o2)
	subc	%o2, %o3, %o2
	stx_Rdest(%o2)
	ENDI

IMPL( sparcv9_sub_cico_imm )
	ba	internal_sub_cico; ld_Simm16(%o3)
IMPL( sparcv9_sub_cico_rrr )
	ldx_Rsrc2(%o3)
internal_sub_cico:
	ldx_Rccr(%o4)
	wr	%o4, %ccr
	ldx_Rsrc1(%o2)
	subccc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_sub_cico_imm_rd0 )
	ba	internal_sub_cico_rd0; ld_Simm16(%o3)
IMPL( sparcv9_sub_cico_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_sub_cico_rd0:
	ldx_Rccr(%o4)
	wr	%o4, %ccr
	ldx_Rsrc1(%o2)
	subccc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI
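
/*
 * All of the *_imm / *_rrr pairs above and below share one shape; in C-style
 * pseudocode (a sketch only - Rsrc1/Rsrc2/Rdest stand for the decoded
 * register operands and "ccr" for the simulated condition-code register):
 *
 *	op_imm:	o3 = sign-extended 16-bit immediate from the decoded instn;
 *		goto internal_op;
 *	op_rrr:	o3 = Rsrc2;
 *	internal_op:
 *		o2 = Rsrc1;
 *		(for _ci/_cico forms)  host %ccr = simulated ccr;
 *		run the real host instruction, e.g. addcc o2, o3 -> o2;
 *		Rdest = o2;			(omitted in the _rd0 forms)
 *		simulated ccr = host %ccr;	(for _co/_cico forms)
 *		ENDI:  pc = npc;  npc += 4;
 */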
/*
 * Logic CC instructions ...
 */

IMPL( sparcv9_and_cc_imm )
	ba	internal_and_cc; ld_Simm16(%o3)
IMPL( sparcv9_and_cc_rrr )
	ldx_Rsrc2(%o3)
internal_and_cc:
	ldx_Rsrc1(%o2)
	andcc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_and_cc_imm_rd0 )
	ba	internal_and_cc_rd0; ld_Simm16(%o3)
IMPL( sparcv9_and_cc_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_and_cc_rd0:
	ldx_Rsrc1(%o2)
	andcc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/* sparcv9_andn_cc_imm - synthesised by inverting imm for andcc */
IMPL( sparcv9_andn_cc_rrr )
	ldx_Rsrc2(%o3)
	ldx_Rsrc1(%o2)
	andncc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/* sparcv9_andn_cc_imm_rd0 - synthesised by inverting imm for andcc */
IMPL( sparcv9_andn_cc_rrr_rd0 )
	ldx_Rsrc2(%o3)
	ldx_Rsrc1(%o2)
	andncc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_or_cc_imm )
	ba	internal_or_cc; ld_Simm16(%o3)
IMPL( sparcv9_or_cc_rrr )
	ldx_Rsrc2(%o3)
internal_or_cc:
	ldx_Rsrc1(%o2)
	orcc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_or_cc_imm_rd0 )
	ba	internal_or_cc_rd0; ld_Simm16(%o3)
IMPL( sparcv9_or_cc_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_or_cc_rd0:
	ldx_Rsrc1(%o2)
	orcc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/* sparcv9_orn_cc_imm - synth by inverting imm field for orcc */
IMPL( sparcv9_orn_cc_rrr )
	ldx_Rsrc2(%o3)
	ldx_Rsrc1(%o2)
	orncc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/* sparcv9_orn_cc_imm_rd0 - synth by inverting imm field for orcc */
IMPL( sparcv9_orn_cc_rrr_rd0 )
	ldx_Rsrc2(%o3)
	ldx_Rsrc1(%o2)
	orncc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_xor_cc_imm )
	ba	internal_xor_cc; ld_Simm16(%o3)
IMPL( sparcv9_xor_cc_rrr )
	ldx_Rsrc2(%o3)
internal_xor_cc:
	ldx_Rsrc1(%o2)
	xorcc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

IMPL( sparcv9_xor_cc_imm_rd0 )
	ba	internal_xor_cc_rd0; ld_Simm16(%o3)
IMPL( sparcv9_xor_cc_rrr_rd0 )
	ldx_Rsrc2(%o3)
internal_xor_cc_rd0:
	ldx_Rsrc1(%o2)
	xorcc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/* sparcv9_xnor_cc_imm - can synth by ~imm using xorcc imm */
IMPL( sparcv9_xnor_cc_rrr )
	ldx_Rsrc2(%o3)
	ldx_Rsrc1(%o2)
	xnorcc	%o2, %o3, %o2
	stx_Rdest(%o2)
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/* sparcv9_xnor_cc_imm_rd0 - can synth by ~imm using xorcc imm */
IMPL( sparcv9_xnor_cc_rrr_rd0 )
	ldx_Rsrc2(%o3)
	ldx_Rsrc1(%o2)
	xnorcc	%o2, %o3, %g0
	rd	%ccr, %o3
	stx_Rccr(%o3)
	ENDI

/*
 * Branch instructions change the value of npc.
 * Could encode a mask into the xi immediate, but that would
 * be slow to extract, and leave us with a less than useful
 * immediate field.
 */

/* delay slot:
 *	always executed	- IF a branch IS taken
 *	annulled	- if a bit set and branch IS NOT taken
 */
/*
 * Sparc branches are bloody awful - delay slots plus multiple
 * condition variants ...
 *
 *	X: br Y
 *	   br Z
 *
 *	Y: slot instn
 *
 *	Z:
 *
 * ... instn @X executes instn @Y, but then X+4 in DS of X causes branch to Z
 */

/*
 * Policy:
 *	annul:
 *		not taken:		taken:
 *		pc  = oldnpc+4		pc  = oldnpc
 *		npc = oldnpc + 8	npc = target
 *	no annul:
 *		pc  = npc
 *		npc = target | npc + 4
 */

#define	BRANCH( _opc, _cc ) \
	ldx_Rccr (%o2) NL\
	wr	%o2, %ccr NL\
	ldx_Rpc (%o3) NL\
	ldx_Rnpc(%o4) NL\
	ld_BrOff32(%o5) NL\
	stx_Rpc (%o4) NL\
	add	%o3, %o5, %o5		/* branch target */ NL\
	add	%o4, 4, %o4		/* npc + 4 */ NL\
	mov##_opc _cc, %o5, %o4		/* overwrite npc if branch taken */ NL\
	retl NL\
	stx_Rnpc(%o4)

#define	BRANCH_an( _opc, _cc ) \
	ldx_Rccr (%o2) NL\
	wr	%o2, %ccr NL\
	ldx_Rpc (%o3) NL\
	ldx_Rnpc(%o4) NL\
	ld_BrOff32(%o2) NL\
	add	%o3, %o2, %o3		/* branch target */ NL\
	add	%o4, 4, %o5		/* oldnpc + 4 */ NL\
	add	%o4, 8, %g1		/* oldnpc + 8 */ NL\
	mov##_opc _cc, %o4, %o5		/* overwrite pc if branch taken */ NL\
	mov##_opc _cc, %o3, %g1		/* overwrite npc if branch taken */ NL\
	stx_Rpc (%o5)			/* no annul ds not squashed */ NL\
	retl NL\
	stx_Rnpc(%g1)

/* There has to be a better way than to enunciate every instruction form !! */

/* icc version */
IMPL( sparcv9_bne_icc )		BRANCH( ne, %icc )	ENDINSTN
IMPL( sparcv9_be_icc )		BRANCH( e, %icc )	ENDINSTN
IMPL( sparcv9_bg_icc )		BRANCH( g, %icc )	ENDINSTN
IMPL( sparcv9_ble_icc )		BRANCH( le, %icc )	ENDINSTN
IMPL( sparcv9_bge_icc )		BRANCH( ge, %icc )	ENDINSTN
IMPL( sparcv9_bl_icc )		BRANCH( l, %icc )	ENDINSTN
IMPL( sparcv9_bgu_icc )		BRANCH( gu, %icc )	ENDINSTN
IMPL( sparcv9_bleu_icc )	BRANCH( leu, %icc )	ENDINSTN
IMPL( sparcv9_bcc_icc )		BRANCH( cc, %icc )	ENDINSTN
IMPL( sparcv9_bcs_icc )		BRANCH( cs, %icc )	ENDINSTN
IMPL( sparcv9_bpos_icc )	BRANCH( pos, %icc )	ENDINSTN
IMPL( sparcv9_bneg_icc )	BRANCH( neg, %icc )	ENDINSTN
IMPL( sparcv9_bvc_icc )		BRANCH( vc, %icc )	ENDINSTN
IMPL( sparcv9_bvs_icc )		BRANCH( vs, %icc )	ENDINSTN

/* xcc versions */
IMPL( sparcv9_bne_xcc )		BRANCH( ne, %xcc )	ENDINSTN
IMPL( sparcv9_be_xcc )		BRANCH( e, %xcc )	ENDINSTN
IMPL( sparcv9_bg_xcc )		BRANCH( g, %xcc )	ENDINSTN
IMPL( sparcv9_ble_xcc )		BRANCH( le, %xcc )	ENDINSTN
IMPL( sparcv9_bge_xcc )		BRANCH( ge, %xcc )	ENDINSTN
IMPL( sparcv9_bl_xcc )		BRANCH( l, %xcc )	ENDINSTN
IMPL( sparcv9_bgu_xcc )		BRANCH( gu, %xcc )	ENDINSTN
IMPL( sparcv9_bleu_xcc )	BRANCH( leu, %xcc )	ENDINSTN
IMPL( sparcv9_bcc_xcc )		BRANCH( cc, %xcc )	ENDINSTN
IMPL( sparcv9_bcs_xcc )		BRANCH( cs, %xcc )	ENDINSTN
IMPL( sparcv9_bpos_xcc )	BRANCH( pos, %xcc )	ENDINSTN
IMPL( sparcv9_bneg_xcc )	BRANCH( neg, %xcc )	ENDINSTN
IMPL( sparcv9_bvc_xcc )		BRANCH( vc, %xcc )	ENDINSTN
IMPL( sparcv9_bvs_xcc )		BRANCH( vs, %xcc )	ENDINSTN
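
/*
 * In C terms, the BRANCH / BRANCH_an macros above implement the following
 * (a sketch; oldpc/oldnpc are the pre-branch values of the simulated pc/npc,
 * broff32 is the decoded 32-bit branch offset):
 *
 *	non-annulled (BRANCH):
 *		pc  = oldnpc;
 *		npc = taken ? oldpc + broff32 : oldnpc + 4;
 *
 *	annulled (BRANCH_an):
 *		if (taken) { pc = oldnpc;     npc = oldpc + broff32; }
 *		else       { pc = oldnpc + 4; npc = oldnpc + 8;      }
 */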
/*
 * Annulled delay slot versions !!
 */

/* icc version */
IMPL( sparcv9_bne_icc_an )	BRANCH_an( ne, %icc )	ENDINSTN
IMPL( sparcv9_be_icc_an )	BRANCH_an( e, %icc )	ENDINSTN
IMPL( sparcv9_bg_icc_an )	BRANCH_an( g, %icc )	ENDINSTN
IMPL( sparcv9_ble_icc_an )	BRANCH_an( le, %icc )	ENDINSTN
IMPL( sparcv9_bge_icc_an )	BRANCH_an( ge, %icc )	ENDINSTN
IMPL( sparcv9_bl_icc_an )	BRANCH_an( l, %icc )	ENDINSTN
IMPL( sparcv9_bgu_icc_an )	BRANCH_an( gu, %icc )	ENDINSTN
IMPL( sparcv9_bleu_icc_an )	BRANCH_an( leu, %icc )	ENDINSTN
IMPL( sparcv9_bcc_icc_an )	BRANCH_an( cc, %icc )	ENDINSTN
IMPL( sparcv9_bcs_icc_an )	BRANCH_an( cs, %icc )	ENDINSTN
IMPL( sparcv9_bpos_icc_an )	BRANCH_an( pos, %icc )	ENDINSTN
IMPL( sparcv9_bneg_icc_an )	BRANCH_an( neg, %icc )	ENDINSTN
IMPL( sparcv9_bvc_icc_an )	BRANCH_an( vc, %icc )	ENDINSTN
IMPL( sparcv9_bvs_icc_an )	BRANCH_an( vs, %icc )	ENDINSTN

/* xcc versions */
IMPL( sparcv9_bne_xcc_an )	BRANCH_an( ne, %xcc )	ENDINSTN
IMPL( sparcv9_be_xcc_an )	BRANCH_an( e, %xcc )	ENDINSTN
IMPL( sparcv9_bg_xcc_an )	BRANCH_an( g, %xcc )	ENDINSTN
IMPL( sparcv9_ble_xcc_an )	BRANCH_an( le, %xcc )	ENDINSTN
IMPL( sparcv9_bge_xcc_an )	BRANCH_an( ge, %xcc )	ENDINSTN
IMPL( sparcv9_bl_xcc_an )	BRANCH_an( l, %xcc )	ENDINSTN
IMPL( sparcv9_bgu_xcc_an )	BRANCH_an( gu, %xcc )	ENDINSTN
IMPL( sparcv9_bleu_xcc_an )	BRANCH_an( leu, %xcc )	ENDINSTN
IMPL( sparcv9_bcc_xcc_an )	BRANCH_an( cc, %xcc )	ENDINSTN
IMPL( sparcv9_bcs_xcc_an )	BRANCH_an( cs, %xcc )	ENDINSTN
IMPL( sparcv9_bpos_xcc_an )	BRANCH_an( pos, %xcc )	ENDINSTN
IMPL( sparcv9_bneg_xcc_an )	BRANCH_an( neg, %xcc )	ENDINSTN
IMPL( sparcv9_bvc_xcc_an )	BRANCH_an( vc, %xcc )	ENDINSTN
IMPL( sparcv9_bvs_xcc_an )	BRANCH_an( vs, %xcc )	ENDINSTN

#undef	BRANCH
#undef	BRANCH_an

/*
 * versions for the branch on register value operations
 */

/*
 * Policy:
 *	no annul:
 *		pc  = npc
 *		npc = target | npc + 4
 *	annul:
 *		not taken:		taken:
 *		pc  = oldnpc+4		pc  = oldnpc
 *		npc = oldnpc + 8	npc = target
 */

#define	BRANCH( _opc ) \
	ldx_Rsrc1(%o2) NL\
	ldx_Rpc (%o3) NL\
	ldx_Rnpc(%o4) NL\
	ld_BrRegOff32(%o5) NL\
	stx_Rpc (%o4)			/* pc = npc */ NL\
	add	%o3, %o5, %o5		/* branch target */ NL\
	add	%o4, 4, %o4		/* npc + 4 */ NL\
	movr##_opc %o2, %o5, %o4	/* overwrite npc if branch taken */ NL\
	retl NL\
	stx_Rnpc(%o4)

#define	BRANCH_an( _opc ) \
	ldx_Rsrc1(%o2) NL\
	ldx_Rpc (%o3) NL\
	ldx_Rnpc(%o4) NL\
	add	%o4, 4, %o5		/* oldnpc + 4 */ NL\
	movr##_opc %o2, %o4, %o5	/* overwrite pc if branch taken */ NL\
	stx_Rpc (%o5)			/* no annul ds not squashed */ NL\
	ld_BrRegOff32(%o5) NL\
	add	%o3, %o5, %o3		/* branch target */ NL\
	add	%o4, 8, %o5		/* oldnpc + 8 */ NL\
	movr##_opc %o2, %o3, %o5	/* overwrite npc if branch taken */ NL\
	retl NL\
	stx_Rnpc(%o5)

IMPL( sparcv9_brz )		BRANCH( z )	ENDINSTN
IMPL( sparcv9_brlez )		BRANCH( lez )	ENDINSTN
IMPL( sparcv9_brlz )		BRANCH( lz )	ENDINSTN
IMPL( sparcv9_brnz )		BRANCH( nz )	ENDINSTN
IMPL( sparcv9_brgz )		BRANCH( gz )	ENDINSTN
IMPL( sparcv9_brgez )		BRANCH( gez )	ENDINSTN

IMPL( sparcv9_brz_an )		BRANCH_an( z )	ENDINSTN
IMPL( sparcv9_brlez_an )	BRANCH_an( lez )	ENDINSTN
IMPL( sparcv9_brlz_an )		BRANCH_an( lz )	ENDINSTN
IMPL( sparcv9_brnz_an )		BRANCH_an( nz )	ENDINSTN
IMPL( sparcv9_brgz_an )		BRANCH_an( gz )	ENDINSTN
IMPL( sparcv9_brgez_an )	BRANCH_an( gez )	ENDINSTN

/*
 * SPARC floating point compares
 */

IMPL( sparcv9_fcmps_fcc0 )	FPOP_cmp( ld, fcmps, fcc0 )	ENDINSTN
IMPL( sparcv9_fcmps_fcc1 )	FPOP_cmp( ld, fcmps, fcc1 )	ENDINSTN
IMPL( sparcv9_fcmps_fcc2 )	FPOP_cmp( ld, fcmps, fcc2 )	ENDINSTN
IMPL( sparcv9_fcmps_fcc3 )	FPOP_cmp( ld, fcmps, fcc3 )	ENDINSTN

IMPL( sparcv9_fcmpd_fcc0 )	FPOP_cmp( ldd, fcmpd, fcc0 )	ENDINSTN
IMPL( sparcv9_fcmpd_fcc1 )	FPOP_cmp( ldd, fcmpd, fcc1 )	ENDINSTN
IMPL( sparcv9_fcmpd_fcc2 )	FPOP_cmp( ldd, fcmpd, fcc2 )	ENDINSTN
IMPL( sparcv9_fcmpd_fcc3 )	FPOP_cmp( ldd, fcmpd, fcc3 )	ENDINSTN

IMPL( sparcv9_fcmpes_fcc0 )	FPOP_cmp( ld, fcmpes, fcc0 )	ENDINSTN
IMPL( sparcv9_fcmpes_fcc1 )	FPOP_cmp( ld, fcmpes, fcc1 )	ENDINSTN
IMPL( sparcv9_fcmpes_fcc2 )	FPOP_cmp( ld, fcmpes, fcc2 )	ENDINSTN
IMPL( sparcv9_fcmpes_fcc3 )	FPOP_cmp( ld, fcmpes, fcc3 )	ENDINSTN

IMPL( sparcv9_fcmped_fcc0 )	FPOP_cmp( ldd, fcmped, fcc0 )	ENDINSTN
IMPL( sparcv9_fcmped_fcc1 )	FPOP_cmp( ldd, fcmped, fcc1 )	ENDINSTN
IMPL( sparcv9_fcmped_fcc2 )	FPOP_cmp( ldd, fcmped, fcc2 )	ENDINSTN
IMPL( sparcv9_fcmped_fcc3 )	FPOP_cmp( ldd, fcmped, fcc3 )	ENDINSTN