* ========== Copyright Header Begin ==========================================
* OpenSPARC T2 Processor File: sparcv9native.S
* Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
* The above named program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
* The above named program is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
* You should have received a copy of the GNU General Public
* License along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
* ========== Copyright Header End ============================================
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
#pragma ident "@(#)sparcv9native.S 1.25 07/02/20 SMI"
#include <sys/asm_linkage.h>
* Assembly support functions required by the simulator
ALTENTRY(sim_atomic_add_32_nv)
ALTENTRY(sim_atomic_add_long)
ALTENTRY(sim_atomic_add_long_nv)
add %o2, %o1, %o0 ! return new value
SET_SIZE(sim_atomic_add_32_nv)
SET_SIZE(sim_atomic_add_32)
SET_SIZE(sim_atomic_add_long_nv)
SET_SIZE(sim_atomic_add_long)
* o0 = pointer to memory location
* o1 = value to compare with
* o2 = value to swap in if equal
* o0 = original contents of memory location
* o0 = pointer to memory location
* o1 = value to compare with
* o2 = value to swap in if equal
* o0 = original contents of memory location
* Atomic load of 128 bits big endian into two 64bit registers.
* Have to do this kludge because SPARC doesn't provide a 128-bit atomic
* fetch that executes at user level.
* I just hope the 64byte block load is atomic on all architectures.
* %o0 points to memory location (128 bit aligned).
* %o1 points to high 64bits of result register (big endian)
* %o2 points to low 64bits of result register (big endian)
#define ASI_BLK_P 0xf0 /* VIS 1.0 block load from primary AS */
.global host_atomic_get128be
/* align the memory address for a block load */
membar #Sync /* ensure the data is present */
and %o0, 0x30, %o4 /* figure out which of the 4 128bit blocks we want */
jmpl %o0 + (_xxword0 - _base), %g0
* Assembly version of certain simulator instruction implementations.
/*
 * ld_simm16: fetch the 16-bit immediate field of a decoded instruction,
 * sign-extended to 64 bits (ldsh = load signed halfword).
 *   _Rxip    register holding the decoded-instruction pointer
 *            (xicache_instn_t *, per the callers below)
 *   _offset  byte offset of the simm16 field within the record
 *   _Rdest   destination register for the sign-extended value
 * Deliberately a single instruction so it can sit in a delay slot.
 */
#define ld_simm16( _Rxip, _offset, _Rdest ) \
ldsh [ _Rxip + _offset ], _Rdest /* 1 instn only !! */
/*
 * ld_simm32: fetch the 32-bit immediate field of a decoded instruction,
 * sign-extended to 64 bits (ldsw = load signed word).
 *   _Rxip    register holding the decoded-instruction pointer
 *   _offset  byte offset of the simm32 field within the record
 *   _Rdest   destination register for the sign-extended value
 * Deliberately a single instruction so it can sit in a delay slot.
 */
#define ld_simm32( _Rxip, _offset, _Rdest ) \
ldsw [ _Rxip + _offset ], _Rdest /* 1 instn only !! */
/*
 * ldx_ireg: load a simulated 64-bit integer register.
 * Fetches an unsigned halfword from the decoded instruction at _offset
 * (presumably a pre-computed byte offset into the simcpu_t register
 * area — confirm against the decoder), then uses it to index from the
 * simcpu pointer and load the register value.
 *   _Rcpup     register holding the simcpu_t pointer
 *   _Rxip      register holding the decoded-instruction pointer
 *   _offset    byte offset of the register-offset field in the record
 *   _Rdest     destination register for the 64-bit value
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define ldx_ireg( _Rcpup, _Rxip, _offset, _Rdest, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
ldx [ _Rcpup + _Rscratch ], _Rdest
/*
 * stx_ireg: store to a simulated 64-bit integer register.
 * Mirror of ldx_ireg: fetches the register-offset halfword from the
 * decoded instruction, then stores _Rval at that offset from the
 * simcpu pointer.
 *   _Rcpup     register holding the simcpu_t pointer
 *   _Rxip      register holding the decoded-instruction pointer
 *   _offset    byte offset of the register-offset field in the record
 *   _Rval      64-bit value to store
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define stx_ireg( _Rcpup, _Rxip, _offset, _Rval, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
stx _Rval, [ _Rcpup + _Rscratch ]
/*
 * ld_fpreg: load a simulated floating-point register.
 * Fetches the FP-register-offset halfword from the decoded instruction,
 * then loads from the FP register area via the caller-supplied load
 * instruction _ldt (ld for single, ldd for double — see FPOP_cmp users).
 *   _ldt       load mnemonic to use (e.g. ld / ldd)
 *   _Rcpup     simcpu_t pointer — NOTE(review): unused in this body;
 *              kept, presumably, for signature symmetry with ldx_ireg
 *   _Rxip      register holding the decoded-instruction pointer
 *   _Rfprp     register holding the FP register file base pointer
 *   _offset    byte offset of the FP-register-offset field in the record
 *   _Rdest     destination FP register
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define ld_fpreg( _ldt, _Rcpup, _Rxip, _Rfprp, _offset, _Rdest, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
_ldt [ _Rfprp + _Rscratch ], _Rdest
/*
 * st_fpreg: store to a simulated floating-point register.
 * Mirror of ld_fpreg: fetches the FP-register-offset halfword, then
 * stores _Rdest into the FP register area via the caller-supplied
 * store instruction _stt (e.g. st / std).
 *   _stt       store mnemonic to use
 *   _Rcpup     simcpu_t pointer — NOTE(review): unused in this body;
 *              kept, presumably, for signature symmetry with stx_ireg
 *   _Rxip      register holding the decoded-instruction pointer
 *   _Rfprp     register holding the FP register file base pointer
 *   _offset    byte offset of the FP-register-offset field in the record
 *   _Rdest     FP register whose value is stored
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define st_fpreg( _stt, _Rcpup, _Rxip, _Rfprp, _offset, _Rdest, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
_stt _Rdest, [ _Rfprp + _Rscratch ]
* %o1 = xicache_instn_t *
* %g1 is used as a scratch register by these macros
/*
 * Convenience accessors for the instruction-implementation bodies.
 * These bind the generic field macros above to the fixed register
 * convention used by the IMPL bodies: %o0 = simcpu_t *,
 * %o1 = xicache_instn_t *, %g1 = scratch (clobbered).
 */
/* Load / store the simulated integer source and destination registers. */
#define ldx_Rsrc1( _r ) ldx_ireg( %o0, %o1, XIC_INTREG_SRC1_OFFSET, _r, %g1 )
#define ldx_Rsrc2( _r ) ldx_ireg( %o0, %o1, XIC_INTREG_SRC2_OFFSET, _r, %g1 )
/* Load the decoded sign-extended immediate / branch-offset fields. */
#define ld_Simm16( _r ) ld_simm16( %o1, XIC_SIMM16_OFFSET, _r )
#define ld_BrOff32( _r ) ld_simm32( %o1, XIC_BROFF32_OFFSET, _r )
#define ld_BrRegOff32( _r ) ld_simm32( %o1, XIC_BREGOFF32_OFFSET, _r )
#define stx_Rdest( _r ) stx_ireg( %o0, %o1, XIC_INTREG_DEST_OFFSET, _r, %g1 )
/* Simulated condition codes (v9 CCR) and program counters, kept in simcpu_t. */
#define ldx_Rccr( _r ) ldx [ %o0 + SIMCPU_v9CCR_OFFSET ], _r
#define stx_Rccr( _r ) stx _r, [ %o0 + SIMCPU_v9CCR_OFFSET ]
#define ldx_Rpc( _r ) ldx [ %o0 + SIMCPU_PC_OFFSET ], _r
#define stx_Rpc( _r ) stx _r, [ %o0 + SIMCPU_PC_OFFSET ]
#define ldx_Rnpc( _r ) ldx [ %o0 + SIMCPU_NPC_OFFSET ], _r
#define stx_Rnpc( _r ) stx _r, [ %o0 + SIMCPU_NPC_OFFSET ]
/*
 * FP source operands; _ldt selects the load width (ld / ldd).
 * NOTE(review): %o0 is passed as both the simcpu and FP-base pointer here —
 * the FP register offsets are presumably relative to simcpu_t; confirm.
 */
#define ld_FPsrc1( _ldt, _r ) ld_fpreg( _ldt, %o0, %o1, %o0, XIC_FPREG_SRC1_OFFSET, _r, %g1 )
#define ld_FPsrc2( _ldt, _r ) ld_fpreg( _ldt, %o0, %o1, %o0, XIC_FPREG_SRC2_OFFSET, _r, %g1 )
.global decoded_impl_##_name NL\
ldx [ %o0 + SIMCPU_NPC_OFFSET ], %o1 NL\
stx %o1, [ %o0 + SIMCPU_PC_OFFSET ] NL\
stx %o2, [ %o0 + SIMCPU_NPC_OFFSET ] NL\
#define ENDINSTN /* nada */
* For executing floating point operations on SPARC ..
* .. specifically SPARC on SPARC we use the FSR for the
* cpu being emulated, but disable all floating point traps.
* Then we test whether an exception has occurred after the
* instruction execution, and update the simulated FSR
* accordingly, then finally signal a trap if the simulated
* machine actually desired one.
* the simcpu_t simulated registers are used as follows:
* v9_fsr_ctrl holds the SPARC fsr control bits ..
* for condition codes, rounding etc. The execution results
* (errors and accumulated errors) are held in the
* v9_fsr_exc pseudo register, and the trap enable bits
* (TEM) are held in the v9_fsr_tem pseudo reg.
* Note: we have to use the simcpu_t scratch64 value because we
* can't get the fsr value out of the cpu without using a
* store instruction. This probably should be per exec_thread, but
* it's harder to get at than per simcpu in here.
ldx [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o4 NL\
sllx %o3, 30, %o3 /* FCC[321] + RD mask */ NL\
or %o3, (3 << 10), %o3 /* | FCC0 mask */ NL\
stx %o4, [ %o0 + SIMCPU_SCRATCH64_OFFSET ] NL\
ldx [ %o0 + SIMCPU_SCRATCH64_OFFSET ], %fsr NL\
ldx [ %o0 + SIMCPU_v9GSR_CTRL_OFFSET ], %o4 NL\
#ifdef FP_DECODE_DISABLED
#define FPOP_fpu_on_check
#else /* FP_DECODE_DISABLED */
#define FPOP_fpu_on_check \
ldx [ %o0 + SIMCPU_SPECIFICP_OFFSET ], %o4 NL\
ld [ %o4 + SPARCV9_FPU_ON_OFFSET ], %o4 NL\
brz %o4, sparcv9_deliver_fp_disabled_exception NL\
#endif /* FP_DECODE_DISABLED */
#define FPOP_cmp( _ldt, _fpop, _fcc ) \
ld_FPsrc1( _ldt, %f0 ) NL\
ld_FPsrc2( _ldt, %f4 ) NL\
_fpop %_fcc, %f0, %f4 NL\
/* FPOP_save_fcc assumes FPOP_cleanup stored %fsr in scratch */
ldx [ %o0 + SIMCPU_SCRATCH64_OFFSET ], %o4 NL\
sllx %o3, 32, %o3 /* FCC[321] mask */ NL\
or %o3, (3 << 10), %o3 /* | FCC0 mask */ NL\
ldx [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o2 NL\
stx %o2, [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ]
* Since we're running as a user process, we're not
* going to see anything here other than ieee754 exceptions
* But these have to be handled carefully, since the simulated FSR
* configuration may require that a proper exception is generated
* annoyingly we have to save the fsr somewhere in order to get access to
* the execution results - we use a per-cpu scratch area so we avoid MT conflicts
* So retrieve the FSR, stash it back into ctrl sans error bits
* (tem bits should still be zero).
* Then, look for errors from the last executed instruction .. if none, then
* do nothing. If some then accumulate or generate a trap as necessary.
/* Must not modify %o0 or %o1 */
stx %fsr, [ %o0 + SIMCPU_SCRATCH64_OFFSET ] NL\
ldx [ %o0 + SIMCPU_SCRATCH64_OFFSET ], %o4 NL\
/* must clear cexec field if no exceptions */ NL\
ldx [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ], %o3 NL\
and %o3, 0x1f<<5, %o3 NL\
stx %o3, [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ] NL\
bne,a,pt %xcc, sparcv9_fsr_exception_update NL\
/* fall through to the update part of the instruction */
* Hand off routine for floating point closure
* If any IEEE exception occurred, we need now to check and see if the simulated
* FSR required a trap to be generated, or the error to be accumulated.
* NOTE: error is not accumulated if a trap is to be delivered.
sparcv9_fsr_exception_update:
ldx [ %o0 + SIMCPU_v9FSR_TEM_OFFSET ], %o3
ldx [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o4
srlx %o4, 14, %o4 /* FTT field - no trap if non-zero */
/* OK build the EXC group ... */
/* clear the accumulation if trap to be delivered */
ldx [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ], %o4
/* build error and accum bits */
/* mask out previous accum bits */
/* combine new error and old accum bits */
/* update the execution FSR state */
stx %o2, [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ]
/* now that the status is updated, branch into the
* C function to deliver the IEEE trap if appropriate
bne,pn %xcc, sparcv9_deliver_ieee_exception
jmp %o5 + 4 /* finish instruction */
* implemented in assembly language to improve performance on
* This file is for a sparcv9 host.
/* Args are: %o0 = simcpu_t*, %o1 = xicache_instn_t * */
* SPARC v9 add and subtract instructions
IMPL( sparcv9_add_co_imm )
IMPL( sparcv9_add_co_rrr )
IMPL( sparcv9_add_co_imm_rd0 )
IMPL( sparcv9_add_co_rrr_rd0 )
IMPL( sparcv9_add_ci_imm )
IMPL( sparcv9_add_ci_rrr )
IMPL( sparcv9_add_cico_imm )
IMPL( sparcv9_add_cico_rrr )
IMPL( sparcv9_add_cico_imm_rd0 )
ba internal_add_cico_rd0;
IMPL( sparcv9_add_cico_rrr_rd0 )
IMPL( sparcv9_sub_co_imm )
IMPL( sparcv9_sub_co_rrr )
IMPL( sparcv9_sub_co_imm_rd0 )
IMPL( sparcv9_sub_co_rrr_rd0 )
IMPL( sparcv9_sub_ci_imm )
IMPL( sparcv9_sub_ci_rrr )
IMPL( sparcv9_sub_cico_imm )
IMPL( sparcv9_sub_cico_rrr )
IMPL( sparcv9_sub_cico_imm_rd0 )
ba internal_sub_cico_rd0;
IMPL( sparcv9_sub_cico_rrr_rd0 )
* Logic CC instructions ...
IMPL( sparcv9_and_cc_imm )
IMPL( sparcv9_and_cc_rrr )
IMPL( sparcv9_and_cc_imm_rd0 )
IMPL( sparcv9_and_cc_rrr_rd0 )
/* sparcv9_andn_cc_imm - synthesized by inverting imm for andcc */
IMPL( sparcv9_andn_cc_rrr )
/* sparcv9_andn_cc_imm_rd0 - synthesized by inverting imm for andcc */
IMPL( sparcv9_andn_cc_rrr_rd0 )
IMPL( sparcv9_or_cc_imm )
IMPL( sparcv9_or_cc_rrr )
IMPL( sparcv9_or_cc_imm_rd0 )
IMPL( sparcv9_or_cc_rrr_rd0 )
/* sparcv9_orn_cc_imm - synth by inverting imm field for orcc */
IMPL( sparcv9_orn_cc_rrr )
/* sparcv9_orn_cc_imm_rd0 - synth by inverting imm field for orcc */
IMPL( sparcv9_orn_cc_rrr_rd0 )
IMPL( sparcv9_xor_cc_imm )
IMPL( sparcv9_xor_cc_rrr )
IMPL( sparcv9_xor_cc_imm_rd0 )
IMPL( sparcv9_xor_cc_rrr_rd0 )
/* sparcv9_xnor_cc_imm - can synth by ~imm using orcc imm */
IMPL( sparcv9_xnor_cc_rrr )
/* sparcv9_xnor_cc_imm_rd0 - can synth by ~imm using xorcc imm */
IMPL( sparcv9_xnor_cc_rrr_rd0 )
* Branch instructions change the value of npc
* Could encode a mask into the xi immediate, but that would
* be slow to extract, and leave us with a less than useful
* always executed - IF a branch IS taken
* annulled - if a bit set and branch IS NOT taken
/* SPARC branches are bloody awful - delay slots plus multiple
* ... instn @X executes instn @Y, but then X+4 in DS of X causes branch to Z
pc = oldnpc+4 pc = oldnpc
npc = oldnpc + 8; npc = target
#define BRANCH( _opc, _cc ) \
add %o3, %o5, %o5 /* branch target */ NL\
add %o4, 4, %o4 /* npc + 4 */ NL\
mov##_opc _cc, %o5, %o4 /* overwrite npc if branch taken */ NL\
#define BRANCH_an( _opc, _cc ) \
add %o3, %o2, %o3 /* branch target */ NL\
add %o4, 4, %o5 /* oldnpc + 4 */ NL\
add %o4, 8, %g1 /* oldnpc + 8 */ NL\
mov##_opc _cc, %o4, %o5 /* overwrite pc if branch taken */ NL\
mov##_opc _cc, %o3, %g1 /* overwrite npc if branch taken */ NL\
stx_Rpc (%o5) /* no annul ds not squashed */ NL\
/* There has to be a better way than to enunciate every instruction form !! */
* Annulled delay slot versions !!
IMPL( sparcv9_bne_icc_an )
IMPL( sparcv9_be_icc_an )
IMPL( sparcv9_bg_icc_an )
IMPL( sparcv9_ble_icc_an )
IMPL( sparcv9_bge_icc_an )
IMPL( sparcv9_bl_icc_an )
IMPL( sparcv9_bgu_icc_an )
IMPL( sparcv9_bleu_icc_an )
IMPL( sparcv9_bcc_icc_an )
IMPL( sparcv9_bcs_icc_an )
IMPL( sparcv9_bpos_icc_an )
IMPL( sparcv9_bneg_icc_an )
IMPL( sparcv9_bvc_icc_an )
IMPL( sparcv9_bvs_icc_an )
IMPL( sparcv9_bne_xcc_an )
IMPL( sparcv9_be_xcc_an )
IMPL( sparcv9_bg_xcc_an )
IMPL( sparcv9_ble_xcc_an )
IMPL( sparcv9_bge_xcc_an )
IMPL( sparcv9_bl_xcc_an )
IMPL( sparcv9_bgu_xcc_an )
IMPL( sparcv9_bleu_xcc_an )
IMPL( sparcv9_bcc_xcc_an )
IMPL( sparcv9_bcs_xcc_an )
IMPL( sparcv9_bpos_xcc_an )
IMPL( sparcv9_bneg_xcc_an )
IMPL( sparcv9_bvc_xcc_an )
IMPL( sparcv9_bvs_xcc_an )
* versions for the branch on register value operations
pc = oldnpc+4 pc = oldnpc
npc = oldnpc + 8; npc = target
stx_Rpc (%o4) /* pc = npc */ NL\
add %o3, %o5, %o5 /* branch target */ NL\
add %o4, 4, %o4 /* npc + 4 */ NL\
movr##_opc %o2, %o5, %o4 /* overwrite npc if branch taken */ NL\
#define BRANCH_an( _opc ) \
add %o4, 4, %o5 /* oldnpc + 4 */ NL\
movr##_opc %o2, %o4, %o5 /* overwrite pc if branch taken */ NL\
stx_Rpc (%o5) /* no annul ds not squashed */ NL\
add %o3, %o5, %o3 /* branch target */ NL\
add %o4, 8, %o5 /* oldnpc + 8 */ NL\
movr##_opc %o2, %o3, %o5 /* overwrite npc if branch taken */ NL\
* SPARC floating point compares
IMPL( sparcv9_fcmps_fcc0 )
FPOP_cmp( ld, fcmps, fcc0 )
IMPL( sparcv9_fcmps_fcc1 )
FPOP_cmp( ld, fcmps, fcc1 )
IMPL( sparcv9_fcmps_fcc2 )
FPOP_cmp( ld, fcmps, fcc2 )
IMPL( sparcv9_fcmps_fcc3 )
FPOP_cmp( ld, fcmps, fcc3 )
IMPL( sparcv9_fcmpd_fcc0 )
FPOP_cmp( ldd, fcmpd, fcc0 )
IMPL( sparcv9_fcmpd_fcc1 )
FPOP_cmp( ldd, fcmpd, fcc1 )
IMPL( sparcv9_fcmpd_fcc2 )
FPOP_cmp( ldd, fcmpd, fcc2 )
IMPL( sparcv9_fcmpd_fcc3 )
FPOP_cmp( ldd, fcmpd, fcc3 )
IMPL( sparcv9_fcmpes_fcc0 )
FPOP_cmp( ld, fcmpes, fcc0 )
IMPL( sparcv9_fcmpes_fcc1 )
FPOP_cmp( ld, fcmpes, fcc1 )
IMPL( sparcv9_fcmpes_fcc2 )
FPOP_cmp( ld, fcmpes, fcc2 )
IMPL( sparcv9_fcmpes_fcc3 )
FPOP_cmp( ld, fcmpes, fcc3 )
IMPL( sparcv9_fcmped_fcc0 )
FPOP_cmp( ldd, fcmped, fcc0 )
IMPL( sparcv9_fcmped_fcc1 )
FPOP_cmp( ldd, fcmped, fcc1 )
IMPL( sparcv9_fcmped_fcc2 )
FPOP_cmp( ldd, fcmped, fcc2 )
IMPL( sparcv9_fcmped_fcc3 )
FPOP_cmp( ldd, fcmped, fcc3 )