* ========== Copyright Header Begin ==========================================
* OpenSPARC T2 Processor File: sparcv9native.S
* Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
* The above named program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
* The above named program is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
* You should have received a copy of the GNU General Public
* License along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
* ========== Copyright Header End ============================================
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
#pragma ident "@(#)sparcv9native.S 1.25 07/02/20 SMI"
#include <sys/asm_linkage.h>
* Assembly support functions required by the simulator
ALTENTRY(sim_atomic_add_32_nv)
ALTENTRY(sim_atomic_add_long)
ALTENTRY(sim_atomic_add_long_nv)
add %o2, %o1, %o0 ! return new value
SET_SIZE(sim_atomic_add_32_nv)
SET_SIZE(sim_atomic_add_32)
SET_SIZE(sim_atomic_add_long_nv)
SET_SIZE(sim_atomic_add_long)
* o0 = pointer to memory location
* o1 = value to compare with
* o2 = value to swap in if equal
* o0 = original contents of memory location
* o0 = pointer to memory location
* o1 = value to compare with
* o2 = value to swap in if equal
* o0 = original contents of memory location
* Atomic load of 128 bits big endian into two 64bit registers.
* Have to do this kludge because SPARC doesn't provide a 128-bit atomic
* fetch that executes at user level.
* I just hope the 64byte block load is atomic on all architectures.
* %o0 points to memory location (128 bit aligned).
* %o1 points to high 64bits of result register (big endian)
* %o2 points to low 64bits of result register (big endian)
#define ASI_BLK_P 0xf0 /* VIS 1.0 block load from primary AS */
.global host_atomic_get128be
/* align the memory address for a block load */
membar #Sync /* ensure the data is present */
and %o0, 0x30, %o4 /* figure out which of the 4 128bit blocks we want */
jmpl %o0 + (_xxword0 - _base), %g0
* Assembly version of certain simulator instruction implementations.
/*
 * ld_simm16: fetch the 16-bit immediate field of a decoded instruction,
 * sign-extended to 64 bits (ldsh = load signed halfword).
 *   _Rxip    register holding the decoded-instruction pointer
 *            (xicache_instn_t *, per the callers below)
 *   _offset  byte offset of the simm16 field within the record
 *   _Rdest   destination register for the sign-extended value
 * Deliberately a single instruction so it can sit in a delay slot.
 */
#define ld_simm16( _Rxip, _offset, _Rdest ) \
ldsh [ _Rxip + _offset ], _Rdest /* 1 instn only !! */
/*
 * ld_simm32: fetch the 32-bit immediate field of a decoded instruction,
 * sign-extended to 64 bits (ldsw = load signed word).
 *   _Rxip    register holding the decoded-instruction pointer
 *   _offset  byte offset of the simm32 field within the record
 *   _Rdest   destination register for the sign-extended value
 * Deliberately a single instruction so it can sit in a delay slot.
 */
#define ld_simm32( _Rxip, _offset, _Rdest ) \
ldsw [ _Rxip + _offset ], _Rdest /* 1 instn only !! */
/*
 * ldx_ireg: load a simulated 64-bit integer register.
 * Fetches an unsigned halfword from the decoded instruction at _offset
 * (presumably a pre-computed byte offset into the simcpu_t register
 * area — confirm against the decoder), then uses it to index from the
 * simcpu pointer and load the register value.
 *   _Rcpup     register holding the simcpu_t pointer
 *   _Rxip      register holding the decoded-instruction pointer
 *   _offset    byte offset of the register-offset field in the record
 *   _Rdest     destination register for the 64-bit value
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define ldx_ireg( _Rcpup, _Rxip, _offset, _Rdest, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
ldx [ _Rcpup + _Rscratch ], _Rdest
/*
 * stx_ireg: store to a simulated 64-bit integer register.
 * Mirror of ldx_ireg: fetches the register-offset halfword from the
 * decoded instruction, then stores _Rval at that offset from the
 * simcpu pointer.
 *   _Rcpup     register holding the simcpu_t pointer
 *   _Rxip      register holding the decoded-instruction pointer
 *   _offset    byte offset of the register-offset field in the record
 *   _Rval      64-bit value to store
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define stx_ireg( _Rcpup, _Rxip, _offset, _Rval, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
stx _Rval, [ _Rcpup + _Rscratch ]
/*
 * ld_fpreg: load a simulated floating-point register.
 * Fetches the FP-register-offset halfword from the decoded instruction,
 * then loads from the FP register area via the caller-supplied load
 * instruction _ldt (ld for single, ldd for double — see FPOP_cmp users).
 *   _ldt       load mnemonic to use (e.g. ld / ldd)
 *   _Rcpup     simcpu_t pointer — NOTE(review): unused in this body;
 *              kept, presumably, for signature symmetry with ldx_ireg
 *   _Rxip      register holding the decoded-instruction pointer
 *   _Rfprp     register holding the FP register file base pointer
 *   _offset    byte offset of the FP-register-offset field in the record
 *   _Rdest     destination FP register
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define ld_fpreg( _ldt, _Rcpup, _Rxip, _Rfprp, _offset, _Rdest, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
_ldt [ _Rfprp + _Rscratch ], _Rdest
/*
 * st_fpreg: store to a simulated floating-point register.
 * Mirror of ld_fpreg: fetches the FP-register-offset halfword, then
 * stores _Rdest into the FP register area via the caller-supplied
 * store instruction _stt (e.g. st / std).
 *   _stt       store mnemonic to use
 *   _Rcpup     simcpu_t pointer — NOTE(review): unused in this body;
 *              kept, presumably, for signature symmetry with stx_ireg
 *   _Rxip      register holding the decoded-instruction pointer
 *   _Rfprp     register holding the FP register file base pointer
 *   _offset    byte offset of the FP-register-offset field in the record
 *   _Rdest     FP register whose value is stored
 *   _Rscratch  scratch register, clobbered (callers below pass %g1)
 */
#define st_fpreg( _stt, _Rcpup, _Rxip, _Rfprp, _offset, _Rdest, _Rscratch ) \
lduh [ _Rxip + _offset ], _Rscratch NL\
_stt _Rdest, [ _Rfprp + _Rscratch ]
* %o1 = xicache_instn_t *
* %g1 is used as a scratch register by these macros
/*
 * Convenience accessors for the instruction-implementation bodies.
 * These bind the generic field macros above to the fixed register
 * convention used by the IMPL bodies: %o0 = simcpu_t *,
 * %o1 = xicache_instn_t *, %g1 = scratch (clobbered).
 */
/* Load / store the simulated integer source and destination registers. */
#define ldx_Rsrc1( _r ) ldx_ireg( %o0, %o1, XIC_INTREG_SRC1_OFFSET, _r, %g1 )
#define ldx_Rsrc2( _r ) ldx_ireg( %o0, %o1, XIC_INTREG_SRC2_OFFSET, _r, %g1 )
/* Load the decoded sign-extended immediate / branch-offset fields. */
#define ld_Simm16( _r ) ld_simm16( %o1, XIC_SIMM16_OFFSET, _r )
#define ld_BrOff32( _r ) ld_simm32( %o1, XIC_BROFF32_OFFSET, _r )
#define ld_BrRegOff32( _r ) ld_simm32( %o1, XIC_BREGOFF32_OFFSET, _r )
#define stx_Rdest( _r ) stx_ireg( %o0, %o1, XIC_INTREG_DEST_OFFSET, _r, %g1 )
/* Simulated condition codes (v9 CCR) and program counters, kept in simcpu_t. */
#define ldx_Rccr( _r ) ldx [ %o0 + SIMCPU_v9CCR_OFFSET ], _r
#define stx_Rccr( _r ) stx _r, [ %o0 + SIMCPU_v9CCR_OFFSET ]
#define ldx_Rpc( _r ) ldx [ %o0 + SIMCPU_PC_OFFSET ], _r
#define stx_Rpc( _r ) stx _r, [ %o0 + SIMCPU_PC_OFFSET ]
#define ldx_Rnpc( _r ) ldx [ %o0 + SIMCPU_NPC_OFFSET ], _r
#define stx_Rnpc( _r ) stx _r, [ %o0 + SIMCPU_NPC_OFFSET ]
/*
 * FP source operands; _ldt selects the load width (ld / ldd).
 * NOTE(review): %o0 is passed as both the simcpu and FP-base pointer here —
 * the FP register offsets are presumably relative to simcpu_t; confirm.
 */
#define ld_FPsrc1( _ldt, _r ) ld_fpreg( _ldt, %o0, %o1, %o0, XIC_FPREG_SRC1_OFFSET, _r, %g1 )
#define ld_FPsrc2( _ldt, _r ) ld_fpreg( _ldt, %o0, %o1, %o0, XIC_FPREG_SRC2_OFFSET, _r, %g1 )
.global decoded_impl_##_name NL\
ldx [ %o0 + SIMCPU_NPC_OFFSET ], %o1 NL\
stx %o1, [ %o0 + SIMCPU_PC_OFFSET ] NL\
stx %o2, [ %o0 + SIMCPU_NPC_OFFSET ] NL\
#define ENDINSTN /* nada */
* For executing floating point operations on SPARC ..
* .. specifically SPARC on SPARC we use the FSR for the
* cpu being emulated, but disable all floating point traps.
* Then we test whether an exception has occurred after the
* instruction execution, and update the simulated FSR
* accordingly, then finally signal a trap if the simulated
* machine actually desired one.
* the simcpu_t simulated registers are used as follows:
* v9_fsr_ctrl holds the SPARC fsr control bits ..
* for condition codes, rounding etc. The execution results
* (errors and accumulated errors) are held in the
* v9_fsr_exc pseudo register, and the trap enable bits
* (TEM) are held in the v9_fsr_tem pseudo reg.
* Note: we have to use the simcpu_t scratch64 value because we
* can't get the fsr value out of the cpu without using a
* store instruction. This probably should be per exec_thread, but
* it's harder to get at than per simcpu in here.
ldx [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o4 NL\
sllx %o3, 30, %o3 /* FCC[321] + RD mask */ NL\
or %o3, (3 << 10), %o3 /* | FCC0 mask */ NL\
stx %o4, [ %o0 + SIMCPU_SCRATCH64_OFFSET ] NL\
ldx [ %o0 + SIMCPU_SCRATCH64_OFFSET ], %fsr NL\
ldx [ %o0 + SIMCPU_v9GSR_CTRL_OFFSET ], %o4 NL\
#ifdef FP_DECODE_DISABLED
#define FPOP_fpu_on_check
#else /* FP_DECODE_DISABLED */
#define FPOP_fpu_on_check \
ldx [ %o0 + SIMCPU_SPECIFICP_OFFSET ], %o4 NL\
ld [ %o4 + SPARCV9_FPU_ON_OFFSET ], %o4 NL\
brz %o4, sparcv9_deliver_fp_disabled_exception NL\
#endif /* FP_DECODE_DISABLED */
#define FPOP_cmp( _ldt, _fpop, _fcc ) \
ld_FPsrc1( _ldt, %f0 ) NL\
ld_FPsrc2( _ldt, %f4 ) NL\
_fpop %_fcc, %f0, %f4 NL\
/* FPOP_save_fcc assumes FPOP_cleanup stored %fsr in scratch */
ldx [ %o0 + SIMCPU_SCRATCH64_OFFSET ], %o4 NL\
sllx %o3, 32, %o3 /* FCC[321] mask */ NL\
or %o3, (3 << 10), %o3 /* | FCC0 mask */ NL\
ldx [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o2 NL\
stx %o2, [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ]
* Since we're running as a user process, we're not
* going to see anything here other than ieee754 exceptions
* But these have to be handled carefully, since the simulated FSR
* configuration may require that a proper exception is generated
* annoyingly we have to save the fsr somewhere in order to get access to
* the execution results - we use a per-cpu scratch area so we avoid MT conflicts
* So retrieve the FSR, stash it back into ctrl sans error bits
* (tem bits should still be zero).
* Then, look for errors from the last executed instruction .. if none, then
* do nothing. If some then accumulate or generate a trap as necessary.
/* Must not modify %o0 or %o1 */
stx %fsr, [ %o0 + SIMCPU_SCRATCH64_OFFSET ] NL\
ldx [ %o0 + SIMCPU_SCRATCH64_OFFSET ], %o4 NL\
/* must clear cexec field if no exceptions */ NL\
ldx [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ], %o3 NL\
and %o3, 0x1f<<5, %o3 NL\
stx %o3, [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ] NL\
bne,a,pt %xcc, sparcv9_fsr_exception_update NL\
/* fall through to the update part of the instruction */
* Hand off routine for floating point closure
* If any IEEE exception occurred, we need now to check and see if the simulated
* FSR required a trap to be generated, or the error to be accumulated.
* NOTE: error is not accumulated if a trap is to be delivered.
sparcv9_fsr_exception_update:
ldx [ %o0 + SIMCPU_v9FSR_TEM_OFFSET ], %o3
ldx [ %o0 + SIMCPU_v9FSR_CTRL_OFFSET ], %o4
srlx %o4, 14, %o4 /* FTT field - no trap if non-zero */
/* OK build the EXC group ... */
/* clear the accumulation if trap to be delivered */
ldx [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ], %o4
/* build error and accum bits */
/* mask out previous accum bits */
/* combine new error and old accum bits */
/* update the execution FSR state */
stx %o2, [ %o0 + SIMCPU_v9FSR_EXC_OFFSET ]
/* now that the status is updated, branch into the
* C function to deliver the IEEE trap if appropriate
bne,pn %xcc, sparcv9_deliver_ieee_exception
jmp %o5 + 4 /* finish instruction */
* implemented in assembly language to improve performance on
* This file is for a sparcv9 host.
/* Args are: %o0 = simcpu_t*, %o1 = xicache_instn_t * */
* SPARC v9 add and subtract instructions
IMPL( sparcv9_add_co_imm )
IMPL( sparcv9_add_co_rrr )
IMPL( sparcv9_add_co_imm_rd0 )
IMPL( sparcv9_add_co_rrr_rd0 )
IMPL( sparcv9_add_ci_imm )
IMPL( sparcv9_add_ci_rrr )
IMPL( sparcv9_add_cico_imm )
IMPL( sparcv9_add_cico_rrr )
IMPL( sparcv9_add_cico_imm_rd0 )
ba internal_add_cico_rd0;
IMPL( sparcv9_add_cico_rrr_rd0 )
IMPL( sparcv9_sub_co_imm )
IMPL( sparcv9_sub_co_rrr )
IMPL( sparcv9_sub_co_imm_rd0 )
IMPL( sparcv9_sub_co_rrr_rd0 )
IMPL( sparcv9_sub_ci_imm )
IMPL( sparcv9_sub_ci_rrr )
IMPL( sparcv9_sub_cico_imm )
IMPL( sparcv9_sub_cico_rrr )
IMPL( sparcv9_sub_cico_imm_rd0 )
ba internal_sub_cico_rd0;
IMPL( sparcv9_sub_cico_rrr_rd0 )
* Logic CC instructions ...
IMPL( sparcv9_and_cc_imm )
IMPL( sparcv9_and_cc_rrr )
IMPL( sparcv9_and_cc_imm_rd0 )
IMPL( sparcv9_and_cc_rrr_rd0 )
/* sparcv9_andn_cc_imm - synthesized by inverting imm for andcc */
IMPL( sparcv9_andn_cc_rrr )
/* sparcv9_andn_cc_imm_rd0 - synthesized by inverting imm for andcc */
IMPL( sparcv9_andn_cc_rrr_rd0 )
IMPL( sparcv9_or_cc_imm )
IMPL( sparcv9_or_cc_rrr )
IMPL( sparcv9_or_cc_imm_rd0 )
IMPL( sparcv9_or_cc_rrr_rd0 )
/* sparcv9_orn_cc_imm - synth by inverting imm field for orcc */
IMPL( sparcv9_orn_cc_rrr )
/* sparcv9_orn_cc_imm_rd0 - synth by inverting imm field for orcc */
IMPL( sparcv9_orn_cc_rrr_rd0 )
IMPL( sparcv9_xor_cc_imm )
IMPL( sparcv9_xor_cc_rrr )
IMPL( sparcv9_xor_cc_imm_rd0 )
IMPL( sparcv9_xor_cc_rrr_rd0 )
/* sparcv9_xnor_cc_imm - can synth by ~imm using orcc imm */
IMPL( sparcv9_xnor_cc_rrr )
/* sparcv9_xnor_cc_imm_rd0 - can synth by ~imm using xorcc imm */
IMPL( sparcv9_xnor_cc_rrr_rd0 )
* Branch instructions change the value of npc
* Could encode a mask into the xi immediate, but that would
* be slow to extract, and leave us with a less than useful
* always executed - IF a branch IS taken
* annulled - if a bit set and branch IS NOT taken
/* SPARC branches are bloody awful - delay slots plus multiple
* ... instn @X executes instn @Y, but then X+4 in DS of X causes branch to Z
pc = oldnpc+4 pc = oldnpc
npc = oldnpc + 8; npc = target
#define BRANCH( _opc, _cc ) \
add %o3, %o5, %o5 /* branch target */ NL\
add %o4, 4, %o4 /* npc + 4 */ NL\
mov##_opc _cc, %o5, %o4 /* overwrite npc if branch taken */ NL\
#define BRANCH_an( _opc, _cc ) \
add %o3, %o2, %o3 /* branch target */ NL\
add %o4, 4, %o5 /* oldnpc + 4 */ NL\
add %o4, 8, %g1 /* oldnpc + 8 */ NL\
mov##_opc _cc, %o4, %o5 /* overwrite pc if branch taken */ NL\
mov##_opc _cc, %o3, %g1 /* overwrite npc if branch taken */ NL\
stx_Rpc (%o5) /* no annul ds not squashed */ NL\
/* There has to be a better way than to enunciate every instruction form !! */
* Annulled delay slot versions !!
IMPL( sparcv9_bne_icc_an )
IMPL( sparcv9_be_icc_an )
IMPL( sparcv9_bg_icc_an )
IMPL( sparcv9_ble_icc_an )
IMPL( sparcv9_bge_icc_an )
IMPL( sparcv9_bl_icc_an )
IMPL( sparcv9_bgu_icc_an )
IMPL( sparcv9_bleu_icc_an )
IMPL( sparcv9_bcc_icc_an )
IMPL( sparcv9_bcs_icc_an )
IMPL( sparcv9_bpos_icc_an )
IMPL( sparcv9_bneg_icc_an )
IMPL( sparcv9_bvc_icc_an )
IMPL( sparcv9_bvs_icc_an )
IMPL( sparcv9_bne_xcc_an )
IMPL( sparcv9_be_xcc_an )
IMPL( sparcv9_bg_xcc_an )
IMPL( sparcv9_ble_xcc_an )
IMPL( sparcv9_bge_xcc_an )
IMPL( sparcv9_bl_xcc_an )
IMPL( sparcv9_bgu_xcc_an )
IMPL( sparcv9_bleu_xcc_an )
IMPL( sparcv9_bcc_xcc_an )
IMPL( sparcv9_bcs_xcc_an )
IMPL( sparcv9_bpos_xcc_an )
IMPL( sparcv9_bneg_xcc_an )
IMPL( sparcv9_bvc_xcc_an )
IMPL( sparcv9_bvs_xcc_an )
* versions for the branch on register value operations
pc = oldnpc+4 pc = oldnpc
npc = oldnpc + 8; npc = target
stx_Rpc (%o4) /* pc = npc */ NL\
add %o3, %o5, %o5 /* branch target */ NL\
add %o4, 4, %o4 /* npc + 4 */ NL\
movr##_opc %o2, %o5, %o4 /* overwrite npc if branch taken */ NL\
#define BRANCH_an( _opc ) \
add %o4, 4, %o5 /* oldnpc + 4 */ NL\
movr##_opc %o2, %o4, %o5 /* overwrite pc if branch taken */ NL\
stx_Rpc (%o5) /* no annul ds not squashed */ NL\
add %o3, %o5, %o3 /* branch target */ NL\
add %o4, 8, %o5 /* oldnpc + 8 */ NL\
movr##_opc %o2, %o3, %o5 /* overwrite npc if branch taken */ NL\
* SPARC floating point compares
IMPL( sparcv9_fcmps_fcc0 )
FPOP_cmp( ld, fcmps, fcc0 )
IMPL( sparcv9_fcmps_fcc1 )
FPOP_cmp( ld, fcmps, fcc1 )
IMPL( sparcv9_fcmps_fcc2 )
FPOP_cmp( ld, fcmps, fcc2 )
IMPL( sparcv9_fcmps_fcc3 )
FPOP_cmp( ld, fcmps, fcc3 )
IMPL( sparcv9_fcmpd_fcc0 )
FPOP_cmp( ldd, fcmpd, fcc0 )
IMPL( sparcv9_fcmpd_fcc1 )
FPOP_cmp( ldd, fcmpd, fcc1 )
IMPL( sparcv9_fcmpd_fcc2 )
FPOP_cmp( ldd, fcmpd, fcc2 )
IMPL( sparcv9_fcmpd_fcc3 )
FPOP_cmp( ldd, fcmpd, fcc3 )
IMPL( sparcv9_fcmpes_fcc0 )
FPOP_cmp( ld, fcmpes, fcc0 )
IMPL( sparcv9_fcmpes_fcc1 )
FPOP_cmp( ld, fcmpes, fcc1 )
IMPL( sparcv9_fcmpes_fcc2 )
FPOP_cmp( ld, fcmpes, fcc2 )
IMPL( sparcv9_fcmpes_fcc3 )
FPOP_cmp( ld, fcmpes, fcc3 )
IMPL( sparcv9_fcmped_fcc0 )
FPOP_cmp( ldd, fcmped, fcc0 )
IMPL( sparcv9_fcmped_fcc1 )
FPOP_cmp( ldd, fcmped, fcc1 )
IMPL( sparcv9_fcmped_fcc2 )
FPOP_cmp( ldd, fcmped, fcc2 )
IMPL( sparcv9_fcmped_fcc3 )
FPOP_cmp( ldd, fcmped, fcc3 )