Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / sam / cpus / vonk / ss / lib / cpu / src / SS_Ld128Atomic.s
/*
* ========== Copyright Header Begin ==========================================
*
* OpenSPARC T2 Processor File: SS_Ld128Atomic.s
* Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
*
* The above named program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* The above named program is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*
* ========== Copyright Header End ============================================
*/
#if defined(ARCH_X64)
.text
	.align	16
/*
 * void ss_ld128atomic( SS_Paddr addr, uint64_t data[2] );
 *
 * ABI:   SysV AMD64
 * In:    rdi = source address (must be 16-byte aligned: movdqa faults otherwise;
 *              the simulated quad-load requires 16-byte alignment anyway)
 * Out:   data[0] = byte-swapped quadword from addr+0 (simulated big-endian high)
 *        data[1] = byte-swapped quadword from addr+8 (simulated big-endian low)
 * Clobb: r8, xmm0, xmm1, flags
 *
 * A single 128-bit aligned SSE load gives us both halves in one access; each
 * half is then bswap'd to convert the little-endian host quadwords into the
 * big-endian byte order the simulated SPARC memory image uses.
 */
	.globl	ss_ld128atomic
	.type	ss_ld128atomic, @function
ss_ld128atomic:
	movdqa	(%rdi),%xmm0		/* one 128-bit load of both halves */
	movhlps	%xmm0,%xmm1		/* xmm1[63:0] = upper quadword (addr+8) */
	movq	%xmm0,%r8		/* lower quadword (addr+0) -> r8 */
	bswapq	%r8			/* host little-endian -> target big-endian */
	movq	%r8,(%rsi)		/* data[0] = high half */
	movq	%xmm1,%r8		/* upper quadword -> r8 */
	bswapq	%r8
	movq	%r8,8(%rsi)		/* data[1] = low half */
	ret
	.size	ss_ld128atomic, .-ss_ld128atomic
#else
.section ".text"
/*============================================================================*\
* void ss_ld128atomic( SS_Paddr addr, uint64_t data[2] );
*----------------------------------------------------------------------------
 * This routine emulates the quad load atomic instruction. It should only be
* used with a memory implementation that uses the native sparc instructions,
* including the atomics, to implement simulated memory. This algorithm allows
 * us to implement memory without locks around quad loads and stores.
*
* The abstract algorithm is
*
* do
* ld ref @ ofs + 0
* ld low @ ofs + 8
* ld cmp @ ofs + 0
* while ref != cmp
*
* The ref and cmp load from the same address. These two loads form a time
* window. In this time window we check that the memory location @ ofs + 0
* did not change: ref == cmp. If the value did not change then the low value,
* which must be loaded in the time window, is a correct lower part of the quad.
 * The ref (or cmp) value is the correct upper part of the quad load. Now if the
* value @ ofs + 0 did change then we try again.
*
 * For RMO we insert membars between the loads to guarantee ordering of the
* loads.
\*============================================================================*/
! void ss_ld128atomic( SS_Paddr addr, uint64_t data[2] )
!   %o0 = source address, %o1 = destination buffer.
! Lock-free quad load: re-read the high word and retry until it is unchanged,
! which proves the low word was read inside a store-free window (see the
! algorithm description above).
.global ss_ld128atomic
.type ss_ld128atomic, #function
ss_ld128atomic:
ldx [%o0 + 0],%o2 ! load reference high
1:
membar #LoadLoad ! RMO: order reference-high load before low load
ldx [%o0 + 8],%o3 ! load low inside the observation window
membar #LoadLoad ! RMO: order low load before compare-high load
ldx [%o0 + 0],%o4 ! load compare high (closes the window)
cmp %o2,%o4 ! equal => no store hit the high word in the window
bne,a %xcc,1b ! changed => a store intervened, retry
ldx [%o0 + 0],%o2 ! annulled delay slot: only on retry, refresh reference
stx %o2,[%o1 + 0] ! store high
retl
stx %o3,[%o1 + 8] ! delay slot of retl: store low
#endif