\ ========== Copyright Header Begin ==========================================
\ 
\ Hypervisor Software File: move.fth
\ 
\ Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
\ 
\  - Do no alter or remove copyright notices
\ 
\  - Redistribution and use of this software in source and binary forms, with 
\    or without modification, are permitted provided that the following 
\    conditions are met: 
\ 
\  - Redistribution of source code must retain the above copyright notice, 
\    this list of conditions and the following disclaimer.
\ 
\  - Redistribution in binary form must reproduce the above copyright notice,
\    this list of conditions and the following disclaimer in the
\    documentation and/or other materials provided with the distribution. 
\ 
\    Neither the name of Sun Microsystems, Inc. or the names of contributors 
\ may be used to endorse or promote products derived from this software 
\ without specific prior written permission. 
\ 
\     This software is provided "AS IS," without a warranty of any kind. 
\ ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, 
\ INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A 
\ PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE HEREBY EXCLUDED. SUN 
\ MICROSYSTEMS, INC. ("SUN") AND ITS LICENSORS SHALL NOT BE LIABLE FOR 
\ ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR 
\ DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. IN NO EVENT WILL SUN 
\ OR ITS LICENSORS BE LIABLE FOR ANY LOST REVENUE, PROFIT OR DATA, OR 
\ FOR DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL OR PUNITIVE 
\ DAMAGES, HOWEVER CAUSED AND REGARDLESS OF THE THEORY OF LIABILITY, 
\ ARISING OUT OF THE USE OF OR INABILITY TO USE THIS SOFTWARE, EVEN IF 
\ SUN HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
\ 
\ You acknowledge that this software is not designed, licensed or
\ intended for use in the design, construction, operation or maintenance of
\ any nuclear facility. 
\ 
\ ========== Copyright Header End ============================================
\ move.fth 2.6 93/10/20
\ Copyright 1985-1990 Bradley Forthware

\ Mike Saari's blazing `move' ...
\ This implements the MOVE instruction.  It is optimized
\ for speed, particularly when longword stores may be used.

\ (move)  ( src dst cnt -- )
\ Copy cnt bytes from address src to address dst and drop all three
\ arguments.
\
\ Direction: if src > dst the copy proceeds from low addresses to high,
\ otherwise from high addresses to low, so that an overlapping destination
\ does not overwrite source bytes that have not been read yet.
\ NOTE(review): the `>` condition used for this test looks like a signed
\ comparison of two addresses - confirm the chosen direction is correct
\ when src and dst lie on opposite sides of the sign boundary.
\
\ Strategy (both directions):
\   1. If cnt < h# 10, or src and dst differ in bit 0, only the byte loop
\      is used.
\   2. Otherwise single byte/halfword/longword transfers are done until
\      the source (its start for low-to-high, its end for high-to-low) is
\      aligned for the widest unit that the low bits of (src xor dst) allow.
\   3. A halfword, longword or doublelong loop moves the bulk.
\   4. The byte loop moves whatever is left.
\
\ Lines tagged "(delay)" sit in the delay slot of the preceding branch
\ (if / while / repeat / else).  The code relies on these being harmless
\ or required on both outcomes of the branch.
\ Lines prefixed 32\ and 64\ are alternative encodings of the same step;
\ presumably exactly one of each pair is compiled, depending on the cell
\ size of the target - TODO confirm against the build's definitions.
code (move)  ( src dst cnt -- )
			\ tos = Count
   sp 1 /n*  scr   nget	\ scr = Src address
   sp 0 /n*  sc1   nget	\ sc1 = Dst address
			\ sc2 = Temp. data being transferred
			\ sc3 = src xor dst, low bits=0 indicates compatible
			\ sc4 = Working src in loops
			\       (also temp last+1 address)
			\ sc5 = Working dst in loops
			\ sc6 = Loop index

   \ sc3 is computed in the delay slot below and is read by the
   \ high-to-low path as well, so it must be valid on both branch outcomes.
   scr sc1  %g0  subcc	\ Src > dst?
   > if			\ Then copy low-to-high
      scr sc1  sc3  xor		\ (delay) sc3 low bits=0 indicates compatible

      tos h# 10   %g0  subcc	\ Enough bytes to bother optimizing?
      >= if			\ Otherwise, just skip to byte move
         sc3 1    %g0  andcc	\ (delay) =0 if at least shortword aligned
         0= if			\ Otherwise, just skip to byte move

            scr 1   %g0  andcc		\ (delay) Not on halfword boundary?
            0<> if		\ Ensure halfword alignment (lower)
               scr 0  sc2  ldub		\ (delay) Load bottom byte
               sc2  sc1 0  stb		\ Store byte 
               scr 1  scr  add		\ Advance by one byte
               sc1 1  sc1  add		\  "
               tos 1  tos  sub		\ Decrement count 
            then

            sc3 2   %g0  andcc		\ =0 if at least longword aligned
            0= if		\ Otherwise, skip to halfword case

               scr 2  %g0  andcc	\ (delay) Not on longword boundary?
               0<> if		\ Ensure longword alignment (lower)
                  scr 0  sc2  lduh	\ (delay) Load bottom halfword
                  sc2  sc1 0  sth	\ Store halfword
                  scr 2  scr  add	\ Advance by one halfword
                  sc1 2  sc1  add	\  "
                  tos 2  tos  sub       \ Decrement count
               then

               sc3 4  %g0  andcc	\ =0 if doublelong aligned
               0= if		\ Otherwise, skip to longword case

                  scr 4  %g0  andcc	\ (delay) Not on doublelong boundary?
                  0<> if	\ Ensure doublelong alignment (lower)
                     scr 0  sc2  ld	\ (delay) Load bottom longword
                     sc2  sc1 0  st	\ Store longword
                     scr 4  scr  add	\ Advance by one longword
                     sc1 4  sc1  add	\  "
                     tos 4  tos  sub	\ Decrement count
                  then
				\ Doublelong Copy Loop (low-to-high)
                  \ scr and sc1 are moved past the block up front; the loop
                  \ then runs a negative index up towards zero.  The working
                  \ pointers are biased down by one element because the index
                  \ is incremented (in the while delay slot) before the load
                  \ and store use it.
                  tos 7  sc6  andn	\ Index w/ even multiples of 8
                  scr sc6  scr  add	\ src = src+index
                  scr   8  sc4  sub 	\ Working src = src+index-8 
                  sc1 sc6  sc1  add	\ dst = dst+index
                  sc1   8  sc5  sub	\ Working dst = dst+index-8
                  %g0 sc6  sc6  subcc	\ Negate index
                  begin	
                  < while
                     sc6 8     sc6  addcc	\ (delay) Increment index
32\                  sc4 sc6   sc2  ldd		\ Load doublelong
64\                  sc4 sc6   sc2  ldx		\ Load 64-bit
                  repeat
32\                  sc2   sc5 sc6  std		\ (delay) Store doublelong
64\                  sc2   sc5 sc6  stx		\ (delay) Store 64-bit

                  tos 7   tos   and	\ At end, adjust cnt for few remaining

               else		\ Longword Copy Loop (low-to-high)
                  nop			\ (delay)
                  tos 3  sc6  andn	\ Index w/ even multiples of 4
                  scr sc6  scr  add	\ src = src+index
                  scr   4  sc4  sub	\ Working src = src+index-4 
                  sc1 sc6  sc1  add	\ dst = dst+index
                  sc1   4  sc5  sub	\ Working dst = dst+index-4
                  %g0 sc6  sc6  subcc	\ Negate index
                  begin
                  < while
                     sc6 4     sc6  addcc	\ (delay) Increment index
                     sc4 sc6   sc2  ld		\ Load longword
                  repeat
                     sc2   sc5 sc6  st		\ (delay) Store longword

                  tos 3   tos   and	\ At end, adjust cnt for few remaining
               then

            else 		\ Halfword Copy Loop (low-to-high)
               nop			\ (delay)
               tos 1  sc6  andn		\ Index w/ even multiples of 2
               scr sc6  scr  add	\ src = src+index
               scr   2  sc4  sub	\ Working src = src+index-2 
               sc1 sc6  sc1  add	\ dst = dst+index
               sc1   2  sc5  sub	\ Working dst = dst+index-2
               %g0 sc6  sc6  subcc	\ Negate index
               begin
               < while
                  sc6 2     sc6  addcc		\ (delay) Increment index
                  sc4 sc6   sc2  lduh		\ Load halfword
               repeat
                  sc2   sc5 sc6  sth		\ (delay) Store halfword

               tos 1   tos   and	\ At end, adjust cnt for few remaining
            then
         then			
      then	\ Now do a normal byte move for all remaining bytes (at top)

      \ Byte Copy Loop (low-to-high)
				\ (tos = number of bytes still to copy;
				\  scr/sc1 point at the first of them)
      scr tos  scr  add
      scr   1  sc4  sub		\ Working src = src+cnt-1 
      sc1 tos  sc1  add
      sc1   1  sc5  sub		\ Working dst = dst+cnt-1 
      %g0 tos  sc6  subcc	\ Negate index
      begin
      < while
         sc6 1     sc6  addcc	\ (delay) Increment index
         sc4 sc6   sc2  ldub	\ Load byte
      repeat
         sc2   sc5 sc6  stb	\ (delay) Store byte

   else  		\ Copy high-to-low case
      \ In this half scr and sc1 stay at the base of the regions; tos is
      \ both the remaining count and the offset of the last+1 byte, so
      \ stores are addressed as dst+tos after tos has been decremented.
      nop			\ (delay)
      tos h# 10   %g0  subcc	\ Enough bytes to bother optimizing?
      >= if			\ Otherwise, just skip to byte move
         sc3 1    %g0  andcc	\ (delay) =0 if at least shortword aligned
         0= if			\ Otherwise, just skip to byte move

            scr tos  sc4  add		\ (delay) Calculate last+1 address

            sc4 1    %g0  andcc		\ Not on halfword boundary? (at top)
            0<> if		\ Ensure halfword alignment (at top)
               sc4 -1   sc2  ldub	\ (delay) Load top byte
               tos 1    tos  sub	\ Decrement count 
               sc2  sc1 tos  stb	\ Store byte 
               sc4 1    sc4  sub	\ Recalculate last+1 address
            then

            sc3 2   %g0  andcc		\ =0 if at least longword aligned
            0= if		\ Otherwise, skip to halfword case

               sc4 2    %g0  andcc	\ (delay) Not on longword boundary? (at top)
               0<> if		\ Ensure longword alignment (at top)
                  sc4 -2   sc2  lduh	\ (delay) Load top halfword
                  tos 2    tos  sub	\ Decrement count
                  sc2  sc1 tos  sth	\ Store halfword
                  sc4 2    sc4  sub	\ Recalculate last+1 address
               then

               sc3 4  %g0  andcc	\ =0 if doublelong aligned
               0= if		\ Otherwise, skip to longword case

                  sc4 4  %g0  andcc	\ (delay) Not on doublelong boundary? (top)
                  0<> if	\ Ensure doublelong alignment (at top)
                     sc4 -4   sc2  ld	\ (delay) Load top longword
                     tos 4    tos  sub	\ Decrement count
                     sc2  sc1 tos  st	\ Store longword
                     \ sc4 is not recalculated here: it is reloaded as the
                     \ working src immediately below.
                  then
				\ Doublelong Copy Loop (high-to-low)
                  \ The index starts at cnt-8 and steps down.  The working
                  \ pointers are biased up by one element because the index
                  \ is decremented (in the while delay slot) before the load
                  \ and store use it.
                  scr 8   sc4   add	\ Working src = src+8
                  sc1 8   sc5   add	\ Working dst = dst+8
                  tos 8   sc6   subcc	\ Loop index = cnt-8
                  begin
                  >= while
                     sc6 8     sc6  subcc	\ (delay) Decrement index
32\                  sc4 sc6   sc2  ldd		\ Load doublelong
64\                  sc4 sc6   sc2  ldx		\ Load 64-bit
                  repeat
32\                  sc2   sc5 sc6  std		\ (delay) Store doublelong
64\                  sc2   sc5 sc6  stx		\ (delay) Store 64-bit

                  tos 7   tos   and	\ At end, adjust cnt for few remaining

               else		\ Longword Copy Loop (high-to-low)
                  nop			\ (delay)
                  scr 4   sc4   add	\ Working src = src+4
                  sc1 4   sc5   add	\ Working dst = dst+4
                  tos 4   sc6   subcc	\ Loop index = cnt-4
                  begin
                  >= while
                     sc6 4     sc6  subcc	\ (delay) Decrement index
                     sc4 sc6   sc2  ld		\ Load longword
                  repeat
                     sc2   sc5 sc6  st		\ (delay) Store longword

                  tos 3   tos   and	\ At end, adjust cnt for few remaining
               then

            else		\ Halfword Copy Loop (high-to-low)
               nop			\ (delay)
               scr 2   sc4   add	\ Working src = src+2
               sc1 2   sc5   add	\ Working dst = dst+2
               tos 2   sc6   subcc	\ Loop index = cnt-2
               begin
               >= while
                  sc6 2     sc6  subcc	\ (delay) Decrement index
                  sc4 sc6   sc2  lduh	\ Load halfword
               repeat
                  sc2   sc5 sc6  sth	\ (delay) Store halfword

               tos 1   tos   and	\ At end, adjust cnt for few remaining
            then
         then	
      then	\ Now do a normal byte move for all remaining bytes (at bottom)

      \ Byte Copy Loop (high-to-low)
      \ Copies offsets tos-1 down to 0 from the region bases; tos itself
      \ serves as the loop index here.
      scr 1     sc4  add	\ Working src = src+1
      sc1 1     sc5  add	\ Working dst = dst+1
      tos 1     tos  subcc	\ Loop index = cnt-1
      begin
      >= while
         tos 1     tos  subcc	\ (delay) Decrement index
         sc4 tos   sc2  ldub	\ Load byte
      repeat
         sc2   sc5 tos  stb	\ (delay) Store byte
   then

   \ tos held cnt and the memory stack holds dst and src below it, so the
   \ new top-of-stack is the cell at offset 2 and sp moves up by 3 cells.
   sp 2 /n*  tos nget	\ Delete 3 stack items
   sp 3 /n*  sp  add	\   "
c;
\ move is a deferred word; its initial behavior is the (move) code word
\ defined in this file.
defer move
' (move) is move