obp/obp/fm/kernel/sparc/kerncode.fth

\ ========== Copyright Header Begin ==========================================
\
\ Hypervisor Software File: kerncode.fth
\
\ Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
\
\  - Do no alter or remove copyright notices
\
\  - Redistribution and use of this software in source and binary forms, with
\    or without modification, are permitted provided that the following
\    conditions are met:
\
\  - Redistribution of source code must retain the above copyright notice,
\    this list of conditions and the following disclaimer.
\
\  - Redistribution in binary form must reproduce the above copyright notice,
\    this list of conditions and the following disclaimer in the
\    documentation and/or other materials provided with the distribution.
\
\    Neither the name of Sun Microsystems, Inc. or the names of contributors
\ may be used to endorse or promote products derived from this software
\ without specific prior written permission.
\
\     This software is provided "AS IS," without a warranty of any kind.
\ ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
\ INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
\ PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE HEREBY EXCLUDED. SUN
\ MICROSYSTEMS, INC. ("SUN") AND ITS LICENSORS SHALL NOT BE LIABLE FOR
\ ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
\ DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. IN NO EVENT WILL SUN
\ OR ITS LICENSORS BE LIABLE FOR ANY LOST REVENUE, PROFIT OR DATA, OR
\ FOR DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL OR PUNITIVE
\ DAMAGES, HOWEVER CAUSED AND REGARDLESS OF THE THEORY OF LIABILITY,
\ ARISING OUT OF THE USE OF OR INABILITY TO USE THIS SOFTWARE, EVEN IF
\ SUN HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
\
\ You acknowledge that this software is not designed, licensed or
\ intended for use in the design, construction, operation or maintenance of
\ any nuclear facility.
\
\ ========== Copyright Header End ============================================
id: @(#)kerncode.fth 2.43 07/06/05 10:54:47
purpose:
copyright: Copyright 2007 Sun Microsystems, Inc.  All Rights Reserved.
copyright: Copyright 1985-1990 Bradley Forthware
copyright: Use is subject to license terms.

\ ident "@(#)kerncode.fth       2.43    07/06/05 SMI"

\ Meta compiler source for the Forth 83 kernel code words.
\ TODO:
\ separate heads.
\ Change code-field: so that when compiled into a metacompiler definition,
\ that word would return the 0-relative address.  When compiled into a
\ target definition, the word would return the absolute address.  Essentially,
\ we need to define "dolabel" very early in the kernel source.

meta
hex

\ Allocate and clear the initial user area image
\ mlabel init-user-area
setup-user-area

extend-meta-assembler

\ ---- Assembler macros that reside in the host environment
\ and assemble code for the target environment

\ Forth Virtual Machine registers

\ Note that the Forth Stack Pointer (%g7) is NOT the same register that
\ C uses for the stack pointer (%o6).  The hardware does all sorts of
\ funny things with the C stack pointer when you do save and restore
\ instructions, and when the register windows overflow.

:-h sp   %i5  ;-h  :-h base %g2  ;-h  :-h up  %g3  ;-h
:-h tos  %g4  ;-h  :-h ip   %i3  ;-h  :-h rp  %i4  ;-h

\ Scratch Registers
:-h scr %l0  ;-h  :-h sc1  %l1 ;-h  :-h sc2  %l2 ;-h  :-h sc3 %l3  ;-h
:-h sc4 %l4  ;-h  :-h sc5  %l5 ;-h  :-h sc6  %l6 ;-h  :-h sc7 %l7  ;-h

:-h spc %o7  ;-h        \ Saved Program Counter - set by the CALL instruction

\ Macros:

\ Parameter Field Address
\t32-t \dtc-t :-h apf  ( -- )  spc 8  ;-h
\t32-t \itc-t :-h apf  ( -- )  sc1 4  ;-h
\t16-t        :-h apf  ( -- )  sc1 2  ;-h

\ Put a bubble in the pipeline to patch the load interlock bug
:-h bubble  ( nop )  ;-h

32\ :-h slln  ( rs1 rs2 rd -- )  sll  ;-h
32\ :-h srln  ( rs1 rs2 rd -- )  srl  ;-h
32\ :-h sran  ( rs1 rs2 rd -- )  sra  ;-h
32\ :-h nget  ( ptr off  dst -- )  ld  ;-h
32\ :-h nput  ( src off  ptr -- )  st  ;-h

64\ :-h slln  ( rs1 rs2 rd -- )  sllx  ;-h
64\ :-h srln  ( rs1 rs2 rd -- )  srlx  ;-h
64\ :-h sran  ( rs1 rs2 rd -- )  srax  ;-h
64\ :-h nget  ( ptr off  dst -- )  ldx  ;-h
64\ :-h nput  ( src off  ptr -- )  stx  ;-h

:-h lget  ( ptr dst -- )  0  swap  ld  ;-h
:-h lput  ( src ptr -- )  0  swap  st  ;-h

:-h get   ( ptr dst -- )  0  swap  nget ;-h
:-h put   ( src ptr -- )  0  swap  nput ;-h

:-h move ( src dst -- )  %g0     -rot  add  ;-h
:-h ainc ( ptr -- )      dup /n  swap  add  ;-h
:-h adec ( ptr -- )      dup /n  swap  sub  ;-h
:-h push ( src ptr -- )  dup     adec  put  ;-h
:-h pop  ( ptr dst -- )  over   -rot  get   ainc  ;-h
:-h test ( src -- )      %g0    %g0   addcc  ;-h
:-h cmp  ( s1 s2 -- )    %g0          subcc  ;-h
\ Get a token
:-h rtget ( srca srcb dst -- )
\t16-t  dup >r lduh  r> ( dst )
\t16-t  tshift-t over sll

\t32-t  ld   bubble
\t32-t   \ We could increment a counter here to gather statistics with
\t32-t   \ no speed penalty in the 32-bit !

;-h
\ Get a branch offset
:-h bget ( src dst -- )
\t8-t   0 swap  ldsb    \ Is the limited range a problem?
\t16-t  0 swap  ldsh
32\ \t32-t  0 swap  ld
64\ \t32-t  tuck 0 swap  lduw
64\ \t32-t  0  over      sra
;-h

:-h /n* /n * ;-h

:-h 'user#  \ name  ( -- user# )
    '  ( acf-of-user-variable )  >body-t
\t32-t l@-t
\t16-t w@-t
;-h
:-h 'user  \ name  ( -- user-addressing-mode )
    meta-asm[   up 'user#   ]meta-asm
;-h
:-h 'body  \ name  ( -- variable-apf )
    '  ( acf-of-user-variable )  >body-t
;-h
:-h 'acf  \ name  ( -- variable-apf )
    '  ( acf-of-user-variable )  >body-t
;-h
:-h set  ( value  reg  -- )
    2dup sethi  swap h# 3ff land swap tuck  add
;-h

\ There are a few places in the code where moving the previous instruction
\ to the delay slot of the "next jmp" instruction won't work.  Generally
\ these are places where a control structure ends just before "next".
\ inhibit-delay assembles a nop instruction in cases where that is needed.
\ This ought to be done by the assembler, but it is hard to figure out.
:-h inhibit-delay
\t16-t  meta-asm[  nop  ]meta-asm
;-h

\ assembler macro to assemble next
:-h next
   meta-asm[
\t8-t    byte-next always branchif
\t8-t    nop            \ XXX should be  token-table sc2 sethi

\t16-t   here-t 4 - l@-t  here-t l!-t   \ Advance previous instruction
\t16-t   h# 81c0.e000  here-t 4 - l!-t  4 allot-t  \ up 0   %g0 jmpl  instr.

\t32-t   ip 0      scr  rtget
\t32-t   scr base  %g0  jmpl
\t32-t   ip /token-t     ip   add
  ]meta-asm
;-h
:-h c;    next end-code ;-h

\t16-t \itc :-h tld  ( src offset  dst -- )
\t16-t \itc    dup >r  lduh
\t16-t \itc    r@  tshift-t  r>  sll
\t16-t \itc ;-h

\ Create the code for "next" in the user area

\t16-t  compile-in-user-area

mlabel (next)   \ Shared code for next; will be copied into user area
\t16   ip 0       sc1  rtget
\t16   sc1  base  sc1  add
\t16   sc1 0      scr  rtget
\t16   scr base      %g0  jmpl
\t16   ip /token-t   ip   add
\t16-t end-code
\t16-t restore-dictionary

\itc-t d# 64 equ #user-init     \ Leaves space for the shared "next"

meta-compile


\ ---- Action code for target words classes.

\ "docode" eliminates the need to separately acf-align both the code field
\ and the body of a code definition, thus saving 12 bytes per code definition
\ in the t16s4 version.

\t16-t tshift-t 4 =  [if]
\t16-t code-field: docode
\t16-t    apf 2 + %g0 jmpl
\t16-t    nop
\t16-t end-code
\t16-t [then]

code-field: dolabel
\itc    sp         adec
\dtc  \ The label's code field contains        dolabel call   sp adec

      tos    sp    put  \ Push the apf of the variable
      apf    tos   add

\itc  tos 3  tos   add  \ Align to a longword boundary
\itc  tos 3  tos   andn
c;

code-field: docolon
\itc  rp          adec
\dtc  \ The colon definition's code field contains   docolon call   rp adec
      ip    rp    put   \ Save the ip on the return stack
      apf   ip    add   \ Reload ip with apf of colon definition
c;

code-field: docreate
\itc  sp          adec
\dtc  \ The word's code field contains        docreate call   sp adec
      tos   sp    put   \ Push the apf of the variable
      apf   tos   add
c;

\  In-dictionary variables are a leftover from the earliest FORTH
\  implementations.  They have no place in a ROMable target-system
\  and we are deprecating support for them; but Just In Case you
\  ever want to restore support for them, define the command-line
\  symbol:   in-dictionary-variables
[ifdef] in-dictionary-variables
   \  Support for in-dictionary variables, i.e., where the variable's
   \  storage location is in the dictionary rather than in user-space.
   code-field: dovariable
   \itc  sp          adec
   \dtc  \ The variable's code field contains        dovariable call   sp adec
         tos   sp    put        \ Push the apf of the variable
         apf   tos   add
   c;
      \  Hey, waidaminit!  This is the same as  docreate  just above!
      \  An in-dictionary variable could be as simple as    create 0 ,   ...
[then]

code-field: douser
\itc  sp            adec
\dtc  \ The user variable's code field contains       douser call   sp adec
      tos     sp   put
\t16  apf     scr  lduh \ Get the user number
\t32  apf     scr  ld   \ Get the user number
      bubble
      scr up  tos  add  \ Add the base address of the user area
c;

code-field: dovalue
\itc  sp           adec
\dtc  \ The value's code field contains       dovalue call   sp adec
      tos     sp   put
\t16  apf     scr  lduh \ Get the user number
\t32  apf     scr  ld   \ Get the user number
      bubble
      scr up  tos  nget \ Get the contents of the user area location
c;

\ Defers could run faster by compiling the defer offset into the instruction
\ as in    up user#  scr  ld     scr base  %g0 jmpl    nop
\ But it would be harder to compile, metacompile, decompile, and set

code-field: dodefer
\dtc  \ The user variable's code field contains  dodefer call  apf  scr  ld
\t32       scr up    scr  ld    \ Get the acf stored in that user location
\t32       bubble

\t16       apf       scr  lduh
\t16       scr up    sc1  tld   \ Get the acf stored in that user location
\t16       sc1 base  scr  rtget \ Read the token

           scr base  %g0  jmpl  \ Execute that word
\t16   sc1  base  sc1  add

   nop
end-code

code-field: doconstant
\itc  sp           adec
\dtc  \ The constant's code field contains        doconstant call   sp adec
      tos      sp    put
\dtc  apf      tos   ld         \ Get the constant's value
64\ \dtc tos 20   tos  sllx
64\ \dtc apf 4 +  scr  ld
64\ \dtc tos scr  tos  or

\itc  apf      tos   lduh       \ Get the high halfword of the constant's value
\itc  tos 10   tos   slln       \ Shift into high halfword
\itc  apf 2 +  scr   lduh       \ Get the low halfword of the constant's value
\itc  scr tos  tos   add        \ Merge the two halves
64\ \itc tos 10  tos slln
64\ \itc apf 4 + scr lduh
64\ \itc scr tos  tos add
64\ \itc tos 10  tos slln
64\ \itc apf 6 + scr lduh
64\ \itc scr tos  tos add
c;

code-field: do2constant
\itc  sp             adec
\dtc  \ The constant's code field contains        do2constant call  sp adec
      sp             adec       \ Make room on the stack
      tos      sp /n nput       \ Save the old tos on the memory stack

\dtc  apf      tos   ld         \ Get the bottom constant's value

64\ \dtc tos th 20   tos  sllx
64\ \dtc apf 4 +     scr  ld
64\ \dtc tos scr     tos  or

\dtc  tos      sp    put        \ Put it on the memory stack
\dtc  apf /n + tos   ld         \ Get the top constant's value

64\ \dtc tos th 20     tos  sllx
64\ \dtc apf /n 4 + +  scr  ld
64\ \dtc tos scr       tos  or

\itc  apf      tos   lduh       \ Get the high halfword of the bottom value
\itc  tos      sp 0  sth        \ Store on stack
\itc  apf /w +  tos  lduh       \ Get the low halfword of the bottom value
\itc  tos      sp 2  sth        \ Store on stack

\itc  apf /n + tos   lduh       \ Get the high halfword of the top value
\itc  tos 10   tos   sll        \ Shift into high halfword
\itc  apf /n /w + +  scr   lduh \ Get the low halfword of the top value
\itc  scr tos  tos   add        \ Merge the two halves
c;

code-field: dodoes
\itc  \ The child word's code field contains a pointer to the doesclause
\dtc  \ The child word's code field contains   doesclause call   apf scr add
      \ The doesclause's code field contains       dodoes call   sp  adec
      tos     sp   put
\dtc  scr     tos  move
\itc  apf     tos  add
      ip      rp   push
\dtc  apf     ip   add
\itc  spc 8   ip   add
c;

\ ---- Define the format of target code fields by creating host
\ words that will create target code fields.

:-h place-cf-t  ( action-apf -- )
         aligned-t
\dtc-t   meta-asm[  ( action-adr )  call  sp adec  ]meta-asm
\itc-t   token,-t
;-h

:-h code-cf     ( -- )
\itc-t  \t32-t  here /token-t + aligned
\itc-t  \t16-t  [ tshift-t 4 <> ]-h [if]  here /token-t + aligned  [else]  docode  [then]
\itc-t          place-cf-t  align-t
;-h

:-h colon-cf    ( -- )  ( 'body-t ) docolon    place-cf-t
\dtc-t  -4 allot-t  meta-asm[  rp adec  ]meta-asm
;-h

:-h defer-cf    ( -- )
  ( 'body-t ) dodefer    place-cf-t
\dtc-t  -4 allot-t  meta-asm[  apf scr ld  ]meta-asm
;-h

:-h label-cf    ( -- )  ( 'body-t ) dolabel    place-cf-t   align-t  ;-h
:-h constant-cf ( -- )  ( 'body-t ) doconstant place-cf-t  ;-h
:-h create-cf   ( -- )  ( 'body-t ) docreate   place-cf-t  ;-h
[ifdef] in-dictionary-variables
   :-h variable-cf ( -- )  ( 'body-t ) dovariable place-cf-t  ;-h
[then]
:-h user-cf     ( -- )  ( 'body-t ) douser     place-cf-t  ;-h
:-h value-cf    ( -- )  ( 'body-t ) dovalue    place-cf-t  ;-h
:-h startdoes   ( -- )
\dtc-t  ( 'body-t ) dodoes     place-cf-t
\itc-t   meta-asm[  dodoes call  sp adec  ]meta-asm
;-h
:-h start;code  ( -- )  ;-h
:-h vocabulary-cf ( -- )
    \ The forward reference will be resolved later by fix-vocabularies
    compile-t <vocabulary>

\dtc-t   meta-asm[  apf  scr  add        ]meta-asm  \ Address of parameter field

;-h


\ ---- Run-time words compiled by compiling words.

headerless
\ We can do better; combine the incrementing in   ip ainc   with that in next
code (lit)  ( -- n )
   tos sp push

    \t16   ip  0    scr  lduh   scr 10  scr  slln   ip 2  tos  lduh   scr tos  tos  add
64\ \t16   tos 10   tos  slln   ip   4  scr  lduh
64\ \t16   tos scr  tos  add    tos 10  tos  slln   ip 6  scr  lduh   scr tos  tos  add

32\ \t32   ip 0 tos nget
64\ \t32   ip 0  scr  lduw  scr 20  scr  sllx  ip 4  tos  lduw   scr tos  tos  add
   ip ainc
c;

code (wlit)  ( -- n )
   tos sp push
\t16   ip 0  tos  lduh   ip 2  ip  add   tos 1  tos  sub
\t32   ip tos get  ip ainc
c;

code (llit)  ( -- n )
    \t32  tos    sp   push
    \t32  ip     tos  lget
64\ \t32  tos 1  tos  sub
64\ \t32  ip  /l ip   add
32\ \t32  ip          ainc

    \t16  tos      sp   push
    \t16  ip  0    scr  lduh
    \t16  scr 10   scr  slln
    \t16  ip  2    tos  lduh
    \t16  scr tos  tos  add
64\ \t16  tos 1    tos  sub
    \t16  ip  /l   ip   add
c;

\ High level branch. The branch offset is compiled in-line.
code branch ( -- )
( 0 L: ) mloclabel bran1
   ip      scr  bget  \ branch
   ip scr  ip   add
c;

\ High level conditional branch.
code ?branch ( f -- )  \ Takes the branch if the flag is false
   tos 0  %g0   addcc
   sp     tos   get
   ( 0 B: ) bran1  0=    brif
   sp           ainc    \ Delay slot
   ip /branch  ip    add
c;

\ Run time word for loop
code (loop)  ( -- )
   rp     scr       get
   bubble
   scr 1  scr       addcc  \ increment loop index
   ( 0 B: ) bran1 vc  brif   \ branch if not done
   scr   rp         put    \ Write back the loop index (delay slot)
   rp 3 /n*    rp   add    \ done; remove loop params from stack
   ip /branch  ip   add    \ Skip the branch offset
c;

\ Run time word for +loop
code (+loop) ( increment -- )
   rp       scr     get
   bubble
   scr tos  scr     addcc  \ increment loop index
   scr   rp         put    \ Write back the loop index
   sp    tos        get
   bran1 ( 0 B: ) vc         brif   \ branch if not done
   sp               ainc   \ Delay slot
   rp 3 /n*    rp   add    \ done; remove loop params from stack
   ip /branch  ip   add    \ Skip the branch offset
c;

\ Run time word for do
code (do)  ( l i -- )
   tos  sc1      move   \ i in sc1
   sp   scr      get    \ l in scr
   sp 1 /n*      tos  nget
   sp 2 /n*      sp   add
( 1 L: ) mloclabel pd0 ( -- r: loop-end-offset l+0x8000 i-l-0x8000 )
    ip           rp   push     \ remember the do offset address
    ip /branch   ip   add     \ skip the do offset
    h# 8000.0000 sc2  sethi
64\ sc2  h# 20   sc2  sllx
    scr sc2      scr  add
    scr          rp   push
    sc1 scr      sc1  sub
    sc1          rp   push
c;
meta

\ Run time word for ?do
code (?do)  ( l i -- )
   tos       sc1   move   \ i in sc1
   sp        scr   get    \ l in scr
   sp 1 /n*  tos   nget
   sc1 scr         cmp
   ( 1 B: ) pd0  0<>       brif
   sp 2 /n*  sp    add
   ip        scr   bget \ branch
   scr ip    ip    add
c;

headers
\ Loop index for current do loop
code i  ( -- n )
   tos       sp    push
   rp        tos   get
   rp 1 /n*  scr   nget
   bubble
   tos scr   tos   add
c;

\ Loop index for next enclosing do loop
code j   ( -- n )
   tos       sp   push
   rp 3 /n*  tos  nget
   rp 4 /n*  scr  nget
   bubble
   tos scr   tos  add
c;

headerless
code (leave)  ( -- )
( 2 L: ) mloclabel pleave
   rp 2 /n*   ip   nget    \ Get the address of the ending offset
   rp 3 /n*   rp   add     \ get rid of the loop indices
   ip         scr  bget    \ branch
   ip scr     ip   add
c;

code (?leave)  ( f -- )
   tos        test
   sp   tos   get
   ( 2 B:  ) pleave 0<> brif
   sp         ainc
   inhibit-delay
c;

headers
code unloop  ( -- )  rp  3 /n*  rp  add  c;  \ Discard the loop indices

headerless
code (of)  ( selector test -- [ selector ] )
   sp  scr  pop     \ Test in tos, Selector in scr
   scr tos  cmp
   0=  if
   scr  tos  move  \ Delay slot - Copy selector to tos
      sp     tos  pop
      ip /branch  ip   add      \ Skip the branch offset
      next
   then
   ip     scr  bget
   ip scr  ip  add      \ Take the branch
c;

\ (endof) is the same as branch, and (endcase) is the same as drop,
\ but redefining them this way makes the decompiler much easier.

code (endof)  ( -- )   ip  scr  bget    ip scr  ip   add    c;
code (endcase)  ( n -- )      sp   tos  pop    c;

\ ---- Ordinary Forth words.

headers
\ Execute a Forth word given a code field address
code execute   ( acf -- )
\dtc tos     scr  move
\dtc sp      tos  get
\dtc scr 0   %g0  jmpl
\dtc sp           ainc

\itc tos       sc1  move
\itc sp        tos  get
\itc sc1 0     scr  rtget
\itc scr base  %g0  jmpl
\itc sp             ainc
end-code

assembler  ( 3 L: ) mlabel dofalse   0  tos  move  next   meta

\ Convert a character to a digit according to the current base
code digit  ( char base -- digit true | char false )
   tos  scr  move       \ base in scr
   sp   tos  get        \ char in tos
   tos ascii 0 tos  subcc       \ convert to number
   ( 3 B: ) dofalse < brif      \ Anything less than ascii 0 isn't a digit
  tos  h# 0a  cmp       \ test for >= 10
   >=  if annul         \ Try for a letter representing a digit
      tos  scr    cmp   \ Compare digit to base

      tos   ascii A ascii 0 -  cmp
      ( 3 B: ) dofalse < brif   \ bad if > '9' and < 'A'
      tos   ascii a ascii 0 -  cmp
      >=  if
         tos ascii A ascii 0 - d# 10 -   tos  sub   \ Delay
         tos ascii a ascii A -           tos  sub
      then
      tos scr cmp       \ Compare digit to base
   then
   ( 3 B: ) dofalse >= brif     \ Not a digit
   nop
   tos  sp  put         \ Replace the char on the stack with the digit
   -1   tos move        \ True to indicate success
c;

\ Copy cnt characters starting at from-addr to to-addr.  Copying is done
\ strictly from low to high addresses, so be careful of overlap between the
\ two buffers.

code cmove  ( src dst cnt -- )  \ Copy from bottom to top
   sp 1 /n*  scr   nget     \ Src into scr
   sp 0 /n*  sc1   nget     \ Dst into sc1

   scr tos  scr  add    \ Src = src+cnt (optimize for low-to-high copy)
   sc1 tos  sc1  add    \ Dst = dst+cnt
   sc1 1    sc1  sub    \ Account for the position of the addcc instruction
   %g0 tos  tos  subcc  \ Negate cnt

   <> if
      nop
      begin
         scr tos   sc2  ldub       \ (delay) Load byte
         tos 1     tos  addcc      \ (delay) Increment cnt
      >= until
         sc2   sc1 tos  stb        \ Store byte
   then

   sp 2 /n*  tos  nget    \ Delete 3 stack items
   sp 3 /n*  sp   add     \   "
c;

code cmove>  ( src dst cnt -- )  \ Copy from top to bottom
   sp 1 /n*  scr   nget      \ Src into scr
   sp 0 /n*  sc1   nget      \ Dst into sc1

   sc1 1 sc1   add      \ Account for the position of the subcc instruction

   tos 0       cmp      \ Don't do anything if the count is 0.
   <> if
      tos 1  tos  sub   \ Decrement cnt (startup loop)

      begin
         scr tos   sc2  ldub    \ (delay) Load byte
         tos 1     tos  subcc   \ (delay) Decrement cnt
      < until
         sc2   sc1 tos  stb     \ Store byte
   then

   sp 2 /n*  tos  nget    \ Delete 3 stack items
   sp 3 /n*  sp   add     \   "
c;

code and ( n1 n2 -- n3 )  sp  scr  pop   tos scr  tos  and   c;
code or  ( n1 n2 -- n3 )  sp  scr  pop   tos scr  tos  or    c;
code xor ( n1 n2 -- n3 )  sp  scr  pop   tos scr  tos  xor   c;

code <<      ( n1 cnt -- n2 )  sp  scr  pop   scr tos  tos  slln   c;
code >>      ( n1 cnt -- n2 )  sp  scr  pop   scr tos  tos  srln   c;
code >>a     ( n1 cnt -- n2 )  sp  scr  pop   scr tos  tos  sran   c;
code lshift  ( n1 cnt -- n2 )  sp  scr  pop   scr tos  tos  slln   c;
code rshift  ( n1 cnt -- n2 )  sp  scr  pop   scr tos  tos  srln   c;

code +   ( n1 n2 -- n3 )  sp  scr  pop   tos scr  tos  add   c;
code -   ( n1 n2 -- n3 )  sp  scr  pop   scr tos  tos  sub   c;

code invert  ( n1 -- n2 )     tos -1  tos  xor  c;
code negate  ( n1 -- n2 )  %g0 tos  tos  sub   c;

\  Mark the first code-definition in the dictionary;
\  we will need it later...
\  XXX  We might be able to make this  low-dictionary-adr
\  XXX  and move that from  debugm.fth  (or  debugm16.fth )
headerless
: first-code-word ( -- acf )  (') (lit) ;
headers

: abs   ( n1 -- n2 )  dup 0<  if  negate  then   ;

: min  ( n1 n2 -- n3 )  2dup  >  if  swap  then  drop  ;
: max  ( n1 n2 -- n3 )  2dup  <  if  swap  then  drop  ;
: umin ( u1 u2 -- u3 )  2dup u>  if  swap  then  drop  ;
: umax ( u1 u2 -- u3 )  2dup u<  if  swap  then  drop  ;

code up@  ( -- addr )  tos sp push   up tos move   c;
code sp@  ( -- addr )  tos sp push   sp tos move   c;
code rp@  ( -- addr )  tos sp push   rp tos move   c;
code up!  ( addr -- )  tos up move   sp tos pop    c;
code sp!  ( addr -- )  tos sp move   sp tos pop    c;
code rp!  ( addr -- )  tos rp move   sp tos pop    c;
code >r   ( n -- )     tos rp push   sp tos pop    c;
code r>   ( -- n )     tos sp push   rp tos pop    c;
code r@   ( -- n )     tos sp push   rp tos get    c;
code >user ( pfa -- addr )
\t32    tos     %g0     scr     lduw
\t16    tos     %g0     scr     lduh
        up      scr     tos     add
c;
code 2>r  ( n1 n2 -- )
   rp  /n 2*  rp     sub
   sp         scr    get
   scr        rp /n  nput
   tos        rp 0   nput
   sp  /n     tos    nget
   sp  /n 2*  sp     add
c;
code 2r>  ( -- n1 n2 )
   sp  /n 2*  sp     sub
   tos        sp /n  nput
   rp /n      tos    nget
   tos        sp 0   nput
   rp 0       tos    nget
   rp  /n 2*  rp     add
c;
code 2r@  ( -- n1 n2 )
   sp  /n 2*  sp     sub
   tos        sp /n  nput
   rp /n      tos    nget
   tos        sp 0   nput
   rp 0       tos    nget
c;

code >ip   ( n -- )     tos rp push   sp tos pop    c;
code ip>   ( -- n )     tos sp push   rp tos pop    c;
code ip@   ( -- n )     tos sp push   rp tos get    c;
: ip>token  ( ip -- token-adr )  /token -  ;

code exit ( -- )       rp ip pop  c;
code unnest ( -- )     rp ip pop  c;

code tuck  ( n1 n2 -- n2 n1 n2 )
   sp   scr    get
   bubble
   scr  sp     push
   tos  sp /n  nput
c;
code nip   ( n1 n2 -- n2 )
   sp  ainc
c;
code flip  ( w1 -- w2 )  \  byte-swap the low two bytes; clear the rest.
   tos  0ff   scr  and          \  lowest byte into scr
   scr    8   scr  slln         \  lowest byte into second byte of scr
   tos    8   tos  srln         \  second byte into lowest byte of tos
   tos  0ff   tos  and          \  clear the rest of tos
   tos scr  tos  or
c;

extend-meta-assembler
:-h leaveflag  ( condition -- )
\ macro to assemble code to leave a flag on the stack
   if
   0  tos  move   \ Delay slot
      -1 tos move
   then
   inhibit-delay
;-h

meta-compile

code 0=  ( n -- f )  tos test  0=  leaveflag c;
code 0<> ( n -- f )  tos test  0<> leaveflag c;
code 0<  ( n -- f )  tos test  0<  leaveflag c;
code 0<= ( n -- f )  tos test  <=  leaveflag c;
code 0>  ( n -- f )  tos test  >   leaveflag c;
code 0>= ( n -- f )  tos test  0>= leaveflag c;

extend-meta-assembler
:-h compare
   sp  scr  pop
   scr tos  cmp
;-h
meta-compile

code <   ( n1 n2 -- f )  compare <   leaveflag c;
code >   ( n1 n2 -- f )  compare >   leaveflag c;
code =   ( n1 n2 -- f )  compare 0=  leaveflag c;
code <>  ( n1 n2 -- f )  compare <>  leaveflag c;
code u>  ( n1 n2 -- f )  compare u>  leaveflag c;
code u<= ( n1 n2 -- f )  compare u<= leaveflag c;
code u<  ( n1 n2 -- f )  compare u<  leaveflag c;
code u>= ( n1 n2 -- f )  compare u>= leaveflag c;
code >=  ( n1 n2 -- f )  compare >=  leaveflag c;
code <=  ( n1 n2 -- f )  compare <=  leaveflag c;

code drop ( n -- )      sp   tos  pop    c;
code ?dup ( n -- 0|n,n)
   tos  %g0     %g0     subcc
   0<> if
      nop
      tos       sp      push
   then
   inhibit-delay
c;
code dup  ( n -- n n )  tos  sp   push   c;
code over ( n1 n2 -- n1 n2 n1 )  tos sp push    sp /n  tos  nget  c;
code swap ( n1 n2 -- n2 n1 )
   sp   scr  get
   tos  sp   put
   scr  tos  move
c;
code rot  ( n1 n2 n3 -- n2 n3 n1 )
   sp 0 /n*    scr    nget
   sp 1 /n*    sc1    nget
   scr    sp 1 /n*    nput
   tos    sp 0 /n*    nput
   sc1    tos    move
c;
code -rot ( n1 n2 n3 -- n3 n1 n2 )
   sp 0 /n*  scr    nget
   sp 1 /n*  sc1    nget
   tos    sp 1 /n*  nput
   sc1    sp 0 /n*  nput
   scr    tos    move
c;
code 2drop  ( d -- )      sp ainc   sp tos pop  c;
code 2dup   ( d -- d d )
   sp        scr       get
   sp 2 /n*  sp        sub
   tos       sp 1 /n*  nput
   scr       sp 0 /n*  nput
c;
code 2over  ( d1 d2 -- d1 d2 d1 )
   sp 2 /n*  sp        sub
   tos       sp 1 /n*  nput
   sp 4 /n*  tos       nget
   bubble
   tos       sp 0 /n*  nput
   sp 3 /n*  tos       nget
c;
code 2swap  ( d1 d2 -- d2 d1 )
   sp 2 /n*  sc2    nget
   sp 1 /n*  sc1    nget
   sp 0 /n*  scr    nget
   bubble
   scr    sp 2 /n*  nput
   tos    sp 1 /n*  nput
   sc2    sp 0 /n*  nput
   sc1    tos    move
c;
code 3drop  ( n1 n2 n3 -- )
   sp 2 /n*  tos   nget
   sp 3 /n*  sp    add
c;
code 3dup   ( n1 n2 n3 -- n1 n2 n3 n1 n2 n3 )
   sp 1 /n*  sc1    nget
   sp 0 /n*  scr    nget
   sp 3 /n*  sp     sub
   tos    sp 2 /n*  nput
   sc1    sp 1 /n*  nput
   scr    sp 0 /n*  nput
c;

code pick   ( nm ... n1 n0 k -- nm ... n2 n0 nk )
32\  tos 2   tos  sll    \ Multiply by /n
64\  tos 3   tos  sllx   \ Multiply by /n
     sp tos  tos  nget   \ Index into stack
c;

code 1+  ( n1 -- n2 )  tos 1  tos  add   c;
code 2+  ( n1 -- n2 )  tos 2  tos  add   c;
code 1-  ( n1 -- n2 )  tos 1  tos  sub   c;
code 2-  ( n1 -- n2 )  tos 2  tos  sub   c;

code 2/  ( n1 -- n2 )  tos 1  tos  sran   c;
code u2/ ( n1 -- n2 )  tos 1  tos  srln   c;
code 2*  ( n1 -- n2 )  tos 1  tos  slln   c;
code 4*  ( n1 -- n2 )  tos 2  tos  slln   c;
code 8*  ( n1 -- n2 )  tos 3  tos  slln   c;

code on ( addr -- )
           -1  scr    move
    \dtc  scr  tos 0  st
64\ \dtc  scr  tos 4  st
64\ \itc  scr  tos 4  sth
64\ \itc  scr  tos 6  sth
    \itc  scr  tos 0  sth
    \itc  scr  tos 2  sth
          sp  tos     pop
c;
code off ( addr -- )
    \dtc   %g0  tos 0  st
64\ \dtc   %g0  tos 4  st
64\ \itc   %g0  tos 6  sth
64\ \itc   %g0  tos 4  sth
    \itc   %g0  tos 0  sth
    \itc   %g0  tos 2  sth
            sp  tos    pop
c;

code +! ( n addr -- )
          sp  0 /n*  scr  nget
    \dtc  tos        sc1  lget

64\ \dtc  sc1 20     sc1  slln
64\ \dtc  tos /l     sc2  ld
64\ \dtc  sc1 sc2    sc1  add

    \itc  tos 0      sc1  lduh
    \itc  sc1 10     sc1  slln
    \itc  tos 2      sc2  lduh
    \itc  sc1 sc2    sc1  add

64\ \itc  tos 4      sc2  lduh
64\ \itc  sc1 10     sc1  slln
64\ \itc  sc1 sc2    sc1  add

64\ \itc  tos 6      sc2  lduh
64\ \itc  sc1 10     sc1  slln
64\ \itc  sc1 sc2    sc1  add

          sc1 scr    sc1  add

64\ \dtc  sc1 tos    /l   st
64\ \dtc  sc1 20     sc1  srln
    \dtc  sc1        tos  lput

64\ \itc  sc1 tos    6    sth
64\ \itc  sc1 10     sc1  srln
64\ \itc  sc1 tos    4    sth
64\ \itc  sc1 10     sc1  srln

    \itc  sc1 tos    2    sth
    \itc  sc1 10     sc1  srln
    \itc  sc1 tos    0    sth

          sp  1 /n*  tos  nget
          sp  2 /n*  sp   add
c;

code @   ( addr -- n )
64\ \dtc tos 0    scr  ld
64\ \dtc scr 20   scr  slln
64\ \dtc tos 4    tos  ld
64\ \dtc tos scr  tos  or

64\ \itc tos 0    sc1  lduh
64\ \itc sc1 10   scr  slln
64\ \itc tos 2    sc1  lduh
64\ \itc sc1 scr  scr  or
64\ \itc scr 10   scr  slln
64\ \itc tos 4    sc1  lduh
64\ \itc sc1 scr  scr  or
64\ \itc scr 10   scr  slln
64\ \itc tos 6    sc1  lduh
64\ \itc sc1 scr  tos  or

32\ \dtc tos 0    tos  ld

32\ \itc tos 2    scr  lduh
32\ \itc tos 0    tos  lduh
32\ \itc tos 10   tos  slln
32\ \itc scr tos  tos  add
c;

code d@  ( addr -- nlow nhigh )
   tos 0  scr ldd
   sc1    sp  push
   scr    tos move
c;

64\ code x@ ( addr -- x )       \ doubleword aligned
64\   tos tos get
64\ c;

code l@ ( addr -- l )           \ longword aligned
   tos tos lget
c;

32\ code <l@ ( addr -- l )  tos 0  tos  ld    c;
code w@ ( addr -- w )           \ 16-bit word aligned
   tos 0  tos  lduh
c;

32\ code <w@ ( addr -- w )  tos 0  tos  ldsh  c; \ with sign extension
64\ code <w@ ( addr -- w )
64\    tos 0  tos     lduh
64\    tos d# 48 tos  sllx
64\    tos d# 48 tos  srax
64\ c;
64\ code <l@ ( addr -- l )
64\    tos 0  tos  lduw
64\    tos 0  tos  sra
64\ c;

code c@ ( addr -- c )
   tos 0  tos  ldub
c;

code unaligned-@  ( addr -- l )
   tos 0  scr  ldub
   tos 1  sc1  ldub   scr 8 scr slln  scr sc1 scr add
   tos 2  sc1  ldub   scr 8 scr slln  scr sc1 scr add
   tos 3  sc1  ldub   scr 8 scr slln
64\   scr sc1 scr add
64\   tos 4  sc1  ldub   scr 8 scr slln  scr sc1 scr add
64\   tos 5  sc1  ldub   scr 8 scr slln  scr sc1 scr add
64\   tos 6  sc1  ldub   scr 8 scr slln  scr sc1 scr add
64\   tos 7  sc1  ldub   scr 8 scr slln
   scr sc1 tos add
c;
code be-l@  ( addr -- l )
   tos 0  scr  ldub
   tos 1  sc1  ldub   scr 8 scr slln  scr sc1 scr add
   tos 2  sc1  ldub   scr 8 scr slln  scr sc1 scr add
   tos 3  sc1  ldub   scr 8 scr slln  scr sc1 tos add
c;
code unaligned-l@  ( addr -- l )
   tos 0  scr  ldub
   tos 1  sc1  ldub   scr 8 scr slln  scr sc1 scr add
   tos 2  sc1  ldub   scr 8 scr slln  scr sc1 scr add
   tos 3  sc1  ldub   scr 8 scr slln  scr sc1 tos add
c;
code unaligned-w@  ( addr -- w )
   tos 0  scr  ldub
   tos 1  sc1  ldub   scr 8 scr slln  scr sc1 tos add
c;

\ 16-bit token version doesn't require alignment on a word boundary
code !   ( n addr -- )
( 4 L: ) mloclabel  start-of-!
   sp  0  scr  nget
   bubble

64\ \dtc   scr     tos /l  st
64\ \dtc   scr 20  scr     srln
    \dtc   scr     tos 0   st

64\ \itc   scr     tos 6  sth
64\ \itc   scr 10  scr    srln
64\ \itc   scr     tos 4  sth
64\ \itc   scr 10  scr    srln

    \itc   scr     tos 2  sth
    \itc   scr 10  scr    srln
    \itc   scr     tos 0  sth

   sp 1 /n*  tos  nget
   sp 2 /n*  sp   add
c;

headerless
\  These two words are sufficient to implement a very fast  IS
\  The first will be applied to USER definitions (primarily VALUEs
\  but also VARIABLEs) and the second to DEFER words.
\  Their actions are the same as the obsolete  (is)  used to be;
\  the main difference is that the determination of the word-type
\  of the target of the IS is made at compile-time rather than
\  at run-time.

code (is-user)  ( n -- )
   tos       sp   push          \  Do the  (')  in-line
   ip 0      tos  rtget                 \  Next token in caller
   tos base  tos  add                   \  TOS <= ACF-of-next-token-in-caller
   ip /token ip   add                   \  Complete the  (')
                                \  Do the  >body  in-line
   tos   0 >body-t   tos        add

        tos     %g0     scr     \  Do the  >user  in-line
\t32                            lduw
\t16                            lduh

   ( 4 B: ) start-of-!   bra                    \  Go to the !
        up      scr     tos     add     \  TOS <= user-addr of IS-target
end-code

code (is-defer)  ( acf -- )
   tos  base  scr  sub          \  Start the token!
\t16  scr tshift-t  scr      srl        \  SCR <= token to store
                                \  Do the  (')  in-line
   ip 0      tos  rtget                 \  Next token in caller
   ip /token ip   add                   \  Bump past next token in caller
   tos base  tos  add                   \  TOS <= ACF of next token
                                        \  That completed the  (')

                                \  Do the  >body  in-line
   tos   0 >body-t   tos        add

        tos     %g0     sc1     \  Do the  >user  in-line
\t32                            lduw
\t16                            lduh
        up      sc1     tos     add     \  TOS <= user-addr of IS-target

        scr    tos              \  Complete the token!
\t16                0    sth
\t32                     lput ( ???XXX tput )

        sp   tos  pop
c;


headers


code d!  ( n-low n-high addr -- )
   sp  0 /n*  scr  nget
   sp  1 /n*  sc1  nget
   bubble
   scr  tos 0  std
   sp  2 /n*  tos  nget
   sp  3 /n*  sp   add
c;
64\ code x!   ( x addr -- )
64\    sp  0  scr  nget
64\    bubble
64\    scr        tos  put
64\    sp  1 /n*  tos  nget
64\    sp  2 /n*  sp   add
64\ c;

code l!   ( n addr -- )
   sp  0  scr  nget
   bubble
   scr   tos 0 st
   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;
code w!  ( w addr -- )
   sp  0  scr  nget
   bubble
   scr   tos 0 sth
   sp 1 /n*  tos  nget
   sp 2 /n*  sp   add
c;
code c!  ( c addr -- )
   sp  0  scr  nget
   bubble
   scr   tos 0 stb
   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;

code unaligned-d!   ( d addr -- )
   sp  0  scr  nget

64\                        scr    tos 1 /n* 7 +  stb
64\   scr 8  scr    srln   scr    tos 1 /n* 6 +  stb
64\   scr 8  scr    srln   scr    tos 1 /n* 5 +  stb
64\   scr 8  scr    srln   scr    tos 1 /n* 4 +  stb
64\   scr 8  scr    srln
                           scr    tos 1 /n* 3 +  stb
      scr 8  scr    srln   scr    tos 1 /n* 2 +  stb
      scr 8  scr    srln   scr    tos 1 /n* 1 +  stb
      scr 8  scr    srln   scr    tos 1 /n* 0 +  stb

   sp  1 /n*  scr  nget

64\                        scr    tos 7  stb
64\   scr 8  scr    srln   scr    tos 6  stb
64\   scr 8  scr    srln   scr    tos 5  stb
64\   scr 8  scr    srln   scr    tos 4  stb
64\   scr 8  scr    srln
                           scr    tos 3  stb
      scr 8  scr    srln   scr    tos 2  stb
      scr 8  scr    srln   scr    tos 1  stb
      scr 8  scr    srln   scr    tos 0  stb

   sp  2 /n*  tos  nget
   sp  3 /n*  sp   add
c;
code unaligned-!   ( n addr -- )
   sp  0  scr  nget
   bubble

64\   scr    tos 7  stb
64\   scr 8  scr    srln   scr    tos 6  stb
64\   scr 8  scr    srln   scr    tos 5  stb
64\   scr 8  scr    srln   scr    tos 4  stb
64\   scr 8  scr    srln

   scr    tos 3  stb
   scr 8  scr    srln   scr    tos 2  stb
   scr 8  scr    srln   scr    tos 1  stb
   scr 8  scr    srln   scr    tos 0  stb

   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;
code be-l!   ( n addr -- )
   sp  0  scr  nget
   bubble
   scr    tos 3  stb
   scr 8  scr    srln   scr    tos 2  stb
   scr 8  scr    srln   scr    tos 1  stb
   scr 8  scr    srln   scr    tos 0  stb
   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;
\ In some versions, be-l, needs to set a swap bit
: be-l,  ( l -- )  here /l allot  be-l!  ;
code unaligned-l!   ( n addr -- )
   sp  0  scr  nget
   bubble
   scr    tos 3  stb
   scr 8  scr    srln   scr    tos 2  stb
   scr 8  scr    srln   scr    tos 1  stb
   scr 8  scr    srln   scr    tos 0  stb
   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;
code unaligned-w!   ( w addr -- )
   sp  0  scr  nget
   bubble
   scr    tos 1  stb
   scr 8  scr    srl
   scr    tos 0  stb
   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;

code 2@  ( addr -- d )
    tos /n   sc1  lduh tos /n 2 +   scr  lduh  sc1 10   sc1  slln
64\ scr sc1  sc1  add  tos /n 4 +   scr  lduh  sc1 10   sc1  slln
64\ scr sc1  sc1  add  tos /n 6 +   scr  lduh  sc1 10   sc1  slln
    scr sc1  scr  add

    scr      sp   push

    tos  0   sc1  lduh tos 2   scr  lduh  sc1 10   sc1  slln
64\ scr sc1  sc1  add  tos 4   scr  lduh  sc1 10   sc1  slln
64\ scr sc1  sc1  add  tos 6   scr  lduh  sc1 10   sc1  slln

    scr  sc1  tos  add
c;
code 2!  ( d addr -- )
    sp  0   scr    nget
    bubble

64\ scr   tos 6  sth  scr 10  scr  srln
64\ scr   tos 4  sth  scr 10  scr  srln
    scr   tos 2  sth  scr 10  scr  srln
    scr   tos 0  sth

    sp  /n  scr    nget

    bubble

64\ scr   tos /n 6 + sth  scr 10  scr  srln
64\ scr   tos /n 4 + sth  scr 10  scr  srln
    scr   tos /n 2 + sth  scr 10  scr  srln
    scr   tos /n 0 + sth

    sp  2 /n*   tos    nget
    sp  3 /n*   sp     add
c;

\  code fill ( start-addr count char -- )
\                       \ char in tos
\     sp 0 /n*  scr  nget       \ count in scr
\
\     scr %g0 %g0 subcc
\     > if
\        nop
\        sp 1 /n*  sc1  nget    \ start in sc1
\        begin
\        scr 1  scr  subcc
\           tos  sc1 scr  stb
\        0= until
\           nop
\     then
\
\     sp 2 /n*   tos  nget
\     sp 3 /n*   sp   add
\  c;

code fill  ( start-addr count char -- )
                                \ tos = data byte
   sp  0 /n*  scr   nget        \ scr = count
                                \ sc1 = addr

   scr 10  %g0   subcc
   >= if                        \ Enough to bother optimizing?
      sp 1 /n*  sc1   nget      \ ( delay)  sc1 = addr

      \ Store stray bytes at top of range
      scr sc1  sc2  add         \ Last+1 byte location in range
      sc2   3  sc3  andcc       \ Count - # extra bytes at top of range (0-3)
      scr sc3  scr  sub         \ Adjust main counter for later
      0 F:  bra                 \ Jump to the until branch
      sc2   3  sc2  andn        \ (delay) Starting adr at top (X X X 0|4)
      begin
         tos  sc2 sc3  stb      \ Store data byte
      0 L:
      0<= until
         sc3  1   sc3  subcc    \ (delay)

      \ Fill sc4-sc5 pair with repeated data bytes
      tos  ff  sc4  and         \ Mask all but desired byte
      sc4   8  sc2  sll
      sc4 sc2  sc4  or          \ sc4 = 0000abab
      sc4  10  sc2  sll
      sc4 sc2  sc4  or          \ sc4 = abababab

      \ Store bulk of data, as 32-bit words (4 bytes at a time)
      \ Guaranteed to execute at least once
      scr  4   scr  subcc       \ Pre-subtract count
      0 F:  bra                 \ Jump to the until branch
      sc1  4   sc3  add         \ (delay) Pre-add starting address
      begin
         sc4  sc3 scr  st       \ Store sc4 data (4 bytes)
      0 L:
      0< until
         scr  4   scr  subcc    \ (delay)

      scr  8  scr  add          \ Restore correct remaining count
   then

   \ Store the few remaining bytes at bottom of range
   0 F:  bra                    \ Jump to the until branch
   scr 0   %g0  subcc           \ (delay)
   begin
      tos  sc1 scr  stb         \ Store data byte
   0 L:
   0<= until
      scr  1   scr  subcc       \ (delay)

   sp  2 /n*    tos  nget       \ Remove 3 items off of stack
   sp  3 /n*    sp   add        \   "
c;

code noop ( -- )  inhibit-delay  c;

32\ code n->l ( n.unsigned -- l )  inhibit-delay  c;
64\ code n->l ( n.unsigned -- l )  tos 0 tos srl  c;
: s>d  ( n -- d )  dup 0<  ;  \ Depends on  true=-1, false=0

code wbsplit ( l -- b.low b.high )
   tos  h# ff  scr  and
   scr         sp   push
   tos  8      tos  srln
   tos  h# ff  tos  and
c;

code bwjoin ( b.low b.high -- w )
   sp         scr  pop
   scr h# ff  scr  and
   tos h# ff  tos  and
   tos  8     tos  slln
   tos  scr   tos  or
c;

code lwsplit ( l -- w.low w.high )  \ split a long into two words
   tos     scr  move
   scr 10  scr  sll
   scr 10  scr  srl
   scr     sp   push
   tos 10  tos  srl
c;
code wljoin ( w.low w.high -- l )
   sp       scr  pop
   scr 10   scr  sll   \ Throw away any high order bits in w.low
   scr 10   scr  srl
   tos 10   tos  sll
   tos scr  tos  or
c;

64\ code xlsplit ( x -- l.lo l.hi )
64\   tos 0      scr  srl  \ Clear high order 32 bits
64\   scr        sp   push
64\   tos h# 20  tos  srln
64\ c;

64\ code lxjoin ( l.lo l.hi -- x )
64\    sp         scr  pop
64\    scr 0      scr  srl  \ Clear high order 32 bits
64\    tos h# 20  tos  slln
64\    tos scr    tos  or
64\ c;

1 constant /c
2 constant /w
4 constant /l
8 constant /x

16\ /w constant /n
32\ /l constant /n
64\ /x constant /n

code ca+  ( addr index -- addr+index*/c )
   sp       scr  pop
   tos scr  tos  add
c;
code wa+  ( addr index -- addr+index*/w )
   sp       scr  pop
   tos 1    tos  sll
   tos scr  tos  add
c;
code la+  ( addr index -- addr+index*/l )
   sp       scr  pop
   tos 2    tos  sll
   tos scr  tos  add
c;
64\ code xa+  ( addr index -- addr+index*/x )
64\    sp       scr  pop
64\    tos 3    tos  slln
64\    tos scr  tos  add
64\ c;
code na+  ( addr index -- addr+index*/n )
     sp       scr  pop
16\  tos 1    tos  slln   \ Multiply by /n
32\  tos 2    tos  slln   \ Multiply by /n
64\  tos 3    tos  slln   \ Multiply by /n
     tos scr  tos  add
c;
code ta+  ( addr index -- addr+index*/t )
   sp       scr  pop
\t16   tos 1    tos  slln
\t32   tos 2    tos  slln
   tos scr  tos  add
c;

code ca1+  ( addr -- addr+/w )      tos /c  tos  add   c;
code char+ ( addr -- addr+/w )      tos /c  tos  add   c;
code wa1+  ( addr -- addr+/w )      tos /w  tos  add   c;
code la1+  ( addr -- addr+/l )      tos /l  tos  add   c;
64\ code xa1+  ( addr -- addr+/x )      tos /x  tos  add   c;
code na1+  ( addr -- addr+/n )      tos /n  tos  add   c;
code cell+ ( addr -- addr+/n )      tos /n  tos  add   c;
code ta1+  ( addr -- addr+/token )  tos /token  tos  add   c;

code /c*   ( n -- n*/c )  inhibit-delay  c;
code chars ( n -- n*/c )  inhibit-delay  c;
code /w*   ( n -- n*/w )  tos 1  tos  slln  c;
code /l*   ( n -- n*/l )  tos 2  tos  slln  c;
code /x*   ( n -- n*/x )  tos 3  tos  slln  c;
16\ code /n* ( n -- n*/n )  tos 1  tos  slln  c; \ Multiply by /n
32\ code /n* ( n -- n*/n )  tos 2  tos  slln  c; \ Multiply by /n
64\ code /n* ( n -- n*/n )  tos 3  tos  slln  c; \ Multiply by /n
16\ code cells ( n -- n*/n )  tos 1  tos  slln  c; \ Multiply by /n
32\ code cells ( n -- n*/n )  tos 2  tos  slln  c; \ Multiply by /n
64\ code cells ( n -- n*/n )  tos 3  tos  slln  c; \ Multiply by /n

code upc ( char -- upper-case-char )
   tos  ascii a  cmp
   >=  if
      tos  ascii z  cmp
      >  if  annul
         tos  ascii A ascii a -  tos  add
      then
   then
   inhibit-delay
c;
code lcc ( char -- lower-case-char )
   tos  ascii A  cmp
   >=  if
      tos  ascii Z  cmp
      >  if  annul
         tos   ascii a ascii A -   tos  add
      then
   then
   inhibit-delay
c;

\ string compare - case sensitive
code comp ( addr1 addr2 len -- -1 | 0 | 1 )
                        \ len in tos
   sp 0 /n*  scr   nget \ addr2 in scr
   sp 1 /n*  sc1   nget \ addr1 is sc1

   0 F:  bra  \ jump to the subcc instruction
   nop
   begin
      sc1 1  sc1  add
      scr 0  sc3  ldub
      scr 1  scr  add
      sc2 sc3     cmp
      <> if   nop
         <  if
            1   tos   move      \ Delay slot
            -1  tos   move
         then
         sp 2 /n*   sp  add
         next
      then

      \ branch target
      0 L:
      tos 1  tos  subcc
   0< until  annul
      sc1 0  sc2  ldub          \ Delay slot

   0         tos  move
   sp 2 /n*  sp   add
c;

\ string compare - case insensitive
code caps-comp ( addr1 addr2 len -- -1 | 0 | 1 )
                         \ len in tos
   sp 0 /n*  scr   nget  \ addr2 in scr
   sp 1 /n*  sc1   nget  \ addr1 is sc1

   0 F:  bra  \ jump to the subcc instruction
   nop
   begin
      sc1 1  sc1  add
      scr 0  sc3  ldub
      scr 1  scr  add
      sc2 ascii a cmp
      >= if
         sc2 ascii z cmp        \ Delay slot
         <= if  nop
            sc2 ascii A ascii a -  sc2  add
         then
      then
      sc3 ascii a cmp
      >= if
         sc3 ascii z cmp        \ Delay slot
         <= if  nop
            sc3 ascii A ascii a -  sc3  add
         then
      then
      sc2 sc3     cmp
      <> if   nop
         <  if
            1   tos   move      \ Delay slot
            -1  tos   move
         then
         sp 2 /n*   sp  add
         next
      then

      \ branch target
      0 L:
      tos 1  tos  subcc
   0< until  annul
      sc1 0  sc2  ldub          \ Delay slot

   0      tos  move
   sp 2 /n*  sp   add
c;

code pack  ( str-addr len to -- to )
   sp  scr  pop         \ scr is len
   sp  sc1  pop         \ sc1 is "from"; tos is "to"

   scr ff   scr  and    \ Never store more than 257 bytes

   scr  tos 0    stb    \ Place length byte

   tos 1    tos  add    \ Offset "to" by 1 to skip past the length byte

   %g0  tos scr  stb    \ Put a null byte at the end

   0 F:  bra            \ jump to the until  branch
   scr 1    scr    subcc        \ Delay slot

   begin
      sc2   tos scr   stb
      scr 1     scr   subcc
   0 L:
   0< until annul
      sc1 scr   sc2   ldub      \ Delay slot

   tos 1   tos  sub             \ Fix "to" to point to the length byte
c;

code (')  ( -- acf )
   tos       sp   push
   ip 0      tos  rtget
   ip /token ip   add
   tos base  tos  add
c;
\ Modifies caller's ip to skip over an in-line string
code skipstr ( -- addr len)
   sp 2 /n*  sp        sub
   tos       sp 1 /n*  nput
   rp  0     scr       nget    \ Get string address in scr
   bubble
   scr 0     tos       ldub  \ Get length byte in tos
   scr 1     scr       add   \ Address of data bytes
   scr       sp 0 /n*  nput  \ Put addr on stack

   \ Now we have to skip the string
   scr tos         scr   add   \ Scr now points past the last data byte
   scr #talign     scr   add   \ Round up to token boundary + null byte
   scr #talign 1-  scr   andn
   scr             rp 0  nput  \ Put the modified ip back
c;
code (")  ( -- addr len)
   sp 2 /n*  sp     sub
   tos       sp /n  nput
   ip  0     tos    ldub  \ Get length byte in tos
   ip  1     ip     add   \ Address of data bytes
   ip        sp 0   nput  \ Put addr on stack

   \ Now we have to skip the string
   ip  tos          ip   add   \ ip now points past the last data byte
   ip  #talign      ip   add  \ Round up to a token boundary, plus null byte
   ip  #talign 1-   ip   andn
c;
code count  ( addr -- addr+1 len )
   tos 1   tos  add
   tos -1  scr  ldub
   tos     sp   push
   scr     tos  move
c;

code between ( n min max -- f )
   tos          scr     move            \ max
   sp           sc2     pop             \ min
   sp           sc3     pop             \ n
   sc3  sc2     %g0     subcc
   0>=                  if
      %g0       tos     move            \ (delay)
      sc3  scr  %g0     subcc
      0>                if
         %g0 1  tos     sub             \ (delay)
         %g0    tos     move
      then
   then
   inhibit-delay
c;

code within ( n1 min max+1 -- f )
   tos          scr     move            \ max
   sp           sc2     pop             \ min
   sp           sc3     pop             \ n
   sc3  sc2     %g0     subcc
   0>=                  if
      %g0       tos     move            \ (delay)
      sc3  scr  %g0     subcc
      0<                if
         %g0    tos     move            \ (delay)
         %g0 1  tos     sub
      then
   then
   inhibit-delay
c;

code bounds ( adr len -- adr+len adr )
   tos          scr     move            \ len
   sp           sc1     pop             \ adr
   sc1  tos     sc2     add             \ adr+len
   sc2          sp      push
   sc1          tos     move
c;

code origin  ( -- addr )
   tos  sp   push
   base tos  move
c;
code origin+  ( n -- adr )
   tos base  tos  add
c;
code origin-  ( n -- adr )
   tos base  tos  sub
c;

code i-flush  ( adr -- )
   tos 0         iflush        \ This may cause a trap on MP machines
   sp  tos       pop
c;

\  : instruction!  ( bits adr -- )
\     tuck l! i-flush
\  ;
code instruction!  ( bits adr -- )
   sp     scr    get
   scr    tos 0  st
   tos 0         iflush        \ This may cause a trap on MP machines
   sp 1 /n*  tos nget
   sp 2 /n*  sp  add
c;

: instruction, ( opcode -- )
   here /l allot instruction!
;

\ ---- Support words for the incremental compiler

headerless

\  Create constants to represent the instructions that go into the
\  delay-slots of the code-fields of various definition-types.
\  We can use the assembler itself to construct the instruction.
\  This is more efficient and accurate than using literal numerics,
\  and will also be handy in determining definition-types.

\  Because  constant  is not yet properly defined, we have to use the
\  assembler to create the code-field of a  constant  definition-type.
\  This turns out to be not too bad, because we need the assembler anyway...

\  Integer value of the instruction that goes into the delay-slot
\  after the  call  in:  create  variable  user  value  constant
\  and in the  doesclause  of a defining word that uses  does>
\
\  The instruction itself:
\      Decrements the Stack Pointer.
\dtc  code dec-sp-instr
\dtc     doconstant  call
\dtc     sp  adec       \  Execute this in the delay slot
\dtc 64\   0 l,         \  High-half of longword constant for 64-bit platforms
\dtc     sp  adec       \  This is the constant!  =  8e21e00 /n or
\dtc  end-code

\itc  label dec-sp-instr   #align-t negate allot-t   \  Kind of suckey,
                                                     \  but at least it works.
\  \itc  code-field: dec-sp-instr  \  Tried this instead; it failed BIG TIME!

\itc     doconstant token,-t
\itc 64\   0 l,         \  High-half of longword constant for 64-bit platforms
\itc     sp  adec       \  This is the constant!  =  8e21e00 /n or
\itc  do-exitcode


\dtc  \  Integer value of the instruction that goes into the delay-slot
\dtc  \  after the  call  in the CF of a word defined by :  (colon).
\dtc  \
\dtc  \  The instruction itself:
\dtc  \      Decrements the Return-Stack Pointer.
\dtc  code dec-rp-instr
\dtc     doconstant call
\dtc        sp  adec    \  Execute this in the delay slot
\dtc 64\   0 l,         \  High-half of longword constant for 64-bit platforms
\dtc     rp  adec       \  This is the constant!  =  8c21a000 /n or
\dtc  end-code


\dtc  \  Integer value of the instruction that goes into the delay-slot
\dtc  \  after the  call  in the CF of a child word of a  does>  definer
\dtc  \  or in the CF of an  action:  of a word defined with  used .
\dtc  \
\dtc  \  The instruction itself:
\dtc  \      Adds 8 to the PC in %o7, yielding the PFA, which goes into scr
\dtc  code pfa>scr-instr
\dtc     doconstant call
\dtc        sp  adec    \  Execute this in the delay slot
\dtc 64\   0 l,         \  High-half of longword constant for 64-bit platforms
\dtc     apf  scr  add  \  This is the constant!  =  a003e008
\dtc  end-code


\dtc  \  Integer value of the instruction that goes into the delay-slot
\dtc  \  after the  call  in the CF of a  defer  word.
\dtc  \
\dtc  \  The instruction itself:
\dtc  \      Adds 8 to the PC in %o7, yielding the PFA, and loads the
\dtc  \      contents of that location (i.e., the first Parameter) into scr
\dtc  code param>scr-instr
\dtc     doconstant call
\dtc        sp  adec    \  Execute this in the delay slot
\dtc 64\   0 l,         \  High-half of longword constant for 64-bit platforms
\dtc     apf  scr  ld   \  This is the constant!  =  e003e008
\dtc  end-code


\  Prepare the 30-bit-wide longword-offset for a call or branch instruction
: >offset-30 ( target-addr where -- longword-offset )
   -
64\   n->l
   2 >>
;
\ Put a call instruction to target-addr at where
: put-call  ( target-addr where -- )
   tuck >offset-30              ( where longword-offset )
   4000.0000         or         ( where call-instruction )
   swap instruction!
;

\ Put a branch instruction to target-addr at where
: put-branch  ( target-addr where -- )
   tuck >offset-30              ( where longword-offset )
     3f.ffff    and             ( where branch-offset )
   3080.0000    or              ( where branch-instruction )
   swap instruction!
;

\ Replace the delay slot of the previous code field
: set-delay-slot  ( delay-instruction -- )  here /l - instruction!  ;

: place-call  ( action-adr -- )
   origin+ acf-align  here  /l 2* allot  put-call
   dec-sp-instr set-delay-slot          \  sp  adec
;

\ Place the "standard" code field, with a "sp /n  sp  sub" instruction
\ in the delay slot
: place-cf  ( action-adr -- )
\dtc  place-call
\itc  origin+ acf-align  token,
;
: code-cf  ( -- )
\dtc             acf-align
\itc \t32        here ta1+ aligned origin -
\itc \t16 [ tshift-t 4 <> ] [if]  here ta1+ aligned  origin - [else]  docode [then]
\itc             place-cf align
;
: >code  ( acf-of-code-word -- address-of-start-of-machine-code )
\itc  >body aligned
;
\dtc : code?  ( acf -- f )  \ True if the acf is for a code word
\dtc  c@ h# c0 and h# 40 <>  \ Most non-code words start with a call instr.
\dtc ;

\itc \t16 tshift-t 4 <> [if]
\itc \t16 : code? ( acf -- f )
\itc \t16    dup token@ swap  2dup 2 + = >r  4 + =  r> or
\itc \t16 ;
\itc \t16 [else]
\itc \t16 : code? ( acf -- f )
\itc \t16    token@ origin- docode =
\itc \t16 ;
\itc \t16 [then]

headers
: next  (  --- )
\    ip  0       scr  ld
\    scr base    %g0  jmpl
\    ip  /token  ip   add
\t32  e006e000 instruction,     \ ld    [%i3], %l0
\t32  81c40002 instruction,     \ jmp   %l0, %g2, %g0
\t32  [ b606e000 /token or ]
\t32     literal instruction,   \ add   %i3, /token, %i3

\     up 0  %g0 jmpl
\     nop
\t16  81c0.e000 instruction,    \ jmp  %g3, 0, %g0
\t16  8000.0000 instruction,    \ add  %g0, %g0, %g0
;

headerless

\ The "word type" is a number that distinguishes one type of
\ word from another.  This is highly implementation-dependent.

\ For the SPARC implementation, the magic number returned by
\ word-type is the offset of the action code from the origin

\itc  \  Indicate whether the given location is a call instruction
\itc  \      and, if so, return the target address
\itc  : call-placed? ( acf -- addr true | false )
\itc     dup l@ dup c000.0000 and 4000.0000 = tuck if
\itc         2 << l->n rot + swap
\itc     else
\itc        drop nip
\itc     then
\itc  ;

headers

: word-type  ( acf -- word-type )
\dtc dup l@ 2 << l->n +
\itc token@
;

headerless

: create-cf    ( -- )  docreate   place-cf  ;
[ifdef] in-dictionary-variables
   : variable-cf  ( -- )  dovariable place-cf  ;
[then]
: place-does   ( -- )  dodoes     place-call  ;
: place-;code  ( -- )  ;

\ Ip is assumed to point to (;code .  flag is true if
\ the code at ip is a does> clause as opposed to a ;code clause.
: does-ip?  ( ip -- ip' flag )
   dup token@  ['] (does>) =  ( ip flag )
   if   ta1+ acf-aligned la1+ la1+  true  else  ta1+ acf-aligned false  then
;

: put-cf  ( action-clause-addr where -- )
\dtc   tuck put-call                    ( where )
\dtc   pfa>scr-instr     swap la1+ instruction!         \  apf  scr  add
\itc   token!
;

\  used  sets the code field of the most-recently-defined word
\ so that it executes the code at action-clause-addr
: used  ( action-clause-addr -- )  lastacf put-cf  ;


\  Indicate whether the given address has the code-field of a does-clause.
\      (I.e., the call to  dodoes).
\  Leave the address, return a flag.
: does-clause? ( addr -- addr flag )
   dup la1+ l@ dec-sp-instr = if
      dup               \  Delay-slot instruction is right...
\dtc  word-type
\itc  call-placed? if
         dodoes origin+ = exit
\itc  then
   then
   false
;

\  Indicate whether given ACF is of a word that was defined with
\       does> .  If so, return the  does-cfa  under the  true.
: does-cf? ( possible-acf -- does-cfa true | false )
\dtc                    \  Possible valid child word of a  does>  definer?
\dtc  dup la1+ l@ pfa>scr-instr = if                    \  apf  scr  add
\dtc                    \  Delay-slot instruction is right...
         word-type              \  Possible address of the  does-clause
         does-clause?  ?dup nip  exit
\dtc  then
      drop false
;

headers
\ Need this to make headerless work
: colon-cf  ( -- )
   docolon place-cf
\dtc   dec-rp-instr  set-delay-slot     \ rp adec
;
headerless
: colon-cf?  ( possible-acf -- flag )
\dtc dup word-type docolon origin+ =  swap
\dtc la1+ l@  dec-rp-instr  =  and      \ rp adec
\itc      token@  ['] here token@ =
;
: user-cf  ( -- )  douser place-cf  ;
: value-cf  ( -- )  dovalue place-cf  ;
: constant-cf  ( -- )  doconstant place-cf  ;
: defer-cf  ( -- )
   dodefer place-cf
\dtc   param>scr-instr   set-delay-slot         \ apf  scr  ld
;
\  Indicate whether the word whose ACF is given
\  was defined with  defer .
: defer?  ( acf -- flag )
\dtc     dup
   word-type dodefer origin+ =
\dtc     swap la1+ l@  param>scr-instr  =  and  \ apf  scr  ld
;
: 2constant-cf  ( -- )  do2constant place-cf  ;

\t16 2 constant /branch
\t32 4 constant /branch
: branch, ( offset -- )
\t32 l,
\t16 w,
;
: branch! ( offset where -- )
\t16 w!
\t32 l!
;
: branch@ ( where -- offset )
\t16 <w@
\t32 <l@
;
\ >target depends on the way that branches are compiled
: >target  ( ip-of-branch-instruction -- target )  ta1+ dup branch@ +  ;

\ ---- More ordinary Forth words.

headers
/a constant /a
[ifexist] t8
: a@  ( adr -- adr' )  @ origin+  ;
: a!  ( adr1 adr2 -- )  swap origin- swap !  ;
[else]
code a@  ( adr -- adr' )
\t16 tos 0 tos lduh  tos tshift-t tos sll
\ XX 64\ \t32 tos /l  scr ld
\ XX 64\ \t32 tos tos lget
\ XX 64\ \t32 tos h# 20 tos sllx
\ XX 64\ \t32 tos scr tos or
\t32 tos tos lget
   tos base tos add
c;
code a!  ( adr1 adr2 -- )
   sp         scr  pop
   scr base   scr  sub
\t16   scr tshift-t    scr  srl
\t16   scr           tos 0  sth
\ XX 64\ \t32   scr     tos /l  st
\ XX 64\ \t32   scr  h# 20 scr  srlx
\t32   scr     tos 0   st
   sp         tos  pop
c;
[then]
: a,  ( adr -- )  here  /a allot  a!  ;

/token constant /token
code token@ ( addr -- cfa )
   tos 0     tos  rtget
   tos base  tos  add
c;
code token! ( cfa addr -- )
   sp        scr  get
   bubble
   scr base  scr  sub
\t16  scr tshift-t  scr      srl
\t16  scr    tos 0    sth
\t32  scr    tos      lput ( ???XXX tput )
   sp  1 /n*  tos  nget
   sp  2 /n*  sp   add
c;

: token, ( cfa -- )  here  /token allot  token!  ;

code null  ( -- token )
   tos   sp   push
   base  tos  move
c;
: !null-link   ( adr -- )  null swap link!  ;
: !null-token  ( adr -- )  null swap token!  ;
code non-null?  ( link -- false | link true )
   tos  base    cmp
   <>  if
   false scr  move       \ Delay slot

      tos  sp  push
      true scr move
   then
   scr  tos  move
c;
: get-token?     ( adr -- false | acf  true )  token@ non-null?  ;
: another-link?  ( adr -- false | link true )  link@  non-null?  ;


code body> ( pfa -- cfa )
\dtc    tos     8       tos     sub
\itc    tos     /token  tos     sub
c;
code >body ( cfa -- pfa )
\dtc    tos     8       tos     add
\itc    tos     /token  tos     add
c;
\t16 /w constant /user#
\t32 /l constant /user#

\ Move to a machine alignment boundary.
\ SPARC requires alignment on 32-bit boundaries, but we only require
\ 16-bit alignment in the 16-bit token version, using halfword memory
\ accesses to make this work.

: round-down  ( adr granularity -- adr' )  1- invert and  ;
: round-up  ( adr granularity -- adr' )  1-  tuck +  swap invert and  ;
: (align)  ( size granularity -- )
   1-  begin  dup here and  while  0 c,  repeat  drop
;
: aligned  ( adr -- adr' )  3 + -4 and  ;

code acf-aligned  ( adr -- adr' )
\t16  1 tshift-t << 1 -  scr  move
\t32  3                  scr  move
      tos scr            tos  add
      tos scr            tos  andn
c;
: acf-align  ( -- )  #acf-align (align)  here 'lastacf token!  ;

headers
: /mod  ( dividend divisor -- remainder quotient )
  \ Check if either factor is negative
    2dup               ( n1 n2 n1 n2)
    or 0< if           ( n1 n2)

        \ Both factors not non-negative do division by:
        \ Take absolute value and do unsigned division
        \ Convert to truncated signed divide by:
        \  if dividend is negative then negate the remainder
        \  if dividend and divisor have opposite signs then negate the quotient
        \ Then convert to floored signed divide by:
        \  if quotient is negative and remainder is non-zero
        \    add divisor to remainder and decrement quotient

        2dup swap abs swap abs  ( n1 n2 u1 u2)     \ Absolute values

        u/mod              ( n1 n2 urem uqout)     \ Unsigned divide
        >r >r              ( n1 n2) ( uquot urem)

        over 0< if         ( n1 n2) ( uquot urem)
            r> negate >r                   \ Negative dividend; negate remainder
        then               ( n1 n2) ( uquot trem)

        swap over          ( n2 n1 n2) ( uquot trem)
        xor 0< if          ( n2) ( uquot trem)
            r> r>
            negate         ( n2 trem tquot)  \ Opposite signs; negate quotient
           -rot            ( tquot n2 trem)
            dup 0<>  if
                +          ( tquot rem) \ Negative quotient & non-zero remainder
                swap 1-    ( rem quot)  \ add divisor to rem. & decrement  quot.
            else
                nip swap   ( rem quot)
            then
        else
            drop r> r>     ( rem quot)
        then

    else   \ Both factors non-negative

        u/mod          ( rem quot)
    then
;

: /     ( n1 n2 -- quot )   /mod  nip ;

: mod   ( n1 n2 -- rem )    /mod  drop  ;

headerless
\ SPARC version is dynamically relocated, so we don't need a bitmap
: clear-relocation-bits  ( adr len -- )  2drop  ;
headers