Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / legion / src / procs / sunsparc / libniagara / modarith.c
/*
* ========== Copyright Header Begin ==========================================
*
* OpenSPARC T2 Processor File: modarith.c
* Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
*
* The above named program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* The above named program is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*
* ========== Copyright Header End ============================================
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)modarith.c 1.8 07/02/26 SMI"
/*
* The module implements the modular arithmetic unit.
*/
/* exact cut and paste from ss_common.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h> /* memcpy/memset */
#include <strings.h>
#include <thread.h>
#include "ss_common.h"
#ifdef NIAGARA1
#include "niagara.h"
#endif
#ifdef NIAGARA2
#include "niagara2.h"
#endif
#include "modarith.h"
#include "bignum.h"
/*
 * Handlers for the arithmetic opcodes; modarith_cpu_access() calls them
 * for control-register opcodes 2 (mod_mpy), 3 (mod_reduce) and 4 (mod_exp).
 */
static mod_arith_rv_t mod_mpy(mod_arith_t *masp);
static mod_arith_rv_t mod_reduce(mod_arith_t *masp);
static mod_arith_rv_t mod_exp(mod_arith_t *masp);
/* Defined outside this file; not referenced in this part of the file. */
extern BIGNUM One;
/* Debug hook, compiled out: ignores its argument and expands to nothing. */
#define DBGX(s) do { } while (0)
/*
 * Make bignums be twice the largest possible size + 1. (Size is in
 * words determined by the BIG_CHUNK_SIZE.)
 *
 * BIG_CHUNK_SIZE is the width in bits of one bignum word; the conversion
 * helpers below refuse to compile unless it is 32.  BIGSIZE is the word
 * count handed to big_init(): 2 * 2048 / 32 + 1 = 129 words.
 */
#define BIG_CHUNK_SIZE 32
#define BIGSIZE (2 * 2048 / BIG_CHUNK_SIZE + 1)
/*
 * Extract the bit field hibit..lobit (inclusive, bit 0 is the least
 * significant bit) from val and return it right-justified as an
 * unsigned 64-bit value.
 *
 * The mask is built by shifting an all-ones value right rather than by
 * computing (1 << width) - 1, so a full 64-bit field (hibit = 63,
 * lobit = 0) never shifts by the width of the type, which is undefined.
 * Requires 0 <= lobit <= hibit <= 63.  lobit is evaluated twice, so do
 * not pass expressions with side effects.
 */
#define EXTR(val, hibit, lobit) \
	(((val) >> (lobit)) & (~0ULL >> (63 - ((hibit) - (lobit)))))
/*
* Registers
* addr RW SZ Function Comment
* ----- -- ------ -------------- --------------------------------
* 0x80 RW 64-bit ASI_MA_CONTROL_REG strand, busy, int, opcode, len +
* 0x88 RW 64-bit ASI_MA_MPA_REG pointer to MA memory region
* 0x90 RW 64-bit ASI_MA_ADDR_REG 6 8-bit offsets into MA mem reg'n
* 0x98 RW 64-bit ASI_MA_NP_REG N' (for Montgomery Mpy)
* 0xa0 R 64-bit ASI_MA_SYNC_REG load blocks until op done
*/
/*
 * Post a precise trap of the given type to a strand in the same core as
 * the calling strand.
 *
 *	sp	simcpu of the strand performing the MA access; used to find
 *		the processor and the core number.
 *	thread	strand number within that core; only the low two bits are
 *		used.
 *	type	trap type passed to the target's post_precise_trap hook.
 */
static void
send_interrupt(simcpu_t *sp, int thread, sparcv9_trap_type_t type)
{
	sparcv9_cpu_t *v9p = (sparcv9_cpu_t *)(sp->specificp);
	ss_strand_t *nsp = v9p->impl_specificp;
	ss_proc_t *npp = (ss_proc_t *)(sp->config_procp->procp);
	int core = nsp->core;
	int idx = STRANDID2IDX(npp,
	    ((core & 0x7) << 2) | (thread & 0x3));
	sparcv9_cpu_t *target_v9p = npp->strand[idx];
	/*
	 * The trap has to be posted against the target strand's own
	 * simcpu.  Taking simp from the caller's v9p would deliver it to
	 * the calling strand whenever the two differ.
	 */
	simcpu_t *target_sp = target_v9p->simp;

	target_v9p->post_precise_trap(target_sp, type);
}
/*
 * Loads from the ASI_MA_SYNC_REG (it appears as a word in the address
 * space) are tricky. A load blocks until the current MA operation
 * completes or is aborted. If it completes normally, a zero is
 * returned. If it is aborted, the target register of the load is
 * unchanged.
 */
/*
 * Service a load or store to one of the modular arithmetic (MA) unit's
 * registers on behalf of the strand behind sp.
 *
 *	sp	strand making the access; its core number selects the MA
 *		state in npp->mod_arith_p[].
 *	offset	register address: 0x80 control, 0x88 MPA, 0x90 ADDR
 *		offsets, 0x98 N', 0xa0 sync.  (It seems like offset should
 *		be tpaddr_t. XXX)
 *	op	access type; only the MA_Op_Mask bits are examined.
 *	valp	store data on input, load data on output.
 *
 * Returns MOD_ARITH_LD_COMPLETE when *valp has been filled with load
 * data, MOD_ARITH_DONE when the access finished without producing load
 * data, a MOD_ARITH_*_TRAP or MOD_ARITH_UNIMPLEMENTED code for a bad
 * access, or the result of the arithmetic operation for a store to the
 * control register.
 *
 * masp->lock is taken on entry and is always released before returning.
 * An earlier note here said that some operations are continued back in
 * ss_common.c with the mutex still held when lstmp->mtxp is set; nothing
 * in this function leaves the mutex held.
 *
 * NOTE(review): masp->busy is cleared on every exit path, including
 * accesses that did not start an operation.  Also, the store path below
 * never writes masp->strand even though loads report it in bits 11 and
 * up; confirm that it is maintained elsewhere.
 */
mod_arith_rv_t
modarith_cpu_access(simcpu_t *sp, tvaddr_t offset, maccess_t op,
    uint64_t *valp)
{
	/* bits of ASI_MA_MPA_REG that are forced to zero on a store */
	const uint64_t mpa_zeroed_bits = (1ULL << 39) | 0x7;
	int len;		/* count in words */
	int i;
	mod_arith_rv_t rv = MOD_ARITH_FATAL;
	sparcv9_cpu_t *v9p;
	ss_strand_t *nsp;
	ss_proc_t *npp;
	mod_arith_t *masp;
	domain_t *domainp;
	int bytes_moved;
	int core;

	v9p = (sparcv9_cpu_t *)(sp->specificp);
	nsp = v9p->impl_specificp;
	npp = (ss_proc_t *)(sp->config_procp->procp);
	domainp = sp->config_procp->domainp;
	core = nsp->core;
	masp = &npp->mod_arith_p[core];

	op &= MA_Op_Mask;

	pthread_mutex_lock(&masp->lock);

	switch (offset) {
	case 0x80:	/* control register (ASI_MA_CONTROL_REG) */
		switch (op) {
		case MA_Ld:
		case MA_LdSigned:
			*valp = masp->strand << 11 |
			    masp->busy << 10 |
			    masp->do_interrupt << 9 |
			    masp->op << 6 |
			    (masp->length - 1);
			rv = MOD_ARITH_LD_COMPLETE;
			break;
		case MA_St:
			if (EXTR(*valp, 63, 14)) {
				EXEC_WARNING(("modarith store to control reg: "
				    "reserved bits set"));
				rv = MOD_ARITH_ILLEGAL_INST_TRAP;
				goto cleanexit;
			}
			/*
			 * We are supposed to do an abort if the
			 * ma_unit is busy, but we are going to just
			 * issue a warning and wait.
			 */
			if (masp->busy) {
				EXEC_WARNING(("store to ASI_MA_CONTROL_REG "
				    "while mod_arith unit is busy"));
				/*
				 * Should use a cv, but that is a big
				 * pain (had to do a cond_broadcast
				 * every place busy is cleared), and
				 * we are treating this as an error
				 * case anyway.
				 */
				while (masp->busy) {
					pthread_mutex_unlock(&masp->lock);
					sleep(1);
					pthread_mutex_lock(&masp->lock);
				}
			}
			masp->busy = 1;
			masp->do_interrupt = EXTR(*valp, 9, 9);
			masp->op = EXTR(*valp, 8, 6);
			/* the register holds length - 1 */
			masp->length = EXTR(*valp, 5, 0) + 1;

			switch (masp->op) {
			case 0:	/* load MA memory */
				len = masp->length;
				if (len > MA_MEM_XWORDS) {
					/*
					 * Niagara PRM 20.3 says MA
					 * loads and stores with
					 * length_field + 1 > 160 will
					 * produce undefined results.
					 */
					EXEC_WARNING(("modarith load ma_mem: "
					    "length = %d, set to 160",
					    len));
					len = MA_MEM_XWORDS;
				}
				rv = MOD_ARITH_DONE;
				ASSERT((masp->ADDR[0] + len) <= MA_MEM_XWORDS);
				bytes_moved = ss_cpu_mem(domainp,
				    npp, v9p,
				    NA_mem_read,
				    masp->ma_data_p,	/* physaddr */
				    0,		/* already physical */
				    (unsigned char *) &masp->ma_mem[
				    masp->ADDR[0]],	/* buffer */
				    8 * len);
				if (bytes_moved != 8 * len) {
					IMPL_WARNING(("modarith: ma_load "
					    "moved %d bytes, "
					    "expected %d\n",
					    bytes_moved, 8 * len));
				}
				if (masp->do_interrupt) {
					send_interrupt(sp, masp->strand,
					    (sparcv9_trap_type_t)
					    N1_trap_modular_arithmetic);
				}
				break;
			case 1:	/* store MA memory */
				len = masp->length;
				if (len > MA_MEM_XWORDS) {
					/*
					 * Niagara PRM 20.3 says MA
					 * loads and stores with
					 * length_field + 1 > 160 will
					 * produce undefined results.
					 */
					EXEC_WARNING(("modarith store ma_mem: "
					    "length = %d, set to 160",
					    len));
					len = MA_MEM_XWORDS;
				}
				rv = MOD_ARITH_DONE;
				ASSERT((masp->ADDR[0] + len) <= MA_MEM_XWORDS);
				bytes_moved = ss_cpu_mem(domainp,
				    npp, v9p,
				    NA_mem_write,
				    masp->ma_data_p,	/* phys addr */
				    0,		/* already physical */
				    (unsigned char *) &masp->ma_mem[
				    masp->ADDR[0]],	/* buffer */
				    8 * len);		/* size */
				if (bytes_moved != 8 * len) {
					IMPL_WARNING(("modarith: ma_store "
					    "moved %d bytes, expected %d\n",
					    bytes_moved, 8 * len));
				}
				if (masp->do_interrupt) {
					send_interrupt(sp, masp->strand,
					    (sparcv9_trap_type_t)
					    N1_trap_modular_arithmetic);
				}
				break;
			case 2:	/* modular multiply */
				rv = mod_mpy(masp);
				if (masp->do_interrupt) {
					send_interrupt(sp, masp->strand,
					    (sparcv9_trap_type_t)
					    N1_trap_modular_arithmetic);
				}
				break;
			case 3:	/* modular reduction */
				rv = mod_reduce(masp);
				if (masp->do_interrupt) {
					send_interrupt(sp, masp->strand,
					    (sparcv9_trap_type_t)
					    N1_trap_modular_arithmetic);
				}
				break;
			case 4:	/* modular exponentiation */
				rv = mod_exp(masp);
				if (masp->do_interrupt) {
					send_interrupt(sp, masp->strand,
					    (sparcv9_trap_type_t)
					    N1_trap_modular_arithmetic);
				}
				break;
			default:
				EXEC_WARNING(("modarith store to control reg: "
				    "Illegal opcode %d", masp->op));
				masp->busy = 0;
				rv = MOD_ARITH_ILLEGAL_INST_TRAP;
			}
			break;
		default:
			EXEC_WARNING(("modarith: Illegal memory access type "
			    "%d", op));
			rv = MOD_ARITH_ILLEGAL_INST_TRAP;
		}
		break;

	case 0x88:	/* Address register (ASI_MA_MPA_REG) */
		switch (op) {
		case MA_Ld:
		case MA_LdSigned:
			*valp = masp->ma_data_p;
			rv = MOD_ARITH_LD_COMPLETE;
			break;
		case MA_St:
			if (*valp & (0 - (1ULL << 48))) {
				EXEC_WARNING(("modarith: attempt to set "
				    "reserved bits in ASI_MA_MPA_REG"));
				rv = MOD_ARITH_ILLEGAL_INST_TRAP;
				goto cleanexit;
			}
			/*
			 * The mask must be a bitwise OR.  A logical OR
			 * collapses it to 1, which tests and clears only
			 * bit 0.
			 */
			if (*valp & mpa_zeroed_bits) {
				EXEC_WARNING(("modarith: zeroing bits in "
				    "ASI_MA_MPA_REG"));
			}
			masp->ma_data_p = *valp & ~mpa_zeroed_bits;
			rv = MOD_ARITH_DONE;
			break;
		default:
			EXEC_WARNING(("modarith: Illegal memory access type "
			    "%d", op));
			rv = MOD_ARITH_ILLEGAL_INST_TRAP;
		}
		break;

	case 0x90:	/* offsets register (ASI_MA_ADDR_REG) */
		switch (op) {
		case MA_Ld:
		case MA_LdSigned:
			*valp = 0;
			for (i = 0; i < MA_N_ADDR; ++i) {
				/*
				 * Widen before shifting: the shift count
				 * reaches 8 * (MA_N_ADDR - 1) bits.
				 */
				*valp |= (uint64_t)masp->ADDR[i] << (8 * i);
			}
			rv = MOD_ARITH_LD_COMPLETE;
			break;
		case MA_St:
			if (*valp & (0 - (1ULL << (MA_N_ADDR*8)))) {
				EXEC_WARNING(("modarith offsets: reserved "
				    "bits set"));
				rv = MOD_ARITH_ILLEGAL_INST_TRAP;
				break;
			}
			for (i = 0; i < MA_N_ADDR; ++i) {
				masp->ADDR[i] = EXTR(*valp, 8 * i + 7, 8 * i);
			}
			rv = MOD_ARITH_DONE;
			break;
		default:
			EXEC_WARNING(("modarith: Illegal memory access type "
			    "%d", op));
			rv = MOD_ARITH_ILLEGAL_INST_TRAP;
		}
		break;

	case 0x98:	/* N' register (ASI_MA_NP_REG)---Montgomery mpy, exp, etc */
		switch (op) {
		case MA_Ld:
		case MA_LdSigned:
			*valp = masp->n_prime;
			/*
			 * Report load data like the other register loads
			 * do; MOD_ARITH_DONE is what the sync register
			 * uses when the target register is to be left
			 * unchanged.
			 */
			rv = MOD_ARITH_LD_COMPLETE;
			break;
		case MA_St:
			masp->n_prime = *valp;
			rv = MOD_ARITH_DONE;
			break;
		default:
			EXEC_WARNING(("modarith: Illegal memory access type "
			    "%d", op));
			rv = MOD_ARITH_ILLEGAL_INST_TRAP;
		}
		break;

	case 0xa0:	/* sync register (ASI_MA_SYNC_REG) */
		switch (op) {
		case MA_Ld:
		case MA_LdSigned:
			if (nsp->vcore_id == masp->strand) {
				/*
				 * Normal case.  We are supposed to
				 * wait until the operation is done.
				 * But for now we have all operations
				 * complete instantly, so no waiting
				 * is necessary.  If the calling
				 * strand does not match the STRAND
				 * field in the control register, we
				 * do not update the register.
				 */
				*valp = 0;
				rv = MOD_ARITH_LD_COMPLETE;
			} else {
				rv = MOD_ARITH_DONE;
			}
			break;
		default:
			/*
			 * Should cause a data_access_exception trap.
			 * Do that when we learn how.
			 */
			EXEC_WARNING(("modarith: Illegal access - only "
			    "loads allowed to ASI_MA_SYNC_REG"));
			rv = MOD_ARITH_DATA_ACCESS_EX_TRAP;
		}
		break;

	default:
		if (offset & 0x7) {
			/*
			 * Should take a mem_address_not_aligned trap.
			 * We'll do that when we learn how.
			 */
			EXEC_WARNING(("modarith: unaligned memory access"));
			rv = MOD_ARITH_MEM_ALIGN_TRAP;
		} else {
			/*
			 * Something else wrong.
			 */
			EXEC_WARNING(("modarith: access to illegal or "
			    "unimplmented address"));
			rv = MOD_ARITH_UNIMPLEMENTED;
		}
	}

cleanexit:
	masp->busy = 0;
	pthread_mutex_unlock(&masp->lock);
	return (rv);
}
/*
 * Log a message for a BIG_ERR_CODE value and translate it into a
 * mod_arith return code.  Only BIG_NO_MEM maps to MOD_ARITH_FATAL; every
 * other code, recognised or not, is logged and mapped to MOD_ARITH_DONE.
 */
static mod_arith_rv_t
bigrv_print_conv(int bigcode)
{
	mod_arith_rv_t converted = MOD_ARITH_DONE;

	if (bigcode == BIG_OK) {
		/* can't happen */
		IMPL_WARNING(("modarith: bigrv_called with BIG_OK---"
		    "shouldn't happen"));
	} else if (bigcode == BIG_NO_MEM) {
		IMPL_WARNING(("modarith: malloc failed"));
		converted = MOD_ARITH_FATAL;
	} else if (bigcode == BIG_INVALID_ARGS) {
		EXEC_WARNING(("modarith: bignum package complains of "
		    "invalid args"));
	} else if (bigcode == BIG_DIV_BY_0) {
		EXEC_WARNING(("modarith: bignum package complains of "
		    "zero divide"));
	} else {
		IMPL_WARNING(("modarith: bignum package returned "
		    "unexpected error %d", bigcode));
	}

	return (converted);
}
/*
 * Initializes *numberp from the MA memory operand selected by reg.
 *
 *	numberp	bignum to initialize; storage comes from big_init() with
 *		room for BIGSIZE words.
 *	masp	MA unit state; masp->length gives the operand length in
 *		64-bit words and masp->ADDR[reg] its word offset in ma_mem.
 *	reg	index into masp->ADDR[], must be < MA_N_ADDR.
 *
 * Each 64-bit MA word becomes two 32-bit bignum words, low half first.
 * Returns BIG_OK, or the error from big_init().
 */
static BIG_ERR_CODE
big_init2(BIGNUM *numberp, mod_arith_t *masp, uint_t reg)
{
	BIG_ERR_CODE rv;
	uint_t i;
	uint_t len64 = masp->length;	/* in 64 bit words */
	uint64_t *p;

	/* Validate reg before it is used to index ADDR[]. */
	ASSERT(reg < MA_N_ADDR);
	ASSERT((masp->ADDR[reg] + len64) <= MA_MEM_XWORDS);
	p = &masp->ma_mem[masp->ADDR[reg]];

	/*
	 * This code depends on the bignum value being an array of 32 bit
	 * words. Verify that this is so.
	 */
#if BIG_CHUNK_SIZE != 32
#error
#endif
	ASSERT(sizeof (numberp->value[0]) == BIG_CHUNK_SIZE / 8);

	/* two bignum words per MA word must fit in the allocation */
	ASSERT(2 * len64 <= BIGSIZE);

	rv = big_init(numberp, BIGSIZE);
	if (rv) {
		return (rv);
	}

	for (i = 0; i < len64; ++i) {
		numberp->value[2 * i] = p[i] & 0xffffffffULL;
		numberp->value[2 * i + 1] = p[i] >> 32;
	}
	numberp->len = len64 * 64 / BIG_CHUNK_SIZE;
	return (BIG_OK);
}
/*
 * Copies the value out to the MA memory operand selected by reg and
 * destroys *numberp.  If there is insufficient room, BIG_INVALID_ARGS is
 * returned, and *numberp is destroyed anyway.  This stores the result
 * little-endian by word.  Thus it must not be used for the exponent.
 *
 *	numberp	bignum to copy out; released with big_finish().
 *	masp	MA unit state; masp->length gives the destination size in
 *		64-bit words and masp->ADDR[reg] its word offset in ma_mem.
 *	reg	index into masp->ADDR[], must be < MA_N_ADDR.
 *
 * "Insufficient room" means a nonzero bignum word lies beyond the
 * destination; zero words beyond it are dropped silently.
 */
static BIG_ERR_CODE
big_flush(BIGNUM *numberp, mod_arith_t *masp, uint_t reg)
{
	int i;
	uint64_t *tgt;
	uint_t tgtsize = masp->length;	/* in words */
	uint64_t overflow = 0;

	/* Validate reg before it is used to index ADDR[]. */
	ASSERT(reg < MA_N_ADDR);
	ASSERT((masp->ADDR[reg] + tgtsize) <= MA_MEM_XWORDS);
	tgt = &masp->ma_mem[masp->ADDR[reg]];

	/*
	 * This code depends on the bignum value being an array of 32 bit
	 * words. Verify that this is so.
	 */
#if BIG_CHUNK_SIZE != 32
#error
#endif
	ASSERT(sizeof (numberp->value[0]) == BIG_CHUNK_SIZE / 8);

	memset(tgt, 0, 8 * tgtsize);
	for (i = 0; i < numberp->len; ++i) {
		uint_t word = (uint_t)i / 2;	/* destination 64-bit word */

		if (word < tgtsize) {
			/* even bignum words fill the low half, odd the high */
			tgt[word] |= ((uint64_t)(numberp->value[i])) <<
			    (32 * (i & 1));
		} else {
			overflow |= !!numberp->value[i];
		}
	}

	big_finish(numberp);
	numberp->malloced = 0;
	return (overflow ? BIG_INVALID_ARGS : BIG_OK);
}
/*
 * There is a gigantic hack in all the code below. The N-Prime value
 * for Niagara is the inverse of the modulus mod 2^64. But the
 * N-Prime value that is needed by the 32 bit big number package is
 * the inverse of the modulus mod 2^32. But the latter value is just
 * the 32 lower bits of the former. So we just pass the former, and
 * it gets cut down to 32 bits in the parameter passing mechanism.
 * Can you spell "sleazy hack"? We really need to convert to a 64 bit
 * bignum library so we test the upper bits. Oh well.
 */
/*
 * Montgomery multiplication, i.e. X = A * B * 2^-modbits mod N, where
 * modbits is rounded up to a multiple of the wordsize.
 *
 * Operands are taken from MA memory: A at ADDR[0], B at ADDR[1], N at
 * ADDR[2]; the result X is written to ADDR[4].  All are masp->length
 * 64-bit words long.
 *
 * masp->lock must be held on entry and is held again on return; it is
 * dropped while the multiplication runs.
 *
 * Returns MOD_ARITH_DONE on success, otherwise the conversion of the
 * bignum error by bigrv_print_conv().
 */
static mod_arith_rv_t
mod_mpy(mod_arith_t *masp)
{
	BIG_ERR_CODE rv;
	BIGNUM A;	/* multiplier */
	BIGNUM B;	/* multiplicand */
	BIGNUM N;	/* modulus */
	BIGNUM X;	/* result */

	/* make it safe to call big_finish on all of them */
	A.malloced = 0;
	B.malloced = 0;
	N.malloced = 0;
	X.malloced = 0;

	/* masp->lock must always be held when getting to cleanexit */
	rv = big_init2(&A, masp, 0);
	if (rv) {
		goto cleanexit;
	}
	rv = big_init2(&B, masp, 1);
	if (rv) {
		goto cleanexit;
	}
	rv = big_init2(&N, masp, 2);
	if (rv) {
		goto cleanexit;
	}
	rv = big_init(&X, BIGSIZE);
	if (rv) {
		goto cleanexit;
	}

	/* release lock */
	pthread_mutex_unlock(&masp->lock);

	rv = big_mont_mul(&X, &A, &B, &N, masp->n_prime);
	if (rv) {
		pthread_mutex_lock(&masp->lock);
		goto cleanexit;
	}

	/* Do calls to free with lock released */
	big_finish(&A);
	big_finish(&B);
	big_finish(&N);

	pthread_mutex_lock(&masp->lock);

	/*
	 * The tmp reg (M) gets destroyed; we just set it to a bogus value,
	 * every byte 0x57.  This is done before the result is copied out
	 * so that the result at ADDR[4] is never overwritten, and it is
	 * skipped when the region would not fit in MA memory.
	 * NOTE(review): M is taken to be ADDR[3] for a multiply -- confirm
	 * against the PRM operand table.
	 */
	if ((masp->ADDR[3] + masp->length) <= MA_MEM_XWORDS) {
		memset(&masp->ma_mem[masp->ADDR[3]], 0x57, 8 * masp->length);
	}

	rv = big_flush(&X, masp, 4);	/* copies out and finishes X */

cleanexit:
	big_finish(&A);	/* idempotent and fast if nothing to do */
	big_finish(&B);
	big_finish(&N);
	big_finish(&X);

	if (rv) {
		return (bigrv_print_conv(rv));
	} else {
		return (MOD_ARITH_DONE);
	}
}
/*
 * Modular reduction as modelled here: R = A when A < N, otherwise
 * R = A - N (a single conditional subtraction).
 *
 * NOTE(review): the previous comment described this as a Montgomery
 * multiply by 1, i.e. R = A * 2^-modbits mod N.  The code does not do
 * that; confirm which behaviour is intended.
 *
 * Operands are taken from MA memory: A at ADDR[0], N at ADDR[1]; the
 * result R is written to ADDR[2].  All are masp->length 64-bit words.
 *
 * masp->lock must be held on entry and is held again on return; it is
 * dropped while the arithmetic runs.
 *
 * Returns MOD_ARITH_DONE on success, otherwise the conversion of the
 * bignum error by bigrv_print_conv().
 */
static mod_arith_rv_t
mod_reduce(mod_arith_t *masp)
{
	BIG_ERR_CODE rv;
	BIGNUM A;	/* operand */
	BIGNUM N;	/* modulus */
	BIGNUM R;	/* result */

	/* make it safe to call big_finish on all of them */
	A.malloced = 0;
	N.malloced = 0;
	R.malloced = 0;

	rv = big_init2(&A, masp, 0);
	if (rv) {
		goto cleanexit;
	}
	rv = big_init2(&N, masp, 1);
	if (rv) {
		goto cleanexit;
	}
	/* R needs storage before big_copy/big_sub_pos can write into it */
	rv = big_init(&R, BIGSIZE);
	if (rv) {
		goto cleanexit;
	}

	pthread_mutex_unlock(&masp->lock);

	if (big_cmp_abs(&A, &N) < 0) {
		/* A < N; so do R = A */
		rv = big_copy(&R, &A);
	} else {
		/* A >= N, so do R = A - N */
		rv = big_sub_pos(&R, &A, &N);
	}

	/* Do calls to free with lock released */
	big_finish(&A);
	big_finish(&N);

	pthread_mutex_lock(&masp->lock);

	if (rv) {
		goto cleanexit;
	}

	rv = big_flush(&R, masp, 2);	/* copies out and finishes R */

cleanexit:
	/* masp->lock must be held when we get here */
	big_finish(&A);	/* only do work in error cases */
	big_finish(&N);
	big_finish(&R);

	if (rv) {
		return (bigrv_print_conv(rv));
	} else {
		return (MOD_ARITH_DONE);
	}
}
/*
 * Return bit number `bit' (0 or 1) of a multi-word exponent.  Bits are
 * numbered from the most significant bit of exponent[0]: bit 0 is the
 * top bit of word 0, bit 63 its bottom bit, bit 64 the top bit of word 1,
 * and so on.  exponentsize is accepted but not used.
 */
static int
exponentbit(uint64_t *exponent, int exponentsize, int bit)
{
	uint64_t word = exponent[bit / 64];
	int shift = 63 - bit % 64;	/* lsb is shift 0 */

	return ((word >> shift) & 1);
}
/*
 * Modular exponentiation by left-to-right square and multiply, built
 * from Montgomery multiplications: for each exponent bit, most
 * significant first, X = X * X, and if the bit is set X = X * A.
 *
 * Operands are taken from MA memory: A at ADDR[0], N at ADDR[2], the
 * starting value of X at ADDR[3] (also where the result is written), and
 * the exponent at ADDR[4].  The exponent is 8 * (ADDR[5] + 1) bits long
 * and is read most significant word first.  The region at ADDR[1] is
 * overwritten with 0x87 bytes on success.
 *
 * masp->lock must be held on entry and is held again on return; it is
 * dropped while the multiplications run.
 *
 * Returns MOD_ARITH_DONE on success, otherwise the conversion of the
 * bignum error by bigrv_print_conv().
 */
static mod_arith_rv_t
mod_exp(mod_arith_t *masp)
{
	BIG_ERR_CODE rv;
	int i;
	int explen = 8 * (masp->ADDR[5] + 1);	/* exponent length in bits */
	uint64_t *exponentp = &masp->ma_mem[masp->ADDR[4]];
	int masplocked = 1;
	BIGNUM A;	/* base */
	BIGNUM N;	/* modulus */
	BIGNUM X;	/* result */

	/*
	 * exponentbit() reads 64-bit words, so the exponent occupies
	 * explen / 64 words rounded up.  Comparing a byte count against
	 * the word limit would reject valid exponents.
	 */
	ASSERT((masp->ADDR[4] + (explen + 63) / 64) <= MA_MEM_XWORDS);

	A.malloced = 0;	/* make safe to call big_finish */
	N.malloced = 0;
	X.malloced = 0;

	rv = big_init2(&A, masp, 0);
	if (rv) {
		goto cleanexit;
	}
	rv = big_init2(&N, masp, 2);
	if (rv) {
		goto cleanexit;
	}
	rv = big_init2(&X, masp, 3);
	if (rv) {
		goto cleanexit;
	}

	pthread_mutex_unlock(&masp->lock);
	masplocked = 0;

	for (i = 0; i < explen; ++i) {
		rv = big_mont_mul(&X, &X, &X, &N, masp->n_prime);
		if (rv) {
			goto cleanexit;
		}
		if (exponentbit(exponentp, explen, i)) {
			rv = big_mont_mul(&X, &X, &A, &N, masp->n_prime);
			if (rv) {
				goto cleanexit;
			}
		}
	}

	/* Do calls to free with lock released */
	big_finish(&A);
	big_finish(&N);

	pthread_mutex_lock(&masp->lock);
	masplocked = 1;

	ASSERT((masp->ADDR[1] + masp->length) <= MA_MEM_XWORDS);
	/*
	 * The tmp reg (M) gets destroyed; we just set it to an
	 * intentional bogus value, every byte 0x87.
	 */
	memset(&masp->ma_mem[masp->ADDR[1]], 0x87, 8 * masp->length);

	rv = big_flush(&X, masp, 3);	/* copies out and finishes X */

cleanexit:
	big_finish(&A);	/* these only do work in error branches */
	big_finish(&N);
	big_finish(&X);

	/* errors inside the loop arrive here with the lock dropped */
	if (!masplocked) {
		pthread_mutex_lock(&masp->lock);
	}

	if (rv) {
		return (bigrv_print_conv(rv));
	} else {
		return (MOD_ARITH_DONE);
	}
}