// ========== Copyright Header Begin ==========================================
//
// OpenSPARC T2 Processor File: workerthreads.cc
// Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
//
// The above named program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public
// License version 2 as published by the Free Software Foundation.
//
// The above named program is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public
// License along with this work; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
//
// ========== Copyright Header End ============================================
/*
* Copyright (C) 2001,2005 Sun Microsystems, Inc.
* All rights reserved.
*/
#pragma ident "%%1.27 06/12/14 %%"
/*
"workerthreads.cc"
Supports multiple cpu simulation threads that "system.cc"
can signal and wait for, i.e. threads that live for the duration of
blaze rather than being created/destroyed for each UI command.
These threads can, however, be destroyed and re-created, because we
support changing numthreads (which indirectly changes cpus-per-thread)
as well as changing the enabled state of individual cpus (which changes
the number of cpus to run, and hence the distribution of running cpus
across the worker threads).
*/
#include "ui.h"
#include "workerthread.h"
extern int blaze_debug; /* main/ui_cmds */
extern int blaze_option;
// When all the worker threads have finished their required execution,
// vcpu0 invokes this one-shot doneftn_t callback with its void * arg.
doneftn_t volatile wrkthdCBFunc = NULL; /* NB-one-shot callback ftn */
void * volatile wrkthdCBArg = NULL; /* and its callback arg */
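// Typical (hedged) usage sketch from the UI side: store the argument in
// wrkthdCBArg, then set wrkthdCBFunc, issue a stepi/stept command, and
// sema_wait(&wrkthdDONE). Vcpu0's worker clears wrkthdCBFunc before
// invoking it, so the callback fires at most once per command.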
// system level event queue
// static EventQue *eventque;
sema_t wrkthdDONE; // DONE semaphore. Vcpu0's worker thread signals on
// this semaphore when all the worker threads are done.
// UI threads typically block on this sema while
// waiting for stepi/stept to finish.
extern void write_scalar_64 (FILE *fp, const char * name, uint64_t v);
extern bool_t read_scalar_64 (FILE *fp, const char * name, uint64_t *v);
static inline void breakpoint_hit(int cpu_id)
{
ui->output("cpu[%i] hit a breakpoint. stopping...\n", cpu_id);
}
volatile int WorkerThread::BarrierCount = 0;
volatile int WorkerThread::BarrierLock = 0;
volatile int WorkerThread::BarrierTemp = 0;
int64_t WorkerThread::Nusecs = 0;
int64_t WorkerThread::Ninstrs = 0;
int64_t WorkerThread::Ncycles = 0;
// step_remainder holds the number of instructions (in normal mode)
// or cycles (in execution-driven mode) left before the next stick
// update needs to happen. It is nonzero when a simulation is stopped
// between two stick updates (usually 1 microsecond of simulated time).
// Implementation note: this was known as instr_till_next_stick_update
// before it was also needed in execution-driven mode.
volatile uint64_t WorkerThread::step_remainder = 0;
volatile int WorkerThread::numThds = 0;
WorkerThread * WorkerThread::wrkthds = 0;
pthread_key_t WorkerThread::key = 0;
volatile uint64_t WorkerThread::GlobalTimeUsecs = 0;
volatile int64_t WorkerThread::GlobalTicks = 0;
volatile uint64_t WorkerThread::stick_incr = 0;
volatile uint64_t WorkerThread::stick_remainder = 0;
/* static */ Vcpu * WorkerThread::first_cpu = 0; // first cpu has some special work to do
volatile int64_t WorkerThread::u_intervals = 0;
volatile int64_t WorkerThread::k_intervals = 0;
volatile int64_t WorkerThread::u_instrs = 0;
volatile int64_t WorkerThread::k_instrs = 0;
void WorkerThread::kill_worker_threads ()
{
if (numThds > 0) {
while (sema_trywait (&wrkthdDONE) == 0) ; // "reset" just in case
for (int i=0; i < numThds; i++)
wrkthds[i].killThread();
numThds = 0;
}
if (wrkthds) {
delete[] wrkthds;
wrkthds = 0;
}
}
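// barrier() is called by every worker thread once per simulated usec.
// It rendezvouses the workers, advances global simulated time, drains
// pending event-queue callbacks (and tracing), then rendezvouses again;
// afterwards vcpu0's thread recomputes stick_incr for the next interval.
// The return value is the run state sampled at the second barrier, so
// callers can detect a pending stop request.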
int WorkerThread::barrier()
{
atomic_barrier(&numThds,&BarrierCount,&BarrierLock,
&blaze_stop_request,&BarrierTemp);
GlobalTimeUsecs = ++simTime;
doEventqueCallbacks();
if (first_cpu->config.trace_on) {
doTrace();
}
int ret = atomic_barrier(&numThds,&BarrierCount,&BarrierLock,
&blaze_stop_request,&BarrierTemp);
if(cpus[0] == first_cpu) { // VCPU0
// this is *only* used for the "perf" UI command
//@@@ and is now broken when umips != kmips @@@
GlobalTicks += the_arch.mips;
// all vcpus execute update_stick() with the same stick_incr because of
// the first atomic_barrier at the entry of this function
stick_incr = the_arch.stick_freq/1000000ull;
stick_remainder += the_arch.stick_freq % 1000000ull;
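// Example (hedged): with a hypothetical stick_freq of 1,415,000,500 Hz,
// stick_incr is 1415 ticks per usec and stick_remainder grows by 500
// each usec; after 2000 usecs the remainder reaches 10^6 and one extra
// tick is folded into stick_incr below, so no long-term drift accrues.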
if(stick_remainder >= 1000000ull){
// fold the accumulated fractional ticks into stick_incr once
// stick_remainder reaches 10^6
stick_incr += stick_remainder/1000000ull;
stick_remainder %= 1000000ull;
}
}
return ret;
}
static inline int pstate_is_userp (Vcpu * vcpu)
{
uint64_t tl, tt, tstate0; // special case TL > 0
vcpu->get_reg (VCPU_PR_TL, &tl);
if (tl > 0) vcpu->get_reg (VCPU_PR_TT, &tt);
if (tl > 0 && ((tt >= 0x064 && tt <= 0x06f) // tlb miss
|| (tt >= 0x080 && tt <= 0x0ff))) { // win spill/fill
vcpu->get_reg (VCPU_PR_TSTATE, &tstate0);
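// TSTATE saves the pre-trap PSTATE in bits 8 and up, so PSTATE.PRIV
// (bit 2) shows up here as bit 10, i.e. 0x0004 << 8.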
if (tstate0 & (0x0004 << 8)) return 0;
else return 1;
}
uint64_t pstate; // regular case TL == 0
vcpu->get_reg (VCPU_PR_PSTATE, &pstate);
return (pstate & 4) ? 0 : 1;
}
static inline int get_mmu_cntx (Vcpu * vcpu)
/* Hack for sw05b: the kcfd process runs in kernel mode and with */
/* primary mmu context 0, but can be identified by its */
/* secondary mmu context, which is unique to kcfd (although */
/* you do have to be clever to figure out what that value is!) */
{
uint64_t data; /* Cheetah (only?) */
(void) vcpu->get_asi (0x58, 0x10ll, data); /* I/D MMU Secondary cntx */
return (int)data;
}
static const uint64_t pstate_priv_mask = 0x4ull; // PSTATE.PRIV is bit 2, matching the "pstate & 4" test in pstate_is_userp()
void WorkerThread::stept (uint64_t usecs)
{
int rslt;
if (SYSTEM_in_execution_driven_mode()) {
stepc(usecs * the_arch.cpu_freq / 1000000ull);
return;
}
if (step_remainder) {
// stepi and stept have been intermixed.
// execute the leftover instructions,
// before simulating 'usecs'
called_from_stept = true;
stepi (step_remainder);
}
// If the conditional stepi() call above hit a breakpoint,
// then we should not enter the loop below.
// @@@ this needs to be re-thought; should there be a barrier here? @@@
if (!IN_STOP_STATE(barrier ())) {
uint64_t kmips = the_arch.kmips; /* re-fetch volatiles every iter */
uint64_t umips = the_arch.umips;
uint64_t cmips = the_arch.cmips;
int ccntx = the_arch.ccntx;
if (the_arch.roundrobin && (num_cpus > 1)) {
// round-robin algorithm, with variable user/kernel mips:
//
// 1. derive user and kernel CPIs from the configured mips and cpu freq:
// CPI = MHz/MIPS
// 2. When a strand steps by 1 instruction, an associated cycle counter is
// incremented by the CPI
// 3. We round-robin over all strands, advancing each strand until it catches
// up with the other strands on this worker-thread.
// For accuracy, CPI cannot be limited to whole numbers.
// we use fixed-point arithmetic with 10 bits of
// precision, to avoid FP ops in the critical loop.
// all quantities named *_x_1024 represent such
// fixed-point numbers.
int64_t mhz = the_arch.cpu_freq/1e+6; // NOTE: cpu_freq must be a whole number of MHz for accuracy
int64_t kernel_cpi_x_1024 = (mhz << 10)/kmips;
int64_t user_cpi_x_1024 = (mhz << 10)/umips;
int64_t mhz_x_1024 = mhz << 10;
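// Example (hedged): with a hypothetical 1400 MHz cpu and kmips = 700,
// kernel_cpi_x_1024 = (1400 << 10) / 700 = 2048, i.e. a kernel CPI of
// 2.0, so each kernel-mode instruction advances that strand's cycle
// counter (done_x_1024) by 2048 fixed-point units = 2 cycles.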
for (uint64_t usec=0; usec<usecs; usec++) {
int cpuid;
for (cpuid=0; cpuid<num_cpus; cpuid++) {
done_x_1024[cpuid] = 0;
}
int64_t cdone_x_1024 = 0; // workerthread cycles done
// go round-robin, while cycle counter represents less than a usec
while(cdone_x_1024 < mhz_x_1024) {
int64_t cmin_x_1024 = LLONG_MAX; // min of cycles done across all cpus
for (cpuid=0; cpuid<num_cpus; cpuid++) {
uint64_t pstate;
cpus[cpuid]->get_reg(VCPU_PR_PSTATE, &pstate);
int64_t cpi_x_1024 = (pstate & pstate_priv_mask)? kernel_cpi_x_1024 : user_cpi_x_1024;
// step this cpu as long as its cycle counter does not exceed all other cpus
while(done_x_1024[cpuid] <= cdone_x_1024) {
int rslt = cpus[cpuid]->stepi(1);
done_x_1024[cpuid] += cpi_x_1024;
if (rslt) {
BLAZE_STOP(blaze_stop_request);
breakpoint_hit (cpus[cpuid]->id());
goto STEPT_LAST_RR;
}
} // while cycle counter not caught up with rest of the strands on this wt
if (done_x_1024[cpuid] < cmin_x_1024) cmin_x_1024 = done_x_1024[cpuid];
} // for each cpuid
// move up the threshold for the next round-robin interval
cdone_x_1024 = cmin_x_1024;
} // while <mhz> cycles not done
STEPT_LAST_RR:
for (cpuid=0; cpuid<num_cpus; cpuid++) {
cpus[cpuid]->update_stick(stick_incr);
}
if (IN_STOP_STATE(barrier ()))
break;
} // for usecs
} else { // chunky mode: not round robin
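// In chunky mode each strand executes its entire per-usec instruction
// budget (umips, kmips, or cmips instructions) in a single stepi()
// call, so strands interleave only at usec granularity; this trades
// interleaving fidelity for simulation speed.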
for (uint64_t i = 0; i < usecs; i++) {
for(int j = 0; j < num_cpus; j++) {
Vcpu * vcpu = cpus[j];
/*---STEPI---*/
if (pstate_is_userp (vcpu)) {
rslt = vcpu->stepi (umips); atomic_add_64 (&u_intervals, 1);
atomic_add_64 (&u_instrs, umips);
} else {
if (ccntx != 0 && get_mmu_cntx (vcpu) == ccntx) {
rslt = vcpu->stepi (cmips); atomic_add_64 (&k_intervals, 1);
atomic_add_64 (&k_instrs, cmips);
} else {
rslt = vcpu->stepi (kmips); atomic_add_64 (&k_intervals, 1);
atomic_add_64 (&k_instrs, kmips);
}
}
/*---STICK---*/
vcpu->update_stick(stick_incr);
if (rslt) {
BLAZE_STOP(blaze_stop_request);
breakpoint_hit (vcpu->id ());
}
} // for cpus
if (IN_STOP_STATE(barrier ()))
break;
} // for usecs
}
} // not already stopped
// @@@ we _should_ be able to eliminate this barrier, but then sam crashes,
// and I haven't had time to figure out why @@@
//
atomic_barrier (&numThds, &BarrierCount, &BarrierLock,
&blaze_stop_request, &BarrierTemp);
if (cpus[0] == first_cpu) { // VCPU 0 does some extra work...
if (IN_STOP_STATE(blaze_stop_request)) { // stop request -> stop
BLAZE_STOP(blaze_run_state);
BLAZE_CLEAR(blaze_stop_request);
} else if (IN_GTSTEP_STATE(blaze_run_state))// time sync -> wait
BLAZE_GTWAIT(blaze_run_state);
else // anything else -> stop
BLAZE_STOP(blaze_run_state);
doneftn_t TmpFunc = wrkthdCBFunc;
if (TmpFunc != NULL) {
wrkthdCBFunc = NULL; // reset first to avoid race !
(*TmpFunc)(wrkthdCBArg); // then `do' callback !
}
sema_post (&wrkthdDONE); // ------------------- DONE !!!
}
}
void WorkerThread::stepi(uint64_t n){
uint64_t MIPS = the_arch.mips; // instructions before every sync
uint64_t loops; // number of 1 usec loops to execute
uint64_t leftover_to_execute;
// number of instructions left from n
// that have to be executed before return
// w/o updating the STICK
uint64_t leftovers;
// number of instructions that have to be
// executed next time stepi is called
// before updating STICK. leftovers < MIPS
bool update = false; // should we update STICK in this call ?
// (n could be less than step_remainder)
// invariant (when n >= step_remainder):
// loops * MIPS + leftover_to_execute + step_remainder == n
if(step_remainder > n){
loops = 0;
leftover_to_execute = n;
leftovers = step_remainder - n;
update = false;
}else{
n -= step_remainder;
loops = n/MIPS;
leftover_to_execute = n % MIPS;
leftovers = MIPS - n % MIPS;
if(step_remainder)
// do not update if step_remainder was 0
update = true;
}
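// Example (hedged): with MIPS = 1000 and step_remainder = 300, a call
// stepi(2500) first runs the 300 leftover instructions and updates
// STICK, then (n = 2200) runs loops = 2 full 1000-instruction intervals
// with a STICK update each, then leftover_to_execute = 200 instructions
// with no update, leaving leftovers = 800 for the next call.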
if(update){
// update STICK after executing 'step_remainder'
for( int j = 0; j < num_cpus ; j++){
if(cpus[j]->stepi(step_remainder) != 0){
// have hit a breakpoint on this strand or cpu, so
BLAZE_STOP(blaze_stop_request);
breakpoint_hit(cpus[j]->id());
}
cpus[j]->update_stick(stick_incr);
}
int sam_state = barrier();
if (IN_STOP_STATE(sam_state))
// a stop request came in, either through the UI or because some strand
// hit a breakpoint; do not go on to complete the rest of the usecs
goto STOPNOW;
}
for (int i = 0; i < loops; i++) {
int sam_state;
for (int j = 0; j < num_cpus; j++ ) {
if(cpus[j]->stepi(MIPS) != 0){
// have hit a breakpoint on this strand or cpu
BLAZE_STOP(blaze_stop_request);
breakpoint_hit(cpus[j]->id());
// don't break, let other cpus on this workerthread complete
// their required number of instructions.
}
cpus[j]->update_stick(stick_incr);
}
sam_state = barrier();
if (IN_STOP_STATE(sam_state))
// a stop request came in, either through the UI or because some strand
// hit a breakpoint; do not go on to complete the rest of the usecs
goto STOPNOW;
}
// execute the rest of the instructions w/o updating STICK
for( int j = 0; j < num_cpus; j++ ) {
if(cpus[j]->stepi(leftover_to_execute) != 0){
// have hit a breakpoint on this strand or cpu
BLAZE_STOP(blaze_stop_request);
breakpoint_hit(cpus[j]->id());
// don't break, let other cpus on this workerthread complete
// their required number of instructions.
}
}
STOPNOW:
atomic_barrier(&numThds,&BarrierCount,&BarrierLock,&blaze_stop_request,&BarrierTemp);
if(cpus[0] == first_cpu){
step_remainder = leftovers;
if(called_from_stept){
called_from_stept = false; // do not want to stop here
return;
}
if (IN_STOP_STATE(blaze_stop_request)) { // stop request -> stop
BLAZE_STOP(blaze_run_state);
BLAZE_CLEAR(blaze_stop_request);
} else if (IN_GTSTEP_STATE(blaze_run_state))// time sync -> wait
BLAZE_GTWAIT(blaze_run_state);
else // anything else -> stop
BLAZE_STOP(blaze_run_state);
doneftn_t TmpFunc = wrkthdCBFunc;
if (TmpFunc != NULL) {
wrkthdCBFunc = NULL; // reset first to avoid race !
(*TmpFunc)(wrkthdCBArg); // then `do' callback !
}
sema_post (&wrkthdDONE); // ---------- DONE !!!
}
}
// stepc is called in execution-driven mode: it advances the coupled
// core model cycle by cycle and updates STICK once per simulated usec
void WorkerThread::stepc(int64_t ncycles)
{
int64_t cycles_per_usec = the_arch.cpu_freq / 1000000ull;
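// Example (hedged): for a hypothetical 1400 MHz core, cycles_per_usec
// is 1400, so update_stick() runs and the barrier is crossed once
// every 1400 simulated cycles, i.e. once per simulated usec.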
int64_t i;
for (i=step_remainder; i<=ncycles; i++) {
int rslt = g_cpu_ex_intf.cycle(1);
step_remainder--;
if (step_remainder == 0) {
for (int j = 0; j < num_cpus; j++) {
Vcpu * vcpu = cpus[j];
vcpu->update_stick(stick_incr);
}
step_remainder = cycles_per_usec;
if (IN_STOP_STATE(barrier())) {
break;
}
}
if (rslt) {
BLAZE_STOP(blaze_stop_request);
break;
}
}
// FIXME FIXME FIXME
// remember cycles left over until next sync
if (IN_STOP_STATE(blaze_stop_request)) { // stop request -> stop
BLAZE_STOP(blaze_run_state);
BLAZE_CLEAR(blaze_stop_request);
} else if (IN_GTSTEP_STATE(blaze_run_state))// time sync -> wait
BLAZE_GTWAIT(blaze_run_state);
else // anything else -> stop
BLAZE_STOP(blaze_run_state);
doneftn_t TmpFunc = wrkthdCBFunc;
if (TmpFunc != NULL) {
wrkthdCBFunc = NULL; // reset first to avoid race !
(*TmpFunc)(wrkthdCBArg); // then `do' callback !
}
sema_post (&wrkthdDONE); // ------------------- DONE !!!
} // void WorkerThread::stepc()
void WorkerThread::dump(FILE * fp){
write_scalar_64 (fp, "GlobalTimeUsecs",GlobalTimeUsecs );
write_scalar_64 (fp, "GlobalTicks",GlobalTicks );
write_scalar_64 (fp, "step_remainder",step_remainder);
write_scalar_64 (fp, "stick_remainder",stick_remainder);
}
int WorkerThread::restore (char * line)
{
if (sscanf (line, "GlobalTimeUsecs %lli", &GlobalTimeUsecs) == 1) ;
else if (sscanf (line, "GlobalTicks %lli", &GlobalTicks) == 1) ;
else if (sscanf (line, "stick_remainder %lli", &stick_remainder) == 1) ;
// Note that the following two lines restore the same variable. The
// variable was renamed from instr_till_next_stick_update to
// step_remainder when it acquired a broader purpose in execution-driven
// runs. The second line is kept for backward compatibility, so that
// older checkpoints can still be restored. Please do not remove it
// unless you are sure that no checkpoint created prior to 12/13/2007
// exists, or that all such checkpoints have been patched to rename this
// variable. Only one of the two lines will ever match, depending on
// when the checkpoint was taken.
else if (sscanf (line, "step_remainder %lli", &step_remainder) == 1) ;
else if (sscanf (line, "instr_till_next_stick_update %lli", &step_remainder) == 1) ;
else return FALSE;
return TRUE;
}
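// Example (hedged) checkpoint fragment accepted by restore(), one
// scalar per line in the sscanf formats above:
// GlobalTimeUsecs 1048576
// GlobalTicks 2097152
// stick_remainder 500000
// step_remainder 0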
void WorkerThread::create_worker_threads (int NumCpus, int cpusPerThread, int numThreads)
{
static int FIRST_TIME = 1; // init flag
int NumEnabled = 0;
int NumPerThread = 0;
int NumModThread = 0;
int newnum = numThreads;
if (newnum == -1) {
if (cpusPerThread == -1) {
newnum = 1;
} else {
newnum = NumCpus / cpusPerThread;
}
}
if (newnum <= 0) {
ui->warning("invalid conf numthds, using 1\n");
newnum = 1;
} else if (newnum > 64) {
ui->warning("invalid conf numthds, using 64\n");
newnum = 64;
}
NumEnabled = 0;
for (int i=0; i<=g_vcpu_id_max; i++)
if (get_vcpu(i) && cpu_enabled[i])
NumEnabled++;
if (NumEnabled == 0) {
ui->warning("Invalid: 0 enabled cpus, using 1\n");
NumEnabled = 1;
}
NumPerThread = NumEnabled / newnum;
NumModThread = NumEnabled % newnum;
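// Example (hedged): with 6 enabled cpus and newnum = 4 worker threads,
// NumPerThread = 1 and NumModThread = 2, so threads 0 and 1 get two
// cpus each and threads 2 and 3 get one each (see the
// "NumPerThread + (i < NumModThread)" assignment below).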
if (FIRST_TIME) {
sema_init (&wrkthdDONE, 0, USYNC_THREAD,NULL);
// eventque = new EventQue();
int rc = pthread_key_create(&key, 0);
assert(rc == 0); // keep the call outside assert(): side effects vanish under NDEBUG
(void) rc; // avoid an unused-variable warning when NDEBUG is defined
FIRST_TIME = 0;
}
if (newnum != WorkerThread::numThds || cpu_enable_changed) {
if (newnum > HOSTINFO_numcpus()) {
ui->warning("not enough host-cpus(%d) for sim-threads(%d)\n",
HOSTINFO_numcpus(), newnum);
}
// retrieve any existing events on the current workerthread eq's
std::list<Event_t *> eventlist;
for( int i = 0 ; i < numThds; i++){
while(!wrkthds[i].eq->empty()){
Event_t *event = new Event_t;
wrkthds[i].eq->get_top (event);
eventlist.push_front(event);
}
}
kill_worker_threads ();
wrkthds = new WorkerThread[newnum];
for (int i = 0; i < newnum; i++)
wrkthds[i].worker_id = i;
// put all events in eventlist on new wrkthd 0 event queue
while( !eventlist.empty() ){
Event_t * e = eventlist.front();
wrkthds[0].eq->insert_callback(e->pq_priority,
e->pq_cbfunc,e->pq_cbarg1,e->pq_cbarg2,
e->pq_cbunload, e->dbgstring, e->worker_id);
eventlist.pop_front();
delete e;
}
// reassign vcpu's to the worker threads
int nextcpu = 0;
first_cpu = 0;
for (int i = 0; i < newnum; i++) {
// number of vcpu's for this worker thread
wrkthds[i].num_cpus = NumPerThread + (i < NumModThread);
for (int j = 0; j < wrkthds[i].num_cpus; j++)
{
// find next vcpu
Vcpu* vcpu = get_vcpu(nextcpu);
for (int k=nextcpu; k<=g_vcpu_id_max; k++)
{
if(!vcpu || !cpu_enabled[nextcpu]) // skip missing or disabled cpus
vcpu = get_vcpu(++nextcpu);
else
break;
}
if (first_cpu == 0)
first_cpu = vcpu;
wrkthds[i].cpus[j] = vcpu;
nextcpu++;
}
}
for (int i=0; i<newnum; i++) {
ui->verbose("cpu-worker-thread[%d] ",i);
wrkthds[i].info();
}
WorkerThread::numThds = newnum;
cpu_enable_changed = false;
}
}