// ========== Copyright Header Begin ==========================================
// OpenSPARC T2 Processor File: workerthreads.cc
// Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
// The above named program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public
// License version 2 as published by the Free Software Foundation.
// The above named program is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
// You should have received a copy of the GNU General Public
// License along with this work; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
// ========== Copyright Header End ============================================
/*
 * Copyright (C) 2001,2005 Sun Microsystems, Inc.
 */
#pragma ident "%%1.27 06/12/14 %%"

/*
 * Supports multiple cpu simulation threads that "system.cc" can signal
 * and wait for, i.e. threads that live for the duration of blaze rather
 * than being created/destroyed for each UI command.
 *
 * These threads can, however, be destroyed and re-created, because we
 * support changing numthreads (which indirectly changes cpus-per-thread),
 * as well as changing the enabled state of individual cpus (which changes
 * the number of cpus to run, and hence the distribution of running cpus
 * across the worker threads).
 */
#include "workerthread.h"
extern int blaze_debug;                 /* main/ui_cmds */
// Vcpu0 invokes this doneftn_t callback, with its void * arg, once all the
// worker threads have finished their required execution.
doneftn_t volatile wrkthdCBFunc = NULL;   /* NB: one-shot callback ftn */
void * volatile    wrkthdCBArg  = NULL;   /* and its callback arg */
// system level event queue
// static EventQue *eventque;
sema_t wrkthdDONE;      // DONE semaphore. The Vcpu0 worker thread signals on
                        // this semaphore when all the worker threads are done.
                        // UI threads typically block on this sema while
                        // waiting for stepi/stept to finish.
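// Illustrative sketch (not part of the original source) of how a UI-side
// caller is expected to use this completion protocol; kick_workers() is a
// hypothetical stand-in for however the run is actually started:
//
//     wrkthdCBArg  = (void *) my_arg;     // install the one-shot callback
//     wrkthdCBFunc = my_done_ftn;
//     kick_workers ();                    // start the stepi/stept run
//     sema_wait (&wrkthdDONE);            // block until Vcpu0 posts DONE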
extern void   write_scalar_64 (FILE *fp, const char *name, uint64_t v);
extern bool_t read_scalar_64  (FILE *fp, const char *name, uint64_t *v);
static inline void breakpoint_hit (int cpu_id)
{
    ui->output("cpu[%i] hit a breakpoint. stop.. \n", cpu_id);
}
volatile int WorkerThread::BarrierCount = 0;
volatile int WorkerThread::BarrierLock  = 0;
volatile int WorkerThread::BarrierTemp  = 0;

int64_t WorkerThread::Nusecs  = 0;
int64_t WorkerThread::Ninstrs = 0;
int64_t WorkerThread::Ncycles = 0;
// step_remainder holds the number of instructions (in normal mode) or
// cycles (in execution-driven mode) remaining before the next stick update
// needs to happen. It is typically nonzero when a simulation is stopped
// between two stick updates (normally 1 microsecond of simulated time apart).
// Implementation note: this variable used to be called
// instrs_till_next_stick_update before it was also needed in
// execution-driven mode.
volatile uint64_t WorkerThread::step_remainder = 0;

volatile int   WorkerThread::numThds = 0;
WorkerThread * WorkerThread::wrkthds = 0;
pthread_key_t  WorkerThread::key     = 0;

volatile uint64_t WorkerThread::GlobalTimeUsecs = 0;
volatile int64_t  WorkerThread::GlobalTicks     = 0;
volatile uint64_t WorkerThread::stick_incr      = 0;
volatile uint64_t WorkerThread::stick_remainder = 0;

/* static */ Vcpu * WorkerThread::first_cpu = 0;  // first cpu has some special work to do

volatile int64_t WorkerThread::u_intervals = 0;
volatile int64_t WorkerThread::k_intervals = 0;
volatile int64_t WorkerThread::u_instrs    = 0;
volatile int64_t WorkerThread::k_instrs    = 0;
void WorkerThread::kill_worker_threads ()
{
    while (sema_trywait (&wrkthdDONE) == 0)
        ;                                       // "reset" just in case

    for (int i = 0; i < numThds; i++)
int WorkerThread::barrier()
{
    atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                   &blaze_stop_request, &BarrierTemp);

    GlobalTimeUsecs = ++simTime;

    if (first_cpu->config.trace_on) {

    int ret = atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                             &blaze_stop_request, &BarrierTemp);
    if (cpus[0] == first_cpu) {                 // VCPU0
        // this is *only* used for the "perf" UI command
        //@@@ and is now broken when umips != kmips @@@
        GlobalTicks += the_arch.mips;

        // all vcpu's execute update_stick() with the same stick_incr because
        // of the first atomic_barrier at the entrance of this function
        stick_incr       = the_arch.stick_freq / 1000000ull;
        stick_remainder += the_arch.stick_freq % 1000000ull;
        if (stick_remainder >= 1000000ull) {
            // fold the accumulated drift back into stick_incr once
            // stick_remainder exceeds one microsecond's worth of stick ticks
            stick_incr      += stick_remainder / 1000000ull;
            stick_remainder %= 1000000ull;
        }
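        // Worked example (illustrative, not from the original source): with
        // an assumed stick_freq of 1,500,000,500 Hz, each 1-usec interval adds
        //     stick_incr       = 1500000500 / 1000000 = 1500 ticks
        //     stick_remainder += 1500000500 % 1000000 = 500
        // so after 2000 intervals stick_remainder reaches 1,000,000, one extra
        // tick is folded into stick_incr, and the remainder wraps back to 0,
        // keeping the long-run stick rate exact despite the integer division.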
static inline int pstate_is_userp (Vcpu * vcpu)
{
    uint64_t tl, tt, tstate0;                       // special case TL > 0

    vcpu->get_reg (VCPU_PR_TL, &tl);
    if (tl > 0) vcpu->get_reg (VCPU_PR_TT, &tt);
    if (tl > 0 && ((tt >= 0x064 && tt <= 0x06f)     // tlb miss
                || (tt >= 0x080 && tt <= 0x0ff))) { // win spill/fill
        vcpu->get_reg (VCPU_PR_TSTATE, &tstate0);
        if (tstate0 & (0x0004 << 8)) return 0;
    }

    uint64_t pstate;                                // regular case TL == 0
    vcpu->get_reg (VCPU_PR_PSTATE, &pstate);
    return (pstate & 4) ? 0 : 1;
}
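// For reference: PSTATE.PRIV is bit 2 (mask 0x4), and TSTATE holds the
// trap-time PSTATE shifted up by 8 bits, which is why the TL > 0 path above
// tests (0x0004 << 8) while the TL == 0 path tests (pstate & 4).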
static inline int get_mmu_cntx (Vcpu * vcpu)
{
    /* Hack for sw05b: the kcfd process runs in kernel mode and with      */
    /* primary mmu context 0, but it can be identified by its             */
    /* secondary mmu context, which is unique to kcfd (although you       */
    /* do have to be clever to figure out what that value is!)            */
    uint64_t data;                                      /* Cheetah (only?) */

    (void) vcpu->get_asi (0x58, 0x10ll, data);    /* I/D MMU Secondary cntx */
static const uint64_t pstate_priv_mask = 0x2ull;
void WorkerThread::stept (uint64_t usecs)
{
    if (SYSTEM_in_execution_driven_mode()) {
        stepc(usecs * the_arch.cpu_freq / 1000000ull);

    // stepi and stept have been intermixed:
    // execute the leftover instructions before simulating 'usecs'
    called_from_stept = true;
    // If the conditional stepi() call above hit a breakpoint, we should not
    // enter the for loop below.
    // @@@ this needs to be re-thought; should there be a barrier here? @@@
    if (!IN_STOP_STATE(barrier ())) {
        uint64_t kmips = the_arch.kmips;    /* re-fetch volatiles every iter */
        uint64_t umips = the_arch.umips;
        uint64_t cmips = the_arch_cmips;
        int      ccntx = the_arch_ccntx;

        if (the_arch.roundrobin && (num_cpus > 1)) {
            // round-robin algorithm, with variable user/kernel mips:
            // 1. derive the user and kernel CPIs from the configured mips
            //    and cpu freq
            // 2. when a strand steps by 1 instruction, an associated cycle
            //    counter is incremented by that CPI
            // 3. round-robin over all strands, advancing each strand until it
            //    catches up with the other strands on this worker-thread
            // For accuracy, CPI cannot be limited to whole numbers, so we use
            // fixed-point arithmetic with 10 bits of fractional precision to
            // avoid FP ops in the critical loop. All quantities named
            // *_x_1024 represent such fixed-point values (a worked example
            // follows the CPI setup below).
            int64_t mhz = the_arch.cpu_freq / 1e+6;  // NOTE: cpu freq must be in whole megahertz for accuracy
            int64_t kernel_cpi_x_1024 = (mhz << 10) / kmips;
            int64_t user_cpi_x_1024   = (mhz << 10) / umips;
            int64_t mhz_x_1024        = mhz << 10;
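            // Worked example (illustrative, not from the original source):
            // assuming cpu_freq = 1000 MHz, kmips = 250 and umips = 500:
            //     kernel_cpi_x_1024 = (1000 << 10) / 250 = 4096  (CPI 4.0)
            //     user_cpi_x_1024   = (1000 << 10) / 500 = 2048  (CPI 2.0)
            //     mhz_x_1024        = 1024000, i.e. one usec worth of cycles
            // so a strand running kernel code reaches the 1-usec threshold
            // after 250 stepi(1) calls, while one running user code takes 500.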
            for (int usec = 0; usec < usecs; usec++) {
                for (cpuid = 0; cpuid < num_cpus; cpuid++) {

                int64_t cdone_x_1024 = 0;           // workerthread cycles done

                // go round-robin while the cycle counter represents less than a usec
                while (cdone_x_1024 < mhz_x_1024) {
                    int64_t cmin_x_1024 = LLONG_MAX;  // min of cycles done across all cpus

                    for (cpuid = 0; cpuid < num_cpus; cpuid++) {
                        cpus[cpuid]->get_reg(VCPU_PR_PSTATE, &pstate);
                        int64_t cpi_x_1024 = (pstate & pstate_priv_mask) ?
                                              kernel_cpi_x_1024 : user_cpi_x_1024;

                        // step this cpu as long as its cycle counter does not exceed all other cpus
                        while (done_x_1024[cpuid] <= cdone_x_1024) {
                            int rslt = cpus[cpuid]->stepi(1);
                            done_x_1024[cpuid] += cpi_x_1024;

                            BLAZE_STOP(blaze_stop_request);
                            breakpoint_hit (cpus[cpuid]->id());
                        } // while cycle counter not caught up with rest of the strands on this wt

                        if (done_x_1024[cpuid] < cmin_x_1024)
                            cmin_x_1024 = done_x_1024[cpuid];

                    // move up the threshold for the next round-robin interval
                    cdone_x_1024 = cmin_x_1024;
                } // while <mhz> cycles not done
                for (cpuid = 0; cpuid < num_cpus; cpuid++) {
                    cpus[cpuid]->update_stick(stick_incr);
                }

                if (IN_STOP_STATE(barrier ()))
        } else {                                // chunky mode: not round robin
            for (int i = 0; i < usecs; i++) {
                for (int j = 0; j < num_cpus; j++) {

                    if (pstate_is_userp (vcpu)) {
                        rslt = vcpu->stepi (umips);
                        atomic_add_64 (&u_intervals, 1);
                        atomic_add_64 (&u_instrs, umips);

                    if (ccntx != 0 && get_mmu_cntx (vcpu) == ccntx) {
                        rslt = vcpu->stepi (cmips);
                        atomic_add_64 (&k_intervals, 1);
                        atomic_add_64 (&k_instrs, cmips);

                        rslt = vcpu->stepi (kmips);
                        atomic_add_64 (&k_intervals, 1);
                        atomic_add_64 (&k_instrs, kmips);

                    vcpu->update_stick(stick_incr);

                    BLAZE_STOP(blaze_stop_request);
                    breakpoint_hit (vcpu->id ());

                if (IN_STOP_STATE(barrier ()))
        // @@@ we _should_ be able to eliminate this barrier, but then sam
        // crashes, and I haven't had time to figure out why @@@
        atomic_barrier (&numThds, &BarrierCount, &BarrierLock,
                        &blaze_stop_request, &BarrierTemp);

        if (cpus[0] == first_cpu) {             // VCPU 0 does some extra work...
            if (IN_STOP_STATE(blaze_stop_request)) {        // stop request -> stop
                BLAZE_STOP(blaze_run_state);
                BLAZE_CLEAR(blaze_stop_request);
            } else if (IN_GTSTEP_STATE(blaze_run_state))    // time sync -> wait
                BLAZE_GTWAIT(blaze_run_state);
            else                                            // anything else -> stop
                BLAZE_STOP(blaze_run_state);

            if (wrkthdCBFunc != NULL) {         // guard the one-shot callback
                doneftn_t TmpFunc = wrkthdCBFunc;
                wrkthdCBFunc = NULL;            // reset first to avoid race !
                (*TmpFunc)(wrkthdCBArg);        // then `do' callback !
            }

            sema_post (&wrkthdDONE);            // ------------------- DONE !!!
void WorkerThread::stepi (uint64_t n)
{
    uint64_t MIPS = the_arch.mips;  // instructions before every sync
    uint64_t loops;                 // number of 1 usec loops to execute
    uint64_t leftover_to_execute;   // number of instructions left from n
                                    // that have to be executed before return
                                    // w/o updating the STICK
    uint64_t leftovers;             // number of instructions that have to be
                                    // executed next time stepi is called
                                    // before updating STICK. leftovers < MIPS
    bool update = false;            // should we update STICK in this call ?
                                    // (since n could be less than leftovers)

    // loops * MIPS + leftover_to_execute + step_remainder == n
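    // Worked example (illustrative, not from the original source): assuming
    // MIPS = 1000, a step_remainder of 250 carried over from an earlier call,
    // and n = 3850, one split consistent with the invariant above is
    //     loops = 3 and leftover_to_execute = 600,
    // since 3*1000 + 600 + 250 == 3850: the 250 carried-over instructions run
    // first and end with a STICK update, each of the 3 full loops runs 1000
    // instructions and ends with a STICK update, the final 600 run without
    // one, and leftovers = 1000 - 600 = 400 is carried to the next call.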
    leftovers = step_remainder - n;

    leftover_to_execute = n % MIPS;
    leftovers = MIPS - n % MIPS;

    // do not update if step_remainder was 0

    // update STICK after executing 'step_remainder'
    for (int j = 0; j < num_cpus; j++) {
        if (cpus[j]->stepi(step_remainder) != 0) {
            // have hit a breakpoint on this strand or cpu, so
            BLAZE_STOP(blaze_stop_request);
            breakpoint_hit(cpus[j]->id());
        }
        cpus[j]->update_stick(stick_incr);
    }

    int sam_state = barrier();
    if (IN_STOP_STATE(sam_state))
        // a stop request has come either through the UI, or some strand has
        // hit a breakpoint; do not wait to complete the rest of the usecs
    for (int i = 0; i < loops; i++) {
        for (int j = 0; j < num_cpus; j++) {
            if (cpus[j]->stepi(MIPS) != 0) {
                // have hit a breakpoint on this strand or cpu
                BLAZE_STOP(blaze_stop_request);
                breakpoint_hit(cpus[j]->id());
                // don't break; let the other cpus on this workerthread
                // complete their required number of instructions
            }
            cpus[j]->update_stick(stick_incr);
        }

        if (IN_STOP_STATE(sam_state))
            // a stop request has come either through the UI, or some strand
            // has hit a breakpoint; do not wait to complete the rest of the usecs
    // execute the rest of the instructions w/o updating STICK
    for (int j = 0; j < num_cpus; j++) {
        if (cpus[j]->stepi(leftover_to_execute) != 0) {
            // have hit a breakpoint on this strand or cpu
            BLAZE_STOP(blaze_stop_request);
            breakpoint_hit(cpus[j]->id());
            // don't break; let the other cpus on this workerthread complete
            // their required number of instructions
        }
    }

    atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                   &blaze_stop_request, &BarrierTemp);
    if (cpus[0] == first_cpu) {
        step_remainder = leftovers;

        called_from_stept = false;          // do not want to stop here

        if (IN_STOP_STATE(blaze_stop_request)) {         // stop request -> stop
            BLAZE_STOP(blaze_run_state);
            BLAZE_CLEAR(blaze_stop_request);
        } else if (IN_GTSTEP_STATE(blaze_run_state))     // time sync -> wait
            BLAZE_GTWAIT(blaze_run_state);
        else                                             // anything else -> stop
            BLAZE_STOP(blaze_run_state);

        if (wrkthdCBFunc != NULL) {         // guard the one-shot callback
            doneftn_t TmpFunc = wrkthdCBFunc;
            wrkthdCBFunc = NULL;            // reset first to avoid race !
            (*TmpFunc)(wrkthdCBArg);        // then `do' callback !
        }

        sema_post (&wrkthdDONE);            // ---------- DONE !!!
// stepc is called in exec-driven mode
void WorkerThread::stepc (int64_t ncycles)
{
    int64_t cycles_per_usec = the_arch.cpu_freq / 1000000ull;

    for (i = step_remainder; i <= ncycles; i++) {
        int rslt = g_cpu_ex_intf.cycle(1);

        if (step_remainder == 0) {
            for (int j = 0; j < num_cpus; j++) {
                vcpu->update_stick(stick_incr);
            }
            step_remainder = cycles_per_usec;
        if (IN_STOP_STATE(barrier())) {
            BLAZE_STOP(blaze_stop_request);
        }

    // remember cycles left over until next sync

    if (IN_STOP_STATE(blaze_stop_request)) {             // stop request -> stop
        BLAZE_STOP(blaze_run_state);
        BLAZE_CLEAR(blaze_stop_request);
    } else if (IN_GTSTEP_STATE(blaze_run_state))         // time sync -> wait
        BLAZE_GTWAIT(blaze_run_state);
    else                                                 // anything else -> stop
        BLAZE_STOP(blaze_run_state);

    if (wrkthdCBFunc != NULL) {             // guard the one-shot callback
        doneftn_t TmpFunc = wrkthdCBFunc;
        wrkthdCBFunc = NULL;                // reset first to avoid race !
        (*TmpFunc)(wrkthdCBArg);            // then `do' callback !
    }

    sema_post (&wrkthdDONE);                // ------------------- DONE !!!
}  // void WorkerThread::stepc()
void WorkerThread::dump (FILE * fp)
{
    write_scalar_64 (fp, "GlobalTimeUsecs", GlobalTimeUsecs);
    write_scalar_64 (fp, "GlobalTicks",     GlobalTicks);
    write_scalar_64 (fp, "step_remainder",  step_remainder);
    write_scalar_64 (fp, "stick_remainder", stick_remainder);
}
int WorkerThread::restore (char * line)
{
    if      (sscanf (line, "GlobalTimeUsecs %lli", &GlobalTimeUsecs) == 1) ;
    else if (sscanf (line, "GlobalTicks %lli",     &GlobalTicks)     == 1) ;
    else if (sscanf (line, "stick_remainder %lli", &stick_remainder) == 1) ;

    // Note that the following two lines restore the same variable. This
    // reflects a rename of the variable from instr_till_next_stick_update to
    // step_remainder, to match its broader purpose in execution-driven runs.
    // The second restore has been left in for backward compatibility, so that
    // older checkpoints can still be restored. Please do not remove that line
    // unless you are sure that no checkpoint created prior to 12/13/2007
    // exists, or that they have all been patched to rename this variable.
    // Only one of those lines will ever get executed, depending on when the
    // checkpoint was created.
    else if (sscanf (line, "step_remainder %lli",               &step_remainder) == 1) ;
    else if (sscanf (line, "instr_till_next_stick_update %lli", &step_remainder) == 1) ;
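    // Illustrative sketch (not part of the original source) of the checkpoint
    // lines dump() emits and restore() parses back, assuming write_scalar_64
    // writes "<name> <value>" on a single line:
    //
    //     GlobalTimeUsecs 8273645
    //     GlobalTicks 4132822500
    //     step_remainder 713
    //     stick_remainder 250000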
void WorkerThread::create_worker_threads (int NumCpus, int cpusPerThread,
                                          int numThreads)
{
    static int FIRST_TIME = 1;          // init flag

    if (cpusPerThread == -1) {

    newnum = NumCpus / cpusPerThread;

        ui->warning("invalid conf numthds, using 1\n");
    } else if (newnum > 64) {
        ui->warning("invalid conf numthds, using 64\n");
    for (int i = 0; i <= g_vcpu_id_max; i++)
        if (get_vcpu(i) && cpu_enabled[i])

    ui->warning("Invalid: 0 enabled cpus, using 1\n");

    NumPerThread = NumEnabled / newnum;
    NumModThread = NumEnabled % newnum;
    sema_init (&wrkthdDONE, 0, USYNC_THREAD, NULL);
    // eventque = new EventQue();

    assert(pthread_key_create(&key, 0) == 0);

    if (newnum != WorkerThread::numThds || cpu_enable_changed) {

        if (newnum > HOSTINFO_numcpus()) {
            ui->warning("not enough host-cpus(%d) for sim-threads(%d)\n",
                        HOSTINFO_numcpus(), newnum);
        }
        // retrieve any existing events on the current workerthread eq's
        std::list<Event_t *> eventlist;
        for (int i = 0; i < numThds; i++) {
            while (!wrkthds[i].eq->empty()) {
                Event_t *event = new Event_t;
                wrkthds[i].eq->get_top (event);
                eventlist.push_front(event);
            }
        }
        wrkthds = new WorkerThread[newnum];
        for (int i = 0; i < newnum; i++)
            wrkthds[i].worker_id = i;
        // put all the events in eventlist on the new wrkthd 0 event queue
        while (!eventlist.empty()) {
            Event_t * e = eventlist.front();
            wrkthds[0].eq->insert_callback(e->pq_priority,
                                           e->pq_cbfunc, e->pq_cbarg1, e->pq_cbarg2,
                                           e->pq_cbunload, e->dbgstring, e->worker_id);
            eventlist.pop_front();      // remove the event so the loop terminates
        }
        // reassign vcpu's to the worker threads
        for (int i = 0; i < newnum; i++) {
            // number of vcpu's for this worker thread
            wrkthds[i].num_cpus = NumPerThread + (i < NumModThread);

            for (int j = 0; j < wrkthds[i].num_cpus; j++) {
                Vcpu * vcpu = get_vcpu(nextcpu);
                for (int k = nextcpu; k <= g_vcpu_id_max; k++)
                    if (!vcpu || !cpu_enabled[nextcpu])     // skip not enabled
                        vcpu = get_vcpu(++nextcpu);

                wrkthds[i].cpus[j] = vcpu;
        for (int i = 0; i < newnum; i++) {
            ui->verbose("cpu-worker-thread[%d] ", i);

    WorkerThread::numThds = newnum;
    cpu_enable_changed = false;