// ========== Copyright Header Begin ==========================================
//
// OpenSPARC T2 Processor File: workerthreads.cc
// Copyright (c) 2006 Sun Microsystems, Inc.  All Rights Reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
//
// The above named program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public
// License version 2 as published by the Free Software Foundation.
//
// The above named program is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public
// License along with this work; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
//
// ========== Copyright Header End ============================================
/*
 * Copyright (C) 2001,2005 Sun Microsystems, Inc.
 * All rights reserved.
 */
#pragma ident "%%1.27 06/12/14 %%"

/*
  "workerthreads.cc"

  Supports multiple cpu simulation threads that "system.cc" can signal
  and wait for, i.e. threads that live for the duration of blaze rather
  than being created/destroyed for each UI command.

  These threads can, however, be destroyed and re-created, because we
  support changing numthreads (which indirectly changes cpus-per-thread)
  as well as changing the enabled state of individual cpus (which changes
  the number of cpus to run, and hence the distribution of running cpus
  across the worker threads).
*/

#include "ui.h"
#include "workerthread.h"

extern int blaze_debug;    /* main/ui_cmds */
extern int blaze_option;

// Vcpu0 calls this doneftn_t with its void* arg once all the worker threads
// have completed their required execution.
doneftn_t volatile wrkthdCBFunc = NULL;   /* NB: one-shot callback ftn */
void *    volatile wrkthdCBArg  = NULL;   /* and its callback arg */

// system level event queue
// static EventQue *eventque;

sema_t wrkthdDONE;   // DONE semaphore. The Vcpu0 worker thread posts this
                     // semaphore when all the worker threads are done.
                     // UI threads typically block on this sema while
                     // waiting for a stepi/stept command to finish.

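// Typical UI-side use of the callback + semaphore pair (an illustrative
// sketch, not code from this file; the caller's behavior is assumed):
//
//     wrkthdCBArg  = my_arg;            // publish the arg before the ftn
//     wrkthdCBFunc = my_done_callback;  // one-shot: cleared before invocation
//     ... signal the worker threads to run ...
//     sema_wait (&wrkthdDONE);          // block until Vcpu0 posts completion
//
// Because wrkthdCBFunc is cleared before it is invoked (see stepi/stept
// below), a callback can safely re-arm itself for the next step.
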
extern void   write_scalar_64 (FILE *fp, const char * name, uint64_t v);
extern bool_t read_scalar_64  (FILE *fp, const char * name, uint64_t *v);

static inline void breakpoint_hit(int cpu_id)
{
    ui->output("cpu[%i] hit a breakpoint. stop.. \n", cpu_id);
}

volatile int WorkerThread::BarrierCount = 0;
volatile int WorkerThread::BarrierLock  = 0;
volatile int WorkerThread::BarrierTemp  = 0;
int64_t      WorkerThread::Nusecs       = 0;
int64_t      WorkerThread::Ninstrs      = 0;
int64_t      WorkerThread::Ncycles      = 0;

// step_remainder holds the number of instructions (in normal mode)
// or cycles (in execution-driven mode) left before the next stick_update
// needs to happen. It becomes nonzero when a simulation is stopped between
// two stick updates (which normally occur every 1 microsecond of simulated
// time).
// Implementation note: this was known as instrs_till_next_stick_update
// before it was also needed in execution-driven mode.
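// Illustrative example (numbers assumed, not from any config): with
// MIPS = 1000, stepi(2500) on an initially zero step_remainder executes
// two full 1-usec intervals of 1000 instructions (each followed by a
// stick update) plus 500 leftover instructions with no update; it then
// records step_remainder = 500, the count still owed before the next
// stick update when stepping resumes.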
volatile uint64_t WorkerThread::step_remainder = 0;

volatile int      WorkerThread::numThds = 0;
WorkerThread *    WorkerThread::wrkthds = 0;
pthread_key_t     WorkerThread::key = 0;
volatile uint64_t WorkerThread::GlobalTimeUsecs = 0;
volatile int64_t  WorkerThread::GlobalTicks = 0;

volatile uint64_t WorkerThread::stick_incr = 0;
volatile uint64_t WorkerThread::stick_remainder = 0;

/* static */ Vcpu * WorkerThread::first_cpu = 0;   // first cpu has some special work to do

volatile int64_t WorkerThread::u_intervals = 0;
volatile int64_t WorkerThread::k_intervals = 0;

volatile int64_t WorkerThread::u_instrs = 0;
volatile int64_t WorkerThread::k_instrs = 0;

void WorkerThread::kill_worker_threads ()
{
    if (numThds > 0) {

        while (sema_trywait (&wrkthdDONE) == 0) ;   // "reset" just in case

        for (int i = 0; i < numThds; i++)
            wrkthds[i].killThread();

        numThds = 0;
    }

    if (wrkthds) {
        delete[] wrkthds;
        wrkthds = 0;
    }
}

int WorkerThread::barrier()
{
    atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                   &blaze_stop_request, &BarrierTemp);

    GlobalTimeUsecs = ++simTime;

    doEventqueCallbacks();

    if (first_cpu->config.trace_on) {
        doTrace();
    }

    int ret = atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                             &blaze_stop_request, &BarrierTemp);

    if (cpus[0] == first_cpu) {   // VCPU0

        // this is *only* used for the "perf" UI command
        //@@@ and is now broken when umips != kmips @@@
        GlobalTicks += the_arch.mips;

        // all vcpus execute update_stick() with the same stick_incr because
        // of the first atomic_barrier at the entrance of this function
        stick_incr       = the_arch.stick_freq / 1000000ull;
        stick_remainder += the_arch.stick_freq % 1000000ull;
        if (stick_remainder >= 1000000ull) {
            // fold the accumulated drift back into stick_incr whenever
            // stick_remainder reaches 10^6
            stick_incr      += stick_remainder / 1000000ull;
            stick_remainder %= 1000000ull;
        }
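
        // Worked example (stick_freq value assumed, for illustration): with
        // stick_freq = 1,428,571,428 Hz, stick_incr is 1428 ticks per usec
        // and stick_remainder grows by 571,428 each usec; every second usec
        // the remainder crosses 10^6, stick_incr becomes 1429 for that
        // interval, and the long-run average stays at stick_freq / 10^6
        // ticks per simulated usec.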
    }

    return ret;
}


static inline int pstate_is_userp (Vcpu * vcpu)
{
    uint64_t tl, tt, tstate0;                      // special case: TL > 0
    vcpu->get_reg (VCPU_PR_TL, &tl);
    if (tl > 0) vcpu->get_reg (VCPU_PR_TT, &tt);
    if (tl > 0 && ((tt >= 0x064 && tt <= 0x06f)    // tlb miss
            || (tt >= 0x080 && tt <= 0x0ff))) {    // win spill/fill
        vcpu->get_reg (VCPU_PR_TSTATE, &tstate0);

        // TSTATE holds the trapped PSTATE starting at bit 8, so
        // (0x0004 << 8) tests the saved PSTATE.PRIV bit
        if (tstate0 & (0x0004 << 8)) return 0;
        else                         return 1;
    }

    uint64_t pstate;                               // regular case: TL == 0
    vcpu->get_reg (VCPU_PR_PSTATE, &pstate);
    return (pstate & 4) ? 0 : 1;
}

static inline int get_mmu_cntx (Vcpu * vcpu)
/* Hack for sw05b: the kcfd process runs in kernel mode with      */
/* primary mmu context 0, but can be identified by its            */
/* secondary mmu context, which is unique to kcfd (although       */
/* you do have to be clever to figure out what that value is!)    */
{
    uint64_t data;                                /* Cheetah (only?) */
    (void) vcpu->get_asi (0x58, 0x10ll, data);    /* I/D MMU Secondary cntx */
    return (int)data;
}


static const uint64_t pstate_priv_mask = 0x2ull;

void WorkerThread::stept (uint64_t usecs)
{
    int rslt;

    if (SYSTEM_in_execution_driven_mode()) {
        stepc(usecs * the_arch.cpu_freq / 1000000ull);
        return;
    }

    if (step_remainder) {
        // stepi and stept have been intermixed:
        // execute the leftover instructions
        // before simulating 'usecs'
        called_from_stept = true;
        stepi (step_remainder);
    }

    // If the conditional stepi() call above hit a breakpoint,
    // we should not enter the loop below.
    // @@@ this needs to be re-thought, should there be a barrier here @@@
    if (!IN_STOP_STATE(barrier ())) {
        uint64_t kmips = the_arch.kmips;   /* re-fetch volatiles every iter */
        uint64_t umips = the_arch.umips;
        uint64_t cmips = the_arch_cmips;
        int      ccntx = the_arch_ccntx;

        if (the_arch.roundrobin && (num_cpus > 1)) {

            // round-robin algorithm, with variable user/kernel mips:
            //
            // 1. derive user and kernel CPIs from the configured mips and
            //    cpu freq:  CPI = MHz/MIPS
            // 2. when a strand steps by 1 instruction, an associated cycle
            //    counter is incremented by the CPI
            // 3. round-robin over all strands, advancing each strand until
            //    it catches up with the other strands on this worker thread
            //
            // For accuracy, CPI cannot be limited to whole numbers: we use
            // fixed-point arithmetic with 10 bits of precision to avoid FP
            // ops in the critical loop. All quantities named *_x_1024
            // represent such fixed-point numbers.

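            // Worked example (numbers assumed, for illustration): with a
            // 1200 MHz cpu and kmips = 600, the kernel CPI is 2.0, stored
            // as kernel_cpi_x_1024 = (1200 << 10) / 600 = 2048. Each
            // kernel-mode instruction then advances that strand's cycle
            // counter by 2048, i.e. 2.0 cycles in x_1024 fixed point, and
            // one simulated usec is mhz_x_1024 = 1200 << 10 = 1228800.
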
            int64_t mhz = the_arch.cpu_freq/1e+6;   // NOTE: cpu freq must be a whole number of MHz for accuracy

            int64_t kernel_cpi_x_1024 = (mhz << 10) / kmips;
            int64_t user_cpi_x_1024   = (mhz << 10) / umips;

            int64_t mhz_x_1024 = mhz << 10;

            for (int usec = 0; usec < usecs; usec++) {
                int cpuid;
                for (cpuid = 0; cpuid < num_cpus; cpuid++) {
                    done_x_1024[cpuid] = 0;
                }

                int64_t cdone_x_1024 = 0;   // worker-thread cycles done

                // go round-robin while the cycle counter represents less than a usec
                while (cdone_x_1024 < mhz_x_1024) {

                    int64_t cmin_x_1024 = LLONG_MAX;   // min of cycles done across all cpus
                    for (cpuid = 0; cpuid < num_cpus; cpuid++) {

                        uint64_t pstate;
                        cpus[cpuid]->get_reg(VCPU_PR_PSTATE, &pstate);
                        int64_t cpi_x_1024 = (pstate & pstate_priv_mask) ?
                                             kernel_cpi_x_1024 : user_cpi_x_1024;

                        // step this cpu until its cycle counter passes the
                        // current round-robin threshold
                        while (done_x_1024[cpuid] <= cdone_x_1024) {
                            int rslt = cpus[cpuid]->stepi(1);
                            done_x_1024[cpuid] += cpi_x_1024;
                            if (rslt) {
                                BLAZE_STOP(blaze_stop_request);
                                breakpoint_hit (cpus[cpuid]->id());
                                goto STEPT_LAST_RR;
                            }
                        } // while this strand has not caught up with the rest on this wt

                        if (done_x_1024[cpuid] < cmin_x_1024) cmin_x_1024 = done_x_1024[cpuid];

                    } // for each cpuid

                    // move up the threshold for the next round-robin interval
                    cdone_x_1024 = cmin_x_1024;
                } // while <mhz> cycles not done

            STEPT_LAST_RR:

                for (cpuid = 0; cpuid < num_cpus; cpuid++) {
                    cpus[cpuid]->update_stick(stick_incr);
                }

                if (IN_STOP_STATE(barrier ()))
                    break;

            } // for usecs
        } else {   // chunky mode: not round-robin

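            // In chunky mode each strand executes a whole interval's worth
            // of instructions per simulated usec, with the interval size
            // picked by privilege mode. Illustrative example (numbers
            // assumed): with umips = 100 and kmips = 50, a strand found in
            // user mode steps 100 instructions for this usec while a strand
            // in kernel mode steps only 50, modeling kernel code as running
            // at half the user-mode rate.
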
            for (int i = 0; i < usecs; i++) {

                for (int j = 0; j < num_cpus; j++) {

                    Vcpu * vcpu = cpus[j];

                    /*---STEPI---*/
                    if (pstate_is_userp (vcpu)) {
                        rslt = vcpu->stepi (umips);
                        atomic_add_64 (&u_intervals, 1);
                        atomic_add_64 (&u_instrs, umips);
                    } else {
                        if (ccntx != 0 && get_mmu_cntx (vcpu) == ccntx) {
                            rslt = vcpu->stepi (cmips);
                            atomic_add_64 (&k_intervals, 1);
                            atomic_add_64 (&k_instrs, cmips);
                        } else {
                            rslt = vcpu->stepi (kmips);
                            atomic_add_64 (&k_intervals, 1);
                            atomic_add_64 (&k_instrs, kmips);
                        }
                    }
                    /*---STICK---*/
                    vcpu->update_stick(stick_incr);

                    if (rslt) {
                        BLAZE_STOP(blaze_stop_request);
                        breakpoint_hit (vcpu->id ());
                    }
                } // for cpus

                if (IN_STOP_STATE(barrier ()))
                    break;

            } // for usecs
        }
    } // not already stopped

    // @@@ we _should_ be able to eliminate this barrier, but then sam crashes,
    // and I haven't had time to figure out why @@@
    //
    atomic_barrier (&numThds, &BarrierCount, &BarrierLock,
                    &blaze_stop_request, &BarrierTemp);

    if (cpus[0] == first_cpu) {   // VCPU0 does some extra work...
        if (IN_STOP_STATE(blaze_stop_request)) {      // stop request -> stop
            BLAZE_STOP(blaze_run_state);
            BLAZE_CLEAR(blaze_stop_request);
        } else if (IN_GTSTEP_STATE(blaze_run_state))  // time sync -> wait
            BLAZE_GTWAIT(blaze_run_state);
        else                                          // anything else -> stop
            BLAZE_STOP(blaze_run_state);
        doneftn_t TmpFunc = wrkthdCBFunc;
        if (TmpFunc != NULL) {
            wrkthdCBFunc = NULL;       // reset first to avoid a race!
            (*TmpFunc)(wrkthdCBArg);   // then 'do' the callback!
        }
        sema_post (&wrkthdDONE);       // ------------------- DONE !!!
    }
}


void WorkerThread::stepi (uint64_t n)
{
    uint64_t MIPS = the_arch.mips;   // instructions before every sync

    uint64_t loops;                  // number of 1-usec loops to execute
    uint64_t leftover_to_execute;    // instructions left from n that must be
                                     // executed before returning, w/o
                                     // updating the STICK
    uint64_t leftovers;              // instructions that must be executed the
                                     // next time stepi is called, before
                                     // updating STICK; leftovers < MIPS

    bool update = false;             // should STICK be updated in this call?
                                     // (n could be smaller than the
                                     // outstanding step_remainder)

    // decomposition (for n >= step_remainder):
    //   loops * MIPS + leftover_to_execute + step_remainder == n
    if (step_remainder > n) {
        loops = 0;
        leftover_to_execute = n;
        leftovers = step_remainder - n;
        update = false;
    } else {
        n -= step_remainder;
        loops = n / MIPS;
        leftover_to_execute = n % MIPS;
        leftovers = MIPS - n % MIPS;
        if (step_remainder)
            // do not update if step_remainder was 0
            update = true;
    }
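
    // Worked example (numbers assumed): with MIPS = 1000, step_remainder =
    // 300 and stepi(2500), the 300 outstanding instructions run first and
    // STICK is updated; the remaining 2200 split into loops = 2 full
    // intervals (each followed by a STICK update) plus leftover_to_execute =
    // 200 instructions run without an update, and leftovers = 800 is stored
    // in step_remainder for the next call.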

    if (update) {
        // update STICK after executing 'step_remainder' instructions
        for (int j = 0; j < num_cpus; j++) {
            if (cpus[j]->stepi(step_remainder) != 0) {
                // hit a breakpoint on this strand or cpu, so
                BLAZE_STOP(blaze_stop_request);
                breakpoint_hit(cpus[j]->id());
            }
            cpus[j]->update_stick(stick_incr);
        }

        int sam_state = barrier();

        if (IN_STOP_STATE(sam_state))
            // a stop request has come in either through the UI, or some
            // strand has hit a breakpoint; do not wait to complete the
            // rest of the usecs
            goto STOPNOW;
    }

    for (int i = 0; i < loops; i++) {
        int sam_state;
        for (int j = 0; j < num_cpus; j++) {
            if (cpus[j]->stepi(MIPS) != 0) {
                // hit a breakpoint on this strand or cpu
                BLAZE_STOP(blaze_stop_request);
                breakpoint_hit(cpus[j]->id());
                // don't break; let the other cpus on this worker thread
                // complete their required number of instructions
            }
            cpus[j]->update_stick(stick_incr);
        }
        sam_state = barrier();

        if (IN_STOP_STATE(sam_state))
            // a stop request has come in either through the UI, or some
            // strand has hit a breakpoint; do not wait to complete the
            // rest of the usecs
            goto STOPNOW;
    }

    // execute the rest of the instructions w/o updating STICK
    for (int j = 0; j < num_cpus; j++) {
        if (cpus[j]->stepi(leftover_to_execute) != 0) {
            // hit a breakpoint on this strand or cpu
            BLAZE_STOP(blaze_stop_request);
            breakpoint_hit(cpus[j]->id());
            // don't break; let the other cpus on this worker thread
            // complete their required number of instructions
        }
    }

STOPNOW:

    atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                   &blaze_stop_request, &BarrierTemp);

    if (cpus[0] == first_cpu) {
        step_remainder = leftovers;

        if (called_from_stept) {
            called_from_stept = false;   // we do not want to stop here
            return;
        }

        if (IN_STOP_STATE(blaze_stop_request)) {      // stop request -> stop
            BLAZE_STOP(blaze_run_state);
            BLAZE_CLEAR(blaze_stop_request);
        } else if (IN_GTSTEP_STATE(blaze_run_state))  // time sync -> wait
            BLAZE_GTWAIT(blaze_run_state);
        else                                          // anything else -> stop
            BLAZE_STOP(blaze_run_state);

        if (wrkthdCBFunc) {
            doneftn_t TmpFunc = wrkthdCBFunc;
            if (TmpFunc != NULL) {
                wrkthdCBFunc = NULL;       // reset first to avoid a race!
                (*TmpFunc)(wrkthdCBArg);   // then 'do' the callback!
            }
        }
        sema_post (&wrkthdDONE);           // ---------- DONE !!!
    }
}


// stepc is called in exec-driven mode
void WorkerThread::stepc (int64_t ncycles)
{
    int64_t cycles_per_usec = the_arch.cpu_freq / 1000000ull;
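
    // Example (frequency assumed for illustration): at cpu_freq = 1.2 GHz
    // this is 1200 cycles per simulated usec, so the loop below updates
    // the STICK registers once every 1200 simulated cycles.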

    int64_t i;
    for (i = step_remainder; i <= ncycles; i++) {
        int rslt = g_cpu_ex_intf.cycle(1);
        step_remainder--;

        if (step_remainder == 0) {
            for (int j = 0; j < num_cpus; j++) {
                Vcpu * vcpu = cpus[j];
                vcpu->update_stick(stick_incr);
            }
            step_remainder = cycles_per_usec;
            if (IN_STOP_STATE(barrier())) {
                break;
            }
        }

        if (rslt) {
            BLAZE_STOP(blaze_stop_request);
            break;
        }

    }

    // FIXME FIXME FIXME
    // remember cycles left over until next sync
    if (IN_STOP_STATE(blaze_stop_request)) {      // stop request -> stop
        BLAZE_STOP(blaze_run_state);
        BLAZE_CLEAR(blaze_stop_request);
    } else if (IN_GTSTEP_STATE(blaze_run_state))  // time sync -> wait
        BLAZE_GTWAIT(blaze_run_state);
    else                                          // anything else -> stop
        BLAZE_STOP(blaze_run_state);
    doneftn_t TmpFunc = wrkthdCBFunc;
    if (TmpFunc != NULL) {
        wrkthdCBFunc = NULL;       // reset first to avoid a race!
        (*TmpFunc)(wrkthdCBArg);   // then 'do' the callback!
    }
    sema_post (&wrkthdDONE);       // ------------------- DONE !!!
} // void WorkerThread::stepc()


void WorkerThread::dump (FILE * fp)
{
    write_scalar_64 (fp, "GlobalTimeUsecs", GlobalTimeUsecs);
    write_scalar_64 (fp, "GlobalTicks",     GlobalTicks);
    write_scalar_64 (fp, "step_remainder",  step_remainder);
    write_scalar_64 (fp, "stick_remainder", stick_remainder);
}

int WorkerThread::restore (char * line)
{
    if      (sscanf (line, "GlobalTimeUsecs %lli", &GlobalTimeUsecs) == 1) ;
    else if (sscanf (line, "GlobalTicks %lli",     &GlobalTicks)     == 1) ;
    else if (sscanf (line, "stick_remainder %lli", &stick_remainder) == 1) ;

    // Note that the following two lines restore the same variable. This
    // reflects a rename from instr_till_next_stick_update to step_remainder,
    // done when the variable took on its broader purpose in execution-driven
    // runs. The second restore is kept for backward compatibility, so older
    // checkpoints can still be restored. Please do not remove that line
    // unless you are sure no checkpoint created before 12/13/2007 exists,
    // or they have all been patched to rename this variable. Only one of
    // the two lines will ever match, depending on when the checkpoint was
    // taken.
    else if (sscanf (line, "step_remainder %lli", &step_remainder) == 1) ;
    else if (sscanf (line, "instr_till_next_stick_update %lli", &step_remainder) == 1) ;

    else return FALSE;

    return TRUE;
}

void WorkerThread::create_worker_threads (int NumCpus, int cpusPerThread, int numThreads)
{
    static int FIRST_TIME = 1;   // init flag

    int NumEnabled   = 0;
    int NumPerThread = 0;
    int NumModThread = 0;

    int newnum = numThreads;

    if (newnum == -1) {
        if (cpusPerThread == -1) {
            newnum = 1;
        } else {
            newnum = NumCpus / cpusPerThread;
        }
    }

    if (newnum <= 0) {
        ui->warning("invalid conf numthds, using 1\n");
        newnum = 1;
    } else if (newnum > 64) {
        ui->warning("invalid conf numthds, using 64\n");
        newnum = 64;
    }

    NumEnabled = 0;
    for (int i = 0; i <= g_vcpu_id_max; i++)
        if (get_vcpu(i) && cpu_enabled[i])
            NumEnabled++;

    if (NumEnabled == 0) {
        ui->warning("Invalid: 0 enabled cpus, using 1\n");
        NumEnabled = 1;
    }

    NumPerThread = NumEnabled / newnum;
    NumModThread = NumEnabled % newnum;
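
    // Example of the resulting distribution (numbers assumed): with
    // NumEnabled = 7 and newnum = 3 worker threads, NumPerThread = 2 and
    // NumModThread = 1, so the threads get 3, 2 and 2 cpus respectively:
    // the first NumModThread threads each take one extra cpu (see the
    // "NumPerThread + (i < NumModThread)" assignment below).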

    if (FIRST_TIME) {
        sema_init (&wrkthdDONE, 0, USYNC_THREAD, NULL);
        // eventque = new EventQue();
        assert(pthread_key_create(&key, 0) == 0);
        FIRST_TIME = 0;
    }

    if (newnum != WorkerThread::numThds || cpu_enable_changed) {

        if (newnum > HOSTINFO_numcpus()) {
            ui->warning("not enough host-cpus(%d) for sim-threads(%d)\n",
                        HOSTINFO_numcpus(), newnum);
        }

        // retrieve any existing events on the current worker-thread eq's
        std::list<Event_t *> eventlist;
        for (int i = 0; i < numThds; i++) {
            while (!wrkthds[i].eq->empty()) {
                Event_t *event = new Event_t;
                wrkthds[i].eq->get_top (event);
                eventlist.push_front(event);
            }
        }

        kill_worker_threads ();
        wrkthds = new WorkerThread[newnum];
        for (int i = 0; i < newnum; i++)
            wrkthds[i].worker_id = i;

        // put all events in eventlist on the new wrkthd 0 event queue
        while (!eventlist.empty()) {
            Event_t * e = eventlist.front();
            wrkthds[0].eq->insert_callback(e->pq_priority,
                    e->pq_cbfunc, e->pq_cbarg1, e->pq_cbarg2,
                    e->pq_cbunload, e->dbgstring, e->worker_id);
            eventlist.pop_front();
            delete e;
        }

        // reassign vcpus to the worker threads
        int nextcpu = 0;
        first_cpu = 0;
        for (int i = 0; i < newnum; i++) {
            // number of vcpus for this worker thread
            wrkthds[i].num_cpus = NumPerThread + (i < NumModThread);

            for (int j = 0; j < wrkthds[i].num_cpus; j++)
            {
                // find the next enabled vcpu
                Vcpu* vcpu = get_vcpu(nextcpu);
                for (int k = nextcpu; k <= g_vcpu_id_max; k++)
                {
                    if (!vcpu || !cpu_enabled[nextcpu])   // skip disabled cpus
                        vcpu = get_vcpu(++nextcpu);
                    else
                        break;
                }

                if (first_cpu == 0)
                    first_cpu = vcpu;

                wrkthds[i].cpus[j] = vcpu;
                nextcpu++;
            }
        }

        for (int i = 0; i < newnum; i++) {
            ui->verbose("cpu-worker-thread[%d] ", i);
            wrkthds[i].info();
        }

        WorkerThread::numThds = newnum;
        cpu_enable_changed = false;
    }
}