// ========== Copyright Header Begin ==========================================
// OpenSPARC T2 Processor File: workerthreads.cc
// Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
// The above named program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public
// License version 2 as published by the Free Software Foundation.
// The above named program is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
// You should have received a copy of the GNU General Public
// License along with this work; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
// ========== Copyright Header End ============================================
/*
 * Copyright (C) 2001,2005 Sun Microsystems, Inc.
 */
#pragma ident "%%1.27 06/12/14 %%"

/*
 * Supports multiple cpu simulation threads that "system.cc" can signal
 * and wait for, i.e. threads that live for the duration of blaze rather
 * than being created/destroyed for each UI command.
 *
 * These threads can, however, be destroyed and re-created, because we
 * support changing numthreads (which indirectly changes cpus-per-thread),
 * as well as changing the enabled state of individual cpus (which changes
 * the number of cpus to run, and hence the distribution of running cpus
 * across the worker threads).
 */
#include "workerthread.h"
extern int blaze_debug;                 /* main/ui_cmds */
// Vcpu0 invokes this doneftn_t callback, with its void * arg, once all the
// worker threads have finished their required execution.
doneftn_t volatile wrkthdCBFunc = NULL;   /* NB: one-shot callback ftn */
void * volatile    wrkthdCBArg  = NULL;   /* and its callback arg */
// system level event queue
// static EventQue *eventque;
sema_t wrkthdDONE;      // DONE semaphore. The Vcpu0 worker thread signals on
                        // this semaphore when all the worker threads are done.
                        // UI threads typically block on this sema while
                        // waiting for stepi/stept to finish.
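// Illustrative sketch (not part of the original source) of how a UI-side
// caller is expected to use this completion protocol; kick_workers() is a
// hypothetical stand-in for however the run is actually started:
//
//     wrkthdCBArg  = (void *) my_arg;     // install the one-shot callback
//     wrkthdCBFunc = my_done_ftn;
//     kick_workers ();                    // start the stepi/stept run
//     sema_wait (&wrkthdDONE);            // block until Vcpu0 posts DONE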
extern void   write_scalar_64 (FILE *fp, const char *name, uint64_t v);
extern bool_t read_scalar_64  (FILE *fp, const char *name, uint64_t *v);
static inline void breakpoint_hit (int cpu_id)
{
    ui->output("cpu[%i] hit a breakpoint. stop.. \n", cpu_id);
}
volatile int WorkerThread::BarrierCount = 0;
volatile int WorkerThread::BarrierLock  = 0;
volatile int WorkerThread::BarrierTemp  = 0;

int64_t WorkerThread::Nusecs  = 0;
int64_t WorkerThread::Ninstrs = 0;
int64_t WorkerThread::Ncycles = 0;
// step_remainder holds the number of instructions (in normal mode) or
// cycles (in execution-driven mode) remaining before the next stick update
// needs to happen. It is typically nonzero when a simulation is stopped
// between two stick updates (normally 1 microsecond of simulated time apart).
// Implementation note: this variable used to be called
// instrs_till_next_stick_update before it was also needed in
// execution-driven mode.
volatile uint64_t WorkerThread::step_remainder = 0;

volatile int   WorkerThread::numThds = 0;
WorkerThread * WorkerThread::wrkthds = 0;
pthread_key_t  WorkerThread::key     = 0;

volatile uint64_t WorkerThread::GlobalTimeUsecs = 0;
volatile int64_t  WorkerThread::GlobalTicks     = 0;
volatile uint64_t WorkerThread::stick_incr      = 0;
volatile uint64_t WorkerThread::stick_remainder = 0;

/* static */ Vcpu * WorkerThread::first_cpu = 0;  // first cpu has some special work to do

volatile int64_t WorkerThread::u_intervals = 0;
volatile int64_t WorkerThread::k_intervals = 0;
volatile int64_t WorkerThread::u_instrs    = 0;
volatile int64_t WorkerThread::k_instrs    = 0;
void WorkerThread::kill_worker_threads ()
{
    while (sema_trywait (&wrkthdDONE) == 0)
        ;                                       // "reset" just in case

    for (int i = 0; i < numThds; i++)
int WorkerThread::barrier()
{
    atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                   &blaze_stop_request, &BarrierTemp);

    GlobalTimeUsecs = ++simTime;

    if (first_cpu->config.trace_on) {

    int ret = atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                             &blaze_stop_request, &BarrierTemp);
    if (cpus[0] == first_cpu) {                 // VCPU0
        // this is *only* used for the "perf" UI command
        //@@@ and is now broken when umips != kmips @@@
        GlobalTicks += the_arch.mips;

        // all vcpu's execute update_stick() with the same stick_incr because
        // of the first atomic_barrier at the entrance of this function
        stick_incr       = the_arch.stick_freq / 1000000ull;
        stick_remainder += the_arch.stick_freq % 1000000ull;
        if (stick_remainder >= 1000000ull) {
            // fold the accumulated drift back into stick_incr once
            // stick_remainder exceeds one microsecond's worth of stick ticks
            stick_incr      += stick_remainder / 1000000ull;
            stick_remainder %= 1000000ull;
        }
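        // Worked example (illustrative, not from the original source): with
        // an assumed stick_freq of 1,500,000,500 Hz, each 1-usec interval adds
        //     stick_incr       = 1500000500 / 1000000 = 1500 ticks
        //     stick_remainder += 1500000500 % 1000000 = 500
        // so after 2000 intervals stick_remainder reaches 1,000,000, one extra
        // tick is folded into stick_incr, and the remainder wraps back to 0,
        // keeping the long-run stick rate exact despite the integer division.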
static inline int pstate_is_userp (Vcpu * vcpu)
{
    uint64_t tl, tt, tstate0;                       // special case TL > 0

    vcpu->get_reg (VCPU_PR_TL, &tl);
    if (tl > 0) vcpu->get_reg (VCPU_PR_TT, &tt);
    if (tl > 0 && ((tt >= 0x064 && tt <= 0x06f)     // tlb miss
                || (tt >= 0x080 && tt <= 0x0ff))) { // win spill/fill
        vcpu->get_reg (VCPU_PR_TSTATE, &tstate0);
        if (tstate0 & (0x0004 << 8)) return 0;
    }

    uint64_t pstate;                                // regular case TL == 0
    vcpu->get_reg (VCPU_PR_PSTATE, &pstate);
    return (pstate & 4) ? 0 : 1;
}
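// For reference: PSTATE.PRIV is bit 2 (mask 0x4), and TSTATE holds the
// trap-time PSTATE shifted up by 8 bits, which is why the TL > 0 path above
// tests (0x0004 << 8) while the TL == 0 path tests (pstate & 4).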
static inline int get_mmu_cntx (Vcpu * vcpu)
{
    /* Hack for sw05b: the kcfd process runs in kernel mode and with      */
    /* primary mmu context 0, but it can be identified by its             */
    /* secondary mmu context, which is unique to kcfd (although you       */
    /* do have to be clever to figure out what that value is!)            */
    uint64_t data;                                      /* Cheetah (only?) */

    (void) vcpu->get_asi (0x58, 0x10ll, data);    /* I/D MMU Secondary cntx */
static const uint64_t pstate_priv_mask = 0x2ull;
void WorkerThread::stept (uint64_t usecs)
{
    if (SYSTEM_in_execution_driven_mode()) {
        stepc(usecs * the_arch.cpu_freq / 1000000ull);

    // stepi and stept have been intermixed:
    // execute the leftover instructions before simulating 'usecs'
    called_from_stept = true;
    // If the conditional stepi() call above hit a breakpoint, we should not
    // enter the for loop below.
    // @@@ this needs to be re-thought; should there be a barrier here? @@@
    if (!IN_STOP_STATE(barrier ())) {
        uint64_t kmips = the_arch.kmips;    /* re-fetch volatiles every iter */
        uint64_t umips = the_arch.umips;
        uint64_t cmips = the_arch_cmips;
        int      ccntx = the_arch_ccntx;

        if (the_arch.roundrobin && (num_cpus > 1)) {
            // round-robin algorithm, with variable user/kernel mips:
            // 1. derive the user and kernel CPIs from the configured mips
            //    and cpu freq
            // 2. when a strand steps by 1 instruction, an associated cycle
            //    counter is incremented by that CPI
            // 3. round-robin over all strands, advancing each strand until it
            //    catches up with the other strands on this worker-thread
            // For accuracy, CPI cannot be limited to whole numbers, so we use
            // fixed-point arithmetic with 10 bits of fractional precision to
            // avoid FP ops in the critical loop. All quantities named
            // *_x_1024 represent such fixed-point values (a worked example
            // follows the CPI setup below).
            int64_t mhz = the_arch.cpu_freq / 1e+6;  // NOTE: cpu freq must be in whole megahertz for accuracy
            int64_t kernel_cpi_x_1024 = (mhz << 10) / kmips;
            int64_t user_cpi_x_1024   = (mhz << 10) / umips;
            int64_t mhz_x_1024        = mhz << 10;
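            // Worked example (illustrative, not from the original source):
            // assuming cpu_freq = 1000 MHz, kmips = 250 and umips = 500:
            //     kernel_cpi_x_1024 = (1000 << 10) / 250 = 4096  (CPI 4.0)
            //     user_cpi_x_1024   = (1000 << 10) / 500 = 2048  (CPI 2.0)
            //     mhz_x_1024        = 1024000, i.e. one usec worth of cycles
            // so a strand running kernel code reaches the 1-usec threshold
            // after 250 stepi(1) calls, while one running user code takes 500.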
            for (int usec = 0; usec < usecs; usec++) {
                for (cpuid = 0; cpuid < num_cpus; cpuid++) {

                int64_t cdone_x_1024 = 0;           // workerthread cycles done

                // go round-robin while the cycle counter represents less than a usec
                while (cdone_x_1024 < mhz_x_1024) {
                    int64_t cmin_x_1024 = LLONG_MAX;  // min of cycles done across all cpus

                    for (cpuid = 0; cpuid < num_cpus; cpuid++) {
                        cpus[cpuid]->get_reg(VCPU_PR_PSTATE, &pstate);
                        int64_t cpi_x_1024 = (pstate & pstate_priv_mask) ?
                                              kernel_cpi_x_1024 : user_cpi_x_1024;

                        // step this cpu as long as its cycle counter does not exceed all other cpus
                        while (done_x_1024[cpuid] <= cdone_x_1024) {
                            int rslt = cpus[cpuid]->stepi(1);
                            done_x_1024[cpuid] += cpi_x_1024;

                            BLAZE_STOP(blaze_stop_request);
                            breakpoint_hit (cpus[cpuid]->id());
                        } // while cycle counter not caught up with rest of the strands on this wt

                        if (done_x_1024[cpuid] < cmin_x_1024)
                            cmin_x_1024 = done_x_1024[cpuid];

                    // move up the threshold for the next round-robin interval
                    cdone_x_1024 = cmin_x_1024;
                } // while <mhz> cycles not done
                for (cpuid = 0; cpuid < num_cpus; cpuid++) {
                    cpus[cpuid]->update_stick(stick_incr);
                }

                if (IN_STOP_STATE(barrier ()))
        } else {                                // chunky mode: not round robin
            for (int i = 0; i < usecs; i++) {
                for (int j = 0; j < num_cpus; j++) {

                    if (pstate_is_userp (vcpu)) {
                        rslt = vcpu->stepi (umips);
                        atomic_add_64 (&u_intervals, 1);
                        atomic_add_64 (&u_instrs, umips);

                    if (ccntx != 0 && get_mmu_cntx (vcpu) == ccntx) {
                        rslt = vcpu->stepi (cmips);
                        atomic_add_64 (&k_intervals, 1);
                        atomic_add_64 (&k_instrs, cmips);

                        rslt = vcpu->stepi (kmips);
                        atomic_add_64 (&k_intervals, 1);
                        atomic_add_64 (&k_instrs, kmips);

                    vcpu->update_stick(stick_incr);

                    BLAZE_STOP(blaze_stop_request);
                    breakpoint_hit (vcpu->id ());

                if (IN_STOP_STATE(barrier ()))
        // @@@ we _should_ be able to eliminate this barrier, but then sam
        // crashes, and I haven't had time to figure out why @@@
        atomic_barrier (&numThds, &BarrierCount, &BarrierLock,
                        &blaze_stop_request, &BarrierTemp);

        if (cpus[0] == first_cpu) {             // VCPU 0 does some extra work...
            if (IN_STOP_STATE(blaze_stop_request)) {        // stop request -> stop
                BLAZE_STOP(blaze_run_state);
                BLAZE_CLEAR(blaze_stop_request);
            } else if (IN_GTSTEP_STATE(blaze_run_state))    // time sync -> wait
                BLAZE_GTWAIT(blaze_run_state);
            else                                            // anything else -> stop
                BLAZE_STOP(blaze_run_state);

            if (wrkthdCBFunc != NULL) {         // guard the one-shot callback
                doneftn_t TmpFunc = wrkthdCBFunc;
                wrkthdCBFunc = NULL;            // reset first to avoid race !
                (*TmpFunc)(wrkthdCBArg);        // then `do' callback !
            }

            sema_post (&wrkthdDONE);            // ------------------- DONE !!!
void WorkerThread::stepi (uint64_t n)
{
    uint64_t MIPS = the_arch.mips;  // instructions before every sync
    uint64_t loops;                 // number of 1 usec loops to execute
    uint64_t leftover_to_execute;   // number of instructions left from n
                                    // that have to be executed before return
                                    // w/o updating the STICK
    uint64_t leftovers;             // number of instructions that have to be
                                    // executed next time stepi is called
                                    // before updating STICK. leftovers < MIPS
    bool update = false;            // should we update STICK in this call ?
                                    // (since n could be less than leftovers)

    // loops * MIPS + leftover_to_execute + step_remainder == n
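    // Worked example (illustrative, not from the original source): assuming
    // MIPS = 1000, a step_remainder of 250 carried over from an earlier call,
    // and n = 3850, one split consistent with the invariant above is
    //     loops = 3 and leftover_to_execute = 600,
    // since 3*1000 + 600 + 250 == 3850: the 250 carried-over instructions run
    // first and end with a STICK update, each of the 3 full loops runs 1000
    // instructions and ends with a STICK update, the final 600 run without
    // one, and leftovers = 1000 - 600 = 400 is carried to the next call.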
    leftovers = step_remainder - n;

    leftover_to_execute = n % MIPS;
    leftovers = MIPS - n % MIPS;

    // do not update if step_remainder was 0

    // update STICK after executing 'step_remainder'
    for (int j = 0; j < num_cpus; j++) {
        if (cpus[j]->stepi(step_remainder) != 0) {
            // have hit a breakpoint on this strand or cpu, so
            BLAZE_STOP(blaze_stop_request);
            breakpoint_hit(cpus[j]->id());
        }
        cpus[j]->update_stick(stick_incr);
    }

    int sam_state = barrier();
    if (IN_STOP_STATE(sam_state))
        // a stop request has come either through the UI, or some strand has
        // hit a breakpoint; do not wait to complete the rest of the usecs
    for (int i = 0; i < loops; i++) {
        for (int j = 0; j < num_cpus; j++) {
            if (cpus[j]->stepi(MIPS) != 0) {
                // have hit a breakpoint on this strand or cpu
                BLAZE_STOP(blaze_stop_request);
                breakpoint_hit(cpus[j]->id());
                // don't break; let the other cpus on this workerthread
                // complete their required number of instructions
            }
            cpus[j]->update_stick(stick_incr);
        }

        if (IN_STOP_STATE(sam_state))
            // a stop request has come either through the UI, or some strand
            // has hit a breakpoint; do not wait to complete the rest of the usecs
    // execute the rest of the instructions w/o updating STICK
    for (int j = 0; j < num_cpus; j++) {
        if (cpus[j]->stepi(leftover_to_execute) != 0) {
            // have hit a breakpoint on this strand or cpu
            BLAZE_STOP(blaze_stop_request);
            breakpoint_hit(cpus[j]->id());
            // don't break; let the other cpus on this workerthread complete
            // their required number of instructions
        }
    }

    atomic_barrier(&numThds, &BarrierCount, &BarrierLock,
                   &blaze_stop_request, &BarrierTemp);
    if (cpus[0] == first_cpu) {
        step_remainder = leftovers;

        called_from_stept = false;          // do not want to stop here

        if (IN_STOP_STATE(blaze_stop_request)) {         // stop request -> stop
            BLAZE_STOP(blaze_run_state);
            BLAZE_CLEAR(blaze_stop_request);
        } else if (IN_GTSTEP_STATE(blaze_run_state))     // time sync -> wait
            BLAZE_GTWAIT(blaze_run_state);
        else                                             // anything else -> stop
            BLAZE_STOP(blaze_run_state);

        if (wrkthdCBFunc != NULL) {         // guard the one-shot callback
            doneftn_t TmpFunc = wrkthdCBFunc;
            wrkthdCBFunc = NULL;            // reset first to avoid race !
            (*TmpFunc)(wrkthdCBArg);        // then `do' callback !
        }

        sema_post (&wrkthdDONE);            // ---------- DONE !!!
// stepc is called in exec-driven mode
void WorkerThread::stepc (int64_t ncycles)
{
    int64_t cycles_per_usec = the_arch.cpu_freq / 1000000ull;

    for (i = step_remainder; i <= ncycles; i++) {
        int rslt = g_cpu_ex_intf.cycle(1);

        if (step_remainder == 0) {
            for (int j = 0; j < num_cpus; j++) {
                vcpu->update_stick(stick_incr);
            }
            step_remainder = cycles_per_usec;
        if (IN_STOP_STATE(barrier())) {
            BLAZE_STOP(blaze_stop_request);
        }

    // remember cycles left over until next sync

    if (IN_STOP_STATE(blaze_stop_request)) {             // stop request -> stop
        BLAZE_STOP(blaze_run_state);
        BLAZE_CLEAR(blaze_stop_request);
    } else if (IN_GTSTEP_STATE(blaze_run_state))         // time sync -> wait
        BLAZE_GTWAIT(blaze_run_state);
    else                                                 // anything else -> stop
        BLAZE_STOP(blaze_run_state);

    if (wrkthdCBFunc != NULL) {             // guard the one-shot callback
        doneftn_t TmpFunc = wrkthdCBFunc;
        wrkthdCBFunc = NULL;                // reset first to avoid race !
        (*TmpFunc)(wrkthdCBArg);            // then `do' callback !
    }

    sema_post (&wrkthdDONE);                // ------------------- DONE !!!
}  // void WorkerThread::stepc()
void WorkerThread::dump (FILE * fp)
{
    write_scalar_64 (fp, "GlobalTimeUsecs", GlobalTimeUsecs);
    write_scalar_64 (fp, "GlobalTicks",     GlobalTicks);
    write_scalar_64 (fp, "step_remainder",  step_remainder);
    write_scalar_64 (fp, "stick_remainder", stick_remainder);
}
int WorkerThread::restore (char * line)
{
    if      (sscanf (line, "GlobalTimeUsecs %lli", &GlobalTimeUsecs) == 1) ;
    else if (sscanf (line, "GlobalTicks %lli",     &GlobalTicks)     == 1) ;
    else if (sscanf (line, "stick_remainder %lli", &stick_remainder) == 1) ;

    // Note that the following two lines restore the same variable. This
    // reflects a rename of the variable from instr_till_next_stick_update to
    // step_remainder, to match its broader purpose in execution-driven runs.
    // The second restore has been left in for backward compatibility, so that
    // older checkpoints can still be restored. Please do not remove that line
    // unless you are sure that no checkpoint created prior to 12/13/2007
    // exists, or that they have all been patched to rename this variable.
    // Only one of those lines will ever get executed, depending on when the
    // checkpoint was created.
    else if (sscanf (line, "step_remainder %lli",               &step_remainder) == 1) ;
    else if (sscanf (line, "instr_till_next_stick_update %lli", &step_remainder) == 1) ;
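    // Illustrative sketch (not part of the original source) of the checkpoint
    // lines dump() emits and restore() parses back, assuming write_scalar_64
    // writes "<name> <value>" on a single line:
    //
    //     GlobalTimeUsecs 8273645
    //     GlobalTicks 4132822500
    //     step_remainder 713
    //     stick_remainder 250000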
void WorkerThread::create_worker_threads (int NumCpus, int cpusPerThread,
                                          int numThreads)
{
    static int FIRST_TIME = 1;          // init flag

    if (cpusPerThread == -1) {

    newnum = NumCpus / cpusPerThread;

        ui->warning("invalid conf numthds, using 1\n");
    } else if (newnum > 64) {
        ui->warning("invalid conf numthds, using 64\n");
    for (int i = 0; i <= g_vcpu_id_max; i++)
        if (get_vcpu(i) && cpu_enabled[i])

    ui->warning("Invalid: 0 enabled cpus, using 1\n");

    NumPerThread = NumEnabled / newnum;
    NumModThread = NumEnabled % newnum;
    sema_init (&wrkthdDONE, 0, USYNC_THREAD, NULL);
    // eventque = new EventQue();

    assert(pthread_key_create(&key, 0) == 0);

    if (newnum != WorkerThread::numThds || cpu_enable_changed) {

        if (newnum > HOSTINFO_numcpus()) {
            ui->warning("not enough host-cpus(%d) for sim-threads(%d)\n",
                        HOSTINFO_numcpus(), newnum);
        }
        // retrieve any existing events on the current workerthread eq's
        std::list<Event_t *> eventlist;
        for (int i = 0; i < numThds; i++) {
            while (!wrkthds[i].eq->empty()) {
                Event_t *event = new Event_t;
                wrkthds[i].eq->get_top (event);
                eventlist.push_front(event);
            }
        }
        wrkthds = new WorkerThread[newnum];
        for (int i = 0; i < newnum; i++)
            wrkthds[i].worker_id = i;
        // put all the events in eventlist on the new wrkthd 0 event queue
        while (!eventlist.empty()) {
            Event_t * e = eventlist.front();
            wrkthds[0].eq->insert_callback(e->pq_priority,
                                           e->pq_cbfunc, e->pq_cbarg1, e->pq_cbarg2,
                                           e->pq_cbunload, e->dbgstring, e->worker_id);
            eventlist.pop_front();      // remove the event so the loop terminates
        }
        // reassign vcpu's to the worker threads
        for (int i = 0; i < newnum; i++) {
            // number of vcpu's for this worker thread
            wrkthds[i].num_cpus = NumPerThread + (i < NumModThread);

            for (int j = 0; j < wrkthds[i].num_cpus; j++) {
                Vcpu * vcpu = get_vcpu(nextcpu);
                for (int k = nextcpu; k <= g_vcpu_id_max; k++)
                    if (!vcpu || !cpu_enabled[nextcpu])     // skip not enabled
                        vcpu = get_vcpu(++nextcpu);

                wrkthds[i].cpus[j] = vcpu;
        for (int i = 0; i < newnum; i++) {
            ui->verbose("cpu-worker-thread[%d] ", i);

    WorkerThread::numThds = newnum;
    cpu_enable_changed = false;