// Initial commit of OpenSPARC T2 architecture model.
// [OpenSPARC-T2-SAM] / sam-t2 / sam / cpus / vonk / ss / api / memsync / src / MemorySync.cc
// ========== Copyright Header Begin ==========================================
//
// OpenSPARC T2 Processor File: MemorySync.cc
// Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
//
// The above named program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public
// License version 2 as published by the Free Software Foundation.
//
// The above named program is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public
// License along with this work; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
//
// ========== Copyright Header End ============================================
/************************************************************************
**
** Copyright (C) 2002, Sun Microsystems, Inc.
**
** Sun considers its source code as an unpublished, proprietary
** trade secret and it is available only under strict license provisions.
** This copyright notice is placed here only to protect Sun in the event
** the source is deemed a published work. Disassembly, decompilation,
** or other means of reducing the object code to human readable form
** is prohibited by the license agreement under which this code is
** provided to the user or company in possession of this copy."
**
*************************************************************************/
#include "MemorySync.h"
#include <sstream>
#include <cstdlib>
#include <time.h>
#include "MemoryTransaction.h"
#include "SS_Strand.h"
using namespace std;
////////////////////////////////////////////////
// Pointer-to-member type for MemoryTransaction handler methods.
// NOTE(review): appears unused in this portion of the file -- presumably kept
// for table-driven dispatch; confirm before removing.
typedef void (MemorySync::* mem_type)(MemoryTransaction&);
// static variable: process-wide handle to the first MemorySync instance
// constructed; consumed by the static dumpBuffers() entry point below.
MemorySync* MemorySync::msyncObj = NULL;
// static function, dump buffer content
string MemorySync::dumpBuffers()
{
    // Delegate to the registered singleton instance; an empty string is
    // returned when no MemorySync object has been constructed yet.
    MemorySync* instance = MemorySync::msyncObj;
    return instance ? instance->toString() : string();
}
// Default constructor: clears only the address-translation callback and the
// socket handle.
// NOTE(review): the per-strand buffer pointers (ldb_, stb_, retStb_, rmoStb_,
// ifb_, iseq_) and tsoChecker_ are left uninitialized here -- confirm that a
// default-constructed MemorySync is never used before being replaced by one
// built with the main constructor.
MemorySync::MemorySync() : addrTrans(NULL), socket(NULL)
{
}
// Main constructor.
//
// max_strands      - number of strands in the simulated system
// strands_per_core - strands per core
// cores_per_cpu    - cores per cpu
// memDebug         - debug verbosity, propagated to MemorySyncMessage
// tsoChecker       - when > 0 (and callbacks are enabled) instantiate the
//                    TSO checker; heavy in multi-strand runs, off by default
// callback         - when > 0 register this object with the SS_Memory/SS_Io
//                    pre/post memory-access hooks
MemorySync::MemorySync(int max_strands, int strands_per_core, int cores_per_cpu, int memDebug, int tsoChecker, int callback) :
    memDebug_(memDebug),
    addrTrans(NULL),
    socket(NULL) // bug fix: was left uninitialized; default ctor sets it NULL
{
    // keep this in sas.log, so it is easier to check whether msync is enabled
    // or not.
    time_t clock;
    time(&clock);
    rif_.setMaxStrands(max_strands);
    rif_.setCoreStrands(strands_per_core);
    rif_.setCpuCores(cores_per_cpu);
    mab_.setRieslingInterface(&rif_);
    // Per-strand buffers are sized by the compile-time maximum (not by
    // max_strands) so strand ids can index them directly.
    ldb_ = new LoadStoreBuffer[MAX_STRANDS];
    stb_ = new LoadStoreBuffer[MAX_STRANDS];
    retStb_ = new LoadStoreBuffer[MAX_STRANDS];
    rmoStb_ = new LoadStoreBuffer[MAX_STRANDS];
    ifb_ = new LoadStoreBuffer[MAX_STRANDS];
    iseq_ = new uint64_t[MAX_STRANDS];
    for (int i = 0; i < MAX_STRANDS; i++) {
        ldb_[i].setBufName((char*) "LoadBuffer");
        stb_[i].setBufName((char*) "StoreBuffer");
        retStb_[i].setBufName((char*) "RetiredBuffer");
        rmoStb_[i].setBufName((char*) "RMOBuffer");
        ifb_[i].setBufName((char*) "FetchBuffer");
        iseq_[i] = 1; // instruction sequence numbering starts at 1
    }
    /**
     * set Memory preMemoryAccess and postMemoryAccess function pointer
     */
    if (callback > 0)
    {
        SS_Memory::memory.msync_object = this;
        SS_Memory::memory.msync_pre_access = pre_memory_access;
        SS_Memory::memory.msync_post_access = post_memory_access;
        SS_Io::io.msync_object = this;
        SS_Io::io.msync_pre_access = pre_memory_access;
        SS_Io::io.msync_post_access = post_memory_access;
    }
    MemorySyncMessage::debugLevel = memDebug;
    if ((tsoChecker > 0) && (callback > 0)) {
        // tsoChecker can become heavy in a multi-strand environment, use it
        // with caution. By default tsoChecker is off, use
        // -sas_run_args=-DTSO_CHECKER to enable it.
        tsoChecker_ = new TsoChecker();
        tsoChecker_->init(max_strands);
    }
    else {
        tsoChecker_ = NULL;
    }
    // The first instance constructed becomes the process-wide handle used by
    // the static dumpBuffers() entry point.
    if (MemorySync::msyncObj == NULL) {
        MemorySync::msyncObj = this;
    }
    // by default, only core_0 of node_0 is enabled and is able to produce
    // cache invalidation request.
    inv_vec_mask[0] = 0x1;
    for (int i = 1; i < INV_VEC_SIZE; i++)
        inv_vec_mask[i] = 0x0;
}
////////////////////////////////////////////////
// Copy constructor: a no-op that copies no members, leaving the new object's
// pointer members indeterminate.
// NOTE(review): copying a MemorySync neither shares nor duplicates its
// buffers -- confirm this is never actually invoked (declaring it private
// would make accidental copies a compile error).
MemorySync::MemorySync( const MemorySync & orig )
{
}
////////////////////////////////////////////////
// Destructor.
// NOTE(review): the arrays allocated by the main constructor (ldb_, stb_,
// retStb_, rmoStb_, ifb_, iseq_) and tsoChecker_ are never released here,
// so destroying a MemorySync leaks them. Left unchanged because the default
// constructor leaves those pointers uninitialized, making an unconditional
// delete[] unsafe; in practice the object lives for the whole simulation.
MemorySync::~MemorySync()
{
}
////////////////////////////////////////////////
// Assignment operator: generated stub that was never implemented -- it
// copies no state and simply returns *this. Callers must not rely on it.
const MemorySync &
MemorySync::operator=( const MemorySync & rhs )
{
// Replace the following line with your function body.
// RIESLING_THROW_DOMAIN_ERROR( "Unimplemented function." );
return *this;
}
////////////////////////////////////////////////
// Equality operator: generated stub that was never implemented -- it always
// returns false regardless of the operands. Callers must not rely on it.
bool
MemorySync::operator==( const MemorySync & rhs ) const
{
// Replace the following line with your function body.
// RIESLING_THROW_DOMAIN_ERROR( "Unimplemented function." );
return false;
}
////////////////////////////////////////////////
// Render the MAB followed by every non-empty per-strand buffer. For each
// strand the buffers are printed in declaration order:
// ldb, stb, retStb, rmoStb, ifb -- identical to the original output order.
string
MemorySync::toString() const
{
    ostringstream out;
    out << "mab: " << mab_.toString();
    const char* const labels[] = { "ldb", "stb", "retStb", "rmoStb", "ifb" };
    LoadStoreBuffer* const groups[] = { ldb_, stb_, retStb_, rmoStb_, ifb_ };
    const int ngroups = (int) (sizeof(groups) / sizeof(groups[0]));
    for (int strand = 0; strand < MAX_STRANDS; strand++) {
        for (int g = 0; g < ngroups; g++) {
            if (groups[g][strand].size() > 0) {
                out << labels[g] << "[" << strand << "]: "
                    << groups[g][strand].toString();
            }
        }
    }
    return out.str();
}
/******************************************************************************
 * Record a LoadIssue in the per-strand load buffer.
 *
 * This implementation assumes each load sends only one LoadIssue command.
 * If an environment prefers to send 8 LoadIssues for a Block-Load and 2
 * LoadIssues for a Quad-Load, then the code can be simplified to handle all
 * types of load like the default case.
 *
 * Bug fix: the original heap-allocated each LoadStoreEntry with `new` and
 * then passed it to pushBack(), which copies the entry into the buffer; the
 * heap object was never freed (one leak per load). Entries are now
 * stack-allocated.
 ******************************************************************************/
void
MemorySync::handleLoadIssue(LoadStoreCmd& cmd)
{
    // Optionally remap the physical address first.
    if (addrTrans)
    {
        cmd.setAddr(addrTrans(cmd.getAddr()));
    }
    list<LoadStoreEntry>::iterator ii;
    LoadStoreBuffer& ldb = ldb_[cmd.getThrdId()];
#ifdef N2MODEL
    /******************************************************************************
     * In general, the RTL should send the data source along with the LoadData.
     * However, N2 made a special request to also check STB bypassing in Memory
     * Sync model for double check RTL function. It was told to check this STB
     * bypassing when receiving LoadData command. However, it turns out that the
     * RTL actually checks STB bypassing at LoadIssue. (5/17/04). Hence, this
     * function is moved to here. This is one case of drawback that a model is too
     * microarchitecture-dependent. Accompany with this change, they also change
     * their STB bypass check criteria. Hence, my code.
     ******************************************************************************/
    list<LoadStoreEntry>::iterator ste;
    LoadStoreBuffer& stb = stb_[cmd.getThrdId()];
    // IO, block, double, quad and atomic loads never bypass from the STB.
    if (!cmd.isIO() &&
        cmd.getItype() != ITYPE_BLOCK_LOAD &&
        cmd.getItype() != ITYPE_DOUBLE_LOAD && // added 6/1/04 TPS
        cmd.getItype() != ITYPE_QUAD_LOAD &&
        cmd.getItype() != ITYPE_ATOMIC ) {
        ste = stb.findN2DataBypassing(cmd.getAddr(), cmd.getSizeV());
        if (ste != stb.end()) { // found in STB
            // if (cmd.getDsrc() != DSRC_STB) {
            //   MS_ERROR("MemorySync found the data in STB; while RTL's data is from %s", mmdsrc[cmd.getDsrc()]);
            // }
            cmd.setDsrc(DSRC_STB);
        } else { // else find data from L2_MEMORY
            // if (cmd.getDsrc() == DSRC_STB) {
            //   MS_ERROR("RTL indicates data is from STB; while MemorySync does not find the data in STB");
            // }
            cmd.setDsrc(DSRC_NONE);
        }
    }
#endif
    // uint64_t iseq = iSeq(cmd.getThrdId());
    uint64_t iseq;
    switch (cmd.getItype()) {
    case ITYPE_BLOCK_LOAD:
        // A block load is modeled as 8 consecutive 8-byte loads; each
        // sub-load gets its own instruction sequence number.
        cmd.setSize(8);
        for (int i = 0; i < 8; i++) {
            LoadStoreEntry entry(cmd); // pushBack() copies; no heap alloc
            entry.setIseq(iSeq(cmd.getThrdId())); // each entry set different iseq
            ii = ldb.pushBack(entry);
            cmd.setAddr(cmd.getAddr() + 8ull);
            MSYNC_DEBUG(2, "Last Enter to LDB %s", ii->toString().c_str());
        }
        break;
    case ITYPE_QUAD_LOAD:
        // A quad load becomes 2 x 8-byte entries sharing one sequence number.
        iseq = iSeq(cmd.getThrdId());
        cmd.setSize(8);
        for (int i = 0; i < 2; i++) {
            LoadStoreEntry entry(cmd);
            entry.setIseq(iseq); // each entry set same iseq
            ii = ldb.pushBack(entry);
            cmd.setAddr(cmd.getAddr() + 8ull);
            MSYNC_DEBUG(2, "Last Enter to LDB %s", ii->toString().c_str());
        }
        break;
    default:
        {
            LoadStoreEntry entry(cmd);
            entry.setIseq(iSeq(cmd.getThrdId()));
            ii = ldb.pushBack(entry);
            MSYNC_DEBUG(2, "Last Enter to LDB %s", ii->toString().c_str());
        }
        break;
    }
}
// Handle a LoadData command: match it against the oldest un-linked LoadIssue
// in the strand's load buffer, create the corresponding MemoryAccessEntry
// record(s) in the MAB, and resolve the load's data according to its data
// source (STB bypass, L1 hit, or L2/memory).
//
// Bug fix: the original heap-allocated each MemoryAccessEntry with `new` and
// passed it to pushBack(), which copies the entry into the MAB; the heap
// object was never freed (one leak per LoadData). Entries are now
// stack-allocated.
void
MemorySync::handleLoadData(LoadStoreCmd& cmd)
{
    if (addrTrans)
    {
        cmd.setAddr(addrTrans(cmd.getAddr()));
    }
    list<MemoryAccessEntry>::iterator mlink;
    list<LoadStoreEntry>::iterator lde;
    list<LoadStoreEntry>::iterator ste;
    int nMaePerLoadData;
    uint32_t cid = cmd.getCoreId();
    uint32_t tid = cmd.getThrdId();
    enum DATA_SRC dsrc = cmd.getDsrc();
    LoadStoreBuffer& ldb = ldb_[tid];
    /******************************************************************************
     * Find corresponding load instruction in the Load Buffer
     * - Note that <address> and <size> are generally not countable since if the
     *   access is to L2, usually only address bits necessary to access cache lines
     *   are observable in the RTL, so does the size parameter.
     * - <id> is certainly the most dependable way of matching. However, machines
     *   that only allow one outstanding load may consider sending this info over
     *   a un-necessary effort.
     *
     * This implementation searches the load buffer for a LoadData by finding the
     * oldest one that has not yet been associated with a LoadData and whose <id>
     * and <address> match with the <id> and <address> of the LoadData, where the
     * <address> comparison ignores the 3 least significant bits.
     *
     * In case the <id> is not implemented, then the mechanism still works as long
     * as the access to the same cache line (or relax more, the same 8-byte block)
     * results in in-order LoadData. Machines allowing only one outstanding load
     * certainly meet this condition. I believe many machines that allows multiple
     * outstanding loads should also meet this criteria. Note that the environment
     * that does not implement <id> should always set this field 0.
     ******************************************************************************/
    lde = ldb.find1stNoLink (cmd.getId(), cmd.getAddr(), BLK_ADDR_MASK);
    if (lde == ldb_[tid].end())
    {
        MS_ERROR("LoadData failed to find matched LoadIssue. tid=%d PA=%llx", tid, cmd.getAddr());
        return;
    }
    /* Assign size info to the cmd from load buffer so that the MEM_LOAD_DATA
       can have exact data size. This satisfies two purposes: (1) N2 requests
       to remove <size> info from LoadData command since it is hard for them
       to probe correct <size> info in multi-core environment. (6/22/04) (2)
       the TsoChecker needs to have correct <size> info for load. */
    cmd.setSize(lde->getSize());
    cmd.setSizeV(lde->getSizeV());
#ifdef N2MODEL_REMOVE
    /******************************************************************************
     * In general, the RTL should send the data source along with the LoadData.
     * However, N2 made a special request to only indicate whether the data is
     * from L1$ or not. If the data is not from L1$, then the Memory Sync Model
     * will have to find out whether the data is in the store buffer. Also, in
     * N2, one LoadData represent 16 bytes of data for block load and quad load.
     * ^^^^^^^^^^^ This check moves to LoadIssue. See note in LoadIssue. 5/17/04
     ******************************************************************************/
    /******************************************************************************
     * Four types of accesses in N2 will never get data from the store buffer.
     * They are IO, Block Load, Quad Load, and atomic accesses.
     ******************************************************************************/
    if (dsrc == DSRC_L2_MEMORY &&
        !cmd.isIO() &&
        lde->getItype() != ITYPE_BLOCK_LOAD &&
        lde->getItype() != ITYPE_QUAD_LOAD &&
        lde->getItype() != ITYPE_ATOMIC ) {
        ste = stb_[tid].findDataBypassing(cmd.getAddr(), cmd.getSizeV());
        if (ste != stb_[tid].end()) { // find in STB
            dsrc = DSRC_STB;
        } // else find data from L2_MEMORY
    }
#endif
#ifdef N2MODEL
    /* If the STB hit at LoadIssue, then the LoadData data source is ignored. */
    if (lde->getDsrc() == DSRC_STB) {
        if (dsrc != DSRC_STB) {
            MS_ERROR("MemorySync found the data in STB; while RTL's data is from %s", mmdsrc[cmd.getDsrc()]);
            return;
        }
    } else { // else find data from L2_MEMORY
        if (dsrc == DSRC_STB) {
            MS_ERROR("RTL indicates data is from STB; while MemorySync does not find the data in STB");
            return;
        }
    }
    // Block/quad loads carry 16 bytes per LoadData => two MAB entries each.
    if (lde->getItype() == ITYPE_BLOCK_LOAD ||
        lde->getItype() == ITYPE_QUAD_LOAD) {
        nMaePerLoadData = 2;
    } else {
        nMaePerLoadData = 1;
    }
#else
    nMaePerLoadData = 1;
#endif
    // Record the load in the MAB; pushBack() copies the stack object.
    MemoryAccessEntry mae (cmd);
    mae.setItype(lde->getItype());
    mae.setDsrc(dsrc);
    mae.setIseq(lde->getIseq());
    mlink = mab_.pushBack(mae);
    lde->setLink(mlink); // load buffer entry has link to the correponding MAB entry
    /**************************************
     * if (LDDATA & dsrc=3),
     *    then scan MAB for any STCOM to this PA. STCOM inv_vect does not matter in this case.
     *
     * if (LDDATA & dsrc=2)
     *    then scan MAB for STCOM
     *    if (STCOM inv_vect[mycore]=0), then use this Store
     *    else if (STCOM inv_vect[mycore]=1) and (STINV received), then use this Store
     *    else if (STCOM inv_vect[mycore]=1) and (STINV not received), then can't use this Store, keep scanning MAB.
     *
     * where mycore is the core number of the LDDATA.
     */
    switch (dsrc) {
    case DSRC_STB:
    {
        LoadStoreBuffer& stb = stb_[tid];
        LoadStoreBuffer& rstb = retStb_[tid];
        /* find data from store buffer */
        /******************************************************************************
         * STB bypassing is checked different for N1 and N2
         * (also see comments in how to model stores)
         * o N1 checks both stb_ and retStb_. It checks retired_buf_ becuase there
         *   is still a timing gap allowed for bypassing after a StoreAck is received
         *   in N1. This implementation does not check rmoStb_, however. The assumption
         *   is that no STB bypassing of the rmo store data is possible. In theory, this
         *   assumption is not always true. The timing gap mentioned above should also
         *   possible in RMO case. In general, if a load would like to use the data of
         *   a RMO store, a membar instruction should be introduced. If this is strictly
         *   followed, then there is no any problem not to check rmoStb_. Diags may
         *   fail to guarantee this, the implementation takes the risk any way.
         * o N2 checks only stb_. It must be there since it is checked before by the
         *   program.
         ******************************************************************************/
        ste = stb.findDataBypassing(cmd.getAddr(), cmd.getSizeV());
        if (ste == stb.end()) { // not in STB
            ste = rstb.findDataBypassing(cmd.getAddr(), cmd.getSizeV()); // try retired_stb
            if (ste == rstb.end())
            {
                MS_ERROR("LoadData failed to find matched STB bypassing entry. tid=%d PA=%llx", tid, cmd.getAddr());
                return;
            }
            MSYNC_DEBUG(1, "Store Bypassing gets data from Retired Buffer %s",
                        ste->toString().c_str());
        }
        /* set data in the memory access entry */
#ifdef N2MODEL
        if (!ste->isDataValid())
        {
            MS_ERROR("LoadData finds Store Buffer data is not ready. tid=%d PA=%llx", tid, cmd.getAddr());
            return;
        }
#endif
        // In N1 swerver-memory.cc environment, the data may not be ready at this time
        if (ste->isDataValid()) {
            mlink->setData(ste->getData());
        } else { // N2 should not go this path
            mlink->setData(0ull); // set arbitrary data
            ste->setLink2(mlink); // when data is ready, it can also set the MAB entry
        }
        mlink->setDsrcMid(mlink->getId());
        break;
    }
    case DSRC_L1:
    {
        // L1 hit: source the data from the same-core LoadFill/StoreUpdate
        // record still in the MAB, or from simulated memory if none exists.
        list<MemoryAccessEntry>::iterator dsrcMlink;
        dsrcMlink = mab_.findL1DataEntry(cmd);
        if (dsrcMlink == mab_.end()) {
            mlink->setData(rif_.readMemory(cid, tid, cmd.getAddr() & ADDR_MASK, 8)); // read aligned 8 bytes
            // mlink->setDsrcMid(mlink->getId());
            mlink->setDsrcMid(mab_.begin()->getId() - 1); // source is from before MAB head
        } else {
            mlink->setData(dsrcMlink->getData());
            mlink->setDsrcMid(dsrcMlink->getDsrcMid());
        }
        break;
    }
    case DSRC_L2_MEMORY:
        // if it is a I/O address, let's defer the data setting til the
        // corresponding load instruction is invoked, where we may pick up a
        // (follow-me) CSR_READ
        if ((cmd.getAddr() & IO_ADDR_BIT_MASK) != IO_ADDR_BIT_MASK)
        {
            // not an I/O address, should set the value here so that entries
            // after this may use it.
            mlink->setData(getL2Data(mab_.end(), cmd));
        }
        mlink->setDsrcMid(mlink->getId());
        if (nMaePerLoadData > 1) {
            // Block/quad load: consume the additional LoadIssue entries that
            // belong to this 16-byte LoadData, one MAB entry per 8 bytes.
            for (int i = 1; i < nMaePerLoadData; i++) {
                lde++;
                cmd.setAddr(cmd.getAddr() + 8); // must be 8 byte per access if nMaePerLoadData > 1
                if (lde == ldb_[tid].end())
                {
                    MS_ERROR("LoadData failed to find matched LoadIssue. tid=%d PA=%llx", tid, cmd.getAddr());
                    return;
                }
                MSYNC_DEBUG(2, "lde->getAddr()=%llx, cmd.getAddr()=%llx",
                            lde->getAddr(), cmd.getAddr());
                if (lde->getAddr() != cmd.getAddr())
                {
                    MS_ERROR("Mismatch address between LoadData and LoadIssue. tid=%d PA=%llx", tid, cmd.getAddr());
                    return;
                }
                MemoryAccessEntry extra (cmd); // stack-allocated; pushBack() copies
                extra.setItype(lde->getItype());
                extra.setDsrc(dsrc);
                // if it is not a I/O address, ditto
                if ((cmd.getAddr() & IO_ADDR_BIT_MASK) != IO_ADDR_BIT_MASK)
                {
                    // ditto
                    extra.setData(getL2Data(mlink, cmd));
                }
                extra.setDsrcMid(extra.getId());
                extra.setIseq(lde->getIseq());
                mlink = mab_.pushBack(extra);
                lde->setLink(mlink); // load buffer entry has link to the correponding MAB entry
            }
        }
        break;
    default:
        MS_ERROR("LoadData receives unknown data source. tid=%d PA=%llx", tid, cmd.getAddr());
        return;
    }
}
// Handle a LoadFill: the L1 cache line fill that corresponds to an earlier
// LoadData. Creates a MAB record linked to the source LoadData entry, plus
// one record per remaining 8-byte chunk of the D-cache line so later L1 hits
// can find the whole line's data in the MAB.
//
// Bug fix: the per-chunk MemoryAccessEntry was heap-allocated with `new` and
// passed to pushBack(), which copies it into the MAB; the heap object was
// never freed (DCACHE_LINE_SIZE/8 - 1 leaks per fill). Now stack-allocated.
void
MemorySync::handleLoadFill (LoadStoreCmd& cmd)
{
    if (addrTrans)
    {
        cmd.setAddr(addrTrans(cmd.getAddr()));
    }
    list<MemoryAccessEntry>::iterator mlink, newlink;
    MemoryAccessEntry entry (cmd);
    mlink = mab_.findLoadFillSrc(cmd);
    if (mlink == mab_.end())
    {
        MS_ERROR("LoadFill failed to find matched LoadData. tid=%d PA=%llx", cmd.getThrdId(), cmd.getAddr());
        return;
    }
    // //TODO this assumes the corresponding load entry is popped by LOAD_POP,
    // //     but it can be an error instead, so need a better solution.
    // if (mlink == mab_.end()) return;
    entry.setData(mlink->getData());
    entry.setExecuted(true); // set for being retired from MAB when it reaches to head
    entry.setLink(mlink);
    entry.setDsrcMid(mlink->getId());
    newlink = mab_.pushBack(entry);
    mlink->setLink(newlink);
    /* Also get the other part of L1$ line data, TPS 4/22/04 */
    uint64_t line_addr_mask = ~((uint64_t) (DCACHE_LINE_SIZE - 1));
    uint64_t addr = cmd.getAddr() & line_addr_mask;
    for (int i = 0; i < DCACHE_LINE_SIZE/8; i++, addr += 8) {
        // skip the 8-byte chunk already covered by the LoadData entry
        if ((mlink->getAddr() & ADDR_MASK) == (addr & ADDR_MASK)) {
            continue;
        }
        cmd.setAddr(addr);
        MemoryAccessEntry chunk (cmd); // stack-allocated; pushBack() copies
        chunk.setData(getL2Data(mlink, cmd)); // the same line must get data from the same time
        chunk.setExecuted(true); // set true so that when it reach to head, it can be removed
        chunk.setLink(mlink);    // set to mlink indicating its search starts from here
        chunk.setDsrcMid(mlink->getId());
        mab_.pushBack(chunk);
    }
}
/******************************************************************************
* %%% Store Behavior
*
* %% General Implementation Note
*
* - StoreAck
* By definition, StoreAck is sent when the corresponding StoreIssue is to be
* removed from the Store Buffer so that no STB bypassing from this entry is
* possible.
*
* However, N1 still has a timing window that the STB bypassing is allowed
* after the StoreAck is sent. Hence, a retStb_ is added to temporarily
* hold the Acked entries.
*
* N2 does obey this rule, its behavior is described below:
* 1. Normal store
* In Niagara 2, the design of the StoreAck can be abstracted as follows:
* StoreAcks are queued, and are released from the queue in order. If a
* StoreAck also causes store update or invalidation, the StoreAck will be
* held in the queue until its update/invalidation is done.
*
* Upon a StoreAck is released from the queue, a StoreAck is sent to the
* Memory Sync model followed by, if any, a StoreUpdate or StoreInv. Hence,
* the Memory Sync model should see in-order StoreAcks.
*
* For normal store, StoreAck is always after StoreCommit.
*
* 2. RMO Store (Block Store, stores using *_BLOCK_INIT_ST_QUAD_LDD_*)
* By definition, these stores do not check data dependence, and follow RMO
* memory consistency model. They leave the StoreBuffer when they are issued.
* A StoreAck will be sent to reflect this fact. Hence, this type of stores
* should have StoreAck before its StoreCommit. A data structure, rmoStb_
* is introduced to hold these stores after they are removed from the store
* buffer so that the StoreCommit can still find the matched entry to extract
* some needed info from StoreIssue.
*
* - Order Between STEP and StoreAck
* Note that this implementation does not assume the order of a STEP and its
* corresponding StoreAck. Different CPUs can have different implementations.
* The following are some cases:
* > Normal stores usually have their STEP before StoreAck.
* > Atomics always have STEP after StoreAck in N2, but may have STEP before
* StoreAck in N1.
* > RMO Stores could have StoreAck before or after STEP.
*
*
* %% MemorySync Store Model
*
* - When a StoreIssue is received, an entry is place into store buffer
*
* - When a StoreAck is received, the corresponding StoreIssue must be at the head
* of the STB. The entry is then moved to
* o rmoStb_[tid], if cmd.isRMOstore() is true
* o retStb_[tid], otherwise
*
* - When a StoreCommit is received, the program searches for both stb_[tid]
* (must be in-order) and rmo_stb[tid] (may be out-of-order). Normal stores
* must obey TSO, hence, N1/N2 cannot issue next store, unless in the same
* cache line, until a StoreAck is received. Therefore, the StoreCommit must
* be in-order.
* However, RMO stores follow RMO memory consistency model. A RMO store can
* be issued from the store buffer without waiting for the commit of a prior
* RMO store. As a result, the MemorySync may see out-of-order StoreCommit
* for RMO stores since stores access different banks may commit out-of issue
* order. Nevertheless, accesses to the same bank should still follow issue order.
*
* In summary, in either case, StoreCommit for the same cache line should be
* in-order, which is the assumption made to match StoreCommit-StoreIssue in
* handleStoreCommit() method.
*
* - When a STEP (callback) is received, the program searches for both stb_[tid]
* (must be in-order) and rmo_stb[tid] (out-of-order or in-order ?).
*
* - When a StoreUpdate/StoreInv is received, it finds the StoreCommit that
* initiates this StoreUpdate/StoreInv. No need to match it with entry in
* either stb_, rmoStb_, or retStb_.
*
* => a rmoStb_ entry is removed when it has issued to L2 and is executed
* => a retire_stb_ entry is removed when a new entry is entered and the
* retire_stb_ size is greater than 2 (2 is a arbitrary number since this
* retire_stb_ is for N1 compatibility to keep the entry enough time for
* STB bypassing)
******************************************************************************/
// Record a StoreIssue in the strand's store buffer.
void
MemorySync::handleStoreIssue(LoadStoreCmd& cmd)
{
    // Optionally remap the physical address before recording the store.
    if (addrTrans) {
        cmd.setAddr(addrTrans(cmd.getAddr()));
    }
    LoadStoreEntry newStore (cmd);
#ifndef N1MODEL // <data> is expected, however, N1 does not have data at this moment
    newStore.setState(LS_TDATA);
#endif
    // An atomic's store half reuses the sequence number just consumed by its
    // load half; every other store draws a fresh sequence number.
    if (cmd.getItype() == ITYPE_ATOMIC) {
        newStore.setIseq(getIseq(cmd.getThrdId()) - 1);
    } else {
        newStore.setIseq(iSeq(cmd.getThrdId()));
    }
    list<LoadStoreEntry>::iterator pos = stb_[cmd.getThrdId()].pushBack(newStore);
    MSYNC_DEBUG(2, "Last Enter to STB %s", pos->toString().c_str());
}
// Handle a StoreCommit: match it with the pending StoreIssue (normal STB or,
// for N2-style RMO stores, the rmo buffer), merge the store data with the
// current L2 contents, and push the resulting record into the MAB.
//
// Bug fixes: the zero-cache-line MemoryAccessEntry objects were heap-
// allocated with `new` and passed to pushBack(), which copies; the heap
// objects were never freed. Now stack-allocated. Also removed the unused
// local `mszv`.
void
MemorySync::handleStoreCommit (LoadStoreCmd& cmd)
{
    if (addrTrans)
    {
        cmd.setAddr(addrTrans(cmd.getAddr()));
    }
    int tid = cmd.getThrdId();
    list<MemoryAccessEntry>::iterator mlink;
    list<LoadStoreEntry>::iterator slink;
    LoadStoreBuffer& stb = stb_[tid];
    LoadStoreBuffer& rmostb = rmoStb_[tid];
    MemoryAccessEntry entry (cmd);
    uint64_t mdata; // merged data that goes into the MAB entry
    uint64_t sdata; // store data from the matched STB entry
    uint8_t sszv;   // store size (byte-enable) vector
    bool zeroCacheLine;
    /******************************************************************************
     * Similar to the reason in LoadData, when matching the StoreCommit to
     * StoreIssue, the implementation searches the store buffer by finding the
     * oldest one that has not yet been associated with a StoreCommit and whose <id>
     * and <address> match with the <id> and <address> of the StoreCommit, where the
     * <address> comparison ignores the 3 least significant bits.
     *
     * When the <id> is not implemented, this mechanism works as long as the access
     * to the same cache line (or relax more, the same 8-byte block) results in
     * in-order StoreCommit. This condition should be true for cacheable access.
     * For non-cacheable access, then whether the condition is met depends on the
     * implementation decision. The guess is yes. For I/O, Sun's CPU design will
     * most likely to enforce SYNC before and after. Note that the environment
     * that does not implement <id> should always set this field 0.
     ******************************************************************************/
#ifdef N3MODEL // N1 assume RMO store's StoreAck is after StoreCommit as other store
    slink = stb.find1stNoLink (cmd.getId(), cmd.getAddr(), BLK_ADDR_MASK);
    if (slink == stb.end())
    {
        MS_ERROR("StoreCommit failed to find matched StoreIssue. tid=%d PA=%llx", tid, cmd.getAddr());
        return;
    }
    slink->setState(LS_ISSUE);
#else // N2's RMO store has StoreAck before StoreCommit
    slink = rmostb.find1stNoLink (cmd.getId(), cmd.getAddr(), BLK_ADDR_MASK);
    if (slink == rmostb.end()) {
        slink = stb.find1stNoLink (cmd.getId(), cmd.getAddr(), BLK_ADDR_MASK);
        if (slink == stb.end())
        {
            MS_ERROR("StoreCommit failed to find matched StoreIssue. tid=%d PA=%llx", tid, cmd.getAddr());
            return;
        }
        slink->setState(LS_ISSUE);
    } else {
        entry.setAcked(true); // RMO store was already acked before commit
    }
#endif
    /******************************************************************************
     * Special handling for BLK_INIT store that in a certain case will zero the
     * whole cache line. This type of instruction is supported in N1 and N2.
     ******************************************************************************/
    if ((slink->getItype() == ITYPE_STORE_INIT) &&
        !slink->isIO() && ((slink->getAddr() & 0x3f) == 0) && !cmd.isL2hit()) {
        zeroCacheLine = true;
    } else {
        zeroCacheLine = false;
    }
    /* merge data and saved the merged data in the correponding memory access entry */
    sdata = slink->getData();
    /* Original, sizeV == 0 indicates CAS comparison false. Change to use a separate flag
       to indicate this status. 9/10/04 */
    // sszv = (cmd.getSizeV() == 0) ? 0 : slink->getSizeV(); // size_vector = 0 in CMD => cas comparison false
    sszv = (!cmd.isSwitchData()) ? 0 : slink->getSizeV(); // !isSwitchData() => cas comparison false
    entry.setMerged(false);
    if (zeroCacheLine) { // special case for STORE_INIT
        mdata = merge (0ull, sdata, sszv);
    } else {
        if (slink->getSize() == 8 && sszv == 0xff) { // no merge needed
            mdata = sdata;
        } else { // sszv = 0 still need to get correct data in the entry
            mdata = getL2Data(mab_.end(), cmd);
            mdata = merge (mdata, sdata, sszv);
            // keep merge-related data in case we need to repeat it (with a different
            // mdata)
            entry.setMerged(true);
            entry.setOrigData(sdata);
            entry.setOrigSizeV(sszv);
        }
    }
    entry.setData(mdata);
    entry.setExecuted(slink->isExecuted());
    if (sszv == 0) entry.setSizeV(0); // This is possible in N1 for CAS
    entry.setItype(slink->getItype());
    entry.setIseq(slink->getIseq());
    mlink = mab_.pushBack(entry);
    mlink->setLink(mlink); // point to itself
    slink->setLink(mlink); // STB entry's link pointed to the STORE_COMMIT in MAB
    // if inv_vec is not zero, re-adjust it to mask out sniper related
    // invalidation, because sniper does not generate related inv command,
    // otheriwse the inv_vec bit(s) related to sniper will stay on, and the
    // store_commit will stay in mab forever.
    if (mlink->getInv())
    {
        uint32_t mask = 0;
        for (int i=0; i<4; i++)
        {
            mask |= (((uint32_t)inv_vec_mask[i]) << (i*8));
        }
        mlink->setInv(mlink->getInv() & mask);
    }
    if (zeroCacheLine) { // Add StoreCommit entries with 0 data for the l2 cache line
        for (int i = 1; i < L2CACHE_LINE_SIZE/8; i++) {
            MemoryAccessEntry zero_entry (cmd); // stack-allocated; pushBack() copies
            zero_entry.setAddr(slink->getAddr() + (i << 3));
            zero_entry.setData(0ull);
            zero_entry.setExecuted(true);
            zero_entry.setSizeV(0xff);
            zero_entry.setInv(0);
            zero_entry.setAcked(true);
            zero_entry.setItype(ITYPE_STORE_INIT);
            zero_entry.setIseq(slink->getIseq());
            mlink = mab_.pushBack(zero_entry);
            mlink->setLink(mlink); // point to itself
        }
    }
    /* Manage the rmoStb_ */
    if (slink->isRMOstore() && slink->isExecuted()) {
        rmostb.erase(slink);
    }
    /* Added to speed up the drain of MAB - 6/3/04 TPS */
    vector<MemoryAccessEntry> wdbuf;
    mab_.popFront(&wdbuf, tsoChecker_);
    bufWriteMemory(wdbuf);
}
// Handle a StoreAck: retire the acknowledged store from the head of the
// strand's store buffer, moving it to rmoStb_ (RMO stores) or retStb_
// (normal stores), then try to drain the MAB.
void
MemorySync::handleStoreAck (LoadStoreCmd& cmd)
{
// Optionally remap the physical address first.
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
uint32_t tid = cmd.getThrdId();
LoadStoreBuffer& stb = stb_[tid];
list<LoadStoreEntry>::iterator ii;
MSYNC_DEBUG(2, "stb=%s", stb.toString().c_str());
/******************************************************************************
* In Niagara 1, the StoreAck is issued in-order, but they may
* Note that in N1, rmo stores, BLK_INIT store and BLOCK_STORE, are Acked same
* as other stores. Since rmo stores can be issued to L2 without waiting for
* the ACK before issuing another one, their StoreAcks can be out-of-order if
* they are from different banks.
* In N2, StoreAck for a rmo store is sent when it is released from the store
* buffer. Hence, it is still in order.
*
* The N1MODEL may have a potential problem. STORE_ACK CMD has only address,
* N1's swerver-memory.cc uses address and size for searching. So, a <size>
* field may be need in my specification. However, as the development of this
* code, N1 is not a target to support.
******************************************************************************/
#ifdef N1MODEL // cannot just search by address, entries may have same address
ii = stb.findMatchO2Y(cmd.getAddr(), cmd.isRMOstore(), FULL_ADDR_MASK);
if (ii == stb.end())
{
MS_ERROR("StoreAck failed to find matched StoreIssue. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
#else // N2 does not send address, and guarantees to send StoreAck in order
// In-order ack: the acked store must be the oldest STB entry.
ii = stb.begin();
if (ii == stb.end())
{
MS_ERROR("StoreAck sees empty STB. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
#endif
#ifdef N1MODEL // N2's StoreAck does not have address
// Sanity-check that the matched entry's address agrees with the command.
MSYNC_DEBUG (2, "addr-stb=%#llx addr-cmd=%#llx", ii->getAddr(), cmd.getAddr());
if (ii->getAddr() != cmd.getAddr())
{
MS_ERROR("StoreAck is issued out-of-order, should be in-order. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
#endif
/* In n1 swerer-memory.cc, if the atomic, block store, or store init needs
not to invalidate the L1$, then they are also copied to the thrd_retired_buf.
Removing the conditions below for compatibility. For cases that not copying
into the thrd_retired_buf, the code is inserted with the RetireBuffer.popBack()
to remove the entry. */
// if (ii->isExecuted() && ii->getItype() != ITYPE_ATOMIC &&
//     ii->getItype() != ITYPE_BLOCK_STORE && ii->getItype() != ITYPE_STORE_INIT) {
if (cmd.isRMOstore()) {
/* No state change for RMO store whose StoreAck is before StoreCommit */
if (!ii->isRMOstore())
{
MS_ERROR("RMO StoreAck find non-RMO entry in STB. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
// RMO store leaves the STB at ack time; park it in rmoStb_ so the later
// StoreCommit can still find the issue-time info.
rmoStb_[tid].pushBack(*ii);
} else {
ii->setState(LS_ACK);
if (!ii->isExecuted() && (ii->getItype() != ITYPE_ATOMIC))
{
MS_ERROR("StoreAck before SSTEP for non-atomic non-RMO store. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
retStb_[tid].pushBack(*ii); // move to retired store buffer, for N1
// retStb_ only needs to keep a short history for N1-style late bypassing;
// 2 is an arbitrary depth (see the store-model comment above).
if (retStb_[tid].size() > 2) {
retStb_[tid].popFront();
}
}
stb.erase(ii);
/* Add to speed-up the drain of MAB */
vector<MemoryAccessEntry> wdbuf;
mab_.popFront(&wdbuf, tsoChecker_);
bufWriteMemory(wdbuf);
/* StoreAck is not invloved in determining if a StoreCommit can be removed,
hence, there is no need to call handleStoreInv(cmd)
*/
}
// Handle a StoreInv: an L1 invalidation caused by some store's commit.
// After optional address translation the work is entirely delegated to
// completeStoreInvStoreUpdate(); see the comment below for why no store
// buffer matching is performed here.
//
// Cleanup: removed locals (tid, rstb, rmostb, rse, rme) that were referenced
// only by the commented-out check and otherwise triggered unused-variable
// warnings.
void
MemorySync::handleStoreInv (LoadStoreCmd& cmd)
{
    if (addrTrans)
    {
        cmd.setAddr(addrTrans(cmd.getAddr()));
    }
    /******************************************************************************
     * StoreInv command does not provide enough info for us to check matching
     * store buffer entry as that is done in StoreUpate. StoreUpdate can only
     * happens on normal store, hence, we only have to check retStb_. But,
     * StoreInv can also caused by RMO store. The StoreInv API is not required
     * to differentiate RMO store from others. Therefore, in general a StoreInv
     * can match with StoreIssue in either retStb_ or rmoStb_, or both.
     * It is possible to implement this check. The intruction type can be obtained
     * by matched StoreCommit. Another issue is that the code does not required
     * a store in a store buffer waiting the StoreUpdate/StoreInv before it can
     * be removed. So, just remove the check.
     ******************************************************************************/
    // if (cmd.getThrdId() == cmd.getSrcTid()) {   // same thread
    // #ifdef N2MODEL
    //   /* N2's StoreInv set 3 LSBs to 0, hence cannot compare exact address.
    //      However, it must be Acked and Inv in order. */
    //   rse = --(rstb.end());   // note that (rsb.end())-- does not work
    //   MS_ASSERT((cmd.getAddr() & ADDR_MASK) == (rse->getAddr() & ADDR_MASK), "StoreInv finds last StoreAck having different address", cmd.getAddr());
    // #else
    //   rse = rstb.findMatchO2Y(cmd.getAddr(), LS_ACK); // update or inv to the same core should be in-order
    //   MS_ASSERT(rse != rstb.end(), "StoreInv failed to find matched one in ther RetiredBuffer", cmd.getAddr());
    // #endif
    //   rse->setState(LS_INV);
    // }
    completeStoreInvStoreUpdate(cmd);
}
/********************************************************************************
* StoreUpdate command provides two purposes. One is exact the same as StoreInv
* that prevent the corresponding StoreCommit from removing from the MAB too
* early. The other is to provide data for the following LoadData with L1 hit.
* Recall that the LoadData with L1 hit will search for its data from the same
* core LoadFill or StoreUpdate record in the MAB.
********************************************************************************/
void
MemorySync::handleStoreUpdate (LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
uint32_t tid = cmd.getThrdId();
LoadStoreBuffer& rstb = retStb_[tid];
list<LoadStoreEntry>::iterator ii;
/******************************************************************************
* The following code does check. It is not functionally critical. In SUN's
* implementation, usually the block stores (and in N2 BLK_INIT stores) do not
* update L1$, instead, they invalidate L1$. Hence, only retStb_ needs to
* perform search for matching StoreIssue.
*
* In addition, Niagara 2 sends StoreAck and StoreUpdate or StoreInv (same cid)
* in the same cycle. In N2, the design of the StoreAck/StoreUpdate/StoreInv can
* be abstracted as follows:
* StoreAcks are queued, and are released from the queue in order. If a
* StoreAck also causes store update or invalidation, the StoreAck will be
* held in the queue until its update/invalidation is done.
*
* Niagara 1, however, does not queue the StoreAck. Hence, the StoreUpdate (and
* StoreInv) may not related to the last StoreAck since not all StoreAcks cause
* StoreUpdate/StoreInv.
******************************************************************************/
#ifdef N2MODEL
/* N2's StoreUpdate set 3 LSBs to 0, hence cannot compare exact address.
However, it must be Acked and Updated in order. */
ii = --(rstb.end()); // note that (rsb.end())-- does not work
// cerr << "MemSync: " << ii->toString();
if ((cmd.getAddr() & ADDR_MASK) != (ii->getAddr() & ADDR_MASK))
{
MS_ERROR("StoreUpdate finds last StoreAck having different address. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
#else
/******************************************************************************
* In general, searching for state LS_ACK is not ok since as discussed in the
* handleStoreInv StoreInv is not appropriate to perform such check, and thus
* the state of the corresponding StoreIssue remains at state LS_ACK. As a
* result, the search may find wrong entry. The longer the retire_stb_, the
* more likely the error. So far, the retire_stb_ maximum size is set to 2.
******************************************************************************/
ii = rstb.findMatchO2Y(cmd.getAddr(), LS_ACK, BLK_ADDR_MASK); // update or inv to the same core should be in-order
if (ii == rstb.end())
{
MS_ERROR("StoreUpdate failed to find matched one in the RetiredBuffer. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
#endif
ii->setState(LS_UPDATE);
completeStoreInvStoreUpdate(cmd);
}
void
MemorySync::completeStoreInvStoreUpdate (LoadStoreCmd& cmd)
{
list<MemoryAccessEntry>::iterator srclink;
list<MemoryAccessEntry>::iterator newlink;
vector<MemoryAccessEntry> wdbuf;
MemoryAccessEntry mae (cmd);
uint32_t cbit = 1 << cmd.getCoreId();
srclink = mab_.findStoreInvStoreUpdateSrc (cmd);
#ifdef N1MODEL // it seems that in N1 IO access does not have StoreCommit signal
if (cmd.isIO() && srclink == mab_.end()) {
return;
}
#endif
if (srclink == mab_.end())
{
MS_ERROR("StoreInv/Update failed to find matched StoreCommit. tid=%d PA=%llx", cmd.getSrcTid(), cmd.getAddr());
return;
}
MSYNC_DEBUG(4, "StoreInv-StoreUpdate %s", srclink->toString().c_str());
srclink->addCinv(cbit); // record invalidated/updated core
mae.setData(srclink->getData()); // although only StoreUpdate needs data, it's ok for StoreInv
mae.setExecuted(true); // required for retirement from MAB when it reaches to head
// initially, a StoreCommit will have a link to itself
if (!srclink->isLinkValid())
{
MS_ERROR("StoreCommit does not have a valid link");
return;
}
mae.setLink(srclink->getLink()); // set link chain for debugging purpose
mae.setDsrcMid(srclink->getId()); // data source MAE id
srclink->setGlobal(mae.getId()); // Global will be set the last StoreInv or StoreUpdate, eventually
newlink = mab_.pushBack(mae);
srclink->setLink(newlink);
/* Add to speed-up the drain of MAB */
mab_.popFront(&wdbuf, tsoChecker_);
bufWriteMemory(wdbuf);
}
void
MemorySync::handleEvict (LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
/* Note that if the D$ line size is different from the I$ line size,
then the following code may have to be modified. Although N2 has
differernt line size of D$ and I$, it is ok to just use the line
size of D$ since the N2 is decided not to handle I-Fetch side of
operation. */
/* For MemorySync to function correctly, only one EVICT entry per
EVICT command is enough since it functions as a fence. However,
for TSOchecker to fucntion correctly, each 8-byte needs to have
a EVICT so that the TSOchecker can depend on this to retire store
node in the memory access history structure. */
if (cmd.getInv() == 0)
{
MS_ERROR("MEM_EVICT has inv_vec==0x0. cmd=%s", cmd.toString().c_str());
return;
}
uint64_t base = cmd.getAddr() & (~(DCACHE_LINE_SIZE - 1));
uint64_t addr;
for (int i = 0; i < DCACHE_LINE_SIZE/8; i++)
{
addr = base + (8 * i);
cmd.setAddr(addr);
cmd.setSrcBank ((addr & L2_BANK_ADDR_BITS) >> L2_BANK_ADDR_SFT);
list<MemoryAccessEntry>::iterator ii = mab_.findDmaStoreStart(cmd, DMA_EVICT);
if (ii != mab_.end())
{
// a remote evict for a dma_store, store its inv in the dma_store_start,
// it will be picked up when the corresponding dma_store is issued later
ii->setInv((ii->getInv() | cmd.getInv()));
}
else
{
ii = mab_.findDmaStoreEntry(cmd, DMA_EVICT);
if (ii != mab_.end())
{
// find one, this evict is for a dma_store entry, it is mainly used to
// fill in the dma_store's inv_vec field. No need to add this entry
// to MAB. the dma_store may contain remote evict, so OR them together
ii->setInv((ii->getInv() | cmd.getInv()));
// we should see no more evict associated with this entry
ii->setInvSet(true);
// let inv/cinv decide when this entry can be removed from MAB
ii->setExecuted(true);
//ii->setSrcBank((cmd.getAddr() & L2_BANK_ADDR_BITS) >> L2_BANK_ADDR_SFT);
MSYNC_DEBUG(4, "EVICT DMA_STORE %s", ii->toString().c_str());
}
else
{
// normal MEM_EVICT
list<MemoryAccessEntry>::iterator newlink;
MemoryAccessEntry* mae = new MemoryAccessEntry (cmd);
//mae->setAddr(addr);
mae->setData(0ull); // Evict does not set real data
mae->setExecuted(true); // needed for retirement from MAB
mae->setAcked(true);
//mae->setSrcBank ((addr & L2_BANK_ADDR_BITS) >> L2_BANK_ADDR_SFT);
mae->setSizeV(0xff);
newlink = mab_.pushBack(*mae);
newlink->setLink(newlink); // set link to itself
}
}
}
}
void
MemorySync::handleEvictInv (LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
// evict_inv does not have addr value, so it is always 0 to begin with.
uint64_t base = cmd.getAddr() & (~(DCACHE_LINE_SIZE - 1));
uint64_t addr;
uint32_t cbit = 1 << cmd.getCoreId();
/**
* Note using DCACHE_LINE_SIZE implies only one EvictInv can be sent
* from each core assocaited with one Evict command.
*/
for (int i = 0; i < DCACHE_LINE_SIZE/8; i++)
{
addr = base + (8 * i);
cmd.setAddr(addr);
// for evict_inv, srcBank is provided by testbench, this can be a problem
// when partial bank mode is used. ===> it will not, testbench uses static
// bank calculation too.
list<MemoryAccessEntry>::iterator ii = mab_.findDmaStoreStart(cmd, DMA_EVICT_INV);
if (ii != mab_.end())
{
// a remote evict_inv for a dma_store, update the cinv in the
// dma_store_start, it will be picked up when the corresponding
// dma_store is issued later
ii->addCinv(cbit);
}
else
{
ii = mab_.findDmaStoreEntry(cmd, DMA_EVICT_INV);
if (ii != mab_.end())
{
// find one, this evict_inv is for a dma_store entry, mask the
// corresponding bit in inv_vec.
ii->addCinv(cbit);
MSYNC_DEBUG(4, "EVICT_INC DMA_STORE %s", ii->toString().c_str());
}
else
{
// normal MEM_EVICT_INV
list<MemoryAccessEntry>::iterator srclink;
list<MemoryAccessEntry>::iterator newlink;
MemoryAccessEntry* mae;
srclink = mab_.findEvictInvSrc(cmd);
if (srclink == mab_.end())
{
MS_ERROR("EvictInv failed to find matched Evict. tid=%d PA=%llx", cmd.getThrdId(), cmd.getAddr());
return;
}
srclink->addCinv(cbit);
mae = new MemoryAccessEntry(cmd);
mae->setExecuted(true); // required for retirement from MAB when it reaches to head
//assert (srclink->isLinkValid()); // initially, an Evict will have a link to itself
mae->setLink(srclink->getLink()); // set link chain for debugging purpose
srclink->setGlobal(mae->getId()); // set dinv
newlink = mab_.pushBack(*mae);
srclink->setLink(newlink);
}
}
}
}
/******************************************************************************
To allow DMA accesses from I/O. It occurres when a SIU Store (from ENET
or PCI) is seen in the L2 or when Bench back-door stores to memory.
******************************************************************************/
void
MemorySync::handleStoreSlam(StoreIssueCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
uint64_t paddr = cmd.getAddr();
if (cmd.getSizeV() != 0xff) {
// need to read in data for merging
uint64_t srcData = rif_.readMemory(0, 0, (paddr & ADDR_MASK), 8);
uint64_t tarData = align8byte(cmd.getData(), cmd.getSize(), (paddr & ~ADDR_MASK));
uint64_t newData = merge(srcData, tarData, cmd.getSizeV());
rif_.slamMemory(0, 0, (paddr & ADDR_MASK), newData, 8);
}
else {
if ((paddr & 0x7ULL) != 0x0)
{
MS_ERROR("Address 0x%llx is not 8-byte aligned", paddr);
return;
}
rif_.slamMemory(0, 0, paddr, cmd.getData(), 8);
}
}
//=============================================================================
// signal the beginning of a dma_store operation, there is just one
// dma_store_start, regardless of the dma_store is 8 bytes or 64 bytes.
// For a 64 bytes dma_store, we will see 8 dma_store commands.
//=============================================================================
void
MemorySync::handleDmaStoreStart(LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
int totalSize = cmd.getTsize();
uint64_t addr = cmd.getAddr();
for (int i = 0; i < totalSize/8; i++)
{
// create one DMA_STORE_START, in MAB, for every 8-byte DMA_STORE
cmd.setAddr(addr);
MemoryAccessEntry* mae = new MemoryAccessEntry(cmd);
mae->setEntryType(MEM_DMA_STORE_START);
mab_.pushBack(*mae);
addr += 8;
}
}
/******************************************************************************
Similar to MEM_SLAM, but allow inv_vec to handle L1 conflict.
******************************************************************************/
void
MemorySync::handleDmaStore(LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
// treat it as a store_commit with evict & evict_inv
// we will have one evict to fill in the inv_vec, then one or more
// evict_inv to mask each able bit in inv_vec.
// 2/14/06, all DMA_STORE must go through MAB
list<MemoryAccessEntry>::iterator mlink;
uint64_t mdata;
uint64_t sdata;
uint8_t sszv;
int tid = cmd.getThrdId();
StoreCommitCmd commit(cmd.getThrdId(), 0, 0, cmd.getAddr(), cmd.getSizeV(), true, (cmd.getSizeV()==0?false:true), cmd.getCycle());
// each DMA_STORE operation can be either 8 bytes or 64 bytes, but
// each DMA_STORE command only represent 8 bytes, we need the total
// size informaiton to handle the corresponding EVICT and EVICT_INV
// commands, as there will be only one EVICT command for each
// DMA_STORE operation (which can have 1 or 8 DMA_STORE commands)
commit.setTsize(cmd.getTsize());
commit.setData(cmd.getData());
MemoryAccessEntry entry(commit);
sdata = commit.getData();
sszv = commit.getSizeV();
entry.setMerged(false);
if ((commit.getSize() == 8) && (sszv == 0xff))
{
// no merge needed
mdata = sdata;
}
else
{
// sszv = 0 still need to get correct data in the entry
mdata = getL2Data(mab_.end(), commit);
mdata = merge(mdata, sdata, sszv);
// keep merge-related data in case we need to repeat it (with a
// different mdata)
entry.setMerged(true);
entry.setOrigData(sdata);
entry.setOrigSizeV(sszv);
}
entry.setData(mdata);
list<MemoryAccessEntry>::iterator ii = mab_.findDmaStoreStart(cmd, DMA_STORE);
if (ii != mab_.end())
{
// there is a dma_store_start associated with this dma_store, use the
// inv and cinv there. mark the dma_store_start as executed so that it
// can be removed from MAB
ii->setExecuted(true);
entry.setInv(ii->getInv());
entry.setCinv(ii->getCinv());
}
if (cmd.getInv() == 0)
{
// there is no local evict associated with the dma_store, so set inv
// setting as done, also set this entry as executed, and let inv/cinv
// decide when this entry's data can be committed to memory.
entry.setInvSet(true);
entry.setExecuted(true);
}
entry.setItype(ITYPE_STORE);
entry.setIseq(iSeq(tid));
entry.setDmaStore(true);
mlink = mab_.pushBack(entry);
mlink->setLink(mlink); // point to itself
// buffer for writing data back to memory
vector<MemoryAccessEntry> wdbuf;
mab_.popFront(&wdbuf, tsoChecker_);
/* write STORE_COMMIT data to memory */
bufWriteMemory(wdbuf);
}
/************************************************************************************
Reset the state back to LS_ACK. This implementation assumes only the
cacheable request can have FetchFill where the state is set to LS_ACK.
In general, this is inaccurate since LS_ACK in Fetch means instruction
is received. This may be ok if only cacheable instruction can have
self-modifying-code.
In general, a fetch has
- FetchIssue: instruction fetch issue
- FetchData: instruction access gets data from memory system
- FetchFill: line fill to L1
- FetchAck: instruction data come back to fetch buffer
So far, only FetchData and FetchFill is implemented.
*************************************************************************************/
void
MemorySync::handleFetchIssue(LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
list<LoadStoreEntry>::iterator ii;
LoadStoreEntry* entry;
uint32_t tid = cmd.getThrdId();
entry = new LoadStoreEntry(cmd);
ii = ifb_[tid].pushBack(*entry);
MSYNC_DEBUG(2, "Last Enter to IFB %s", ii->toString().c_str());
}
void
MemorySync::handleFetchData(LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
list<MemoryAccessEntry>::iterator mlink;
list<LoadStoreEntry>::iterator ife;
vector<MemoryAccessEntry> wdbuf;
MemoryAccessEntry* mae;
uint32_t tid = cmd.getThrdId();
enum DATA_SRC dsrc = cmd.getDsrc();
LoadStoreBuffer& ifb = ifb_[tid];
// ife = ifb.findMatchO2Y (cmd.getId(), cmd.getAddr(), cmd.getSizeV());
ife = ifb.findMatchO2Y (cmd.getAddr(), LS_NEW, BLK_ADDR_MASK);
if (ife == ifb.end())
{
MS_ERROR("Matched FetchIssue expected. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
mae = new MemoryAccessEntry (cmd);
mae->setItype(ITYPE_FETCH);
/* Reset the state back to LS_ACK. This implementation assumes only the
cacheable request can have FetchFill where the state is set to LS_ACK.
In general, this is inaccurate since LS_ACK in Fetch means instruction
is received. This may be ok if only cacheable instruction can have
self-modifying-code.
In general, a fetch has
- FetchIssue: instruction fetch issue
- FetchData: instruction access gets data from memory system
- FetchFill: line fill to L1
- FetchAck: instruction data come back to fetch buffer
So far, only FetchIssue, FetchData and FetchFill is implemented.
*/
switch (dsrc) {
case DSRC_L1:
mae->setData(getL1Instr(cmd));
break;
case DSRC_L2_MEMORY:
MSYNC_DEBUG(2, "mab_=%s", mab_.toString().c_str());
mae->setData(getL2Data(mab_.end(), cmd));
break;
case DSRC_IO: // access instruction directly from Riesling memory
mae->setData(rif_.readMemory(cmd.getCoreId(), cmd.getThrdId(), cmd.getAddr() & ADDR_MASK, 8));
break;
default:
MS_ERROR("Wrong dsrc value %d", dsrc);
return;
}
mlink = mab_.pushBack(*mae);
ife->setLink(mlink);
ife->setState(LS_RDATA);
}
/**
LoadFill must bring in the whole L1 cache line, TPS 4/22/04
*/
void
MemorySync::handleFetchFill (LoadStoreCmd& cmd)
{
if (addrTrans)
{
cmd.setAddr(addrTrans(cmd.getAddr()));
}
list<MemoryAccessEntry>::iterator mlink, newlink;
list<LoadStoreEntry>::iterator ife;
MemoryAccessEntry entry (cmd);
uint32_t tid = cmd.getThrdId();
LoadStoreBuffer& ifb = ifb_[tid];
/***********************************************************************************
* The design assumes FetchFill of the same 8-byte block should come back in order.
* In addition, only fetches whose data source is L2/Memory and is cacheable to L1
* can have FetchFill. The following do-while statement searches for such entry in
* the fetch buffer. Entries that fetch the same 8-byte block but do not need
* FetchFill are assigned LS_ACK state. This is to indicate that those entries are
* considered done with fetched instruction backed to the fetch buffer already.
* (Recall that FetchAck is not implemented so far and it maybe never needed.)
* Note that after an entry is assigned LS_ACK, the search in the next iteration
* will skip it.
***********************************************************************************/
do {
MSYNC_DEBUG(2, "ifb=%s", ifb.toString().c_str());
ife = ifb.findMatchO2Y(cmd.getAddr(), LS_RDATA, BLK_ADDR_MASK);
if (ife == ifb.end())
{
MS_ERROR("FetchFill failed to find matched entry in FetchBuffer. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
if (!ife->isLinkValid())
{
MS_ERROR("FetchFill's matched FetchBuffer entry does not have FetchData. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
ife->setState(LS_ACK); // this could be the one found, or one does not need FetchFill
} while (!(ife->getLink()->getDsrc() == DSRC_L2_MEMORY && ife->getLink()->isCacheL1()));
/* The following search in FetchBuffer is actually not needed. This provides
additional layer of check to see if its link is the same as that found
from the MAB search. When the protocol stablizes, can determine if to
remove this segment of code */
mlink = mab_.findFetchFillSrc(cmd);
MSYNC_DEBUG(2, "ife=%s", (ife->toString()).c_str());
MSYNC_DEBUG(2, "mlink=%s", (mlink->toString()).c_str());
if (mlink == mab_.end())
{
MS_ERROR("FetchFill failed to find matched FetchData. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
if (ife->getLink() != mlink)
{
MS_ERROR("FetchFill's matched FetchBuffer entry's FetchData mismatch. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
entry.setData(mlink->getData());
entry.setLink(mlink);
entry.setExecuted(true);// set for being retired from MAB when it reaches to head
newlink = mab_.pushBack(entry);
mlink->setLink(newlink);
/* Also get the other part of L1$ line data, TPS 5/4/04 */
MemoryAccessEntry* e;
uint64_t line_addr_mask = ~((uint64_t) (ICACHE_LINE_SIZE - 1));
uint64_t addr = cmd.getAddr() & line_addr_mask;
uint64_t maeAddr = mlink->getAddr();
for (int i = 0; i < ICACHE_LINE_SIZE/8; i++, addr += 8) {
if ((maeAddr & ADDR_MASK) == (addr & ADDR_MASK)) {
continue;
}
cmd.setAddr(addr);
e = new MemoryAccessEntry (cmd);
e->setData(getL2Data(mlink, cmd));
e->setExecuted(true); // set true so that when it reach to head, it can be removed
e->setLink(mlink); // set to mlink indicating its data got from here
mab_.pushBack(*e);
}
}
void
MemorySync::removeRetiredStore (uint64_t addr, uint32_t tid)
{
LoadStoreBuffer& rstb = retStb_[tid];
list<LoadStoreEntry>::iterator ii;
/* must be the first entry of the STB */
ii = rstb.begin();
if (rstb.getBufPtr()->size() == 0)
{
MS_ERROR("Attempt to remove entry in empty RetiredBuffer. tid=%d PA=%llx", tid, addr);
return;
}
// if (rstb.getBufPtr()->size() == 0) {
// cerr << "MemModel: WARNING[removeRetiredStore()] rstb.size() == 0" << endl;
// return;
// }
MSYNC_DEBUG (1, "Remove RetiredBuffer Entry tid=%d addr=%#llx addr_buf=%#llx executed=%d",
tid, addr, ii->getAddr(), (int) ii->isExecuted());
MSYNC_DEBUG (2, "%s", toString().c_str());
if (ii->getAddr() != addr)
{
MS_ERROR("Remove RetiredBuffer head entry mismatches with the address. tid=%d PA=%llx", tid, addr);
return;
}
/* For CAS instruction, if the store part is not sent then the retired instruction
may not be executed. The following code assumes that when an instruction is to
be removed from this buffer, it must have been executed.
*/
if (!ii->isExecuted()) {
if (ii->getItype() != ITYPE_ATOMIC)
{
MS_ERROR("Found non-executed instr. in RetiredBuffer to be removed. tid=%d PA=%llx", tid, addr);
return;
}
ii->setExecuted(true);
if (ii->isLinkValid()) {
ii->getLink()->setExecuted(true);
}
}
rstb.popFront();
}
/************************************************************************************
* preMemoryAccess()
* - takes the data to be written by the reference model,
* - checks that with the one in Memory Sync Model, and
* - sets flag so that the reference model won't update its memory.
*
* The MemoryTransaction is used to store the data. The data is in
* 8-byte chunks, and each is arranged in big endian fashion. If the
* size is less than 8, only data[0] is used and the data is shifted
* to the right (toward least significant byte).
************************************************************************************/
void MemorySync::pre_memory_access( void* msync_obj, MemoryTransaction &xact )
{
MemorySync* self = (MemorySync*)msync_obj;
if (self)
self->preMemoryAccess(xact);
}
void MemorySync::preMemoryAccess ( MemoryTransaction& xact)
{
if (MemorySyncMessage::skipCallback == 1) {
//cerr << "ERROR: MSYNC-SKIP: T" << dec << rif_.getTid(xact) << " skip preMemoryAccess()\n";
return;
}
list<LoadStoreEntry>::iterator ii;
uint64_t mask;
uint64_t rdata[8];
uint64_t rpa = xact.paddr();
if (addrTrans)
{
rpa = addrTrans(rpa);
}
uint64_t rva = xact.vaddr();
uint8_t rsize = xact.size();
int nrdata = 0;
int tid;
int i,j;
if ((rpa & 0xff00000000ULL) == 0x9000000000ULL) {
// Bench is not sending any ldst_sync messages if PA[39:32]=0x90.
// This is true for ST, STXA and LD, LDXA
return;
}
tid = xact.getStrand();
if (tid >= MAX_STRANDS) { // indicates the xact is not normal instruction access
return;
}
if (xact.referenceType() == MemoryTransaction::INSTR) {
return;
}
// if (xact.noOperationXact()) {
// return;
// }
/**********************************************************************************
* In Riesling, atomic is performed in two transactions, one read and one write.
* This following checks return normal load as well as the load part of an atomic
**********************************************************************************/
// if (xact.readXact()) {
// return;
// }
if (!xact.writeXact()) {
return;
}
if (xact.internalXact() || xact.tablewalk()) {
return;
}
/* store or atomic instructions */
LoadStoreBuffer& stb = stb_[tid];
MSYNC_DEBUG(1, "STEP (preMemAcc) tid=%d va=%#llx pa=%#llx size=%d atomic=%d", tid, rva, rpa, rsize, (int) xact.atomicXact());
if (rsize > 8) {
uint8_t buf[64];
if (rsize != 64 && rsize != 16) {
MS_ERROR("Unsupported store data size %d", xact.size());
return;
}
for (i = 0; i < rsize/8; i++) {
rdata[i] = xact.getData(i);
nrdata = 8;
}
} else {
rdata[0] = xact.getData(0);
nrdata = 1;
}
bool atomic = xact.atomicXact();
uint64_t addr;
LoadStoreBuffer& rstb = retStb_[tid];
LoadStoreBuffer& rmostb = rmoStb_[tid];
/**********************************************************************************
* The assumption made is only atomic and rmo stores can have StoreAck issued
* before SSTEP. Therefore, for these two types of instructions, search matched
* StoreIssue must begin with either retStb_ (atomic) or rm_stb_ (rmo).
**********************************************************************************/
LoadStoreBuffer& ret_or_rmo_stb = (atomic) ? rstb : rmostb;
MSYNC_DEBUG(2, "rstb=%s", rstb.toString().c_str());
MSYNC_DEBUG(2, "stb=%s", stb.toString().c_str());
MSYNC_DEBUG(2, "rmostb=%s", rmostb.toString().c_str());
for (i = 0; i < nrdata; i++) {
addr = rpa + (i << 3);
ii = ret_or_rmo_stb.find1stNonExe();
if (ii == ret_or_rmo_stb.end()) {
ii = stb.find1stNonExe();
//MS_ASSERT(ii != stb.end(), "STEP failed to find match StoreIssue in STB. tid=%d PA=%llx", tid, addr);
if (ii == stb.end()) {
if (atomic) {
MS_ERROR("STEP (store part of an atomic instr) failed to find match StoreIssue in STB. tid=%d PA=%llx", tid, addr);
return;
}
else {
MS_ERROR("STEP failed to find match StoreIssue in STB. tid=%d PA=%llx", tid, addr);
return;
}
}
}
/* Note that RTL does not check store address and data, MemorySync performs
this additional check for completion */
if ((addr & ADDR_MASK) != (ii->getAddr() & ADDR_MASK)) {
MS_ERROR (" Store Address Mismatch (bits 2-0 ignored) cid=%d tid=%d va=%#llx pa-ref=%#llx pa-rtl=%#llx",
tid/NSTRANDS_PER_CORE, tid, rva + (i << 3), addr, ii->getAddr());
return;
}
if ((addr & ADDR_MASK) != (ii->getAddr() & ADDR_MASK))
{
MS_ERROR("STEP's address mismatches with 1st non-executed StoreIssue entry. tid=%d PA=%llx", tid, addr);
return;
}
// #ifdef N2MODEL
// if (rif_.isPartialStore(xact.asi())) {
// rdata[i] = align8byte (rdata[i], rsize, addr & ~ADDR_MASK);
// } else {
// rdata[i] = align8byte (rdata[i], ii->getSize(), ii->getAddr() & ~ADDR_MASK);
// }
// #else
// rdata[i] = align8byte (rdata[i], ii->getSize(), ii->getAddr() & ~ADDR_MASK);
// #endif
/* It should be fine to use xact info to aling data */
rdata[i] = align8byte (rdata[i], rsize, addr & ~ADDR_MASK);
/* size_vector exists only in MemorySync model */
mask = byteMask(ii->getVbyte());
MSYNC_DEBUG(1, "STEP (preMemAcc) cid=%d tid=%d iseq=%lld va=%#llx pa=%#llx data-ref=%#llx data-rtl=%#llx mask=%#llx",
tid/NSTRANDS_PER_CORE, tid, ii->getIseq(), rva + (i << 3), addr, rdata[i],
ii->getData(), mask);
/**********************************************************************************
* Note that in N1 swerver-memory.cc environment, store data is not always known
* before the SSTEP command. (Data is known @PCX_L2 stage that can happen after
* SSTEP). In this case, the swerver-memory.cc saves the data obtained at SSTEP
* to thrd_write_buf, i.e., STB. So, a load with STB Bypass can still get the
* correct data.
*
* In order to make this memory sync model work in that environment, the data in
* the STB is also updated, the state then is changed to LS_RDATA.
**********************************************************************************/
if (ii->isDataValid()) { // indicates data is set, in N2, StoreIssue should set state to LS_TDATA
if ((rdata[i] & mask) != (ii->getData() & mask)) {
MS_ERROR (" (Store Data) cid=%d tid=%d va=%#llx pa=%#llx data-ref=%#llx data-rtl=%#llx mask=%#llx",
tid/NSTRANDS_PER_CORE, tid, rva + (i << 3), addr,
rdata[i], ii->getData(), mask);
return;
}
} else { // in N2, this should not happen
ii->setData(rdata[i]); // ii points to the store buffer entry
ii->setState(LS_RDATA); // set data from Reference Model
/* Does the following statements needed for N1, definitely not needed for N2? TPS, 6/16/04 */
// if (ii->isLink2Valid()) {
// ii->getLink2()->setData(data);
// }
}
ii->setExecuted(true);
if (ii->isLinkValid()) {
(ii->getLink())->setExecuted(true);
}
/* Manage the rmoStb_ */
if (ii->isRMOstore() && ii->isLinkValid()) {
rmostb.erase(ii);
}
/* Two conditions must be satisfied before a store buffer entry is removed:
1). must be executed by the reference model, 2). must be acked
Except for atomic instruction, all stores will have STEP before ACK.
If this happens, perform popFront() here. Others will perform popFront()
when they receive ACK.
*/
}
xact.access(MemoryTransaction::WRITE | MemoryTransaction::NOP);
/* in order not to alter the memory access, no mae_.popFront() and write memory here. */
/* Don't know why the above statement is here? Add anyway in order to speed up the mab
retirement so that the self-modifying code will not cause problem - 6/3/04 */
vector<MemoryAccessEntry> wdbuf; // buffer for writing data back to memory
mab_.popFront(&wdbuf, tsoChecker_);
/* write STORE_COMMIT data to memory */
bufWriteMemory(wdbuf);
return;
}
/************************************************************************************
* postMemoryAccess()
* - takes the data stored in the memory sync model, and
* - puts the data back to the MemoryTrasaction so that the reference
* model actually read data provided by the memory sync model
*
* The MemoryTransaction is used to store the data. The data is in
* 8-byte chunks, and each is arranged in big endian fashion. If the
* size is less than 8, only data[0] is used and the data is shifted
* to the right (toward least significant byte).
************************************************************************************/
void MemorySync::post_memory_access( void* msync_obj, MemoryTransaction &xact )
{
MemorySync* self = (MemorySync*)msync_obj;
if (self)
self->postMemoryAccess(xact);
}
void MemorySync::postMemoryAccess (MemoryTransaction& xact)
{
if (MemorySyncMessage::skipCallback == 1) {
//cerr << "ERROR: MSYNC-SKIP: T" << dec << rif_.getTid(xact) << " skip postMemoryAccess()\n";
return;
}
vector<MemoryAccessEntry> wdbuf; // buffer for writing data back to memory
list<MemoryAccessEntry>::iterator mii;
list<LoadStoreEntry>::iterator ii;
uint64_t mask;
uint64_t data;
uint64_t rpa = xact.paddr();
if (addrTrans)
{
rpa = addrTrans(rpa);
}
uint64_t rva = xact.vaddr();
uint8_t rsize = xact.size();
int tid;
int iter;
int i,j;
if ((rpa & 0xff00000000ULL) == 0x9000000000ULL) {
// Bench is not sending any ldst_sync messages if PA[39:32]=0x90.
// This is true for ST, STXA and LD, LDXA
return;
}
tid = xact.getStrand();
if (tid >= MAX_STRANDS) { // indicates the xact is not normal instruction access
return;
}
if (xact.referenceType() == MemoryTransaction::INSTR) {
MSYNC_DEBUG(1, "I-Fetch tid=%d va=%#llx pa=%#llx iword=%#010llx size=%d",
tid, rva, rpa, xact.getData(), rsize);
/* As of 5/18/04, only N1 sends Fetch related command */
#ifdef N1MODEL
if (rva == 0 && rpa == 0) return; // remove glitch during RESET_INTERRUPT
LoadStoreCmd cmd;
list<LoadStoreEntry>::iterator ife, ii;
list<MemoryAccessEntry>::iterator mae;
LoadStoreBuffer& ifb = ifb_[tid];
cmd.setAddr(rpa);
// ife = ifb.findMatchY2O(rpa, cmd.sz2szv(rsize, rpa & ~ADDR_MASK));
ife = ifb.find1stNonExeFetch (rpa);
// MS_ASSERT(ife != ifb.end(), "I-Fetch cannot find matched entry in FetchBuffer", cmd.getAddr());
if (ife == ifb.end()) {
if (MSYNC_DEBUGLevel != 88) {
MemorySyncMessage::warning("I-Fetch cannot find matched entry in FetchBuffer tid=%d va=%#llx pa=%#llx iword=%#010llx", tid, rva, rpa, xact.getData());
}
else {
MemorySyncMessage::warning("I-Fetch cannot find matched entry in FetchBuffer tid=%d va=%#llx pa=%#llx iword='%s'", tid, rva, rpa, Hv_InstructionWord::disassemble(xact.getData()).c_str());
}
return; // use the one in nas-memory
}
// if (ife != (ii = ifb.find1stNonExe())) {
if (ife != (ii = ifb.begin())) {
for (ii; ii != ife;) {
ii->setExecuted(true);
// MS_ASSERT(ii->isLinkValid(), "I-Fetch entry in FetchBuffer does not have FetchData", cmd.getAddr());
if (ii->isLinkValid()) // some fetch issue may not be real?
ii->getLink()->setExecuted(true);
MSYNC_DEBUG(1, "I-Fetch removes non-used Fetch Entries tid=%d pa=%#llx size=%d",
tid, ii->getAddr(), ii->getSize());
ii++;
ifb.popFront();
}
}
//#ifdef N1MODEL // There is cases in N1 that FetchIssue does not have FetchData, timing?
if (!ife->isLinkValid()) return;
//#endif
if (!ife->isLinkValid())
{
MS_ERROR("I-Fetch entry in FetchBuffer does not have FetchData. tid=%d PA=%llx", tid, cmd.getAddr());
return;
}
ife->setExecuted(true);
mae = ife->getLink();
mae->setExecuted(true);
data = align2addr(mae->getData(), 4, (rpa & ~ADDR_MASK));
if (data != xact.getData() && !xact.noWriteXact()) {
MemorySyncMessage::warning("I-Fetch detects on-the-fly modified code tid=%d va=%#llx pa=%#llx i-ref=%#10llx i-rtl=%#10llx",
tid, rva, rpa, xact.getData(), data);
xact.setData(data);
}
ifb.popFront();
mab_.popFront(&wdbuf, tsoChecker_);
bufWriteMemory(wdbuf);
#endif // #ifdef N1MODEL
return;
}
// if (xact.noOperationXact()) {
// return;
// }
/**********************************************************************************
* In Riesling, atomic is performed in two transactions, one read and one write.
* This following checks return normal store as well as the store part of an atomic
**********************************************************************************/
// if (xact.writeXact()) {
// return;
// }
if (!xact.readXact()) {
return;
}
if (xact.internalXact()) {
return;
}
if (xact.tablewalk()) {
// accessing TSB, look for matching ST_COMMIT
uint64_t data = mab_.getL2Data(mab_.end(), tid/NSTRANDS_PER_CORE, tid, rpa, true);
xact.setData(0, data);
return;
}
/**********************************************************************************
* In Riesling, partial store sends load and store transaction.
* The first load should be ignored in this model, the data check in store part
* should check only those indicated by the size_vector.
**********************************************************************************/
//TODO why do we comment this out? 11/22/06
//if (rif_.isPartialStore(xact.asi())) {
// return;
//}
/* read and atomic instructions */
LoadStoreBuffer& ldb = ldb_[tid];
MSYNC_DEBUG(1, "STEP (postMemAcc) tid=%d va=%#llx pa=%#llx size=%d atomic=%d", tid, rva, rpa, rsize, (int) xact.atomicXact());
int sz;
if (rsize > 8) {
if (rsize != 64 && rsize != 16) {
MS_ERROR("Unsupported load data size %d", xact.size());
return;
}
iter = rsize / 8;
sz = 8; // each one equals 8 bytes
} else {
iter = 1;
sz = rsize;
}
uint64_t addr;
for (int i = 0; i < iter; i++) {
addr = rpa + (i << 3);
if ((addr & IO_ADDR_BIT_MASK) == IO_ADDR_BIT_MASK) { // IO address
ii = ldb.find1stNonExeMatchedAddr(addr); // IO can be out-of-order
} else {
ii = ldb.find1stNonExe();
}
MSYNC_DEBUG (4, "ldb=%s", ldb.toString().c_str());
if (ii == ldb.end())
{
MS_ERROR("STEP failed to find LoadIssue entry (possibly DUT took trap & Riesling did not). tid=%d PA=%llx", tid, addr);
return;
}
/* Note that RTL does not check load address, MemorySync performs
this additional check for completion */
if (addr != ii->getAddr()) {
MS_ERROR (" Load Address Mismatch cid=%d tid=%d va=%#llx pa-ref=%#llx pa-rtl=%#llx",
tid/NSTRANDS_PER_CORE, tid, rva + (i << 3), addr, ii->getAddr());
return;
}
if (addr != ii->getAddr())
{
MS_ERROR("STEP's address mismatches with the 1st non-executed Load. tid=%d PA=%llx", tid, addr);
return;
}
if (!ii->isLinkValid())
{
MS_ERROR("STEP's corresponding LoadData has not yet been issued. tid=%d PA=%llx", tid, addr);
return;
}
if (xact.atomicXact()) { // this assert is to make sure we can use xact.atomic() later
if (ii->getItype() != ITYPE_ATOMIC)
{
MS_ERROR("STEP (postMemAcc) atomic matches non-atomic load entry. tid=%d PA=%llx", tid, addr);
return;
}
}
// If Load to I/O space (PA[39]=1), Riesling will see the following
// messages
// LDISSUE
// LDDATA
// [CSR_READ] [Optional]
// SSTEP
//
// Riesling will have to queue up the CSR_READ messages from the Bench.
// When Riesling sees an SSTEP for an Load to I/O, it must first
// process the LDDATA normally. Then, if the oldest CSR_READ in the
// queue has a matching PA, use it to override the LDDATA value and
// pop it from the queue. If oldest CSR_READ does not have a matching
// PA, then use LDDATA value and do not pop from queue.
// xact contains data from either memory or I/O address follow-me, we
// cannot tell which here.
//TODO so we can be in trouble here, e.g.
// LDISSUE
// LDDATA
// STCOMMIT
// SSTEP
// in this case the data from memory (there is no csr_read follow-me) can
// be different to the value in LDDATA, yet we cannot tell whether the
// value is from follow-me or not.
// xact.getData() will conduct data re-arrangement according to the
// related instruction, we don't want that here, we only want the original
// raw data, so use no-fault version of getData().
//TODO there is no getDataNF(), is getData() good enough?
uint64_t memData = xact.getData(i);
// if the load entry's data comes from a store_commit entry, make sure
// that entry is indeed executed, it may happen that the entry is voided
// due to error injection
//===> shouldn't we mark it as 'popped' in such case? 2/15/06
if (ii->getLink()->getDsrc() == DSRC_L2_MEMORY) {
if (!xact.followmeXact()) {
// if the xact data is coming from a follow-me, don't do another
// read here, we may pop out another follow-me (intended for the
// next load)
//ii->getLink()->setData(mab_.getL2Data(ii->getLink(), tid/NSTRANDS_PER_CORE, tid, addr, true));
ii->getLink()->setData(mab_.getL2Data(ii->getLink(), tid/NSTRANDS_PER_CORE, tid, addr));
}
}
// now we should have the correct value
data = ii->getLink()->getData();
uint64_t ldData = data;
if (xact.followmeXact()) {
//TODO if memData is not from csr_read follow-me, this is wrong,
// do we have a way to tell? 10/26/05
// memData from xact should already be aligned to address/size,
// no need to go through align2addr() again.
data = memData;
}
else {
data = align2addr(data, sz, (addr & ~ADDR_MASK));
}
MSYNC_DEBUG(1, "STEP (postMemAcc) cid=%d tid=%d iseq=%lld va=%#llx pa=%#llx data-ref=%#llx (aligned=%#llx) data-rtl=%#llx size=%d ---> use data=%#llx", tid/NSTRANDS_PER_CORE, tid, ii->getIseq(), rva + (i << 3), addr, memData, align2addr(memData, sz, (addr & ~ADDR_MASK)), align2addr(ldData, sz, (addr & ~ADDR_MASK)), sz, data);
if (!xact.noWriteXact()) {
xact.setData(i, data);
}
else {
// although the noWriteXact() indicates we must use the data in xact,
// the data still has to follow aligning rules.
//---> memData alignment should be handled in xact already, no
// need to do it again.
//xact.setData(i, align2addr(memData, sz, (addr & ~ADDR_MASK)));
xact.setData(i, memData);
}
ii->setExecuted(true);
ii->getLink()->setExecuted(true);
mii = ii->getLink();
ldb.erase(ii);
}
/**********************************************************************************
* The following code is to handle CAS instruction
* Two issues related to the CAS:
* 1. The Memory Sync model needs to know the comparison results so that it knows
* which data to compare
* The solution employed is 1) the StoreIssue of a CAS instruction sends r[rd]
* in the <data> field, and 2) if mismatch, reset the StoreCommit size_vector
* to zero.
*
* N1 is different. In ni environment, the atomic STEP is sent before the
* STORE_COMMIT. Hence, for N1, CAS mismatch will reset
* the <size_vector> of the StoreIssue to 0, rather than StoreCommit. However,
* the StoreCommit will then copy this value to its field.
*
* 2. The Riesling does not send a second write memory transaction if the CAS
* comparison results in a false.
* In this case, the the executed_ bit of the StoreIssue and StoreCommit must be
* set in this method. (Normally, the executed_ bit of a StoreIssue or StoreCommit
* is set in preMemoryAccess() when a write memory transaction is issued by the
* Riesling model.
*
* Therefore, in this atomic-load transaction, if the size-vector is 0, it
* indicates no write transaction, and the execute_ bit will be set accordingly.
**********************************************************************************/
if (xact.atomicXact()) {
list<LoadStoreEntry>::iterator slink;
LoadStoreBuffer& stb = stb_[tid];
LoadStoreBuffer& rstb = retStb_[tid];
/* find the corresponding StoreIssue */
slink = rstb.find1stNonExe(); // must step in order
if (slink == rstb.end()) {
slink = stb.find1stNonExe();
if (slink == stb.end())
{
MS_ERROR("STEP(at) failed to find non-executed StoreIssue entry (possibly DUT took trap & Riesling did not). tid=%d PA=%llx", tid, addr);
return;
}
}
if (rpa != slink->getAddr())
{
MS_ERROR("STEP(at) mis-matches addr with the 1st non-executed StoreIssue. tid=%d PA=%llx", tid, addr);
return;
}
if (slink->getItype() != ITYPE_ATOMIC)
{
MS_ERROR("STEP(at) found 1st non-executed StoreIssue non-atomic. tid=%d PA=%llx", tid, addr);
return;
}
/* set the executed_ bit if needed */
#ifdef N1MODEL
MSYNC_DEBUG(2, "PostMemAccess atomic sizeV=%#x", (int) slink->getSizeV());
if (slink->getSizeV() == 0) { // CAS comparison is false => no write necessary
slink->setExecuted(true);
if (slink->isLinkValid()) {
slink->getLink()->setExecuted(true);
/* Fill the store part the same data written info as it load part --- TPS 9/10/04 */
slink->getLink()->setData(mii->getData());
slink->getLink()->setSizeV(mii->getSizeV());
}
}
#else
if (!slink->isLinkValid())
{
MS_ERROR("STEP(at) misses StoreCommit. tid=%d PA=%llx", tid, addr);
return;
}
if (slink->getLink()->getSizeV() == 0) { // CAS comparison is false => no write necessary
slink->setExecuted(true);
slink->getLink()->setExecuted(true);
/* Fill the store part the same data written info as it load part --- TPS 9/10/04 */
slink->getLink()->setData(mii->getData());
slink->getLink()->setSizeV(mii->getSizeV());
}
#endif
MSYNC_DEBUG(2, "slink=%s", (slink->toString()).c_str());
}
mab_.popFront(&wdbuf, tsoChecker_);
/* write STORE_COMMIT data to memory */
bufWriteMemory(wdbuf);
return;
}
/* write STORE_COMMIT data to memory */
void
MemorySync::bufWriteMemory (vector<MemoryAccessEntry>& wdbuf)
{
    // Flush every completed STORE_COMMIT entry in wdbuf to the backing
    // (reference-model) memory, then discard the buffer contents.
    //
    // An entry is skipped when:
    //   - its size vector is 0 (nothing to store), or
    //   - it is the store half of a CAS whose comparison failed
    //     (ITYPE_ATOMIC with isSwitchData() == false), so no memory
    //     update must occur.
    //
    // NOTE: the original code walked the vector with a redundant integer
    // counter alongside the iterator; a single iterator loop visits the
    // same entries in the same order.
    for (vector<MemoryAccessEntry>::iterator wi = wdbuf.begin(); wi != wdbuf.end(); ++wi) {
        if (wi->getSizeV() != 0 &&                                      // (sizeV == 0) => no need to store
            !(wi->getItype() == ITYPE_ATOMIC && !wi->isSwitchData())) { // cas false, no need to store
            // writes always target the aligned 8-byte chunk
            rif_.writeMemory(wi->getCoreId(), wi->getThrdId(), (wi->getAddr() & ADDR_MASK), wi->getData(), 8);
        }
    }
    wdbuf.clear();
}
void
MemorySync::setTestBenchData (uint32_t tid, uint64_t addr, uint64_t data, bool rmo)
{
    // Attach the store data observed by the testbench (RTL) to the oldest
    // store entry that still needs testbench data, or — when the Reference
    // Model already supplied data in STEP (LS_RDATA) — cross-check the two
    // values byte-by-byte under the entry's valid-byte mask.
    //
    // tid  - strand id of the store
    // addr - physical address (translated below when a translation hook
    //        is installed)
    // data - 8-byte store data from the testbench
    // rmo  - true when the bench flags this store as RMO; RMO stores are
    //        looked up in the dedicated rmoStb_ first, falling back to the
    //        main STB.
    if (addrTrans)
    {
        addr = addrTrans(addr);
    }
    LoadStoreBuffer& stb = stb_[tid];
    LoadStoreBuffer& rmostb = rmoStb_[tid];
    list<LoadStoreEntry>::iterator ii;
    if (rmo) {
        ii = rmostb.findNeedTDataO2Y(addr);
        if (ii == rmostb.end()) {
            // Not (yet) in the RMO store buffer; it may still sit in the
            // main STB.
            ii = stb.findNeedTDataO2Y(addr);
            // BUGFIX: this result comes from stb, so it must be compared
            // against stb.end(); the original compared against
            // rmostb.end(), which is an iterator of a different list and
            // never detects the not-found case.
            if (ii == stb.end())
            {
                MS_ERROR("setTestBenchData failed to find RMO entry in (RMO)STB. tid=%d PA=%llx", tid, addr);
                return;
            }
            if (!ii->isRMOstore())
            {
                MS_ERROR("setTestBenchData expected match-address entry in STB to be RMO. tid=%d PA=%llx", tid, addr);
                return;
            }
        }
    } else {
        ii = stb.findNeedTDataO2Y(addr);
        if (ii == stb.end())
        {
            MS_ERROR("setTestBenchData failed to find entry in (RMO)STB. tid=%d PA=%llx", tid, addr);
            return;
        }
    }
    if (ii->isDataValid()) {
        // Data already present: can only be RDATA, i.e. Reference Model
        // data set in STEP — verify the bench value against it.
        if (ii->getState() != LS_RDATA)
        {
            MS_ERROR("The entry is not in LS_RDATA state");
            return;
        }
        uint64_t mask = byteMask(ii->getVbyte());
        if ((data & mask) != (ii->getData() & mask)) {
            // BUGFIX: mask is a uint64_t, so it must be printed with
            // %#llx (the original %#x invokes varargs UB on 64-bit).
            MS_ERROR (" (Store Data) cid=%d tid=%d pa=%#llx data-ref=%#llx data-rtl=%#llx mask=%#llx",
                      ii->getCoreId(), ii->getThrdId(), ii->getAddr(),
                      ii->getData(), data, mask);
            return;
        }
    } else {
        ii->setData(data);
        /* Does the following statements needed for N1, definitely not needed for N2? TPS, 6/16/04 */
        // if (ii->isLink2Valid()) {
        //     ii->getLink2()->setData(data);
        // }
    }
    ii->setState(LS_TDATA);
}
/* Assume data is aligned to an 8-byte chunk; the result has
1. the data shifted to the least significant bytes, and
2. the higher bytes outside the size range set to 0, which is
required when this function is used in postMemoryAccess()
to provide data back to the Riesling Reference Model, since
Riesling zeroes out all bytes not in the requested data size
range.
*/
uint64_t
MemorySync::align2addr (uint64_t data, uint8_t size, uint32_t addr_offset)
{
    // Shift a value out of an aligned 8-byte chunk down to the least
    // significant bytes and zero every byte above the requested size.
    // addr_offset is the byte offset of the access within the chunk;
    // offset 0 corresponds to the most significant byte of the chunk.
    if (size == 8) {
        assert ((addr_offset & 0x7ULL) == 0);
        return data;
    }
    if (size == 4) {
        assert ((addr_offset & 0x3ULL) == 0);
        int shift = 32 - ((addr_offset & 0x4ULL) << 3);
        return (data >> shift) & 0xffffffffULL;
    }
    if (size == 2) {
        assert ((addr_offset & 0x1ULL) == 0);
        int shift = 48 - ((addr_offset & 0x6ULL) << 3);
        return (data >> shift) & 0xffffULL;
    }
    if (size == 1) {
        int shift = 56 - (addr_offset << 3);
        return (data >> shift) & 0xffULL;
    }
    // unsupported access size
    assert(0);
    return data;
}
/* assume data is at the least significant bytes */
uint64_t
MemorySync::align8byte (uint64_t data, uint8_t size, uint32_t addr_offset)
{
    // Inverse of align2addr(): move a right-justified value of the given
    // size up into its byte position within an aligned 8-byte chunk
    // (offset 0 is the most significant byte). Assumes the input data
    // sits in the least significant bytes.
    if (size == 64 || size == 16 || size == 8) {
        assert ((addr_offset & 0x7ULL) == 0);
        return data;
    }
    if (size == 4) {
        assert ((addr_offset & 0x3ULL) == 0);
        return data << (32 - ((addr_offset & 0x4ULL) << 3));
    }
    if (size == 2) {
        assert ((addr_offset & 0x1ULL) == 0);
        return data << (48 - ((addr_offset & 0x6ULL) << 3));
    }
    if (size == 1) {
        return data << (56 - (addr_offset << 3));
    }
    if (size == 0) {
        // size 0 => nothing to position, pass the value through
        return data;
    }
    // unsupported access size
    assert(0);
    return data;
}
uint64_t
MemorySync::merge (uint64_t todata, uint64_t fromdata, uint8_t mgvec)
{
uint64_t data;
uint64_t byteMask1 = byteMask(~mgvec);
uint64_t byteMask2 = byteMask(mgvec);
data = (todata & byteMask1) | (fromdata & byteMask2);
MSYNC_DEBUG(4, "merge: todata=0x%llx fdata=0x%llx merge=0x%x result=0x%llx",
todata, fromdata, (int) mgvec, data);
return (data);
}
uint64_t
MemorySync::byteMask(const uint8_t vbyte) {
    // Expand an 8-bit byte-valid vector into a 64-bit mask: bit i of
    // vbyte set => byte i of the result is 0xff (bit 0 maps to the
    // least significant byte).
    uint64_t mask = 0ull;
    for (int bit = 7; bit >= 0; bit--) {
        mask <<= 8;
        if ((vbyte >> bit) & 0x1) {
            mask |= 0xffull;
        }
    }
    return mask;
}
uint64_t
MemorySync::getL1Data(LoadStoreCmd& cmd)
{
    // Return the L1 data for cmd: prefer a matching entry in the memory
    // access buffer; otherwise fall back to the aligned 8-byte chunk in
    // the reference-model memory.
    list<MemoryAccessEntry>::iterator entry = mab_.findL1DataEntry(cmd);
    if (entry != mab_.end()) {
        return entry->getData();
    }
    return rif_.readMemory(cmd.getCoreId(), cmd.getThrdId(), cmd.getAddr() & ADDR_MASK, 8); // read aligned 8 bytes
}
uint64_t
MemorySync::getL2Data(list<MemoryAccessEntry>::iterator from, LoadStoreCmd& cmd)
{
    // Return the L2 data for cmd, searching the memory access buffer
    // starting at 'from'; fall back to the aligned 8-byte chunk in the
    // reference-model memory when no entry matches.
    list<MemoryAccessEntry>::iterator entry = mab_.findL2DataEntry(from, cmd);
    if (entry != mab_.end()) {
        return entry->getData();
    }
    return rif_.readMemory(cmd.getCoreId(), cmd.getThrdId(), cmd.getAddr() & ADDR_MASK, 8); // read aligned 8 bytes
}
uint64_t
MemorySync::getL1Instr(LoadStoreCmd& cmd)
{
    // Return the L1 instruction chunk for cmd: prefer a matching entry in
    // the memory access buffer; otherwise read the aligned 8-byte chunk
    // from the reference-model memory.
    //
    // Cleanup: removed an unused local copy of the thread id that the
    // original declared and never read.
    list<MemoryAccessEntry>::iterator mae = mab_.findL1InstrEntry(cmd);
    if (mae == mab_.end()) {
        return (rif_.readMemory(cmd.getCoreId(), cmd.getThrdId(), cmd.getAddr() & ADDR_MASK, 8)); // read aligned 8 bytes
    } else {
        return (mae->getData());
    }
}
//=============================================================================
//=============================================================================
// Permanently disable all msync-related callback processing by setting
// the global skip flag; MemorySyncMessage consults this flag before
// dispatching callbacks.
void
MemorySync::nullMsyncCallback()
{
    // turn off msync related callbacks
    MemorySyncMessage::skipCallback = 1;
}
//=============================================================================
//=============================================================================
// Flush per-strand msync state (e.g. after the bench reports a pipeline
// flush for strand tid). Only the load buffer and the strand's entries
// in the memory access buffer are emptied; the retired/RMO store buffers
// are merely marked for popping. Statement order here is significant —
// see the note below.
void
MemorySync::flushMsyncCallback(int tid)
{
    MSYNC_DEBUG(1, "flushMsyncCallback( tid=%d )", tid);
    ldb_[tid].empty();
    //TODO not sure if we should empty the following buffers as well
    //stb_[tid].empty();
    //ifb_[tid].empty();
    mab_.empty(tid);
    // we rely on the entry in MAB to determine which entry needs to be removed
    // from retStb and rmoStb, so the "mab_.empty" must come first.
    retStb_[tid].markPop();
    rmoStb_[tid].markPop();
}
//=============================================================================
//=============================================================================
void
MemorySync::handleLoadPop(int tid)
{
MSYNC_DEBUG(1, "handleLoadPop( MEM_LD_POP, tid=%d )", tid);
list<LoadStoreEntry>::iterator ii = ldb_[tid].queryBack();
if (ii == ldb_[tid].end())
{
MS_ERROR("No load entry to be popped");
return;
}
int count;
switch ((ii)->getItype()) {
case ITYPE_BLOCK_LOAD:
count = 8;
break;
case ITYPE_QUAD_LOAD:
count = 2;
break;
default:
count = 1;
break;
}
if (count > ldb_[tid].size())
{
MS_ERROR("Load buffer does not have enough entries to be popped, ldb.size=%d pop=%d", ldb_[tid].size(), count);
return;
}
ldb_[tid].popBack(count);
mab_.popBack(tid, count);
}
//=============================================================================
//=============================================================================
void
MemorySync::handleStorePop(int tid)
{
MSYNC_DEBUG(1, "handleStorePop( MEM_ST_POP, tid=%d )", tid);
list<LoadStoreEntry>::iterator ii = stb_[tid].queryBack();
if (ii == stb_[tid].end())
{
MS_ERROR("No store entry to be popped");
return;
}
int count;
switch ((ii)->getItype()) {
case ITYPE_BLOCK_STORE:
count = 8;
break;
default:
count = 1;
break;
}
if (count > stb_[tid].size())
{
MS_ERROR("Store buffer does not have enough entries to be popped, stb.size=%d pop=%d", stb_[tid].size(), count);
return;
}
stb_[tid].popBack(count);
//TODO do we need to pop store entries in MAB?
}
/******************************************************************************
compare a 64-byte data block in memory with the data provided by testbench,
throw error if miscompare.
******************************************************************************/
void
MemorySync::handleMemoryCheck(uint64_t paddr, uint64_t* data, int dataSize)
{
    // Compare dataSize consecutive 8-byte words starting at paddr against
    // the testbench-provided data block; report an error on the first
    // mismatch. paddr must be aligned to the full block size.
    if (addrTrans)
    {
        paddr = addrTrans(paddr);
    }
    if ((paddr % (dataSize*8)) != 0)
    {
        MS_ERROR("paddr=%#llx is not %d-byte aligned", paddr, (dataSize*8));
        return;
    }
    for (int word = 0; word < dataSize; word++) {
        // DMA_STORE always goes through msync, so MEM_CHECK should go
        // through msync to find a matched entry (not a raw memory read).
        //uint64_t memData = rif_.readMemory(0, 0, (paddr+8*word), 8);
        uint64_t expected = mab_.getL2Data(mab_.end(), 0, 0, (paddr+8*word), true);
        if (data[word] != expected)
        {
            MS_ERROR("memory-check mismatch at addr=%#llx, RTL=%#llx, riesling=%#llx", (paddr+8*word), data[word], expected);
            return;
        }
    }
}
//=============================================================================
//=============================================================================
void
MemorySync::flushAll()
{
    // Drop every outstanding msync entry: the shared memory access buffer
    // plus all per-strand load, store, retired-store, RMO-store and
    // instruction-fetch buffers.
    mab_.empty();
    for (int strand = 0; strand < MAX_STRANDS; strand++)
    {
        ldb_[strand].empty();
        stb_[strand].empty();
        retStb_[strand].empty();
        rmoStb_[strand].empty();
        ifb_[strand].empty();
    }
}
//=============================================================================
// sniper is used by testbench to generate load/store traffic from
// simulated core(s), the goal is to test multi-core load/store traffic
// before multi-core RTL is available. The faked load/store traffic will
// be communicated with riesling through pli-msync commands. With real
// load/store, there are real load/store instructions to match/process them,
// but that won't be the case for sniper-generated load/store, so a new pli
// command SSTEP_SNIPER is added, it will be used to match up sniper-generated
// load/store events in msync, the SSTEP_SNIPER command will not cause any
// cpu architecture state change.
//=============================================================================
// SSTEP_SNIPER: match up a sniper-generated (bench-simulated core)
// load/store event in msync. Unlike a normal SSTEP there is no Riesling
// instruction behind it, so only the msync bookkeeping and data checks
// are performed here; no CPU architectural state changes.
//   tid   - strand id of the faked access
//   addr  - physical address of the access
//   itype - instruction type (load/store/atomic variants)
//   data  - data value supplied by the testbench for checking
// An ITYPE_ATOMIC event goes through BOTH the load section and the store
// section below, mirroring how an atomic is split in the real flow.
void MemorySync::handleSniper(int tid, uint64_t addr, INSTR_TYPE itype, uint64_t data)
{
    MSYNC_DEBUG(4, "SNIPER: STEP (postMemAcc) tid=%d pa=%#llx itype=%d data=%#llx", tid, addr, (int)itype, data);
    /* ---- load section: plain/block/double/quad loads and the load
       half of an atomic ---- */
    if ((itype == ITYPE_LOAD) ||
        (itype == ITYPE_BLOCK_LOAD) ||
        (itype == ITYPE_DOUBLE_LOAD) ||
        (itype == ITYPE_QUAD_LOAD) ||
        (itype == ITYPE_ATOMIC))
    {
        list<LoadStoreEntry>::iterator ii;
        LoadStoreBuffer& ldb = ldb_[tid];
        if ((addr & IO_ADDR_BIT_MASK) == IO_ADDR_BIT_MASK)
        { // IO address, IO can be out-of-order
            ii = ldb.find1stNonExeMatchedAddr(addr);
        }
        else
        {
            // memory loads must complete in program order
            ii = ldb.find1stNonExe();
        }
        MSYNC_DEBUG (4, "ldb=%s", ldb.toString().c_str());
        if (ii == ldb.end())
        {
            MS_ERROR("SNIPER: STEP failed to find LoadIssue entry (possibly DUT took trap & Riesling did not). tid=%d PA=%llx", tid, addr);
            return;
        }
        // Note that RTL does not check load address, MemorySync performs
        // this additional check for completion
        if (addr != ii->getAddr())
        {
            MS_ERROR("SNIPER: STEP's address mismatches with the 1st non-executed Load. tid=%d PA=%llx", tid, addr);
            return;
        }
        // the LoadData (MAB) entry must already be linked to the LoadIssue
        if (!ii->isLinkValid())
        {
            MS_ERROR("SNIPER: STEP's corresponding LoadData has not yet been issued. tid=%d PA=%llx", tid, addr);
            return;
        }
        if (itype == ITYPE_ATOMIC)
        {
            // this assert is to make sure we can check atomic later
            if (ii->getItype() != ITYPE_ATOMIC)
            {
                MS_ERROR("SNIPER: STEP (postMemAcc) atomic matches non-atomic load entry. tid=%d PA=%llx", tid, addr);
                return;
            }
        }
        // refresh the entry data from the L2/memory view when its source
        // is L2/memory (it may have been updated by newer store commits)
        if (ii->getLink()->getDsrc() == DSRC_L2_MEMORY)
        {
            ii->getLink()->setData(mab_.getL2Data(ii->getLink(), tid/NSTRANDS_PER_CORE, tid, addr));
        }
        // msync data does not match the one provided by testbench
        if (ii->getData() != data)
        {
            MS_ERROR("SNIPER: load data (%#llx) mismatches with testbench data (%#llx). tid=%d PA=%llx", ii->getData(), data, tid, addr);
            return;
        }
        // retire the load: mark both the LoadIssue and its linked MAB
        // entry as executed, keep the MAB iterator for the atomic case
        ii->setExecuted(true);
        ii->getLink()->setExecuted(true);
        list<MemoryAccessEntry>::iterator mii = ii->getLink();
        ldb.erase(ii);
        if (itype == ITYPE_ATOMIC)
        {
            // atomic: also locate and possibly retire the matching
            // StoreIssue/StoreCommit (see the CAS discussion in the
            // non-sniper STEP path)
            list<LoadStoreEntry>::iterator slink;
            LoadStoreBuffer& stb = stb_[tid];
            LoadStoreBuffer& rstb = retStb_[tid];
            // find the corresponding StoreIssue
            slink = rstb.find1stNonExe(); // must step in order
            if (slink == rstb.end())
            {
                slink = stb.find1stNonExe();
                if (slink == stb.end())
                {
                    MS_ERROR("SNIPER: STEP(at) failed to find non-executed StoreIssue entry (possibly DUT took trap & Riesling did not). tid=%d PA=%llx", tid, addr);
                    return;
                }
            }
            if (addr != slink->getAddr())
            {
                MS_ERROR("SNIPER: STEP(at) mis-matches addr with the 1st non-executed StoreIssue. tid=%d PA=%llx", tid, addr);
                return;
            }
            if (slink->getItype() != ITYPE_ATOMIC)
            {
                MS_ERROR("SNIPER: STEP(at) found 1st non-executed StoreIssue non-atomic. tid=%d PA=%llx", tid, addr);
                return;
            }
            if (!slink->isLinkValid())
            {
                MS_ERROR("SNIPER: STEP(at) misses StoreCommit. tid=%d PA=%llx", tid, addr);
                return;
            }
            if (slink->getLink()->getSizeV() == 0)
            {
                // CAS comparison is false => no write necessary
                slink->setExecuted(true);
                slink->getLink()->setExecuted(true);
                // Fill the store part the same data written info as it load part
                // --- TPS 9/10/04
                slink->getLink()->setData(mii->getData());
                slink->getLink()->setSizeV(mii->getSizeV());
            }
            MSYNC_DEBUG(2, "slink=%s", (slink->toString()).c_str());
        }
        // process completed entries in MAB, write STORE_COMMIT data to memory
        // if ready
        vector<MemoryAccessEntry> wdbuf;
        mab_.popFront(&wdbuf, tsoChecker_);
        bufWriteMemory(wdbuf);
    }
    /* ---- store section: plain/block/init stores and the store half of
       an atomic ---- */
    if ((itype == ITYPE_STORE) ||
        (itype == ITYPE_BLOCK_STORE) ||
        (itype == ITYPE_STORE_INIT) ||
        (itype == ITYPE_ATOMIC))
    {
        list<LoadStoreEntry>::iterator ii;
        LoadStoreBuffer& stb = stb_[tid];
        LoadStoreBuffer& rstb = retStb_[tid];
        LoadStoreBuffer& rmostb = rmoStb_[tid];
        // atomics are tracked in the retired-store buffer, other stores
        // may live in the RMO store buffer
        LoadStoreBuffer& ret_or_rmo_stb = (itype == ITYPE_ATOMIC) ? rstb : rmostb;
        MSYNC_DEBUG(2, "SNIPER: rstb=%s", rstb.toString().c_str());
        MSYNC_DEBUG(2, "SNIPER: stb=%s", stb.toString().c_str());
        MSYNC_DEBUG(2, "SNIPER: rmostb=%s", rmostb.toString().c_str());
        ii = ret_or_rmo_stb.find1stNonExe();
        if (ii == ret_or_rmo_stb.end())
        {
            // fall back to the main store buffer
            ii = stb.find1stNonExe();
            if (ii == stb.end())
            {
                if (itype == ITYPE_ATOMIC)
                {
                    MS_ERROR("SNIPER: STEP (store part of an atomic instr) failed to find match StoreIssue in STB. tid=%d PA=%llx", tid, addr);
                    return;
                }
                else
                {
                    MS_ERROR("SNIPER: STEP failed to find match StoreIssue in STB. tid=%d PA=%llx", tid, addr);
                    return;
                }
            }
        }
        // compare on the aligned 8-byte chunk address only
        if ((addr & ADDR_MASK) != (ii->getAddr() & ADDR_MASK))
        {
            MS_ERROR("SNIPER: STEP's address mismatches with 1st non-executed StoreIssue entry. tid=%d PA=%llx", tid, addr);
            return;
        }
        ii->setExecuted(true);
        if (ii->isLinkValid())
        {
            (ii->getLink())->setExecuted(true);
        }
        // RMO stores with a committed link are done; drop them now
        if (ii->isRMOstore() && ii->isLinkValid())
        {
            rmostb.erase(ii);
        }
        vector<MemoryAccessEntry> wdbuf;
        mab_.popFront(&wdbuf, tsoChecker_);
        // write STORE_COMMIT data to memory
        bufWriteMemory(wdbuf);
    }
}
void MemorySync::setCoreEnable(int node_index, uint64_t data)
{
    // Compress the 64-bit core-enable word into an 8-bit vector: bit i of
    // the result is set when byte i of data is non-zero. The vector is
    // stored as the invalidate-vector mask for the given node.
    uint8_t vec = 0;
    for (int byte_i = 0; byte_i < 8; byte_i++)
    {
        if (((data >> (8 * byte_i)) & 0xff) != 0)
        {
            vec |= (uint8_t)(0x1 << byte_i);
        }
    }
    inv_vec_mask[node_index] = vec;
}