legion/src/procs/sunsparc/include/ss_err_trap.h

/*
* ========== Copyright Header Begin ==========================================
*
* OpenSPARC T2 Processor File: ss_err_trap.h
* Copyright (c) 2006 Sun Microsystems, Inc.  All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES.
*
* The above named program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* The above named program is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*
* ========== Copyright Header End ============================================
*/
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "@(#)ss_err_trap.h      1.10    06/11/08 SMI"

#ifndef _SS_ERR_TRAP_H
#define _SS_ERR_TRAP_H

#ifdef  __cplusplus
extern "C" {
#endif

#if ERROR_TRAP_GEN      /* { */


/*
 *
 *
 *
                  Error Trap Generation Framework
                  ===============================

                      Implementation Details

Glossary of Terms:
------------------

"error events" - Data provided by user to describe the error/trap
   he or she wants Legion to inject (when/how/where/what).

"ASI override" - ASI/VA pair provided by the user along with
   optional value masks which tells Legion how to respond when SW
   accesses specific ASI/VAs.

"error entry" - CPU specific information describing a named error
   such as "DCDP" (data cache data parity) in terms of trap type,
   status register bit(s), error enable register bit(s), etc. Each
   CPU provides a list of all the error entries it supports.

"error status register" (ESR) - Most CPU errors are associated with a
   status register which will have one or more bits set when that
   error is detected so that the SW trap handlers know which error
   was encountered.

"error enabling register" (EER) - Most CPU errors in the UltraSPARC
   family are also associated with an error reporting register
   (which controls whether the error generates a trap) and/or an
   error recording register (which controls whether the error is
   detected). Both of these are referred to as error enabling
   registers.


High Level Summary:
-------------------

The user provides Legion (via the conf file) a list of "error events"
and optional "ASI overrides".

Legion examines the error event list and sets itself up to watch for
the trigger conditions specified in the error events.

When the trigger conditions are satisfied (e.g. reaching a certain
instruction count, loading/storing a certain address, or executing a
specified %pc) then Legion injects the specified error trap.

If additional error events have been specified, Legion sets itself
up once again to watch for the new trigger conditions.

The Legion code for injecting an error trap is responsible for
injecting the appropriate ESR bits and checking the appropriate
error enabling registers in order to determine whether or not to
post a trap, leave it pending, or just drop it.

For every non-memory ASI access from the simulated SW, Legion checks
it against the list of user provided ASI overrides and returns the
user specified value if a match is found.

The error trap generation framework also monitors the state of the
various ESRs and will repeatedly post traps to the CPU when
appropriate as long as the ESR is not cleared by the simulated SW.


Entry Points:
-------------

When the ERROR_TRAP_GEN compile flag is turned on, the error trap
injection framework is enabled. In order to keep the implementation
modular, easy to maintain, and common across CPUs, we have made an
effort to limit the number of entry points from normal Legion code
into the error trap injection framework. These entry points are
described here.

1) Parsing of user input - ss_parse()
While parsing the "processor" directive in the Legion conf file, if
we encounter an error_event {} or error_asi {} directive, we call into
the error trap injection framework's parsing functions to handle it.

2) CPU initialization - ss_init()
During CPU initialization, we call into the error trap injection
framework in order to initialize a few things. We make one chip
specific init call as well as one thread specific call per simulated
thread.

3) ASI access - ss_asi_access()
For all ASI accesses (other than memory ASI accesses such as
ASI_SECONDARY, ASI_BLK_S, ASI_REAL_MEM, etc.) we call into the error
trap injection framework to see whether the specific ASI/VA in
question (1) corresponds to one of the error status registers or
error enabling registers of this CPU or (2) matches one of the ASI
override entries provided by the user.

4) Instruction cycle count monitoring - ss_cycle_target_match()
In order to trigger the user specified error trap based on the given
conditions, we set certain cycle targets and watch for them.

5) Trigger the trap on:
   a) Load/Store operations -  LOAD_OP()/STORE_OP()
      Once the error trap cycle target has been matched, we check every
      load and store operation to see whether it is time to inject the
      error trap or not.
   b) %pc value - debug_breakpoint_cb()
      When a %pc trigger value is specified, we set a breakpoint at the
      %pc value we want to catch. We then wait for the instn_cnt
      to be reached (if specified) and once we have reached the specified
      instn_cnt, the next time we hit that breakpoint, we trigger the
      error trap. Other constraints will also determine whether we
      should trigger (i.e. trap_level, priv_level).
   When the user specified conditions have been met, we inject the error
   trap by calling trigger_error_trap() for the simcpu in question.

6) Processor State Changes - ss_check_interrupts()
Every time the processor state changes, we call into the error trap
injection framework to check whether there is an error event which
needs to be injected or an error trap which needs to be posted.

7) Taking a trap - ss_take_exception()
Every time the processor is about to take a trap, we call into the
error trap injection framework so that we know whether or not the
trap we injected was actually taken.


Error trap injection from start to finish:
------------------------------------------

Step 1 - Parse input
User input is parsed and added to Legion's list of error
events.

Step 2 - Assign error event to a CPU
The error event is assigned to a specific strand (the one specified
to encounter the error event) and the strand is configured to watch
for the trigger conditions associated with the error event.

Step 3 - Trigger detection
Once we detect that the trigger conditions have been satisfied,
Legion marks the error event as "triggered" and calls a routine to
trigger the error, trigger_error_trap().

Step 4 - Triggering the error
The error event trigger code is responsible for ensuring that only
one error is being "triggered" at a time per system. This means we
are serializing the error trap generation code starting from the
point where we "inject" the error and ending when the trap is
actually taken on the CPU. Normally this period is very short, but
can be longer for maskable trap types. This serialization is required
to ensure that we don't lose injected traps due to trap priority
conflicts.

If no error is currently being triggered on the system, we proceed
to call the injection code. Otherwise, if an error is already being
triggered, we simply return and the error event will be checked
after every CPU state change until we are able to inject the error.

Step 5 - Error injection
Based on the name of the error, we search the CPU specific table of
error entries and that error entry tells us all we need to know
about how to inject the error. Details we get from the error entry
include which error status bits to inject, which error enabling
registers to check, and which trap type to post or keep pending.

At this point, we also add any user specified ASI overrides
associated with this error to our master list of ASI overrides.

Depending on whether the error is persistent or not, we once again
set the CPU to monitor for the same trigger conditions (persistent
error events) or check the error event list for a new error.

Step 6 - CPU state change
At every CPU state change, we check our list of errors to see if
there are any error traps which we need to post due to the error
status bit(s) not being clear.

 *
 *
 */

/*
 * Forward declaration of the CPU error entry type. EIR_ARGS takes an
 * ss_error_entry_t pointer, and the injection routine pointer inside
 * ss_esr_inject_t uses EIR_ARGS, before struct ss_error_entry itself is
 * defined further down in this header.
 */
typedef struct ss_error_entry ss_error_entry_t;

/*
 * Name builders for the per-register routines:
 *   EAR(x) expands to ss_access_asi_x
 *   EIR(x) expands to ss_inject_esr_x
 */
#define EAR( _name )    ss_access_asi_##_name /* standard error ASI access routine naming convention */
#define EIR( _name )    ss_inject_esr_##_name /* standard ESR injection routine naming convention */

/*
 * Parameter lists shared by every access (EAR) and injection (EIR)
 * routine. They are also used for the function pointer members of
 * ss_err_reg_t, ss_eer_ctrl_t and ss_esr_inject_t, so routines and table
 * entries always agree on the signature.
 */
#define EAR_ARGS        simcpu_t *sp, int asi, tvaddr_t addr, bool_t is_load, uint64_t store_val, bool_t legion_access
#define EIR_ARGS        simcpu_t *sp, uint64_t mask, ss_error_entry_t *errp

/*
 * Expand to the function header of an access routine (returns uint64_t)
 * or an injection routine (returns bool_t) named after _n.
 */
#define EAR_DEFINITION( _n )    uint64_t EAR( _n )(EAR_ARGS)
#define EIR_DEFINITION( _n )    bool_t EIR( _n )(EIR_ARGS)

/*
 * Sentinel values and reserved names used by the error tables and the
 * error event parser.
 *
 * Negative values are parenthesized so that each macro expands to a
 * single primary expression regardless of the surrounding operators.
 *
 * NOTE(review): ADDR_NA is a signed -1; if it is compared against an
 * unsigned tvaddr_t the usual arithmetic conversions apply -- confirm
 * that this is what callers expect.
 */
#define ASI_NA          (-1)
#define ADDR_NA         (-1)
#define ANY_ERR_VA      1
#define INVALID_TRAP    (-1)
#define INVALID_ASI     (-1)

#define END_ERR_STRING  "done"
#define TRAP_ERR_STRING "trap-only"
#define TRAP_ONLY_TT    0
#define TARGET_MYSELF   (-1)    /* trap_target value: always the current strand */

/* Service Processor (SP) interrupt stuff */
#define SP_INTR_ERR_STRING "sp-intr"
#define SP_INTR_ONLY    (-1)
#define TARGET_SP       (-2)

/* This trap never gets taken.. generate SP interrupt instead */
#define SS_generate_SP_interrupt NULL

/*
 * There are two kinds of registers which we manage in Legion
 * for simulation of error traps. Both are accessed via ASI:
 *
 * 1) Error Status Registers (ESR) - This is where specific
 * errors are reported (usually one bit per error type) so
 * that SW can figure out what error type we encountered.
 *
 * 2) Error Enabling Registers (EER) - These control how the
 * CPU behaves after encountering an error (or injecting an
 * error trap in our case). There are two types of EERs:
 *
 *    Error Recording Registers - These control whether or
 *    not a given error is detected and logged.
 *
 *    Error Reporting Registers - These control whether or
 *    not a trap will be generated for a given error.
 *
 * We define a common struct for managing each one of these
 * error register types and each CPU module defines a list of
 * the ESRs and EERs it supports.
 *
 */
typedef struct {
        int             asi;            /* ASI through which the register is accessed */
        tvaddr_t        addr;           /* address of the register within that ASI */
        uint64_t        (*reg_access)(EAR_ARGS);        /* access routine for this register (EAR signature) */
} ss_err_reg_t;

/*
 * We want to come up with a standard structure which can be used to
 * describe all CPU errors for Niagara2, Rock, and possibly other
 * CPUs in a common way.
 *
 * Each CPU module will provide a table describing all the CPU errors
 * it supports and that way we can keep most CPU specific implementation
 * details in an easy to review/modify table and keep the error trap
 * injection framework code simple and common to all CPUs.
 *
 * Keeping that in mind, here are the basic ground rules and assumptions
 * we will be using to implement support for error trap injection
 * under Legion. These should all be supported by the UltraSPARC 2006
 * spec and/or the CPU PRMs.
 *
 * 1) All disrupting error traps are also conditioned by the PSTATE.IE
 *    bit when HPSTATE.HPRIV is set.
 *
 * 2) Precise and Deferred traps are always taken on the CPU which
 *    detected the error.
 *
 * 3) Persistent errors cannot be Non-Maskable.
 *
 * 4) When an error trap is injected and the Error Reporting EER
 *    conditions have not been satisfied (i.e. the trap is masked)
 *    then the trap is held pending only for disrupting errors.
 *    The trap will be dropped for Precise and Deferred errors.
 *    (NOTE: in general, Precise and Deferred errors are not
 *    maskable anyway. But, we provide the option just in case.)
 *
 * NOTE: One issue I'm not sure about right now is whether we
 * need to allow more than one EER per error or not. In theory,
 * detection or trap generation for a given error may be
 * dependent on more than just one EER.
 *
 * If we do need to change the framework to allow more than
 * one EER (or even ESR) per error, the implementation will
 * get a bit more complicated, but the basic ideas still work.
 * We would simply have to have something like a linked list
 * of EERs instead of statically defining one per error.
 *
 * For now, let's just assume one EER/ESR will do for each
 * error -- I'm pretty sure this assumption is valid for Rock
 * and Niagara2.
 */


/*
 * Struct which defines how a given EER controls an error's
 * recording/reporting.
 *
 * struct ss_error_entry holds two of these per error: one for the
 * error recording register and one for the error reporting register.
 */
typedef struct SS_EER_CTRL {
        uint64_t (*eer_access)(EAR_ARGS);       /* access routine for the EER (EAR signature) */
        uint64_t mask;                          /* presumably the EER bit(s) governing this error -- TODO confirm */
} ss_eer_ctrl_t;

/*
 * Struct which defines how a given ESR is associated with
 * a given error.
 */
typedef struct SS_ESR_UPDATE {
        uint64_t (*esr_access)(EAR_ARGS);       /* access routine for the ESR (EAR signature) */
        bool_t (*esr_inject)(EIR_ARGS);         /* injection routine for the ESR (EIR signature) */
        uint64_t err_inject_mask;               /* error status bit(s) to inject for this error */
        uint64_t err_pending_mask;              /* NOTE(review): presumably the ESR bit(s) that keep a trap pending -- confirm */
        char * err_inject_name;                 /* NOTE(review): presumably a printable name for the injected bit(s) -- confirm */
} ss_esr_inject_t;

/*
 * Class of an error entry, stored in the trap_class member of
 * struct ss_error_entry.
 */
typedef enum {
        PRECISE_TT,     /* precise trap */
        DEFERRED_TT,    /* deferred trap */
        DISRUPTING_TT,  /* disrupting trap */
        SP_INTR         /* presumably a Service Processor interrupt rather than a trap -- confirm */
} error_trap_class_t;


/*
 * Associates an error name with a Service Processor (SP) interrupt
 * level. ss_error_state_t keeps a pointer to a table of these entries
 * (sp_err_tbl).
 */
typedef struct {
        char                    *error_name;    /* error name */
        uint64_t                sp_intr;        /* SP interrupt level */
} ss_sp_error_t;

/*
 * Struct which defines a CPU error entry.
 *
 * One entry describes one named error (e.g. "DCDP") in terms of its
 * trap type, trap class, the ESR to update and the EERs to consult.
 * Each CPU provides a table of the entries it supports.
 */
struct ss_error_entry {
        char *                  error_name;     /* name of the error, e.g. "DCDP" */
        sparcv9_trap_type_t     trap_type;      /* trap type to post or keep pending */
        error_trap_class_t      trap_class;     /* precise/deferred/disrupting */
        bool_t                  is_persistent;  /* Keep generating trap while esr bit is set? */
        int                     trap_target;    /* -1 means always current strand. */
        ss_esr_inject_t         error_status;   /* ESR to update */
        ss_eer_ctrl_t           error_record;   /* should the error be logged? */
        ss_eer_ctrl_t           error_report;   /* should a trap be generated? */
};


/*
 * Per chip Error Trap state register.
 * This struct will hold all the common
 * state information related to the
 * error trap injection framework.
 */
typedef struct {
        /*
         * Pointer to the error entry we are currently
         * trying to inject in the system.
         */
        ss_error_entry_t        *inj_error_trap;
        /*
         * NOTE(review): presumably the gid of the strand targeted
         * by the injection in progress -- confirm against the
         * injection code.
         */
        int                     trap_target_gid;

        /*
         * Used to limit the number of outstanding error
         * trap injections to one. i.e. Once an error trap
         * has been injected on a system, we cannot inject
         * a new one until the error trap has actually
         * been taken by the target CPU. (or we have
         * deemed that no new trap will be generated.)
         */
        bool_t                  ready_for_next_injection;
        /* presumably protects the injection state above -- TODO confirm */
        pthread_mutex_t         injection_lock;

        /*
         * Used for performance reasons.
         * If we know that no ESRs are currently set (i.e.
         * they are all clear) then we can skip a lot of
         * checks during ss_check_error_traps().
         */
        bool_t                  esrs_clear;

        /*
         * head pointers to the system wide lists which
         * are parsed from user input.
         */
        error_event_t           *error_event_list_rootp;
        error_asi_t             *error_asi_list_rootp;

        /*
         * dynamic reload file name
         */
        char                    *error_config_filep;

        /*
         * NOTE(review): the data guarded by this lock is not
         * visible from this header -- confirm at the use sites.
         */
        pthread_mutex_t         err_lock;

        /*
         * Tables describing the error registers (ESRs/EERs),
         * the error entries, and the SP error entries.
         * Presumably supplied by the CPU module -- confirm.
         */
        ss_err_reg_t            *err_reg_tbl;
        ss_error_entry_t        *err_event_tbl;
        ss_sp_error_t           *sp_err_tbl;

} ss_error_state_t;

/*
 * extern declarations used by other binaries.
 *
 * The notes on each group below are derived from the function names and
 * the overview at the top of this file; the implementations are not
 * part of this header, so treat them as hints to be confirmed.
 */

/*
 * Run-time hooks. ss_check_error_traps() is the routine whose checks
 * can be skipped while all ESRs are clear (see esrs_clear).
 * trigger_error_trap() is called to inject the error trap for a simcpu
 * once the user specified trigger conditions have been met.
 */
extern void ss_check_error_traps(simcpu_t*);
extern void ss_error_taking_trap(simcpu_t*, sparcv9_trap_type_t);
extern bool_t ss_check_user_asi_list(simcpu_t*,int, tvaddr_t, uint64_t*, bool_t, bool_t);
extern bool_t ss_error_asi_access(simcpu_t*, maccess_t, int, int, bool_t, tvaddr_t, uint64_t);
extern void check_pending_error_events(simcpu_t*);
extern void dump_error_event_list(int, error_event_t*);
extern void dump_error_asi_list(int, error_asi_t*);
extern void dump_cpu_error_table(int, ss_error_entry_t*);
extern void dump_cpu_error_reg_table(int, ss_err_reg_t*);
extern bool_t trigger_error_trap(simcpu_t*);
extern void check_if_error_event_pending(void);
extern void ss_inject_error_trap(simcpu_t *, char *, sparcv9_trap_type_t, int);

/*
 * Parsing of user input -- presumably the handlers for the
 * error_asi {} and error_event {} directives and for the dynamic
 * reload file.
 */
extern void ss_error_asi_parse(void *procp, bool_t is_reload);
extern void ss_error_event_parse(void *procp, bool_t is_reload);
extern void ss_error_reload_file(config_proc_t *cp);
extern void ss_error_parse_filename(void *procp);

/* Debug dumps for a given processor. */
extern void ss_error_dump_active(config_proc_t *cp);
extern void ss_error_dump_supported(config_proc_t *cp);

/*
 * Initialization -- presumably the chip specific init call and the
 * per simulated thread init call made during CPU initialization.
 */
extern void ss_error_trap_proc_init(config_proc_t * config_procp);
extern void ss_error_trap_strand_init(config_proc_t * config_procp, simcpu_t * sp);

#endif          /* } ERROR_TRAP_GEN */

#ifdef  __cplusplus
}
#endif

#endif /* _SS_ERR_TRAP_H */