* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* @(#)pmap.c 8.1 (Berkeley) 6/11/93
* from: $Header: pmap.c,v 1.39 93/04/20 11:17:12 torek Exp $
* SPARC physical map management code.
* Does not function on multiprocessors (yet).
#include <machine/autoconf.h>
#include <machine/bsd_openprom.h>
#include <machine/ctlreg.h>
#include <sparc/sparc/asm.h>
#include <sparc/sparc/cache.h>
#define PTE_BITS "\20\40V\37W\36S\35NC\33IO\32U\31M"
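/*
 * Illustrative note (not part of the original source): PTE_BITS is a
 * kernel printf %b format string.  The leading \20 (octal 020 = 16)
 * selects hex output and each following octal byte names a bit position
 * (1-origin), so a call such as
 *	printf("pte=%b\n", pte, PTE_BITS);
 * prints the raw PTE value followed by the names of whichever bits are
 * set, e.g. "<V,W,S>" for a valid, writable, supervisor-only page.
 */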
extern struct promvec *promvec;
* The SPARCstation offers us the following challenges:
* 1. A virtual address cache. This is, strictly speaking, not
* part of the architecture, but the code below assumes one.
* This is a write-through cache on the 4c and a write-back cache
* 2. An MMU that acts like a cache. There is not enough space
* in the MMU to map everything all the time. Instead, we need
* to load the MMU with the `working set' of translations for each
* process.
* 3. Segmented virtual and physical spaces. The upper 12 bits of
* a virtual address (the virtual segment) index a segment table,
* giving a physical segment. The physical segment selects a
* `Page Map Entry Group' (PMEG) and the virtual page number---the
* next 5 or 6 bits of the virtual address---select the particular
* `Page Map Entry' for the page. We call the latter a PTE and
* call each Page Map Entry Group a pmeg (for want of a better name).
* Since there are no valid bits in the segment table, the only way
* to have an invalid segment is to make one full pmeg of invalid PTEs.
* We use the last one (since the ROM does as well).
* 4. Discontiguous physical pages. The Mach VM expects physical pages
* to be in one sequential lump.
* 5. The MMU is always on: it is not possible to disable it. This is
* mainly a startup hassle.
struct pmap_stats {
	int ps_unlink_pvfirst;		/* # of pv_unlinks on head */
	int ps_unlink_pvsearch;		/* # of pv_unlink searches */
	int ps_changeprots;		/* # of calls to changeprot */
	int ps_useless_changeprots;	/* # of changeprots for wiring */
	int ps_enter_firstpv;		/* pv heads entered */
	int ps_enter_secondpv;		/* pv nonheads entered */
	int ps_useless_changewire;	/* useless wiring changes */
	int ps_npg_prot_all;		/* # of active pages protected */
	int ps_npg_prot_actual;		/* # pages actually affected */
} pmap_stats;
#define PDB_CREATE 0x0001
#define PDB_DESTROY 0x0002
#define PDB_REMOVE 0x0004
#define PDB_CHANGEPROT 0x0008
#define PDB_MMU_ALLOC 0x0100
#define PDB_MMU_STEAL 0x0200
#define PDB_CTX_ALLOC 0x0400
#define PDB_CTX_STEAL 0x0800
#define splpmap() splbio()
* First and last managed physical addresses.
vm_offset_t vm_first_phys, vm_last_phys;
#define managed(pa) ((pa) >= vm_first_phys && (pa) < vm_last_phys)
vm_offset_t vm_first_phys, vm_num_phys;
#define managed(pa) ((unsigned)((pa) - vm_first_phys) < vm_num_phys)
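/*
 * Illustrative note (not in the original): the unsigned subtraction in
 * the second form performs both range checks with one comparison.  If
 * pa < vm_first_phys the difference wraps to a huge unsigned value and
 * the test fails; e.g. with vm_first_phys = 0x4000 and
 * vm_num_phys = 0x1000, managed(0x4800) is true while managed(0x3000)
 * and managed(0x6000) are both false.
 */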
* For each managed physical page, there is a list of all currently
* valid virtual mappings of that page. Since there is usually one
* (or zero) mapping per page, the table begins with an initial entry,
* rather than a pointer; this head entry is empty iff its pv_pmap
* field is NULL.
* Note that these are per machine independent page (so there may be
* only one for every two hardware pages, e.g.). Since the virtual
* address is aligned on a page boundary, the low order bits are free
* for storing flags. Only the head of each list has flags.
* THIS SHOULD BE PART OF THE CORE MAP
struct pvlist {
	struct pvlist *pv_next;		/* next pvlist, if any */
	struct pmap *pv_pmap;		/* pmap of this va */
	int pv_va;			/* virtual address */
	int pv_flags;			/* flags (below) */
};
* Flags in pv_flags. Note that PV_MOD must be 1 and PV_REF must be 2
* since they must line up with the bits in the hardware PTEs (see pte.h).
#define PV_MOD 1 /* page modified */
#define PV_REF 2 /* page referenced */
#define PV_NC 4 /* page cannot be cached */
/*efine PV_ALLF 7 ** all of the above */
struct pvlist *pv_table;	/* array of entries, one per physical page */

#define pvhead(pa) (&pv_table[atop((pa) - vm_first_phys)])
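/*
 * Illustrative sketch (not part of the original): walking the mappings of
 * a managed physical page.  This assumes the pvlist layout above;
 * pvhead() is only meaningful when managed(pa) is true, and an empty
 * list is marked by a NULL pv_pmap in the head entry.
 */
#ifdef notdef
void
pv_print(pa)
	register vm_offset_t pa;
{
	register struct pvlist *pv;

	if (!managed(pa))
		return;
	pv = pvhead(pa);
	if (pv->pv_pmap == NULL)
		return;			/* no mappings at all */
	for (; pv != NULL; pv = pv->pv_next)
		printf("pa %x -> va %x in pmap %x\n", pa, pv->pv_va, pv->pv_pmap);
}
#endif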
* Each virtual segment within each pmap is either valid or invalid.
* It is valid if pm_npte[VA_VSEG(va)] is not 0. This does not mean
* it is in the MMU, however; that is true iff pm_segmap[VA_VSEG(va)]
* does not point to the invalid PMEG.
* If a virtual segment is valid and loaded, the correct PTEs appear
* in the MMU only. If it is valid and unloaded, the correct PTEs appear
* in the pm_pte[VA_VSEG(va)] only. However, some effort is made to keep
* the software copies consistent enough with the MMU so that libkvm can
* do user address translations. In particular, pv_changepte() and
* pmap_enu() maintain consistency, while less critical changes are
* not maintained. pm_pte[VA_VSEG(va)] always points to space for those
* PTEs, unless this is the kernel pmap, in which case pm_pte[x] is not
* used.
* Each PMEG in the MMU is either free or contains PTEs corresponding to
* some pmap and virtual segment. If it contains some PTEs, it also contains
* reference and modify bits that belong in the pv_table. If we need
* to steal a PMEG from some process (if we need one and none are free)
* we must copy the ref and mod bits, and update pm_segmap in the other
* pmap to show that its virtual segment is no longer in the MMU.
* There are 128 PMEGs in a small Sun-4, of which only a few dozen are
* tied down permanently, leaving `about' 100 to be spread among
* running processes. These are managed as an LRU cache. Before
* calling the VM paging code for a user page fault, the fault handler
* calls mmu_load(pmap, va) to try to get a set of PTEs put into the
* MMU. mmu_load will check the validity of the segment and tell whether
* Since I hate the name PMEG I call this data structure an `mmu entry'.
* Each mmuentry is on exactly one of three `usage' lists: free, LRU,
* or locked. The LRU list is for user processes; the locked list is
* for kernel entries; both are doubly linked queues headed by `mmuhd's.
* The free list is a simple list, headed by a free list pointer.
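/*
 * Illustrative note (not in the original): the LRU and locked lists are
 * circular doubly linked queues.  A struct mmuhd is laid out like the
 * first two members of struct mmuentry (hence the MUST BE FIRST comments
 * below), so the insque()/remque() style queue primitives work on both;
 * for example, appending an entry to the LRU tail is
 *	insque(me, me_lru.mh_prev);
 * and an empty queue is one whose head points back to itself.
 */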
struct mmuhd {
	struct mmuentry *mh_next;
	struct mmuentry *mh_prev;
};
struct mmuentry {
	struct mmuentry *me_next;	/* queue (MUST BE FIRST) or next free */
	struct mmuentry *me_prev;	/* queue (MUST BE FIRST) */
	struct pmap *me_pmap;		/* pmap, if in use */
	struct mmuentry *me_pmforw;	/* pmap pmeg chain */
	struct mmuentry **me_pmback;	/* pmap pmeg chain */
	u_short me_vseg;		/* virtual segment number in pmap */
	pmeg_t me_pmeg;			/* hardware PMEG number */
};
struct mmuentry *mmuentry;	/* allocated in pmap_bootstrap */
struct mmuentry *me_freelist;	/* free list (not a queue) */
struct mmuhd me_lru = {		/* LRU (user) entries */
	(struct mmuentry *)&me_lru, (struct mmuentry *)&me_lru
};
struct mmuhd me_locked = {	/* locked (kernel) entries */
	(struct mmuentry *)&me_locked, (struct mmuentry *)&me_locked
};
int seginval;			/* the invalid segment number */
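/*
 * Illustrative note (not in the original): since the hardware segment map
 * has no valid bit, "no translation" is represented by pointing a virtual
 * segment at the all-invalid PMEG, e.g.
 *	setsegmap(VSTOVA(vseg), seginval);
 * after which any access within that virtual segment faults.
 */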
* A context is simply a small number that dictates which set of 4096
* segment map entries the MMU uses. The Sun 4c has eight such sets.
* These are allotted in an `almost MRU' fashion.
* Each context is either free or attached to a pmap.
* Since the virtual address cache is tagged by context, when we steal
* a context we have to flush (that part of) the cache.
union ctxinfo {
	union ctxinfo *c_nextfree;	/* free list (if free) */
	struct pmap *c_pmap;		/* pmap (if busy) */
};
union ctxinfo *ctxinfo;		/* allocated in pmap_bootstrap */
union ctxinfo *ctx_freelist;	/* context free list */
int ctx_kick;			/* allocation rover when none free */
int ctx_kickdir;		/* ctx_kick roves both directions */

/* XXX need per-cpu vpage[]s (and vmempage, unless we lock in /dev/mem) */
caddr_t vpage[2];		/* two reserved MD virtual pages */
caddr_t vmempage;		/* one reserved MI vpage for /dev/mem */
caddr_t vdumppages;		/* 32KB worth of reserved dump pages */

struct kpmap kernel_pmap_store;	/* the kernel's pmap */
* We need to know real physical memory ranges (for /dev/mem).
#define MA_SIZE 32 /* size of memory descriptor arrays */
struct memarr pmemarr[MA_SIZE];	/* physical memory regions */
int npmemarr;			/* number of entries in pmemarr */

* The following four global variables are set in pmap_bootstrap
* for the vm code to find. This is Wrong.
vm_offset_t avail_start;	/* first free physical page number */
vm_offset_t avail_end;		/* last free physical page number */
vm_offset_t virtual_avail;	/* first free virtual page number */
vm_offset_t virtual_end;	/* last free virtual page number */
* pseudo-functions for mnemonic value
* NB: setsegmap should be stba for 4c, but stha works and makes the
* code right for the Sun-4 as well.
#define getcontext() lduba(AC_CONTEXT, ASI_CONTROL)
#define setcontext(c) stba(AC_CONTEXT, ASI_CONTROL, c)
#define getsegmap(va) lduha(va, ASI_SEGMAP)
#define setsegmap(va, pmeg) stha(va, ASI_SEGMAP, pmeg)
#define getsegmap(va) lduba(va, ASI_SEGMAP)
#define setsegmap(va, pmeg) stba(va, ASI_SEGMAP, pmeg)
#define getpte(va) lda(va, ASI_PTE)
#define setpte(va, pte) sta(va, ASI_PTE, pte)
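/*
 * Illustrative idiom (not in the original): because getpte()/setpte() and
 * getsegmap()/setsegmap() operate through the MMU of the *current*
 * context, routines below that touch another pmap's translations bracket
 * the accesses with
 *	ctx = getcontext();
 *	setcontext(pm->pm_ctxnum);
 *	... getpte()/setpte() on that pmap's virtual addresses ...
 *	setcontext(ctx);
 * writing the user register windows first when the context actually
 * changes (see CHANGE_CONTEXTS below).
 */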
/*----------------------------------------------------------------*/
* Translations from dense (contiguous) pseudo physical addresses
* (fed to the VM code, to keep it happy) to sparse (real, hardware)
* physical addresses. We call the former `software' page frame
* numbers and the latter `hardware' page frame numbers. The
* translation is done on a `per bank' basis.
* The HWTOSW and SWTOHW macros handle the actual translation.
* They are defined as no-ops on Sun-4s.
* SHOULD DO atop AND ptoa DIRECTLY IN THESE MACROS SINCE ALL CALLERS
* ALWAYS NEED THAT ANYWAY ... CAN JUST PRECOOK THE TABLES (TODO)
* Since we cannot use the memory allocated to the ROM monitor, and
* this happens to be just under 64K, I have chosen a bank size of
* 64K. This is necessary since all banks must be completely full.
* I have also chosen a physical memory limit of 128 MB. The 4c is
* architecturally limited to 256 MB, but 128 MB is more than will
* fit on present hardware.
* XXX FIX THIS: just make all of each bank available and then
* take out the pages reserved to the monitor!!
#define MAXMEM (128 * 1024 * 1024) /* no more than 128 MB phys mem */
#define NPGBANK 16 /* 2^4 pages per bank (64K / bank) */
#define BSHIFT 4 /* log2(NPGBANK) */
#define BOFFSET (NPGBANK - 1)
#define BTSIZE (MAXMEM / NBPG / NPGBANK)
int pmap_dtos[BTSIZE];		/* dense to sparse */
int pmap_stod[BTSIZE];		/* sparse to dense */
#define HWTOSW(pg) (pmap_stod[(pg) >> BSHIFT] | ((pg) & BOFFSET))
#define SWTOHW(pg) (pmap_dtos[(pg) >> BSHIFT] | ((pg) & BOFFSET))
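/*
 * Worked example (illustrative, not from the original): with NBPG = 4096,
 * NPGBANK = 16 pages per 64KB bank, and MAXMEM = 128MB, BTSIZE is
 * 128MB / 4KB / 16 = 2048 table entries.  Suppose init_translations()
 * paired hardware bank 0x123 with software (dense) bank 0x045, i.e.
 * pmap_stod[0x123] == 0x045 << BSHIFT == 0x450.  Then
 *	HWTOSW(0x1234) == pmap_stod[0x123] | (0x1234 & BOFFSET)
 *	               == 0x450 | 0x4 == 0x454
 * and SWTOHW(0x454) applies the inverse table pmap_dtos to recover
 * 0x1234.  The page offset within the bank passes through unchanged.
 */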
struct memarr pmap_ama[MA_SIZE];
* init_translations sets up pmap_dtos[] and pmap_stod[], and
* returns the number of usable physical pages.
int
init_translations()
{
	register struct memarr *mp;
	register u_int vbank = 0, pbank, v, a;
	register u_int pages = 0, lost = 0;
	register int nmem, n;
	struct memarr ama[MA_SIZE];	/* available memory array */

	nmem = makememarr(ama, MA_SIZE, MEMARR_AVAILPHYS);
	for (mp = ama; --nmem >= 0; mp++) {
		if ((n = a & BOFFSET) != 0) {
			/* round up to next bank */
			if (v < n) {		/* not a whole bank: skip it */
				lost += n;	/* lose n pages from front */
		n = v >> BSHIFT;		/* calculate number of banks */
		pbank = a >> BSHIFT;		/* and the bank itself */
		pages += n;			/* off by a factor of 2^BSHIFT */
		lost += v - (n << BSHIFT);
		pmap_dtos[vbank] = pbank << BSHIFT;
		pmap_stod[pbank] = vbank << BSHIFT;
	printf("note: lost %d pages in translation\n", lost);
* Pages are physically contiguous, and hardware PFN == software PFN.
* XXX assumes PAGE_SIZE == NBPG (???)
/* update pv_flags given a valid pte */
#define MR(pte) (((pte) >> PG_M_SHIFT) & (PV_MOD | PV_REF))
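/*
 * Illustrative note (not in the original): MR() relies on the layout
 * promised above -- PV_MOD == 1 and PV_REF == 2 line up with the hardware
 * modified/referenced PTE bits once the PTE is shifted right by
 * PG_M_SHIFT, so a PTE with the hardware M bit set yields PV_MOD and one
 * with the U (used/referenced) bit set yields PV_REF, as in
 *	pvhead(pa)->pv_flags |= MR(tpte);
 */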
/*----------------------------------------------------------------*/
* Agree with the monitor ROM as to how many MMU entries are
* to be reserved, and map all of its segments into all contexts.
* Unfortunately, while the Version 0 PROM had a nice linked list of
* taken virtual memory, the Version 2 PROM provides instead a convoluted
* description of *free* virtual memory. Rather than invert this, we
* resort to two magic constants from the PROM vector description file.
	va = OPENPROM_STARTVADDR;
	for (i = ncontext; --i > 0;)
		(*promvec->pv_setctxt)(i, (caddr_t)va, mmuseg);
	if (mmuseg == seginval) {
		/* PROM maps its memory user-accessible: fix it. */
		for (i = NPTESG; --i >= 0; va += NBPG)
			setpte(va, getpte(va) | PG_S);
* TODO: agree with the ROM on physical pages by taking them away
* from the page list, rather than having a dinky BTSIZE above.
/*----------------------------------------------------------------*/
* Change contexts. We need the old context number as well as the new
* one. If the context is changing, we must write all user windows
* first, lest an interrupt cause them to be written to the (other)
* user whose context we set here.
#define CHANGE_CONTEXTS(old, new) \
	if ((old) != (new)) { \
		write_user_windows(); \
		setcontext(new); \
	}
* Allocate an MMU entry (i.e., a PMEG).
* If necessary, steal one from someone else.
* Put it on the tail of the given queue
* (which is either the LRU list or the locked list).
* The locked list is not actually ordered, but this is easiest.
* Also put it on the given (new) pmap's chain,
* enter its pmeg number into that pmap's segmap,
* and store the pmeg's new virtual segment number (me->me_vseg).
* This routine is large and complicated, but it must be fast
* since it implements the dynamic allocation of MMU entries.
struct mmuentry *
me_alloc(mh, newpm, newvseg)
	register struct mmuhd *mh;
	register struct pmap *newpm;
	register int newvseg;
{
	register struct mmuentry *me;
	register struct pmap *pm;
	register int i, va, pa, *pte, tpte;
/* try free list first */
	if ((me = me_freelist) != NULL) {
		me_freelist = me->me_next;
panic("me_alloc: freelist entry has pmap");
		if (pmapdebug & PDB_MMU_ALLOC)
			printf("me_alloc: got pmeg %x\n", me->me_pmeg);
		insque(me, mh->mh_prev);	/* onto end of queue */
		/* onto pmap chain; pmap is already locked, if needed */
		me->me_pmback = newpm->pm_mmuback;
		newpm->pm_mmuback = &me->me_pmforw;

		/* into pmap segment table, with backpointers */
		newpm->pm_segmap[newvseg] = me->me_pmeg;
/* no luck, take head of LRU list */
	if ((me = me_lru.mh_next) == (struct mmuentry *)&me_lru)
		panic("me_alloc: all pmegs gone");
		panic("me_alloc: LRU entry has no pmap");
		panic("me_alloc: stealing from kernel");
	pte = pm->pm_pte[me->me_vseg];
		panic("me_alloc: LRU entry's pmap has no ptes");
	if (pmapdebug & (PDB_MMU_ALLOC | PDB_MMU_STEAL))
		printf("me_alloc: stealing pmeg %x from pmap %x\n",
* Remove from LRU list, and insert at end of new list
* (probably the LRU list again, but so what?).
* The PMEG must be mapped into some context so that we can
* read its PTEs. Use its current context if it has one;
* if not, and since context 0 is reserved for the kernel,
* the simplest method is to switch to 0 and map the PMEG
* to virtual address 0---which, being a user space address,
* is by definition not in use.
* XXX for ncpus>1 must use per-cpu VA?
* XXX do not have to flush cache immediately
	CHANGE_CONTEXTS(ctx, pm->pm_ctxnum);
	cache_flush_segment(me->me_vseg);
	va = VSTOVA(me->me_vseg);
	setsegmap(0, me->me_pmeg);
* No cache flush needed: it happened earlier when
* the old context was taken.
* Record reference and modify bits for each page,
* and copy PTEs into kernel memory so that they can
		pa = ptoa(HWTOSW(tpte & PG_PFNUM));
		pvhead(pa)->pv_flags |= MR(tpte);
		*pte++ = tpte & ~(PG_U|PG_M);
/* update segment tables */
	simple_lock(&pm->pm_lock);	/* what if other cpu takes mmuentry ?? */
	setsegmap(VSTOVA(me->me_vseg), seginval);
	pm->pm_segmap[me->me_vseg] = seginval;
	if ((*me->me_pmback = me->me_pmforw) != NULL) {
		me->me_pmforw->me_pmback = me->me_pmback;
		pm->pm_mmuback = me->me_pmback;
	simple_unlock(&pm->pm_lock);
	setcontext(ctx);		/* done with old context */
/* onto new pmap chain; new pmap is already locked, if needed */
/* me->me_pmforw = NULL; */ /* done earlier */
	me->me_pmback = newpm->pm_mmuback;
	newpm->pm_mmuback = &me->me_pmforw;

	/* into new segment table, with backpointers */
	newpm->pm_segmap[newvseg] = me->me_pmeg;
* Assumes the corresponding pmap is already locked.
* Does NOT flush cache, but does record ref and mod bits.
* The rest of each PTE is discarded.
* CALLER MUST SET CONTEXT to pm->pm_ctxnum (if pmap has
* a context) or to 0 (if not). Caller must also update
* pm->pm_segmap and (possibly) the hardware.
	register struct pmap *pm;
	register struct mmuentry *me = &mmuentry[pmeg];
	register int i, va, pa, tpte;
	if (pmapdebug & PDB_MMU_ALLOC)
printf("me_free: freeing pmeg %x from pmap %x\n",
panic("me_free: wrong mmuentry");
panic("me_free: pm != me_pmap");
/* just like me_alloc, but no cache flush, and context already set */
	va = VSTOVA(me->me_vseg);
	setsegmap(0, me->me_pmeg);
	pa = ptoa(HWTOSW(tpte & PG_PFNUM));
	pvhead(pa)->pv_flags |= MR(tpte);
/* take mmu entry off pmap chain */
	*me->me_pmback = me->me_pmforw;
	if ((*me->me_pmback = me->me_pmforw) != NULL)
		me->me_pmforw->me_pmback = me->me_pmback;
		pm->pm_mmuback = me->me_pmback;
/* ... and remove from segment map */
	pm->pm_segmap[me->me_vseg] = seginval;
/* off LRU or lock chain */
/* no associated pmap; on free list */
	me->me_next = me_freelist;
* `Page in' (load or inspect) an MMU entry; called on page faults.
* Returns 1 if we reloaded the segment, -1 if the segment was
* already loaded and the page was marked valid (in which case the
* fault must be a bus error or something), or 0 (segment loaded but
* PTE not valid, or segment not loaded at all).
	register struct pmap *pm;
	register struct mmuentry *me;
	register int vseg = VA_VSEG(va), pmeg, i, s;
/* return 0 if we have no PTEs to load */
	if ((pte = pm->pm_pte[vseg]) == NULL)
/* return -1 if the fault is `hard', 0 if not */
	if (pm->pm_segmap[vseg] != seginval)
		return (bits && (getpte(va) & bits) == bits ? -1 : 0);
/* reload segment: write PTEs into a new LRU entry */
	va = VA_ROUNDDOWNTOSEG(va);
	s = splpmap();			/* paranoid */
	pmeg = me_alloc(&me_lru, pm, vseg)->me_pmeg;
* Allocate a context. If necessary, steal one from someone else.
* Changes hardware context number and loads segment map.
* This routine is only ever called from locore.s just after it has
* saved away the previous process, so there are no active user windows.
	register struct pmap *pm;
	register union ctxinfo *c;
	register int cnum, i, va;
panic("ctx_alloc pm_ctx");
	if (pmapdebug & PDB_CTX_ALLOC)
		printf("ctx_alloc(%x)\n", pm);
	if ((c = ctx_freelist) != NULL) {
		ctx_freelist = c->c_nextfree;
		if ((ctx_kick += ctx_kickdir) >= ncontext) {
		} else if (ctx_kick < 1) {
		c = &ctxinfo[cnum = ctx_kick];
panic("ctx_alloc cu_pmap");
		if (pmapdebug & (PDB_CTX_ALLOC | PDB_CTX_STEAL))
			printf("ctx_alloc: steal context %x from %x\n",
		c->c_pmap->pm_ctx = NULL;
* XXX loop below makes 3584 iterations ... could reduce
* by remembering valid ranges per context: two ranges
* should suffice (for text/data/bss and for stack).
		for (va = 0, i = NUSEG; --i >= 0; va += NBPSG)
* Give away a context. Flushes cache and sets current context to 0.
	register union ctxinfo *c;

	if ((c = pm->pm_ctx) == NULL)
	if (vactype != VAC_NONE) {
		CHANGE_CONTEXTS(oldc, newc);
	CHANGE_CONTEXTS(oldc, 0);
	c->c_nextfree = ctx_freelist;
/*----------------------------------------------------------------*/
* Walk the given pv list, and for each PTE, set or clear some bits
* As a special case, this never clears PG_W on `pager' pages.
* These, being kernel addresses, are always in hardware and have
* This routine flushes the cache for any page whose PTE changes,
* as long as the process has a context; this is overly conservative.
* It also copies ref and mod bits to the pvlist, on the theory that
* this might save work later. (XXX should test this theory)
pv_changepte(pv0, bis, bic)
	register struct pvlist *pv0;
	register struct pvlist *pv;
	register struct pmap *pm;
	register int va, vseg, pmeg, i, flags;
write_user_windows(); /* paranoid? */
s
= splpmap(); /* paranoid? */
	if (pv0->pv_pmap == NULL) {
	for (pv = pv0; pv != NULL; pv = pv->pv_next) {
		if (pm == NULL)
			panic("pv_changepte 1");
if ((pmeg
= pm
->pm_segmap
[vseg
]) != seginval
) {
/* in hardware: fix hardware copy */
extern vm_offset_t pager_sva
, pager_eva
;
va
>= pager_sva
&& va
< pager_eva
)
setcontext(pm
->pm_ctxnum
);
/* XXX should flush only when necessary */
flags
|= (tpte
>> PG_M_SHIFT
) &
tpte
= (tpte
| bis
) & ~bic
;
if (pte
!= NULL
) /* update software copy */
/* not in hardware: just fix software copy */
*pte
= (*pte
| bis
) & ~bic
;
* Sync ref and mod bits in pvlist (turns off same in hardware PTEs).
* This is just like pv_changepte, but we never add or remove bits,
* hence never need to adjust software copies.
	register struct pvlist *pv0;
	register struct pvlist *pv;
	register struct pmap *pm;
	register int tpte, va, vseg, pmeg, i, flags;
write_user_windows(); /* paranoid? */
s
= splpmap(); /* paranoid? */
if (pv0
->pv_pmap
== NULL
) { /* paranoid */
for (pv
= pv0
; pv
!= NULL
; pv
= pv
->pv_next
) {
if ((pmeg
= pm
->pm_segmap
[vseg
]) == seginval
)
setcontext(pm
->pm_ctxnum
);
/* XXX should flush only when necessary */
if (tpte
& (PG_M
|PG_U
) && tpte
& PG_V
) {
flags
|= (tpte
>> PG_M_SHIFT
) &
* pv_unlink is a helper function for pmap_remove.
* It takes a pointer to the pv_table head for some physical address
* and removes the appropriate (pmap, va) entry.
* Once the entry is removed, if the pv_table head has the cache
* inhibit bit set, see if we can turn that off; if so, walk the
* pvlist and turn off PG_NC in each PTE. (The pvlist is by
* definition nonempty, since it must have at least two elements
* in it to have PV_NC set, and we only remove one here.)
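/*
 * Illustrative note (not in the original): BADALIAS(va1, va2), presumably
 * defined alongside the cache constants in cache.h, is true when two
 * virtual addresses that map the same physical page would be indexed to
 * different lines of the virtually addressed cache (i.e. they differ in
 * the address bits the cache uses as an index).  The hardware cannot keep
 * such aliases coherent, so the whole pvlist is made uncacheable (PG_NC).
 */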
	register struct pvlist *pv;
	register struct pmap *pm;
	register struct pvlist *npv;
* First entry is special (sigh).
if (pv
->pv_pmap
== pm
&& pv
->pv_va
== va
) {
pmap_stats
.ps_unlink_pvfirst
++;
pv
->pv_next
= npv
->pv_next
;
pv
->pv_pmap
= npv
->pv_pmap
;
free((caddr_t
)npv
, M_VMPVENT
);
register struct pvlist
*prev
;
for (prev
= pv
;; prev
= npv
, npv
= npv
->pv_next
) {
pmap_stats
.ps_unlink_pvsearch
++;
if (npv
->pv_pmap
== pm
&& npv
->pv_va
== va
)
prev
->pv_next
= npv
->pv_next
;
free((caddr_t
)npv
, M_VMPVENT
);
if (pv
->pv_flags
& PV_NC
) {
* Not cached: check to see if we can fix that now.
for (npv
= pv
->pv_next
; npv
!= NULL
; npv
= npv
->pv_next
)
if (BADALIAS(va
, npv
->pv_va
))
pv_changepte(pv
, 0, PG_NC
);
* pv_link is the inverse of pv_unlink, and is used in pmap_enter.
* It returns PG_NC if the (new) pvlist says that the address cannot
* be cached.
	register struct pvlist *pv;
	register struct pmap *pm;
	register struct pvlist *npv;
if (pv
->pv_pmap
== NULL
) {
/* no pvlist entries yet */
pmap_stats
.ps_enter_firstpv
++;
* Before entering the new mapping, see if
* it will cause old mappings to become aliased
* and thus need to be `discached'.
pmap_stats
.ps_enter_secondpv
++;
if (pv
->pv_flags
& PV_NC
) {
/* already uncached, just stay that way */
/* MAY NEED TO DISCACHE ANYWAY IF va IS IN DVMA SPACE? */
for (npv
= pv
; npv
!= NULL
; npv
= npv
->pv_next
) {
if (BADALIAS(va
, npv
->pv_va
)) {
pv_changepte(pv
, ret
= PG_NC
, 0);
npv
= (struct pvlist
*)malloc(sizeof *npv
, M_VMPVENT
, M_WAITOK
);
npv
->pv_next
= pv
->pv_next
;
* Walk the given list and flush the cache for each (MI) page that is
* potentially in the cache.
	register struct pvlist *pv;
	register struct pmap *pm;
write_user_windows(); /* paranoia? */
	s = splpmap();			/* XXX extreme paranoia */
	if ((pm = pv->pv_pmap) != NULL) {
		setcontext(pm->pm_ctxnum);
		cache_flush_page(pv->pv_va);
/*----------------------------------------------------------------*/
* Bootstrap the system enough to run with VM enabled.
* nmmu is the number of mmu entries (``PMEGs'');
* nctx is the number of contexts.
pmap_bootstrap(nmmu, nctx)
	register union ctxinfo *ci;
	register struct mmuentry *me;
	register int i, j, n, z, vs;
	register void (*rom_setmap)(int ctx, caddr_t va, int pmeg);
	extern caddr_t reserve_dumppages(caddr_t);
* Last segment is the `invalid' one (one PMEG of pte's with !pg_v).
* It will never be used for anything else.
* Preserve the monitor ROM's reserved VM region, so that
* we can use L1-A or the monitor's debugger. As a side
* effect we map the ROM's reserved VM into all contexts
* (otherwise L1-A crashes the machine!).
	nmmu = mmu_reservemon(nmmu);

	/*
	 * Allocate and clear mmu entry and context structures.
	 */
	mmuentry = me = (struct mmuentry *)p;
	ctxinfo = ci = (union ctxinfo *)p;
* Set up the `constants' for the call to vm_init()
* in main(). All pages beginning at p (rounded up to
* the next whole page) and continuing through the number
* of available pages are free, but they start at a higher
* virtual address. This gives us two mappable MD pages
* for pmap_zero_page and pmap_copy_page, and one MI page
* for /dev/mem, all with no associated physical memory.
	p = (caddr_t)(((u_int)p + NBPG - 1) & ~PGOFSET);
	avail_start = (int)p - KERNBASE;
	avail_end = init_translations() << PGSHIFT;
	p = reserve_dumppages(p);
	virtual_avail = (vm_offset_t)p;
	virtual_end = VM_MAX_KERNEL_ADDRESS;
	p = (caddr_t)i;			/* retract to first free phys */
* Initialize the kernel pmap.
	register struct kpmap *k = &kernel_pmap_store;

	/* kernel_pmap = (struct pmap *)k; */
	simple_lock_init(&k->pm_lock);
	k->pm_mmuback = &k->pm_mmuforw;
	k->pm_segmap = &k->pm_rsegmap[-NUSEG];
	k->pm_pte = &k->pm_rpte[-NUSEG];
	k->pm_npte = &k->pm_rnpte[-NUSEG];
	for (i = NKSEG; --i >= 0;)
		k->pm_rsegmap[i] = seginval;
* All contexts are free except the kernel's.
* XXX sun4c could use context 0 for users?
ci
->c_pmap
= kernel_pmap
;
for (i
= 1; i
< ncontext
; i
++) {
/* me_freelist = NULL; */ /* already NULL */
* Init mmu entries that map the kernel physical addresses.
* If the page bits in p are 0, we filled the last segment
* exactly (now how did that happen?); if not, it is
* the last page filled in the last segment.
* All the other MMU entries are free.
* THIS ASSUMES SEGMENT i IS MAPPED BY MMU ENTRY i DURING THE
z
= ((((u_int
)p
+ NBPSG
- 1) & ~SGOFSET
) - KERNBASE
) >> SGSHIFT
;
p
= (caddr_t
)KERNBASE
; /* first va */
vs
= VA_VSEG(KERNBASE
); /* first virtual segment */
rom_setmap
= promvec
->pv_setctxt
;
* Distribute each kernel segment into all contexts.
* This is done through the monitor ROM, rather than
* directly here: if we do a setcontext we will fault,
* as we are not (yet) mapped in any other context.
for (j
= 1; j
< nctx
; j
++)
/* set up the mmu entry */
		insque(me, me_locked.mh_prev);
		/* me->me_pmforw = NULL; */
		me->me_pmback = kernel_pmap->pm_mmuback;
		*kernel_pmap->pm_mmuback = me;
		kernel_pmap->pm_mmuback = &me->me_pmforw;
		me->me_pmap = kernel_pmap;
		kernel_pmap->pm_segmap[vs] = i;
		n = ++i < z ? NPTESG : lastpage;
		kernel_pmap->pm_npte[vs] = n;
* Unmap the pages, if any, that are not part of
for (p
+= n
* NBPG
; j
< NPTESG
; j
++, p
+= NBPG
)
for (; i
< nmmu
; i
++, me
++) {
me
->me_next
= me_freelist
;
/* me->me_pmap = NULL; */
* write protect & encache kernel text;
* set red zone at kernel base; enable cache on message buffer.
extern char etext
[], trapbase
[];
register int mask
= ~PG_NC
; /* XXX chgkprot is busted */
register int mask
= ~(PG_W
| PG_NC
);
for (p
= trapbase
; p
< etext
; p
+= NBPG
)
setpte(p
, getpte(p
) & mask
);
setpte(p
, getpte(p
) & ~PG_NC
);
* Grab physical memory list (for /dev/mem).
npmemarr
= makememarr(pmemarr
, MA_SIZE
, MEMARR_TOTALPHYS
);
* Bootstrap memory allocator. This function allows for early dynamic
* memory allocation until the virtual memory system has been bootstrapped.
* After that point, either kmem_alloc or malloc should be used. This
* function works by stealing pages from the (to be) managed page pool,
* stealing virtual address space, then mapping the pages and zeroing them.
* It should be used from pmap_bootstrap till vm_page_startup, afterwards
* it cannot be used, and will generate a panic if tried. Note that this
* memory will never be freed, and in essence it is wired down.
pmap_bootstrap_alloc(size)
	extern int vm_page_startup_initialized;

	if (vm_page_startup_initialized)
		panic("pmap_bootstrap_alloc: called after startup initialized");
	mem = (void *)virtual_avail;
	virtual_avail = pmap_map(virtual_avail, avail_start,
	    avail_start + size, VM_PROT_READ|VM_PROT_WRITE);
	bzero((void *)mem, size);
* Initialize the pmap module.
pmap_init(phys_start, phys_end)
	register vm_offset_t phys_start, phys_end;
		panic("pmap_init: CLSIZE!=1");
	/*
	 * Allocate and clear memory for the pv_table.
	 */
	s = sizeof(struct pvlist) * atop(phys_end - phys_start);
	pv_table = (struct pvlist *)kmem_alloc(kernel_map, s);
	bzero((caddr_t)pv_table, s);
	vm_first_phys = phys_start;
	vm_num_phys = phys_end - phys_start;
* Map physical addresses into kernel VM.
pmap_map(va
, pa
, endpa
, prot
)
register vm_offset_t va
, pa
, endpa
;
register int pgsize
= PAGE_SIZE
;
pmap_enter(kernel_pmap
, va
, pa
, prot
, 1);
* Create and return a physical map.
* If size is nonzero, the map is useless. (ick)
	register struct pmap *pm;

	pm = (struct pmap *)malloc(sizeof *pm, M_VMPMAP, M_WAITOK);
	if (pmapdebug & PDB_CREATE)
		printf("pmap_create: created %x\n", pm);
	bzero((caddr_t)pm, sizeof *pm);
* Initialize a preallocated and zeroed pmap structure,
* such as one in a vmspace structure.
	register struct pmap *pm;

	if (pmapdebug & PDB_CREATE)
		printf("pmap_pinit(%x)\n", pm);
	simple_lock_init(&pm->pm_lock);
	/* pm->pm_mmuforw = NULL; */
	pm->pm_mmuback = &pm->pm_mmuforw;
	pm->pm_segmap = pm->pm_rsegmap;
	pm->pm_pte = pm->pm_rpte;
	pm->pm_npte = pm->pm_rnpte;
	for (i = NUSEG; --i >= 0;)
		pm->pm_rsegmap[i] = seginval;
/* bzero((caddr_t)pm->pm_rpte, sizeof pm->pm_rpte); */
/* bzero((caddr_t)pm->pm_rnpte, sizeof pm->pm_rnpte); */
* Retire the given pmap from service.
* Should only be called if the map contains no valid mappings.
	register struct pmap *pm;

	if (pmapdebug & PDB_DESTROY)
		printf("pmap_destroy(%x)\n", pm);
	simple_lock(&pm->pm_lock);
	count = --pm->pm_refcount;
	simple_unlock(&pm->pm_lock);
	free((caddr_t)pm, M_VMPMAP);
* Release any resources held by the given physical map.
* Called when a pmap initialized by pmap_pinit is being released.
register struct pmap
*pm
;
register union ctxinfo
*c
;
register int s
= splpmap(); /* paranoia */
if (pmapdebug
& PDB_DESTROY
)
printf("pmap_release(%x)\n", pm
);
panic("pmap_release mmuforw");
if ((c
= pm
->pm_ctx
) != NULL
) {
panic("pmap_release: releasing kernel");
* Add a reference to the given pmap.
simple_lock(&pm
->pm_lock
);
simple_unlock(&pm
->pm_lock
);
static int pmap_rmk(struct pmap *, vm_offset_t, vm_offset_t, int, int, int);
static int pmap_rmu(struct pmap *, vm_offset_t, vm_offset_t, int, int, int);
* Remove the given range of mapping entries.
* The starting and ending addresses are already rounded to pages.
* Sheer lunacy: pmap_remove is often asked to remove nonexistent
pmap_remove(pm
, va
, endva
)
register struct pmap
*pm
;
register vm_offset_t va
, endva
;
register vm_offset_t nva
;
register int vseg
, nleft
, s
, ctx
;
register int (*rm
)(struct pmap
*, vm_offset_t
, vm_offset_t
,
if (pmapdebug
& PDB_REMOVE
)
printf("pmap_remove(%x, %x, %x)\n", pm
, va
, endva
);
* Removing from kernel address space.
* Removing from user address space.
s
= splpmap(); /* XXX conservative */
simple_lock(&pm
->pm_lock
);
for (; va
< endva
; va
= nva
) {
/* do one virtual segment at a time */
if (nva
== 0 || nva
> endva
)
if ((nleft
= pm
->pm_npte
[vseg
]) != 0)
pm
->pm_npte
[vseg
] = (*rm
)(pm
, va
, nva
,
vseg
, nleft
, pm
->pm_segmap
[vseg
]);
simple_unlock(&pm
->pm_lock
);
/* counters, one per possible length */
int rmk_vlen
[NPTESG
+1]; /* virtual length per rmk() call */
int rmk_npg
[NPTESG
+1]; /* n valid pages per rmk() call */
int rmk_vlendiff
; /* # times npg != vlen */
* The following magic number was chosen because:
* 1. It is the same amount of work to cache_flush_page 4 pages
* as to cache_flush_segment 1 segment (so at 4 the cost of
* 2. Flushing extra pages is bad (causes cache not to work).
* 3. The current code, which malloc()s 5 pages for each process
* for a user vmspace/pmap, almost never touches all 5 of those
* pages.
#define PMAP_RMK_MAGIC 5 /* if > magic, use cache_flush_segment */
* Remove a range contained within a single segment.
* These are egregiously complicated routines.
/* remove from kernel, return new nleft */
pmap_rmk(pm
, va
, endva
, vseg
, nleft
, pmeg
)
register struct pmap
*pm
;
register vm_offset_t va
, endva
;
register int vseg
, nleft
, pmeg
;
register int i
, tpte
, perpage
, npg
;
register struct pvlist
*pv
;
panic("pmap_rmk: not loaded");
panic("pmap_rmk: lost context");
/* decide how to flush cache */
npg
= (endva
- va
) >> PGSHIFT
;
if (npg
> PMAP_RMK_MAGIC
) {
/* flush the whole segment */
cache_flush_segment(vseg
);
/* flush each page individually; some never need flushing */
if ((tpte
& PG_V
) == 0) {
/* if cacheable, flush page as needed */
if ((tpte
& PG_NC
) == 0) {
if ((tpte
& PG_TYPE
) == PG_OBMEM
) {
i
= ptoa(HWTOSW(tpte
& PG_PFNUM
));
pv
->pv_flags
|= MR(tpte
);
* If the segment is all gone, remove it from everyone and
va
= VSTOVA(vseg
); /* retract */
for (i
= ncontext
; --i
> 0;) {
/* as before but for pmap_rmu */
int rmu_vlen
[NPTESG
+1]; /* virtual length per rmu() call */
int rmu_npg
[NPTESG
+1]; /* n valid pages per rmu() call */
int rmu_vlendiff
; /* # times npg != vlen */
int rmu_noflush
; /* # times rmu does not need to flush at all */
* Just like pmap_rmk_magic, but we have a different threshold.
* Note that this may well deserve further tuning work.
#define PMAP_RMU_MAGIC 4 /* if > magic, use cache_flush_segment */
pmap_rmu(pm
, va
, endva
, vseg
, nleft
, pmeg
)
register struct pmap
*pm
;
register vm_offset_t va
, endva
;
register int vseg
, nleft
, pmeg
;
register int *pte0
, i
, pteva
, tpte
, perpage
, npg
;
register struct pvlist
*pv
;
register int doflush
, nvalid
;
register int *pte
= pte0
+ VA_VPG(va
);
* PTEs are not in MMU. Just invalidate software copies.
for (; va
< endva
; pte
++, va
+= PAGE_SIZE
) {
if ((tpte
& PG_V
) == 0) {
/* nothing to remove (braindead VM layer) */
if ((tpte
& PG_TYPE
) == PG_OBMEM
) {
i
= ptoa(HWTOSW(tpte
& PG_PFNUM
));
pv_unlink(pvhead(i
), pm
, va
);
free((caddr_t
)pte0
, M_VMPMAP
);
* PTEs are in MMU. Invalidate in hardware, update ref &
* mod bits, and flush cache if required.
/* process has a context, must flush cache */
npg
= (endva
- va
) >> PGSHIFT
;
setcontext(pm
->pm_ctxnum
);
if (npg
> PMAP_RMU_MAGIC
) {
perpage
= 0; /* flush the whole segment */
cache_flush_segment(vseg
);
/* no context, use context 0; cache flush unnecessary */
/* XXX use per-cpu pteva? */
pteva
= VA_VPG(va
) * NBPG
;
for (; va
< endva
; pteva
+= PAGE_SIZE
, va
+= PAGE_SIZE
) {
/* if cacheable, flush page as needed */
if (doflush
&& (tpte
& PG_NC
) == 0) {
if ((tpte
& PG_TYPE
) == PG_OBMEM
) {
i
= ptoa(HWTOSW(tpte
& PG_PFNUM
));
pv
->pv_flags
|= MR(tpte
);
* If the segment is all gone, and the context is loaded, give
if (nleft
== 0 && pm
->pm_ctx
!= NULL
) {
va
= VSTOVA(vseg
); /* retract */
free((caddr_t
)pte0
, M_VMPMAP
);
* Lower (make more strict) the protection on the specified
* There are only two cases: either the protection is going to 0
* (in which case we do the dirty work here), or it is going from
* read/write to read-only (in which case pv_changepte does the trick).
pmap_page_protect(pa
, prot
)
register struct pvlist
*pv
, *pv0
, *npv
;
register struct pmap
*pm
;
register int va
, vseg
, pteva
, tpte
;
register int flags
, nleft
, i
, pmeg
, s
, ctx
, doflush
;
if ((pmapdebug
& PDB_CHANGEPROT
) ||
(pmapdebug
& PDB_REMOVE
&& prot
== VM_PROT_NONE
))
printf("pmap_page_protect(%x, %x)\n", pa
, prot
);
* Skip unmanaged pages, or operations that do not take
* away write permission.
	if (!managed(pa) || prot & VM_PROT_WRITE)
write_user_windows(); /* paranoia */
if (prot
& VM_PROT_READ
) {
pv_changepte(pvhead(pa
), 0, PG_W
);
* Remove all access to all people talking to this page.
* Walk down PV list, removing all mappings.
* The logic is much like that for pmap_remove,
* but we know we are removing exactly one page.
if ((pm
= pv
->pv_pmap
) == NULL
) {
flags
= pv
->pv_flags
& ~PV_NC
;
for (;; pm
= pv
->pv_pmap
) {
if ((nleft
= pm
->pm_npte
[vseg
]) == 0)
panic("pmap_remove_all: empty vseg");
pm
->pm_npte
[vseg
] = nleft
;
pmeg
= pm
->pm_segmap
[vseg
];
free((caddr_t
)pte
, M_VMPMAP
);
setcontext(pm
->pm_ctxnum
);
doflush
= vactype
!= VAC_NONE
;
/* XXX use per-cpu pteva? */
pteva
= VA_VPG(va
) * NBPG
;
panic("pmap_page_protect !PG_V 1");
panic("pmap_page_protect !PG_V 2");
for (i
= ncontext
; --i
> 0;) {
free((caddr_t
)pte
, M_VMPMAP
);
free((caddr_t
)pv
, M_VMPVENT
);
* Lower (make more strict) the protection on the specified
* There are only two cases: either the protection is going to 0
* (in which case we call pmap_remove to do the dirty work), or
* it is going from read/write to read-only. The latter is
pmap_protect(pm
, sva
, eva
, prot
)
register struct pmap
*pm
;
register int va
, nva
, vseg
, pteva
, pmeg
;
if (pm
== NULL
|| prot
& VM_PROT_WRITE
)
if ((prot
& VM_PROT_READ
) == 0) {
pmap_remove(pm
, sva
, eva
);
simple_lock(&pm
->pm_lock
);
for (va
= sva
; va
< eva
;) {
if (nva
== 0) panic("pmap_protect: last segment"); /* cannot happen */
if (pm
->pm_npte
[vseg
] == 0) {
pmeg
= pm
->pm_segmap
[vseg
];
register int *pte
= &pm
->pm_pte
[vseg
][VA_VPG(va
)];
/* not in MMU; just clear PG_W from core copies */
for (; va
< nva
; va
+= NBPG
)
/* in MMU: take away write bits from MMU PTEs */
* Flush cache so that any existing cache
* tags are updated. This is really only
* needed for PTEs that lose PG_W.
setcontext(pm
->pm_ctxnum
);
for (; va
< nva
; va
+= NBPG
) {
pmap_stats
.ps_npg_prot_all
++;
pmap_stats
.ps_npg_prot_actual
++;
setpte(va
, tpte
& ~PG_W
);
* No context, hence not cached;
/* XXX use per-cpu pteva? */
pteva
= VA_VPG(va
) * NBPG
;
for (; va
< nva
; pteva
+= NBPG
, va
+= NBPG
)
setpte(pteva
, getpte(pteva
) & ~PG_W
);
simple_unlock(&pm
->pm_lock
);
* Change the protection and/or wired status of the given (MI) virtual page.
* XXX: should have separate function (or flag) telling whether only wiring
pmap_changeprot(pm
, va
, prot
, wired
)
register struct pmap
*pm
;
register int vseg
, tpte
, newprot
, pmeg
, ctx
, i
, s
;
if (pmapdebug
& PDB_CHANGEPROT
)
printf("pmap_changeprot(%x, %x, %x, %x)\n",
write_user_windows(); /* paranoia */
	newprot = prot & VM_PROT_WRITE ? PG_S|PG_W : PG_S;
	newprot = prot & VM_PROT_WRITE ? PG_W : 0;
s
= splpmap(); /* conservative */
pmap_stats
.ps_changeprots
++;
/* update PTEs in software or hardware */
if ((pmeg
= pm
->pm_segmap
[vseg
]) == seginval
) {
register int *pte
= &pm
->pm_pte
[vseg
][VA_VPG(va
)];
if ((*pte
& PG_PROT
) == newprot
)
*pte
= (*pte
& ~PG_PROT
) | newprot
;
/* use current context; flush writeback cache */
setcontext(pm
->pm_ctxnum
);
if ((tpte
& PG_PROT
) == newprot
)
if (vactype
== VAC_WRITEBACK
&&
(tpte
& (PG_W
| PG_NC
)) == PG_W
)
cache_flush_page((int)va
);
/* XXX use per-cpu va? */
if ((tpte
& PG_PROT
) == newprot
)
tpte
= (tpte
& ~PG_PROT
) | newprot
;
/* only wiring changed, and we ignore wiring */
pmap_stats
.ps_useless_changeprots
++;
* Insert (MI) physical page pa at virtual address va in the given pmap.
* NB: the pa parameter includes type bits PMAP_OBIO, PMAP_NC as necessary.
* If pa is not in the `managed' range it will not be `bank mapped'.
* This works during bootstrap only because the first 4MB happens to
* There may already be something else there, or we might just be
* changing protections and/or wiring on an existing mapping.
* XXX should have different entry points for changing!
pmap_enter(pm
, va
, pa
, prot
, wired
)
register struct pmap
*pm
;
register struct pvlist
*pv
;
register int pteproto
, ctx
;
if (pmapdebug
& PDB_ENTER
)
printf("pmap_enter(%x, %x, %x, %x, %x)\n",
pm
, va
, pa
, prot
, wired
);
pteproto
= PG_V
| ((pa
& PMAP_TNC
) << PG_TNC_SHIFT
);
* Set up prototype for new PTE. Cannot set PG_NC from PV_NC yet
* since the pvlist no-cache bit might change as a result of the
* new mapping.
pteproto
|= SWTOHW(atop(pa
));
pteproto
|= atop(pa
) & PG_PFNUM
;
if (prot
& VM_PROT_WRITE
)
pmap_enk(pm
, va
, prot
, wired
, pv
, pteproto
| PG_S
);
pmap_enu(pm
, va
, prot
, wired
, pv
, pteproto
);
/* enter new (or change existing) kernel mapping */
pmap_enk(pm
, va
, prot
, wired
, pv
, pteproto
)
register struct pmap
*pm
;
register struct pvlist
*pv
;
register int vseg
, tpte
, pmeg
, i
, s
;
s
= splpmap(); /* XXX way too conservative */
if (pm
->pm_segmap
[vseg
] != seginval
&&
(tpte
= getpte(va
)) & PG_V
) {
register int addr
= tpte
& PG_PFNUM
;
if (addr
== (pteproto
& PG_PFNUM
)) {
/* just changing protection and/or wiring */
pmap_changeprot(pm
, va
, prot
, wired
);
/*printf("pmap_enk: changing existing va=>pa entry\n");*/
* Switcheroo: changing pa for this va.
* If old pa was managed, remove from pvlist.
* If old page was cached, flush cache.
addr
= ptoa(HWTOSW(addr
));
pv_unlink(pvhead(addr
), pm
, va
);
cache_flush_page((int)va
);
* If the new mapping is for a managed PA, enter into pvlist.
* Note that the mapping for a malloc page will always be
* unique (hence will never cause a second call to malloc).
pteproto
|= pv_link(pv
, pm
, va
);
pmeg
= pm
->pm_segmap
[vseg
];
* Allocate an MMU entry now (on locked list),
* and map it into every context. Set all its
* PTEs invalid (we will then overwrite one, but
* this is more efficient than looping twice).
if (pm
->pm_ctx
== NULL
|| pm
->pm_ctxnum
!= 0)
panic("pmap_enk: kern seg but no kern ctx");
pmeg
= me_alloc(&me_locked
, pm
, vseg
)->me_pmeg
;
pm
->pm_segmap
[vseg
] = pmeg
;
/* set all PTEs to invalid, then overwrite one PTE below */
tva
= VA_ROUNDDOWNTOSEG(va
);
/* ptes kept in hardware only */
/* enter new (or change existing) user mapping */
pmap_enu(pm
, va
, prot
, wired
, pv
, pteproto
)
register struct pmap
*pm
;
register struct pvlist
*pv
;
register int vseg
, *pte
, tpte
, pmeg
, i
, s
, doflush
;
write_user_windows(); /* XXX conservative */
s
= splpmap(); /* XXX conservative */
* If there is no space in which the PTEs can be written
* while they are not in the hardware, this must be a new
* virtual segment. Get PTE space and count the segment.
* TO SPEED UP CTX ALLOC, PUT SEGMENT BOUNDS STUFF HERE
/* definitely a new mapping */
register int size
= NPTESG
* sizeof *pte
;
pte
= (int *)malloc((u_long
)size
, M_VMPMAP
, M_WAITOK
);
if (pm
->pm_pte
[vseg
] != NULL
) {
printf("pmap_enter: pte filled during sleep\n"); /* can this happen? */
free((caddr_t
)pte
, M_VMPMAP
);
if (pm
->pm_segmap
[vseg
] != seginval
)
panic("pmap_enter: new ptes, but not seginval");
bzero((caddr_t
)pte
, size
);
/* might be a change: fetch old pte */
if ((pmeg
= pm
->pm_segmap
[vseg
]) == seginval
)
tpte
= pte
[VA_VPG(va
)]; /* software pte */
if (pm
->pm_ctx
) { /* hardware pte */
setcontext(pm
->pm_ctxnum
);
/* XXX use per-cpu pteva? */
tpte
= getpte(VA_VPG(va
) * NBPG
);
register int addr
= tpte
& PG_PFNUM
;
if (addr
== (pteproto
& PG_PFNUM
)) {
/* just changing prot and/or wiring */
/* caller should call this directly: */
pmap_changeprot(pm
, va
, prot
, wired
);
* Switcheroo: changing pa for this va.
* If old pa was managed, remove from pvlist.
* If old page was cached, flush cache.
/*printf("%s[%d]: pmap_enu: changing existing va(%x)=>pa entry\n",
curproc->p_comm, curproc->p_pid, va);*/
addr
= ptoa(HWTOSW(addr
));
pv_unlink(pvhead(addr
), pm
, va
);
doflush
&& (tpte
& PG_NC
) == 0)
cache_flush_page((int)va
);
pteproto
|= pv_link(pv
, pm
, va
);
* Update hardware or software PTEs (whichever are active).
	if ((pmeg = pm->pm_segmap[vseg]) != seginval) {
		/* ptes are in hardware */
		setcontext(pm->pm_ctxnum);
/* XXX use per-cpu pteva? */
/* update software copy */
* Change the wiring attribute for a map/virtual-address pair.
pmap_change_wiring(pm
, va
, wired
)
pmap_stats
.ps_useless_changewire
++;
* Extract the physical page address associated
* with the given map/virtual_address pair.
* GRR, the vm code knows; we should not have to do this!
register struct pmap
*pm
;
printf("pmap_extract: null pmap\n");
if (pm
->pm_segmap
[vseg
] != seginval
) {
register int ctx
= getcontext();
setcontext(pm
->pm_ctxnum
);
tpte
= getpte(VA_VPG(va
) * NBPG
);
register int *pte
= pm
->pm_pte
[vseg
];
printf("pmap_extract: invalid vseg\n");
if ((tpte
& PG_V
) == 0) {
printf("pmap_extract: invalid pte\n");
return ((tpte
<< PGSHIFT
) | (va
& PGOFSET
));
* Copy the range specified by src_addr/len
* from the source map to the range dst_addr/len
* in the destination map.
* This routine is only advisory and need not do anything.
pmap_copy(dst_pmap
, src_pmap
, dst_addr
, len
, src_addr
)
struct pmap
*dst_pmap
, *src_pmap
;
* Require that all active physical maps contain no
* incorrect entries NOW. [This update includes
* forcing updates of any address map caching.]
* Garbage collects the physical map system for
* pages which are no longer used.
* Success need not be guaranteed -- that is, there
* may well be pages which are not referenced, but
* others may be collected.
* Called by the pageout daemon when pages are scarce.
* Clear the modify bit for the given physical page.
register struct pvlist
*pv
;
* Tell whether the given physical page has been modified.
	register struct pvlist *pv;

	if (pv->pv_flags & PV_MOD || pv_syncflags(pv) & PV_MOD)
* Clear the reference bit for the given physical page.
register struct pvlist
*pv
;
* Tell whether the given physical page has been referenced.
	register struct pvlist *pv;

	if (pv->pv_flags & PV_REF || pv_syncflags(pv) & PV_REF)
* Make the specified pages (by pmap, offset) pageable (or not) as requested.
* A page which is not pageable may not take a fault; therefore, its page
* table entry must remain valid for the duration (or at least, the trap
* handler must not call vm_fault).
* This routine is merely advisory; pmap_enter will specify that these pages
* are to be wired down (or not) as appropriate.
pmap_pageable(pm
, start
, end
, pageable
)
* Fill the given MI physical page with zero bytes.
* We avoid stomping on the cache.
* XXX might be faster to use destination's context and allow cache to fill?
* The following might not be necessary since the page
* is being cleared because it is about to be allocated,
* i.e., is in use by no one.
pv_flushcache(pvhead(pa
));
	pte = PG_V | PG_S | PG_W | PG_NC | SWTOHW(atop(pa));
	pte = PG_V | PG_S | PG_W | PG_NC | (atop(pa) & PG_PFNUM);
* Copy the given MI physical source page to its destination.
* We avoid stomping on the cache as above (with same `XXX' note).
* We must first flush any write-back cache for the source page.
* We go ahead and stomp on the kernel's virtual cache for the
* source page, since the cache can read memory MUCH faster than
	register caddr_t sva, dva;

	if (vactype == VAC_WRITEBACK)
		pv_flushcache(pvhead(src));
	spte = PG_V | PG_S | SWTOHW(atop(src));
	spte = PG_V | PG_S | (atop(src) & PG_PFNUM);
/* similar `might not be necessary' comment applies */
		pv_flushcache(pvhead(dst));
	dpte = PG_V | PG_S | PG_W | PG_NC | SWTOHW(atop(dst));
	dpte = PG_V | PG_S | PG_W | PG_NC | (atop(dst) & PG_PFNUM);
	qcopy(sva, dva, NBPG);		/* loads cache, so we must ... */
	cache_flush_page((int)sva);
* Turn a cdevsw d_mmap value into a byte address for pmap_enter.
* XXX this should almost certainly be done differently, and
* elsewhere, or even not at all
* Turn off cache for a given (va, number of pages).
* We just assert PG_NC for each PTE; the addresses must reside
* in locked kernel space. A cache flush is also done.
	for (; --npages >= 0; va += NBPG) {
			panic("kvm_uncache !pg_v");
		cache_flush_page((int)va);
pmap_enter_hw(pm, va, pa, prot, wired)
	register struct pmap *pm;
	register struct memarr *ma;

	if (pa >= MAXMEM)			/* ??? */
	for (ma = pmemarr, n = npmemarr; --n >= 0; ma++) {
		t = (u_int)pa - ma->addr;
	pa = (HWTOSW(atop(pa)) << PGSHIFT) | (pa & PGOFSET);
	if (pa >= vm_first_phys + vm_num_phys)	/* ??? */
	pmap_enter(pm, va, pa, prot, wired);