BSD 4_3 release
[unix-history] / usr / src / sys / sys / vm_page.c
index d7c6d9b..6a1855f 100644 (file)
@@ -1,20 +1,26 @@
-/*     vm_page.c       6.3     83/09/26        */
+/*
+ * Copyright (c) 1982, 1986 Regents of the University of California.
+ * All rights reserved.  The Berkeley software License Agreement
+ * specifies the terms and conditions for redistribution.
+ *
+ *     @(#)vm_page.c   7.1 (Berkeley) 6/5/86
+ */
 
 #include "../machine/reg.h"
 #include "../machine/pte.h"
 
 
 #include "../machine/reg.h"
 #include "../machine/pte.h"
 
-#include "../h/param.h"
-#include "../h/systm.h"
-#include "../h/inode.h"
-#include "../h/dir.h"
-#include "../h/user.h"
-#include "../h/proc.h"
-#include "../h/buf.h"
-#include "../h/text.h"
-#include "../h/cmap.h"
-#include "../h/vm.h"
-#include "../h/file.h"
-#include "../h/trace.h"
+#include "param.h"
+#include "systm.h"
+#include "inode.h"
+#include "dir.h"
+#include "user.h"
+#include "proc.h"
+#include "buf.h"
+#include "text.h"
+#include "cmap.h"
+#include "vm.h"
+#include "file.h"
+#include "trace.h"
 
 int    nohash = 0;
 /*
 
 int    nohash = 0;
 /*
@@ -65,8 +71,7 @@ pagein(virtaddr, dlyu)
 {
        register struct proc *p;
        register struct pte *pte;
 {
        register struct proc *p;
        register struct pte *pte;
-       register struct inode *ip;
-       register u_int v;
+       register unsigned v;
        unsigned pf;
        int type, fileno;
        struct pte opte;
        unsigned pf;
        int type, fileno;
        struct pte opte;
@@ -77,12 +82,14 @@ pagein(virtaddr, dlyu)
        struct cmap *c;
        int j;
        daddr_t bn, bncache, bnswap;
        struct cmap *c;
        int j;
        daddr_t bn, bncache, bnswap;
-       int si;
+       int si, sk;
+       int swerror = 0;
 #ifdef PGINPROF
 #include "../vax/mtpr.h"
 #ifdef PGINPROF
 #include "../vax/mtpr.h"
-       int otime, olbolt, oicr, a, s;
+       int otime, olbolt, oicr, s;
+       long a;
 
 
-       s = spl6();
+       s = splclock();
        otime = time, olbolt = lbolt, oicr = mfpr(ICR);
 #endif
        cnt.v_faults++;
        otime = time, olbolt = lbolt, oicr = mfpr(ICR);
 #endif
        cnt.v_faults++;
@@ -92,16 +99,13 @@ pagein(virtaddr, dlyu)
         */
        vsave = v = clbase(btop(virtaddr));
        p = u.u_procp;
         */
        vsave = v = clbase(btop(virtaddr));
        p = u.u_procp;
-       if (isatsv(p, v)) {
+       if (isatsv(p, v))
                type = CTEXT;
                type = CTEXT;
-               pte = tptopte(p, vtotp(p, v));
-       } else if (isadsv(p, v)) {
-               type = CDATA;
-               pte = dptopte(p, vtodp(p, v));
-       } else {
+       else if (isassv(p, v))
                type = CSTACK;
                type = CSTACK;
-               pte = sptopte(p, vtosp(p, v));
-       }
+       else
+               type = CDATA;
+       pte = vtopte(p, v);
        if (pte->pg_v)
                panic("pagein");
 
        if (pte->pg_v)
                panic("pagein");
 
@@ -142,9 +146,10 @@ valid:
                 * the size recorded for the resident set.
                 */
                si = splimp();
                 * the size recorded for the resident set.
                 */
                si = splimp();
-               if (cmap[pgtocm(pte->pg_pfnum)].c_free) {
+               c = &cmap[pgtocm(pte->pg_pfnum)];
+               if (c->c_free) {
                        pgtrace(TR_FRECLAIM);
                        pgtrace(TR_FRECLAIM);
-                       munlink(pte->pg_pfnum);
+                       munlink(c);
                        cnt.v_pgfrec++;
                        if (type == CTEXT)
                                p->p_textp->x_rssize += CLSIZE;
                        cnt.v_pgfrec++;
                        if (type == CTEXT)
                                p->p_textp->x_rssize += CLSIZE;
@@ -158,7 +163,7 @@ valid:
                        pte->pg_m = 1;
                distcl(pte);
                if (type == CTEXT)
                        pte->pg_m = 1;
                distcl(pte);
                if (type == CTEXT)
-                       distpte(p->p_textp, vtotp(p, v), pte);
+                       distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
                u.u_ru.ru_minflt++;
                cnt.v_pgrec++;
                if (dlyu) {
                u.u_ru.ru_minflt++;
                cnt.v_pgrec++;
                if (dlyu) {
@@ -207,12 +212,7 @@ valid:
                        dev = swapdev;
                        bncache = bnswap;
                } else {
                        dev = swapdev;
                        bncache = bnswap;
                } else {
-                       register struct file *fp = u.u_ofile[fileno];
-
-                       if (fp == NULL || fp->f_type != DTYPE_INODE)
-                               panic("pagein u.u_ofile");
-                       ip = (struct inode *)fp->f_data;
-                       dev = ip->i_dev;
+                       panic("pagein");        /* can't happen */
                }
        }
        klsize = 1;
                }
        }
        klsize = 1;
@@ -223,15 +223,15 @@ valid:
         * This can happen only if the page is filling
         * from a inode or from the swap device, (e.g. not when reading
         * in 407/410 execs to a zero fill page.)
         * This can happen only if the page is filling
         * from a inode or from the swap device, (e.g. not when reading
         * in 407/410 execs to a zero fill page.)
+        * Honor lock bit to avoid races with pageouts.
         */
        if (type == CTEXT && fileno != PG_FZERO && !nohash) {
                si = splimp();
                while ((c = mfind(dev, bncache)) != 0) {
         */
        if (type == CTEXT && fileno != PG_FZERO && !nohash) {
                si = splimp();
                while ((c = mfind(dev, bncache)) != 0) {
-                       pf = cmtopg(c - cmap);
                        if (c->c_lock == 0)
                                break;
                        if (c->c_lock == 0)
                                break;
-                       mlock(pf);
-                       munlock(pf);
+                       MLOCK(c);
+                       MUNLOCK(c);
                }
                if (c) {
                        if (c->c_type != CTEXT || c->c_gone == 0 ||
                }
                if (c) {
                        if (c->c_type != CTEXT || c->c_gone == 0 ||
@@ -241,7 +241,8 @@ valid:
                        /*
                         * Following code mimics memall().
                         */
                        /*
                         * Following code mimics memall().
                         */
-                       munlink(pf);
+                       munlink(c);
+                       pf = cmtopg(c - cmap);
                        for (j = 0; j < CLSIZE; j++) {
                                *(int *)pte = pf++;
                                pte->pg_prot = opte.pg_prot;
                        for (j = 0; j < CLSIZE; j++) {
                                *(int *)pte = pf++;
                                pte->pg_prot = opte.pg_prot;
@@ -285,11 +286,13 @@ valid:
         * If we block we have to start over, since anything
         * could have happened.
         */
         * If we block we have to start over, since anything
         * could have happened.
         */
+       sk = splimp();          /* lock memalls from here into kluster */
        if (freemem < CLSIZE * KLMAX) {
                pgtrace(TR_WAITMEM);
                while (freemem < CLSIZE * KLMAX)
                        sleep((caddr_t)&freemem, PSWP+2);
                pgtrace(TR_EWAITMEM);
        if (freemem < CLSIZE * KLMAX) {
                pgtrace(TR_WAITMEM);
                while (freemem < CLSIZE * KLMAX)
                        sleep((caddr_t)&freemem, PSWP+2);
                pgtrace(TR_EWAITMEM);
+               splx(sk);
                pte = vtopte(p, v);
                if (pte->pg_v)
                        goto valid;
                pte = vtopte(p, v);
                if (pte->pg_v)
                        goto valid;
@@ -303,14 +306,15 @@ valid:
         * the (process or text) resident set size.
         */
        p->p_flag |= SPAGE;
         * the (process or text) resident set size.
         */
        p->p_flag |= SPAGE;
-       (void) memall(pte, CLSIZE, p, type);
+       if (memall(pte, CLSIZE, p, type) == 0)
+               panic("pagein memall");
        pte->pg_prot = opte.pg_prot;
        pf = pte->pg_pfnum;
        cmap[pgtocm(pf)].c_intrans = 1;
        distcl(pte);
        if (type == CTEXT) {
                p->p_textp->x_rssize += CLSIZE;
        pte->pg_prot = opte.pg_prot;
        pf = pte->pg_pfnum;
        cmap[pgtocm(pf)].c_intrans = 1;
        distcl(pte);
        if (type == CTEXT) {
                p->p_textp->x_rssize += CLSIZE;
-               distpte(p->p_textp, vtotp(p, v), pte);
+               distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
        } else
                p->p_rssize += CLSIZE;
 
        } else
                p->p_rssize += CLSIZE;
 
@@ -342,12 +346,20 @@ valid:
                                        clearseg(pf+i);
                                if (type != CTEXT)
                                        cnt.v_zfod += CLSIZE;
                                        clearseg(pf+i);
                                if (type != CTEXT)
                                        cnt.v_zfod += CLSIZE;
+                               splx(sk);
                                goto skipswap;
                        }
                        pgtrace(TR_EXFOD);
                        cnt.v_exfod += CLSIZE;
                } else
                        panic("pagein vread");
                                goto skipswap;
                        }
                        pgtrace(TR_EXFOD);
                        cnt.v_exfod += CLSIZE;
                } else
                        panic("pagein vread");
+               /*
+                * Fill from inode.  Try to find adjacent
+                * pages to bring in also.
+                */
+               v = fodkluster(p, v, pte, &klsize, dev, &bn);
+               bncache = bn;
+               splx(sk);
                /*
                 * Blocks of an executable may still be in the buffer
                 * cache, so we explicitly flush them out to disk
                /*
                 * Blocks of an executable may still be in the buffer
                 * cache, so we explicitly flush them out to disk
@@ -369,12 +381,13 @@ valid:
                v = kluster(p, v, pte, B_READ, &klsize,
                    (type == CTEXT) ? kltxt :
                    ((p->p_flag & SSEQL) ? klseql : klin), bn);
                v = kluster(p, v, pte, B_READ, &klsize,
                    (type == CTEXT) ? kltxt :
                    ((p->p_flag & SSEQL) ? klseql : klin), bn);
+               splx(sk);
                /* THIS COULD BE COMPUTED INCREMENTALLY... */
                bncache = bn = vtod(p, v, &u.u_dmap, &u.u_smap);
        }
 
        distcl(pte);
                /* THIS COULD BE COMPUTED INCREMENTALLY... */
                bncache = bn = vtod(p, v, &u.u_dmap, &u.u_smap);
        }
 
        distcl(pte);
-       swap(p, bn, ptob(v), klsize * ctob(CLSIZE),
+       swerror = swap(p, bn, ptob(v), klsize * ctob(CLSIZE),
            B_READ, B_PGIN, dev, 0); 
 #ifdef TRACE
        trace(TR_PGINDONE, vsave, u.u_procp->p_pid);
            B_READ, B_PGIN, dev, 0); 
 #ifdef TRACE
        trace(TR_PGINDONE, vsave, u.u_procp->p_pid);
@@ -405,9 +418,11 @@ skipswap:
        pte->pg_v = 1;
        distcl(pte);
        if (type == CTEXT) {
        pte->pg_v = 1;
        distcl(pte);
        if (type == CTEXT) {
-               distpte(p->p_textp, vtotp(p, vsave), pte);
-               if (opte.pg_fod)
-                       p->p_textp->x_flag |= XWRIT;
+               if (swerror == 0) {
+                       distpte(p->p_textp, (unsigned)vtotp(p, vsave), pte);
+                       if (opte.pg_fod)
+                               p->p_textp->x_flag |= XWRIT;
+               }
                wakeup((caddr_t)p->p_textp);
        }
 
                wakeup((caddr_t)p->p_textp);
        }
 
@@ -422,13 +437,15 @@ skipswap:
        for (i = 0; i < klsize; i++) {
                c = &cmap[pgtocm(pte->pg_pfnum)];
                c->c_intrans = 0;
        for (i = 0; i < klsize; i++) {
                c = &cmap[pgtocm(pte->pg_pfnum)];
                c->c_intrans = 0;
-               if (type == CTEXT && c->c_blkno == 0 && bncache && !nohash) {
+               if (type == CTEXT && c->c_blkno == 0 && bncache && !nohash &&
+                   !swerror) {
                        mhash(c, dev, bncache);
                        mhash(c, dev, bncache);
-                       bncache += CLBYTES / DEV_BSIZE;
+                       bncache += btodb(CLBYTES);
                }
                if (v != vsave || !dlyu)
                }
                if (v != vsave || !dlyu)
-                       munlock(pte->pg_pfnum);
-               if (v != vsave && type != CTEXT && preptofree) {
+                       MUNLOCK(c);
+               if (v != vsave && type != CTEXT && preptofree &&
+                   opte.pg_fod == 0) {
                        /*
                         * Throw pre-paged data/stack pages at the
                         * bottom of the free list.
                        /*
                         * Throw pre-paged data/stack pages at the
                         * bottom of the free list.
@@ -467,10 +484,6 @@ skipswap:
        }
 }
 
        }
 }
 
-#if defined(BERT)
-int    dmod = 1000000;
-int    dcnt;
-#endif
 /*
  * Take away n pages of data space
  * starting at data page dp.
 /*
  * Take away n pages of data space
  * starting at data page dp.
@@ -493,10 +506,6 @@ dpageout(p, dp, n)
        }
        if (dp + n > p->p_dsize)
                n = p->p_dsize - dp;
        }
        if (dp + n > p->p_dsize)
                n = p->p_dsize - dp;
-#if defined(BERT)
-       if (++dcnt % dmod == 0)
-               printf("dp %d, n %d\n", dp, n);
-#endif
        for (i = 0; i < n; i += CLSIZE, dp += CLSIZE) {
                pte = dptopte(p, dp);
                if (pte->pg_fod || pte->pg_pfnum == 0)
        for (i = 0; i < n; i += CLSIZE, dp += CLSIZE) {
                pte = dptopte(p, dp);
                if (pte->pg_fod || pte->pg_pfnum == 0)
@@ -513,7 +522,7 @@ dpageout(p, dp, n)
                if (dirtycl(pte)) {
                        if (bswlist.av_forw == NULL)
                                continue;
                if (dirtycl(pte)) {
                        if (bswlist.av_forw == NULL)
                                continue;
-                       mlock(pte->pg_pfnum);
+                       MLOCK(c);
                        pte->pg_m = 0;
                        distcl(pte);
                        p->p_poip++;
                        pte->pg_m = 0;
                        distcl(pte);
                        p->p_poip++;
@@ -521,7 +530,7 @@ dpageout(p, dp, n)
                                &klsize, klout, (daddr_t)0);
                        /* THIS ASSUMES THAT p == u.u_procp */
                        daddr = vtod(p, v, &u.u_dmap, &u.u_smap);
                                &klsize, klout, (daddr_t)0);
                        /* THIS ASSUMES THAT p == u.u_procp */
                        daddr = vtod(p, v, &u.u_dmap, &u.u_smap);
-                       swap(p, daddr, ptob(v), klsize * ctob(CLSIZE),
+                       (void)swap(p, daddr, ptob(v), klsize * ctob(CLSIZE),
                            B_WRITE, B_DIRTY, swapdev, pte->pg_pfnum);
                } else {
                        if (c->c_gone == 0)
                            B_WRITE, B_DIRTY, swapdev, pte->pg_pfnum);
                } else {
                        if (c->c_gone == 0)
@@ -532,7 +541,127 @@ dpageout(p, dp, n)
        }
 }
                    
        }
 }
                    
-int    fifo = 0;
+unsigned maxdmap;       /* set in vminit() to dmapsize(dmmin, dmmax) */
+unsigned maxtsize;      /* defaulted to MAXTSIZ and clamped in vminit() */
+
+/*
+ * Setup the paging constants for the clock algorithm.
+ * Called after the system is initialized and the amount of memory
+ * and number of paging devices is known.
+ *
+ * Threshold constants are defined in ../machine/vmparam.h.
+ *
+ * Lotsfree, desfree, minfree, maxpgio, fastscan, slowscan, dmmin,
+ * dmmax, dmtext and maxtsize are given their defaults only when they
+ * are still zero on entry, so a nonzero value set beforehand is kept.
+ * The caps on fastscan, dmtext and maxtsize are applied regardless,
+ * maxdmap is always recomputed, and the u.u_rlimit entries and
+ * proc[0].p_maxrss are always overwritten.
+ *
+ * NOTE(review): the "X / NBPG" defaults look like byte counts being
+ * converted to pages -- confirm against vmparam.h.
+ */
+vminit()
+{
+
+       /*
+        * Lotsfree is threshold where paging daemon turns on.
+        * The default is LOTSFREE / NBPG, held to at most
+        * LOOPPAGES / LOTSFREEFRACT.
+        */
+       if (lotsfree == 0) {
+               lotsfree = LOTSFREE / NBPG;
+               if (lotsfree > LOOPPAGES / LOTSFREEFRACT)
+                       lotsfree = LOOPPAGES / LOTSFREEFRACT;
+       }
+       /*
+        * Desfree is amount of memory desired free.
+        * If less than this for extended period, do swapping.
+        * The default is DESFREE / NBPG, held to at most
+        * LOOPPAGES / DESFREEFRACT.
+        */
+       if (desfree == 0) {
+               desfree = DESFREE / NBPG;
+               if (desfree > LOOPPAGES / DESFREEFRACT)
+                       desfree = LOOPPAGES / DESFREEFRACT;
+       }
+
+       /*
+        * Minfree is minimal amount of free memory which is tolerable.
+        * Unlike the two thresholds above, its default is capped
+        * relative to desfree (desfree / MINFREEFRACT) rather than
+        * to LOOPPAGES, so it must be computed after desfree.
+        */
+       if (minfree == 0) {
+               minfree = MINFREE / NBPG;
+               if (minfree > desfree / MINFREEFRACT)
+                       minfree = desfree / MINFREEFRACT;
+       }
+
+       /*
+        * Maxpgio thresholds how much paging is acceptable.
+        * This figures that 2/3 busy on an arm is all that is
+        * tolerable for paging.  We assume one operation per disk rev.
+        */
+       if (maxpgio == 0)
+               maxpgio = (DISKRPM * 2) / 3;
+
+       /*
+        * Clock to scan using max of ~~10% of processor time for sampling,
+        *     this estimated to allow maximum of 200 samples per second.
+        * This yields a ``fastscan'' of roughly (with CLSIZE=2):
+        *      <=1m    2m      3m      4m      8m
+        *      5s      10s     15s     20s     40s
+        * The LOOPPAGES / 5 cap is applied even when fastscan was
+        * already nonzero on entry.
+        */
+       if (fastscan == 0)
+               fastscan = 200;
+       if (fastscan > LOOPPAGES / 5)
+               fastscan = LOOPPAGES / 5;
+
+       /*
+        * Set slow scan time to 1/2 the fast scan time.
+        * This uses the (possibly capped) fastscan from above and
+        * is done only when slowscan is still zero.
+        */
+       if (slowscan == 0)
+               slowscan = fastscan / 2;
+
+       /*
+        * Calculate the swap allocation constants.
+        * When dmmax is defaulted it starts at DMMAX and is halved
+        * for as long as a map built with half that maximum would
+        * still reach MAXDSIZ and dmmax is above dmmin.
+        * Maxdmap is then recomputed from the final dmmin and dmmax,
+        * dmtext is held to at most dmmax, and maxtsize is held to
+        * at most dtob(NXDAD * dmtext).
+        */
+        if (dmmin == 0)
+                dmmin = DMMIN;
+        if (dmmax == 0) {
+                dmmax = DMMAX;
+               while (dmapsize(dmmin, dmmax / 2) >= MAXDSIZ && dmmax > dmmin)
+                       dmmax /= 2;
+       }
+       maxdmap = dmapsize(dmmin, dmmax);
+        if (dmtext == 0)
+                dmtext = DMTEXT;
+        if (dmtext > dmmax)
+                dmtext = dmmax;
+       if (maxtsize == 0)
+               maxtsize = MAXTSIZ;
+       if (maxtsize > dtob(NXDAD * dmtext))
+               maxtsize = dtob(NXDAD * dmtext);
+
+       /*
+        * Set up the initial limits on process VM.
+        * Set the maximum resident set size to be all
+        * of (reasonably) available memory.  This causes
+        * any single, large process to start random page
+        * replacement once it fills memory.
+        * The stack and data hard limits are the smaller of the
+        * compile-time maximum and maxdmap.  The RSS limits and
+        * proc[0].p_maxrss are both LOOPPAGES - desfree, the
+        * former passed through ctob().
+        */
+        u.u_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
+        u.u_rlimit[RLIMIT_STACK].rlim_max = MIN(MAXSSIZ, maxdmap);
+        u.u_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
+        u.u_rlimit[RLIMIT_DATA].rlim_max = MIN(MAXDSIZ, maxdmap);
+       u.u_rlimit[RLIMIT_RSS].rlim_cur = u.u_rlimit[RLIMIT_RSS].rlim_max =
+               ctob(LOOPPAGES - desfree);
+       proc[0].p_maxrss = LOOPPAGES - desfree;
+}
+/*
+ * Compute the size of a swap map of NDMAP chunks in which the
+ * first chunk is dmin and each later chunk is twice the previous
+ * one for as long as the previous one was below dmax.  Doubling
+ * stops at the first chunk size that is not less than dmax, so a
+ * chunk can exceed dmax when dmax is not dmin times a power of two.
+ * The chunk sizes are summed and the total is returned through
+ * dtob(); with no declared return type the result is an int.
+ * NOTE(review): dmin and dmax are presumably in disk blocks and the
+ * result in bytes -- confirm against the definition of dtob().
+ */
+dmapsize(dmin, dmax)
+       int dmin, dmax;
+{
+       register int i, blk, size = 0;
+
+       blk = dmin;
+       for (i = 0; i < NDMAP; i++) {
+               size += blk;
+               if (blk < dmax)
+                       blk *= 2;
+       }
+       return (dtob(size));
+}
+
+int    pushes;
+
+#define        FRONT   1
+#define        BACK    2
+
 /*
  * The page out daemon, which runs as process 2.
  *
 /*
  * The page out daemon, which runs as process 2.
  *
@@ -540,21 +669,29 @@ int       fifo = 0;
  * this process is not run.  When the number of free
  * pages stays in the range desfree to lotsfree,
  * this daemon runs through the pages in the loop
  * this process is not run.  When the number of free
  * pages stays in the range desfree to lotsfree,
  * this daemon runs through the pages in the loop
- * at a rate determined in vmsched(), simulating the missing
- * hardware reference bit, and cleaning pages and transferring
- * them to the free list.
+ * at a rate determined in vmsched().  Pageout manages
+ * two hands on the clock.  The front hand moves through
+ * memory, clearing the valid bit (simulating a reference bit),
+ * and stealing pages from procs that are over maxrss.
+ * The back hand travels a distance behind the front hand,
+ * freeing the pages that have not been referenced in the time
+ * since the front hand passed.  If modified, they are pushed to
+ * swap before being freed.
  */
 pageout()
 {
  */
 pageout()
 {
-       register struct proc *rp;
-       register struct text *xp;
-       register struct cmap *c;
-       register struct pte *pte;
-       int count, pushes;
-       swblk_t daddr;
-       unsigned v;
-       int maxhand = pgtocm(maxfree);
-       int klsize;
+       register int count;
+       register int maxhand = pgtocm(maxfree);
+       register int fronthand, backhand;
+
+       /*
+        * Set the two clock hands to be separated by a reasonable amount,
+        * but no more than 360 degrees apart.
+        */
+       backhand = 0 / CLBYTES;
+       fronthand = HANDSPREAD / CLBYTES;
+       if (fronthand >= maxhand)
+               fronthand = maxhand - 1;
 
 loop:
        /*
 
 loop:
        /*
@@ -565,7 +702,7 @@ loop:
         *
         * N.B.: We guarantee never to block while the cleaned list is nonempty.
         */
         *
         * N.B.: We guarantee never to block while the cleaned list is nonempty.
         */
-       (void) spl6();
+       (void) splbio();
        if (bclnlist != NULL) {
                (void) spl0();
                cleanup();
        if (bclnlist != NULL) {
                (void) spl0();
                cleanup();
@@ -576,204 +713,18 @@ loop:
        count = 0;
        pushes = 0;
        while (nscan < desscan && freemem < lotsfree) {
        count = 0;
        pushes = 0;
        while (nscan < desscan && freemem < lotsfree) {
-top:
                /*
                /*
-                * An iteration of the clock pointer (hand) around the loop.
-                * Look at the page at hand.  If it is a
-                * locked (for physical i/o e.g.), system (u., page table)
-                * or free, then leave it alone.
-                * Otherwise, find a process and text pointer for the
-                * page, and a virtual page number in either the
-                * process or the text image.
+                * If checkpage manages to add a page to the free list,
+                * we give ourselves another couple of trips around the loop.
                 */
                 */
-               c = &cmap[hand];
-               if (c->c_lock || c->c_free)
-                       goto skip;
-               switch (c->c_type) {
-
-               case CSYS:
-                       goto skip;
-
-               case CTEXT:
-                       xp = &text[c->c_ndx];
-                       rp = xp->x_caddr;
-                       v = tptov(rp, c->c_page);
-                       pte = tptopte(rp, c->c_page);
-                       break;
-
-               case CDATA:
-               case CSTACK:
-                       rp = &proc[c->c_ndx];
-                       while (rp->p_flag & SNOVM)
-                               rp = rp->p_xlink;
-                       xp = rp->p_textp;
-                       if (c->c_type == CDATA) {
-                               v = dptov(rp, c->c_page);
-                               pte = dptopte(rp, c->c_page);
-                       } else {
-                               v = sptov(rp, c->c_page);
-                               pte = sptopte(rp, c->c_page);
-                       }
-                       break;
-               }
-
-               if (pte->pg_pfnum != cmtopg(hand))
-                       panic("bad c_page");
-
-               /*
-                * If page is valid; make invalid but reclaimable.
-                * If this pte is not valid, then it must be reclaimable
-                * and we can add it to the free list.
-                */
-               if (pte->pg_v) {
-                       pte->pg_v = 0;
-                       if (anycl(pte, pg_m))
-                               pte->pg_m = 1;
-                       distcl(pte);
-                       if (c->c_type == CTEXT)
-                               distpte(xp, vtotp(rp, v), pte);
-                       if ((rp->p_flag & (SSEQL|SUANOM)) || fifo ||
-                           rp->p_rssize > rp->p_maxrss)
-                               goto take;
-               } else {
-take:
-                       if (c->c_type != CTEXT) {
-                               /*
-                                * Guarantee a minimal investment in data
-                                * space for jobs in balance set.
-                                */
-                               if (rp->p_rssize < saferss - rp->p_slptime)
-                                       goto skip;
-                       }
-
-                       /*
-                        * If the page is currently dirty, we
-                        * have to arrange to have it cleaned before it
-                        * can be freed.  We mark it clean immediately.
-                        * If it is reclaimed while being pushed, then modified
-                        * again, we are assured of the correct order of 
-                        * writes because we lock the page during the write.  
-                        * This guarantees that a swap() of this process (and
-                        * thus this page), initiated in parallel, will,
-                        * in fact, push the page after us.
-                        *
-                        * The most general worst case here would be for
-                        * a reclaim, a modify and a swapout to occur
-                        * all before the single page transfer completes.
-                        */
-                       if (dirtycl(pte)) {
-                               /*
-                                * Limit pushes to avoid saturating
-                                * pageout device.
-                                *
-                                * MAGIC 4 BECAUSE WE RUN EVERY 1/4 SEC (clock)
-                                */
-                               if (pushes > maxpgio / 4)
-                                       goto skip;
-                               pushes++;
-                               /*
-                                * If the process is being swapped out
-                                * or about to exit, do not bother with its
-                                * dirty pages
-                                */
-                               if (rp->p_flag & (SLOCK|SWEXIT))
-                                       goto skip;
-
-                               /*
-                                * Now carefully make sure that there will
-                                * be a header available for the push so that
-                                * we will not block waiting for a header in
-                                * swap().  The reason this is important is
-                                * that we (proc[2]) are the one who cleans
-                                * dirty swap headers and we could otherwise
-                                * deadlock waiting for ourselves to clean
-                                * swap headers.  The sleep here on &proc[2]
-                                * is actually (effectively) a sleep on both
-                                * ourselves and &bswlist, and this is known
-                                * to swdone and swap in vm_swp.c.  That is,
-                                * &proc[2] will be awakened both when dirty
-                                * headers show up and also to get the pageout
-                                * daemon moving.
-                                */
-               loop2:
-                               (void) spl6();
-                               if (bclnlist != NULL) {
-                                       (void) spl0();
-                                       cleanup();
-                                       goto loop2;
-                               }
-                               if (bswlist.av_forw == NULL) {
-                                       bswlist.b_flags |= B_WANTED;
-                                       sleep((caddr_t)&proc[2], PSWP+2);
-                                       (void) spl0();
-                                       /*
-                                        * Page disposition may have changed
-                                        * since process may have exec'ed,
-                                        * forked, exited or just about
-                                        * anything else... try this page
-                                        * frame again, from the top.
-                                        */
-                                       goto top;
-                               }
-                               (void) spl0();
-
-                               mlock((unsigned)cmtopg(hand));
-                               uaccess(rp, Pushmap, &pushutl);
-                               /*
-                                * Now committed to pushing the page...
-                                */
-                               pte->pg_m = 0;
-                               distcl(pte);
-                               if (c->c_type == CTEXT)  {
-                                       xp->x_poip++;
-                                       distpte(xp, vtotp(rp, v), pte);
-                               } else
-                                       rp->p_poip++;
-                               v = kluster(rp, v, pte, B_WRITE, &klsize, klout, (daddr_t)0);
-                               if (klsize == 0)
-                                       panic("pageout klsize");
-                               daddr = vtod(rp, v, &pushutl.u_dmap, &pushutl.u_smap);
-                               swap(rp, daddr, ptob(v), klsize * ctob(CLSIZE),
-                                   B_WRITE, B_DIRTY, swapdev, pte->pg_pfnum);
-                               /*
-                                * The cleaning of this page will be
-                                * completed later, in cleanup() called
-                                * (synchronously) by us (proc[2]).  In
-                                * the meantime, the page frame is locked
-                                * so no havoc can result.
-                                */
-                               goto skip;
-
-                       }
-                       /*
-                        * Decrement the resident set size of the current
-                        * text object/process, and put the page in the
-                        * free list. Note that we don't give memfree the
-                        * pte as its argument, since we don't want to destroy
-                        * the pte.  If it hasn't already been discarded
-                        * it may yet have a chance to be reclaimed from
-                        * the free list.
-                        */
-                       if (c->c_gone == 0)
-                               if (c->c_type == CTEXT)
-                                       xp->x_rssize -= CLSIZE;
-                               else
-                                       rp->p_rssize -= CLSIZE;
-                       memfree(pte, CLSIZE, 0);
-                       cnt.v_dfree += CLSIZE;
-
-                       /*
-                        * We managed to add a page to the free list,
-                        * so we give ourselves another couple of trips
-                        * around the loop.
-                        */
+               if (checkpage(fronthand, FRONT))
+                       count = 0;
+               if (checkpage(backhand, BACK))
                        count = 0;
                        count = 0;
-               }
-skip:
                cnt.v_scan++;
                nscan++;
                cnt.v_scan++;
                nscan++;
-               if (++hand >= maxhand) {
-                       hand = 0;
+               if (++fronthand >= maxhand) {
+                       fronthand = 0;
                        cnt.v_rev++;
                        if (count > 2) {
                                /*
                        cnt.v_rev++;
                        if (count > 2) {
                                /*
@@ -785,10 +736,218 @@ skip:
                        }
                        count++;
                }
                        }
                        count++;
                }
+               if (++backhand >= maxhand)
+                       backhand = 0;
        }
        goto loop;
 }
 
        }
        goto loop;
 }
 
+/*
+ * An iteration of the clock pointer (hand) around the loop.
+ * Look at the page at hand.  If it is a
+ * locked (for physical i/o e.g.), system (u., page table)
+ * or free, then leave it alone.
+ * Otherwise, if we are running the front hand,
+ * invalidate the page for simulation of the reference bit.
+ * If the proc is over maxrss, we take it.
+ * If running the back hand, check whether the page
+ * has been reclaimed.  If not, free the page,
+ * pushing it to disk first if necessary.
+ *
+ * Arguments:
+ *	hand		index into cmap[] of the page frame to examine.
+ *	whichhand	FRONT or BACK; selects which of the two clock
+ *			hands is being run (only BACK is tested for
+ *			explicitly in this routine).
+ *
+ * Returns 1 if the page was put on the free list or a push of it
+ * to the swap device was started, and 0 if the page was left alone.
+ * The caller in the pageout loop resets its "count" to 0 whenever
+ * a nonzero value comes back.
+ *
+ * This routine may sleep on &proc[2] while waiting for a swap
+ * header; when it wakes up it re-examines the same frame from
+ * the label "top".
+ */
+checkpage(hand, whichhand)
+       int hand, whichhand;
+{
+       register struct proc *rp;
+       register struct text *xp;
+       register struct cmap *c;
+       register struct pte *pte;
+       swblk_t daddr;
+       unsigned v;
+       int klsize;
+
+top:
+       /*
+        * Find a process and text pointer for the
+        * page, and a virtual page number in either the
+        * process or the text image.
+        *
+        * Locked, free and CSYS frames are skipped (return 0).
+        * For a text page the process used is the text's x_caddr.
+        * For a data or stack page the owner is proc[c_ndx]; while
+        * that process has SNOVM set, p_xlink is followed until a
+        * process without SNOVM is found.
+        *
+        * NOTE(review): the switch has no default case, so rp, xp,
+        * v and pte stay unset for any other c_type; this presumably
+        * relies on c_type only ever being one of the four values
+        * handled here -- confirm.
+        */
+       c = &cmap[hand];
+       if (c->c_lock || c->c_free)
+               return (0);
+       switch (c->c_type) {
+
+       case CSYS:
+               return (0);
+
+       case CTEXT:
+               xp = &text[c->c_ndx];
+               rp = xp->x_caddr;
+               v = tptov(rp, c->c_page);
+               pte = tptopte(rp, c->c_page);
+               break;
+
+       case CDATA:
+       case CSTACK:
+               rp = &proc[c->c_ndx];
+               while (rp->p_flag & SNOVM)
+                       rp = rp->p_xlink;
+               xp = rp->p_textp;
+               if (c->c_type == CDATA) {
+                       v = dptov(rp, c->c_page);
+                       pte = dptopte(rp, c->c_page);
+               } else {
+                       v = sptov(rp, c->c_page);
+                       pte = sptopte(rp, c->c_page);
+               }
+               break;
+       }
+
+       if (pte->pg_pfnum != cmtopg(hand))
+               panic("bad c_page");
+
+       /*
+        * If page is valid, make invalid but reclaimable.
+        * If this pte is not valid, then it must be reclaimable
+        * and we can add it to the free list.
+        *
+        * The back hand never touches a valid page: it returns 0.
+        * The front hand clears pg_v, sets pg_m on this pte when
+        * anycl(pte, pg_m) is true, and propagates the pte with
+        * distcl() (and distpte() for text).  Having invalidated
+        * the page it returns 0, unless the process has SSEQL or
+        * SUANOM set or its p_rssize exceeds p_maxrss, in which
+        * case it falls through and tries to take the page now.
+        */
+       if (pte->pg_v) {
+               if (whichhand == BACK)
+                       return(0);
+               pte->pg_v = 0;
+               if (anycl(pte, pg_m))
+                       pte->pg_m = 1;
+               distcl(pte);
+               if (c->c_type == CTEXT)
+                       distpte(xp, (unsigned)vtotp(rp, v), pte);
+               if ((rp->p_flag & (SSEQL|SUANOM)) == 0 &&
+                   rp->p_rssize <= rp->p_maxrss)
+                       return (0);
+       }
+       if (c->c_type != CTEXT) {
+               /*
+                * Guarantee a minimal investment in data
+                * space for jobs in balance set.
+                * The threshold is saferss reduced by the
+                * process's p_slptime; below it the page is
+                * left alone.
+                */
+               if (rp->p_rssize < saferss - rp->p_slptime)
+                       return (0);
+       }
+
+       /*
+        * If the page is currently dirty, we
+        * have to arrange to have it cleaned before it
+        * can be freed.  We mark it clean immediately.
+        * If it is reclaimed while being pushed, then modified
+        * again, we are assured of the correct order of
+        * writes because we lock the page during the write.
+        * This guarantees that a swap() of this process (and
+        * thus this page), initiated in parallel, will,
+        * in fact, push the page after us.
+        *
+        * The most general worst case here would be for
+        * a reclaim, a modify and a swapout to occur
+        * all before the single page transfer completes.
+        */
+       if (dirtycl(pte)) {
+               /*
+                * If the process is being swapped out
+                * or about to exit, do not bother with its
+                * dirty pages
+                */
+               if (rp->p_flag & (SLOCK|SWEXIT))
+                       return (0);
+               /*
+                * Limit pushes to avoid saturating
+                * pageout device.
+                * "pushes" is only incremented here; it is
+                * presumably reset elsewhere once per scheduling
+                * interval -- confirm against the pageout loop.
+                */
+               if (pushes > maxpgio / RATETOSCHEDPAGING)
+                       return (0);
+               pushes++;
+
+               /*
+                * Now carefully make sure that there will
+                * be a header available for the push so that
+                * we will not block waiting for a header in
+                * swap().  The reason this is important is
+                * that we (proc[2]) are the one who cleans
+                * dirty swap headers and we could otherwise
+                * deadlock waiting for ourselves to clean
+                * swap headers.  The sleep here on &proc[2]
+                * is actually (effectively) a sleep on both
+                * ourselves and &bswlist, and this is known
+                * to swdone and swap in vm_swp.c.  That is,
+                * &proc[2] will be awakened both when dirty
+                * headers show up and also to get the pageout
+                * daemon moving.
+                *
+                * bclnlist and bswlist are examined at splbio;
+                * the priority is dropped with spl0() before
+                * cleanup() is called and before continuing.
+                */
+loop2:
+               (void) splbio();
+               if (bclnlist != NULL) {
+                       (void) spl0();
+                       cleanup();
+                       goto loop2;
+               }
+               if (bswlist.av_forw == NULL) {
+                       bswlist.b_flags |= B_WANTED;
+                       sleep((caddr_t)&proc[2], PSWP+2);
+                       (void) spl0();
+                       /*
+                        * Page disposition may have changed
+                        * since process may have exec'ed,
+                        * forked, exited or just about
+                        * anything else... try this page
+                        * frame again, from the top.
+                        */
+                       goto top;
+               }
+               (void) spl0();
+
+               MLOCK(c);
+               uaccess(rp, Pushmap, &pushutl);
+               /*
+                * Now committed to pushing the page...
+                * Clear the modified bit, count the pageout in
+                * progress against the text (x_poip) or the
+                * process (p_poip), let kluster() choose the run
+                * of pages to write (it updates v and klsize),
+                * and hand the transfer to swap().
+                */
+               pte->pg_m = 0;
+               distcl(pte);
+               if (c->c_type == CTEXT)  {
+                       xp->x_poip++;
+                       distpte(xp, (unsigned)vtotp(rp, v), pte);
+               } else
+                       rp->p_poip++;
+               v = kluster(rp, v, pte, B_WRITE, &klsize, klout, (daddr_t)0);
+               if (klsize == 0)
+                       panic("pageout klsize");
+               daddr = vtod(rp, v, &pushutl.u_dmap, &pushutl.u_smap);
+               (void)swap(rp, daddr, ptob(v), klsize * ctob(CLSIZE),
+                   B_WRITE, B_DIRTY, swapdev, pte->pg_pfnum);
+               /*
+                * The cleaning of this page will be
+                * completed later, in cleanup() called
+                * (synchronously) by us (proc[2]).  In
+                * the meantime, the page frame is locked
+                * so no havoc can result.
+                */
+               return (1);     /* well, it'll be free soon */
+
+       }
+       /*
+        * Decrement the resident set size of the current
+        * text object/process, and put the page in the
+        * free list. Note that we don't give memfree the
+        * pte as its argument, since we don't want to destroy
+        * the pte.  If it hasn't already been discarded
+        * it may yet have a chance to be reclaimed from
+        * the free list.
+        *
+        * The else below pairs with the inner if (the c_type
+        * test), so no resident set size is decremented when
+        * c_gone is nonzero.
+        *
+        * NOTE(review): the call below does pass pte as the first
+        * argument; the third argument of 0 is presumably what
+        * keeps memfree from destroying the pte -- confirm
+        * against memfree().
+        */
+       if (c->c_gone == 0)
+               if (c->c_type == CTEXT)
+                       xp->x_rssize -= CLSIZE;
+               else
+                       rp->p_rssize -= CLSIZE;
+       memfree(pte, CLSIZE, 0);
+       cnt.v_dfree += CLSIZE;
+       return (1);             /* freed a page! */
+}
+
 /*
  * Process the ``cleaned'' list.
  *
 /*
  * Process the ``cleaned'' list.
  *
@@ -813,7 +972,7 @@ cleanup()
        int s, center;
 
        for (;;) {
        int s, center;
 
        for (;;) {
-               s = spl6();
+               s = splbio();
                if ((bp = bclnlist) == 0)
                        break;
                bclnlist = bp->av_forw;
                if ((bp = bclnlist) == 0)
                        break;
                bclnlist = bp->av_forw;
@@ -822,8 +981,8 @@ cleanup()
                center = 0;
                for (i = 0; i < bp->b_bcount; i += CLSIZE * NBPG) {
                        pf = pte->pg_pfnum;
                center = 0;
                for (i = 0; i < bp->b_bcount; i += CLSIZE * NBPG) {
                        pf = pte->pg_pfnum;
-                       munlock(pf);
                        c = &cmap[pgtocm(pf)];
                        c = &cmap[pgtocm(pf)];
+                       MUNLOCK(c);
                        if (pf != bp->b_pfcent) {
                                if (c->c_gone) {
                                        memfree(pte, CLSIZE, 0);
                        if (pf != bp->b_pfcent) {
                                if (c->c_gone) {
                                        memfree(pte, CLSIZE, 0);
@@ -911,16 +1070,19 @@ kluster(p, v, pte0, rw, pkl, klsize, bn0)
        register struct proc *p;
        unsigned v;
        struct pte *pte0;
        register struct proc *p;
        unsigned v;
        struct pte *pte0;
-       int rw, *pkl, klsize;
+       int rw;
+       register int *pkl;
+       int klsize;
        daddr_t bn0;
 {
        int type, cl, clmax;
        int kloff, k, klmax;
        register struct pte *pte;
        int klback, klforw;
        daddr_t bn0;
 {
        int type, cl, clmax;
        int kloff, k, klmax;
        register struct pte *pte;
        int klback, klforw;
-       register int i;
+       int i;
        unsigned v0;
        daddr_t bn;
        unsigned v0;
        daddr_t bn;
+       register struct cmap *c;
 
        if (rw == B_READ)
                klicnt[0]++;
 
        if (rw == B_READ)
                klicnt[0]++;
@@ -953,7 +1115,7 @@ kluster(p, v, pte0, rw, pkl, klsize, bn0)
                else
                        pte -= CLSIZE;
                if (type == CTEXT && rw == B_READ && bn) {
                else
                        pte -= CLSIZE;
                if (type == CTEXT && rw == B_READ && bn) {
-                       bn -= CLBYTES / DEV_BSIZE;
+                       bn -= btodb(CLBYTES);
                        if (mfind(swapdev, bn))
                                break;
                }
                        if (mfind(swapdev, bn))
                                break;
                }
@@ -973,7 +1135,7 @@ kluster(p, v, pte0, rw, pkl, klsize, bn0)
                else
                        pte += CLSIZE;
                if (type == CTEXT && rw == B_READ && bn) {
                else
                        pte += CLSIZE;
                if (type == CTEXT && rw == B_READ && bn) {
-                       bn += (CLBYTES / DEV_BSIZE);
+                       bn += btodb(CLBYTES);
                        if (mfind(swapdev, bn))
                                break;
                }
                        if (mfind(swapdev, bn))
                                break;
                }
@@ -1001,25 +1163,24 @@ kluster(p, v, pte0, rw, pkl, klsize, bn0)
                if (pte == pte0)
                        goto cont;
                if (rw == B_WRITE) {
                if (pte == pte0)
                        goto cont;
                if (rw == B_WRITE) {
-                       mlock(pte->pg_pfnum);
+                       c = &cmap[pgtocm(pte->pg_pfnum)];
+                       MLOCK(c);
                        pte->pg_m = 0;
                        distcl(pte);
                        if (type == CTEXT)
                        pte->pg_m = 0;
                        distcl(pte);
                        if (type == CTEXT)
-                               distpte(p->p_textp, vtotp(p, v), pte);
+                               distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
                } else {
                        struct pte opte;
                } else {
                        struct pte opte;
-                       int pf;
 
                        opte = *pte;
                        if (memall(pte, CLSIZE, p, type) == 0)
                                panic("kluster");
                        pte->pg_prot = opte.pg_prot;
 
                        opte = *pte;
                        if (memall(pte, CLSIZE, p, type) == 0)
                                panic("kluster");
                        pte->pg_prot = opte.pg_prot;
-                       pf = pte->pg_pfnum;
-                       cmap[pgtocm(pf)].c_intrans = 1;
+                       cmap[pgtocm(pte->pg_pfnum)].c_intrans = 1;
                        distcl(pte);
                        if (type == CTEXT) {
                                p->p_textp->x_rssize += CLSIZE;
                        distcl(pte);
                        if (type == CTEXT) {
                                p->p_textp->x_rssize += CLSIZE;
-                               distpte(p->p_textp, vtotp(p, v), pte);
+                               distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
                        } else
                                p->p_rssize += CLSIZE;
                        distcl(pte);
                        } else
                                p->p_rssize += CLSIZE;
                        distcl(pte);
@@ -1056,3 +1217,132 @@ klok(pte, rw)
                return (1);
        }
 }
                return (1);
        }
 }
+
+/*
+ * Fodkluster locates pages adjacent to the argument pages
+ * that are immediately available to include in the pagein,
+ * and given the availability of memory includes them.
+ * It wants to page in a file system block if it can.
+ *
+ * Arguments:
+ *	p	process on whose behalf the pagein is being done.
+ *	v0	virtual page number of the page being brought in.
+ *	pte0	pte for v0.
+ *	pkl	out: number of clusters in the resulting kluster.
+ *	dev	device checked with mfind() for text pages.
+ *	pbn	in/out: on entry the block number for v0; when a
+ *		kluster is formed it is moved back to the block of
+ *		the first page of the kluster.
+ *
+ * Returns the virtual page number of the first page of the
+ * kluster, which is v0 itself when no kluster is formed.
+ *
+ * Outline:
+ *   1. If nofodklust is set, return v0 at once.  Otherwise count
+ *	the call in fodklcnt[0] and set *pkl to 1; if freemem is
+ *	below KLMAX, return v0 without klustering.
+ *   2. Choose CTEXT or CDATA by isatsv(p, v0) and compute the
+ *	bounds vmin/vmax from p_tsize or p_dsize.
+ *   3. Walk backward from v0, then forward from v0, one cluster
+ *	at a time, until klsize reaches KLMAX.  A neighbour is
+ *	accepted only while it is within vmin..vmax, its pte has
+ *	pg_fod set, and its pg_blkno is the adjacent block.  For
+ *	text, the walk also stops if mfind(dev, bn) finds the
+ *	block, and any entry found for the matching swap block
+ *	is removed with munhash().  klback records how many
+ *	clusters were gained going backward.
+ *   4. If nothing was gained (klsize == 1) return v0.  Otherwise
+ *	move v0 and *pbn back by klback clusters, store klsize in
+ *	*pkl, and move the statistic from fodklcnt[0] to
+ *	fodklcnt[klsize - 1].
+ *   5. For every page of the kluster except pte0, allocate
+ *	memory with memall() (panic on failure), restore the
+ *	original pg_prot, set pg_m, mark the cmap entry
+ *	c_intrans, add CLSIZE to the text or process resident
+ *	set size, and distribute the pte.
+ *
+ * NOTE(review): when nofodklust is set the function returns
+ * before anything is stored through pkl, so the caller
+ * presumably initialises its kluster size itself -- confirm.
+ *
+ * NOTE(review): v and vmin are unsigned, and the backward walk
+ * subtracts CLSIZE from v before testing v < vmin.  If v were
+ * ever smaller than CLSIZE the subtraction would wrap and the
+ * test would not stop the walk; presumably the pg_fod/pg_blkno
+ * checks or the address layout make this harmless -- confirm.
+ */
+int nofodklust;
+int fodklcnt[KLMAX];
+
+fodkluster(p, v0, pte0, pkl, dev, pbn)
+       register struct proc *p;
+       unsigned v0;
+       struct pte *pte0;
+       int *pkl;
+       dev_t dev;
+       daddr_t *pbn;
+{
+       register struct pte *pte;
+       register struct fpte *fpte;
+       struct cmap *c;
+       register daddr_t bn;
+       daddr_t bnswap;
+       unsigned v, vmin, vmax;
+       register int klsize;
+       int klback, type, i;
+
+       if (nofodklust)
+               return (v0);
+       fodklcnt[0]++;
+       *pkl = 1;
+       if (freemem < KLMAX)
+               return (v0);
+       if (isatsv(p, v0)) {
+               type = CTEXT;
+               vmin = tptov(p, 0);
+               vmax = tptov(p, clrnd(p->p_tsize) - CLSIZE);
+       } else {
+               type = CDATA;
+               vmin = dptov(p, 0);
+               vmax = dptov(p, clrnd(p->p_dsize) - CLSIZE);
+       }
+       fpte = (struct fpte *)pte0;
+       bn = *pbn;
+       v = v0;
+       for (klsize = 1; klsize < KLMAX; klsize++) {
+               v -= CLSIZE;
+               if (v < vmin)
+                       break;
+               fpte -= CLSIZE;
+               if (fpte->pg_fod == 0)
+                       break;
+               bn -= btodb(CLBYTES);
+               if (fpte->pg_blkno != bn)
+                       break;
+               if (type == CTEXT) {
+                       if (mfind(dev, bn))
+                               break;
+                       /*
+                        * Flush any previous text page use of this
+                        * swap device block.
+                        */
+                       bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
+                       c = mfind(swapdev, bnswap);
+                       if (c)
+                               munhash(swapdev, bnswap);
+               }
+       }
+       klback = klsize - 1;
+       fpte = (struct fpte *)pte0;
+       bn = *pbn;
+       v = v0;
+       for (; klsize < KLMAX; klsize++) {
+               v += CLSIZE;
+               if (v > vmax)
+                       break;
+               fpte += CLSIZE;
+               if (fpte->pg_fod == 0)
+                       break;
+               bn += btodb(CLBYTES);
+               if (fpte->pg_blkno != bn)
+                       break;
+               if (type == CTEXT) {
+                       if (mfind(dev, bn))
+                               break;
+                       /*
+                        * Flush any previous text page use of this
+                        * swap device block.
+                        */
+                       bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
+                       c = mfind(swapdev, bnswap);
+                       if (c)
+                               munhash(swapdev, bnswap);
+               }
+       }
+       if (klsize == 1)
+               return (v0);
+       pte = pte0;
+       pte -= klback * CLSIZE;
+       v0 -= klback * CLSIZE;
+       *pbn -= klback * btodb(CLBYTES);
+       *pkl = klsize;
+       fodklcnt[0]--; fodklcnt[klsize - 1]++;
+       v = v0;
+       for (i = 0; i < klsize; i++) {
+               if (pte != pte0) {
+                       struct pte opte;
+                       int pf;
+
+                       opte = *pte;
+                       if (memall(pte, CLSIZE, p, type) == 0)
+                               panic("fodkluster");
+                       pte->pg_prot = opte.pg_prot;
+                       pf = pte->pg_pfnum;
+                       pte->pg_m = 1;
+                       cmap[pgtocm(pf)].c_intrans = 1;
+                       distcl(pte);
+                       if (type == CTEXT) {
+                               p->p_textp->x_rssize += CLSIZE;
+                               distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
+                       } else
+                               p->p_rssize += CLSIZE;
+                       distcl(pte);
+               }
+               pte += CLSIZE;
+               v += CLSIZE;
+       }
+       return (v0);
+}