/*
* Copyright (c) 1982, 1986, 1989 Regents of the University of California.
* All rights reserved.
*
* Redistribution is only permitted until one year after the first shipment
* of 4.4BSD by the Regents. Otherwise, redistribution and use in source and
* binary forms are permitted provided that: (1) source distributions retain
* this entire copyright notice and comment, and (2) distributions including
* binaries display the following acknowledgement: ``This product includes
* software developed by the University of California, Berkeley and its
* contributors'' in the documentation or other materials provided with the
* distribution and in all advertising materials mentioning features or use
* of this software. Neither the name of the University nor the names of
* its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*
* @(#)vm_page.c 7.14 (Berkeley) 6/28/90
*/
#include "param.h"
#include "systm.h"
#include "user.h"
#include "proc.h"
#include "buf.h"
#include "text.h"
#include "vnode.h"
#include "cmap.h"
#include "vm.h"
#include "trace.h"
#include "file.h"
#include "machine/cpu.h"
#include "machine/pte.h"
#include "machine/mtpr.h"
#if defined(tahoe)
#if CLSIZE == 1
#define uncachecl(pte) uncache(pte)
#endif
#if CLSIZE == 2
#define uncachecl(pte) uncache(pte), uncache((pte)+1)
#endif
#if CLSIZE > 2
#define uncachecl(pte) { \
register ii; \
for (ii = 0; ii < CLSIZE; ii++) \
uncache((pte) + ii); \
}
#endif
#else /* tahoe */
#define uncache(pte) /* nothing */
#define uncachecl(pte) /* nothing */
#endif
int nohash = 0;
/*
* Handle a page fault.
*
* Basic outline
* If page is allocated, but just not valid:
* Wait if intransit, else just revalidate
* Done
* Compute <vp,bn> from which page operation would take place
* If page is text page, and filling from file system or swap space:
* If in free list cache, reattach it and then done
* Allocate memory for page in
* If we block here, restart because we could have swapped, etc.
* Lock process from swapping for duration
* Update pte's to reflect that page is intransit.
* If page is zero fill on demand:
* Clear pages and flush free list cache of stale caching
* for this swap page (e.g. before initializing again due
* to 407/410 exec).
* If page is fill from file and in buffer cache:
* Copy the page from the buffer cache.
* If not a fill on demand:
* Determine swap address and cluster to page in
* Do the swap to bring the page in
* Instrument the pagein
* After swap validate the required new page
* Leave prepaged pages reclaimable (not valid)
* Update shared copies of text page tables
* Complete bookkeeping on pages brought in:
* No longer intransit
* Hash text pages into core hash structure
* Unlock pages (modulo raw i/o requirements)
* Flush translation buffer
* Process pagein is done
*/
#ifdef TRACE
#define pgtrace(e) trace(e,v,u.u_procp->p_pid)
#else
#define pgtrace(e)
#endif
int preptofree = 1; /* send pre-paged pages to free list */
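/*
 * virtaddr is the faulting virtual address.  If dlyu is nonzero,
 * the core map entry for the faulted page is left locked on return
 * (as when the page is being held for raw i/o); otherwise it is
 * unlocked along with the rest of the kluster.
 */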
pagein(virtaddr, dlyu)
unsigned virtaddr;
int dlyu;
{
register struct proc *p;
register struct pte *pte;
register unsigned v;
unsigned pf;
int type, fileno;
struct pte opte;
struct vnode *vp;
register int i;
int klsize;
unsigned vsave;
struct cmap *c;
int j;
daddr_t bn, bncache, bnswap;
int si, sk;
int swerror = 0;
#ifdef PGINPROF
int otime, olbolt, oicr, s;
long a;
s = splclock();
otime = time, olbolt = lbolt, oicr = mfpr(ICR);
#endif
cnt.v_faults++;
/*
* Classify faulted page into a segment and get a pte
* for the faulted page.
*/
vsave = v = clbase(btop(virtaddr));
p = u.u_procp;
if (isatsv(p, v))
type = CTEXT;
else if (isassv(p, v))
type = CSTACK;
else
type = CDATA;
pte = vtopte(p, v);
if (pte->pg_v) {
#ifdef MAPMEM
/* will this ever happen? */
if (pte->pg_fod) {
#ifdef PGINPROF
splx(s);
#endif
return;
}
#endif
panic("pagein");
}
/*
* If page is reclaimable, reclaim it.
* If page is text and intransit, sleep while it is intransit.
* If it is valid after the sleep, we are done.
* Otherwise we have to start checking again, since page could
* even be reclaimable now (we may have swapped for a long time).
*/
restart:
if (pte->pg_fod == 0 && pte->pg_pfnum) {
if (type == CTEXT && cmap[pgtocm(pte->pg_pfnum)].c_intrans) {
pgtrace(TR_INTRANS);
sleep((caddr_t)p->p_textp, PSWP+1);
pgtrace(TR_EINTRANS);
pte = vtopte(p, v);
if (pte->pg_v) {
valid:
if (dlyu) {
c = &cmap[pgtocm(pte->pg_pfnum)];
if (c->c_lock) {
c->c_want = 1;
sleep((caddr_t)c, PSWP+1);
goto restart;
}
c->c_lock = 1;
}
newptes(pte, v, CLSIZE);
cnt.v_intrans++;
#ifdef PGINPROF
splx(s);
#endif
return;
}
goto restart;
}
/*
* If page is in the free list, then take
* it back into the resident set, updating
* the size recorded for the resident set.
*/
si = splimp();
c = &cmap[pgtocm(pte->pg_pfnum)];
if (c->c_free) {
pgtrace(TR_FRECLAIM);
munlink(c);
cnt.v_pgfrec++;
if (type == CTEXT)
p->p_textp->x_rssize += CLSIZE;
else
p->p_rssize += CLSIZE;
} else
pgtrace(TR_RECLAIM);
splx(si);
uncachecl(pte);
pte->pg_v = 1;
if (anycl(pte, pg_m))
pte->pg_m = 1;
distcl(pte);
if (type == CTEXT)
distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
u.u_ru.ru_minflt++;
cnt.v_pgrec++;
if (dlyu) {
c = &cmap[pgtocm(pte->pg_pfnum)];
if (c->c_lock) {
c->c_want = 1;
sleep((caddr_t)c, PSWP+1);
goto restart;
}
c->c_lock = 1;
}
newptes(pte, v, CLSIZE);
#ifdef PGINPROF
a = vmtime(otime, olbolt, oicr);
rectime += a;
if (a >= 0)
vmfltmon(rmon, a, rmonmin, rres, NRMON);
splx(s);
#endif
return;
}
#ifdef PGINPROF
splx(s);
#endif
/*
* <vp,bn> is where data comes from/goes to.
* <vp,bncache> is where data is cached from/to.
* <swapdev_vp,bnswap> is where data will eventually go.
*/
if (pte->pg_fod == 0) {
fileno = -1;
bnswap = bncache = bn = vtod(p, v, &u.u_dmap, &u.u_smap);
vp = swapdev_vp;
} else {
fileno = ((struct fpte *)pte)->pg_fileno;
bn = ((struct fpte *)pte)->pg_blkno;
bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
if (fileno == PG_FTEXT) {
if (p->p_textp == 0)
panic("pagein PG_FTEXT");
if (VOP_BMAP(p->p_textp->x_vptr, (daddr_t)0, &vp,
(daddr_t *)0)) {
swkill(p, "pagein: filesystem unmounted");
return;
}
bncache = bn;
} else if (fileno == PG_FZERO) {
vp = swapdev_vp;
bncache = bnswap;
}
}
klsize = 1;
opte = *pte;
/*
* Check for text detached but in free list.
* This can happen only if the page is filling
* from an inode or from the swap device (e.g. not when reading
* in 407/410 execs to a zero fill page).
* Honor lock bit to avoid races with pageouts.
*/
if (type == CTEXT && fileno != PG_FZERO && !nohash) {
si = splimp();
while ((c = mfind(vp, bncache)) != 0) {
if (c->c_lock == 0)
break;
MLOCK(c);
MUNLOCK(c);
}
if (c) {
if (c->c_type != CTEXT || c->c_gone == 0 ||
c->c_free == 0)
panic("pagein mfind");
p->p_textp->x_rssize += CLSIZE;
/*
* Following code mimics memall().
*/
munlink(c);
pf = cmtopg(c - cmap);
for (j = 0; j < CLSIZE; j++) {
*(int *)pte = 0;
pte->pg_pfnum = pf++;
pte->pg_prot = opte.pg_prot;
pte++;
}
pte -= CLSIZE;
c->c_free = 0;
c->c_gone = 0;
if (c->c_intrans || c->c_want)
panic("pagein intrans|want");
c->c_lock = 1;
if (c->c_page != vtotp(p, v))
panic("pagein c_page chgd");
c->c_ndx = p->p_textp - &text[0];
if (vp == swapdev_vp) {
cnt.v_xsfrec++;
pgtrace(TR_XSFREC);
} else {
cnt.v_xifrec++;
pgtrace(TR_XIFREC);
}
cnt.v_pgrec++;
u.u_ru.ru_minflt++;
if (vp != swapdev_vp) {
munhash(swapdev_vp, bnswap);
pte->pg_m = 1;
}
splx(si);
goto skipswap;
}
splx(si);
}
/*
* Wasn't reclaimable or reattachable.
* Have to prepare to bring the page in.
* We allocate the page before locking so we will
* be swappable if there is no free memory.
* If we block we have to start over, since anything
* could have happened.
*/
sk = splimp(); /* lock memalls from here into kluster */
if (freemem < CLSIZE * KLMAX) {
pgtrace(TR_WAITMEM);
while (freemem < CLSIZE * KLMAX)
sleep((caddr_t)&freemem, PSWP+2);
pgtrace(TR_EWAITMEM);
splx(sk);
pte = vtopte(p, v);
#ifdef PGINPROF
s = splclock();
#endif
if (pte->pg_v)
goto valid;
goto restart;
}
/*
* Now we can get memory and are committed to bringing in the page.
* Lock this process, get a page,
* construct the new pte, and increment
* the (process or text) resident set size.
*/
p->p_flag |= SPAGE;
if (memall(pte, CLSIZE, p, type) == 0)
panic("pagein memall");
pte->pg_prot = opte.pg_prot;
pf = pte->pg_pfnum;
cmap[pgtocm(pf)].c_intrans = 1;
distcl(pte);
if (type == CTEXT) {
p->p_textp->x_rssize += CLSIZE;
distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
} else
p->p_rssize += CLSIZE;
/*
* Two cases: either fill on demand (zero, or from file or text)
* or from swap space.
*/
if (opte.pg_fod) {
pte->pg_m = 1;
if (fileno == PG_FZERO || fileno == PG_FTEXT) {
/*
* Flush any previous text page use of this
* swap device block.
*/
if (type == CTEXT)
munhash(swapdev_vp, bnswap);
/*
* If zero fill, short-circuit hard work
* by just clearing pages.
*/
if (fileno == PG_FZERO) {
pgtrace(TR_ZFOD);
for (i = 0; i < CLSIZE; i++) {
clearseg(pf+i);
#if defined(tahoe)
mtpr(P1DC, (int)virtaddr + i * NBPG);
#endif
}
if (type != CTEXT)
cnt.v_zfod += CLSIZE;
splx(sk);
goto skipswap;
}
pgtrace(TR_EXFOD);
cnt.v_exfod += CLSIZE;
}
/*
* Fill from inode. Try to find adjacent
* pages to bring in also.
*/
v = fodkluster(p, v, pte, &klsize, vp, &bn);
bncache = bn;
splx(sk);
#ifdef TRACE
if (type != CTEXT)
trace(TR_XFODMISS, vp, bn);
#endif
} else {
if (opte.pg_pfnum)
panic("pagein pfnum");
pgtrace(TR_SWAPIN);
/*
* Fill from swap area. Try to find adjacent
* pages to bring in also.
*/
v = kluster(p, v, pte, B_READ, &klsize,
(type == CTEXT) ? kltxt :
((p->p_flag & SSEQL) ? klseql : klin), bn);
splx(sk);
/* THIS COULD BE COMPUTED INCREMENTALLY... */
bncache = bn = vtod(p, v, &u.u_dmap, &u.u_smap);
}
distcl(pte);
swerror = swap(p, bn, ptob(v), klsize * ctob(CLSIZE),
B_READ, B_PGIN, vp, 0);
#ifdef TRACE
trace(TR_PGINDONE, vsave, u.u_procp->p_pid);
#endif
/*
* Instrumentation.
*/
u.u_ru.ru_majflt++;
cnt.v_pgin++;
cnt.v_pgpgin += klsize * CLSIZE;
#ifdef PGINPROF
a = vmtime(otime, olbolt, oicr) / 100;
pgintime += a;
if (a >= 0)
vmfltmon(pmon, a, pmonmin, pres, NPMON);
#endif
skipswap:
/*
* Fix page table entries.
*
* Only the page that was faulted on is validated; the rest of the pages
* can be ``reclaimed''. This allows the system to reclaim prepaged pages
* quickly if they are not used and memory is tight.
*/
pte = vtopte(p, vsave);
pte->pg_v = 1;
#ifdef REFBIT
/*
* Start with the page used so that pageout doesn't free it
* immediately.
*/
pte->pg_u = 1;
#endif
distcl(pte);
if (type == CTEXT) {
if (swerror) {
xinval(p->p_textp->x_vptr);
} else {
distpte(p->p_textp, (unsigned)vtotp(p, vsave), pte);
if (opte.pg_fod)
p->p_textp->x_flag |= XWRIT;
}
wakeup((caddr_t)p->p_textp);
}
/*
* Memall returned page(s) locked. Unlock all
* pages in cluster. If locking pages for raw i/o,
* leave the page which was required to be paged in locked,
* but still unlock the others.
* If text pages, hash into the cmap situation table.
*/
pte = vtopte(p, v);
for (i = 0; i < klsize; i++) {
c = &cmap[pgtocm(pte->pg_pfnum)];
c->c_intrans = 0;
if (type == CTEXT && c->c_blkno == 0 && bncache && !nohash &&
!swerror) {
mhash(c, vp, bncache);
bncache += btodb(CLBYTES);
}
if (v != vsave || !dlyu)
MUNLOCK(c);
if (v != vsave && type != CTEXT && preptofree &&
opte.pg_fod == 0) {
/*
* Throw pre-paged data/stack pages at the
* bottom of the free list; leave pg_u clear.
*/
p->p_rssize -= CLSIZE;
memfree(pte, CLSIZE, 0);
}
#ifdef REFBIT
/*
* Text pages paged-in and allocated during the kluster
* must be validated, as they are now in the resident set.
*/
if (v != vsave && type == CTEXT) {
pte->pg_v = 1;
distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
}
#endif
newptes(pte, v, CLSIZE);
v += CLSIZE;
pte += CLSIZE;
}
/*
* All done.
*/
p->p_flag &= ~SPAGE;
/*
* If process is declared fifo, memory is tight,
* and this was a data page-in, free memory
* klsdist pagein clusters away from the current fault.
*/
if ((p->p_flag&SSEQL) && freemem < lotsfree && type == CDATA) {
int k = (vtodp(p, vsave) / CLSIZE) / klseql;
#ifdef notdef
if (vsave > u.u_vsave)
k -= klsdist;
else
k += klsdist;
dpageout(p, k * klseql * CLSIZE, klout*CLSIZE);
u.u_vsave = vsave;
#else
dpageout(p, (k - klsdist) * klseql * CLSIZE, klout*CLSIZE);
dpageout(p, (k + klsdist) * klseql * CLSIZE, klout*CLSIZE);
#endif
}
}
/*
* Take away n pages of data space
* starting at data page dp.
* Used to take pages away from sequential processes.
* Mimics pieces of code in pageout() below.
*/
dpageout(p, dp, n)
struct proc *p;
int dp, n;
{
register struct cmap *c;
int i, klsize;
register struct pte *pte;
unsigned v;
daddr_t daddr;
if (dp < 0) {
n += dp;
dp = 0;
}
if (dp + n > p->p_dsize)
n = p->p_dsize - dp;
for (i = 0; i < n; i += CLSIZE, dp += CLSIZE) {
pte = dptopte(p, dp);
if (pte->pg_fod || pte->pg_pfnum == 0)
continue;
c = &cmap[pgtocm(pte->pg_pfnum)];
if (c->c_lock || c->c_free)
continue;
uncachecl(pte);
if (pte->pg_v) {
pte->pg_v = 0;
if (anycl(pte, pg_m))
pte->pg_m = 1;
distcl(pte);
p->p_flag |= SPTECHG;
}
if (dirtycl(pte)) {
if (bswlist.av_forw == NULL)
continue;
MLOCK(c);
pte->pg_m = 0;
distcl(pte);
p->p_poip++;
v = kluster(p, dptov(p, dp), pte, B_WRITE,
&klsize, klout, (daddr_t)0);
/* THIS ASSUMES THAT p == u.u_procp */
daddr = vtod(p, v, &u.u_dmap, &u.u_smap);
(void) swap(p, daddr, ptob(v), klsize * ctob(CLSIZE),
B_WRITE, B_DIRTY, swapdev_vp, pte->pg_pfnum);
} else {
if (c->c_gone == 0)
p->p_rssize -= CLSIZE;
memfree(pte, CLSIZE, 0);
cnt.v_seqfree += CLSIZE;
}
}
}
unsigned maxdmap;
unsigned maxtsize;
/*
* Set up the paging constants for the clock algorithm.
* Called after the system is initialized and the amount of memory
* and the number of paging devices are known.
*
* Threshold constants are defined in machine/vmparam.h.
*/
vminit()
{
/*
* Lotsfree is the threshold at which the paging daemon turns on.
*/
if (lotsfree == 0) {
lotsfree = LOTSFREE / NBPG;
if (lotsfree > LOOPPAGES / LOTSFREEFRACT)
lotsfree = LOOPPAGES / LOTSFREEFRACT;
}
/*
* Desfree is the amount of memory desired free.
* If free memory stays below this for an extended period, do swapping.
*/
if (desfree == 0) {
desfree = DESFREE / NBPG;
if (desfree > LOOPPAGES / DESFREEFRACT)
desfree = LOOPPAGES / DESFREEFRACT;
}
/*
* Minfree is the minimal amount of free memory that is tolerable.
*/
if (minfree == 0) {
minfree = MINFREE / NBPG;
if (minfree > desfree / MINFREEFRACT)
minfree = desfree / MINFREEFRACT;
}
/*
* Maxpgio bounds how much paging i/o is acceptable.
* This figures that keeping a disk arm 2/3 busy is all that is
* tolerable for paging. We assume one operation per disk revolution.
*/
if (maxpgio == 0)
maxpgio = (DISKRPM * 2) / 3;
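/*
 * A sketch of the arithmetic with a purely illustrative value:
 * assuming DISKRPM == 60 (a 3600 rpm drive expressed in revolutions
 * per second), this default comes to maxpgio = (60 * 2) / 3 == 40
 * pageout operations per second.
 */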
/*
* Clock to scan using a max of ~10% of processor time for sampling;
* this is estimated to allow a maximum of 200 samples per second.
* This yields a ``fastscan'' of roughly (with CLSIZE=2):
* memory:    <=1m    2m    3m    4m    8m
* fastscan:    5s   10s   15s   20s   40s
*/
if (fastscan == 0)
fastscan = 200;
if (fastscan > LOOPPAGES / 5)
fastscan = LOOPPAGES / 5;
/*
* Set slow scan time to 1/2 the fast scan time.
*/
if (slowscan == 0)
slowscan = fastscan / 2;
/*
* Calculate the swap allocation constants.
*/
if (dmmin == 0) {
dmmin = DMMIN;
if (dmmin < CLBYTES/DEV_BSIZE)
dmmin = CLBYTES/DEV_BSIZE;
}
if (dmmax == 0) {
dmmax = DMMAX;
while (dmapsize(dmmin, dmmax / 2) >= MAXDSIZ && dmmax > dmmin)
dmmax /= 2;
}
maxdmap = dmapsize(dmmin, dmmax);
if (dmtext == 0)
dmtext = DMTEXT;
if (dmtext > dmmax)
dmtext = dmmax;
if (maxtsize == 0)
maxtsize = MAXTSIZ;
if (maxtsize > dtob(NXDAD * dmtext))
maxtsize = dtob(NXDAD * dmtext);
/*
* Set up the initial limits on process VM.
* Set the maximum resident set size to be all
* of (reasonably) available memory. This causes
* any single, large process to start random page
* replacement once it fills memory.
*/
u.u_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
u.u_rlimit[RLIMIT_STACK].rlim_max = MIN(MAXSSIZ, maxdmap);
u.u_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
u.u_rlimit[RLIMIT_DATA].rlim_max = MIN(MAXDSIZ, maxdmap);
u.u_rlimit[RLIMIT_RSS].rlim_cur = u.u_rlimit[RLIMIT_RSS].rlim_max =
ctob(LOOPPAGES - desfree);
proc[0].p_maxrss = LOOPPAGES - desfree;
}
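/*
 * dmapsize computes the number of bytes of swap space described by a
 * disk map whose per-segment sizes start at dmin blocks and double
 * while below dmax, over NDMAP segments.  As a worked example with
 * purely illustrative parameters dmin == 16, dmax == 512 and
 * NDMAP == 8, the segments cover 16, 32, 64, 128, 256, 512, 512 and
 * 512 blocks, so dmapsize returns dtob(2032).
 */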
dmapsize(dmin, dmax)
int dmin, dmax;
{
register int i, blk, size = 0;
blk = dmin;
for (i = 0; i < NDMAP; i++) {
size += blk;
if (blk < dmax)
blk *= 2;
}
return (dtob(size));
}
int pushes;
#define FRONT 1
#define BACK 2
/*
* The page out daemon, which runs as process 2.
*
* As long as there are at least lotsfree pages,
* this process is not run. When the number of free
* pages stays in the range desfree to lotsfree,
* this daemon runs through the pages in the loop
* at a rate determined in vmsched(). Pageout manages
* two hands on the clock. The front hand moves through
* memory, clearing the valid bit (simulating a reference bit),
* and stealing pages from procs that are over maxrss.
* The back hand travels a distance behind the front hand,
* freeing the pages that have not been referenced in the time
* since the front hand passed. If modified, they are pushed to
* swap before being freed.
*/
pageout()
{
register int count;
register int maxhand = pgtocm(maxfree);
register int fronthand, backhand;
/*
* Set the two clock hands to be separated by a reasonable amount,
* but no more than 360 degrees apart.
*/
backhand = 0 / CLBYTES;
fronthand = HANDSPREAD / CLBYTES;
if (fronthand >= maxhand)
fronthand = maxhand - 1;
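/*
 * A rough sketch with illustrative, machine-dependent values:
 * assuming HANDSPREAD of 2*1024*1024 bytes and CLBYTES of 1024,
 * the front hand would start 2048 core map clusters ahead of the
 * back hand (subject to the maxhand clamp above).
 */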
loop:
/*
* Before sleeping, look to see if there are any swap I/O headers
* in the ``cleaned'' list that correspond to dirty
* pages that have been pushed asynchronously. If so,
* empty the list by calling cleanup().
*
* N.B.: We guarantee never to block while the cleaned list is nonempty.
*/
(void) splbio();
if (bclnlist != NULL) {
(void) spl0();
cleanup();
goto loop;
}
sleep((caddr_t)&proc[2], PSWP+1);
(void) spl0();
count = 0;
pushes = 0;
while (nscan < desscan && freemem < lotsfree) {
/*
* If checkpage manages to add a page to the free list,
* we give ourselves another couple of trips around the loop.
*/
if (checkpage(fronthand, FRONT))
count = 0;
if (checkpage(backhand, BACK))
count = 0;
cnt.v_scan++;
nscan++;
if (++backhand >= maxhand)
backhand = 0;
if (++fronthand >= maxhand) {
fronthand = 0;
cnt.v_rev++;
if (count > 2) {
/*
* Extremely unlikely, but we went around
* the loop twice and didn't get anywhere.
* Don't cycle, stop till the next clock tick.
*/
goto loop;
}
count++;
}
}
goto loop;
}
/*
* An iteration of the clock pointer (hand) around the loop.
* Look at the page at hand. If it is
* locked (e.g. for physical i/o), a system page (u., page table),
* or free, then leave it alone.
* Otherwise, if we are running the front hand,
* invalidate the page for simulation of the reference bit.
* If the proc is over maxrss, we take it.
* If running the back hand, check whether the page
* has been reclaimed. If not, free the page,
* pushing it to disk first if necessary.
*/
checkpage(hand, whichhand)
int hand, whichhand;
{
register struct proc *rp;
register struct text *xp;
register struct cmap *c;
register struct pte *pte;
swblk_t daddr;
unsigned v;
int klsize;
top:
/*
* Find a process and text pointer for the
* page, and a virtual page number in either the
* process or the text image.
*/
c = &cmap[hand];
if (c->c_lock || c->c_free)
return (0);
switch (c->c_type) {
case CSYS:
return (0);
case CTEXT:
xp = &text[c->c_ndx];
rp = xp->x_caddr;
v = tptov(rp, c->c_page);
pte = tptopte(rp, c->c_page);
break;
case CDATA:
case CSTACK:
rp = &proc[c->c_ndx];
while (rp->p_flag & SNOVM)
rp = rp->p_xlink;
xp = rp->p_textp;
if (c->c_type == CDATA) {
v = dptov(rp, c->c_page);
pte = dptopte(rp, c->c_page);
} else {
v = sptov(rp, c->c_page);
pte = sptopte(rp, c->c_page);
}
break;
}
if (pte->pg_pfnum != cmtopg(hand))
panic("bad c_page");
#ifdef REFBIT
/*
* If any processes attached to the text page have used
* it, then mark this one used and on the following
* distpte, they will all be marked used.
*/
if (c->c_type == CTEXT && tanyu(xp, vtotp(rp, v)))
pte->pg_u = 1;
/*
* If page is referenced, clear its reference bit.
* If page is not referenced, clear valid bit
* and add it to the free list.
*/
uncachecl(pte);
if (anycl(pte, pg_u))
#else
/*
* If page is valid, make it invalid but reclaimable.
* If this pte is not valid, then it must be reclaimable
* and we can add it to the free list.
*/
if (pte->pg_v)
#endif
{
if (whichhand == BACK)
return (0);
#ifdef REFBIT
pte->pg_u = 0;
#else
pte->pg_v = 0;
rp->p_flag |= SPTECHG;
#endif
if (anycl(pte, pg_m))
pte->pg_m = 1;
distcl(pte);
if (c->c_type == CTEXT)
distpte(xp, (unsigned)vtotp(rp, v), pte);
if ((rp->p_flag & (SSEQL|SUANOM)) == 0 &&
rp->p_rssize <= rp->p_maxrss)
return (0);
}
if (c->c_type != CTEXT) {
/*
* Guarantee a minimal investment in data
* space for jobs in the balance set.
*/
if (rp->p_rssize < saferss - rp->p_slptime)
return (0);
}
/*
* If the page is currently dirty, we
* have to arrange to have it cleaned before it
* can be freed. We mark it clean immediately.
* If it is reclaimed while being pushed, then modified
* again, we are assured of the correct order of
* writes because we lock the page during the write.
* This guarantees that a swap() of this process (and
* thus this page), initiated in parallel, will,
* in fact, push the page after us.
*
* The most general worst case here would be for
* a reclaim, a modify and a swapout to occur
* all before the single page transfer completes.
*/
if (dirtycl(pte)) {
/*
* If the process is being swapped out
* or about to exit, do not bother with its
* dirty pages.
*/
if (rp->p_flag & (SLOCK|SWEXIT))
return (0);
/*
* Limit pushes to avoid saturating
* pageout device.
*/
if (pushes > maxpgio / RATETOSCHEDPAGING)
return (0);
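/*
 * With purely illustrative values of maxpgio == 40 and
 * RATETOSCHEDPAGING == 4, this allows at most 10 pushes per
 * paging-scheduler interval.
 */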
pushes++;
/*
* Now carefully make sure that there will
* be a header available for the push so that
* we will not block waiting for a header in
* swap(). The reason this is important is
* that we (proc[2]) are the one who cleans
* dirty swap headers and we could otherwise
* deadlock waiting for ourselves to clean
* swap headers. The sleep here on &proc[2]
* is actually (effectively) a sleep on both
* ourselves and &bswlist, and this is known
* to swdone and swap in vm_swp.c. That is,
* &proc[2] will be awakened both when dirty
* headers show up and also to get the pageout
* daemon moving.
*/
loop2:
(void) splbio();
if (bclnlist != NULL) {
(void) spl0();
cleanup();
goto loop2;
}
if (bswlist.av_forw == NULL) {
bswlist.b_flags |= B_WANTED;
sleep((caddr_t)&proc[2], PSWP+2);
(void) spl0();
/*
* Page disposition may have changed
* since process may have exec'ed,
* forked, exited or just about
* anything else... try this page
* frame again, from the top.
*/
goto top;
}
(void) spl0();
MLOCK(c);
uaccess(rp, Pushmap, &pushutl);
/*
* Now committed to pushing the page...
*/
#ifdef REFBIT
pte->pg_v = 0;
rp->p_flag |= SPTECHG;
#endif
pte->pg_m = 0;
distcl(pte);
if (c->c_type == CTEXT) {
xp->x_poip++;
distpte(xp, (unsigned)vtotp(rp, v), pte);
} else
rp->p_poip++;
v = kluster(rp, v, pte, B_WRITE, &klsize, klout, (daddr_t)0);
if (klsize == 0)
panic("pageout klsize");
daddr = vtod(rp, v, &pushutl.u_dmap, &pushutl.u_smap);
(void) swap(rp, daddr, ptob(v), klsize * ctob(CLSIZE),
B_WRITE, B_DIRTY, swapdev_vp, pte->pg_pfnum);
/*
* The cleaning of this page will be
* completed later, in cleanup() called
* (synchronously) by us (proc[2]). In
* the meantime, the page frame is locked
* so no havoc can result.
*/
return (1); /* well, it'll be free soon */
}
/*
* Propagate valid bit changes.
* Decrement the resident set size of the current
* text object/process, and put the page in the
* free list. Don't detach the page yet;
* it may yet have a chance to be reclaimed from
* the free list.
*/
#ifdef REFBIT
pte->pg_v = 0;
distcl(pte);
if (c->c_type == CTEXT)
distpte(xp, (unsigned)vtotp(rp, v), pte);
else
rp->p_flag |= SPTECHG;
#endif
if (c->c_gone == 0)
if (c->c_type == CTEXT)
xp->x_rssize -= CLSIZE;
else
rp->p_rssize -= CLSIZE;
memfree(pte, CLSIZE, 0);
cnt.v_dfree += CLSIZE;
return (1); /* freed a page! */
}
/*
* Process the ``cleaned'' list.
*
* Scan through the linked list of swap I/O headers
* and free the corresponding pages that have been
* cleaned by being written back to the paging area.
* If the page has been reclaimed during this time,
* we do not free the page. As they are processed,
* the swap I/O headers are removed from the cleaned
* list and inserted into the free list.
*/
cleanup()
{
register struct buf *bp;
register struct proc *rp;
register struct text *xp;
register struct cmap *c;
register struct pte *pte;
struct pte *upte;
unsigned pf;
register int i;
int s, center;
for (;;) {
s = splbio();
if ((bp = bclnlist) == 0)
break;
bclnlist = bp->av_forw;
splx(s);
pte = vtopte(&proc[2], btop(bp->b_un.b_addr));
center = 0;
for (i = 0; i < bp->b_bcount; i += CLSIZE * NBPG) {
pf = pte->pg_pfnum;
c = &cmap[pgtocm(pf)];
MUNLOCK(c);
if (pf != bp->b_pfcent) {
if (c->c_gone) {
memfree(pte, CLSIZE, 0);
cnt.v_dfree += CLSIZE;
}
goto skip;
}
center++;
switch (c->c_type) {
case CSYS:
panic("cleanup CSYS");
case CTEXT:
xp = &text[c->c_ndx];
xp->x_poip--;
if (xp->x_poip == 0)
wakeup((caddr_t)&xp->x_poip);
break;
case CDATA:
case CSTACK:
rp = &proc[c->c_ndx];
while (rp->p_flag & SNOVM)
rp = rp->p_xlink;
rp->p_poip--;
if (rp->p_poip == 0)
wakeup((caddr_t)&rp->p_poip);
break;
}
if (c->c_gone == 0) {
switch (c->c_type) {
case CTEXT:
upte = tptopte(xp->x_caddr, c->c_page);
break;
case CDATA:
upte = dptopte(rp, c->c_page);
break;
case CSTACK:
upte = sptopte(rp, c->c_page);
break;
}
if (upte->pg_v)
goto skip;
if (c->c_type == CTEXT)
xp->x_rssize -= CLSIZE;
else
rp->p_rssize -= CLSIZE;
}
memfree(pte, CLSIZE, 0);
cnt.v_dfree += CLSIZE;
skip:
pte += CLSIZE;
}
if (center != 1)
panic("cleanup center");
bp->b_flags = 0;
bp->av_forw = bswlist.av_forw;
bswlist.av_forw = bp;
if (bp->b_vp)
brelvp(bp);
if (bswlist.b_flags & B_WANTED) {
bswlist.b_flags &= ~B_WANTED;
wakeup((caddr_t)&bswlist);
}
}
splx(s);
}
/*
* Kluster locates pages adjacent to the argument pages
* that are immediately available to include in the pagein/pageout,
* and, given the availability of memory, includes them.
* It knows that the process image is contiguous in chunks;
* an assumption here is that CLSIZE * KLMAX is a divisor of dmmin,
* so that by looking at KLMAX chunks of pages, all such will
* necessarily be mapped swap contiguous.
*/
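/*
 * Restating that assumption with illustrative numbers: with
 * CLSIZE == 2 and KLMAX == 16, CLSIZE * KLMAX == 32, so dmmin must
 * be a multiple of 32; every aligned window of KLMAX clusters then
 * lies within a single dmap segment and is contiguous on swap.
 */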
int noklust;
int klicnt[KLMAX];
int klocnt[KLMAX];
kluster(p, v, pte0, rw, pkl, klsize, bn0)
register struct proc *p;
unsigned v;
struct pte *pte0;
int rw;
register int *pkl;
int klsize;
daddr_t bn0;
{
int type, cl, clmax;
int kloff, k, klmax;
register struct pte *pte;
int klback, klforw;
int i;
unsigned v0;
daddr_t bn;
register struct cmap *c;
if (rw == B_READ)
klicnt[0]++;
else
klocnt[0]++;
*pkl = 1;
if (noklust || klsize <= 1 || klsize > KLMAX || (klsize & (klsize - 1)))
return (v);
if (rw == B_READ && freemem < CLSIZE * KLMAX)
return (v);
if (isassv(p, v)) {
type = CSTACK;
cl = vtosp(p, v) / CLSIZE;
clmax = p->p_ssize / CLSIZE;
} else if (isadsv(p, v)) {
type = CDATA;
cl = vtodp(p, v) / CLSIZE;
clmax = p->p_dsize / CLSIZE;
} else {
type = CTEXT;
cl = vtotp(p, v) / CLSIZE;
clmax = p->p_textp->x_size / CLSIZE;
}
kloff = cl & (klsize - 1);
pte = pte0;
bn = bn0;
for (k = kloff; --k >= 0;) {
if (type == CSTACK)
pte += CLSIZE;
else
pte -= CLSIZE;
if (type == CTEXT && rw == B_READ && bn) {
bn -= btodb(CLBYTES);
if (mfind(swapdev_vp, bn))
break;
}
if (!klok(pte, rw))
break;
}
klback = (kloff - k) - 1;
pte = pte0;
if ((cl - kloff) + klsize > clmax)
klmax = clmax - (cl - kloff);
else
klmax = klsize;
bn = bn0;
for (k = kloff; ++k < klmax;) {
if (type == CSTACK)
pte -= CLSIZE;
else
pte += CLSIZE;
if (type == CTEXT && rw == B_READ && bn) {
bn += btodb(CLBYTES);
if (mfind(swapdev_vp, bn))
break;
}
if (!klok(pte, rw))
break;
}
klforw = (k - kloff) - 1;
if (klforw + klback == 0)
return (v);
pte = pte0;
if (type == CSTACK) {
pte -= klforw * CLSIZE;
v -= klforw * CLSIZE;
} else {
pte -= klback * CLSIZE;
v -= klback * CLSIZE;
}
*pkl = klforw + klback + 1;
if (rw == B_READ)
klicnt[0]--, klicnt[*pkl - 1]++;
else
klocnt[0]--, klocnt[*pkl - 1]++;
v0 = v;
for (i = 0; i < *pkl; i++) {
if (pte == pte0)
goto cont;
if (rw == B_WRITE) {
c = &cmap[pgtocm(pte->pg_pfnum)];
MLOCK(c);
pte->pg_m = 0;
distcl(pte);
if (type == CTEXT)
distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
} else {
struct pte opte;
opte = *pte;
if (memall(pte, CLSIZE, p, type) == 0)
panic("kluster");
pte->pg_prot = opte.pg_prot;
cmap[pgtocm(pte->pg_pfnum)].c_intrans = 1;
distcl(pte);
if (type == CTEXT) {
p->p_textp->x_rssize += CLSIZE;
distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
} else
p->p_rssize += CLSIZE;
distcl(pte);
}
cont:
pte += CLSIZE;
v += CLSIZE;
}
return (v0);
}
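/*
 * klok decides whether a neighboring cluster may be included in a
 * kluster: for a write the cluster must be resident (not fill on
 * demand, nonzero page frame number), neither locked nor intransit,
 * and dirty; for a read it must be a non-resident, non-fill-on-demand
 * page, i.e. one that has to come from the swap device.
 */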
klok(pte, rw)
register struct pte *pte;
int rw;
{
register struct cmap *c;
if (rw == B_WRITE) {
if (pte->pg_fod)
return (0);
if (pte->pg_pfnum == 0)
return (0);
c = &cmap[pgtocm(pte->pg_pfnum)];
if (c->c_lock || c->c_intrans)
return (0);
uncachecl(pte);
if (!dirtycl(pte))
return (0);
return (1);
} else {
if (pte->pg_fod)
return (0);
if (pte->pg_pfnum)
return (0);
return (1);
}
}
/*
* Fodkluster locates pages adjacent to the argument pages
* that are immediately available to include in the pagein,
* and given the availability of memory includes them.
* It wants to page in a file system block if it can.
*/
int nofodklust = 0;
int fodklcnt[KLMAX];
fodkluster(p, v0, pte0, pkl, vp, pbn)
register struct proc *p;
unsigned v0;
struct pte *pte0;
int *pkl;
struct vnode *vp;
daddr_t *pbn;
{
register struct pte *pte;
register struct fpte *fpte;
register daddr_t bn;
daddr_t bnswap;
unsigned v, vmin, vmax;
register int klsize;
int klback, type, i;
fodklcnt[0]++;
*pkl = 1;
if (freemem < KLMAX || nofodklust)
return (v0);
if (isatsv(p, v0)) {
type = CTEXT;
vmin = tptov(p, 0);
vmax = tptov(p, clrnd(p->p_tsize) - CLSIZE);
} else {
type = CDATA;
vmin = dptov(p, 0);
vmax = dptov(p, clrnd(p->p_dsize) - CLSIZE);
}
fpte = (struct fpte *)pte0;
bn = *pbn;
v = v0;
for (klsize = 1; klsize < KLMAX; klsize++) {
if (v <= vmin)
break;
v -= CLSIZE;
fpte -= CLSIZE;
if (fpte->pg_fod == 0)
break;
bn -= btodb(CLBYTES);
if (fpte->pg_blkno != bn)
break;
if (type == CTEXT) {
if (mfind(vp, bn))
break;
/*
* Flush any previous text page use of this
* swap device block.
*/
bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
munhash(swapdev_vp, bnswap);
}
}
klback = klsize - 1;
fpte = (struct fpte *)pte0;
bn = *pbn;
v = v0;
for (; klsize < KLMAX; klsize++) {
v += CLSIZE;
if (v > vmax)
break;
fpte += CLSIZE;
if (fpte->pg_fod == 0)
break;
bn += btodb(CLBYTES);
if (fpte->pg_blkno != bn)
break;
if (type == CTEXT) {
if (mfind(vp, bn))
break;
/*
* Flush any previous text page use of this
* swap device block.
*/
bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
munhash(swapdev_vp, bnswap);
}
}
if (klsize == 1)
return (v0);
pte = pte0;
pte -= klback * CLSIZE;
v0 -= klback * CLSIZE;
*pbn -= klback * btodb(CLBYTES);
*pkl = klsize;
fodklcnt[0]--; fodklcnt[klsize - 1]++;
v = v0;
for (i = 0; i < klsize; i++) {
if (pte != pte0) {
struct pte opte;
int pf;
opte = *pte;
if (memall(pte, CLSIZE, p, type) == 0)
panic("fodkluster");
pte->pg_prot = opte.pg_prot;
pf = pte->pg_pfnum;
pte->pg_m = 1;
cmap[pgtocm(pf)].c_intrans = 1;
distcl(pte);
if (type == CTEXT) {
p->p_textp->x_rssize += CLSIZE;
distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
} else
p->p_rssize += CLSIZE;
distcl(pte);
}
pte += CLSIZE;
v += CLSIZE;
}
return (v0);
}
#ifdef REFBIT
/*
* Examine the reference bits in the pte's of all
* processes linked to a particular text segment.
*/
tanyu(xp, tp)
struct text *xp;
register tp;
{
register struct proc *p;
register struct pte *pte;
for (p = xp->x_caddr; p; p = p->p_xlink) {
pte = tptopte(p, tp);
uncache(pte);
if (anycl(pte, pg_u))
return (1);
}
return (0);
}
#endif