X-Git-Url: https://git.subgeniuskitty.com/unix-history/.git/blobdiff_plain/fe8987fbae68c824eca164260c258f3146538522..a937f8567ba9375553a85507f18042a6faaffaab:/usr/src/sys/kern/vfs_cluster.c

diff --git a/usr/src/sys/kern/vfs_cluster.c b/usr/src/sys/kern/vfs_cluster.c
index 1a3d565039..362ad50350 100644
--- a/usr/src/sys/kern/vfs_cluster.c
+++ b/usr/src/sys/kern/vfs_cluster.c
@@ -1,162 +1,147 @@
-/*	vfs_cluster.c	3.3	%H%	*/
-
-#include "../h/param.h"
-#include "../h/systm.h"
-#include "../h/dir.h"
-#include "../h/user.h"
-#include "../h/buf.h"
-#include "../h/conf.h"
-#include "../h/proc.h"
-#include "../h/seg.h"
-#include "../h/pte.h"
-#include "../h/vm.h"
-
-/*
- * The following several routines allocate and free
- * buffers with various side effects.  In general the
- * arguments to an allocate routine are a device and
- * a block number, and the value is a pointer to
- * to the buffer header; the buffer is marked "busy"
- * so that no one else can touch it.  If the block was
- * already in core, no I/O need be done; if it is
- * already busy, the process waits until it becomes free.
- * The following routines allocate a buffer:
- *	getblk
- *	bread
- *	breada
- *	baddr	(if it is incore)
- * Eventually the buffer must be released, possibly with the
- * side effect of writing it out, by using one of
- *	bwrite
- *	bdwrite
- *	bawrite
- *	brelse
- */
-
-#define	BUFHSZ	63
-#define	BUFHASH(blkno)	(blkno % BUFHSZ)
-short	bufhash[BUFHSZ];
-
 /*
- * Initialize hash links for buffers.
+ * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are permitted
+ * provided that the above copyright notice and this paragraph are
+ * duplicated in all such forms and that any documentation,
+ * advertising materials, and other materials related to such
+ * distribution and use acknowledge that the software was developed
+ * by the University of California, Berkeley.  The name of the
+ * University may not be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ *	@(#)vfs_cluster.c	7.10 (Berkeley) %G%
 */
-bhinit()
-{
-	register int i;
-
-	for (i = 0; i < BUFHSZ; i++)
-		bufhash[i] = -1;
-}
-/* #define	DISKMON	1 */
-
-#ifdef	DISKMON
-struct {
-	int	nbuf;
-	long	nread;
-	long	nreada;
-	long	ncache;
-	long	nwrite;
-	long	bufcount[NBUF];
-} io_info;
-#endif
-
-/*
- * Swap IO headers -
- * They contain the necessary information for the swap I/O.
- * At any given time, a swap header can be in three
- * different lists. When free it is in the free list,
- * when allocated and the I/O queued, it is on the swap
- * device list, and finally, if the operation was a dirty
- * page push, when the I/O completes, it is inserted
- * in a list of cleaned pages to be processed by the pageout daemon.
- */
-struct	buf swbuf[NSWBUF];
-short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
-int	swpf[NSWBUF];
-
-
-#ifdef	FASTVAX
-#define	notavail(bp) \
-{ \
-	int s = spl6(); \
-	(bp)->av_back->av_forw = (bp)->av_forw; \
-	(bp)->av_forw->av_back = (bp)->av_back; \
-	(bp)->b_flags |= B_BUSY; \
-	splx(s); \
-}
-#endif
+#include "param.h"
+#include "user.h"
+#include "buf.h"
+#include "vnode.h"
+#include "trace.h"
+#include "ucred.h"

 /*
  * Read in (if necessary) the block and return a buffer pointer.
  */
-struct buf *
-bread(dev, blkno)
-dev_t dev;
-daddr_t blkno;
+bread(vp, blkno, size, cred, bpp)
+	struct vnode *vp;
+	daddr_t blkno;
+	int size;
+	struct ucred *cred;
+	struct buf **bpp;
+#ifdef SECSIZE
+	long secsize;
+#endif SECSIZE
 {
 	register struct buf *bp;

-	bp = getblk(dev, blkno);
-	if (bp->b_flags&B_DONE) {
-#ifdef	DISKMON
-		io_info.ncache++;
-#endif
-		return(bp);
+	if (size == 0)
+		panic("bread: size 0");
+#ifdef SECSIZE
+	bp = getblk(dev, blkno, size, secsize);
+#else SECSIZE
+	*bpp = bp = getblk(vp, blkno, size);
+#endif SECSIZE
+	if (bp->b_flags&(B_DONE|B_DELWRI)) {
+		trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), blkno);
+		return (0);
 	}
 	bp->b_flags |= B_READ;
-	bp->b_bcount = BSIZE;
-	(*bdevsw[major(dev)].d_strategy)(bp);
-#ifdef	DISKMON
-	io_info.nread++;
-#endif
-	u.u_vm.vm_inblk++;		/* pay for read */
-	iowait(bp);
-	return(bp);
+	if (bp->b_bcount > bp->b_bufsize)
+		panic("bread");
+	if (bp->b_rcred == NOCRED && cred != NOCRED) {
+		crhold(cred);
+		bp->b_rcred = cred;
+	}
+	VOP_STRATEGY(bp);
+	trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), blkno);
+	u.u_ru.ru_inblock++;		/* pay for read */
+	return (biowait(bp));
 }
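
The new bread() is the template for the whole file: buffers are named by a (vnode, logical block) pair rather than a (device, physical block) pair, credentials ride along for the first read, and the i/o status comes back as a return value instead of through u.u_error. A minimal sketch of a caller under those rules (example_read() and its locals are hypothetical, written against the interfaces above, not part of this commit):

	/*
	 * Hypothetical caller: read one block of a file through the
	 * cache and release it.  bread() returns 0 on a cache hit
	 * without doing i/o; on a miss it queues the read through
	 * VOP_STRATEGY() and biowait() supplies the result.
	 */
	int
	example_read(vp, lbn, bsize, cred)
		struct vnode *vp;
		daddr_t lbn;
		int bsize;
		struct ucred *cred;
	{
		struct buf *bp;
		int error;

		if (error = bread(vp, lbn, bsize, cred, &bp)) {
			brelse(bp);		/* buffer is still held on error */
			return (error);
		}
		/* ... consume bp->b_un.b_addr here ... */
		brelse(bp);			/* back onto a free list, still cached */
		return (0);
	}
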

 /*
  * Read in the block, like bread, but also start I/O on the
  * read-ahead block (which is not allocated to the caller)
  */
-struct buf *
-breada(dev, blkno, rablkno)
-dev_t dev;
-daddr_t blkno, rablkno;
+breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
+	struct vnode *vp;
+	daddr_t blkno; int size;
+#ifdef SECSIZE
+	long secsize;
+#endif SECSIZE
+	daddr_t rablkno; int rabsize;
+	struct ucred *cred;
+	struct buf **bpp;
 {
 	register struct buf *bp, *rabp;

 	bp = NULL;
-	if (!incore(dev, blkno)) {
-		bp = getblk(dev, blkno);
-		if ((bp->b_flags&B_DONE) == 0) {
+	/*
+	 * If the block isn't in core, then allocate
+	 * a buffer and initiate i/o (getblk checks
+	 * for a cache hit).
+	 */
+	if (!incore(vp, blkno)) {
+#ifdef SECSIZE
+		bp = getblk(dev, blkno, size, secsize);
+#else SECSIZE
+		*bpp = bp = getblk(vp, blkno, size);
+#endif SECSIZE
+		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
 			bp->b_flags |= B_READ;
-			bp->b_bcount = BSIZE;
-			(*bdevsw[major(dev)].d_strategy)(bp);
-#ifdef	DISKMON
-			io_info.nread++;
-#endif
-			u.u_vm.vm_inblk++;		/* pay for read */
-		}
+			if (bp->b_bcount > bp->b_bufsize)
+				panic("breada");
+			if (bp->b_rcred == NOCRED && cred != NOCRED) {
+				crhold(cred);
+				bp->b_rcred = cred;
+			}
+			VOP_STRATEGY(bp);
+			trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size),
+			    blkno);
+			u.u_ru.ru_inblock++;		/* pay for read */
+		} else
+			trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size),
+			    blkno);
 	}
-	if (rablkno && !incore(dev, rablkno)) {
-		rabp = getblk(dev, rablkno);
-		if (rabp->b_flags & B_DONE)
+
+	/*
+	 * If there's a read-ahead block, start i/o
+	 * on it also (as above).
+	 */
+	if (rablkno && !incore(vp, rablkno)) {
+#ifdef SECSIZE
+		rabp = getblk(dev, rablkno, rabsize, secsize);
+#else SECSIZE
+		rabp = getblk(vp, rablkno, rabsize);
+#endif SECSIZE
+		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
 			brelse(rabp);
-		else {
+			trace(TR_BREADHITRA,
+			    pack(vp->v_mount->m_fsid[0], rabsize), blkno);
+		} else {
 			rabp->b_flags |= B_READ|B_ASYNC;
-			rabp->b_bcount = BSIZE;
-			(*bdevsw[major(dev)].d_strategy)(rabp);
-#ifdef	DISKMON
-			io_info.nreada++;
-#endif
-			u.u_vm.vm_inblk++;		/* pay in advance */
+			if (rabp->b_bcount > rabp->b_bufsize)
+				panic("breadrabp");
+			if (bp->b_rcred == NOCRED && cred != NOCRED) {
+				crhold(cred);
+				bp->b_rcred = cred;
+			}
+			VOP_STRATEGY(rabp);
+			trace(TR_BREADMISSRA,
+			    pack(vp->v_mount->m_fsid[0], rabsize), rablock);
+			u.u_ru.ru_inblock++;		/* pay in advance */
 		}
 	}
-	if(bp == NULL)
-		return(bread(dev, blkno));
-	iowait(bp);
-	return(bp);
+
+	/*
+	 * If block was in core, let bread get it.
+	 * If block wasn't in core, then the read was started
+	 * above, and just wait for it.
+	 */
+	if (bp == NULL)
+#ifdef SECSIZE
+		return (bread(dev, blkno, size, secsize));
+#else SECSIZE
+		return (bread(vp, blkno, size, cred, bpp));
+#endif SECSIZE
+	return (biowait(bp));
 }
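
breada() overlaps the read-ahead i/o with the caller's use of the current block; the rabp buffer is started B_ASYNC and deliberately not returned. A hedged sketch of a sequential reader (example_read_seq() is hypothetical, and a real file system would translate logical block numbers through its block map before asking for lbn + 1):

	/*
	 * Hypothetical sequential reader: fetch block lbn and prime
	 * the cache with lbn + 1.  A later call for lbn + 1 should
	 * then hit in the cache and pay no i/o wait.
	 */
	int
	example_read_seq(vp, lbn, bsize, cred)
		struct vnode *vp;
		daddr_t lbn;
		int bsize;
		struct ucred *cred;
	{
		struct buf *bp;
		int error;

		error = breada(vp, lbn, bsize, lbn + 1, bsize, cred, &bp);
		if (error == 0)
			brelse(bp);
		return (error);
	}
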

 /*
@@ -164,26 +149,34 @@ daddr_t blkno, rablkno;
  * Then release the buffer.
  */
 bwrite(bp)
-register struct buf *bp;
+	register struct buf *bp;
 {
-	register flag;
+	register int flag;
+	int error;

 	flag = bp->b_flags;
-	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
-	bp->b_bcount = BSIZE;
-#ifdef	DISKMON
-	io_info.nwrite++;
-#endif
+	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
 	if ((flag&B_DELWRI) == 0)
-		u.u_vm.vm_oublk++;		/* noone paid yet */
-	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
+		u.u_ru.ru_oublock++;		/* noone paid yet */
+	trace(TR_BWRITE,
+	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bcount), bp->b_blkno);
+	if (bp->b_bcount > bp->b_bufsize)
+		panic("bwrite");
+	VOP_STRATEGY(bp);
+
+	/*
+	 * If the write was synchronous, then await i/o completion.
+	 * If the write was "delayed", then we put the buffer on
+	 * the q of blocks awaiting i/o completion status.
+	 */
 	if ((flag&B_ASYNC) == 0) {
-		iowait(bp);
+		error = biowait(bp);
 		brelse(bp);
-	} else if (flag & B_DELWRI)
+	} else if (flag & B_DELWRI) {
 		bp->b_flags |= B_AGE;
-	else
-		geterror(bp);
+		error = 0;
+	}
+	return (error);
 }

 /*
@@ -195,66 +188,92 @@ register struct buf *bp;
  * in the same order as requested.
  */
 bdwrite(bp)
-register struct buf *bp;
+	register struct buf *bp;
 {
-	register struct buf *dp;

 	if ((bp->b_flags&B_DELWRI) == 0)
-		u.u_vm.vm_oublk++;		/* noone paid yet */
-	dp = bdevsw[major(bp->b_dev)].d_tab;
-	if(dp->b_flags & B_TAPE)
+		u.u_ru.ru_oublock++;		/* noone paid yet */
+#ifdef notdef
+	/*
+	 * This does not work for buffers associated with
+	 * vnodes that are remote - they have no dev.
+	 * Besides, we don't use bio with tapes, so rather
+	 * than develop a fix, we just ifdef this out for now.
+	 */
+	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
 		bawrite(bp);
 	else {
 		bp->b_flags |= B_DELWRI | B_DONE;
 		brelse(bp);
 	}
+#endif
+	bp->b_flags |= B_DELWRI | B_DONE;
+	brelse(bp);
 }

 /*
  * Release the buffer, start I/O on it, but don't wait for completion.
  */
 bawrite(bp)
-register struct buf *bp;
+	register struct buf *bp;
 {

 	bp->b_flags |= B_ASYNC;
-	bwrite(bp);
+	(void) bwrite(bp);
 }

 /*
- * release the buffer, with no I/O implied.
+ * Release the buffer, with no I/O implied.
 */
 brelse(bp)
-register struct buf *bp;
+	register struct buf *bp;
 {
-	register struct buf **backp;
+	register struct buf *flist;
 	register s;

+	trace(TR_BRELSE,
+	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
+	/*
+	 * If someone's waiting for the buffer, or
+	 * is waiting for a buffer wake 'em up.
+	 */
 	if (bp->b_flags&B_WANTED)
 		wakeup((caddr_t)bp);
-	if (bfreelist.b_flags&B_WANTED) {
-		bfreelist.b_flags &= ~B_WANTED;
-		wakeup((caddr_t)&bfreelist);
+	if (bfreelist[0].b_flags&B_WANTED) {
+		bfreelist[0].b_flags &= ~B_WANTED;
+		wakeup((caddr_t)bfreelist);
 	}
-	if ((bp->b_flags&B_ERROR) && bp->b_dev != NODEV) {
-		bunhash(bp);
-		bp->b_dev = NODEV;  /* no assoc. on error */
+	if (bp->b_flags & B_NOCACHE) {
+		bp->b_flags |= B_INVAL;
 	}
-	s = spl6();
-	if(bp->b_flags & (B_AGE|B_ERROR)) {
-		backp = &bfreelist.av_forw;
-		(*backp)->av_back = bp;
-		bp->av_forw = *backp;
-		*backp = bp;
-		bp->av_back = &bfreelist;
+	if (bp->b_flags&B_ERROR)
+		if (bp->b_flags & B_LOCKED)
+			bp->b_flags &= ~B_ERROR;	/* try again later */
+		else
+			brelvp(bp); 			/* no assoc */
+
+	/*
+	 * Stick the buffer back on a free list.
+	 */
+	s = splbio();
+	if (bp->b_bufsize <= 0) {
+		/* block has no buffer ... put at front of unused buffer list */
+		flist = &bfreelist[BQ_EMPTY];
+		binsheadfree(bp, flist);
+	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
+		/* block has no info ... put at front of most free list */
+		flist = &bfreelist[BQ_AGE];
+		binsheadfree(bp, flist);
 	} else {
-		backp = &bfreelist.av_back;
-		(*backp)->av_forw = bp;
-		bp->av_back = *backp;
-		*backp = bp;
-		bp->av_forw = &bfreelist;
+		if (bp->b_flags & B_LOCKED)
+			flist = &bfreelist[BQ_LOCKED];
+		else if (bp->b_flags & B_AGE)
+			flist = &bfreelist[BQ_AGE];
+		else
+			flist = &bfreelist[BQ_LRU];
+		binstailfree(bp, flist);
 	}
-	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
+	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE);
 	splx(s);
 }
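
brelse() replaces the old single bfreelist with four queues (BQ_LOCKED, BQ_LRU, BQ_AGE, BQ_EMPTY); head insertions mark a buffer for immediate reuse, tail insertions preserve LRU aging. A stand-alone userland illustration of the queue choice (the flag values are invented for the demo; only the decision order is taken from the code above):

	#include <stdio.h>

	#define B_LOCKED 0x1
	#define B_AGE    0x2
	#define B_ERROR  0x4
	#define B_INVAL  0x8

	static const char *
	pick_queue(int flags, int bufsize)
	{
		if (bufsize <= 0)
			return "BQ_EMPTY (head)";	/* no storage attached */
		if (flags & (B_ERROR | B_INVAL))
			return "BQ_AGE (head)";		/* contents worthless, reuse soon */
		if (flags & B_LOCKED)
			return "BQ_LOCKED (tail)";	/* never given away */
		if (flags & B_AGE)
			return "BQ_AGE (tail)";		/* probably not wanted again */
		return "BQ_LRU (tail)";			/* normal case: keep as long as possible */
	}

	int
	main(void)
	{
		printf("%s\n", pick_queue(0, 8192));		/* BQ_LRU (tail) */
		printf("%s\n", pick_queue(B_AGE, 8192));	/* BQ_AGE (tail) */
		printf("%s\n", pick_queue(B_ERROR, 8192));	/* BQ_AGE (head) */
		printf("%s\n", pick_queue(0, 0));		/* BQ_EMPTY (head) */
		return 0;
	}
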

@@ -262,28 +281,36 @@ register struct buf *bp;
  * See if the block is associated with some buffer
  * (mainly to avoid getting hung up on a wait in breada)
  */
-incore(dev, blkno)
-dev_t dev;
-daddr_t blkno;
+incore(vp, blkno)
+	struct vnode *vp;
+	daddr_t blkno;
 {
 	register struct buf *bp;
-	register int dblkno = fsbtodb(blkno);
+	register struct buf *dp;

-	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
-	    bp = &buf[bp->b_hlink])
-		if (bp->b_blkno == dblkno && bp->b_dev == dev)
+	dp = BUFHASH(vp, blkno);
+	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
+		if (bp->b_blkno == blkno && bp->b_vp == vp &&
+		    (bp->b_flags & B_INVAL) == 0)
 			return (1);
 	return (0);
 }

-struct buf *
-baddr(dev, blkno)
-dev_t dev;
-daddr_t blkno;
+baddr(vp, blkno, size, cred, bpp)
+	struct vnode *vp;
+	daddr_t blkno;
+	int size;
+	struct ucred *cred;
+	struct buf **bpp;
+#ifdef SECSIZE
+	long secsize;
+#endif SECSIZE
 {

-	if (incore(dev, blkno))
-		return (bread(dev, blkno));
+#ifdef SECSIZE
+	if (incore(dev, blkno))
+		return (bread(dev, blkno, size, secsize));
+#else SECSIZE
+	if (incore(vp, blkno))
+		return (bread(vp, blkno, size, cred, bpp));
+	*bpp = 0;
+#endif SECSIZE
 	return (0);
 }

@@ -291,93 +318,97 @@ daddr_t blkno;
  * Assign a buffer for the given block.  If the appropriate
  * block is already associated, return it; otherwise search
  * for the oldest non-busy buffer and reassign it.
+ *
+ * If we find the buffer, but it is dirty (marked DELWRI) and
+ * its size is changing, we must write it out first. When the
+ * buffer is shrinking, the write is done by brealloc to avoid
+ * losing the unwritten data. When the buffer is growing, the
+ * write is done by getblk, so that bread will not read stale
+ * disk data over the modified data in the buffer.
+ *
+ * We use splx here because this routine may be called
+ * on the interrupt stack during a dump, and we don't
+ * want to lower the ipl back to 0.
 */
 struct buf *
-getblk(dev, blkno)
-dev_t dev;
-daddr_t blkno;
+#ifdef SECSIZE
+getblk(dev, blkno, size, secsize)
+#else SECSIZE
+getblk(vp, blkno, size)
+#endif SECSIZE
+	register struct vnode *vp;
+	daddr_t blkno;
+	int size;
+#ifdef SECSIZE
+	long secsize;
+#endif SECSIZE
 {
-	register struct buf *bp, *dp, *ep;
-	register int i, x;
-	register int dblkno = fsbtodb(blkno);
-
-    loop:
-	VOID spl0();
-	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
-	    bp = &buf[bp->b_hlink]) {
-		if (bp->b_blkno != dblkno || bp->b_dev != dev)
+	register struct buf *bp, *dp;
+	int s;
+
+	if (size > MAXBSIZE)
+		panic("getblk: size too big");
+	/*
+	 * To prevent overflow of 32-bit ints when converting block
+	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
+	 * to the maximum number that can be converted to a byte offset
+	 * without overflow. This is historic code; what bug it fixed,
+	 * or whether it is still a reasonable thing to do is open to
+	 * dispute. mkm 9/85
+	 */
+	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
+		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
+	/*
+	 * Search the cache for the block.  If we hit, but
+	 * the buffer is in use for i/o, then we wait until
+	 * the i/o has completed.
+	 */
+	dp = BUFHASH(vp, blkno);
+loop:
+	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
+		if (bp->b_blkno != blkno || bp->b_vp != vp ||
+		    bp->b_flags&B_INVAL)
 			continue;
-		VOID spl6();
+		s = splbio();
 		if (bp->b_flags&B_BUSY) {
 			bp->b_flags |= B_WANTED;
 			sleep((caddr_t)bp, PRIBIO+1);
+			splx(s);
 			goto loop;
 		}
-		VOID spl0();
-#ifdef	DISKMON
-		i = 0;
-		dp = bp->av_forw;
-		while (dp != &bfreelist) {
-			i++;
-			dp = dp->av_forw;
-		}
-		if (i<NBUF)
-			io_info.bufcount[i]++;
-#endif
 		notavail(bp);
+		if (bp->b_bcount != size) {
+		if (bp->b_bcount < size && (bp->b_flags&B_DELWRI)) {
+			bp->b_flags &= ~B_ASYNC;
+			(void) bwrite(bp);
+			goto loop;
+		}
+		if (brealloc(bp, size) == 0)
+			goto loop;
+	}
+	if (bp->b_bcount != size && brealloc(bp, size) == 0)
+		goto loop;
 		bp->b_flags |= B_CACHE;
-		return(bp);
-	}
-	if (major(dev) >= nblkdev)
-		panic("blkdev");
-	dp = bdevsw[major(dev)].d_tab;
-	if (dp == NULL)
-		panic("devtab");
-	VOID spl6();
-	if (bfreelist.av_forw == &bfreelist) {
-		bfreelist.b_flags |= B_WANTED;
-		sleep((caddr_t)&bfreelist, PRIBIO+1);
-		goto loop;
+	return (bp);
 	}
-	spl0();
-	bp = bfreelist.av_forw;
-	notavail(bp);
-	if (bp->b_flags & B_DELWRI) {
-		bp->b_flags |= B_ASYNC;
-		bwrite(bp);
+	bp = getnewbuf();
+	bfree(bp);
+	bremhash(bp);
+	if (bp->b_vp)
+		brelvp(bp);
+	VREF(vp);
+	bp->b_vp = vp;
+	bp->b_dev = vp->v_rdev;
+#ifdef SECSIZE
+	bp->b_blksize = secsize;
+#endif SECSIZE
+	bp->b_blkno = blkno;
+	bp->b_error = 0;
+	bp->b_resid = 0;
+	binshash(bp, dp);
+	if (brealloc(bp, size) == 0)
 		goto loop;
-	}
-	if (bp->b_dev == NODEV)
-		goto done;
-	/* INLINE EXPANSION OF bunhash(bp) */
-	i = BUFHASH(dbtofsb(bp->b_blkno));
-	x = bp - buf;
-	if (bufhash[i] == x) {
-		bufhash[i] = bp->b_hlink;
-	} else {
-		for (ep = &buf[bufhash[i]]; ep != &buf[-1];
-		    ep = &buf[ep->b_hlink])
-			if (ep->b_hlink == x) {
-				ep->b_hlink = bp->b_hlink;
-				goto done;
-			}
-		panic("getblk");
-	}
-done:
-	/* END INLINE EXPANSION */
-	bp->b_flags = B_BUSY;
-	bp->b_back->b_forw = bp->b_forw;
-	bp->b_forw->b_back = bp->b_back;
-	bp->b_forw = dp->b_forw;
-	bp->b_back = dp;
-	dp->b_forw->b_back = bp;
-	dp->b_forw = bp;
-	bp->b_dev = dev;
-	bp->b_blkno = dblkno;
-	i = BUFHASH(blkno);
-	bp->b_hlink = bufhash[i];
-	bufhash[i] = bp - buf;
-	return(bp);
+	return (bp);
 }
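
The clamp near the top of getblk() is easiest to see with concrete numbers. Assuming 32-bit ints, NBBY == 8 and DEV_BSHIFT == 9 (512-byte device blocks), 2^23 blocks is exactly 2^32 bytes, so any blkno at or above 1 << 23 would overflow once shifted into a byte offset. A small userland program that works the arithmetic:

	#include <stdio.h>

	#define NBBY       8	/* bits per byte */
	#define DEV_BSHIFT 9	/* assumes DEV_BSIZE == 512 */

	int
	main(void)
	{
		unsigned limit = 1u << (sizeof(int) * NBBY - DEV_BSHIFT);

		/* 8388608 blocks: the first blkno whose byte offset needs 33 bits */
		printf("clamp threshold: %u blocks\n", limit);
		/* 4294966784 bytes: the largest offset still representable */
		printf("largest safe offset: %lu bytes\n",
		    ((unsigned long)limit - 1) << DEV_BSHIFT);
		/* out-of-range blknos are pinned to this sentinel value */
		printf("sentinel blkno: %u\n",
		    1u << (sizeof(int) * NBBY - DEV_BSHIFT + 1));
		return 0;
	}
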

 /*
@@ -385,122 +416,202 @@ done:
  * not assigned to any particular device
  */
 struct buf *
-geteblk()
+geteblk(size)
+	int size;
 {
-	register struct buf *bp, *dp, *ep;
-	register int i, x;
+	register struct buf *bp, *flist;

+	if (size > MAXBSIZE)
+		panic("geteblk: size too big");
 loop:
-	VOID spl6();
-	while (bfreelist.av_forw == &bfreelist) {
-		bfreelist.b_flags |= B_WANTED;
-		sleep((caddr_t)&bfreelist, PRIBIO+1);
-	}
-	VOID spl0();
-	dp = &bfreelist;
-	bp = bfreelist.av_forw;
-	notavail(bp);
-	if (bp->b_flags & B_DELWRI) {
-		bp->b_flags |= B_ASYNC;
-		bwrite(bp);
+	bp = getnewbuf();
+	bp->b_flags |= B_INVAL;
+	bfree(bp);
+	bremhash(bp);
+	flist = &bfreelist[BQ_AGE];
+	brelvp(bp);
+#ifdef SECSIZE
+	bp->b_blksize = DEV_BSIZE;
+#endif SECSIZE
+	bp->b_error = 0;
+	bp->b_resid = 0;
+	binshash(bp, flist);
+	if (brealloc(bp, size) == 0)
 		goto loop;
-	}
-	if (bp->b_dev != NODEV)
-		bunhash(bp);
-	bp->b_flags = B_BUSY;
-	bp->b_back->b_forw = bp->b_forw;
-	bp->b_forw->b_back = bp->b_back;
-	bp->b_forw = dp->b_forw;
-	bp->b_back = dp;
-	dp->b_forw->b_back = bp;
-	dp->b_forw = bp;
-	bp->b_dev = (dev_t)NODEV;
-	bp->b_hlink = -1;
-	return(bp);
+	return (bp);
 }

-bunhash(bp)
+/*
+ * Allocate space associated with a buffer.
+ * If can't get space, buffer is released
+ */
+brealloc(bp, size)
 	register struct buf *bp;
+	int size;
 {
+	daddr_t start, last;
 	register struct buf *ep;
-	register int i, x;
+	struct buf *dp;
+	int s;

-	if (bp->b_dev == NODEV)
-		return;
-	i = BUFHASH(dbtofsb(bp->b_blkno));
-	x = bp - buf;
-	if (bufhash[i] == x) {
-		bufhash[i] = bp->b_hlink;
-		return;
+	/*
+	 * First need to make sure that all overlapping previous I/O
+	 * is dispatched with.
+	 */
+	if (size == bp->b_bcount)
+		return (1);
+	if (size < bp->b_bcount) {
+		if (bp->b_flags & B_DELWRI) {
+			(void) bwrite(bp);
+			return (0);
+		}
+		if (bp->b_flags & B_LOCKED)
+			panic("brealloc");
+		return (allocbuf(bp, size));
 	}
-	for (ep = &buf[bufhash[i]]; ep != &buf[-1];
-	    ep = &buf[ep->b_hlink])
-		if (ep->b_hlink == x) {
-			ep->b_hlink = bp->b_hlink;
-			return;
+	bp->b_flags &= ~B_DONE;
+	if (bp->b_vp == (struct vnode *)0)
+		return (allocbuf(bp, size));
+
+	trace(TR_BREALLOC,
+	    pack(bp->b_vp->v_mount->m_fsid[0], size), bp->b_blkno);
+	/*
+	 * Search cache for any buffers that overlap the one that we
+	 * are trying to allocate. Overlapping buffers must be marked
+	 * invalid, after being written out if they are dirty. (indicated
+	 * by B_DELWRI) A disk block must be mapped by at most one buffer
+	 * at any point in time. Care must be taken to avoid deadlocking
+	 * when two buffer are trying to get the same set of disk blocks.
+	 */
+	start = bp->b_blkno;
+#ifdef SECSIZE
+	last = start + size/bp->b_blksize - 1;
+#else SECSIZE
+	last = start + btodb(size) - 1;
+#endif SECSIZE
+	dp = BUFHASH(bp->b_vp, bp->b_blkno);
+loop:
+	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
+		if (ep == bp || ep->b_vp != bp->b_vp ||
+		    (ep->b_flags & B_INVAL))
+			continue;
+		/* look for overlap */
+		if (ep->b_bcount == 0 || ep->b_blkno > last ||
+#ifdef SECSIZE
+		    ep->b_blkno + ep->b_bcount/ep->b_blksize <= start)
+#else SECSIZE
+		    ep->b_blkno + btodb(ep->b_bcount) <= start)
+#endif SECSIZE
+			continue;
+		s = splbio();
+		if (ep->b_flags&B_BUSY) {
+			ep->b_flags |= B_WANTED;
+			sleep((caddr_t)ep, PRIBIO+1);
+			splx(s);
+			goto loop;
 		}
-	panic("bunhash");
+		splx(s);
+		notavail(ep);
+		if (ep->b_flags & B_DELWRI) {
+			(void) bwrite(ep);
+			goto loop;
+		}
+		ep->b_flags |= B_INVAL;
+		brelse(ep);
+	}
+	return (allocbuf(bp, size));
 }
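
The disjointness test in brealloc() (used again in blkflush() further down) treats a buffer as covering device blocks b_blkno through b_blkno + btodb(b_bcount) - 1. A stand-alone restatement of that predicate with a few checks (DEV_BSHIFT == 9 is an assumption of the demo):

	#include <stdio.h>

	#define DEV_BSHIFT 9
	#define btodb(bytes) ((bytes) >> DEV_BSHIFT)	/* bytes to 512-byte blocks */

	static int
	overlaps(long blkno, long bcount, long start, long last)
	{
		/* the same skip test brealloc applies to each hash-chain entry */
		if (bcount == 0 || blkno > last || blkno + btodb(bcount) <= start)
			return 0;	/* disjoint */
		return 1;
	}

	int
	main(void)
	{
		/* a buffer at block 100 holding 8192 bytes covers blocks 100..115 */
		printf("%d\n", overlaps(100, 8192, 110, 120));	/* 1: 100..115 meets 110..120 */
		printf("%d\n", overlaps(100, 8192, 116, 120));	/* 0: buffer ends at 115 */
		printf("%d\n", overlaps(121, 8192, 110, 120));	/* 0: buffer begins past last */
		return 0;
	}
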

 /*
- * Wait for I/O completion on the buffer; return errors
- * to the user.
+ * Find a buffer which is available for use.
+ * Select something from a free list.
+ * Preference is to AGE list, then LRU list.
 */
-iowait(bp)
-register struct buf *bp;
+struct buf *
+getnewbuf()
 {
+	register struct buf *bp, *dp;
+	register struct ucred *cred;
+	int s;

-	VOID spl6();
-	while ((bp->b_flags&B_DONE)==0)
-		sleep((caddr_t)bp, PRIBIO);
-	VOID spl0();
-	geterror(bp);
+loop:
+	s = splbio();
+	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
+		if (dp->av_forw != dp)
+			break;
+	if (dp == bfreelist) {		/* no free blocks */
+		dp->b_flags |= B_WANTED;
+		sleep((caddr_t)dp, PRIBIO+1);
+		splx(s);
+		goto loop;
+	}
+	splx(s);
+	bp = dp->av_forw;
+	notavail(bp);
+	if (bp->b_flags & B_DELWRI) {
+		(void) bawrite(bp);
+		goto loop;
+	}
+	trace(TR_BRELSE,
+	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
+	brelvp(bp);
+	if (bp->b_rcred != NOCRED) {
+		cred = bp->b_rcred;
+		bp->b_rcred = NOCRED;
+		crfree(cred);
+	}
+	if (bp->b_wcred != NOCRED) {
+		cred = bp->b_wcred;
+		bp->b_wcred = NOCRED;
+		crfree(cred);
+	}
+	bp->b_flags = B_BUSY;
+	return (bp);
 }

-#ifndef FASTVAX
 /*
- * Unlink a buffer from the available list and mark it busy.
- * (internal interface)
+ * Wait for I/O completion on the buffer; return errors
+ * to the user.
 */
-notavail(bp)
-register struct buf *bp;
+biowait(bp)
+	register struct buf *bp;
 {
-	register s;
+	int s;

-	s = spl6();
-	bp->av_back->av_forw = bp->av_forw;
-	bp->av_forw->av_back = bp->av_back;
-	bp->b_flags |= B_BUSY;
+	s = splbio();
+	while ((bp->b_flags & B_DONE) == 0)
+		sleep((caddr_t)bp, PRIBIO);
 	splx(s);
+	/*
+	 * Pick up the device's error number and pass it to the user;
+	 * if there is an error but the number is 0 set a generalized code.
+	 */
+	if ((bp->b_flags & B_ERROR) == 0)
+		return (0);
+	if (bp->b_error)
+		return (bp->b_error);
+	return (EIO);
 }
-#endif

 /*
- * Mark I/O complete on a buffer. If the header
- * indicates a dirty page push completion, the
- * header is inserted into the ``cleaned'' list
- * to be processed by the pageout daemon. Otherwise
- * release it if I/O is asynchronous, and wake
- * up anyone waiting for it.
+ * Mark I/O complete on a buffer.
+ * If someone should be called, e.g. the pageout
+ * daemon, do so. Otherwise, wake up anyone
+ * waiting for it.
 */
-iodone(bp)
-register struct buf *bp;
+biodone(bp)
+	register struct buf *bp;
 {
-	register int s;

+	if (bp->b_flags & B_DONE)
+		panic("dup biodone");
 	bp->b_flags |= B_DONE;
-	if (bp->b_flags & B_DIRTY) {
-		if (bp->b_flags & B_ERROR)
-			panic("IO err in push");
-		s = spl6();
-		cnt.v_pgout++;
-		bp->av_forw = bclnlist;
-		bp->b_bcount = swsize[bp - swbuf];
-		bp->b_pfcent = swpf[bp - swbuf];
-		bclnlist = bp;
-		if (bswlist.b_flags & B_WANTED)
-			wakeup((caddr_t)&proc[2]);
-		splx(s);
+	if ((bp->b_flags & B_READ) == 0)
+		bp->b_dirtyoff = bp->b_dirtyend = 0;
+	if (bp->b_flags & B_CALL) {
+		bp->b_flags &= ~B_CALL;
+		(*bp->b_iodone)(bp);
+		return;
 	}
 	if (bp->b_flags&B_ASYNC)
 		brelse(bp);
@@ -511,247 +622,147 @@ register struct buf *bp;
 }

 /*
- * Zero the core associated with a buffer.
+ * Ensure that no part of a specified block is in an incore buffer.
+#ifdef SECSIZE
+ * "size" is given in device blocks (the units of b_blkno).
+#endif SECSIZE
 */
-clrbuf(bp)
-struct buf *bp;
+blkflush(vp, blkno, size)
+	struct vnode *vp;
+	daddr_t blkno;
+#ifdef SECSIZE
+	int size;
+#else SECSIZE
+	long size;
+#endif SECSIZE
 {
-	register *p;
-	register c;
-
-	p = bp->b_un.b_words;
-	c = BSIZE/sizeof(int);
-	do
-		*p++ = 0;
-	while (--c);
-	bp->b_resid = 0;
-}
-
-/*
- * swap I/O -
- *
- * If the flag indicates a dirty page push initiated
- * by the pageout daemon, we map the page into the i th
- * virtual page of process 2 (the daemon itself) where i is
- * the index of the swap header that has been allocated.
- * We simply initialize the header and queue the I/O but
- * do not wait for completion. When the I/O completes,
- * iodone() will link the header to a list of cleaned
- * pages to be processed by the pageout daemon.
- */ -swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent) - struct proc *p; - swblk_t dblkno; - caddr_t addr; - int flag, nbytes; - dev_t dev; - unsigned pfcent; -{ - register struct buf *bp; - register int c; - int p2dp; - register struct pte *dpte, *vpte; - - VOID spl6(); - while (bswlist.av_forw == NULL) { - bswlist.b_flags |= B_WANTED; - sleep((caddr_t)&bswlist, PSWP+1); - } - bp = bswlist.av_forw; - bswlist.av_forw = bp->av_forw; - VOID spl0(); - - bp->b_flags = B_BUSY | B_PHYS | rdflg | flag; - if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0) - if (rdflg == B_READ) - sum.v_pswpin += btoc(nbytes); - else - sum.v_pswpout += btoc(nbytes); - bp->b_proc = p; - if (flag & B_DIRTY) { - p2dp = ((bp - swbuf) * CLSIZE) * KLMAX; - dpte = dptopte(&proc[2], p2dp); - vpte = vtopte(p, btop(addr)); - for (c = 0; c < nbytes; c += NBPG) { - if (vpte->pg_pfnum == 0 || vpte->pg_fod) - panic("swap bad pte"); - *dpte++ = *vpte++; - } - bp->b_un.b_addr = (caddr_t)ctob(p2dp); - } else - bp->b_un.b_addr = addr; - while (nbytes > 0) { - c = imin(ctob(120), nbytes); - bp->b_bcount = c; - bp->b_blkno = dblkno; - bp->b_dev = dev; - if (dev == swapdev) - bp->b_blkno += swplo; - (*bdevsw[major(dev)].d_strategy)(bp); - if (flag & B_DIRTY) { - if (c < nbytes) - panic("big push"); - swsize[bp - swbuf] = nbytes; - swpf[bp - swbuf] = pfcent; - return; + register struct buf *ep; + struct buf *dp; + daddr_t start, last; + int s, error, allerrors = 0; + + start = blkno; +#ifdef SECSIZE + last = start + size - 1; +#else SECSIZE + last = start + btodb(size) - 1; +#endif SECSIZE + dp = BUFHASH(vp, blkno); +loop: + for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) { + if (ep->b_vp != vp || (ep->b_flags & B_INVAL)) + continue; + /* look for overlap */ + if (ep->b_bcount == 0 || ep->b_blkno > last || +#ifdef SECSIZE + ep->b_blkno + ep->b_bcount / ep->b_blksize <= start) +#else SECSIZE + ep->b_blkno + btodb(ep->b_bcount) <= start) +#endif SECSIZE + continue; + s = splbio(); + if (ep->b_flags&B_BUSY) { + ep->b_flags |= B_WANTED; + sleep((caddr_t)ep, PRIBIO+1); + splx(s); + goto loop; } - VOID spl6(); - while((bp->b_flags&B_DONE)==0) - sleep((caddr_t)bp, PSWP); - VOID spl0(); - bp->b_un.b_addr += c; - bp->b_flags &= ~B_DONE; - if (bp->b_flags & B_ERROR) { - if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE) - panic("hard IO err in swap"); - swkill(p, (char *)0); + if (ep->b_flags & B_DELWRI) { + splx(s); + notavail(ep); + if (error = bwrite(ep)) + allerrors = error; + goto loop; } - nbytes -= c; - dblkno += btoc(c); - } - VOID spl6(); - bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY); - bp->av_forw = bswlist.av_forw; - bswlist.av_forw = bp; - if (bswlist.b_flags & B_WANTED) { - bswlist.b_flags &= ~B_WANTED; - wakeup((caddr_t)&bswlist); - wakeup((caddr_t)&proc[2]); + splx(s); } - VOID spl0(); + return (allerrors); } /* - * If rout == 0 then killed on swap error, else - * rout is the name of the routine where we ran out of - * swap space. + * Make sure all write-behind blocks associated + * with mount point are flushed out (from sync). */ -swkill(p, rout) - struct proc *p; - char *rout; +bflush(mountp) + struct mount *mountp; { + register struct buf *bp; + register struct buf *flist; + int s; - printf("%d: ", p->p_pid); - if (rout) - printf("out of swap space in %s\n", rout); - else - printf("killed on swap error\n"); - /* - * To be sure no looping (e.g. in vmsched trying to - * swap out) mark process locked in core (as though - * done by user) after killing it so noone will try - * to swap it out. 
- */ - psignal(p, SIGKIL); - p->p_flag |= SULOCK; +loop: + s = splbio(); + for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++) { + for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) { + if ((bp->b_flags & B_BUSY)) + continue; + if ((bp->b_flags & B_DELWRI) == 0) + continue; + if (bp->b_vp && bp->b_vp->v_mount == mountp) { + notavail(bp); + (void) bawrite(bp); + splx(s); + goto loop; + } + } + } + splx(s); } /* - * make sure all write-behind blocks - * on dev (or NODEV for all) - * are flushed out. - * (from umount and update) + * Invalidate in core blocks belonging to closed or umounted filesystem + * + * We walk through the buffer pool and invalidate any buffers for the + * indicated mount point. Normally this routine is preceeded by a bflush + * call, so that on a quiescent filesystem there will be no dirty + * buffers when we are done. We return the count of dirty buffers when + * we are finished. */ -bflush(dev) -dev_t dev; +binval(mountp) + struct mount *mountp; { register struct buf *bp; + register struct bufhd *hp; + int s, dirty = 0; +#define dp ((struct buf *)hp) loop: - VOID spl6(); - for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) { - if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) { - bp->b_flags |= B_ASYNC; + s = splbio(); + for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++) { + for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { + if (bp->b_vp == NULL || bp->b_vp->v_mount != mountp) + continue; + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + sleep((caddr_t)bp, PRIBIO+1); + splx(s); + goto loop; + } notavail(bp); - bwrite(bp); - goto loop; + if (bp->b_flags & B_DELWRI) { + (void) bawrite(bp); + dirty++; + continue; + } + bp->b_flags |= B_INVAL; + brelvp(bp); + brelse(bp); } } - VOID spl0(); + return (dirty); } -/* - * Raw I/O. The arguments are - * The strategy routine for the device - * A buffer, which will always be a special buffer - * header owned exclusively by the device for this purpose - * The device number - * Read/write flag - * Essentially all the work is computing physical addresses and - * validating them. - * If the user has the proper access privilidges, the process is - * marked 'delayed unlock' and the pages involved in the I/O are - * faulted and locked. After the completion of the I/O, the above pages - * are unlocked. 
- */
-physio(strat, bp, dev, rw, mincnt)
-int (*strat)();
-register struct buf *bp;
-unsigned (*mincnt)();
+brelvp(bp)
+	struct buf *bp;
 {
-	register int c;
-	char *a;
+	struct vnode *vp;

-	if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
-		u.u_error = EFAULT;
+	if (bp->b_vp == (struct vnode *) 0)
 		return;
-	}
-	VOID spl6();
-	while (bp->b_flags&B_BUSY) {
-		bp->b_flags |= B_WANTED;
-		sleep((caddr_t)bp, PRIBIO+1);
-	}
-	bp->b_error = 0;
-	bp->b_proc = u.u_procp;
-	bp->b_un.b_addr = u.u_base;
-	while (u.u_count != 0 && bp->b_error==0) {
-		bp->b_flags = B_BUSY | B_PHYS | rw;
-		bp->b_dev = dev;
-		bp->b_blkno = u.u_offset >> PGSHIFT;
-		bp->b_bcount = u.u_count;
-		(*mincnt)(bp);
-		c = bp->b_bcount;
-		u.u_procp->p_flag |= SPHYSIO;
-		vslock(a = bp->b_un.b_addr, c);
-		(*strat)(bp);
-		VOID spl6();
-		while ((bp->b_flags&B_DONE) == 0)
-			sleep((caddr_t)bp, PRIBIO);
-		vsunlock(a, c, rw);
-		u.u_procp->p_flag &= ~SPHYSIO;
-		if (bp->b_flags&B_WANTED)
-			wakeup((caddr_t)bp);
-		VOID spl0();
-		bp->b_un.b_addr += c;
-		u.u_count -= c;
-		u.u_offset += c;
-	}
-	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
-	u.u_count = bp->b_resid;
-	geterror(bp);
-}
-
-/*ARGSUSED*/
-unsigned
-minphys(bp)
-struct buf *bp;
-{
-
-	if (bp->b_bcount > 60 * 1024)
-		bp->b_bcount = 60 * 1024;
-}
-
-/*
- * Pick up the device's error number and pass it to the user;
- * if there is an error but the number is 0 set a generalized
- * code.  Actually the latter is always true because devices
- * don't yet return specific errors.
- */
-geterror(bp)
-register struct buf *bp;
-{
-
-	if (bp->b_flags&B_ERROR)
-		if ((u.u_error = bp->b_error)==0)
-			u.u_error = EIO;
+	vp = bp->b_vp;
+	bp->b_vp = (struct vnode *) 0;
+	vrele(vp);
 }
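
The B_CALL/b_iodone hook added in biodone() earlier in this diff is the one place the new code dispatches to a caller-supplied completion routine instead of releasing the buffer or waking sleepers. A stand-alone sketch of that dispatch order (the types and flag values are reduced to a minimum and invented for the demo):

	#include <stdio.h>

	#define B_DONE  0x1
	#define B_ASYNC 0x2
	#define B_CALL  0x4

	struct fakebuf {
		int b_flags;
		void (*b_iodone)(struct fakebuf *);
	};

	static void
	fake_biodone(struct fakebuf *bp)
	{
		bp->b_flags |= B_DONE;
		if (bp->b_flags & B_CALL) {
			bp->b_flags &= ~B_CALL;
			(*bp->b_iodone)(bp);	/* hand off; no brelse, no wakeup */
			return;
		}
		/* otherwise: brelse() if async, wakeup() any sleepers */
	}

	static void
	on_complete(struct fakebuf *bp)
	{
		printf("i/o complete, flags 0x%x\n", bp->b_flags);
	}

	int
	main(void)
	{
		struct fakebuf b = { B_ASYNC | B_CALL, on_complete };

		fake_biodone(&b);	/* prints: i/o complete, flags 0x3 */
		return 0;
	}
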