/*	vfs_bio.c	4.31	82/05/22	*/

/* merged into kernel:	@(#)bio.c 2.3 4/8/82 */

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 * A typical cycle is sketched below.
 */

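/*
 * Editor's sketch (not part of the original source): the canonical
 * allocate/release cycle the routines below support.  A file system
 * routine that reads, modifies and lazily rewrites one block might
 * look like this; "dev", "blkno" and "bsize" are assumed valid.
 */
#ifdef notdef
	struct buf *bp;

	bp = bread(dev, blkno, bsize);	/* buffer comes back busy, filled */
	if (u.u_error) {		/* read errors arrive via geterror() */
		brelse(bp);
		return;
	}
	/* ... modify the data at bp->b_un.b_addr ... */
	bdwrite(bp);			/* delayed write; releases the buffer */
#endif
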
struct	buf bfreelist[BQUEUES];
struct	buf bswlist, *bclnlist;

#define	BUFHSZ	63
#define	RND	(MAXBSIZE/DEV_BSIZE)
struct	bufhd bufhash[BUFHSZ];
#define	BUFHASH(dev, dblkno)	\
	((struct buf *)&bufhash[((int)(dev)+(((int)(dblkno))/RND)) % BUFHSZ])
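
/*
 * Editor's example (not in the original source): BUFHASH folds the
 * device and the block number, divided down by RND so that all the
 * DEV_BSIZE pieces of one MAXBSIZE block land on the same chain,
 * into one of BUFHSZ chain heads.  Assuming MAXBSIZE 1024 and
 * DEV_BSIZE 512 (so RND == 2), blocks 100 and 101 of device 0 both
 * hash to chain (0 + 100/2) % 63 == 50.
 */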

/*
 * Initialize hash links for buffers.
 */
bhinit()
{
	register int i;
	register struct bufhd *bp;

	for (bp = bufhash, i = 0; i < BUFHSZ; i++, bp++)
		bp->b_forw = bp->b_back = (struct buf *)bp;
}

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[64];
} io_info;
#endif

/*
 * Unlink a buffer from the available list and mark it busy
 * (internal interface; fast macro version of the notavail()
 * routine at the bottom of this file, used unless UNFAST).
 */
#ifndef UNFAST
#define	notavail(bp) \
{ \
	int x = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(x); \
}
#endif

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
#ifdef	TRACE
		trace(TR_BREADHIT, dev, blkno);
#endif
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	TRACE
	trace(TR_BREADMISS, dev, blkno);
#endif
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno, size)
	dev_t dev;
	daddr_t blkno, rablkno;
	int size;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	TRACE
			trace(TR_BREADMISS, dev, blkno);
#endif
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;	/* pay for read */
		}
#ifdef	TRACE
		else
			trace(TR_BREADHIT, dev, blkno);
#endif
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, size);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
#ifdef	TRACE
			trace(TR_BREADHITRA, dev, rablkno);
#endif
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	TRACE
			trace(TR_BREADMISSRA, dev, rablkno);
#endif
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;	/* pay in advance */
		}
	}
	if (bp == NULL)
		return (bread(dev, blkno, size));
	iowait(bp);
	return(bp);
}
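
/*
 * Editor's sketch (not part of the original source): a sequential
 * reader uses breada to overlap the transfer of the next block with
 * its own use of the current one.  "dev", "bn" and "bsize" are
 * hypothetical; the read-ahead block is simply the next one.
 */
#ifdef notdef
	struct buf *bp;

	bp = breada(dev, bn, bn + bsize / DEV_BSIZE, bsize);
	/* consume bp; the following block is (probably) already in flight */
	brelse(bp);
#endif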

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
#ifdef	TRACE
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
#endif
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
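
/*
 * Editor's note (not in the original source): the three write paths
 * trade safety against latency.  bwrite waits and reports errors
 * now; bawrite starts the I/O and returns; bdwrite merely marks the
 * buffer B_DELWRI, deferring the I/O until the buffer is reclaimed
 * or bflush runs.  A partial block likely to be written again soon
 * is the bdwrite case:
 */
#ifdef notdef
	struct buf *bp;

	bp = bread(dev, blkno, bsize);
	/* ... append to the partially filled block ... */
	bdwrite(bp);		/* another write will probably follow */
#endif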

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
register struct buf *bp;
{
	register struct buf *flist;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */
	s = spl6();
	if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQUEUES-1];
		flist->av_forw->av_back = bp;
		bp->av_forw = flist->av_forw;
		flist->av_forw = bp;
		bp->av_back = flist;
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		flist->av_back->av_forw = bp;
		bp->av_back = flist->av_back;
		flist->av_back = bp;
		bp->av_forw = flist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    !(bp->b_flags & B_INVAL))
			return (1);
	return (0);
}

/*
 * Retrieve the given block if it is already in core;
 * otherwise return 0 without doing any I/O.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp, *ep;
#ifdef DISKMON
	register int i;
#endif
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while ((dp->b_flags & B_HEAD) == 0) {
			i++;
			dp = dp->av_forw;
		}
		if (i < 64)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		brealloc(bp, size);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	s = spl6();
	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
		if (ep->av_forw != ep)
			break;
	if (ep == bfreelist) {		/* no free blocks at all */
		ep->b_flags |= B_WANTED;
		sleep((caddr_t)ep, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = ep->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
#ifdef	TRACE
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
#endif
	bp->b_flags = B_BUSY;
	bfree(bp);
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	brealloc(bp, size);
	return(bp);
}
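
/*
 * Editor's sketch (not part of the original source): when a block
 * will be overwritten in its entirety there is nothing to read, so
 * allocation code takes the buffer directly and zeroes it.
 * "newblkno" is hypothetical.
 */
#ifdef notdef
	struct buf *bp;

	bp = getblk(dev, newblkno, bsize);	/* no read started */
	clrbuf(bp);				/* zero the fresh block */
	bdwrite(bp);
#endif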

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
#ifdef	TRACE
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
#endif
	bp->b_flags = B_BUSY|B_INVAL;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	bp->b_bcount = size;
	return(bp);
}
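
/*
 * Editor's sketch (not part of the original source): geteblk yields
 * scratch space with no device association, useful for staging data
 * that never goes back through the cache.
 */
#ifdef notdef
	struct buf *bp;

	bp = geteblk(bsize);
	/* ... use bp->b_un.b_addr as scratch memory ... */
	brelse(bp);	/* B_INVAL sends it to the front of a free list */
#endif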

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that all overlapping
	 * previous I/O is disposed of.
	 */
	if (size == bp->b_bcount)
		return;
	if (size < bp->b_bcount) {
		bp->b_bcount = size;
		return;
	}
	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
	if (bp->b_bcount == 0) {
		start++;
		if (start == last)
			goto allocit;
	}
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	(void) spl0();
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_blkno < start || ep->b_blkno > last ||
		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		(void) spl0();
		/*
		 * What we would really like to do is kill this
		 * I/O since it is now useless.  We cannot do that
		 * so we force it to complete, so that it cannot
		 * over-write our useful data later.
		 */
		if (ep->b_flags & B_DELWRI) {
			notavail(ep);
			ep->b_flags |= B_ASYNC;
			bwrite(ep);
			goto loop;
		}
	}
allocit:
	/*
	 * Here the buffer is already available, so all we
	 * need to do is set the size.  Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = size;
}

/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * Here the buffer does not change, so all we
	 * need to do is set the size.  Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = 0;
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	geterror(bp);
}

#ifdef UNFAST
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup iodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
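
/*
 * Editor's sketch (not part of the original source): a block driver's
 * interrupt routine finishes a transfer by recording any error and
 * calling iodone, which releases async buffers or wakes the process
 * sleeping in iowait.  "hard_error" is a hypothetical device status.
 */
#ifdef notdef
	if (hard_error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	bp->b_resid = 0;		/* bytes not transferred */
	iodone(bp);
#endif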

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
	struct buf *bp;
{
	register int *p;
	register int c;

	p = bp->b_un.b_words;
	c = bp->b_bcount/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 * (and temporarily pagein)
 */
bflush(dev)
dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev == bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error) == 0)
			u.u_error = EIO;
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty,
 * then properly flush the queues.  Until that happy day, this suffices
 * for correctness. ... kre
 */
binval(dev)
dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define	dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}