/* vfs_bio.c 6.1 83/07/29 */

#include "../machine/pte.h"

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/vm.h"
#include "../h/trace.h"
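
/*
 * Overview of the structures used below (see ../h/buf.h):
 *
 * Buffers associated with a (b_dev, b_blkno) pair hang on hash
 * chains headed by BUFHASH(dev, blkno); incore(), getblk() and
 * brealloc() search these chains through the b_forw links.
 *
 * Buffers that are not B_BUSY also sit on one of the free queues
 * bfreelist[BQ_LOCKED], [BQ_LRU], [BQ_AGE] or [BQ_EMPTY], linked
 * through av_forw; brelse() puts buffers back, notavail() takes
 * them off, and getnewbuf() reclaims victims for reuse.
 *
 * Waiters for a particular busy buffer set B_WANTED on that buffer
 * and sleep on it; waiters for "any buffer at all" sleep on
 * bfreelist[0].  They are awakened when the buffer is released
 * (brelse) or its I/O completes (biodone).
 */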

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{
        register struct buf *bp;

        if (size == 0)
                panic("bread: size 0");
        bp = getblk(dev, blkno, size);
        if (bp->b_flags&B_DONE) {
                trace(TR_BREADHIT, dev, blkno);
                return(bp);
        }
        bp->b_flags |= B_READ;
        if (bp->b_bcount > bp->b_bufsize)
                panic("bread");
        (*bdevsw[major(dev)].d_strategy)(bp);
        trace(TR_BREADMISS, dev, blkno);
        u.u_ru.ru_inblock++;            /* pay for read */
        biowait(bp);
        return(bp);
}
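
/*
 * Typical use by a caller (illustrative only, not code from this
 * file): read a block, check for an error, and release the buffer
 * when done with it:
 *
 *      bp = bread(dev, blkno, size);
 *      if (bp->b_flags & B_ERROR) {
 *              brelse(bp);
 *              return;
 *      }
 *      ... use the data in the buffer ...
 *      brelse(bp);
 *
 * bread() returns with the buffer held (B_BUSY); it stays with the
 * caller until brelse(), bwrite(), bdwrite() or bawrite() is called.
 */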

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
        dev_t dev;
        daddr_t blkno; int size;
        daddr_t rablkno; int rabsize;
{
        register struct buf *bp, *rabp;

        bp = NULL;
        /*
         * If the block isn't in core, then allocate
         * a buffer and initiate i/o (getblk checks
         * for a cache hit).
         */
        if (!incore(dev, blkno)) {
                bp = getblk(dev, blkno, size);
                if ((bp->b_flags&B_DONE) == 0) {
                        bp->b_flags |= B_READ;
                        if (bp->b_bcount > bp->b_bufsize)
                                panic("breada");
                        (*bdevsw[major(dev)].d_strategy)(bp);
                        trace(TR_BREADMISS, dev, blkno);
                        u.u_ru.ru_inblock++;            /* pay for read */
                } else
                        trace(TR_BREADHIT, dev, blkno);
        }

        /*
         * If there's a read-ahead block, start i/o
         * on it also (as above).
         */
        if (rablkno && !incore(dev, rablkno)) {
                rabp = getblk(dev, rablkno, rabsize);
                if (rabp->b_flags & B_DONE) {
                        brelse(rabp);
                        trace(TR_BREADHITRA, dev, rablkno);
                } else {
                        rabp->b_flags |= B_READ|B_ASYNC;
                        if (rabp->b_bcount > rabp->b_bufsize)
                                panic("breadrabp");
                        (*bdevsw[major(dev)].d_strategy)(rabp);
                        trace(TR_BREADMISSRA, dev, rablkno);
                        u.u_ru.ru_inblock++;            /* pay in advance */
                }
        }

        /*
         * If the block was in core, let bread get it.
         * If it wasn't, the read was started above,
         * so just wait for it to finish.
         */
        if (bp == NULL)
                return (bread(dev, blkno, size));
        biowait(bp);
        return (bp);
}
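
/*
 * Note on the read-ahead buffer above: it is started B_ASYNC and is
 * never handed to the caller.  When the asynchronous read finishes,
 * biodone() sees B_ASYNC and calls brelse() itself, so the buffer
 * simply ends up in the cache (or, on error, back on a free list
 * with its device association cleared by brelse()).
 */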

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
        register struct buf *bp;
{
        register flag;

        flag = bp->b_flags;
        bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
        if ((flag&B_DELWRI) == 0)
                u.u_ru.ru_oublock++;            /* no one paid yet */
        trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
        if (bp->b_bcount > bp->b_bufsize)
                panic("bwrite");
        (*bdevsw[major(bp->b_dev)].d_strategy)(bp);

        /*
         * If the write was synchronous, then await i/o completion.
         * If the write was "delayed", then we put the buffer on
         * the q of blocks awaiting i/o completion status.
         */
        if ((flag&B_ASYNC) == 0) {
                biowait(bp);
                brelse(bp);
        } else if (flag & B_DELWRI)
                bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
        register struct buf *bp;
{
        register int flags;

        if ((bp->b_flags&B_DELWRI) == 0)
                u.u_ru.ru_oublock++;            /* no one paid yet */
        flags = bdevsw[major(bp->b_dev)].d_flags;
        if (flags & B_TAPE)
                bawrite(bp);
        else {
                bp->b_flags |= B_DELWRI | B_DONE;
                brelse(bp);
        }
}
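
/*
 * The three ways a dirty buffer can go back to the device:
 *
 *      bwrite(bp)      start the write and wait for it (unless the
 *                      caller already set B_ASYNC).
 *      bawrite(bp)     set B_ASYNC and start the write; do not wait.
 *      bdwrite(bp)     just mark the buffer B_DELWRI | B_DONE and
 *                      brelse() it; the actual write happens later,
 *                      when the buffer is reclaimed (getnewbuf) or
 *                      flushed (bflush), except on tape, where
 *                      ordering forces an immediate bawrite().
 */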

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
        register struct buf *bp;
{

        bp->b_flags |= B_ASYNC;
        bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
        register struct buf *bp;
{
        register struct buf *flist;
        register s;

        /*
         * If someone's waiting for this buffer, or
         * waiting for any buffer to come free, wake 'em up.
         */
        if (bp->b_flags&B_WANTED)
                wakeup((caddr_t)bp);
        if (bfreelist[0].b_flags&B_WANTED) {
                bfreelist[0].b_flags &= ~B_WANTED;
                wakeup((caddr_t)bfreelist);
        }
        if (bp->b_flags&B_ERROR)
                if (bp->b_flags & B_LOCKED)
                        bp->b_flags &= ~B_ERROR;        /* try again later */
                else
                        bp->b_dev = NODEV;              /* no assoc */

        /*
         * Stick the buffer back on a free list.
         */
        s = spl6();
        if (bp->b_bufsize <= 0) {
                /* block has no buffer ... put at front of unused buffer list */
                flist = &bfreelist[BQ_EMPTY];
                binsheadfree(bp, flist);
        } else if (bp->b_flags & (B_ERROR|B_INVAL)) {
                /* block has no info ... put at front of most free list */
                flist = &bfreelist[BQ_AGE];
                binsheadfree(bp, flist);
        } else {
                if (bp->b_flags & B_LOCKED)
                        flist = &bfreelist[BQ_LOCKED];
                else if (bp->b_flags & B_AGE)
                        flist = &bfreelist[BQ_AGE];
                else
                        flist = &bfreelist[BQ_LRU];
                binstailfree(bp, flist);
        }
        bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
        splx(s);
}
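
/*
 * Free list ordering: getnewbuf() takes victims from the front of a
 * queue (av_forw) and prefers BQ_AGE to BQ_LRU, so buffers put at
 * the head of a queue above (binsheadfree: empty, errored or invalid
 * buffers) are reused first, B_AGE buffers cycle through the tail of
 * the AGE queue, and ordinary buffers age through the LRU queue.
 * BQ_LOCKED and BQ_EMPTY buffers are never reclaimed by getnewbuf().
 */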

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;

        dp = BUFHASH(dev, blkno);
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
                if (bp->b_blkno == blkno && bp->b_dev == dev &&
                    (bp->b_flags & B_INVAL) == 0)
                        return (1);
        return (0);
}
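
/*
 * Return the given block if it is already in core, reading it in
 * via bread(); return 0 if it is not cached.  (No I/O is ever
 * started for a miss.)
 */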
struct buf *
baddr(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{

        if (incore(dev, blkno))
                return (bread(dev, blkno, size));
        return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{
        register struct buf *bp, *dp;
        int s;

        if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))        /* XXX */
                blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
        /*
         * Search the cache for the block.  If we hit, but
         * the buffer is in use for i/o, then we wait until
         * the i/o has completed.
         */
        dp = BUFHASH(dev, blkno);
loop:
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno != blkno || bp->b_dev != dev ||
                    bp->b_flags&B_INVAL)
                        continue;
                s = spl6();
                if (bp->b_flags&B_BUSY) {
                        bp->b_flags |= B_WANTED;
                        sleep((caddr_t)bp, PRIBIO+1);
                        splx(s);
                        goto loop;
                }
                splx(s);
                notavail(bp);
                if (brealloc(bp, size) == 0)
                        goto loop;
                bp->b_flags |= B_CACHE;
                return(bp);
        }
        if (major(dev) >= nblkdev)
                panic("blkdev");
        bp = getnewbuf();
        bfree(bp);
        bremhash(bp);
        binshash(bp, dp);
        bp->b_dev = dev;
        bp->b_blkno = blkno;
        bp->b_error = 0;
        if (brealloc(bp, size) == 0)
                goto loop;
        return(bp);
}
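
/*
 * getblk() returns a B_BUSY buffer assigned to (dev, blkno).
 * B_CACHE is set when the block was found in the cache; the data is
 * known to be valid only when B_DONE is also set, which is why
 * bread() issues a read when B_DONE is clear.  If brealloc() has to
 * give up the buffer while resizing it, the whole lookup is retried
 * from "loop".
 */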

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
        int size;
{
        register struct buf *bp, *flist;

loop:
        bp = getnewbuf();
        bp->b_flags |= B_INVAL;
        bfree(bp);
        bremhash(bp);
        flist = &bfreelist[BQ_AGE];
        binshash(bp, flist);
        bp->b_dev = (dev_t)NODEV;
        bp->b_error = 0;
        if (brealloc(bp, size) == 0)
                goto loop;
        return(bp);
}

/*
 * Allocate space associated with a buffer.
 * If space cannot be obtained, the buffer is released.
 */
brealloc(bp, size)
        register struct buf *bp;
        int size;
{
        daddr_t start, last;
        register struct buf *ep;
        struct buf *dp;
        int s;

        /*
         * First we need to make sure that any overlapping
         * previous I/O is disposed of.
         */
        if (size == bp->b_bcount)
                return (1);
        if (size < bp->b_bcount) {
                if (bp->b_flags & B_DELWRI) {
                        bwrite(bp);
                        return (0);
                }
                if (bp->b_flags & B_LOCKED)
                        panic("brealloc");
                return (allocbuf(bp, size));
        }
        bp->b_flags &= ~B_DONE;
        if (bp->b_dev == NODEV)
                return (allocbuf(bp, size));

        /*
         * Search the cache for any buffers that overlap the one that we
         * are trying to allocate.  Overlapping buffers must be marked
         * invalid, after being written out if they are dirty (indicated
         * by B_DELWRI).  A disk block must be mapped by at most one buffer
         * at any point in time.  Care must be taken to avoid deadlocking
         * when two buffers are trying to get the same set of disk blocks.
         */
        start = bp->b_blkno;
        last = start + btodb(size) - 1;
        dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
        for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
                if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
                        continue;
                /* look for overlap */
                if (ep->b_bcount == 0 || ep->b_blkno > last ||
                    ep->b_blkno + btodb(ep->b_bcount) <= start)
                        continue;
                s = spl6();
                if (ep->b_flags&B_BUSY) {
                        ep->b_flags |= B_WANTED;
                        sleep((caddr_t)ep, PRIBIO+1);
                        splx(s);
                        goto loop;
                }
                splx(s);
                notavail(ep);
                if (ep->b_flags & B_DELWRI) {
                        bwrite(ep);
                        goto loop;
                }
                ep->b_flags |= B_INVAL;
                brelse(ep);
        }
        return (allocbuf(bp, size));
}
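
/*
 * Overlap arithmetic above, worked through (btodb() converts a byte
 * count to disk blocks; with the usual DEV_BSIZE of 512, btodb(4096)
 * is 8): growing a buffer at b_blkno 32 to 4096 bytes gives
 * start = 32, last = 32 + 8 - 1 = 39.  Another buffer at b_blkno 36
 * with b_bcount 2048 spans blocks 36..39, so it passes neither
 * "ep->b_blkno > last" nor "ep->b_blkno + btodb(ep->b_bcount) <=
 * start" and must be written out (if B_DELWRI) and invalidated
 * before the allocation proceeds.
 */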

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
        register struct buf *bp, *dp;
        int s;

loop:
        s = spl6();
        for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
                if (dp->av_forw != dp)
                        break;
        if (dp == bfreelist) {          /* no free blocks */
                dp->b_flags |= B_WANTED;
                sleep((caddr_t)dp, PRIBIO+1);
                splx(s);
                goto loop;
        }
        splx(s);
        bp = dp->av_forw;
        notavail(bp);
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags |= B_ASYNC;
                bwrite(bp);
                goto loop;
        }
        trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
        bp->b_flags = B_BUSY;
        return (bp);
}
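
/*
 * Note that a victim found with B_DELWRI is not handed out: its
 * delayed write is pushed asynchronously (B_ASYNC + bwrite) and the
 * scan starts over, so callers of getnewbuf() only ever see clean
 * buffers, already off the free lists and marked B_BUSY.
 */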

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
        register struct buf *bp;
{
        int s;

        s = spl6();
        while ((bp->b_flags&B_DONE)==0)
                sleep((caddr_t)bp, PRIBIO);
        splx(s);
        if (u.u_error == 0)                     /* XXX */
                u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
        register struct buf *bp;
{

        if (bp->b_flags & B_DONE)
                panic("dup biodone");
        bp->b_flags |= B_DONE;
        if (bp->b_flags & B_CALL) {
                bp->b_flags &= ~B_CALL;
                (*bp->b_iodone)(bp);
                return;
        }
        if (bp->b_flags&B_ASYNC)
                brelse(bp);
        else {
                bp->b_flags &= ~B_WANTED;
                wakeup((caddr_t)bp);
        }
}
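
/*
 * Completion protocol: whatever code finishes the transfer started
 * through d_strategy is expected to call biodone() exactly once per
 * buffer (hence the "dup biodone" panic).  A synchronous caller is
 * sitting in biowait(), sleeping until B_DONE appears; an
 * asynchronous buffer is released here directly; a B_CALL buffer
 * instead has its b_iodone function called (used, per the comment
 * above, by the pageout daemon).
 */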

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        long size;
{
        register struct buf *ep;
        struct buf *dp;
        daddr_t start, last;
        int s;

        start = blkno;
        last = start + btodb(size) - 1;
        dp = BUFHASH(dev, blkno);
loop:
        for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
                if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
                        continue;
                /* look for overlap */
                if (ep->b_bcount == 0 || ep->b_blkno > last ||
                    ep->b_blkno + btodb(ep->b_bcount) <= start)
                        continue;
                s = spl6();
                if (ep->b_flags&B_BUSY) {
                        ep->b_flags |= B_WANTED;
                        sleep((caddr_t)ep, PRIBIO+1);
                        splx(s);
                        goto loop;
                }
                if (ep->b_flags & B_DELWRI) {
                        splx(s);
                        notavail(ep);
                        bwrite(ep);
                        goto loop;
                }
                splx(s);
        }
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
        dev_t dev;
{
        register struct buf *bp;
        register struct buf *flist;
        int s;

loop:
        s = spl6();
        for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
                for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
                        if ((bp->b_flags & B_DELWRI) == 0)
                                continue;
                        if (dev == NODEV || dev == bp->b_dev) {
                                bp->b_flags |= B_ASYNC;
                                notavail(bp);
                                bwrite(bp);
                                splx(s);
                                goto loop;
                        }
                }
        splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
        register struct buf *bp;
{
        int error = 0;

        if (bp->b_flags&B_ERROR)
                if ((error = bp->b_error)==0)
                        return (EIO);
        return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness.                                         ... kre
 */
binval(dev)
        dev_t dev;
{
        register struct buf *bp;
        register struct bufhd *hp;
#define dp ((struct buf *)hp)

        for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
                for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
                        if (bp->b_dev == dev)
                                bp->b_flags |= B_INVAL;
}