/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	7.1 (Berkeley) %G%
 */

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}
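
/*
 * A minimal usage sketch for bread (hypothetical caller; "exdev" and
 * "exblkno" are assumed names, not kernel symbols): read a block,
 * check for an error, use the data, release the buffer.
 *
 *	struct buf *bp;
 *
 *	bp = bread(exdev, exblkno, DEV_BSIZE);
 *	if ((bp->b_flags & B_ERROR) == 0)
 *		... use bp->b_un.b_addr, bp->b_bcount bytes ...
 *	brelse(bp);
 */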

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If the block was in core, let bread get it.
	 * If it wasn't, the read was started above, so
	 * just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}
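
/*
 * A usage sketch for breada (hypothetical caller; "exdev", "lbn" and
 * "bsize" are assumed names): a sequential reader overlaps the
 * transfer of the next block with use of the current one.  Since
 * block numbers are in DEV_BSIZE units, the next block of a
 * bsize-byte buffer is btodb(bsize) blocks away.
 *
 *	struct buf *bp;
 *
 *	bp = breada(exdev, lbn, bsize, lbn + btodb(bsize), bsize);
 *	... use bp; the read-ahead is already in progress, so the
 *	... next bread should hit in the cache ...
 *	brelse(bp);
 */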

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
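
/*
 * A sketch of how the three write flavors above trade durability
 * against latency, given a dirty buffer bp:
 *
 *	bwrite(bp);	synchronous; sleeps in biowait until the
 *			transfer completes, then releases bp.
 *	bawrite(bp);	starts the same I/O but returns at once;
 *			biodone releases the buffer (B_ASYNC).
 *	bdwrite(bp);	no I/O now; marks bp B_DELWRI|B_DONE and
 *			releases it, so the write happens when the
 *			buffer is reclaimed or bflush runs.
 */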

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for this buffer, or someone's
	 * waiting for any free buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}
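
/*
 * A note on the queue discipline above: buffers likely to be reused
 * go to the tail of the LRU queue and are reclaimed last; buffers
 * whose contents are worthless (B_ERROR|B_INVAL) or unwanted (B_AGE)
 * are placed where they will be reclaimed first.  For example, after
 *
 *	bp->b_flags |= B_AGE;
 *	brelse(bp);
 *
 * the buffer lands on the AGE queue, which getnewbuf (below) scans
 * before the LRU queue.
 */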

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * If the block is already in the cache, read it and return the
 * buffer; otherwise return 0 without starting any I/O.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}
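
/*
 * A usage sketch for baddr (hypothetical caller; "exdev", "exblkno"
 * and "bsize" are assumed names): a conditional bread, for when a
 * block is worth processing only if some buffer already maps it.
 *
 *	struct buf *bp;
 *
 *	if (bp = baddr(exdev, exblkno, bsize)) {
 *		... use the cached copy ...
 *		brelse(bp);
 *	}
 */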

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}
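
/*
 * A usage sketch for getblk (hypothetical caller; "exdev", "exblkno"
 * and "bsize" are assumed names): a write that will replace a whole
 * block needs no read first, just a buffer mapping the block.
 *
 *	struct buf *bp;
 *
 *	bp = getblk(exdev, exblkno, bsize);
 *	... fill bp->b_un.b_addr with the new contents ...
 *	bdwrite(bp);
 */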

/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}
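
/*
 * A usage sketch for geteblk (hypothetical caller): scratch memory
 * with no device association, e.g. a private staging area.  Since
 * the buffer is marked B_INVAL, brelse puts it where it will be
 * recycled quickly.
 *
 *	struct buf *bp;
 *
 *	bp = geteblk(MAXBSIZE);
 *	... use bp->b_un.b_addr as scratch space ...
 *	brelse(bp);
 */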

/*
 * Allocate space associated with a buffer.
 * If space cannot be obtained, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping
	 * previous I/O is disposed of.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one
	 * buffer at any point in time.  Care must be taken to avoid
	 * deadlocking when two buffers are trying to get the same set
	 * of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}
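
/*
 * A worked example of the overlap arithmetic above, assuming the
 * usual DEV_BSIZE of 512 (so btodb(n) == n / 512): a 4096-byte
 * buffer at b_blkno 32 covers device blocks
 *
 *	start = 32,  last = 32 + btodb(4096) - 1 = 39
 *
 * and a cached 1024-byte buffer at block 38 overlaps it, since
 * 38 <= 39 and 38 + btodb(1024) = 40 > 32; it must therefore be
 * written out (if B_DELWRI) and invalidated before the allocation
 * proceeds.
 */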

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
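
/*
 * A sketch of the B_CALL hook (hypothetical caller; "excleanup" is
 * an assumed completion routine, not a kernel symbol): a subsystem
 * that cannot sleep, such as the pageout daemon mentioned above,
 * can get control at interrupt time instead of waiting in biowait.
 *
 *	int excleanup();
 *
 *	bp->b_flags |= B_READ | B_ASYNC | B_CALL;
 *	bp->b_iodone = excleanup;
 *	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
 *	... biodone calls excleanup(bp) when the transfer ends ...
 */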

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			splx(s);
			goto loop;
		}
	}
	splx(s);
}
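
/*
 * A sketch of the two callers named in the comment above: the
 * periodic update pushes every dirty block with
 *
 *	bflush(NODEV);
 *
 * while umount flushes only the filesystem being detached by
 * passing that device.
 */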

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized code.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error) == 0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to closed or unmounted filesystems.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty,
 * then properly flush the queues.  Until that happy day, this suffices
 * for correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define	dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}