wrong constant used by coincidence (PGSHIFT should be DEV_BSHIFT)
[unix-history] / usr / src / sys / kern / vfs_cluster.c
CommitLineData
da7c5cc6
KM
1/*
2 * Copyright (c) 1982 Regents of the University of California.
3 * All rights reserved. The Berkeley software License Agreement
4 * specifies the terms and conditions for redistribution.
5 *
751af33e 6 * @(#)vfs_cluster.c 6.6 (Berkeley) %G%
da7c5cc6 7 */
961945a8
SL
8
9#include "../machine/pte.h"
663dbc72 10
94368568
JB
11#include "param.h"
12#include "systm.h"
13#include "dir.h"
14#include "user.h"
15#include "buf.h"
16#include "conf.h"
17#include "proc.h"
18#include "seg.h"
19#include "vm.h"
20#include "trace.h"
663dbc72 21
663dbc72
BJ
22/*
23 * Read in (if necessary) the block and return a buffer pointer.
24 */
25struct buf *
ad30fb67
KM
26bread(dev, blkno, size)
27 dev_t dev;
28 daddr_t blkno;
29 int size;
663dbc72
BJ
30{
31 register struct buf *bp;
32
4f083fd7
SL
33 if (size == 0)
34 panic("bread: size 0");
ad30fb67 35 bp = getblk(dev, blkno, size);
663dbc72 36 if (bp->b_flags&B_DONE) {
720c861e 37 trace(TR_BREADHIT, pack(dev, size), blkno);
663dbc72
BJ
38 return(bp);
39 }
40 bp->b_flags |= B_READ;
4f083fd7
SL
41 if (bp->b_bcount > bp->b_bufsize)
42 panic("bread");
663dbc72 43 (*bdevsw[major(dev)].d_strategy)(bp);
720c861e 44 trace(TR_BREADMISS, pack(dev, size), blkno);
fb99a9a1 45 u.u_ru.ru_inblock++; /* pay for read */
3efdd860 46 biowait(bp);
663dbc72
BJ
47 return(bp);
48}
49
50/*
51 * Read in the block, like bread, but also start I/O on the
52 * read-ahead block (which is not allocated to the caller)
53 */
54struct buf *
a8d3bf7f 55breada(dev, blkno, size, rablkno, rabsize)
ad30fb67 56 dev_t dev;
84baaab3 57 daddr_t blkno; int size;
a8d3bf7f 58 daddr_t rablkno; int rabsize;
663dbc72
BJ
59{
60 register struct buf *bp, *rabp;
61
62 bp = NULL;
3efdd860
KM
63 /*
64 * If the block isn't in core, then allocate
65 * a buffer and initiate i/o (getblk checks
66 * for a cache hit).
67 */
663dbc72 68 if (!incore(dev, blkno)) {
ad30fb67 69 bp = getblk(dev, blkno, size);
663dbc72
BJ
70 if ((bp->b_flags&B_DONE) == 0) {
71 bp->b_flags |= B_READ;
4f083fd7
SL
72 if (bp->b_bcount > bp->b_bufsize)
73 panic("breada");
663dbc72 74 (*bdevsw[major(dev)].d_strategy)(bp);
720c861e 75 trace(TR_BREADMISS, pack(dev, size), blkno);
fb99a9a1 76 u.u_ru.ru_inblock++; /* pay for read */
3efdd860 77 } else
720c861e 78 trace(TR_BREADHIT, pack(dev, size), blkno);
663dbc72 79 }
3efdd860
KM
80
81 /*
82 * If there's a read-ahead block, start i/o
83 * on it also (as above).
84 */
663dbc72 85 if (rablkno && !incore(dev, rablkno)) {
a8d3bf7f 86 rabp = getblk(dev, rablkno, rabsize);
973ecc4f 87 if (rabp->b_flags & B_DONE) {
663dbc72 88 brelse(rabp);
720c861e 89 trace(TR_BREADHITRA, pack(dev, rabsize), blkno);
973ecc4f 90 } else {
663dbc72 91 rabp->b_flags |= B_READ|B_ASYNC;
4f083fd7
SL
92 if (rabp->b_bcount > rabp->b_bufsize)
93 panic("breadrabp");
663dbc72 94 (*bdevsw[major(dev)].d_strategy)(rabp);
720c861e 95 trace(TR_BREADMISSRA, pack(dev, rabsize), rablock);
fb99a9a1 96 u.u_ru.ru_inblock++; /* pay in advance */
663dbc72
BJ
97 }
98 }
3efdd860
KM
99
100 /*
84baaab3
KM
101 * If block was in core, let bread get it.
102 * If block wasn't in core, then the read was started
103 * above, and just wait for it.
3efdd860 104 */
84baaab3
KM
105 if (bp == NULL)
106 return (bread(dev, blkno, size));
3efdd860 107 biowait(bp);
84baaab3 108 return (bp);
663dbc72
BJ
109}
110
111/*
112 * Write the buffer, waiting for completion.
113 * Then release the buffer.
114 */
115bwrite(bp)
3efdd860 116 register struct buf *bp;
663dbc72
BJ
117{
118 register flag;
119
120 flag = bp->b_flags;
f844ee62 121 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
663dbc72 122 if ((flag&B_DELWRI) == 0)
fb99a9a1 123 u.u_ru.ru_oublock++; /* noone paid yet */
720c861e 124 trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
4f083fd7
SL
125 if (bp->b_bcount > bp->b_bufsize)
126 panic("bwrite");
663dbc72 127 (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
3efdd860
KM
128
129 /*
130 * If the write was synchronous, then await i/o completion.
131 * If the write was "delayed", then we put the buffer on
132 * the q of blocks awaiting i/o completion status.
3efdd860 133 */
663dbc72 134 if ((flag&B_ASYNC) == 0) {
3efdd860 135 biowait(bp);
663dbc72
BJ
136 brelse(bp);
137 } else if (flag & B_DELWRI)
138 bp->b_flags |= B_AGE;
663dbc72
BJ
139}
140
141/*
142 * Release the buffer, marking it so that if it is grabbed
143 * for another purpose it will be written out before being
144 * given up (e.g. when writing a partial block where it is
145 * assumed that another write for the same block will soon follow).
146 * This can't be done for magtape, since writes must be done
147 * in the same order as requested.
148 */
149bdwrite(bp)
3efdd860 150 register struct buf *bp;
663dbc72 151{
e1e57888 152 register int flags;
663dbc72
BJ
153
154 if ((bp->b_flags&B_DELWRI) == 0)
fb99a9a1 155 u.u_ru.ru_oublock++; /* noone paid yet */
e1e57888
RE
156 flags = bdevsw[major(bp->b_dev)].d_flags;
157 if(flags & B_TAPE)
663dbc72
BJ
158 bawrite(bp);
159 else {
160 bp->b_flags |= B_DELWRI | B_DONE;
161 brelse(bp);
162 }
163}
164
165/*
166 * Release the buffer, start I/O on it, but don't wait for completion.
167 */
168bawrite(bp)
3efdd860 169 register struct buf *bp;
663dbc72
BJ
170{
171
172 bp->b_flags |= B_ASYNC;
173 bwrite(bp);
174}
175
176/*
3efdd860 177 * Release the buffer, with no I/O implied.
663dbc72
BJ
178 */
179brelse(bp)
3efdd860 180 register struct buf *bp;
663dbc72 181{
46387ee3 182 register struct buf *flist;
663dbc72
BJ
183 register s;
184
720c861e 185 trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
3efdd860
KM
186 /*
187 * If someone's waiting for the buffer, or
188 * is waiting for a buffer wake 'em up.
189 */
663dbc72
BJ
190 if (bp->b_flags&B_WANTED)
191 wakeup((caddr_t)bp);
46387ee3
BJ
192 if (bfreelist[0].b_flags&B_WANTED) {
193 bfreelist[0].b_flags &= ~B_WANTED;
194 wakeup((caddr_t)bfreelist);
663dbc72 195 }
60a71525
BJ
196 if (bp->b_flags&B_ERROR)
197 if (bp->b_flags & B_LOCKED)
198 bp->b_flags &= ~B_ERROR; /* try again later */
199 else
200 bp->b_dev = NODEV; /* no assoc */
3efdd860
KM
201
202 /*
203 * Stick the buffer back on a free list.
204 */
663dbc72 205 s = spl6();
4f083fd7
SL
206 if (bp->b_bufsize <= 0) {
207 /* block has no buffer ... put at front of unused buffer list */
208 flist = &bfreelist[BQ_EMPTY];
209 binsheadfree(bp, flist);
210 } else if (bp->b_flags & (B_ERROR|B_INVAL)) {
46387ee3 211 /* block has no info ... put at front of most free list */
4f083fd7 212 flist = &bfreelist[BQ_AGE];
3efdd860 213 binsheadfree(bp, flist);
663dbc72 214 } else {
46387ee3
BJ
215 if (bp->b_flags & B_LOCKED)
216 flist = &bfreelist[BQ_LOCKED];
217 else if (bp->b_flags & B_AGE)
218 flist = &bfreelist[BQ_AGE];
219 else
220 flist = &bfreelist[BQ_LRU];
3efdd860 221 binstailfree(bp, flist);
663dbc72
BJ
222 }
223 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
224 splx(s);
225}
226
227/*
228 * See if the block is associated with some buffer
229 * (mainly to avoid getting hung up on a wait in breada)
230 */
231incore(dev, blkno)
3efdd860
KM
232 dev_t dev;
233 daddr_t blkno;
663dbc72
BJ
234{
235 register struct buf *bp;
46387ee3 236 register struct buf *dp;
663dbc72 237
ad30fb67 238 dp = BUFHASH(dev, blkno);
46387ee3 239 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
ad30fb67 240 if (bp->b_blkno == blkno && bp->b_dev == dev &&
3efdd860 241 (bp->b_flags & B_INVAL) == 0)
5603d07d 242 return (1);
5603d07d 243 return (0);
663dbc72
BJ
244}
245
246struct buf *
ad30fb67
KM
247baddr(dev, blkno, size)
248 dev_t dev;
249 daddr_t blkno;
250 int size;
663dbc72
BJ
251{
252
253 if (incore(dev, blkno))
ad30fb67 254 return (bread(dev, blkno, size));
663dbc72
BJ
255 return (0);
256}
257
258/*
259 * Assign a buffer for the given block. If the appropriate
260 * block is already associated, return it; otherwise search
261 * for the oldest non-busy buffer and reassign it.
23900030
BJ
262 *
263 * We use splx here because this routine may be called
264 * on the interrupt stack during a dump, and we don't
265 * want to lower the ipl back to 0.
663dbc72
BJ
266 */
267struct buf *
ad30fb67
KM
268getblk(dev, blkno, size)
269 dev_t dev;
270 daddr_t blkno;
271 int size;
663dbc72 272{
4f083fd7 273 register struct buf *bp, *dp;
23900030 274 int s;
663dbc72 275
751af33e
KM
276 /*
277 * To prevent overflow of 32-bit ints when converting block
278 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
279 * to the maximum number that can be converted to a byte offset
280 * without overflow. This is historic code; what bug it fixed,
281 * or whether it is still a reasonable thing to do is open to
282 * dispute. mkm 9/85
283 */
284 if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
285 blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
3efdd860
KM
286 /*
287 * Search the cache for the block. If we hit, but
288 * the buffer is in use for i/o, then we wait until
289 * the i/o has completed.
290 */
ad30fb67 291 dp = BUFHASH(dev, blkno);
3efdd860 292loop:
46387ee3 293 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
ad30fb67 294 if (bp->b_blkno != blkno || bp->b_dev != dev ||
46387ee3 295 bp->b_flags&B_INVAL)
663dbc72 296 continue;
23900030 297 s = spl6();
663dbc72
BJ
298 if (bp->b_flags&B_BUSY) {
299 bp->b_flags |= B_WANTED;
300 sleep((caddr_t)bp, PRIBIO+1);
23900030 301 splx(s);
663dbc72
BJ
302 goto loop;
303 }
23900030 304 splx(s);
663dbc72 305 notavail(bp);
b646a125 306 if (bp->b_bcount != size && brealloc(bp, size) == 0)
9d6d37ce 307 goto loop;
663dbc72
BJ
308 bp->b_flags |= B_CACHE;
309 return(bp);
310 }
5603d07d
BJ
311 if (major(dev) >= nblkdev)
312 panic("blkdev");
4f083fd7 313 bp = getnewbuf();
ad30fb67 314 bfree(bp);
3efdd860
KM
315 bremhash(bp);
316 binshash(bp, dp);
663dbc72 317 bp->b_dev = dev;
ad30fb67 318 bp->b_blkno = blkno;
4f083fd7 319 bp->b_error = 0;
9d6d37ce
BJ
320 if (brealloc(bp, size) == 0)
321 goto loop;
663dbc72
BJ
322 return(bp);
323}
324
325/*
326 * get an empty block,
327 * not assigned to any particular device
328 */
329struct buf *
ad30fb67
KM
330geteblk(size)
331 int size;
663dbc72 332{
4f083fd7 333 register struct buf *bp, *flist;
663dbc72
BJ
334
335loop:
4f083fd7
SL
336 bp = getnewbuf();
337 bp->b_flags |= B_INVAL;
3efdd860
KM
338 bfree(bp);
339 bremhash(bp);
4f083fd7
SL
340 flist = &bfreelist[BQ_AGE];
341 binshash(bp, flist);
663dbc72 342 bp->b_dev = (dev_t)NODEV;
4f083fd7 343 bp->b_error = 0;
9d6d37ce
BJ
344 if (brealloc(bp, size) == 0)
345 goto loop;
663dbc72
BJ
346 return(bp);
347}
348
ad30fb67
KM
349/*
350 * Allocate space associated with a buffer.
961945a8 351 * If can't get space, buffer is released
ad30fb67
KM
352 */
353brealloc(bp, size)
354 register struct buf *bp;
355 int size;
356{
357 daddr_t start, last;
358 register struct buf *ep;
359 struct buf *dp;
360 int s;
361
362 /*
363 * First need to make sure that all overlaping previous I/O
364 * is dispatched with.
365 */
366 if (size == bp->b_bcount)
9d6d37ce
BJ
367 return (1);
368 if (size < bp->b_bcount) {
369 if (bp->b_flags & B_DELWRI) {
370 bwrite(bp);
371 return (0);
372 }
373 if (bp->b_flags & B_LOCKED)
374 panic("brealloc");
961945a8 375 return (allocbuf(bp, size));
ad30fb67 376 }
9d6d37ce 377 bp->b_flags &= ~B_DONE;
961945a8
SL
378 if (bp->b_dev == NODEV)
379 return (allocbuf(bp, size));
9d6d37ce 380
720c861e 381 trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
9d6d37ce
BJ
382 /*
383 * Search cache for any buffers that overlap the one that we
384 * are trying to allocate. Overlapping buffers must be marked
385 * invalid, after being written out if they are dirty. (indicated
386 * by B_DELWRI) A disk block must be mapped by at most one buffer
387 * at any point in time. Care must be taken to avoid deadlocking
388 * when two buffer are trying to get the same set of disk blocks.
389 */
390 start = bp->b_blkno;
ad891b02 391 last = start + btodb(size) - 1;
ad30fb67
KM
392 dp = BUFHASH(bp->b_dev, bp->b_blkno);
393loop:
ad30fb67 394 for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
9d6d37ce
BJ
395 if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
396 continue;
397 /* look for overlap */
398 if (ep->b_bcount == 0 || ep->b_blkno > last ||
ad891b02 399 ep->b_blkno + btodb(ep->b_bcount) <= start)
ad30fb67
KM
400 continue;
401 s = spl6();
402 if (ep->b_flags&B_BUSY) {
403 ep->b_flags |= B_WANTED;
404 sleep((caddr_t)ep, PRIBIO+1);
4f083fd7 405 splx(s);
ad30fb67
KM
406 goto loop;
407 }
4f083fd7 408 splx(s);
9d6d37ce 409 notavail(ep);
ad30fb67 410 if (ep->b_flags & B_DELWRI) {
ad30fb67
KM
411 bwrite(ep);
412 goto loop;
413 }
9d6d37ce
BJ
414 ep->b_flags |= B_INVAL;
415 brelse(ep);
ad30fb67 416 }
961945a8 417 return (allocbuf(bp, size));
4f083fd7
SL
418}
419
4f083fd7
SL
420/*
421 * Find a buffer which is available for use.
422 * Select something from a free list.
423 * Preference is to AGE list, then LRU list.
424 */
425struct buf *
426getnewbuf()
427{
428 register struct buf *bp, *dp;
429 int s;
430
431loop:
432 s = spl6();
433 for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
434 if (dp->av_forw != dp)
435 break;
436 if (dp == bfreelist) { /* no free blocks */
437 dp->b_flags |= B_WANTED;
438 sleep((caddr_t)dp, PRIBIO+1);
4b7d506c 439 splx(s);
4f083fd7
SL
440 goto loop;
441 }
442 splx(s);
443 bp = dp->av_forw;
444 notavail(bp);
445 if (bp->b_flags & B_DELWRI) {
446 bp->b_flags |= B_ASYNC;
447 bwrite(bp);
448 goto loop;
449 }
720c861e 450 trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
4f083fd7
SL
451 bp->b_flags = B_BUSY;
452 return (bp);
453}
454
663dbc72
BJ
455/*
456 * Wait for I/O completion on the buffer; return errors
457 * to the user.
458 */
3efdd860 459biowait(bp)
ad30fb67 460 register struct buf *bp;
663dbc72 461{
530d0032 462 int s;
663dbc72 463
530d0032 464 s = spl6();
663dbc72
BJ
465 while ((bp->b_flags&B_DONE)==0)
466 sleep((caddr_t)bp, PRIBIO);
530d0032 467 splx(s);
11391203
SL
468 if (u.u_error == 0) /* XXX */
469 u.u_error = geterror(bp);
663dbc72
BJ
470}
471
663dbc72 472/*
af04ce66
SL
473 * Mark I/O complete on a buffer.
474 * If someone should be called, e.g. the pageout
475 * daemon, do so. Otherwise, wake up anyone
476 * waiting for it.
663dbc72 477 */
3efdd860
KM
478biodone(bp)
479 register struct buf *bp;
663dbc72 480{
663dbc72 481
80e7c811 482 if (bp->b_flags & B_DONE)
3efdd860 483 panic("dup biodone");
663dbc72 484 bp->b_flags |= B_DONE;
961945a8
SL
485 if (bp->b_flags & B_CALL) {
486 bp->b_flags &= ~B_CALL;
487 (*bp->b_iodone)(bp);
488 return;
489 }
663dbc72
BJ
490 if (bp->b_flags&B_ASYNC)
491 brelse(bp);
492 else {
493 bp->b_flags &= ~B_WANTED;
494 wakeup((caddr_t)bp);
495 }
496}
497
4f083fd7
SL
498/*
499 * Insure that no part of a specified block is in an incore buffer.
500 */
501blkflush(dev, blkno, size)
502 dev_t dev;
503 daddr_t blkno;
504 long size;
505{
506 register struct buf *ep;
507 struct buf *dp;
508 daddr_t start, last;
509 int s;
510
511 start = blkno;
ad891b02 512 last = start + btodb(size) - 1;
4f083fd7
SL
513 dp = BUFHASH(dev, blkno);
514loop:
515 for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
516 if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
517 continue;
518 /* look for overlap */
519 if (ep->b_bcount == 0 || ep->b_blkno > last ||
ad891b02 520 ep->b_blkno + btodb(ep->b_bcount) <= start)
4f083fd7
SL
521 continue;
522 s = spl6();
523 if (ep->b_flags&B_BUSY) {
524 ep->b_flags |= B_WANTED;
525 sleep((caddr_t)ep, PRIBIO+1);
526 splx(s);
527 goto loop;
528 }
529 if (ep->b_flags & B_DELWRI) {
530 splx(s);
531 notavail(ep);
532 bwrite(ep);
533 goto loop;
534 }
535 splx(s);
536 }
537}
538
663dbc72 539/*
af04ce66 540 * Make sure all write-behind blocks
663dbc72
BJ
541 * on dev (or NODEV for all)
542 * are flushed out.
543 * (from umount and update)
544 */
545bflush(dev)
3efdd860 546 dev_t dev;
663dbc72
BJ
547{
548 register struct buf *bp;
46387ee3 549 register struct buf *flist;
530d0032 550 int s;
663dbc72
BJ
551
552loop:
530d0032 553 s = spl6();
4f083fd7 554 for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
46387ee3 555 for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
3efdd860
KM
556 if ((bp->b_flags & B_DELWRI) == 0)
557 continue;
558 if (dev == NODEV || dev == bp->b_dev) {
663dbc72
BJ
559 bp->b_flags |= B_ASYNC;
560 notavail(bp);
561 bwrite(bp);
f7916124 562 splx(s);
663dbc72
BJ
563 goto loop;
564 }
565 }
530d0032 566 splx(s);
663dbc72
BJ
567}
568
663dbc72
BJ
569/*
570 * Pick up the device's error number and pass it to the user;
571 * if there is an error but the number is 0 set a generalized
572 * code. Actually the latter is always true because devices
573 * don't yet return specific errors.
574 */
575geterror(bp)
3efdd860 576 register struct buf *bp;
663dbc72 577{
d6d7360b 578 int error = 0;
663dbc72
BJ
579
580 if (bp->b_flags&B_ERROR)
d6d7360b
BJ
581 if ((error = bp->b_error)==0)
582 return (EIO);
583 return (error);
663dbc72 584}
7b8b5a01
RE
585
586/*
587 * Invalidate in core blocks belonging to closed or umounted filesystem
588 *
589 * This is not nicely done at all - the buffer ought to be removed from the
590 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
591 * can't do that here, as it is quite possible that the block is still
592 * being used for i/o. Eventually, all disc drivers should be forced to
593 * have a close routine, which ought ensure that the queue is empty, then
594 * properly flush the queues. Until that happy day, this suffices for
595 * correctness. ... kre
596 */
597binval(dev)
3efdd860 598 dev_t dev;
7b8b5a01 599{
634ebdbe
RE
600 register struct buf *bp;
601 register struct bufhd *hp;
602#define dp ((struct buf *)hp)
7b8b5a01 603
634ebdbe
RE
604 for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
605 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
606 if (bp->b_dev == dev)
607 bp->b_flags |= B_INVAL;
7b8b5a01 608}