/*	vfs_cluster.c	6.4	84/08/29	*/

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{
        register struct buf *bp;

        if (size == 0)
                panic("bread: size 0");
        bp = getblk(dev, blkno, size);
        if (bp->b_flags&B_DONE) {
                trace(TR_BREADHIT, pack(dev, size), blkno);
                return(bp);
        }
        bp->b_flags |= B_READ;
        if (bp->b_bcount > bp->b_bufsize)
                panic("bread");
        (*bdevsw[major(dev)].d_strategy)(bp);
        trace(TR_BREADMISS, pack(dev, size), blkno);
        u.u_ru.ru_inblock++;            /* pay for read */
        biowait(bp);
        return(bp);
}

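/*
 * Usage sketch (an editorial addition, not part of the original
 * source): the normal bread/brelse cycle as a hypothetical caller
 * would write it.  Every bread must be paired with a brelse (or one
 * of the write routines), or the buffer stays busy forever.
 */
#ifdef notdef
examplereader(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{
        register struct buf *bp;

        bp = bread(dev, blkno, DEV_BSIZE);      /* cache hit or real read */
        if ((bp->b_flags & B_ERROR) == 0) {
                /* contents valid: bp->b_un.b_addr, bp->b_bcount bytes */
        }
        brelse(bp);                             /* always hand it back */
}
#endif
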
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
        dev_t dev;
        daddr_t blkno; int size;
        daddr_t rablkno; int rabsize;
{
        register struct buf *bp, *rabp;

        bp = NULL;
        /*
         * If the block isn't in core, then allocate
         * a buffer and initiate i/o (getblk checks
         * for a cache hit).
         */
        if (!incore(dev, blkno)) {
                bp = getblk(dev, blkno, size);
                if ((bp->b_flags&B_DONE) == 0) {
                        bp->b_flags |= B_READ;
                        if (bp->b_bcount > bp->b_bufsize)
                                panic("breada");
                        (*bdevsw[major(dev)].d_strategy)(bp);
                        trace(TR_BREADMISS, pack(dev, size), blkno);
                        u.u_ru.ru_inblock++;    /* pay for read */
                } else
                        trace(TR_BREADHIT, pack(dev, size), blkno);
        }

        /*
         * If there's a read-ahead block, start i/o
         * on it also (as above).
         */
        if (rablkno && !incore(dev, rablkno)) {
                rabp = getblk(dev, rablkno, rabsize);
                if (rabp->b_flags & B_DONE) {
                        brelse(rabp);
                        trace(TR_BREADHITRA, pack(dev, rabsize), blkno);
                } else {
                        rabp->b_flags |= B_READ|B_ASYNC;
                        if (rabp->b_bcount > rabp->b_bufsize)
                                panic("breadrabp");
                        (*bdevsw[major(dev)].d_strategy)(rabp);
                        trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
                        u.u_ru.ru_inblock++;    /* pay in advance */
                }
        }

        /*
         * If block was in core, let bread get it.
         * If block wasn't in core, then the read was started
         * above, and just wait for it.
         */
        if (bp == NULL)
                return (bread(dev, blkno, size));
        biowait(bp);
        return (bp);
}

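/*
 * Sketch (an editorial addition): a sequential reader priming the
 * cache.  Asking for block bn while read-ahead starts on the block
 * just past it means the next iteration's read should hit in the
 * cache.  The function and its arguments are hypothetical; a real
 * caller gets the read-ahead block number from the filesystem
 * block map.
 */
#ifdef notdef
exampleseq(dev, bn, size)
        dev_t dev;
        daddr_t bn;
        int size;
{
        register struct buf *bp;

        bp = breada(dev, bn, size, bn + btodb(size), size);
        /* consume bp->b_un.b_addr here, then: */
        brelse(bp);
}
#endif
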
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
        register struct buf *bp;
{
        register flag;

        flag = bp->b_flags;
        bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
        if ((flag&B_DELWRI) == 0)
                u.u_ru.ru_oublock++;    /* no one paid yet */
        trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
        if (bp->b_bcount > bp->b_bufsize)
                panic("bwrite");
        (*bdevsw[major(bp->b_dev)].d_strategy)(bp);

        /*
         * If the write was synchronous, then await i/o completion.
         * If the write was "delayed", then we put the buffer on
         * the q of blocks awaiting i/o completion status.
         */
        if ((flag&B_ASYNC) == 0) {
                biowait(bp);
                brelse(bp);
        } else if (flag & B_DELWRI)
                bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
        register struct buf *bp;
{
        register int flags;

        if ((bp->b_flags&B_DELWRI) == 0)
                u.u_ru.ru_oublock++;    /* no one paid yet */
        flags = bdevsw[major(bp->b_dev)].d_flags;
        if (flags & B_TAPE)
                bawrite(bp);
        else {
                bp->b_flags |= B_DELWRI | B_DONE;
                brelse(bp);
        }
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
        register struct buf *bp;
{

        bp->b_flags |= B_ASYNC;
        bwrite(bp);
}

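/*
 * Sketch (an editorial addition): the three write flavors, from most
 * to least synchronous.  A caller holding a busy buffer picks exactly
 * one of these; each consumes the buffer.
 */
#ifdef notdef
        bwrite(bp);     /* synchronous: start i/o, biowait, brelse */
        bawrite(bp);    /* asynchronous: start i/o, return at once */
        bdwrite(bp);    /* delayed: mark B_DELWRI, no i/o yet */
#endif
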
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
        register struct buf *bp;
{
        register struct buf *flist;
        register s;

        trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
        /*
         * If someone's waiting for the buffer, or
         * is waiting for a buffer, wake 'em up.
         */
        if (bp->b_flags&B_WANTED)
                wakeup((caddr_t)bp);
        if (bfreelist[0].b_flags&B_WANTED) {
                bfreelist[0].b_flags &= ~B_WANTED;
                wakeup((caddr_t)bfreelist);
        }
        if (bp->b_flags&B_ERROR)
                if (bp->b_flags & B_LOCKED)
                        bp->b_flags &= ~B_ERROR;        /* try again later */
                else
                        bp->b_dev = NODEV;              /* no assoc */

        /*
         * Stick the buffer back on a free list.
         */
        s = spl6();
        if (bp->b_bufsize <= 0) {
                /* block has no buffer ... put at front of unused buffer list */
                flist = &bfreelist[BQ_EMPTY];
                binsheadfree(bp, flist);
        } else if (bp->b_flags & (B_ERROR|B_INVAL)) {
                /* block has no info ... put at front of most free list */
                flist = &bfreelist[BQ_AGE];
                binsheadfree(bp, flist);
        } else {
                if (bp->b_flags & B_LOCKED)
                        flist = &bfreelist[BQ_LOCKED];
                else if (bp->b_flags & B_AGE)
                        flist = &bfreelist[BQ_AGE];
                else
                        flist = &bfreelist[BQ_LRU];
                binstailfree(bp, flist);
        }
        bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
        splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;

        dp = BUFHASH(dev, blkno);
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
                if (bp->b_blkno == blkno && bp->b_dev == dev &&
                    (bp->b_flags & B_INVAL) == 0)
                        return (1);
        return (0);
}

struct buf *
baddr(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{

        if (incore(dev, blkno))
                return (bread(dev, blkno, size));
        return (0);
}

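/*
 * Sketch (an editorial addition): baddr gives a caller the block only
 * if it is already cached, so it can be used to peek at data without
 * ever forcing a disk read.  The function is hypothetical.
 */
#ifdef notdef
examplepeek(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{
        register struct buf *bp;

        if ((bp = baddr(dev, blkno, size)) != NULL) {
                /* cached copy available at bp->b_un.b_addr */
                brelse(bp);
        }
}
#endif
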
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        int size;
{
        register struct buf *bp, *dp;
        int s;

        if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT)) /* XXX */
                blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
        /*
         * Search the cache for the block.  If we hit, but
         * the buffer is in use for i/o, then we wait until
         * the i/o has completed.
         */
        dp = BUFHASH(dev, blkno);
loop:
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno != blkno || bp->b_dev != dev ||
                    bp->b_flags&B_INVAL)
                        continue;
                s = spl6();
                if (bp->b_flags&B_BUSY) {
                        bp->b_flags |= B_WANTED;
                        sleep((caddr_t)bp, PRIBIO+1);
                        splx(s);
                        goto loop;
                }
                splx(s);
                notavail(bp);
                if (bp->b_bcount != size && brealloc(bp, size) == 0)
                        goto loop;
                bp->b_flags |= B_CACHE;
                return(bp);
        }
        if (major(dev) >= nblkdev)
                panic("blkdev");
        bp = getnewbuf();
        bfree(bp);
        bremhash(bp);
        binshash(bp, dp);
        bp->b_dev = dev;
        bp->b_blkno = blkno;
        bp->b_error = 0;
        if (brealloc(bp, size) == 0)
                goto loop;
        return(bp);
}

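/*
 * Sketch (an editorial addition): getblk is the right entry when the
 * caller will overwrite the whole block and has no use for its old
 * contents; bread is for when the old contents matter.  The function
 * and the data argument are hypothetical.
 */
#ifdef notdef
examplefill(dev, blkno, size, data)
        dev_t dev;
        daddr_t blkno;
        int size;
        caddr_t data;
{
        register struct buf *bp;

        bp = getblk(dev, blkno, size);          /* no read is started */
        bcopy(data, bp->b_un.b_addr, (unsigned)size);
        bwrite(bp);                             /* or bdwrite/bawrite */
}
#endif
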
/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk(size)
        int size;
{
        register struct buf *bp, *flist;

loop:
        bp = getnewbuf();
        bp->b_flags |= B_INVAL;
        bfree(bp);
        bremhash(bp);
        flist = &bfreelist[BQ_AGE];
        binshash(bp, flist);
        bp->b_dev = (dev_t)NODEV;
        bp->b_error = 0;
        if (brealloc(bp, size) == 0)
                goto loop;
        return(bp);
}

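/*
 * Sketch (an editorial addition): geteblk hands back a device-less
 * scratch buffer for data that belongs to no cached disk block.
 * Since the buffer is marked B_INVAL, brelse puts it on the AGE
 * list for quick reuse.
 */
#ifdef notdef
        bp = geteblk(DEV_BSIZE);
        /* use bp->b_un.b_addr as scratch space */
        brelse(bp);
#endif
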
/*
 * Allocate space associated with a buffer.
 * If it can't get space, the buffer is released.
 */
brealloc(bp, size)
        register struct buf *bp;
        int size;
{
        daddr_t start, last;
        register struct buf *ep;
        struct buf *dp;
        int s;

        /*
         * First need to make sure that all overlapping previous I/O
         * is disposed of.
         */
        if (size == bp->b_bcount)
                return (1);
        if (size < bp->b_bcount) {
                if (bp->b_flags & B_DELWRI) {
                        bwrite(bp);
                        return (0);
                }
                if (bp->b_flags & B_LOCKED)
                        panic("brealloc");
                return (allocbuf(bp, size));
        }
        bp->b_flags &= ~B_DONE;
        if (bp->b_dev == NODEV)
                return (allocbuf(bp, size));

        trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
        /*
         * Search cache for any buffers that overlap the one that we
         * are trying to allocate.  Overlapping buffers must be marked
         * invalid, after being written out if they are dirty (indicated
         * by B_DELWRI).  A disk block must be mapped by at most one
         * buffer at any point in time.  Care must be taken to avoid
         * deadlocking when two buffers are trying to get the same set
         * of disk blocks.
         */
        start = bp->b_blkno;
        last = start + btodb(size) - 1;
        dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
        for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
                if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
                        continue;
                /* look for overlap */
                if (ep->b_bcount == 0 || ep->b_blkno > last ||
                    ep->b_blkno + btodb(ep->b_bcount) <= start)
                        continue;
                s = spl6();
                if (ep->b_flags&B_BUSY) {
                        ep->b_flags |= B_WANTED;
                        sleep((caddr_t)ep, PRIBIO+1);
                        splx(s);
                        goto loop;
                }
                splx(s);
                notavail(ep);
                if (ep->b_flags & B_DELWRI) {
                        bwrite(ep);
                        goto loop;
                }
                ep->b_flags |= B_INVAL;
                brelse(ep);
        }
        return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
        register struct buf *bp, *dp;
        int s;

loop:
        s = spl6();
        for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
                if (dp->av_forw != dp)
                        break;
        if (dp == bfreelist) {          /* no free blocks */
                dp->b_flags |= B_WANTED;
                sleep((caddr_t)dp, PRIBIO+1);
                splx(s);
                goto loop;
        }
        splx(s);
        bp = dp->av_forw;
        notavail(bp);
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags |= B_ASYNC;
                bwrite(bp);
                goto loop;
        }
        trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
        bp->b_flags = B_BUSY;
        return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
        register struct buf *bp;
{
        int s;

        s = spl6();
        while ((bp->b_flags&B_DONE)==0)
                sleep((caddr_t)bp, PRIBIO);
        splx(s);
        if (u.u_error == 0)                     /* XXX */
                u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
        register struct buf *bp;
{

        if (bp->b_flags & B_DONE)
                panic("dup biodone");
        bp->b_flags |= B_DONE;
        if (bp->b_flags & B_CALL) {
                bp->b_flags &= ~B_CALL;
                (*bp->b_iodone)(bp);
                return;
        }
        if (bp->b_flags&B_ASYNC)
                brelse(bp);
        else {
                bp->b_flags &= ~B_WANTED;
                wakeup((caddr_t)bp);
        }
}

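/*
 * Sketch (an editorial addition): the B_CALL protocol from the
 * starting side.  A caller that wants a completion callback rather
 * than a wakeup sets b_iodone before handing the buffer to the
 * driver; biodone then makes the call exactly once.  mydone is a
 * hypothetical handler.
 */
#ifdef notdef
        bp->b_flags |= B_READ|B_ASYNC|B_CALL;
        bp->b_iodone = mydone;          /* biodone calls (*b_iodone)(bp) */
        (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
#endif
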
/*
 * Insure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
        dev_t dev;
        daddr_t blkno;
        long size;
{
        register struct buf *ep;
        struct buf *dp;
        daddr_t start, last;
        int s;

        start = blkno;
        last = start + btodb(size) - 1;
        dp = BUFHASH(dev, blkno);
loop:
        for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
                if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
                        continue;
                /* look for overlap */
                if (ep->b_bcount == 0 || ep->b_blkno > last ||
                    ep->b_blkno + btodb(ep->b_bcount) <= start)
                        continue;
                s = spl6();
                if (ep->b_flags&B_BUSY) {
                        ep->b_flags |= B_WANTED;
                        sleep((caddr_t)ep, PRIBIO+1);
                        splx(s);
                        goto loop;
                }
                if (ep->b_flags & B_DELWRI) {
                        splx(s);
                        notavail(ep);
                        bwrite(ep);
                        goto loop;
                }
                splx(s);
        }
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
        dev_t dev;
{
        register struct buf *bp;
        register struct buf *flist;
        int s;

loop:
        s = spl6();
        for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
                for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
                        if ((bp->b_flags & B_DELWRI) == 0)
                                continue;
                        if (dev == NODEV || dev == bp->b_dev) {
                                bp->b_flags |= B_ASYNC;
                                notavail(bp);
                                bwrite(bp);
                                splx(s);
                                goto loop;
                        }
                }
        splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
        register struct buf *bp;
{
        int error = 0;

        if (bp->b_flags&B_ERROR)
                if ((error = bp->b_error)==0)
                        return (EIO);
        return (error);
}

/*
 * Invalidate in core blocks belonging to closed or umounted filesystem
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty,
 * then properly flush the queues.  Until that happy day, this suffices
 * for correctness. ... kre
 */
binval(dev)
        dev_t dev;
{
        register struct buf *bp;
        register struct bufhd *hp;
#define dp ((struct buf *)hp)

        for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
                for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
                        if (bp->b_dev == dev)
                                bp->b_flags |= B_INVAL;
}