move copyin() call so if uap->size pointer is null, still return size
[unix-history] / usr / src / sys / kern / vfs_cluster.c
... / ...
CommitLineData
/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 * @(#)vfs_cluster.c 7.43 (Berkeley) %G%
 */
11
12#include <sys/param.h>
13#include <sys/proc.h>
14#include <sys/buf.h>
15#include <sys/vnode.h>
16#include <sys/specdev.h>
17#include <sys/mount.h>
18#include <sys/trace.h>
19#include <sys/resourcevar.h>
20
21/*
22 * Initialize buffers and hash links for buffers.
23 */
24void
25bufinit()
26{
27 register int i;
28 register struct buf *bp, *dp;
29 register struct bufhd *hp;
30 int base, residual;
31
32 for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
33 hp->b_forw = hp->b_back = (struct buf *)hp;
34
35 for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
36 dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
37 dp->b_flags = B_HEAD;
38 }
39 base = bufpages / nbuf;
40 residual = bufpages % nbuf;
41 for (i = 0; i < nbuf; i++) {
42 bp = &buf[i];
43 bp->b_dev = NODEV;
44 bp->b_bcount = 0;
45 bp->b_rcred = NOCRED;
46 bp->b_wcred = NOCRED;
47 bp->b_dirtyoff = 0;
48 bp->b_dirtyend = 0;
49 bp->b_un.b_addr = buffers + i * MAXBSIZE;
50 if (i < residual)
51 bp->b_bufsize = (base + 1) * CLBYTES;
52 else
53 bp->b_bufsize = base * CLBYTES;
54 binshash(bp, &bfreelist[BQ_AGE]);
55 bp->b_flags = B_BUSY|B_INVAL;
56 brelse(bp);
57 }
58}
59
60/*
61 * Find the block in the buffer pool.
62 * If the buffer is not present, allocate a new buffer and load
63 * its contents according to the filesystem fill routine.
64 */
65bread(vp, blkno, size, cred, bpp)
66 struct vnode *vp;
67 daddr_t blkno;
68 int size;
69 struct ucred *cred;
70 struct buf **bpp;
71#ifdef SECSIZE
72 long secsize;
73#endif SECSIZE
74{
75 struct proc *p = curproc; /* XXX */
76 register struct buf *bp;
77
78 if (size == 0)
79 panic("bread: size 0");
80#ifdef SECSIZE
81 bp = getblk(dev, blkno, size, secsize);
82#else SECSIZE
83 *bpp = bp = getblk(vp, blkno, size);
84#endif SECSIZE
85 if (bp->b_flags & (B_DONE | B_DELWRI)) {
86 trace(TR_BREADHIT, pack(vp, size), blkno);
87 return (0);
88 }
89 bp->b_flags |= B_READ;
90 if (bp->b_bcount > bp->b_bufsize)
91 panic("bread");
92 if (bp->b_rcred == NOCRED && cred != NOCRED) {
93 crhold(cred);
94 bp->b_rcred = cred;
95 }
96 VOP_STRATEGY(bp);
97 trace(TR_BREADMISS, pack(vp, size), blkno);
98 p->p_stats->p_ru.ru_inblock++; /* pay for read */
99 return (biowait(bp));
100}
101
102/*
103 * Operates like bread, but also starts I/O on the specified
104 * read-ahead block.
105 */
106breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
107 struct vnode *vp;
108 daddr_t blkno; int size;
109#ifdef SECSIZE
110 long secsize;
111#endif SECSIZE
112 daddr_t rablkno; int rabsize;
113 struct ucred *cred;
114 struct buf **bpp;
115{
116 struct proc *p = curproc; /* XXX */
117 register struct buf *bp, *rabp;
118
119 bp = NULL;
120 /*
121 * If the block is not memory resident,
122 * allocate a buffer and start I/O.
123 */
124 if (!incore(vp, blkno)) {
125 *bpp = bp = getblk(vp, blkno, size);
126#endif SECSIZE
127 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
128 bp->b_flags |= B_READ;
129 if (bp->b_bcount > bp->b_bufsize)
130 panic("breada");
131 if (bp->b_rcred == NOCRED && cred != NOCRED) {
132 crhold(cred);
133 bp->b_rcred = cred;
134 }
135 VOP_STRATEGY(bp);
136 trace(TR_BREADMISS, pack(vp, size), blkno);
137 p->p_stats->p_ru.ru_inblock++; /* pay for read */
138 } else
139 trace(TR_BREADHIT, pack(vp, size), blkno);
140 }
141
142 /*
143 * If there is a read-ahead block, start I/O on it too.
144 */
145 if (!incore(vp, rablkno)) {
146 rabp = getblk(vp, rablkno, rabsize);
147#endif SECSIZE
148 if (rabp->b_flags & (B_DONE | B_DELWRI)) {
149 brelse(rabp);
150 trace(TR_BREADHITRA, pack(vp, rabsize), rablkno);
151 } else {
152 rabp->b_flags |= B_ASYNC | B_READ;
153 if (rabp->b_bcount > rabp->b_bufsize)
154 panic("breadrabp");
155 if (rabp->b_rcred == NOCRED && cred != NOCRED) {
156 crhold(cred);
157 rabp->b_rcred = cred;
158 }
159 VOP_STRATEGY(rabp);
160 trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno);
161 p->p_stats->p_ru.ru_inblock++; /* pay in advance */
162 }
163 }
164
165 /*
166 * If block was memory resident, let bread get it.
167 * If block was not memory resident, the read was
168 * started above, so just wait for the read to complete.
169 */
170 if (bp == NULL)
171#ifdef SECSIZE
172 return (bread(dev, blkno, size, secsize));
173#else SECSIZE
174 return (bread(vp, blkno, size, cred, bpp));
175 return (biowait(bp));
176}
177
178/*
179 * Synchronous write.
180 * Release buffer on completion.
181 */
182bwrite(bp)
183 register struct buf *bp;
184{
185 struct proc *p = curproc; /* XXX */
186 register int flag;
187 int s, error;
188
189 flag = bp->b_flags;
190 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
191 if (flag & B_ASYNC) {
192 if ((flag & B_DELWRI) == 0)
193 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */
194 else
195 reassignbuf(bp, bp->b_vp);
196 }
197 trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
198 if (bp->b_bcount > bp->b_bufsize)
199 panic("bwrite");
200 s = splbio();
201 bp->b_vp->v_numoutput++;
202 splx(s);
203 VOP_STRATEGY(bp);
204
205 /*
206 * If the write was synchronous, then await I/O completion.
207 * If the write was "delayed", then we put the buffer on
208 * the queue of blocks awaiting I/O completion status.
209 */
210 if ((flag & B_ASYNC) == 0) {
211 error = biowait(bp);
212 if ((flag&B_DELWRI) == 0)
213 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */
214 else
215 reassignbuf(bp, bp->b_vp);
216 brelse(bp);
217 } else if (flag & B_DELWRI) {
218 bp->b_flags |= B_AGE;
219 error = 0;
220 }
221 return (error);
222}
223
224/*
225 * Delayed write.
226 *
227 * The buffer is marked dirty, but is not queued for I/O.
228 * This routine should be used when the buffer is expected
229 * to be modified again soon, typically a small write that
230 * partially fills a buffer.
231 *
232 * NB: magnetic tapes cannot be delayed; they must be
233 * written in the order that the writes are requested.
234 */
235bdwrite(bp)
236 register struct buf *bp;
237{
238 struct proc *p = curproc; /* XXX */
239
240 if ((bp->b_flags & B_DELWRI) == 0) {
241 bp->b_flags |= B_DELWRI;
242 reassignbuf(bp, bp->b_vp);
243 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */
244 }
245 /*
246 * If this is a tape drive, the write must be initiated.
247 */
248 if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
249 bawrite(bp);
250 } else {
251 bp->b_flags |= (B_DONE | B_DELWRI);
252 brelse(bp);
253 }
254}
255
256/*
257 * Asynchronous write.
258 * Start I/O on a buffer, but do not wait for it to complete.
259 * The buffer is released when the I/O completes.
260 */
261bawrite(bp)
262 register struct buf *bp;
263{
264
265 /*
266 * Setting the ASYNC flag causes bwrite to return
267 * after starting the I/O.
268 */
269 bp->b_flags |= B_ASYNC;
270 (void) bwrite(bp);
271}
272
273/*
274 * Release a buffer.
275 * Even if the buffer is dirty, no I/O is started.
276 */
277brelse(bp)
278 register struct buf *bp;
279{
280 register struct buf *flist;
281 int s;
282
283 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
284 /*
285 * If a process is waiting for the buffer, or
286 * is waiting for a free buffer, awaken it.
287 */
288 if (bp->b_flags & B_WANTED)
289 wakeup((caddr_t)bp);
290 if (bfreelist[0].b_flags & B_WANTED) {
291 bfreelist[0].b_flags &= ~B_WANTED;
292 wakeup((caddr_t)bfreelist);
293 }
294 /*
295 * Retry I/O for locked buffers rather than invalidating them.
296 */
297 if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
298 bp->b_flags &= ~B_ERROR;
299 /*
300 * Disassociate buffers that are no longer valid.
301 */
302 if (bp->b_flags & (B_NOCACHE | B_ERROR))
303 bp->b_flags |= B_INVAL;
304 if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
305 if (bp->b_vp)
306 brelvp(bp);
307 bp->b_flags &= ~B_DELWRI;
308 }
309 /*
310 * Stick the buffer back on a free list.
311 */
312 s = splbio();
313 if (bp->b_bufsize <= 0) {
314 /* block has no buffer ... put at front of unused buffer list */
315 flist = &bfreelist[BQ_EMPTY];
316 binsheadfree(bp, flist);
317 } else if (bp->b_flags & (B_ERROR | B_INVAL)) {
318 /* block has no info ... put at front of most free list */
319 flist = &bfreelist[BQ_AGE];
320 binsheadfree(bp, flist);
321 } else {
322 if (bp->b_flags & B_LOCKED)
323 flist = &bfreelist[BQ_LOCKED];
324 else if (bp->b_flags & B_AGE)
325 flist = &bfreelist[BQ_AGE];
326 else
327 flist = &bfreelist[BQ_LRU];
328 binstailfree(bp, flist);
329 }
330 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
331 splx(s);
332}
333
334/*
335 * Check to see if a block is currently memory resident.
336 */
337incore(vp, blkno)
338 struct vnode *vp;
339 daddr_t blkno;
340{
341 register struct buf *bp;
342 register struct buf *dp;
343
344 dp = BUFHASH(vp, blkno);
345 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
346 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
347 (bp->b_flags & B_INVAL) == 0)
348 return (1);
349 return (0);
350}
351
352/*
353 * Check to see if a block is currently memory resident.
354 * If it is resident, return it. If it is not resident,
355 * allocate a new buffer and assign it to the block.
356 */
357struct buf *
358#ifdef SECSIZE
359getblk(dev, blkno, size, secsize)
360#else SECSIZE
361getblk(vp, blkno, size)
362 register struct vnode *vp;
363 daddr_t blkno;
364 int size;
365#ifdef SECSIZE
366 long secsize;
367#endif SECSIZE
368{
369 register struct buf *bp, *dp;
370 int s;
371
372 if (size > MAXBSIZE)
373 panic("getblk: size too big");
374 /*
375 * Search the cache for the block. If the buffer is found,
376 * but it is currently locked, the we must wait for it to
377 * become available.
378 */
379 dp = BUFHASH(vp, blkno);
380loop:
381 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
382 if (bp->b_lblkno != blkno || bp->b_vp != vp ||
383 (bp->b_flags & B_INVAL))
384 continue;
385 s = splbio();
386 if (bp->b_flags & B_BUSY) {
387 bp->b_flags |= B_WANTED;
388 sleep((caddr_t)bp, PRIBIO + 1);
389 splx(s);
390 goto loop;
391 }
392 bremfree(bp);
393 bp->b_flags |= B_BUSY;
394 splx(s);
395 if (bp->b_bcount != size) {
396 printf("getblk: stray size");
397 bp->b_flags |= B_INVAL;
398 bwrite(bp);
399 goto loop;
400 }
401 bp->b_flags |= B_CACHE;
402 return (bp);
403 }
404 bp = getnewbuf();
405 bremhash(bp);
406 bgetvp(vp, bp);
407 bp->b_bcount = 0;
408 bp->b_lblkno = blkno;
409#ifdef SECSIZE
410 bp->b_blksize = secsize;
411#endif SECSIZE
412 bp->b_blkno = blkno;
413 bp->b_error = 0;
414 bp->b_resid = 0;
415 binshash(bp, dp);
416 allocbuf(bp, size);
417 return (bp);
418}
419
420/*
421 * Allocate a buffer.
422 * The caller will assign it to a block.
423 */
424struct buf *
425geteblk(size)
426 int size;
427{
428 register struct buf *bp, *flist;
429
430 if (size > MAXBSIZE)
431 panic("geteblk: size too big");
432 bp = getnewbuf();
433 bp->b_flags |= B_INVAL;
434 bremhash(bp);
435 flist = &bfreelist[BQ_AGE];
436 bp->b_bcount = 0;
437#ifdef SECSIZE
438 bp->b_blksize = DEV_BSIZE;
439#endif SECSIZE
440 bp->b_error = 0;
441 bp->b_resid = 0;
442 binshash(bp, flist);
443 allocbuf(bp, size);
444 return (bp);
445}
446
447/*
448 * Expand or contract the actual memory allocated to a buffer.
449 * If no memory is available, release buffer and take error exit.
450 */
451allocbuf(tp, size)
452 register struct buf *tp;
453 int size;
454{
455 register struct buf *bp, *ep;
456 int sizealloc, take, s;
457
458 sizealloc = roundup(size, CLBYTES);
459 /*
460 * Buffer size does not change
461 */
462 if (sizealloc == tp->b_bufsize)
463 goto out;
464 /*
465 * Buffer size is shrinking.
466 * Place excess space in a buffer header taken from the
467 * BQ_EMPTY buffer list and placed on the "most free" list.
468 * If no extra buffer headers are available, leave the
469 * extra space in the present buffer.
470 */
471 if (sizealloc < tp->b_bufsize) {
472 ep = bfreelist[BQ_EMPTY].av_forw;
473 if (ep == &bfreelist[BQ_EMPTY])
474 goto out;
475 s = splbio();
476 bremfree(ep);
477 ep->b_flags |= B_BUSY;
478 splx(s);
479 pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
480 (int)tp->b_bufsize - sizealloc);
481 ep->b_bufsize = tp->b_bufsize - sizealloc;
482 tp->b_bufsize = sizealloc;
483 ep->b_flags |= B_INVAL;
484 ep->b_bcount = 0;
485 brelse(ep);
486 goto out;
487 }
488 /*
489 * More buffer space is needed. Get it out of buffers on
490 * the "most free" list, placing the empty headers on the
491 * BQ_EMPTY buffer header list.
492 */
493 while (tp->b_bufsize < sizealloc) {
494 take = sizealloc - tp->b_bufsize;
495 bp = getnewbuf();
496 if (take >= bp->b_bufsize)
497 take = bp->b_bufsize;
498 pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
499 &tp->b_un.b_addr[tp->b_bufsize], take);
500 tp->b_bufsize += take;
501 bp->b_bufsize = bp->b_bufsize - take;
502 if (bp->b_bcount > bp->b_bufsize)
503 bp->b_bcount = bp->b_bufsize;
504 if (bp->b_bufsize <= 0) {
505 bremhash(bp);
506 binshash(bp, &bfreelist[BQ_EMPTY]);
507 bp->b_dev = NODEV;
508 bp->b_error = 0;
509 bp->b_flags |= B_INVAL;
510 }
511 brelse(bp);
512 }
513out:
514 tp->b_bcount = size;
515 return (1);
516}
517
518/*
519 * Find a buffer which is available for use.
520 * Select something from a free list.
521 * Preference is to AGE list, then LRU list.
522 */
523struct buf *
524getnewbuf()
525{
526 register struct buf *bp, *dp;
527 register struct ucred *cred;
528 int s;
529
530loop:
531 s = splbio();
532 for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
533 if (dp->av_forw != dp)
534 break;
535 if (dp == bfreelist) { /* no free blocks */
536 dp->b_flags |= B_WANTED;
537 sleep((caddr_t)dp, PRIBIO + 1);
538 splx(s);
539 goto loop;
540 }
541 bp = dp->av_forw;
542 bremfree(bp);
543 bp->b_flags |= B_BUSY;
544 splx(s);
545 if (bp->b_flags & B_DELWRI) {
546 (void) bawrite(bp);
547 goto loop;
548 }
549 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
550 if (bp->b_vp)
551 brelvp(bp);
552 if (bp->b_rcred != NOCRED) {
553 cred = bp->b_rcred;
554 bp->b_rcred = NOCRED;
555 crfree(cred);
556 }
557 if (bp->b_wcred != NOCRED) {
558 cred = bp->b_wcred;
559 bp->b_wcred = NOCRED;
560 crfree(cred);
561 }
562 bp->b_flags = B_BUSY;
563 bp->b_dirtyoff = bp->b_dirtyend = 0;
564 return (bp);
565}
566
567/*
568 * Wait for I/O to complete.
569 *
570 * Extract and return any errors associated with the I/O.
571 * If the error flag is set, but no specific error is
572 * given, return EIO.
573 */
574biowait(bp)
575 register struct buf *bp;
576{
577 int s;
578
579 s = splbio();
580 while ((bp->b_flags & B_DONE) == 0)
581 sleep((caddr_t)bp, PRIBIO);
582 splx(s);
583 if ((bp->b_flags & B_ERROR) == 0)
584 return (0);
585 if (bp->b_error)
586 return (bp->b_error);
587 return (EIO);
588}
589
590/*
591 * Mark I/O complete on a buffer.
592 *
593 * If a callback has been requested, e.g. the pageout
594 * daemon, do so. Otherwise, awaken waiting processes.
595 */
596void
597biodone(bp)
598 register struct buf *bp;
599{
600
601 if (bp->b_flags & B_DONE)
602 panic("dup biodone");
603 bp->b_flags |= B_DONE;
604 if ((bp->b_flags & B_READ) == 0)
605 vwakeup(bp);
606 if (bp->b_flags & B_CALL) {
607 bp->b_flags &= ~B_CALL;
608 (*bp->b_iodone)(bp);
609 return;
610 }
611 if (bp->b_flags & B_ASYNC)
612 brelse(bp);
613 else {
614 bp->b_flags &= ~B_WANTED;
615 wakeup((caddr_t)bp);
616 }
617}