/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *      @(#)vfs_cluster.c       7.60 (Berkeley) %G%
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
            daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
            daddr_t, daddr_t, long, int, long));
void cluster_wbuild __P((struct vnode *, struct buf *, long size,
            daddr_t start_lbn, int len, daddr_t lbn));

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *      Desired block is in the cache:
 *          1 Not sequential access (0 I/Os).
 *          2 Access is sequential, do read-ahead (1 ASYNC).
 *      Desired block is not in cache:
 *          3 Not sequential access (1 SYNC).
 *          4 Sequential access, next block is contiguous (1 SYNC).
 *          5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *      bp is the block requested.
 *      rbp is the read-ahead block.
 * If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
        struct vnode *vp;
        u_quad_t filesize;
        daddr_t lblkno;
        long size;
        struct ucred *cred;
        struct buf **bpp;
{
        struct buf *bp, *rbp;
        daddr_t blkno, ioblkno;
        long flags;
        int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
        if (size == 0)
                panic("cluster_read: size = 0");
#endif

        error = 0;
        flags = B_READ;
        *bpp = bp = getblk(vp, lblkno, size, 0, 0);
        if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
                /*
                 * Desired block is in cache; do any readahead ASYNC.
                 * Case 1, 2.
                 */
                trace(TR_BREADHIT, pack(vp, size), lblkno);
                flags |= B_ASYNC;
                ioblkno = lblkno +
                    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
                alreadyincore = (int)incore(vp, ioblkno);
                bp = NULL;
        } else {
                /* Block wasn't in cache, case 3, 4, 5. */
                trace(TR_BREADMISS, pack(vp, size), lblkno);
                ioblkno = lblkno;
                bp->b_flags |= flags;
                alreadyincore = 0;
                curproc->p_stats->p_ru.ru_inblock++;            /* XXX */
        }
        /*
         * XXX
         * Replace 1 with a window size based on some permutation of
         * maxcontig and rot_delay.  This will let you figure out how
         * many blocks you should read-ahead (case 2, 4, 5).
         *
         * If the access isn't sequential, cut the window size in half.
         */
        rbp = NULL;
        if (lblkno != vp->v_lastr + 1 && lblkno != 0)
                vp->v_ralen = max(vp->v_ralen >> 1, 1);
        else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
            !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
                /*
                 * Reading sequentially, and the next block is not in the
                 * cache.  We are going to try reading ahead.  If this is
                 * the first read of a file, then limit read-ahead to a
                 * single block, else read as much as we're allowed.
                 */
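                /*
                 * When more contiguous blocks are available than the
                 * current window, the window doubles, capped at
                 * MAXPHYS / size; e.g. with 8Kb blocks and a 64Kb MAXPHYS
                 * it can grow 1, 2, 4, 8 and then stays at 8 blocks.
                 */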
                if (num_ra > vp->v_ralen) {
                        num_ra = vp->v_ralen;
                        vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
                } else
                        vp->v_ralen = num_ra + 1;

                if (num_ra)                             /* case 2, 4 */
                        rbp = cluster_rbuild(vp, filesize,
                            bp, ioblkno, blkno, size, num_ra, flags);
                else if (lblkno != 0 && ioblkno == lblkno) {
                        /* Case 5: check how many blocks to read ahead */
                        ++ioblkno;
                        if ((ioblkno + 1) * size > filesize ||
                            (error = VOP_BMAP(vp,
                            ioblkno, NULL, &blkno, &num_ra)))
                                goto skip_readahead;
                        flags |= B_ASYNC;
                        if (num_ra)
                                rbp = cluster_rbuild(vp, filesize,
                                    NULL, ioblkno, blkno, size, num_ra, flags);
                        else {
                                rbp = getblk(vp, ioblkno, size, 0, 0);
                                rbp->b_flags |= flags;
                                rbp->b_blkno = blkno;
                        }
                } else if (lblkno != 0) {
                        /* case 2; read ahead single block */
                        rbp = getblk(vp, ioblkno, size, 0, 0);
                        rbp->b_flags |= flags;
                        rbp->b_blkno = blkno;
                } else if (bp)                          /* case 1, 3, block 0 */
                        bp->b_blkno = blkno;
                /* Case 1 on block 0; not really doing sequential I/O */

                if (rbp == bp)                          /* case 4 */
                        rbp = NULL;
                else if (rbp) {                         /* case 2, 5 */
                        trace(TR_BREADMISSRA,
                            pack(vp, (num_ra + 1) * size), ioblkno);
                        curproc->p_stats->p_ru.ru_inblock++;    /* XXX */
                }
        }

        /* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
        if (bp)
                if (bp->b_flags & (B_DONE | B_DELWRI))
                        panic("cluster_read: DONE bp");
                else
                        error = VOP_STRATEGY(bp);

        if (rbp)
                if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
                        rbp->b_flags &= ~(B_ASYNC | B_READ);
                        brelse(rbp);
                } else
                        (void) VOP_STRATEGY(rbp);

        if (bp)
                return(biowait(bp));
        return(error);
}
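
/*
 * Example (sketch only): one way a file system read routine might decide
 * between bread() and cluster_read().  The routine name example_fs_read
 * and the doclusterread switch are illustrative assumptions, and the
 * sketch assumes a single whole-block transfer; the lblkno, blksize, and
 * lblktosize macros are used as in the ufs code.
 */
#ifdef notdef
int doclusterread = 1;                  /* illustrative switch */

int
example_fs_read(vp, ip, uio, cred)
        struct vnode *vp;
        struct inode *ip;
        struct uio *uio;
        struct ucred *cred;
{
        struct fs *fs = ip->i_fs;
        struct buf *bp;
        daddr_t lbn, nextlbn;
        long size;
        int rasize, error;

        lbn = lblkno(fs, uio->uio_offset);
        nextlbn = lbn + 1;
        size = blksize(fs, ip, lbn);
        if (lblktosize(fs, nextlbn) >= ip->i_size)
                /* Last block of the file: nothing to read ahead. */
                error = bread(vp, lbn, size, cred, &bp);
        else if (doclusterread)
                /* Let cluster_read() handle clustering and read-ahead. */
                error = cluster_read(vp, ip->i_size, lbn, size, cred, &bp);
        else {
                /* Plain one-block read-ahead. */
                rasize = blksize(fs, ip, nextlbn);
                error = breadn(vp, lbn, size, &nextlbn, &rasize, 1,
                    cred, &bp);
        }
        if (!error)
                error = uiomove(bp->b_un.b_addr, (int)size, uio);
        brelse(bp);
        return (error);
}
#endif /* notdef */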

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
        struct vnode *vp;
        u_quad_t filesize;
        struct buf *bp;
        daddr_t lbn;
        daddr_t blkno;
        long size;
        int run;
        long flags;
{
        struct cluster_save *b_save;
        struct buf *tbp;
        daddr_t bn;
        int i, inc;

#ifdef DIAGNOSTIC
        if (size != vp->v_mount->mnt_stat.f_iosize)
                panic("cluster_rbuild: size %d != filesize %d\n",
                        size, vp->v_mount->mnt_stat.f_iosize);
#endif
        if (size * (lbn + run + 1) > filesize)
                --run;
        if (run == 0) {
                if (!bp) {
                        bp = getblk(vp, lbn, size, 0, 0);
                        bp->b_blkno = blkno;
                        bp->b_flags |= flags;
                }
                return(bp);
        }

        bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
        if (bp->b_flags & (B_DONE | B_DELWRI))
                return (bp);
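
        /*
         * Allocate the cluster_save header and the array of child buffer
         * pointers in a single chunk; the children array lives just past
         * the header (bs_children is pointed at b_save + 1 below).
         */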
        b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
            M_SEGMENT, M_WAITOK);
        b_save->bs_bufsize = b_save->bs_bcount = size;
        b_save->bs_nchildren = 0;
        b_save->bs_children = (struct buf **)(b_save + 1);
        b_save->bs_saveaddr = bp->b_saveaddr;
        bp->b_saveaddr = (caddr_t) b_save;

        inc = size / DEV_BSIZE;
        for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
                if (incore(vp, lbn + i)) {
                        if (i == 1) {
                                bp->b_saveaddr = b_save->bs_saveaddr;
                                bp->b_flags &= ~B_CALL;
                                bp->b_iodone = NULL;
                                allocbuf(bp, size);
                                free(b_save, M_SEGMENT);
                        } else
                                allocbuf(bp, size * i);
                        break;
                }
                tbp = getblk(vp, lbn + i, 0, 0, 0);
                tbp->b_bcount = tbp->b_bufsize = size;
                tbp->b_blkno = bn;
                {
                        daddr_t temp;
                        VOP_BMAP(tbp->b_vp, tbp->b_lblkno, NULL, &temp, NULL);
                        if (temp != bn) {
                                printf("Block: %d Assigned address: %x Bmap address: %x\n",
                                    tbp->b_lblkno, tbp->b_blkno, temp);
                                panic("cluster_rbuild: wrong disk address");
                        }
                }
                tbp->b_flags |= flags | B_READ | B_ASYNC;
                ++b_save->bs_nchildren;
                b_save->bs_children[i - 1] = tbp;
        }
        if (!(bp->b_flags & B_ASYNC))
                vp->v_ralen = max(vp->v_ralen - 1, 1);
        return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
        struct vnode *vp;
        struct buf *bp;
        long flags;
        daddr_t blkno;
        daddr_t lblkno;
        long size;
        int run;
{
        if (!bp) {
                bp = getblk(vp, lblkno, size, 0, 0);
                if (bp->b_flags & (B_DONE | B_DELWRI)) {
                        bp->b_blkno = blkno;
                        return(bp);
                }
        }
        allocbuf(bp, run * size);
        bp->b_blkno = blkno;
        bp->b_iodone = cluster_callback;
        bp->b_flags |= flags | B_CALL;
        return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
        struct buf *bp;
{
        struct cluster_save *b_save;
        struct buf **tbp;
        long bsize;
        caddr_t cp;
        daddr_t daddr;

        b_save = (struct cluster_save *)(bp->b_saveaddr);
        bp->b_saveaddr = b_save->bs_saveaddr;

        cp = bp->b_un.b_addr + b_save->bs_bufsize;
        daddr = bp->b_blkno + b_save->bs_bufsize / DEV_BSIZE;
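        /*
         * Give each child buffer its share of the data: move the pages
         * back from the big cluster buffer into the children in order,
         * checking that each child's disk address matches the position
         * its data occupies in the cluster.
         */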
        for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
                pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
                cp += (*tbp)->b_bufsize;
                bp->b_bufsize -= (*tbp)->b_bufsize;
                if ((*tbp)->b_blkno != daddr) {
                        struct inode *ip;
                        printf("cluster_callback: bad disk address:\n");
                        printf("Clustered Block: %d DiskAddr: %x bytes left: %d\n",
                            bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
                        printf("\toriginal size: %d flags: %x\n", bp->b_bcount,
                            bp->b_flags);
                        printf("Child Block: %d DiskAddr: %x bytes: %d\n",
                            (*tbp)->b_lblkno, (*tbp)->b_blkno,
                            (*tbp)->b_bufsize);
                        ip = VTOI((*tbp)->b_vp);
                        printf("daddr: %x i_size %qd\n", daddr, ip->i_size);
                        if ((*tbp)->b_lblkno < NDADDR)
                                printf("Child block pointer from inode: %x\n",
                                    ip->i_din.di_db[(*tbp)->b_lblkno]);
                        spl0();
                        panic("cluster_callback: bad disk address");
                }
                daddr += (*tbp)->b_bufsize / DEV_BSIZE;
                biodone(*tbp);
        }
#ifdef DIAGNOSTIC
        if (bp->b_bufsize != b_save->bs_bufsize)
                panic("cluster_callback: more space to reclaim");
#endif
        bp->b_bcount = bp->b_bufsize;
        bp->b_iodone = NULL;
        free(b_save, M_SEGMENT);
        if (bp->b_flags & B_ASYNC)
                brelse(bp);
        else
                wakeup((caddr_t)bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *      1.  Write is not sequential (write asynchronously)
 *      Write is sequential:
 *      2.  beginning of cluster - begin cluster
 *      3.  middle of a cluster - add to cluster
 *      4.  end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
        struct buf *bp;
        u_quad_t filesize;
{
        struct vnode *vp;
        daddr_t lbn;
        int clen;

        vp = bp->b_vp;
        lbn = bp->b_lblkno;

        /* Initialize vnode to beginning of file. */
        if (lbn == 0)
                vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

        if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
            (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
                if (vp->v_clen != 0)
                        /*
                         * Write is not sequential.
                         */
                        cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
                            vp->v_lastw - vp->v_cstart + 1, lbn);
                /*
                 * Consider beginning a cluster.
                 */
                if ((lbn + 1) * bp->b_bcount == filesize)
                        /* End of file, make cluster as large as possible */
                        clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
                else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
                        bawrite(bp);
                        vp->v_clen = 0;
                        vp->v_lasta = bp->b_blkno;
                        vp->v_cstart = lbn + 1;
                        vp->v_lastw = lbn;
                        return;
                } else
                        clen = 0;
                vp->v_clen = clen;
                if (clen == 0) {                /* I/O not contiguous */
                        vp->v_cstart = lbn + 1;
                        bawrite(bp);
                } else {                        /* Wait for rest of cluster */
                        vp->v_cstart = lbn;
                        bdwrite(bp);
                }
        } else if (lbn == vp->v_cstart + vp->v_clen) {
                /*
                 * At end of cluster, write it out.
                 */
                cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
                    vp->v_clen + 1, lbn);
                vp->v_clen = 0;
                vp->v_cstart = lbn + 1;
        } else
                /*
                 * In the middle of a cluster, so just delay the
                 * I/O for now.
                 */
                bdwrite(bp);
        vp->v_lastw = lbn;
        vp->v_lasta = bp->b_blkno;
}
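
/*
 * Example (sketch only): how a file system write routine might hand a
 * freshly filled block to cluster_write() instead of issuing the write
 * itself.  The routine name example_fs_write_block and the doclusterwrite
 * switch are illustrative assumptions.
 */
#ifdef notdef
int doclusterwrite = 1;                 /* illustrative switch */

void
example_fs_write_block(ip, bp, ioflag)
        struct inode *ip;
        struct buf *bp;
        int ioflag;
{
        if (ioflag & IO_SYNC)
                /* Caller wants the data on disk before returning. */
                (void) bwrite(bp);
        else if (doclusterwrite)
                /*
                 * Let cluster_write() decide whether to start, extend,
                 * or flush a cluster (cases 1-4 above).
                 */
                cluster_write(bp, ip->i_size);
        else
                /* No clustering: just a delayed write. */
                bdwrite(bp);
}
#endif /* notdef */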

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the cluster currently being built.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
        struct vnode *vp;
        struct buf *last_bp;
        long size;
        daddr_t start_lbn;
        int len;
        daddr_t lbn;
{
        struct cluster_save *b_save;
        struct buf *bp, *tbp;
        caddr_t cp;
        int i, s;

#ifdef DIAGNOSTIC
        if (size != vp->v_mount->mnt_stat.f_iosize)
                panic("cluster_wbuild: size %d != filesize %d\n",
                        size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
        while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
                ++start_lbn;
                --len;
        }

        /* Get more memory for current buffer */
        if (len <= 1) {
                if (last_bp) {
                        bawrite(last_bp);
                } else if (len) {
                        bp = getblk(vp, start_lbn, size, 0, 0);
                        bawrite(bp);
                }
                return;
        }

        bp = getblk(vp, start_lbn, size, 0, 0);
        if (!(bp->b_flags & B_DELWRI)) {
                ++start_lbn;
                --len;
                brelse(bp);
                goto redo;
        }

        --len;
        b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
            M_SEGMENT, M_WAITOK);
        b_save->bs_bcount = bp->b_bcount;
        b_save->bs_bufsize = bp->b_bufsize;
        b_save->bs_nchildren = 0;
        b_save->bs_children = (struct buf **)(b_save + 1);
        b_save->bs_saveaddr = bp->b_saveaddr;
        bp->b_saveaddr = (caddr_t) b_save;

        bp->b_flags |= B_CALL;
        bp->b_iodone = cluster_callback;
        cp = bp->b_un.b_addr + bp->b_bufsize;
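        /*
         * Gather the delayed-write buffers that follow bp, stealing their
         * pages onto the tail of the cluster, until we hit a block that
         * is missing, is not dirty, or is the block (lbn) on which I/O is
         * currently being performed.
         */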
        for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
                if (!incore(vp, start_lbn) || start_lbn == lbn)
                        break;

                if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
                        tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
                        if (tbp->b_bcount != tbp->b_bufsize)
                                panic("cluster_wbuild: Buffer too big");
#endif
                        if (!(tbp->b_flags & B_DELWRI)) {
                                brelse(tbp);
                                break;
                        }
                } else
                        tbp = last_bp;

                ++b_save->bs_nchildren;

                /* Move memory from children to parent */
                if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
                        printf("Clustered Block: %d addr %x bufsize: %d\n",
                            bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
                        printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
                            tbp->b_blkno);
                        panic("Clustered write to wrong blocks");
                }

                pagemove(tbp->b_un.b_daddr, cp, size);
                bp->b_bcount += size;
                bp->b_bufsize += size;

                tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
                tbp->b_flags |= B_ASYNC;
                s = splbio();
                reassignbuf(tbp, tbp->b_vp);            /* put on clean list */
                ++tbp->b_vp->v_numoutput;
                splx(s);
                b_save->bs_children[i] = tbp;

                cp += tbp->b_bufsize;
        }

        if (i == 0) {
                /* None to cluster */
                bp->b_saveaddr = b_save->bs_saveaddr;
                bp->b_flags &= ~B_CALL;
                bp->b_iodone = NULL;
                free(b_save, M_SEGMENT);
        }
        bawrite(bp);
        if (i < len) {
                len -= i + 1;
                start_lbn += 1;
                goto redo;
        }
}