/*
 * usr/src/sys/kern/vfs_cluster.c — clustered read/write support for the
 * buffer cache.  (This revision adds a sysctl debug variable to
 * enable/disable reallocblks.)
 */
5dc2581e 1/*-
ec54f0cc
KB
2 * Copyright (c) 1993
3 * The Regents of the University of California. All rights reserved.
da7c5cc6 4 *
5c8652bb 5 * %sccs.include.redist.c%
7188ac27 6 *
4d059b0b 7 * @(#)vfs_cluster.c 8.7 (Berkeley) %G%
da7c5cc6 8 */
961945a8 9
251f56ba
KB
10#include <sys/param.h>
11#include <sys/proc.h>
12#include <sys/buf.h>
13#include <sys/vnode.h>
251f56ba
KB
14#include <sys/mount.h>
15#include <sys/trace.h>
37392cf8 16#include <sys/malloc.h>
5c8652bb 17#include <sys/resourcevar.h>
37392cf8 18#include <libkern/libkern.h>
b88d365e
KM
19#include <ufs/ufs/quota.h>
20#include <ufs/ufs/inode.h>
37392cf8 21
4d059b0b
MH
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
/*
 * Run-time switch for block reallocation in cluster_write(); exported
 * through the sysctl debug tree as debug13 ("doreallocblks").
 */
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write: with no DEBUG sysctl, reallocation is always on */
#define doreallocblks 1
#endif
31
888c761e
MS
/*
 * Local declarations (internal helpers for the clustering code below).
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
888c761e 42
5117aa3e 43#ifdef DIAGNOSTIC
888c761e 44/*
5117aa3e
MH
45 * Set to 1 if reads of block zero should cause readahead to be done.
46 * Set to 0 treats a read of block zero as a non-sequential read.
888c761e 47 *
5117aa3e
MH
48 * Setting to one assumes that most reads of block zero of files are due to
49 * sequential passes over the files (e.g. cat, sum) where additional blocks
50 * will soon be needed. Setting to zero assumes that the majority are
51 * surgical strikes to get particular info (e.g. size, file) where readahead
52 * blocks will not be used and, in fact, push out other potentially useful
53 * blocks from the cache. The former seems intuitive, but some quick tests
54 * showed that the latter performed better from a system-wide point of view.
55 */
56int doclusterraz = 0;
57#define ISSEQREAD(vp, blk) \
58 (((blk) != 0 || doclusterraz) && \
59 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
60#else
61#define ISSEQREAD(vp, blk) \
62 ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
63#endif
64
65/*
888c761e
MS
66 * This replaces bread. If this is a bread at the beginning of a file and
67 * lastr is 0, we assume this is the first read and we'll read up to two
68 * blocks if they are sequential. After that, we'll do regular read ahead
69 * in clustered chunks.
70 *
71 * There are 4 or 5 cases depending on how you count:
72 * Desired block is in the cache:
73 * 1 Not sequential access (0 I/Os).
74 * 2 Access is sequential, do read-ahead (1 ASYNC).
75 * Desired block is not in cache:
76 * 3 Not sequential access (1 SYNC).
77 * 4 Sequential access, next block is contiguous (1 SYNC).
78 * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
79 *
80 * There are potentially two buffers that require I/O.
81 * bp is the block requested.
82 * rbp is the read-ahead block.
83 * If either is NULL, then you don't have to do the I/O.
84 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	/* Always hand the caller's block back through *bpp. */
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		/* Readahead starts one window (v_ralen) past the hit. */
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;		/* no I/O needed for the hit itself */
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		/* Non-sequential access: collapse the readahead window. */
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 * num_ra (from VOP_BMAP) is the number of contiguous
		 * blocks following ioblkno on disk.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
					 lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
					    min(num_ra,vp->v_ralen<<1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			/* Readahead I/O is always asynchronous. */
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)		/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	/* Start the synchronous read of the requested block, if any. */
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	/* Start (or abandon, on earlier error) the readahead I/O. */
	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		/* Last block covered by the I/O just issued. */
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}
245
246/*
247 * If blocks are contiguous on disk, use this to provide clustered
248 * read ahead. We will read as many blocks as possible sequentially
249 * and then parcel them up into logical blocks in the buffer hash table.
250 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/* Clip the run so the cluster does not extend past end of file. */
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		/* Nothing to cluster; fall back to a single-block read. */
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	/*
	 * b_save records the component ("child") buffers so that
	 * cluster_callback can parcel the data out after the I/O.
	 */
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);		/* disk blocks per fs block */
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			/*
			 * Hit a cached block: stop clustering here and
			 * shrink the big buffer back to what we will use.
			 */
			if (i == 1) {
				/* No children at all; undo the callback setup. */
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		/*
		 * NOTE(review): sanity check that VOP_BMAP agrees with the
		 * address we computed by stepping bn; looks like diagnostic
		 * code that may belong under #ifdef DIAGNOSTIC — confirm
		 * against the mainline source before removing.
		 */
		{
		daddr_t temp;
		VOP_BMAP(tbp->b_vp, tbp->b_lblkno, NULL, &temp, NULL);
		if (temp != bn) {
			printf("Block: %d Assigned address: %x Bmap address: %x\n",
			    tbp->b_lblkno, tbp->b_blkno, temp);
			panic("cluster_rbuild: wrong disk address");
		}
		}
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}
354
355/*
356 * Either get a new buffer or grow the existing one.
357 */
358struct buf *
359cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
360 struct vnode *vp;
361 struct buf *bp;
362 long flags;
363 daddr_t blkno;
364 daddr_t lblkno;
365 long size;
366 int run;
367{
368 if (!bp) {
e140149a 369 bp = getblk(vp, lblkno, size, 0, 0);
888c761e
MS
370 if (bp->b_flags & (B_DONE | B_DELWRI)) {
371 bp->b_blkno = blkno;
372 return(bp);
373 }
374 }
375 allocbuf(bp, run * size);
376 bp->b_blkno = blkno;
377 bp->b_iodone = cluster_callback;
378 bp->b_flags |= flags | B_CALL;
379 return(bp);
380}
381
382/*
383 * Cleanup after a clustered read or write.
5117aa3e
MH
384 * This is complicated by the fact that any of the buffers might have
385 * extra memory (if there were no empty buffer headers at allocbuf time)
386 * that we will need to shift around.
888c761e
MS
387 */
388void
389cluster_callback(bp)
390 struct buf *bp;
391{
392 struct cluster_save *b_save;
5117aa3e
MH
393 struct buf **bpp, *tbp;
394 long bsize;
888c761e 395 caddr_t cp;
5117aa3e
MH
396 int error = 0;
397
398 /*
399 * Must propogate errors to all the components.
400 */
401 if (bp->b_flags & B_ERROR)
402 error = bp->b_error;
8165a40b 403
b88d365e 404 daddr_t daddr;
888c761e
MS
405 b_save = (struct cluster_save *)(bp->b_saveaddr);
406 bp->b_saveaddr = b_save->bs_saveaddr;
407
5117aa3e
MH
408 bsize = b_save->bs_bufsize;
409 cp = (char *)bp->b_data + bsize;
410 /*
411 * Move memory from the large cluster buffer into the component
412 * buffers and mark IO as done on these.
413 */
414 for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
415 tbp = *bpp;
416 pagemove(cp, tbp->b_data, bsize);
417 tbp->b_bufsize += bsize;
418 tbp->b_bcount = bsize;
419 if (error) {
420 tbp->b_flags |= B_ERROR;
421 tbp->b_error = error;
b88d365e 422 }
5117aa3e
MH
423 biodone(tbp);
424 bp->b_bufsize -= bsize;
425 cp += bsize;
888c761e 426 }
5117aa3e
MH
427 /*
428 * If there was excess memory in the cluster buffer,
429 * slide it up adjacent to the remaining valid data.
430 */
431 if (bp->b_bufsize != bsize) {
432 if (bp->b_bufsize < bsize)
433 panic("cluster_callback: too little memory");
434 pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
435 }
436 bp->b_bcount = bsize;
888c761e
MS
437 bp->b_iodone = NULL;
438 free(b_save, M_SEGMENT);
439 if (bp->b_flags & B_ASYNC)
440 brelse(bp);
5117aa3e
MH
441 else {
442 bp->b_flags &= ~B_WANTED;
888c761e 443 wakeup((caddr_t)bp);
5117aa3e 444 }
888c761e
MS
445}
446
888c761e
MS
447/*
448 * Do clustered write for FFS.
449 *
450 * Three cases:
451 * 1. Write is not sequential (write asynchronously)
452 * Write is sequential:
453 * 2. beginning of cluster - begin cluster
454 * 3. middle of a cluster - add to cluster
455 * 4. end of a cluster - asynchronously write cluster
456 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	/*
	 * Not building a cluster, or the write is not logically AND
	 * physically sequential with the previous one: case 1/2.
	 */
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 * Note: *endbp (the current bp) is
					 * intentionally not released here.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			/* Can't map the block; just push it asynchronously. */
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.  Case 4.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.  Case 3.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
563
564
565/*
566 * This is an awful lot like cluster_rbuild...wish they could be combined.
567 * The last lbn argument is the current block on which I/O is being
568 * performed. Check to see that it doesn't fall in the middle of
5117aa3e 569 * the current block (if last_bp == NULL).
888c761e
MS
570 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	/* Skip blocks not in core (and the current I/O block itself). */
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		/* Too few blocks left to cluster; write singly and stop. */
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		/* Not a delayed write; can't lead a cluster. */
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	/* Child list handed to cluster_callback via b_saveaddr. */
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explictly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		/* Child's pages now belong to the cluster buffer. */
		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		/* Cluster terminated early; restart past the break point. */
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}
aa258290
KM
707
708/*
709 * Collect together all the buffers in a cluster.
710 * Plus add one additional buffer.
711 */
712struct cluster_save *
713cluster_collectbufs(vp, last_bp)
714 struct vnode *vp;
715 struct buf *last_bp;
716{
717 struct cluster_save *buflist;
718 daddr_t lbn;
719 int i, len;
720
721 len = vp->v_lastw - vp->v_cstart + 1;
722 buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
723 M_SEGMENT, M_WAITOK);
724 buflist->bs_nchildren = 0;
725 buflist->bs_children = (struct buf **)(buflist + 1);
726 for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
727 (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
728 &buflist->bs_children[i]);
729 buflist->bs_children[i] = last_bp;
730 buflist->bs_nchildren = i + 1;
731 return (buflist);
732}