Commit | Line | Data |
---|---|---|
5dc2581e | 1 | /*- |
ec54f0cc KB |
2 | * Copyright (c) 1993 |
3 | * The Regents of the University of California. All rights reserved. | |
da7c5cc6 | 4 | * |
5c8652bb | 5 | * %sccs.include.redist.c% |
7188ac27 | 6 | * |
4d059b0b | 7 | * @(#)vfs_cluster.c 8.7 (Berkeley) %G% |
da7c5cc6 | 8 | */ |
961945a8 | 9 | |
251f56ba KB |
10 | #include <sys/param.h> |
11 | #include <sys/proc.h> | |
12 | #include <sys/buf.h> | |
13 | #include <sys/vnode.h> | |
251f56ba KB |
14 | #include <sys/mount.h> |
15 | #include <sys/trace.h> | |
37392cf8 | 16 | #include <sys/malloc.h> |
5c8652bb | 17 | #include <sys/resourcevar.h> |
37392cf8 | 18 | #include <libkern/libkern.h> |
b88d365e KM |
19 | #include <ufs/ufs/quota.h> |
20 | #include <ufs/ufs/inode.h> | |
37392cf8 | 21 | |
4d059b0b MH |
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
/* Debug knob: allow cluster_write to attempt block reallocation. */
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif
31 | ||
888c761e MS |
32 | /* |
33 | * Local declarations | |
34 | */ | |
35 | struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, | |
36 | daddr_t, long, int)); | |
37 | struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, | |
38 | daddr_t, daddr_t, long, int, long)); | |
8165a40b KM |
39 | void cluster_wbuild __P((struct vnode *, struct buf *, long, |
40 | daddr_t, int, daddr_t)); | |
aa258290 | 41 | struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); |
888c761e | 42 | |
#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
/* True if blk continues the sequential run recorded in vp->v_lastr. */
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
64 | ||
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 * If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;	/* ioblkno: logical block where I/O starts */
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			/* Never read ahead past the current window. */
			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
					 lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
						min(num_ra,vp->v_ralen<<1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			/* The readahead I/O itself is always asynchronous. */
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	/* On error or a buffer already valid, drop the readahead buffer. */
	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}
245 | ||
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/* Trim the run so the cluster never extends past end of file. */
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		/* Nothing to cluster; return (or build) the single buffer. */
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	/* The bs_children array is carved out of the same allocation. */
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				/* First readahead block already cached:
				 * undo the cluster setup entirely. */
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		/*
		 * NOTE(review): this VOP_BMAP cross-check looks like debug
		 * code — presumably meant to be under DIAGNOSTIC; confirm.
		 */
		{
			daddr_t temp;
			VOP_BMAP(tbp->b_vp, tbp->b_lblkno, NULL, &temp, NULL);
			if (temp != bn) {
				printf("Block: %d Assigned address: %x Bmap address: %x\n",
					tbp->b_lblkno, tbp->b_blkno, temp);
				panic("cluster_rbuild: wrong disk address");
			}
		}
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}
354 | ||
355 | /* | |
356 | * Either get a new buffer or grow the existing one. | |
357 | */ | |
358 | struct buf * | |
359 | cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) | |
360 | struct vnode *vp; | |
361 | struct buf *bp; | |
362 | long flags; | |
363 | daddr_t blkno; | |
364 | daddr_t lblkno; | |
365 | long size; | |
366 | int run; | |
367 | { | |
368 | if (!bp) { | |
e140149a | 369 | bp = getblk(vp, lblkno, size, 0, 0); |
888c761e MS |
370 | if (bp->b_flags & (B_DONE | B_DELWRI)) { |
371 | bp->b_blkno = blkno; | |
372 | return(bp); | |
373 | } | |
374 | } | |
375 | allocbuf(bp, run * size); | |
376 | bp->b_blkno = blkno; | |
377 | bp->b_iodone = cluster_callback; | |
378 | bp->b_flags |= flags | B_CALL; | |
379 | return(bp); | |
380 | } | |
381 | ||
382 | /* | |
383 | * Cleanup after a clustered read or write. | |
5117aa3e MH |
384 | * This is complicated by the fact that any of the buffers might have |
385 | * extra memory (if there were no empty buffer headers at allocbuf time) | |
386 | * that we will need to shift around. | |
888c761e MS |
387 | */ |
388 | void | |
389 | cluster_callback(bp) | |
390 | struct buf *bp; | |
391 | { | |
392 | struct cluster_save *b_save; | |
5117aa3e MH |
393 | struct buf **bpp, *tbp; |
394 | long bsize; | |
888c761e | 395 | caddr_t cp; |
5117aa3e MH |
396 | int error = 0; |
397 | ||
398 | /* | |
399 | * Must propogate errors to all the components. | |
400 | */ | |
401 | if (bp->b_flags & B_ERROR) | |
402 | error = bp->b_error; | |
8165a40b | 403 | |
b88d365e | 404 | daddr_t daddr; |
888c761e MS |
405 | b_save = (struct cluster_save *)(bp->b_saveaddr); |
406 | bp->b_saveaddr = b_save->bs_saveaddr; | |
407 | ||
5117aa3e MH |
408 | bsize = b_save->bs_bufsize; |
409 | cp = (char *)bp->b_data + bsize; | |
410 | /* | |
411 | * Move memory from the large cluster buffer into the component | |
412 | * buffers and mark IO as done on these. | |
413 | */ | |
414 | for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { | |
415 | tbp = *bpp; | |
416 | pagemove(cp, tbp->b_data, bsize); | |
417 | tbp->b_bufsize += bsize; | |
418 | tbp->b_bcount = bsize; | |
419 | if (error) { | |
420 | tbp->b_flags |= B_ERROR; | |
421 | tbp->b_error = error; | |
b88d365e | 422 | } |
5117aa3e MH |
423 | biodone(tbp); |
424 | bp->b_bufsize -= bsize; | |
425 | cp += bsize; | |
888c761e | 426 | } |
5117aa3e MH |
427 | /* |
428 | * If there was excess memory in the cluster buffer, | |
429 | * slide it up adjacent to the remaining valid data. | |
430 | */ | |
431 | if (bp->b_bufsize != bsize) { | |
432 | if (bp->b_bufsize < bsize) | |
433 | panic("cluster_callback: too little memory"); | |
434 | pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize); | |
435 | } | |
436 | bp->b_bcount = bsize; | |
888c761e MS |
437 | bp->b_iodone = NULL; |
438 | free(b_save, M_SEGMENT); | |
439 | if (bp->b_flags & B_ASYNC) | |
440 | brelse(bp); | |
5117aa3e MH |
441 | else { |
442 | bp->b_flags &= ~B_WANTED; | |
888c761e | 443 | wakeup((caddr_t)bp); |
5117aa3e | 444 | } |
888c761e MS |
445 | } |
446 | ||
/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	/*
	 * Not part of the current cluster: either no cluster is open,
	 * the logical block is not the next one, or the physical block
	 * is not contiguous with the last write.
	 */
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			/* BMAP failed or hole: just write it out now. */
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
563 | ||
564 | ||
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	/* Skip blocks not in core (and lbn itself when unaccompanied). */
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		/* Too short to cluster; write whatever single buffer is left. */
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		/* Not dirty — nothing to write from here; try the next one. */
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	/* bs_children is carved out of the same allocation as b_save. */
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explictly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		/* Child no longer owns the pages and is no longer dirty. */
		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	/* Cluster ended early: continue with the remaining range. */
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}
aa258290 KM |
707 | |
708 | /* | |
709 | * Collect together all the buffers in a cluster. | |
710 | * Plus add one additional buffer. | |
711 | */ | |
712 | struct cluster_save * | |
713 | cluster_collectbufs(vp, last_bp) | |
714 | struct vnode *vp; | |
715 | struct buf *last_bp; | |
716 | { | |
717 | struct cluster_save *buflist; | |
718 | daddr_t lbn; | |
719 | int i, len; | |
720 | ||
721 | len = vp->v_lastw - vp->v_cstart + 1; | |
722 | buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), | |
723 | M_SEGMENT, M_WAITOK); | |
724 | buflist->bs_nchildren = 0; | |
725 | buflist->bs_children = (struct buf **)(buflist + 1); | |
726 | for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) | |
727 | (void)bread(vp, lbn, last_bp->b_bcount, NOCRED, | |
728 | &buflist->bs_children[i]); | |
729 | buflist->bs_children[i] = last_bp; | |
730 | buflist->bs_nchildren = i + 1; | |
731 | return (buflist); | |
732 | } |