Commit | Line | Data |
---|---|---|
5dc2581e KB |
1 | /*- |
2 | * Copyright (c) 1982, 1986, 1989 The Regents of the University of California. | |
7188ac27 | 3 | * All rights reserved. |
da7c5cc6 | 4 | * |
217c3be4 KM |
5 | * This module is believed to contain source code proprietary to AT&T. |
6 | * Use and redistribution is subject to the Berkeley Software License | |
7 | * Agreement and your Software Agreement with AT&T (Western Electric). | |
7188ac27 | 8 | * |
e140149a | 9 | * @(#)vfs_cluster.c 7.58 (Berkeley) %G% |
da7c5cc6 | 10 | */ |
961945a8 | 11 | |
251f56ba KB |
12 | #include <sys/param.h> |
13 | #include <sys/proc.h> | |
14 | #include <sys/buf.h> | |
15 | #include <sys/vnode.h> | |
251f56ba KB |
16 | #include <sys/mount.h> |
17 | #include <sys/trace.h> | |
18 | #include <sys/resourcevar.h> | |
37392cf8 KM |
19 | #include <sys/malloc.h> |
20 | #include <libkern/libkern.h> | |
21 | ||
22 | /* | |
23 | * Definitions for the buffer hash lists. | |
24 | */ | |
25 | #define BUFHASH(dvp, lbn) \ | |
26 | (&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) | |
e3249ec0 | 27 | struct list_entry *bufhashtbl, invalhash; |
37392cf8 KM |
28 | u_long bufhash; |
29 | ||
30 | /* | |
31 | * Insq/Remq for the buffer hash lists. | |
32 | */ | |
e3249ec0 KM |
33 | #define binshash(bp, dp) list_enter_head(dp, bp, struct buf *, b_hash) |
34 | #define bremhash(bp) list_remove(bp, struct buf *, b_hash) | |
37392cf8 KM |
35 | |
36 | /* | |
37 | * Definitions for the buffer free lists. | |
38 | */ | |
39 | #define BQUEUES 4 /* number of free buffer queues */ | |
40 | ||
41 | #define BQ_LOCKED 0 /* super-blocks &c */ | |
42 | #define BQ_LRU 1 /* lru, useful buffers */ | |
43 | #define BQ_AGE 2 /* rubbish */ | |
44 | #define BQ_EMPTY 3 /* buffer headers with no memory */ | |
45 | ||
e3249ec0 | 46 | struct queue_entry bufqueues[BQUEUES]; |
37392cf8 KM |
47 | int needbuffer; |
48 | ||
49 | /* | |
50 | * Insq/Remq for the buffer free lists. | |
51 | */ | |
e3249ec0 KM |
52 | #define binsheadfree(bp, dp) \ |
53 | queue_enter_head(dp, bp, struct buf *, b_freelist) | |
54 | #define binstailfree(bp, dp) \ | |
55 | queue_enter_tail(dp, bp, struct buf *, b_freelist) | |
56 | ||
888c761e MS |
57 | /* |
58 | * Local declarations | |
59 | */ | |
60 | struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, | |
61 | daddr_t, long, int)); | |
62 | struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, | |
63 | daddr_t, daddr_t, long, int, long)); | |
64 | void cluster_wbuild __P((struct vnode *, struct buf *, long size, | |
65 | daddr_t start_lbn, int len, daddr_t lbn)); | |
66 | ||
37392cf8 KM |
67 | void |
68 | bremfree(bp) | |
69 | struct buf *bp; | |
70 | { | |
e3249ec0 | 71 | struct queue_entry *dp; |
37392cf8 | 72 | |
e3249ec0 KM |
73 | /* |
74 | * We only calculate the head of the freelist when removing | |
75 | * the last element of the list as that is the only time that | |
76 | * it is needed (e.g. to reset the tail pointer). | |
77 | */ | |
78 | if (bp->b_freelist.qe_next == NULL) { | |
37392cf8 | 79 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) |
e3249ec0 | 80 | if (dp->qe_prev == &bp->b_freelist.qe_next) |
37392cf8 KM |
81 | break; |
82 | if (dp == &bufqueues[BQUEUES]) | |
83 | panic("bremfree: lost tail"); | |
37392cf8 | 84 | } |
e3249ec0 | 85 | queue_remove(dp, bp, struct buf *, b_freelist); |
37392cf8 | 86 | } |
663dbc72 | 87 | |
e7db227e MK |
88 | /* |
89 | * Initialize buffers and hash links for buffers. | |
90 | */ | |
251f56ba | 91 | void |
e7db227e MK |
92 | bufinit() |
93 | { | |
37392cf8 | 94 | register struct buf *bp; |
e3249ec0 | 95 | struct queue_entry *dp; |
e7db227e | 96 | register int i; |
e7db227e MK |
97 | int base, residual; |
98 | ||
37392cf8 | 99 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) |
e3249ec0 KM |
100 | queue_init(dp); |
101 | bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash); | |
e7db227e MK |
102 | base = bufpages / nbuf; |
103 | residual = bufpages % nbuf; | |
104 | for (i = 0; i < nbuf; i++) { | |
105 | bp = &buf[i]; | |
37392cf8 | 106 | bzero((char *)bp, sizeof *bp); |
e7db227e | 107 | bp->b_dev = NODEV; |
e7db227e MK |
108 | bp->b_rcred = NOCRED; |
109 | bp->b_wcred = NOCRED; | |
e7db227e MK |
110 | bp->b_un.b_addr = buffers + i * MAXBSIZE; |
111 | if (i < residual) | |
112 | bp->b_bufsize = (base + 1) * CLBYTES; | |
113 | else | |
114 | bp->b_bufsize = base * CLBYTES; | |
31222d0d | 115 | bp->b_flags = B_INVAL; |
37392cf8 | 116 | dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY]; |
31222d0d | 117 | binsheadfree(bp, dp); |
37392cf8 | 118 | binshash(bp, &invalhash); |
e7db227e MK |
119 | } |
120 | } | |
121 | ||
663dbc72 | 122 | /* |
d42a4811 KM |
123 | * Find the block in the buffer pool. |
124 | * If the buffer is not present, allocate a new buffer and load | |
125 | * its contents according to the filesystem fill routine. | |
663dbc72 | 126 | */ |
a937f856 | 127 | bread(vp, blkno, size, cred, bpp) |
7188ac27 | 128 | struct vnode *vp; |
ad30fb67 KM |
129 | daddr_t blkno; |
130 | int size; | |
a937f856 | 131 | struct ucred *cred; |
7188ac27 | 132 | struct buf **bpp; |
ec67a3ce MK |
133 | #ifdef SECSIZE |
134 | long secsize; | |
135 | #endif SECSIZE | |
663dbc72 | 136 | { |
3789a403 | 137 | struct proc *p = curproc; /* XXX */ |
663dbc72 BJ |
138 | register struct buf *bp; |
139 | ||
4f083fd7 SL |
140 | if (size == 0) |
141 | panic("bread: size 0"); | |
ec67a3ce MK |
142 | #ifdef SECSIZE |
143 | bp = getblk(dev, blkno, size, secsize); | |
144 | #else SECSIZE | |
e140149a | 145 | *bpp = bp = getblk(vp, blkno, size, 0, 0); |
ec67a3ce | 146 | #endif SECSIZE |
d42a4811 | 147 | if (bp->b_flags & (B_DONE | B_DELWRI)) { |
c5a600cf | 148 | trace(TR_BREADHIT, pack(vp, size), blkno); |
7188ac27 | 149 | return (0); |
663dbc72 BJ |
150 | } |
151 | bp->b_flags |= B_READ; | |
4f083fd7 SL |
152 | if (bp->b_bcount > bp->b_bufsize) |
153 | panic("bread"); | |
a937f856 KM |
154 | if (bp->b_rcred == NOCRED && cred != NOCRED) { |
155 | crhold(cred); | |
156 | bp->b_rcred = cred; | |
157 | } | |
7188ac27 | 158 | VOP_STRATEGY(bp); |
c5a600cf | 159 | trace(TR_BREADMISS, pack(vp, size), blkno); |
3789a403 | 160 | p->p_stats->p_ru.ru_inblock++; /* pay for read */ |
7188ac27 | 161 | return (biowait(bp)); |
663dbc72 BJ |
162 | } |
163 | ||
164 | /* | |
bb1626f7 KM |
165 | * Operates like bread, but also starts I/O on the N specified |
166 | * read-ahead blocks. | |
663dbc72 | 167 | */ |
bb1626f7 | 168 | breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp) |
7188ac27 | 169 | struct vnode *vp; |
84baaab3 | 170 | daddr_t blkno; int size; |
ec67a3ce MK |
171 | #ifdef SECSIZE |
172 | long secsize; | |
173 | #endif SECSIZE | |
bb1626f7 KM |
174 | daddr_t rablkno[]; int rabsize[]; |
175 | int num; | |
a937f856 | 176 | struct ucred *cred; |
7188ac27 | 177 | struct buf **bpp; |
663dbc72 | 178 | { |
3789a403 | 179 | struct proc *p = curproc; /* XXX */ |
663dbc72 | 180 | register struct buf *bp, *rabp; |
bb1626f7 | 181 | register int i; |
663dbc72 BJ |
182 | |
183 | bp = NULL; | |
3efdd860 | 184 | /* |
d42a4811 KM |
185 | * If the block is not memory resident, |
186 | * allocate a buffer and start I/O. | |
3efdd860 | 187 | */ |
7188ac27 | 188 | if (!incore(vp, blkno)) { |
e140149a | 189 | *bpp = bp = getblk(vp, blkno, size, 0, 0); |
ec67a3ce | 190 | #endif SECSIZE |
d42a4811 | 191 | if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { |
663dbc72 | 192 | bp->b_flags |= B_READ; |
4f083fd7 | 193 | if (bp->b_bcount > bp->b_bufsize) |
bb1626f7 | 194 | panic("breadn"); |
a937f856 KM |
195 | if (bp->b_rcred == NOCRED && cred != NOCRED) { |
196 | crhold(cred); | |
197 | bp->b_rcred = cred; | |
198 | } | |
7188ac27 | 199 | VOP_STRATEGY(bp); |
c5a600cf | 200 | trace(TR_BREADMISS, pack(vp, size), blkno); |
3789a403 | 201 | p->p_stats->p_ru.ru_inblock++; /* pay for read */ |
7d1e9cf4 | 202 | } else { |
c5a600cf | 203 | trace(TR_BREADHIT, pack(vp, size), blkno); |
7d1e9cf4 | 204 | } |
663dbc72 | 205 | } |
3efdd860 KM |
206 | |
207 | /* | |
bb1626f7 KM |
208 | * If there's read-ahead block(s), start I/O |
209 | * on them also (as above). | |
3efdd860 | 210 | */ |
bb1626f7 KM |
211 | for (i = 0; i < num; i++) { |
212 | if (incore(vp, rablkno[i])) | |
213 | continue; | |
e140149a | 214 | rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0); |
ec67a3ce | 215 | #endif SECSIZE |
d42a4811 | 216 | if (rabp->b_flags & (B_DONE | B_DELWRI)) { |
663dbc72 | 217 | brelse(rabp); |
bb1626f7 | 218 | trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]); |
973ecc4f | 219 | } else { |
d42a4811 | 220 | rabp->b_flags |= B_ASYNC | B_READ; |
4f083fd7 SL |
221 | if (rabp->b_bcount > rabp->b_bufsize) |
222 | panic("breadrabp"); | |
5062ac4a | 223 | if (rabp->b_rcred == NOCRED && cred != NOCRED) { |
a937f856 | 224 | crhold(cred); |
5062ac4a | 225 | rabp->b_rcred = cred; |
a937f856 | 226 | } |
7188ac27 | 227 | VOP_STRATEGY(rabp); |
bb1626f7 | 228 | trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]); |
3789a403 | 229 | p->p_stats->p_ru.ru_inblock++; /* pay in advance */ |
663dbc72 BJ |
230 | } |
231 | } | |
3efdd860 KM |
232 | |
233 | /* | |
d42a4811 KM |
234 | * If block was memory resident, let bread get it. |
235 | * If block was not memory resident, the read was | |
236 | * started above, so just wait for the read to complete. | |
3efdd860 | 237 | */ |
84baaab3 | 238 | if (bp == NULL) |
ec67a3ce MK |
239 | #ifdef SECSIZE |
240 | return (bread(dev, blkno, size, secsize)); | |
241 | #else SECSIZE | |
a937f856 | 242 | return (bread(vp, blkno, size, cred, bpp)); |
7188ac27 | 243 | return (biowait(bp)); |
663dbc72 BJ |
244 | } |
245 | ||
888c761e MS |
246 | /* |
247 | * We could optimize this by keeping track of where the last read-ahead | |
248 | * was, but it would involve adding fields to the vnode. For now, let's | |
249 | * just get it working. | |
250 | * | |
251 | * This replaces bread. If this is a bread at the beginning of a file and | |
252 | * lastr is 0, we assume this is the first read and we'll read up to two | |
253 | * blocks if they are sequential. After that, we'll do regular read ahead | |
254 | * in clustered chunks. | |
255 | * | |
256 | * There are 4 or 5 cases depending on how you count: | |
257 | * Desired block is in the cache: | |
258 | * 1 Not sequential access (0 I/Os). | |
259 | * 2 Access is sequential, do read-ahead (1 ASYNC). | |
260 | * Desired block is not in cache: | |
261 | * 3 Not sequential access (1 SYNC). | |
262 | * 4 Sequential access, next block is contiguous (1 SYNC). | |
263 | * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) | |
264 | * | |
265 | * There are potentially two buffers that require I/O. | |
266 | * bp is the block requested. | |
267 | * rbp is the read-ahead block. | |
268 | * If either is NULL, then you don't have to do the I/O. | |
269 | */ | |
270 | cluster_read(vp, filesize, lblkno, size, cred, bpp) | |
271 | struct vnode *vp; | |
272 | u_quad_t filesize; | |
273 | daddr_t lblkno; | |
274 | long size; | |
275 | struct ucred *cred; | |
276 | struct buf **bpp; | |
277 | { | |
278 | struct buf *bp, *rbp; | |
279 | daddr_t blkno, ioblkno; | |
280 | long flags; | |
281 | int error, num_ra, alreadyincore; | |
282 | ||
283 | #ifdef DIAGNOSTIC | |
284 | if (size == 0) | |
285 | panic("cluster_read: size = 0"); | |
286 | #endif | |
287 | ||
288 | error = 0; | |
289 | flags = B_READ; | |
e140149a | 290 | *bpp = bp = getblk(vp, lblkno, size, 0, 0); |
888c761e MS |
291 | if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) { |
292 | /* | |
293 | * Desired block is in cache; do any readahead ASYNC. | |
294 | * Case 1, 2. | |
295 | */ | |
296 | trace(TR_BREADHIT, pack(vp, size), lblkno); | |
297 | flags |= B_ASYNC; | |
298 | ioblkno = lblkno + | |
299 | (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen); | |
e140149a | 300 | alreadyincore = (int)incore(vp, ioblkno); |
888c761e MS |
301 | bp = NULL; |
302 | } else { | |
303 | /* Block wasn't in cache, case 3, 4, 5. */ | |
304 | trace(TR_BREADMISS, pack(vp, size), lblkno); | |
305 | ioblkno = lblkno; | |
306 | bp->b_flags |= flags; | |
307 | alreadyincore = 0; | |
308 | curproc->p_stats->p_ru.ru_inblock++; /* XXX */ | |
309 | } | |
310 | /* | |
311 | * XXX | |
312 | * Replace 1 with a window size based on some permutation of | |
313 | * maxcontig and rot_delay. This will let you figure out how | |
314 | * many blocks you should read-ahead (case 2, 4, 5). | |
315 | * | |
316 | * If the access isn't sequential, cut the window size in half. | |
317 | */ | |
318 | rbp = NULL; | |
319 | if (lblkno != vp->v_lastr + 1 && lblkno != 0) | |
320 | vp->v_ralen = max(vp->v_ralen >> 1, 1); | |
321 | else if ((ioblkno + 1) * size < filesize && !alreadyincore && | |
322 | !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) { | |
323 | /* | |
324 | * Reading sequentially, and the next block is not in the | |
325 | * cache. We are going to try reading ahead. If this is | |
326 | * the first read of a file, then limit read-ahead to a | |
327 | * single block, else read as much as we're allowed. | |
328 | */ | |
329 | if (num_ra > vp->v_ralen) { | |
330 | num_ra = vp->v_ralen; | |
331 | vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1); | |
332 | } else | |
333 | vp->v_ralen = num_ra + 1; | |
334 | ||
335 | ||
336 | if (num_ra) /* case 2, 4 */ | |
337 | rbp = cluster_rbuild(vp, filesize, | |
338 | bp, ioblkno, blkno, size, num_ra, flags); | |
339 | else if (lblkno != 0 && ioblkno == lblkno) { | |
340 | /* Case 5: check how many blocks to read ahead */ | |
341 | ++ioblkno; | |
342 | if ((ioblkno + 1) * size > filesize || | |
343 | (error = VOP_BMAP(vp, | |
344 | ioblkno, NULL, &blkno, &num_ra))) | |
345 | goto skip_readahead; | |
346 | flags |= B_ASYNC; | |
347 | if (num_ra) | |
348 | rbp = cluster_rbuild(vp, filesize, | |
349 | NULL, ioblkno, blkno, size, num_ra, flags); | |
350 | else { | |
e140149a | 351 | rbp = getblk(vp, ioblkno, size, 0, 0); |
888c761e MS |
352 | rbp->b_flags |= flags; |
353 | rbp->b_blkno = blkno; | |
354 | } | |
355 | } else if (lblkno != 0) { | |
356 | /* case 2; read ahead single block */ | |
e140149a | 357 | rbp = getblk(vp, ioblkno, size, 0, 0); |
888c761e MS |
358 | rbp->b_flags |= flags; |
359 | rbp->b_blkno = blkno; | |
360 | } else if (bp) /* case 1, 3, block 0 */ | |
361 | bp->b_blkno = blkno; | |
362 | /* Case 1 on block 0; not really doing sequential I/O */ | |
363 | ||
364 | if (rbp == bp) /* case 4 */ | |
365 | rbp = NULL; | |
366 | else if (rbp) { /* case 2, 5 */ | |
367 | trace(TR_BREADMISSRA, | |
368 | pack(vp, (num_ra + 1) * size), ioblkno); | |
369 | curproc->p_stats->p_ru.ru_inblock++; /* XXX */ | |
370 | } | |
371 | } | |
372 | ||
373 | /* XXX Kirk, do we need to make sure the bp has creds? */ | |
374 | skip_readahead: | |
375 | if (bp) | |
376 | if (bp->b_flags & (B_DONE | B_DELWRI)) | |
377 | panic("cluster_read: DONE bp"); | |
378 | else | |
379 | error = VOP_STRATEGY(bp); | |
380 | ||
381 | if (rbp) | |
382 | if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { | |
383 | rbp->b_flags &= ~(B_ASYNC | B_READ); | |
384 | brelse(rbp); | |
385 | } else | |
386 | (void) VOP_STRATEGY(rbp); | |
387 | ||
388 | if (bp) | |
389 | return(biowait(bp)); | |
390 | return(error); | |
391 | } | |
392 | ||
393 | /* | |
394 | * If blocks are contiguous on disk, use this to provide clustered | |
395 | * read ahead. We will read as many blocks as possible sequentially | |
396 | * and then parcel them up into logical blocks in the buffer hash table. | |
397 | */ | |
398 | struct buf * | |
399 | cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) | |
400 | struct vnode *vp; | |
401 | u_quad_t filesize; | |
402 | struct buf *bp; | |
403 | daddr_t lbn; | |
404 | daddr_t blkno; | |
405 | long size; | |
406 | int run; | |
407 | long flags; | |
408 | { | |
409 | struct cluster_save *b_save; | |
410 | struct buf *tbp; | |
411 | daddr_t bn; | |
412 | int i, inc; | |
413 | ||
414 | if (size * (lbn + run + 1) > filesize) | |
415 | --run; | |
416 | if (run == 0) { | |
417 | if (!bp) { | |
e140149a | 418 | bp = getblk(vp, lbn, size, 0, 0); |
888c761e MS |
419 | bp->b_blkno = blkno; |
420 | bp->b_flags |= flags; | |
421 | } | |
422 | return(bp); | |
423 | } | |
424 | ||
425 | bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); | |
426 | if (bp->b_flags & (B_DONE | B_DELWRI)) | |
427 | return (bp); | |
428 | ||
429 | b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), | |
430 | M_SEGMENT, M_WAITOK); | |
431 | b_save->bs_bufsize = b_save->bs_bcount = size; | |
432 | b_save->bs_nchildren = 0; | |
433 | b_save->bs_children = (struct buf **)(b_save + 1); | |
434 | b_save->bs_saveaddr = bp->b_saveaddr; | |
435 | bp->b_saveaddr = (caddr_t) b_save; | |
436 | ||
437 | inc = size / DEV_BSIZE; | |
438 | for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { | |
439 | if (incore(vp, lbn + i)) { | |
440 | if (i == 1) { | |
441 | bp->b_saveaddr = b_save->bs_saveaddr; | |
442 | bp->b_flags &= ~B_CALL; | |
443 | bp->b_iodone = NULL; | |
444 | allocbuf(bp, size); | |
445 | free(b_save, M_SEGMENT); | |
446 | } else | |
447 | allocbuf(bp, size * i); | |
448 | break; | |
449 | } | |
e140149a | 450 | tbp = getblk(vp, lbn + i, 0, 0, 0); |
888c761e MS |
451 | tbp->b_bcount = tbp->b_bufsize = size; |
452 | tbp->b_blkno = bn; | |
453 | tbp->b_flags |= flags | B_READ | B_ASYNC; | |
454 | ++b_save->bs_nchildren; | |
455 | b_save->bs_children[i - 1] = tbp; | |
456 | } | |
457 | if (!(bp->b_flags & B_ASYNC)) | |
458 | vp->v_ralen = max(vp->v_ralen - 1, 1); | |
459 | return(bp); | |
460 | } | |
461 | ||
462 | /* | |
463 | * Either get a new buffer or grow the existing one. | |
464 | */ | |
465 | struct buf * | |
466 | cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) | |
467 | struct vnode *vp; | |
468 | struct buf *bp; | |
469 | long flags; | |
470 | daddr_t blkno; | |
471 | daddr_t lblkno; | |
472 | long size; | |
473 | int run; | |
474 | { | |
475 | if (!bp) { | |
e140149a | 476 | bp = getblk(vp, lblkno, size, 0, 0); |
888c761e MS |
477 | if (bp->b_flags & (B_DONE | B_DELWRI)) { |
478 | bp->b_blkno = blkno; | |
479 | return(bp); | |
480 | } | |
481 | } | |
482 | allocbuf(bp, run * size); | |
483 | bp->b_blkno = blkno; | |
484 | bp->b_iodone = cluster_callback; | |
485 | bp->b_flags |= flags | B_CALL; | |
486 | return(bp); | |
487 | } | |
488 | ||
489 | /* | |
490 | * Cleanup after a clustered read or write. | |
491 | */ | |
492 | void | |
493 | cluster_callback(bp) | |
494 | struct buf *bp; | |
495 | { | |
496 | struct cluster_save *b_save; | |
497 | struct buf **tbp; | |
498 | long bsize; | |
499 | caddr_t cp; | |
888c761e MS |
500 | b_save = (struct cluster_save *)(bp->b_saveaddr); |
501 | bp->b_saveaddr = b_save->bs_saveaddr; | |
502 | ||
503 | cp = bp->b_un.b_addr + b_save->bs_bufsize; | |
504 | for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) { | |
505 | pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize); | |
506 | cp += (*tbp)->b_bufsize; | |
507 | bp->b_bufsize -= (*tbp)->b_bufsize; | |
508 | biodone(*tbp); | |
509 | } | |
510 | #ifdef DIAGNOSTIC | |
511 | if (bp->b_bufsize != b_save->bs_bufsize) | |
512 | panic ("cluster_callback: more space to reclaim"); | |
513 | #endif | |
514 | bp->b_bcount = bp->b_bufsize; | |
515 | bp->b_iodone = NULL; | |
516 | free(b_save, M_SEGMENT); | |
517 | if (bp->b_flags & B_ASYNC) | |
518 | brelse(bp); | |
519 | else | |
520 | wakeup((caddr_t)bp); | |
521 | } | |
522 | ||
663dbc72 | 523 | /* |
d42a4811 KM |
524 | * Synchronous write. |
525 | * Release buffer on completion. | |
663dbc72 BJ |
526 | */ |
527 | bwrite(bp) | |
3efdd860 | 528 | register struct buf *bp; |
663dbc72 | 529 | { |
3789a403 | 530 | struct proc *p = curproc; /* XXX */ |
7188ac27 | 531 | register int flag; |
31222d0d | 532 | int s, error = 0; |
663dbc72 BJ |
533 | |
534 | flag = bp->b_flags; | |
f844ee62 | 535 | bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); |
77dc8a8c KM |
536 | if (flag & B_ASYNC) { |
537 | if ((flag & B_DELWRI) == 0) | |
538 | p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ | |
539 | else | |
540 | reassignbuf(bp, bp->b_vp); | |
541 | } | |
c5a600cf | 542 | trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno); |
4f083fd7 SL |
543 | if (bp->b_bcount > bp->b_bufsize) |
544 | panic("bwrite"); | |
86e7dd3b | 545 | s = splbio(); |
c669f646 | 546 | bp->b_vp->v_numoutput++; |
e140149a | 547 | bp->b_flags |= B_WRITEINPROG; |
86e7dd3b | 548 | splx(s); |
7188ac27 | 549 | VOP_STRATEGY(bp); |
3efdd860 KM |
550 | |
551 | /* | |
d42a4811 | 552 | * If the write was synchronous, then await I/O completion. |
3efdd860 | 553 | * If the write was "delayed", then we put the buffer on |
d42a4811 | 554 | * the queue of blocks awaiting I/O completion status. |
3efdd860 | 555 | */ |
d42a4811 | 556 | if ((flag & B_ASYNC) == 0) { |
7188ac27 | 557 | error = biowait(bp); |
77dc8a8c KM |
558 | if ((flag&B_DELWRI) == 0) |
559 | p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ | |
560 | else | |
561 | reassignbuf(bp, bp->b_vp); | |
e140149a KM |
562 | if (bp->b_flags & B_EINTR) { |
563 | bp->b_flags &= ~B_EINTR; | |
564 | error = EINTR; | |
565 | } | |
663dbc72 | 566 | brelse(bp); |
7188ac27 | 567 | } else if (flag & B_DELWRI) { |
31222d0d | 568 | s = splbio(); |
663dbc72 | 569 | bp->b_flags |= B_AGE; |
31222d0d | 570 | splx(s); |
7188ac27 KM |
571 | } |
572 | return (error); | |
663dbc72 BJ |
573 | } |
574 | ||
80746147 JH |
575 | int |
576 | vn_bwrite(ap) | |
577 | struct vop_bwrite_args *ap; | |
578 | { | |
37392cf8 | 579 | return (bwrite(ap->a_bp)); |
80746147 JH |
580 | } |
581 | ||
582 | ||
663dbc72 | 583 | /* |
d42a4811 KM |
584 | * Delayed write. |
585 | * | |
586 | * The buffer is marked dirty, but is not queued for I/O. | |
587 | * This routine should be used when the buffer is expected | |
588 | * to be modified again soon, typically a small write that | |
589 | * partially fills a buffer. | |
590 | * | |
591 | * NB: magnetic tapes cannot be delayed; they must be | |
592 | * written in the order that the writes are requested. | |
663dbc72 BJ |
593 | */ |
594 | bdwrite(bp) | |
3efdd860 | 595 | register struct buf *bp; |
663dbc72 | 596 | { |
3789a403 | 597 | struct proc *p = curproc; /* XXX */ |
663dbc72 | 598 | |
c669f646 KM |
599 | if ((bp->b_flags & B_DELWRI) == 0) { |
600 | bp->b_flags |= B_DELWRI; | |
601 | reassignbuf(bp, bp->b_vp); | |
3789a403 | 602 | p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ |
c669f646 | 603 | } |
7188ac27 | 604 | /* |
edadbc2c | 605 | * If this is a tape drive, the write must be initiated. |
7188ac27 | 606 | */ |
ec67a3ce | 607 | if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE) |
663dbc72 | 608 | bawrite(bp); |
edadbc2c | 609 | } else { |
d42a4811 | 610 | bp->b_flags |= (B_DONE | B_DELWRI); |
663dbc72 BJ |
611 | brelse(bp); |
612 | } | |
613 | } | |
614 | ||
615 | /* | |
d42a4811 KM |
616 | * Asynchronous write. |
617 | * Start I/O on a buffer, but do not wait for it to complete. | |
618 | * The buffer is released when the I/O completes. | |
663dbc72 BJ |
619 | */ |
620 | bawrite(bp) | |
3efdd860 | 621 | register struct buf *bp; |
663dbc72 BJ |
622 | { |
623 | ||
d42a4811 KM |
624 | /* |
625 | * Setting the ASYNC flag causes bwrite to return | |
626 | * after starting the I/O. | |
627 | */ | |
663dbc72 | 628 | bp->b_flags |= B_ASYNC; |
e140149a | 629 | (void) VOP_BWRITE(bp); |
663dbc72 BJ |
630 | } |
631 | ||
888c761e MS |
632 | /* |
633 | * Do clustered write for FFS. | |
634 | * | |
635 | * Three cases: | |
636 | * 1. Write is not sequential (write asynchronously) | |
637 | * Write is sequential: | |
638 | * 2. beginning of cluster - begin cluster | |
639 | * 3. middle of a cluster - add to cluster | |
640 | * 4. end of a cluster - asynchronously write cluster | |
641 | */ | |
642 | void | |
643 | cluster_write(bp, filesize) | |
644 | struct buf *bp; | |
645 | u_quad_t filesize; | |
646 | { | |
647 | struct vnode *vp; | |
648 | daddr_t lbn; | |
649 | int clen, error, maxrun; | |
650 | ||
651 | vp = bp->b_vp; | |
652 | lbn = bp->b_lblkno; | |
653 | clen = 0; | |
654 | ||
655 | /* | |
656 | * Handle end of file first. If we are appending, we need to check | |
657 | * if the current block was allocated contiguously. If it wasn't, | |
658 | * then we need to fire off a previous cluster if it existed. | |
659 | * Additionally, when we're appending, we need to figure out how | |
660 | * to initialize vp->v_clen. | |
661 | */ | |
662 | if ((lbn + 1) * bp->b_bcount == filesize) { | |
663 | if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) { | |
664 | /* This block was not allocated contiguously */ | |
665 | if (vp->v_clen) | |
666 | cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart, | |
667 | vp->v_lastw - vp->v_cstart + 1, lbn); | |
668 | vp->v_cstart = lbn; | |
669 | clen = vp->v_clen = | |
670 | MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; | |
671 | /* | |
672 | * Next cluster started. Write this buffer and return. | |
673 | */ | |
674 | vp->v_lastw = lbn; | |
675 | vp->v_lasta = bp->b_blkno; | |
676 | bdwrite(bp); | |
677 | return; | |
678 | } | |
679 | vp->v_lasta = bp->b_blkno; | |
680 | } else if (lbn == 0) { | |
681 | vp->v_clen = vp->v_cstart = vp->v_lastw = 0; | |
682 | } | |
683 | if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) { | |
684 | if (vp->v_clen != 0) | |
685 | /* | |
686 | * Write is not sequential. | |
687 | */ | |
688 | cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart, | |
689 | vp->v_lastw - vp->v_cstart + 1, lbn); | |
690 | /* | |
691 | * Consider beginning a cluster. | |
692 | */ | |
693 | if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) { | |
694 | bawrite(bp); | |
695 | vp->v_cstart = lbn + 1; | |
696 | vp->v_lastw = lbn; | |
697 | return; | |
698 | } | |
699 | vp->v_clen = clen; | |
700 | if (clen == 0) { /* I/O not contiguous */ | |
701 | vp->v_cstart = lbn + 1; | |
702 | bawrite(bp); | |
703 | } else { /* Wait for rest of cluster */ | |
704 | vp->v_cstart = lbn; | |
705 | bdwrite(bp); | |
706 | } | |
707 | } else if (lbn == vp->v_cstart + vp->v_clen) { | |
708 | /* | |
709 | * At end of cluster, write it out. | |
710 | */ | |
711 | cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, | |
712 | vp->v_clen + 1, lbn); | |
713 | vp->v_clen = 0; | |
714 | vp->v_cstart = lbn + 1; | |
715 | } else | |
716 | /* | |
717 | * In the middle of a cluster, so just delay the | |
718 | * I/O for now. | |
719 | */ | |
720 | bdwrite(bp); | |
721 | vp->v_lastw = lbn; | |
722 | } | |
723 | ||
724 | ||
725 | /* | |
726 | * This is an awful lot like cluster_rbuild...wish they could be combined. | |
727 | * The last lbn argument is the current block on which I/O is being | |
728 | * performed. Check to see that it doesn't fall in the middle of | |
729 | * the current block. | |
730 | */ | |
731 | void | |
732 | cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) | |
733 | struct vnode *vp; | |
734 | struct buf *last_bp; | |
735 | long size; | |
736 | daddr_t start_lbn; | |
737 | int len; | |
738 | daddr_t lbn; | |
739 | { | |
740 | struct cluster_save *b_save; | |
741 | struct buf *bp, *tbp; | |
742 | caddr_t cp; | |
743 | int i, s; | |
744 | ||
745 | redo: | |
746 | while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) { | |
747 | ++start_lbn; | |
748 | --len; | |
749 | } | |
750 | ||
751 | /* Get more memory for current buffer */ | |
752 | if (len <= 1) { | |
753 | if (last_bp) | |
754 | bawrite(last_bp); | |
755 | return; | |
756 | } | |
757 | ||
e140149a | 758 | bp = getblk(vp, start_lbn, size, 0, 0); |
888c761e MS |
759 | if (!(bp->b_flags & B_DELWRI)) { |
760 | ++start_lbn; | |
761 | --len; | |
762 | brelse(bp); | |
763 | goto redo; | |
764 | } | |
765 | ||
766 | --len; | |
767 | b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save), | |
768 | M_SEGMENT, M_WAITOK); | |
769 | b_save->bs_bcount = bp->b_bcount; | |
770 | b_save->bs_bufsize = bp->b_bufsize; | |
771 | b_save->bs_nchildren = 0; | |
772 | b_save->bs_children = (struct buf **)(b_save + 1); | |
773 | b_save->bs_saveaddr = bp->b_saveaddr; | |
774 | bp->b_saveaddr = (caddr_t) b_save; | |
775 | ||
776 | ||
777 | bp->b_flags |= B_CALL; | |
778 | bp->b_iodone = cluster_callback; | |
779 | cp = bp->b_un.b_addr + bp->b_bufsize; | |
780 | for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) { | |
781 | if (!incore(vp, start_lbn) || start_lbn == lbn) | |
782 | break; | |
783 | ||
784 | if (last_bp == NULL || start_lbn != last_bp->b_lblkno) { | |
e140149a | 785 | tbp = getblk(vp, start_lbn, size, 0, 0); |
888c761e MS |
786 | #ifdef DIAGNOSTIC |
787 | if (tbp->b_bcount != tbp->b_bufsize) | |
788 | panic("cluster_wbuild: Buffer too big"); | |
789 | #endif | |
790 | if (!(tbp->b_flags & B_DELWRI)) { | |
791 | brelse(tbp); | |
792 | break; | |
793 | } | |
794 | } else | |
795 | tbp = last_bp; | |
796 | ||
797 | ++b_save->bs_nchildren; | |
798 | ||
799 | /* Move memory from children to parent */ | |
800 | pagemove(tbp->b_un.b_daddr, cp, size); | |
801 | bp->b_bcount += size; | |
802 | bp->b_bufsize += size; | |
803 | ||
804 | tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); | |
805 | tbp->b_flags |= B_ASYNC; | |
806 | s = splbio(); | |
807 | reassignbuf(tbp, tbp->b_vp); /* put on clean list */ | |
808 | ++tbp->b_vp->v_numoutput; | |
809 | splx(s); | |
810 | b_save->bs_children[i] = tbp; | |
811 | ||
812 | cp += tbp->b_bufsize; | |
813 | } | |
814 | ||
815 | if (i == 0) { | |
816 | /* None to cluster */ | |
817 | bp->b_saveaddr = b_save->bs_saveaddr; | |
818 | bp->b_flags &= ~B_CALL; | |
819 | bp->b_iodone = NULL; | |
820 | free(b_save, M_SEGMENT); | |
821 | } | |
822 | bawrite(bp); | |
823 | if (i < len) { | |
824 | len -= i + 1; | |
825 | start_lbn += 1; | |
826 | goto redo; | |
827 | } | |
828 | } | |
829 | ||
663dbc72 | 830 | /* |
d42a4811 KM |
831 | * Release a buffer. |
832 | * Even if the buffer is dirty, no I/O is started. | |
663dbc72 BJ |
833 | */ |
834 | brelse(bp) | |
3efdd860 | 835 | register struct buf *bp; |
663dbc72 | 836 | { |
e3249ec0 | 837 | register struct queue_entry *flist; |
d42a4811 | 838 | int s; |
663dbc72 | 839 | |
c5a600cf | 840 | trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); |
3efdd860 | 841 | /* |
edadbc2c KM |
842 | * If a process is waiting for the buffer, or |
843 | * is waiting for a free buffer, awaken it. | |
3efdd860 | 844 | */ |
d42a4811 | 845 | if (bp->b_flags & B_WANTED) |
663dbc72 | 846 | wakeup((caddr_t)bp); |
37392cf8 KM |
847 | if (needbuffer) { |
848 | needbuffer = 0; | |
849 | wakeup((caddr_t)&needbuffer); | |
663dbc72 | 850 | } |
edadbc2c KM |
851 | /* |
852 | * Retry I/O for locked buffers rather than invalidating them. | |
853 | */ | |
31222d0d | 854 | s = splbio(); |
edadbc2c KM |
855 | if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED)) |
856 | bp->b_flags &= ~B_ERROR; | |
edadbc2c KM |
857 | /* |
858 | * Disassociate buffers that are no longer valid. | |
859 | */ | |
d42a4811 | 860 | if (bp->b_flags & (B_NOCACHE | B_ERROR)) |
7188ac27 | 861 | bp->b_flags |= B_INVAL; |
d42a4811 | 862 | if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) { |
edadbc2c KM |
863 | if (bp->b_vp) |
864 | brelvp(bp); | |
865 | bp->b_flags &= ~B_DELWRI; | |
7188ac27 | 866 | } |
3efdd860 KM |
867 | /* |
868 | * Stick the buffer back on a free list. | |
869 | */ | |
4f083fd7 SL |
870 | if (bp->b_bufsize <= 0) { |
871 | /* block has no buffer ... put at front of unused buffer list */ | |
37392cf8 | 872 | flist = &bufqueues[BQ_EMPTY]; |
4f083fd7 | 873 | binsheadfree(bp, flist); |
d42a4811 | 874 | } else if (bp->b_flags & (B_ERROR | B_INVAL)) { |
46387ee3 | 875 | /* block has no info ... put at front of most free list */ |
37392cf8 | 876 | flist = &bufqueues[BQ_AGE]; |
3efdd860 | 877 | binsheadfree(bp, flist); |
663dbc72 | 878 | } else { |
46387ee3 | 879 | if (bp->b_flags & B_LOCKED) |
37392cf8 | 880 | flist = &bufqueues[BQ_LOCKED]; |
46387ee3 | 881 | else if (bp->b_flags & B_AGE) |
37392cf8 | 882 | flist = &bufqueues[BQ_AGE]; |
46387ee3 | 883 | else |
37392cf8 | 884 | flist = &bufqueues[BQ_LRU]; |
3efdd860 | 885 | binstailfree(bp, flist); |
663dbc72 | 886 | } |
d42a4811 | 887 | bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE); |
663dbc72 BJ |
888 | splx(s); |
889 | } | |
890 | ||
891 | /* | |
d42a4811 | 892 | * Check to see if a block is currently memory resident. |
663dbc72 | 893 | */ |
e140149a | 894 | struct buf * |
7188ac27 KM |
895 | incore(vp, blkno) |
896 | struct vnode *vp; | |
3efdd860 | 897 | daddr_t blkno; |
663dbc72 BJ |
898 | { |
899 | register struct buf *bp; | |
663dbc72 | 900 | |
e3249ec0 | 901 | for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next) |
edadbc2c | 902 | if (bp->b_lblkno == blkno && bp->b_vp == vp && |
3efdd860 | 903 | (bp->b_flags & B_INVAL) == 0) |
e140149a KM |
904 | return (bp); |
905 | return (NULL); | |
663dbc72 BJ |
906 | } |
907 | ||
/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 *
 * On success the buffer is returned with B_BUSY set.  NULL is
 * returned only when an interruptible sleep (slpflag/slptimeo)
 * is broken by a signal or a timeout in tsleep().
 */
struct buf *
#ifdef SECSIZE
/*
 * NOTE(review): this SECSIZE variant looks stale -- its parameter list
 * (dev, ...) does not match the declarations below, and the conditional
 * nesting appears unbalanced; confirm before building with SECSIZE.
 */
getblk(dev, blkno, size, secsize)
#else SECSIZE
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();	/* keep I/O interrupts out while flags are examined */
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			/* Buffer may have been reused while we slept; rescan. */
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed. (ie. B_BUSY clears)
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		/*
		 * Cached copy has the wrong size: flush it out and retry
		 * the lookup so allocbuf() can size a fresh buffer.
		 */
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	/* Move the fresh header onto this block's hash chain and vnode. */
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
#ifdef SECSIZE
	bp->b_blksize = secsize;
#endif SECSIZE
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}
998 | ||
999 | /* | |
d42a4811 KM |
1000 | * Allocate a buffer. |
1001 | * The caller will assign it to a block. | |
663dbc72 BJ |
1002 | */ |
1003 | struct buf * | |
ad30fb67 KM |
1004 | geteblk(size) |
1005 | int size; | |
663dbc72 | 1006 | { |
37392cf8 | 1007 | register struct buf *bp; |
663dbc72 | 1008 | |
00a6a148 KM |
1009 | if (size > MAXBSIZE) |
1010 | panic("geteblk: size too big"); | |
e140149a KM |
1011 | while ((bp = getnewbuf(0, 0)) == NULL) |
1012 | /* void */; | |
4f083fd7 | 1013 | bp->b_flags |= B_INVAL; |
3efdd860 | 1014 | bremhash(bp); |
37392cf8 | 1015 | binshash(bp, &invalhash); |
521a4688 | 1016 | bp->b_bcount = 0; |
ec67a3ce MK |
1017 | #ifdef SECSIZE |
1018 | bp->b_blksize = DEV_BSIZE; | |
1019 | #endif SECSIZE | |
4f083fd7 | 1020 | bp->b_error = 0; |
7188ac27 | 1021 | bp->b_resid = 0; |
521a4688 | 1022 | allocbuf(bp, size); |
a5e62f37 | 1023 | return (bp); |
663dbc72 BJ |
1024 | } |
1025 | ||
/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 *
 * tp   buffer whose memory is being resized
 * size desired byte count; actual allocation is rounded up to CLBYTES
 *
 * Always returns 1 in this version; on success tp->b_bcount is set
 * to the requested size.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		/* Shift the surplus pages from tp into the spare header ep. */
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		/* Spin until a donor buffer is available. */
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		/* Move `take' bytes of pages from the tail of bp onto tp. */
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		/*
		 * Donor is now memoryless: park it on the invalid hash
		 * chain so brelse() files it under BQ_EMPTY.
		 */
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
1096 | ||
4f083fd7 SL |
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * Returns a busy, fully disassociated buffer header, or NULL if
 * no buffer was free and the tsleep() on `needbuffer' returned
 * (signal, timeout, or wakeup from brelse) -- callers retry.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	/* Scan AGE, then LRU, then LOCKED; stop at the first non-empty list. */
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	/*
	 * Delayed-write buffers must be flushed before reuse; bawrite()
	 * releases the buffer when the write completes, so rescan.
	 */
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	/* Drop any credentials cached from the previous owner. */
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	/* Plain assignment deliberately clears every other flag bit. */
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}
1149 | ||
663dbc72 | 1150 | /* |
d42a4811 KM |
1151 | * Wait for I/O to complete. |
1152 | * | |
1153 | * Extract and return any errors associated with the I/O. | |
1154 | * If the error flag is set, but no specific error is | |
1155 | * given, return EIO. | |
663dbc72 | 1156 | */ |
3efdd860 | 1157 | biowait(bp) |
ad30fb67 | 1158 | register struct buf *bp; |
663dbc72 | 1159 | { |
530d0032 | 1160 | int s; |
663dbc72 | 1161 | |
a5e62f37 | 1162 | s = splbio(); |
a937f856 | 1163 | while ((bp->b_flags & B_DONE) == 0) |
663dbc72 | 1164 | sleep((caddr_t)bp, PRIBIO); |
530d0032 | 1165 | splx(s); |
7188ac27 KM |
1166 | if ((bp->b_flags & B_ERROR) == 0) |
1167 | return (0); | |
1168 | if (bp->b_error) | |
1169 | return (bp->b_error); | |
1170 | return (EIO); | |
663dbc72 BJ |
1171 | } |
1172 | ||
/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	/*
	 * Write completion: notify the vnode layer via vwakeup() --
	 * presumably this accounts for the finished output; see vwakeup().
	 */
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	/*
	 * B_CALL: hand the buffer to the requested completion callback
	 * and return; the callback owns the buffer from here on.
	 */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	/*
	 * Async I/O has no waiter, so release the buffer; otherwise
	 * clear B_WANTED and wake any process blocked in biowait().
	 */
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
aa95c6fc | 1201 | |
b5d79df9 MS |
1202 | int |
1203 | count_lock_queue() | |
1204 | { | |
1205 | register struct buf *bp; | |
1206 | register int ret; | |
1207 | ||
1208 | for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next; | |
1209 | bp; bp = (struct buf *)bp->b_freelist.qe_next) | |
1210 | ++ret; | |
1211 | return(ret); | |
1212 | } | |
1213 | ||
aa95c6fc KM |
#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 *
 * For each free queue, prints the total buffer count and a histogram
 * of buffer memory sizes in CLBYTES-sized bins.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		/* Walk the queue at splbio so it cannot change underfoot. */
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		/* Only non-empty size bins are reported. */
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */