Commit | Line | Data |
---|---|---|
5dc2581e KB |
1 | /*- |
2 | * Copyright (c) 1982, 1986, 1989 The Regents of the University of California. | |
7188ac27 | 3 | * All rights reserved. |
da7c5cc6 | 4 | * |
217c3be4 KM |
5 | * This module is believed to contain source code proprietary to AT&T. |
6 | * Use and redistribution is subject to the Berkeley Software License | |
7 | * Agreement and your Software Agreement with AT&T (Western Electric). | |
7188ac27 | 8 | * |
c5e0ddad | 9 | * @(#)vfs_cluster.c 7.59 (Berkeley) %G% |
da7c5cc6 | 10 | */ |
961945a8 | 11 | |
251f56ba KB |
12 | #include <sys/param.h> |
13 | #include <sys/proc.h> | |
14 | #include <sys/buf.h> | |
15 | #include <sys/vnode.h> | |
251f56ba KB |
16 | #include <sys/mount.h> |
17 | #include <sys/trace.h> | |
18 | #include <sys/resourcevar.h> | |
37392cf8 KM |
19 | #include <sys/malloc.h> |
20 | #include <libkern/libkern.h> | |
21 | ||
22 | /* | |
23 | * Definitions for the buffer hash lists. | |
24 | */ | |
25 | #define BUFHASH(dvp, lbn) \ | |
26 | (&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) | |
e3249ec0 | 27 | struct list_entry *bufhashtbl, invalhash; |
37392cf8 KM |
28 | u_long bufhash; |
29 | ||
30 | /* | |
31 | * Insq/Remq for the buffer hash lists. | |
32 | */ | |
e3249ec0 KM |
33 | #define binshash(bp, dp) list_enter_head(dp, bp, struct buf *, b_hash) |
34 | #define bremhash(bp) list_remove(bp, struct buf *, b_hash) | |
37392cf8 KM |
35 | |
36 | /* | |
37 | * Definitions for the buffer free lists. | |
38 | */ | |
39 | #define BQUEUES 4 /* number of free buffer queues */ | |
40 | ||
41 | #define BQ_LOCKED 0 /* super-blocks &c */ | |
42 | #define BQ_LRU 1 /* lru, useful buffers */ | |
43 | #define BQ_AGE 2 /* rubbish */ | |
44 | #define BQ_EMPTY 3 /* buffer headers with no memory */ | |
45 | ||
e3249ec0 | 46 | struct queue_entry bufqueues[BQUEUES]; |
37392cf8 KM |
47 | int needbuffer; |
48 | ||
49 | /* | |
50 | * Insq/Remq for the buffer free lists. | |
51 | */ | |
e3249ec0 KM |
52 | #define binsheadfree(bp, dp) \ |
53 | queue_enter_head(dp, bp, struct buf *, b_freelist) | |
54 | #define binstailfree(bp, dp) \ | |
55 | queue_enter_tail(dp, bp, struct buf *, b_freelist) | |
56 | ||
888c761e MS |
57 | /* |
58 | * Local declarations | |
59 | */ | |
60 | struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, | |
61 | daddr_t, long, int)); | |
62 | struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, | |
63 | daddr_t, daddr_t, long, int, long)); | |
64 | void cluster_wbuild __P((struct vnode *, struct buf *, long size, | |
65 | daddr_t start_lbn, int len, daddr_t lbn)); | |
66 | ||
37392cf8 KM |
67 | void |
68 | bremfree(bp) | |
69 | struct buf *bp; | |
70 | { | |
e3249ec0 | 71 | struct queue_entry *dp; |
37392cf8 | 72 | |
e3249ec0 KM |
73 | /* |
74 | * We only calculate the head of the freelist when removing | |
75 | * the last element of the list as that is the only time that | |
76 | * it is needed (e.g. to reset the tail pointer). | |
77 | */ | |
78 | if (bp->b_freelist.qe_next == NULL) { | |
37392cf8 | 79 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) |
e3249ec0 | 80 | if (dp->qe_prev == &bp->b_freelist.qe_next) |
37392cf8 KM |
81 | break; |
82 | if (dp == &bufqueues[BQUEUES]) | |
83 | panic("bremfree: lost tail"); | |
37392cf8 | 84 | } |
e3249ec0 | 85 | queue_remove(dp, bp, struct buf *, b_freelist); |
37392cf8 | 86 | } |
663dbc72 | 87 | |
e7db227e MK |
88 | /* |
89 | * Initialize buffers and hash links for buffers. | |
90 | */ | |
251f56ba | 91 | void |
e7db227e MK |
92 | bufinit() |
93 | { | |
37392cf8 | 94 | register struct buf *bp; |
e3249ec0 | 95 | struct queue_entry *dp; |
e7db227e | 96 | register int i; |
e7db227e MK |
97 | int base, residual; |
98 | ||
37392cf8 | 99 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) |
e3249ec0 KM |
100 | queue_init(dp); |
101 | bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash); | |
e7db227e MK |
102 | base = bufpages / nbuf; |
103 | residual = bufpages % nbuf; | |
104 | for (i = 0; i < nbuf; i++) { | |
105 | bp = &buf[i]; | |
37392cf8 | 106 | bzero((char *)bp, sizeof *bp); |
e7db227e | 107 | bp->b_dev = NODEV; |
e7db227e MK |
108 | bp->b_rcred = NOCRED; |
109 | bp->b_wcred = NOCRED; | |
e7db227e MK |
110 | bp->b_un.b_addr = buffers + i * MAXBSIZE; |
111 | if (i < residual) | |
112 | bp->b_bufsize = (base + 1) * CLBYTES; | |
113 | else | |
114 | bp->b_bufsize = base * CLBYTES; | |
31222d0d | 115 | bp->b_flags = B_INVAL; |
37392cf8 | 116 | dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY]; |
31222d0d | 117 | binsheadfree(bp, dp); |
37392cf8 | 118 | binshash(bp, &invalhash); |
e7db227e MK |
119 | } |
120 | } | |
121 | ||
663dbc72 | 122 | /* |
d42a4811 KM |
123 | * Find the block in the buffer pool. |
124 | * If the buffer is not present, allocate a new buffer and load | |
125 | * its contents according to the filesystem fill routine. | |
663dbc72 | 126 | */ |
a937f856 | 127 | bread(vp, blkno, size, cred, bpp) |
7188ac27 | 128 | struct vnode *vp; |
ad30fb67 KM |
129 | daddr_t blkno; |
130 | int size; | |
a937f856 | 131 | struct ucred *cred; |
7188ac27 | 132 | struct buf **bpp; |
ec67a3ce MK |
133 | #ifdef SECSIZE |
134 | long secsize; | |
135 | #endif SECSIZE | |
663dbc72 | 136 | { |
3789a403 | 137 | struct proc *p = curproc; /* XXX */ |
663dbc72 BJ |
138 | register struct buf *bp; |
139 | ||
4f083fd7 SL |
140 | if (size == 0) |
141 | panic("bread: size 0"); | |
ec67a3ce MK |
142 | #ifdef SECSIZE |
143 | bp = getblk(dev, blkno, size, secsize); | |
144 | #else SECSIZE | |
e140149a | 145 | *bpp = bp = getblk(vp, blkno, size, 0, 0); |
ec67a3ce | 146 | #endif SECSIZE |
d42a4811 | 147 | if (bp->b_flags & (B_DONE | B_DELWRI)) { |
c5a600cf | 148 | trace(TR_BREADHIT, pack(vp, size), blkno); |
7188ac27 | 149 | return (0); |
663dbc72 BJ |
150 | } |
151 | bp->b_flags |= B_READ; | |
4f083fd7 SL |
152 | if (bp->b_bcount > bp->b_bufsize) |
153 | panic("bread"); | |
a937f856 KM |
154 | if (bp->b_rcred == NOCRED && cred != NOCRED) { |
155 | crhold(cred); | |
156 | bp->b_rcred = cred; | |
157 | } | |
7188ac27 | 158 | VOP_STRATEGY(bp); |
c5a600cf | 159 | trace(TR_BREADMISS, pack(vp, size), blkno); |
3789a403 | 160 | p->p_stats->p_ru.ru_inblock++; /* pay for read */ |
7188ac27 | 161 | return (biowait(bp)); |
663dbc72 BJ |
162 | } |
163 | ||
164 | /* | |
bb1626f7 KM |
165 | * Operates like bread, but also starts I/O on the N specified |
166 | * read-ahead blocks. | |
663dbc72 | 167 | */ |
bb1626f7 | 168 | breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp) |
7188ac27 | 169 | struct vnode *vp; |
84baaab3 | 170 | daddr_t blkno; int size; |
ec67a3ce MK |
171 | #ifdef SECSIZE |
172 | long secsize; | |
173 | #endif SECSIZE | |
bb1626f7 KM |
174 | daddr_t rablkno[]; int rabsize[]; |
175 | int num; | |
a937f856 | 176 | struct ucred *cred; |
7188ac27 | 177 | struct buf **bpp; |
663dbc72 | 178 | { |
3789a403 | 179 | struct proc *p = curproc; /* XXX */ |
663dbc72 | 180 | register struct buf *bp, *rabp; |
bb1626f7 | 181 | register int i; |
663dbc72 BJ |
182 | |
183 | bp = NULL; | |
3efdd860 | 184 | /* |
d42a4811 KM |
185 | * If the block is not memory resident, |
186 | * allocate a buffer and start I/O. | |
3efdd860 | 187 | */ |
7188ac27 | 188 | if (!incore(vp, blkno)) { |
e140149a | 189 | *bpp = bp = getblk(vp, blkno, size, 0, 0); |
ec67a3ce | 190 | #endif SECSIZE |
d42a4811 | 191 | if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { |
663dbc72 | 192 | bp->b_flags |= B_READ; |
4f083fd7 | 193 | if (bp->b_bcount > bp->b_bufsize) |
bb1626f7 | 194 | panic("breadn"); |
a937f856 KM |
195 | if (bp->b_rcred == NOCRED && cred != NOCRED) { |
196 | crhold(cred); | |
197 | bp->b_rcred = cred; | |
198 | } | |
7188ac27 | 199 | VOP_STRATEGY(bp); |
c5a600cf | 200 | trace(TR_BREADMISS, pack(vp, size), blkno); |
3789a403 | 201 | p->p_stats->p_ru.ru_inblock++; /* pay for read */ |
7d1e9cf4 | 202 | } else { |
c5a600cf | 203 | trace(TR_BREADHIT, pack(vp, size), blkno); |
7d1e9cf4 | 204 | } |
663dbc72 | 205 | } |
3efdd860 KM |
206 | |
207 | /* | |
bb1626f7 KM |
208 | * If there's read-ahead block(s), start I/O |
209 | * on them also (as above). | |
3efdd860 | 210 | */ |
bb1626f7 KM |
211 | for (i = 0; i < num; i++) { |
212 | if (incore(vp, rablkno[i])) | |
213 | continue; | |
e140149a | 214 | rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0); |
ec67a3ce | 215 | #endif SECSIZE |
d42a4811 | 216 | if (rabp->b_flags & (B_DONE | B_DELWRI)) { |
663dbc72 | 217 | brelse(rabp); |
bb1626f7 | 218 | trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]); |
973ecc4f | 219 | } else { |
d42a4811 | 220 | rabp->b_flags |= B_ASYNC | B_READ; |
4f083fd7 SL |
221 | if (rabp->b_bcount > rabp->b_bufsize) |
222 | panic("breadrabp"); | |
5062ac4a | 223 | if (rabp->b_rcred == NOCRED && cred != NOCRED) { |
a937f856 | 224 | crhold(cred); |
5062ac4a | 225 | rabp->b_rcred = cred; |
a937f856 | 226 | } |
7188ac27 | 227 | VOP_STRATEGY(rabp); |
bb1626f7 | 228 | trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]); |
3789a403 | 229 | p->p_stats->p_ru.ru_inblock++; /* pay in advance */ |
663dbc72 BJ |
230 | } |
231 | } | |
3efdd860 KM |
232 | |
233 | /* | |
d42a4811 KM |
234 | * If block was memory resident, let bread get it. |
235 | * If block was not memory resident, the read was | |
236 | * started above, so just wait for the read to complete. | |
3efdd860 | 237 | */ |
84baaab3 | 238 | if (bp == NULL) |
ec67a3ce MK |
239 | #ifdef SECSIZE |
240 | return (bread(dev, blkno, size, secsize)); | |
241 | #else SECSIZE | |
a937f856 | 242 | return (bread(vp, blkno, size, cred, bpp)); |
7188ac27 | 243 | return (biowait(bp)); |
663dbc72 BJ |
244 | } |
245 | ||
888c761e MS |
246 | /* |
247 | * We could optimize this by keeping track of where the last read-ahead | |
248 | * was, but it would involve adding fields to the vnode. For now, let's | |
249 | * just get it working. | |
250 | * | |
251 | * This replaces bread. If this is a bread at the beginning of a file and | |
252 | * lastr is 0, we assume this is the first read and we'll read up to two | |
253 | * blocks if they are sequential. After that, we'll do regular read ahead | |
254 | * in clustered chunks. | |
255 | * | |
256 | * There are 4 or 5 cases depending on how you count: | |
257 | * Desired block is in the cache: | |
258 | * 1 Not sequential access (0 I/Os). | |
259 | * 2 Access is sequential, do read-ahead (1 ASYNC). | |
260 | * Desired block is not in cache: | |
261 | * 3 Not sequential access (1 SYNC). | |
262 | * 4 Sequential access, next block is contiguous (1 SYNC). | |
263 | * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) | |
264 | * | |
265 | * There are potentially two buffers that require I/O. | |
266 | * bp is the block requested. | |
267 | * rbp is the read-ahead block. | |
268 | * If either is NULL, then you don't have to do the I/O. | |
269 | */ | |
270 | cluster_read(vp, filesize, lblkno, size, cred, bpp) | |
271 | struct vnode *vp; | |
272 | u_quad_t filesize; | |
273 | daddr_t lblkno; | |
274 | long size; | |
275 | struct ucred *cred; | |
276 | struct buf **bpp; | |
277 | { | |
278 | struct buf *bp, *rbp; | |
279 | daddr_t blkno, ioblkno; | |
280 | long flags; | |
281 | int error, num_ra, alreadyincore; | |
282 | ||
283 | #ifdef DIAGNOSTIC | |
284 | if (size == 0) | |
285 | panic("cluster_read: size = 0"); | |
286 | #endif | |
287 | ||
288 | error = 0; | |
289 | flags = B_READ; | |
e140149a | 290 | *bpp = bp = getblk(vp, lblkno, size, 0, 0); |
888c761e MS |
291 | if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) { |
292 | /* | |
293 | * Desired block is in cache; do any readahead ASYNC. | |
294 | * Case 1, 2. | |
295 | */ | |
296 | trace(TR_BREADHIT, pack(vp, size), lblkno); | |
297 | flags |= B_ASYNC; | |
298 | ioblkno = lblkno + | |
299 | (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen); | |
e140149a | 300 | alreadyincore = (int)incore(vp, ioblkno); |
888c761e MS |
301 | bp = NULL; |
302 | } else { | |
303 | /* Block wasn't in cache, case 3, 4, 5. */ | |
304 | trace(TR_BREADMISS, pack(vp, size), lblkno); | |
305 | ioblkno = lblkno; | |
306 | bp->b_flags |= flags; | |
307 | alreadyincore = 0; | |
308 | curproc->p_stats->p_ru.ru_inblock++; /* XXX */ | |
309 | } | |
310 | /* | |
311 | * XXX | |
312 | * Replace 1 with a window size based on some permutation of | |
313 | * maxcontig and rot_delay. This will let you figure out how | |
314 | * many blocks you should read-ahead (case 2, 4, 5). | |
315 | * | |
316 | * If the access isn't sequential, cut the window size in half. | |
317 | */ | |
318 | rbp = NULL; | |
319 | if (lblkno != vp->v_lastr + 1 && lblkno != 0) | |
320 | vp->v_ralen = max(vp->v_ralen >> 1, 1); | |
321 | else if ((ioblkno + 1) * size < filesize && !alreadyincore && | |
322 | !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) { | |
323 | /* | |
324 | * Reading sequentially, and the next block is not in the | |
325 | * cache. We are going to try reading ahead. If this is | |
326 | * the first read of a file, then limit read-ahead to a | |
327 | * single block, else read as much as we're allowed. | |
328 | */ | |
329 | if (num_ra > vp->v_ralen) { | |
330 | num_ra = vp->v_ralen; | |
331 | vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1); | |
332 | } else | |
333 | vp->v_ralen = num_ra + 1; | |
334 | ||
335 | ||
336 | if (num_ra) /* case 2, 4 */ | |
337 | rbp = cluster_rbuild(vp, filesize, | |
338 | bp, ioblkno, blkno, size, num_ra, flags); | |
339 | else if (lblkno != 0 && ioblkno == lblkno) { | |
340 | /* Case 5: check how many blocks to read ahead */ | |
341 | ++ioblkno; | |
342 | if ((ioblkno + 1) * size > filesize || | |
343 | (error = VOP_BMAP(vp, | |
344 | ioblkno, NULL, &blkno, &num_ra))) | |
345 | goto skip_readahead; | |
346 | flags |= B_ASYNC; | |
347 | if (num_ra) | |
348 | rbp = cluster_rbuild(vp, filesize, | |
349 | NULL, ioblkno, blkno, size, num_ra, flags); | |
350 | else { | |
e140149a | 351 | rbp = getblk(vp, ioblkno, size, 0, 0); |
888c761e MS |
352 | rbp->b_flags |= flags; |
353 | rbp->b_blkno = blkno; | |
354 | } | |
355 | } else if (lblkno != 0) { | |
356 | /* case 2; read ahead single block */ | |
e140149a | 357 | rbp = getblk(vp, ioblkno, size, 0, 0); |
888c761e MS |
358 | rbp->b_flags |= flags; |
359 | rbp->b_blkno = blkno; | |
360 | } else if (bp) /* case 1, 3, block 0 */ | |
361 | bp->b_blkno = blkno; | |
362 | /* Case 1 on block 0; not really doing sequential I/O */ | |
363 | ||
364 | if (rbp == bp) /* case 4 */ | |
365 | rbp = NULL; | |
366 | else if (rbp) { /* case 2, 5 */ | |
367 | trace(TR_BREADMISSRA, | |
368 | pack(vp, (num_ra + 1) * size), ioblkno); | |
369 | curproc->p_stats->p_ru.ru_inblock++; /* XXX */ | |
370 | } | |
371 | } | |
372 | ||
373 | /* XXX Kirk, do we need to make sure the bp has creds? */ | |
374 | skip_readahead: | |
375 | if (bp) | |
376 | if (bp->b_flags & (B_DONE | B_DELWRI)) | |
377 | panic("cluster_read: DONE bp"); | |
378 | else | |
379 | error = VOP_STRATEGY(bp); | |
380 | ||
381 | if (rbp) | |
382 | if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { | |
383 | rbp->b_flags &= ~(B_ASYNC | B_READ); | |
384 | brelse(rbp); | |
385 | } else | |
386 | (void) VOP_STRATEGY(rbp); | |
387 | ||
388 | if (bp) | |
389 | return(biowait(bp)); | |
390 | return(error); | |
391 | } | |
392 | ||
393 | /* | |
394 | * If blocks are contiguous on disk, use this to provide clustered | |
395 | * read ahead. We will read as many blocks as possible sequentially | |
396 | * and then parcel them up into logical blocks in the buffer hash table. | |
397 | */ | |
398 | struct buf * | |
399 | cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) | |
400 | struct vnode *vp; | |
401 | u_quad_t filesize; | |
402 | struct buf *bp; | |
403 | daddr_t lbn; | |
404 | daddr_t blkno; | |
405 | long size; | |
406 | int run; | |
407 | long flags; | |
408 | { | |
409 | struct cluster_save *b_save; | |
410 | struct buf *tbp; | |
411 | daddr_t bn; | |
412 | int i, inc; | |
413 | ||
c5e0ddad MS |
414 | #ifdef DIAGNOSTIC |
415 | if (size != vp->v_mount->mnt_stat.f_iosize) | |
416 | panic("cluster_rbuild: size %d != filesize %d\n", | |
417 | size, vp->v_mount->mnt_stat.f_iosize); | |
418 | #endif | |
888c761e MS |
419 | if (size * (lbn + run + 1) > filesize) |
420 | --run; | |
421 | if (run == 0) { | |
422 | if (!bp) { | |
e140149a | 423 | bp = getblk(vp, lbn, size, 0, 0); |
888c761e MS |
424 | bp->b_blkno = blkno; |
425 | bp->b_flags |= flags; | |
426 | } | |
427 | return(bp); | |
428 | } | |
429 | ||
430 | bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); | |
431 | if (bp->b_flags & (B_DONE | B_DELWRI)) | |
432 | return (bp); | |
433 | ||
434 | b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), | |
435 | M_SEGMENT, M_WAITOK); | |
436 | b_save->bs_bufsize = b_save->bs_bcount = size; | |
437 | b_save->bs_nchildren = 0; | |
438 | b_save->bs_children = (struct buf **)(b_save + 1); | |
439 | b_save->bs_saveaddr = bp->b_saveaddr; | |
440 | bp->b_saveaddr = (caddr_t) b_save; | |
441 | ||
442 | inc = size / DEV_BSIZE; | |
443 | for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { | |
444 | if (incore(vp, lbn + i)) { | |
445 | if (i == 1) { | |
446 | bp->b_saveaddr = b_save->bs_saveaddr; | |
447 | bp->b_flags &= ~B_CALL; | |
448 | bp->b_iodone = NULL; | |
449 | allocbuf(bp, size); | |
450 | free(b_save, M_SEGMENT); | |
451 | } else | |
452 | allocbuf(bp, size * i); | |
453 | break; | |
454 | } | |
e140149a | 455 | tbp = getblk(vp, lbn + i, 0, 0, 0); |
888c761e MS |
456 | tbp->b_bcount = tbp->b_bufsize = size; |
457 | tbp->b_blkno = bn; | |
458 | tbp->b_flags |= flags | B_READ | B_ASYNC; | |
459 | ++b_save->bs_nchildren; | |
460 | b_save->bs_children[i - 1] = tbp; | |
461 | } | |
462 | if (!(bp->b_flags & B_ASYNC)) | |
463 | vp->v_ralen = max(vp->v_ralen - 1, 1); | |
464 | return(bp); | |
465 | } | |
466 | ||
467 | /* | |
468 | * Either get a new buffer or grow the existing one. | |
469 | */ | |
470 | struct buf * | |
471 | cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) | |
472 | struct vnode *vp; | |
473 | struct buf *bp; | |
474 | long flags; | |
475 | daddr_t blkno; | |
476 | daddr_t lblkno; | |
477 | long size; | |
478 | int run; | |
479 | { | |
480 | if (!bp) { | |
e140149a | 481 | bp = getblk(vp, lblkno, size, 0, 0); |
888c761e MS |
482 | if (bp->b_flags & (B_DONE | B_DELWRI)) { |
483 | bp->b_blkno = blkno; | |
484 | return(bp); | |
485 | } | |
486 | } | |
487 | allocbuf(bp, run * size); | |
488 | bp->b_blkno = blkno; | |
489 | bp->b_iodone = cluster_callback; | |
490 | bp->b_flags |= flags | B_CALL; | |
491 | return(bp); | |
492 | } | |
493 | ||
494 | /* | |
495 | * Cleanup after a clustered read or write. | |
496 | */ | |
497 | void | |
498 | cluster_callback(bp) | |
499 | struct buf *bp; | |
500 | { | |
501 | struct cluster_save *b_save; | |
502 | struct buf **tbp; | |
503 | long bsize; | |
504 | caddr_t cp; | |
888c761e MS |
505 | b_save = (struct cluster_save *)(bp->b_saveaddr); |
506 | bp->b_saveaddr = b_save->bs_saveaddr; | |
507 | ||
508 | cp = bp->b_un.b_addr + b_save->bs_bufsize; | |
509 | for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) { | |
510 | pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize); | |
511 | cp += (*tbp)->b_bufsize; | |
512 | bp->b_bufsize -= (*tbp)->b_bufsize; | |
513 | biodone(*tbp); | |
514 | } | |
515 | #ifdef DIAGNOSTIC | |
516 | if (bp->b_bufsize != b_save->bs_bufsize) | |
517 | panic ("cluster_callback: more space to reclaim"); | |
518 | #endif | |
519 | bp->b_bcount = bp->b_bufsize; | |
520 | bp->b_iodone = NULL; | |
521 | free(b_save, M_SEGMENT); | |
522 | if (bp->b_flags & B_ASYNC) | |
523 | brelse(bp); | |
524 | else | |
525 | wakeup((caddr_t)bp); | |
526 | } | |
527 | ||
663dbc72 | 528 | /* |
d42a4811 KM |
529 | * Synchronous write. |
530 | * Release buffer on completion. | |
663dbc72 BJ |
531 | */ |
532 | bwrite(bp) | |
3efdd860 | 533 | register struct buf *bp; |
663dbc72 | 534 | { |
3789a403 | 535 | struct proc *p = curproc; /* XXX */ |
7188ac27 | 536 | register int flag; |
31222d0d | 537 | int s, error = 0; |
663dbc72 BJ |
538 | |
539 | flag = bp->b_flags; | |
f844ee62 | 540 | bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); |
77dc8a8c KM |
541 | if (flag & B_ASYNC) { |
542 | if ((flag & B_DELWRI) == 0) | |
543 | p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ | |
544 | else | |
545 | reassignbuf(bp, bp->b_vp); | |
546 | } | |
c5a600cf | 547 | trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno); |
4f083fd7 SL |
548 | if (bp->b_bcount > bp->b_bufsize) |
549 | panic("bwrite"); | |
86e7dd3b | 550 | s = splbio(); |
c669f646 | 551 | bp->b_vp->v_numoutput++; |
e140149a | 552 | bp->b_flags |= B_WRITEINPROG; |
86e7dd3b | 553 | splx(s); |
7188ac27 | 554 | VOP_STRATEGY(bp); |
3efdd860 KM |
555 | |
556 | /* | |
d42a4811 | 557 | * If the write was synchronous, then await I/O completion. |
3efdd860 | 558 | * If the write was "delayed", then we put the buffer on |
d42a4811 | 559 | * the queue of blocks awaiting I/O completion status. |
3efdd860 | 560 | */ |
d42a4811 | 561 | if ((flag & B_ASYNC) == 0) { |
7188ac27 | 562 | error = biowait(bp); |
77dc8a8c KM |
563 | if ((flag&B_DELWRI) == 0) |
564 | p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ | |
565 | else | |
566 | reassignbuf(bp, bp->b_vp); | |
e140149a KM |
567 | if (bp->b_flags & B_EINTR) { |
568 | bp->b_flags &= ~B_EINTR; | |
569 | error = EINTR; | |
570 | } | |
663dbc72 | 571 | brelse(bp); |
7188ac27 | 572 | } else if (flag & B_DELWRI) { |
31222d0d | 573 | s = splbio(); |
663dbc72 | 574 | bp->b_flags |= B_AGE; |
31222d0d | 575 | splx(s); |
7188ac27 KM |
576 | } |
577 | return (error); | |
663dbc72 BJ |
578 | } |
579 | ||
80746147 JH |
580 | int |
581 | vn_bwrite(ap) | |
582 | struct vop_bwrite_args *ap; | |
583 | { | |
37392cf8 | 584 | return (bwrite(ap->a_bp)); |
80746147 JH |
585 | } |
586 | ||
587 | ||
663dbc72 | 588 | /* |
d42a4811 KM |
589 | * Delayed write. |
590 | * | |
591 | * The buffer is marked dirty, but is not queued for I/O. | |
592 | * This routine should be used when the buffer is expected | |
593 | * to be modified again soon, typically a small write that | |
594 | * partially fills a buffer. | |
595 | * | |
596 | * NB: magnetic tapes cannot be delayed; they must be | |
597 | * written in the order that the writes are requested. | |
663dbc72 BJ |
598 | */ |
599 | bdwrite(bp) | |
3efdd860 | 600 | register struct buf *bp; |
663dbc72 | 601 | { |
3789a403 | 602 | struct proc *p = curproc; /* XXX */ |
663dbc72 | 603 | |
c669f646 KM |
604 | if ((bp->b_flags & B_DELWRI) == 0) { |
605 | bp->b_flags |= B_DELWRI; | |
606 | reassignbuf(bp, bp->b_vp); | |
3789a403 | 607 | p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ |
c669f646 | 608 | } |
7188ac27 | 609 | /* |
edadbc2c | 610 | * If this is a tape drive, the write must be initiated. |
7188ac27 | 611 | */ |
ec67a3ce | 612 | if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE) |
663dbc72 | 613 | bawrite(bp); |
edadbc2c | 614 | } else { |
d42a4811 | 615 | bp->b_flags |= (B_DONE | B_DELWRI); |
663dbc72 BJ |
616 | brelse(bp); |
617 | } | |
618 | } | |
619 | ||
620 | /* | |
d42a4811 KM |
621 | * Asynchronous write. |
622 | * Start I/O on a buffer, but do not wait for it to complete. | |
623 | * The buffer is released when the I/O completes. | |
663dbc72 BJ |
624 | */ |
625 | bawrite(bp) | |
3efdd860 | 626 | register struct buf *bp; |
663dbc72 BJ |
627 | { |
628 | ||
d42a4811 KM |
629 | /* |
630 | * Setting the ASYNC flag causes bwrite to return | |
631 | * after starting the I/O. | |
632 | */ | |
663dbc72 | 633 | bp->b_flags |= B_ASYNC; |
e140149a | 634 | (void) VOP_BWRITE(bp); |
663dbc72 BJ |
635 | } |
636 | ||
888c761e MS |
637 | /* |
638 | * Do clustered write for FFS. | |
639 | * | |
640 | * Three cases: | |
641 | * 1. Write is not sequential (write asynchronously) | |
642 | * Write is sequential: | |
643 | * 2. beginning of cluster - begin cluster | |
644 | * 3. middle of a cluster - add to cluster | |
645 | * 4. end of a cluster - asynchronously write cluster | |
646 | */ | |
647 | void | |
648 | cluster_write(bp, filesize) | |
649 | struct buf *bp; | |
650 | u_quad_t filesize; | |
651 | { | |
652 | struct vnode *vp; | |
653 | daddr_t lbn; | |
c5e0ddad | 654 | int clen; |
888c761e MS |
655 | |
656 | vp = bp->b_vp; | |
657 | lbn = bp->b_lblkno; | |
888c761e | 658 | |
c5e0ddad MS |
659 | /* Initialize vnode to beginning of file. */ |
660 | if (lbn == 0) | |
661 | vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; | |
662 | ||
663 | if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || | |
664 | (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) { | |
888c761e MS |
665 | if (vp->v_clen != 0) |
666 | /* | |
667 | * Write is not sequential. | |
668 | */ | |
669 | cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart, | |
670 | vp->v_lastw - vp->v_cstart + 1, lbn); | |
671 | /* | |
672 | * Consider beginning a cluster. | |
673 | */ | |
c5e0ddad MS |
674 | if ((lbn + 1) * bp->b_bcount == filesize) |
675 | /* End of file, make cluster as large as possible */ | |
676 | clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; | |
677 | else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) { | |
888c761e | 678 | bawrite(bp); |
c5e0ddad MS |
679 | vp->v_clen = 0; |
680 | vp->v_lasta = bp->b_blkno; | |
888c761e MS |
681 | vp->v_cstart = lbn + 1; |
682 | vp->v_lastw = lbn; | |
683 | return; | |
c5e0ddad MS |
684 | } else |
685 | clen = 0; | |
888c761e MS |
686 | vp->v_clen = clen; |
687 | if (clen == 0) { /* I/O not contiguous */ | |
688 | vp->v_cstart = lbn + 1; | |
689 | bawrite(bp); | |
690 | } else { /* Wait for rest of cluster */ | |
691 | vp->v_cstart = lbn; | |
692 | bdwrite(bp); | |
693 | } | |
694 | } else if (lbn == vp->v_cstart + vp->v_clen) { | |
695 | /* | |
696 | * At end of cluster, write it out. | |
697 | */ | |
698 | cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, | |
699 | vp->v_clen + 1, lbn); | |
700 | vp->v_clen = 0; | |
701 | vp->v_cstart = lbn + 1; | |
702 | } else | |
703 | /* | |
704 | * In the middle of a cluster, so just delay the | |
705 | * I/O for now. | |
706 | */ | |
707 | bdwrite(bp); | |
708 | vp->v_lastw = lbn; | |
c5e0ddad | 709 | vp->v_lasta = bp->b_blkno; |
888c761e MS |
710 | } |
711 | ||
712 | ||
713 | /* | |
714 | * This is an awful lot like cluster_rbuild...wish they could be combined. | |
715 | * The last lbn argument is the current block on which I/O is being | |
716 | * performed. Check to see that it doesn't fall in the middle of | |
717 | * the current block. | |
718 | */ | |
719 | void | |
720 | cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) | |
721 | struct vnode *vp; | |
722 | struct buf *last_bp; | |
723 | long size; | |
724 | daddr_t start_lbn; | |
725 | int len; | |
726 | daddr_t lbn; | |
727 | { | |
728 | struct cluster_save *b_save; | |
729 | struct buf *bp, *tbp; | |
730 | caddr_t cp; | |
731 | int i, s; | |
732 | ||
c5e0ddad MS |
733 | #ifdef DIAGNOSTIC |
734 | if (size != vp->v_mount->mnt_stat.f_iosize) | |
735 | panic("cluster_wbuild: size %d != filesize %d\n", | |
736 | size, vp->v_mount->mnt_stat.f_iosize); | |
737 | #endif | |
888c761e MS |
738 | redo: |
739 | while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) { | |
740 | ++start_lbn; | |
741 | --len; | |
742 | } | |
743 | ||
744 | /* Get more memory for current buffer */ | |
745 | if (len <= 1) { | |
c5e0ddad | 746 | if (last_bp) { |
888c761e | 747 | bawrite(last_bp); |
c5e0ddad MS |
748 | } else if (len) { |
749 | bp = getblk(vp, start_lbn, size, 0, 0); | |
750 | bawrite(bp); | |
751 | } | |
888c761e MS |
752 | return; |
753 | } | |
754 | ||
e140149a | 755 | bp = getblk(vp, start_lbn, size, 0, 0); |
888c761e MS |
756 | if (!(bp->b_flags & B_DELWRI)) { |
757 | ++start_lbn; | |
758 | --len; | |
759 | brelse(bp); | |
760 | goto redo; | |
761 | } | |
762 | ||
763 | --len; | |
764 | b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save), | |
765 | M_SEGMENT, M_WAITOK); | |
766 | b_save->bs_bcount = bp->b_bcount; | |
767 | b_save->bs_bufsize = bp->b_bufsize; | |
768 | b_save->bs_nchildren = 0; | |
769 | b_save->bs_children = (struct buf **)(b_save + 1); | |
770 | b_save->bs_saveaddr = bp->b_saveaddr; | |
771 | bp->b_saveaddr = (caddr_t) b_save; | |
772 | ||
773 | ||
774 | bp->b_flags |= B_CALL; | |
775 | bp->b_iodone = cluster_callback; | |
776 | cp = bp->b_un.b_addr + bp->b_bufsize; | |
777 | for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) { | |
778 | if (!incore(vp, start_lbn) || start_lbn == lbn) | |
779 | break; | |
780 | ||
781 | if (last_bp == NULL || start_lbn != last_bp->b_lblkno) { | |
e140149a | 782 | tbp = getblk(vp, start_lbn, size, 0, 0); |
888c761e MS |
783 | #ifdef DIAGNOSTIC |
784 | if (tbp->b_bcount != tbp->b_bufsize) | |
785 | panic("cluster_wbuild: Buffer too big"); | |
786 | #endif | |
787 | if (!(tbp->b_flags & B_DELWRI)) { | |
788 | brelse(tbp); | |
789 | break; | |
790 | } | |
791 | } else | |
792 | tbp = last_bp; | |
793 | ||
794 | ++b_save->bs_nchildren; | |
795 | ||
796 | /* Move memory from children to parent */ | |
c5e0ddad MS |
797 | if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) { |
798 | printf("Clustered Block: %d addr %x bufsize: %d\n", | |
799 | bp->b_lblkno, bp->b_blkno, bp->b_bufsize); | |
800 | printf("Child Block: %d addr: %x\n", tbp->b_lblkno, | |
801 | tbp->b_blkno); | |
802 | panic("Clustered write to wrong blocks"); | |
803 | } | |
804 | ||
888c761e MS |
805 | pagemove(tbp->b_un.b_daddr, cp, size); |
806 | bp->b_bcount += size; | |
807 | bp->b_bufsize += size; | |
808 | ||
809 | tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); | |
810 | tbp->b_flags |= B_ASYNC; | |
811 | s = splbio(); | |
812 | reassignbuf(tbp, tbp->b_vp); /* put on clean list */ | |
813 | ++tbp->b_vp->v_numoutput; | |
814 | splx(s); | |
815 | b_save->bs_children[i] = tbp; | |
816 | ||
817 | cp += tbp->b_bufsize; | |
818 | } | |
819 | ||
820 | if (i == 0) { | |
821 | /* None to cluster */ | |
822 | bp->b_saveaddr = b_save->bs_saveaddr; | |
823 | bp->b_flags &= ~B_CALL; | |
824 | bp->b_iodone = NULL; | |
825 | free(b_save, M_SEGMENT); | |
826 | } | |
827 | bawrite(bp); | |
828 | if (i < len) { | |
829 | len -= i + 1; | |
830 | start_lbn += 1; | |
831 | goto redo; | |
832 | } | |
833 | } | |
834 | ||
/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 *
 * The buffer is returned to the appropriate free list based on its
 * state flags, and the B_BUSY/B_WANTED/B_ASYNC/B_AGE/B_NOCACHE bits
 * are cleared so the buffer can be reclaimed or re-found in the cache.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 * NOTE(review): these wakeups happen before splbio() below —
	 * presumably safe in this kernel's single-processor model; verify.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		/* drop the vnode association; any delayed write is abandoned */
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		/* valid data: keep on LRU (or LOCKED/AGE) so it can be reused */
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}
895 | ||
896 | /* | |
d42a4811 | 897 | * Check to see if a block is currently memory resident. |
663dbc72 | 898 | */ |
e140149a | 899 | struct buf * |
7188ac27 KM |
900 | incore(vp, blkno) |
901 | struct vnode *vp; | |
3efdd860 | 902 | daddr_t blkno; |
663dbc72 BJ |
903 | { |
904 | register struct buf *bp; | |
663dbc72 | 905 | |
e3249ec0 | 906 | for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next) |
edadbc2c | 907 | if (bp->b_lblkno == blkno && bp->b_vp == vp && |
3efdd860 | 908 | (bp->b_flags & B_INVAL) == 0) |
e140149a KM |
909 | return (bp); |
910 | return (NULL); | |
663dbc72 BJ |
911 | } |
912 | ||
edadbc2c | 913 | /* |
d42a4811 KM |
914 | * Check to see if a block is currently memory resident. |
915 | * If it is resident, return it. If it is not resident, | |
916 | * allocate a new buffer and assign it to the block. | |
663dbc72 BJ |
917 | */ |
918 | struct buf * | |
ec67a3ce MK |
919 | #ifdef SECSIZE |
920 | getblk(dev, blkno, size, secsize) | |
921 | #else SECSIZE | |
e140149a | 922 | getblk(vp, blkno, size, slpflag, slptimeo) |
7188ac27 | 923 | register struct vnode *vp; |
ad30fb67 | 924 | daddr_t blkno; |
e140149a | 925 | int size, slpflag, slptimeo; |
ec67a3ce MK |
926 | #ifdef SECSIZE |
927 | long secsize; | |
928 | #endif SECSIZE | |
663dbc72 | 929 | { |
e3249ec0 KM |
930 | register struct buf *bp; |
931 | struct list_entry *dp; | |
e140149a | 932 | int s, error; |
663dbc72 | 933 | |
00a6a148 KM |
934 | if (size > MAXBSIZE) |
935 | panic("getblk: size too big"); | |
3efdd860 | 936 | /* |
d42a4811 KM |
937 | * Search the cache for the block. If the buffer is found, |
938 | * but it is currently locked, the we must wait for it to | |
939 | * become available. | |
3efdd860 | 940 | */ |
7188ac27 | 941 | dp = BUFHASH(vp, blkno); |
3efdd860 | 942 | loop: |
e3249ec0 | 943 | for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) { |
e140149a | 944 | if (bp->b_lblkno != blkno || bp->b_vp != vp) |
663dbc72 | 945 | continue; |
a5e62f37 | 946 | s = splbio(); |
d42a4811 | 947 | if (bp->b_flags & B_BUSY) { |
663dbc72 | 948 | bp->b_flags |= B_WANTED; |
e140149a KM |
949 | error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), |
950 | "getblk", slptimeo); | |
23900030 | 951 | splx(s); |
e140149a KM |
952 | if (error) |
953 | return (NULL); | |
663dbc72 BJ |
954 | goto loop; |
955 | } | |
e140149a KM |
956 | /* |
957 | * The test for B_INVAL is moved down here, since there | |
958 | * are cases where B_INVAL is set before VOP_BWRITE() is | |
959 | * called and for NFS, the process cannot be allowed to | |
960 | * allocate a new buffer for the same block until the write | |
961 | * back to the server has been completed. (ie. B_BUSY clears) | |
962 | */ | |
963 | if (bp->b_flags & B_INVAL) { | |
964 | splx(s); | |
965 | continue; | |
966 | } | |
c669f646 KM |
967 | bremfree(bp); |
968 | bp->b_flags |= B_BUSY; | |
23900030 | 969 | splx(s); |
32a56bda | 970 | if (bp->b_bcount != size) { |
edadbc2c KM |
971 | printf("getblk: stray size"); |
972 | bp->b_flags |= B_INVAL; | |
e140149a | 973 | VOP_BWRITE(bp); |
9d6d37ce | 974 | goto loop; |
edadbc2c | 975 | } |
663dbc72 | 976 | bp->b_flags |= B_CACHE; |
a5e62f37 | 977 | return (bp); |
663dbc72 | 978 | } |
e140149a KM |
979 | /* |
980 | * The loop back to the top when getnewbuf() fails is because | |
981 | * stateless filesystems like NFS have no node locks. Thus, | |
982 | * there is a slight chance that more than one process will | |
983 | * try and getnewbuf() for the same block concurrently when | |
984 | * the first sleeps in getnewbuf(). So after a sleep, go back | |
985 | * up to the top to check the hash lists again. | |
986 | */ | |
987 | if ((bp = getnewbuf(slpflag, slptimeo)) == 0) | |
988 | goto loop; | |
3efdd860 | 989 | bremhash(bp); |
edadbc2c | 990 | bgetvp(vp, bp); |
521a4688 | 991 | bp->b_bcount = 0; |
edadbc2c | 992 | bp->b_lblkno = blkno; |
ec67a3ce MK |
993 | #ifdef SECSIZE |
994 | bp->b_blksize = secsize; | |
995 | #endif SECSIZE | |
ad30fb67 | 996 | bp->b_blkno = blkno; |
4f083fd7 | 997 | bp->b_error = 0; |
7188ac27 KM |
998 | bp->b_resid = 0; |
999 | binshash(bp, dp); | |
521a4688 | 1000 | allocbuf(bp, size); |
a5e62f37 | 1001 | return (bp); |
663dbc72 BJ |
1002 | } |
1003 | ||
1004 | /* | |
d42a4811 KM |
1005 | * Allocate a buffer. |
1006 | * The caller will assign it to a block. | |
663dbc72 BJ |
1007 | */ |
1008 | struct buf * | |
ad30fb67 KM |
1009 | geteblk(size) |
1010 | int size; | |
663dbc72 | 1011 | { |
37392cf8 | 1012 | register struct buf *bp; |
663dbc72 | 1013 | |
00a6a148 KM |
1014 | if (size > MAXBSIZE) |
1015 | panic("geteblk: size too big"); | |
e140149a KM |
1016 | while ((bp = getnewbuf(0, 0)) == NULL) |
1017 | /* void */; | |
4f083fd7 | 1018 | bp->b_flags |= B_INVAL; |
3efdd860 | 1019 | bremhash(bp); |
37392cf8 | 1020 | binshash(bp, &invalhash); |
521a4688 | 1021 | bp->b_bcount = 0; |
ec67a3ce MK |
1022 | #ifdef SECSIZE |
1023 | bp->b_blksize = DEV_BSIZE; | |
1024 | #endif SECSIZE | |
4f083fd7 | 1025 | bp->b_error = 0; |
7188ac27 | 1026 | bp->b_resid = 0; |
521a4688 | 1027 | allocbuf(bp, size); |
a5e62f37 | 1028 | return (bp); |
663dbc72 BJ |
1029 | } |
1030 | ||
/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 *
 * Memory is moved between buffers in CLBYTES-rounded units using
 * pagemove(); surplus space is parked on a header from BQ_EMPTY,
 * and needed space is stolen from buffers handed out by getnewbuf().
 * Always returns 1 in this version.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		/* hand the tail of tp's memory to the empty header ep */
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		/* getnewbuf(0, 0) sleeps uninterruptibly until one is free */
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		/* steal `take' bytes from the end of bp onto the end of tp */
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			/* bp is now memoryless: move to the invalid hash */
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
1101 | ||
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * Returns a B_BUSY buffer with its vnode and credentials released,
 * or NULL if the (possibly interruptible) sleep for a free buffer
 * was broken; callers with slpflag==0 typically loop until success.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	/* scan AGE, then LRU; BQ_LOCKED (bufqueues[0]) is never raided */
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		/* push delayed-write data out first, then retry the scan */
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	/* drop any read/write credentials held from a previous owner */
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	/* reset all state; only B_BUSY survives */
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}
1154 | ||
663dbc72 | 1155 | /* |
d42a4811 KM |
1156 | * Wait for I/O to complete. |
1157 | * | |
1158 | * Extract and return any errors associated with the I/O. | |
1159 | * If the error flag is set, but no specific error is | |
1160 | * given, return EIO. | |
663dbc72 | 1161 | */ |
3efdd860 | 1162 | biowait(bp) |
ad30fb67 | 1163 | register struct buf *bp; |
663dbc72 | 1164 | { |
530d0032 | 1165 | int s; |
663dbc72 | 1166 | |
a5e62f37 | 1167 | s = splbio(); |
a937f856 | 1168 | while ((bp->b_flags & B_DONE) == 0) |
663dbc72 | 1169 | sleep((caddr_t)bp, PRIBIO); |
530d0032 | 1170 | splx(s); |
7188ac27 KM |
1171 | if ((bp->b_flags & B_ERROR) == 0) |
1172 | return (0); | |
1173 | if (bp->b_error) | |
1174 | return (bp->b_error); | |
1175 | return (EIO); | |
663dbc72 BJ |
1176 | } |
1177 | ||
663dbc72 | 1178 | /* |
af04ce66 | 1179 | * Mark I/O complete on a buffer. |
d42a4811 KM |
1180 | * |
1181 | * If a callback has been requested, e.g. the pageout | |
1182 | * daemon, do so. Otherwise, awaken waiting processes. | |
663dbc72 | 1183 | */ |
251f56ba | 1184 | void |
3efdd860 KM |
1185 | biodone(bp) |
1186 | register struct buf *bp; | |
663dbc72 | 1187 | { |
663dbc72 | 1188 | |
80e7c811 | 1189 | if (bp->b_flags & B_DONE) |
3efdd860 | 1190 | panic("dup biodone"); |
663dbc72 | 1191 | bp->b_flags |= B_DONE; |
76429560 KM |
1192 | if ((bp->b_flags & B_READ) == 0) |
1193 | vwakeup(bp); | |
961945a8 SL |
1194 | if (bp->b_flags & B_CALL) { |
1195 | bp->b_flags &= ~B_CALL; | |
1196 | (*bp->b_iodone)(bp); | |
1197 | return; | |
1198 | } | |
d42a4811 | 1199 | if (bp->b_flags & B_ASYNC) |
663dbc72 BJ |
1200 | brelse(bp); |
1201 | else { | |
1202 | bp->b_flags &= ~B_WANTED; | |
1203 | wakeup((caddr_t)bp); | |
1204 | } | |
1205 | } | |
aa95c6fc | 1206 | |
b5d79df9 MS |
1207 | int |
1208 | count_lock_queue() | |
1209 | { | |
1210 | register struct buf *bp; | |
1211 | register int ret; | |
1212 | ||
1213 | for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next; | |
1214 | bp; bp = (struct buf *)bp->b_freelist.qe_next) | |
1215 | ++ret; | |
1216 | return(ret); | |
1217 | } | |
1218 | ||
#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 *
 * For each free list, prints the total buffer count and a histogram
 * of buffer counts bucketed by b_bufsize in CLBYTES units.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		/* walk the free list at splbio so it cannot change underfoot */
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */