* Copyright (c) 1993 The Regents of the University of California.
* %sccs.include.redist.c%
* @(#)vfs_cluster.c 7.60 (Berkeley) %G%
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
/*
 * Forward declarations for the clustering helpers defined below.
 *
 * NOTE(review): the cluster_newbuf prototype was truncated in this copy of
 * the file (it broke off after "daddr_t,"); the argument list below is
 * reconstructed from the definition of
 * cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) later in this
 * file -- confirm against a master copy.
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
		daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
		daddr_t, daddr_t, long, int, long));
void cluster_wbuild __P((struct vnode *, struct buf *, long size,
		daddr_t start_lbn, int len, daddr_t lbn));
* We could optimize this by keeping track of where the last read-ahead
* was, but it would involve adding fields to the vnode. For now, let's
* just get it working.
* This replaces bread. If this is a bread at the beginning of a file and
* lastr is 0, we assume this is the first read and we'll read up to two
* blocks if they are sequential. After that, we'll do regular read ahead
* There are 4 or 5 cases depending on how you count:
* Desired block is in the cache:
* 1 Not sequential access (0 I/Os).
* 2 Access is sequential, do read-ahead (1 ASYNC).
* Desired block is not in cache:
* 3 Not sequential access (1 SYNC).
* 4 Sequential access, next block is contiguous (1 SYNC).
* 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
* There are potentially two buffers that require I/O.
* bp is the block requested.
* rbp is the read-ahead block.
* If either is NULL, then you don't have to do the I/O.
/*
 * cluster_read --
 *	Clustered-read replacement for bread().  Gets the buffer for the
 *	requested logical block and, when the access pattern looks
 *	sequential (tracked through vp->v_lastr and the read-ahead window
 *	vp->v_ralen), also starts an asynchronous read-ahead, possibly as
 *	a multi-block cluster built by cluster_rbuild().
 *
 * NOTE(review): lines were lost from this copy of the file (parameter
 * declarations; local declarations for bp, rbp, ioblkno, blkno, flags;
 * braces; and pieces of several statements).  The comments below
 * describe only what the surviving fragments show -- confirm against a
 * full copy before relying on them.
 */
cluster_read(vp
, filesize
, lblkno
, size
, cred
, bpp
)
/* Surviving local declarations; the rest were lost in extraction. */
int error
, num_ra
, alreadyincore
;
/* A zero-length read is a caller bug. */
panic("cluster_read: size = 0");
/* Look up (or create) the buffer for the requested block. */
*bpp
= bp
= getblk(vp
, lblkno
, size
, 0, 0);
/* B_CACHE|B_DONE|B_DELWRI: the desired block is already valid (cases 1, 2). */
if (bp
->b_flags
& (B_CACHE
| B_DONE
| B_DELWRI
)) {
* Desired block is in cache; do any readahead ASYNC.
trace(TR_BREADHIT
, pack(vp
, size
), lblkno
);
/*
 * NOTE(review): the assignment target of this expression was lost;
 * it appears to compute the read-ahead block number (ioblkno) from
 * lblkno and the current window vp->v_ralen.
 */
(lblkno
< vp
->v_ralen
? vp
->v_ralen
>> 1 : vp
->v_ralen
);
/* Remember whether the read-ahead target is already cached. */
alreadyincore
= (int)incore(vp
, ioblkno
);
/* Block wasn't in cache, case 3, 4, 5. */
trace(TR_BREADMISS
, pack(vp
, size
), lblkno
);
/* Charge the process for one block of input. */
curproc
->p_stats
->p_ru
.ru_inblock
++; /* XXX */
* Replace 1 with a window size based on some permutation of
* maxcontig and rot_delay. This will let you figure out how
* many blocks you should read-ahead (case 2, 4, 5).
* If the access isn't sequential, cut the window size in half.
if (lblkno
!= vp
->v_lastr
+ 1 && lblkno
!= 0)
vp
->v_ralen
= max(vp
->v_ralen
>> 1, 1);
/*
 * Sequential access: the read-ahead block lies within the file, is
 * not cached, and VOP_BMAP found its disk address (blkno) plus the
 * count of contiguous blocks following it (num_ra).
 */
else if ((ioblkno
+ 1) * size
< filesize
&& !alreadyincore
&&
!(error
= VOP_BMAP(vp
, ioblkno
, NULL
, &blkno
, &num_ra
))) {
* Reading sequentially, and the next block is not in the
* cache. We are going to try reading ahead. If this is
* the first read of a file, then limit read-ahead to a
* single block, else read as much as we're allowed.
/*
 * Grow the window, capped at MAXPHYS/size; the second assignment
 * clamps it to num_ra + 1.  NOTE(review): the branch structure
 * separating these two assignments was lost in this copy.
 */
if (num_ra
> vp
->v_ralen
) {
vp
->v_ralen
= min(MAXPHYS
/ size
, vp
->v_ralen
<< 1);
vp
->v_ralen
= num_ra
+ 1;
if (num_ra
) /* case 2, 4 */
rbp
= cluster_rbuild(vp
, filesize
,
bp
, ioblkno
, blkno
, size
, num_ra
, flags
);
else if (lblkno
!= 0 && ioblkno
== lblkno
) {
/* Case 5: check how many blocks to read ahead */
/*
 * NOTE(review): the head of this condition (presumably the
 * VOP_BMAP call refreshing blkno/num_ra) was lost; only its
 * argument list survives below.
 */
if ((ioblkno
+ 1) * size
> filesize
||
ioblkno
, NULL
, &blkno
, &num_ra
)))
rbp
= cluster_rbuild(vp
, filesize
,
NULL
, ioblkno
, blkno
, size
, num_ra
, flags
);
/* Presumably the else-branch: plain single-block read-ahead buffer. */
rbp
= getblk(vp
, ioblkno
, size
, 0, 0);
} else if (lblkno
!= 0) {
/* case 2; read ahead single block */
rbp
= getblk(vp
, ioblkno
, size
, 0, 0);
} else if (bp
) /* case 1, 3, block 0 */
/* Case 1 on block 0; not really doing sequential I/O */
if (rbp
== bp
) /* case 4 */
else if (rbp
) { /* case 2, 5 */
/* Trace the clustered read-ahead; the call head was lost in this copy. */
pack(vp
, (num_ra
+ 1) * size
), ioblkno
);
curproc
->p_stats
->p_ru
.ru_inblock
++; /* XXX */
/* XXX Kirk, do we need to make sure the bp has creds? */
/* Issue the synchronous read for the requested block. */
if (bp
->b_flags
& (B_DONE
| B_DELWRI
))
panic("cluster_read: DONE bp");
error
= VOP_STRATEGY(bp
);
/*
 * If the synchronous read failed, or the read-ahead buffer is somehow
 * already valid, cancel the read-ahead instead of starting it.
 */
if (error
|| rbp
->b_flags
& (B_DONE
| B_DELWRI
)) {
rbp
->b_flags
&= ~(B_ASYNC
| B_READ
);
/* Otherwise fire off the asynchronous read-ahead. */
(void) VOP_STRATEGY(rbp
);
* If blocks are contiguous on disk, use this to provide clustered
* read ahead. We will read as many blocks as possible sequentially
* and then parcel them up into logical blocks in the buffer hash table.
/*
 * cluster_rbuild --
 *	Build a clustered read: obtain one buffer large enough for up to
 *	run + 1 contiguous disk blocks starting at blkno, and record the
 *	per-logical-block child buffers in a cluster_save structure hung
 *	off bp->b_saveaddr so cluster_callback() can parcel the data back
 *	out on completion.
 *
 * NOTE(review): lines were lost from this copy (parameter and local
 * declarations, the malloc arguments after the size expression, loop
 * braces, and the condition guarding the "wrong disk address" panic).
 * Comments describe only the surviving fragments.
 */
cluster_rbuild(vp
, filesize
, bp
, lbn
, blkno
, size
, run
, flags
)
struct cluster_save
*b_save
;
/* Clustered I/O is done only in filesystem-block-size units. */
if (size
!= vp
->v_mount
->mnt_stat
.f_iosize
)
panic("cluster_rbuild: size %d != filesize %d\n",
size
, vp
->v_mount
->mnt_stat
.f_iosize
);
/* Don't let the cluster extend past end of file; fall back below. */
if (size
* (lbn
+ run
+ 1) > filesize
)
bp
= getblk(vp
, lbn
, size
, 0, 0);
/* Get (or grow) a buffer sized for the whole cluster. */
bp
= cluster_newbuf(vp
, bp
, flags
, blkno
, lbn
, size
, run
+ 1);
/* Buffer already valid: no I/O needed (branch body lost in this copy). */
if (bp
->b_flags
& (B_DONE
| B_DELWRI
))
/* Allocate the bookkeeping header and child-pointer array together. */
b_save
= malloc(sizeof(struct buf
*) * run
+ sizeof(struct cluster_save
),
b_save
->bs_bufsize
= b_save
->bs_bcount
= size
;
b_save
->bs_nchildren
= 0;
/* Children array lives immediately after the header. */
b_save
->bs_children
= (struct buf
**)(b_save
+ 1);
/* Chain the previous saveaddr so it can be restored in the callback. */
b_save
->bs_saveaddr
= bp
->b_saveaddr
;
bp
->b_saveaddr
= (caddr_t
) b_save
;
/*
 * Walk the remaining blocks of the cluster.  NOTE(review): bn, i and
 * inc (the disk-block increment per fs block) are declared in lines
 * lost from this copy.
 */
for (bn
= blkno
+ inc
, i
= 1; i
<= run
; ++i
, bn
+= inc
) {
/* A later block already cached: stop extending (body truncated here). */
if (incore(vp
, lbn
+ i
)) {
bp
->b_saveaddr
= b_save
->bs_saveaddr
;
/* Create the child buffer for logical block lbn + i. */
tbp
= getblk(vp
, lbn
+ i
, 0, 0, 0);
tbp
->b_bcount
= tbp
->b_bufsize
= size
;
/* Debug cross-check of the child's disk address against VOP_BMAP. */
VOP_BMAP(tbp
->b_vp
, tbp
->b_lblkno
, NULL
, &temp
, NULL
);
printf("Block: %d Assigned address: %x Bmap address: %x\n",
tbp
->b_lblkno
, tbp
->b_blkno
, temp
);
panic("cluster_rbuild: wrong disk address");
/* Mark the child as part of the async clustered read and record it. */
tbp
->b_flags
|= flags
| B_READ
| B_ASYNC
;
b_save
->bs_children
[i
- 1] = tbp
;
/* Synchronous cluster (caller waits): trim the window, floor of 1. */
if (!(bp
->b_flags
& B_ASYNC
))
vp
->v_ralen
= max(vp
->v_ralen
- 1, 1);
* Either get a new buffer or grow the existing one.
/*
 * cluster_newbuf --
 *	Either get a new buffer or grow the existing one so it can hold a
 *	run-block cluster; b_iodone is pointed at cluster_callback with
 *	B_CALL set so I/O completion unwinds the cluster.
 *
 * NOTE(review): declarations, several statements, and the return were
 * lost from this copy; comments describe only the surviving fragments.
 */
cluster_newbuf(vp
, bp
, flags
, blkno
, lblkno
, size
, run
)
/* Allocate a buffer (the guard condition was lost in this copy). */
bp
= getblk(vp
, lblkno
, size
, 0, 0);
/* Buffer already valid: (branch body lost in this copy). */
if (bp
->b_flags
& (B_DONE
| B_DELWRI
)) {
/* Grow the buffer to cover the whole run of blocks. */
allocbuf(bp
, run
* size
);
/* Arrange for cluster_callback() to run at I/O completion. */
bp
->b_iodone
= cluster_callback
;
bp
->b_flags
|= flags
| B_CALL
;
* Cleanup after a clustered read or write.
/*
 * Body fragment of cluster_callback() -- the function header line was
 * lost from this copy.  Cleanup after a clustered read or write: walk
 * the bs_children list, move each child's data out of the big parent
 * buffer with pagemove(), verify the children are physically
 * contiguous, and restore the parent's saveaddr and bcount.
 */
struct cluster_save
*b_save
;
/* Recover the bookkeeping hung off b_saveaddr by cluster_{rbuild,wbuild}. */
b_save
= (struct cluster_save
*)(bp
->b_saveaddr
);
bp
->b_saveaddr
= b_save
->bs_saveaddr
;
/* cp: data cursor just past the parent's own bs_bufsize bytes. */
cp
= bp
->b_un
.b_addr
+ b_save
->bs_bufsize
;
/* daddr: expected disk address of the next child, in DEV_BSIZE units. */
daddr
= bp
->b_blkno
+ b_save
->bs_bufsize
/ DEV_BSIZE
;
/* Hand each child its share of the parent's data. */
for (tbp
= b_save
->bs_children
; b_save
->bs_nchildren
--; ++tbp
) {
pagemove(cp
, (*tbp
)->b_un
.b_addr
, (*tbp
)->b_bufsize
);
/* The parent shrinks by whatever the child took. */
bp
->b_bufsize
-= (*tbp
)->b_bufsize
;
/* Children must be physically contiguous; dump diagnostics if not. */
if ((*tbp
)->b_blkno
!= daddr
) {
printf("cluster_callback: bad disk address:\n");
printf("Clustered Block: %d DiskAddr: %x bytes left: %d\n",
bp
->b_lblkno
, bp
->b_blkno
, bp
->b_bufsize
);
printf("\toriginal size: %d flags: %x\n", bp
->b_bcount
,
printf("Child Block: %d DiskAddr: %x bytes: %d\n",
(*tbp
)->b_lblkno
, (*tbp
)->b_blkno
,
/* NOTE(review): ip (the inode) is declared in lines lost from this copy. */
printf("daddr: %x i_size %qd\n", daddr
, ip
->i_size
);
if ((*tbp
)->b_lblkno
< NDADDR
)
printf("Child block pointer from inode: %x\n",
ip
->i_din
.di_db
[(*tbp
)->b_lblkno
]);
panic ("cluster_callback: bad disk address");
daddr
+= (*tbp
)->b_bufsize
/ DEV_BSIZE
;
/* All parceled out: exactly bs_bufsize bytes must remain with the parent. */
if (bp
->b_bufsize
!= b_save
->bs_bufsize
)
panic ("cluster_callback: more space to reclaim");
bp
->b_bcount
= bp
->b_bufsize
;
/* Async completion: (rest of this branch lost in this copy). */
if (bp
->b_flags
& B_ASYNC
)
* Do clustered write for FFS.
* 1. Write is not sequential (write asynchronously)
* 2. beginning of cluster - begin cluster
* 3. middle of a cluster - add to cluster
* 4. end of a cluster - asynchronously write cluster
/*
 * cluster_write --
 *	Entry point for clustered (delayed) writes.  Per the cases in the
 *	comment above: start a new cluster, add to the current one, or --
 *	when the write is not sequential or the cluster is complete --
 *	push the accumulated cluster out through cluster_wbuild().
 *
 * NOTE(review): declarations (vp, lbn, clen) and a number of statements
 * were lost from this copy; comments describe surviving fragments.
 */
cluster_write(bp
, filesize
)
/* Initialize vnode to beginning of file. */
vp
->v_lasta
= vp
->v_clen
= vp
->v_cstart
= vp
->v_lastw
= 0;
/* No cluster in progress, not sequential, or physically discontiguous. */
if (vp
->v_clen
== 0 || lbn
!= vp
->v_lastw
+ 1 ||
(bp
->b_blkno
!= vp
->v_lasta
+ bp
->b_bcount
/ DEV_BSIZE
)) {
* Write is not sequential.
/* Flush whatever cluster had been accumulating. */
cluster_wbuild(vp
, NULL
, bp
->b_bcount
, vp
->v_cstart
,
vp
->v_lastw
- vp
->v_cstart
+ 1, lbn
);
* Consider beginning a cluster.
if ((lbn
+ 1) * bp
->b_bcount
== filesize
)
/* End of file, make cluster as large as possible */
clen
= MAXBSIZE
/ vp
->v_mount
->mnt_stat
.f_iosize
- 1;
/* Ask the filesystem how many contiguous blocks follow lbn (clen). */
else if (VOP_BMAP(vp
, lbn
, NULL
, &bp
->b_blkno
, &clen
)) {
vp
->v_lasta
= bp
->b_blkno
;
if (clen
== 0) { /* I/O not contiguous */
} else { /* Wait for rest of cluster */
/* Sequential write reaching the planned cluster length: write it out. */
} else if (lbn
== vp
->v_cstart
+ vp
->v_clen
) {
* At end of cluster, write it out.
cluster_wbuild(vp
, bp
, bp
->b_bcount
, vp
->v_cstart
,
* In the middle of a cluster, so just delay the write until the
* cluster completes or is broken.
/* Remember this block's physical address for the contiguity test above. */
vp
->v_lasta
= bp
->b_blkno
;
* This is an awful lot like cluster_rbuild...wish they could be combined.
* The last lbn argument is the current block on which I/O is being
* performed. Check to see that it doesn't fall in the middle of
/*
 * cluster_wbuild --
 *	Gather up to len delayed-write buffers starting at start_lbn into
 *	one contiguous parent buffer for a single write.  last_bp, if
 *	non-NULL, is the caller's buffer for block lbn and is used
 *	directly instead of being looked up; lbn is the block currently
 *	being written (see the comment above this function).
 *
 * NOTE(review): many lines were lost from this copy (declarations, the
 * malloc arguments after the size expression, brelse/write calls, loop
 * braces); comments describe only what survives.
 */
cluster_wbuild(vp
, last_bp
, size
, start_lbn
, len
, lbn
)
struct cluster_save
*b_save
;
/* Clustered I/O is done only in filesystem-block-size units. */
if (size
!= vp
->v_mount
->mnt_stat
.f_iosize
)
panic("cluster_wbuild: size %d != filesize %d\n",
size
, vp
->v_mount
->mnt_stat
.f_iosize
);
/* Advance over blocks that cannot head a cluster (not cached, not lbn). */
while ((!incore(vp
, start_lbn
) || start_lbn
== lbn
) && len
) {
/* Get more memory for current buffer */
/*
 * NOTE(review): two identical getblk calls survive back-to-back here;
 * the branch conditions that separated them were lost in this copy.
 */
bp
= getblk(vp
, start_lbn
, size
, 0, 0);
bp
= getblk(vp
, start_lbn
, size
, 0, 0);
/* Parent not dirty: nothing to cluster (branch body lost in this copy). */
if (!(bp
->b_flags
& B_DELWRI
)) {
/* Allocate the bookkeeping header and child-pointer array together. */
b_save
= malloc(sizeof(struct buf
*) * len
+ sizeof(struct cluster_save
),
b_save
->bs_bcount
= bp
->b_bcount
;
b_save
->bs_bufsize
= bp
->b_bufsize
;
b_save
->bs_nchildren
= 0;
b_save
->bs_children
= (struct buf
**)(b_save
+ 1);
/* Chain the previous saveaddr; cluster_callback() restores it. */
b_save
->bs_saveaddr
= bp
->b_saveaddr
;
bp
->b_saveaddr
= (caddr_t
) b_save
;
bp
->b_iodone
= cluster_callback
;
/* cp: cursor where the next child's data is appended to the parent. */
cp
= bp
->b_un
.b_addr
+ bp
->b_bufsize
;
/* Gather the following blocks as children of the parent buffer. */
for (++start_lbn
, i
= 0; i
< len
; ++i
, ++start_lbn
) {
/* Stop conditions: block not cached, or it is the in-flight lbn. */
if (!incore(vp
, start_lbn
) || start_lbn
== lbn
)
/* Use last_bp itself for block lbn; otherwise look the child up. */
if (last_bp
== NULL
|| start_lbn
!= last_bp
->b_lblkno
) {
tbp
= getblk(vp
, start_lbn
, size
, 0, 0);
if (tbp
->b_bcount
!= tbp
->b_bufsize
)
panic("cluster_wbuild: Buffer too big");
/* Child not dirty: (branch body lost in this copy). */
if (!(tbp
->b_flags
& B_DELWRI
)) {
/* Move memory from children to parent */
/* Children must be physically contiguous with the parent so far. */
if (tbp
->b_blkno
!= (bp
->b_blkno
+ bp
->b_bufsize
/ DEV_BSIZE
)) {
printf("Clustered Block: %d addr %x bufsize: %d\n",
bp
->b_lblkno
, bp
->b_blkno
, bp
->b_bufsize
);
printf("Child Block: %d addr: %x\n", tbp
->b_lblkno
,
panic("Clustered write to wrong blocks");
/*
 * Append the child's data at cp.  NOTE(review): the statements that
 * advance cp and grow bp->b_bufsize were lost from this copy.
 */
pagemove(tbp
->b_un
.b_daddr
, cp
, size
);
/* The child no longer holds dirty data; account it as in-progress output. */
tbp
->b_flags
&= ~(B_READ
| B_DONE
| B_ERROR
| B_DELWRI
);
reassignbuf(tbp
, tbp
->b_vp
); /* put on clean list */
++tbp
->b_vp
->v_numoutput
;
b_save
->bs_children
[i
] = tbp
;
/* Restore saveaddr (fragment; presumably an abort/cleanup path). */
bp
->b_saveaddr
= b_save
->bs_saveaddr
;