/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) %G%
 */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#ifdef DEBUG
#include <sys/sysctl.h>
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
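/*
 * To make the macro concrete: with vp->v_lastr == 7, ISSEQREAD(vp, 8)
 * and ISSEQREAD(vp, 7) are both true, while ISSEQREAD(vp, 12) is false.
 * A re-read of the last block still counts as sequential, so a file
 * consumed in chunks smaller than the block size keeps its readahead.
 */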
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered transfers.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
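/*
 * A rough trace of the cases (illustration only; assumes blocks 0-3 are
 * contiguous on disk and the cache starts cold):
 *	read block 0: case 3, one SYNC read, not yet sequential
 *	read block 1: case 4, one SYNC cluster read that covers block 2 too
 *	read block 2: case 1 or 2, satisfied from the cache
 *	seek, read block 0: case 1, non-sequential but cached, no I/O
 */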
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif
	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
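	/*
	 * Worked example of the window adjustment below (illustrative
	 * values): with v_ralen == 4, finding num_ra == 12 contiguous
	 * blocks on a fresh sequential read doubles the window to 8;
	 * conversely, if the readahead target was inside a previous
	 * readahead (ioblkno <= v_maxra) but has already been evicted,
	 * the window is halved to 2.  num_ra is then clamped to the
	 * window, so at most v_ralen blocks are read ahead.
	 */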
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}
		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			else if (num_ra > vp->v_ralen &&
			    lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
				    min(num_ra, vp->v_ralen << 1) : 1;
			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;

			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags | B_ASYNC;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags | B_ASYNC;
			rbp->b_blkno = blkno;
		}
		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}
	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);
	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}
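/*
 * Example of the v_maxra computation above: a readahead buffer with
 * b_lblkno == 8 and b_bufsize holding four 8K blocks (size == 8192)
 * yields v_maxra = 8 + 4 - 1 = 11, the last block covered by readahead.
 */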
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn, temp;
	int i, inc;
#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return (bp);
	}
	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);
	b_save = malloc(sizeof(struct buf *) * run +
	    sizeof(struct cluster_save), M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;
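	/*
	 * Layout of the allocation above (illustrative): the children
	 * pointer array lives immediately after the header in the same
	 * malloc'ed block,
	 *
	 *	[ struct cluster_save ][ bs_children[0] .. bs_children[run-1] ]
	 *
	 * which is why bs_children is simply (struct buf **)(b_save + 1).
	 */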
	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		/*
		 * A component of the cluster is already in core;
		 * terminate the cluster here.
		 */
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
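		/*
		 * Sketch of the shuffle below (illustrative): if tbp holds
		 * b_bufsize bytes of old memory at b_data, that memory is
		 * slid up by size bytes,
		 *
		 *	before: [ old memory ............ ]
		 *	after:  [ room for read data ][ old memory ]
		 *
		 * leaving the first size bytes free for the cluster's pages.
		 */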
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle an
				 * overlapping copy of more than one page).
				 */
				bdata += tbp->b_bufsize - size;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		VOP_BMAP(tbp->b_vp, tbp->b_lblkno, NULL, &temp, NULL);
		if (temp != bn) {
			printf("Block: %d Assigned address: %x Bmap address: %x\n",
			    tbp->b_lblkno, tbp->b_blkno, temp);
			panic("cluster_rbuild: wrong disk address");
		}
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return (bp);
}
/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return (bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return (bp);
}
/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		tbp->b_error = error;
		tbp->b_flags |= bp->b_flags & B_ERROR;
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize,
		    bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.  beginning of cluster - begin cluster
 *	3.  middle of a cluster - add to cluster
 *	4.  end of a cluster - asynchronously write cluster
 */
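/*
 * Example (illustrative): a process appending 8K blocks to a file hits
 * case 2 at the first write (begin a cluster at v_cstart), case 3 for
 * each following contiguous block (just delay the I/O with bdwrite),
 * and case 4 when lbn reaches v_cstart + v_clen, at which point the
 * whole run goes to disk as a single async transfer via cluster_wbuild.
 */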
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		    bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
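/*
 * For instance, the end-of-cluster call in cluster_write above,
 *
 *	cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
 *	    vp->v_clen + 1, lbn);
 *
 * writes the v_clen + 1 delayed blocks starting at v_cstart as one
 * transfer, with bp passed in as last_bp for the final block.
 */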
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}
	/*
	 * Extra memory in the buffer, punt on this buffer.
	 *
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len +
	    sizeof(struct cluster_save), M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;
	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;
		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}
		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}