* Copyright (c) 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
* This code is derived from software contributed to Berkeley by
* Berkeley Software Design Inc.
* %sccs.include.redist.c%
* @(#)vfs_bio.c 8.8 (Berkeley) %G%
#include <sys/resourcevar.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
/*
 * Buffer-cache hash table: maps a (vnode, logical block number) pair to
 * the chain that may hold a cached buffer for that block.
 * NOTE(review): the bare "*"-prefixed lines below are remnants of block
 * comments whose delimiters were lost in extraction; restore them before
 * compiling.
 */
* Definitions for the buffer hash lists.
/*
 * Hash the vnode pointer (scaled down by its size) plus the logical
 * block number, then mask with the power-of-two-minus-one "bufhash".
 */
#define BUFHASH(dvp, lbn) \
(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
/*
 * bufhashtbl: array of hash chains (allocated by hashinit, see bufinit);
 * invalhash: a dedicated chain for buffers with no valid identity.
 */
LIST_HEAD(bufhashhdr
, buf
) *bufhashtbl
, invalhash
;
* Insq/Remq for the buffer hash lists.
/* Insert/remove a buffer on a hash chain via its b_hash linkage. */
#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
#define bremhash(bp) LIST_REMOVE(bp, b_hash)
/*
 * Buffer free lists: four queues ordered by reclaim preference.
 * A buffer not currently busy lives on exactly one of these queues,
 * linked through b_freelist.
 */
* Definitions for the buffer free lists.
#define BQUEUES 4 /* number of free buffer queues */
#define BQ_LOCKED 0 /* super-blocks &c */
#define BQ_LRU 1 /* lru, useful buffers */
#define BQ_AGE 2 /* rubbish */
#define BQ_EMPTY 3 /* buffer headers with no memory */
/* One tail queue per class above. */
TAILQ_HEAD(bqueues
, buf
) bufqueues
[BQUEUES
];
* Insq/Remq for the buffer free lists.
/* Head-insert for soon-to-be-reused buffers, tail-insert for LRU order. */
#define binsheadfree(bp, dp) TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist)
/*
 * bremfree fragment: remove a buffer from whichever free list it is on.
 * NOTE(review): the enclosing function header and braces are missing from
 * this extraction, and logical lines are split mid-expression; reassemble
 * before compiling.
 */
struct bqueues
*dp
= NULL
;
* We only calculate the head of the freelist when removing
* the last element of the list as that is the only time that
* it is needed (e.g. to reset the tail pointer).
* NB: This makes an assumption about how tailq's are implemented.
/* bp is the last element iff its forward link is NULL. */
if (bp
->b_freelist
.tqe_next
== NULL
) {
/* Scan every queue for the one whose tail pointer refers to bp. */
for (dp
= bufqueues
; dp
< &bufqueues
[BQUEUES
]; dp
++)
if (dp
->tqh_last
== &bp
->b_freelist
.tqe_next
)
/*
 * NOTE(review): a "break" after the match appears to be missing from
 * this extraction; without it the sentinel test below misfires.
 */
/* If no queue claimed bp, its free-list linkage is corrupt. */
if (dp
== &bufqueues
[BQUEUES
])
panic("bremfree: lost tail");
/* Unlink bp; when bp is not last, dp may be stale but is unused by
 * TAILQ_REMOVE's forward unlinking — TODO confirm against queue(3). */
TAILQ_REMOVE(dp
, bp
, b_freelist
);
/*
 * bufinit fragment: set up the free queues, the hash table, and carve the
 * preallocated buffer memory among the nbuf headers.
 * NOTE(review): the function header, queue-init loop body, and several
 * surrounding statements are missing from this extraction.
 */
* Initialize buffers and hash links for buffers.
/* Presumably each queue gets TAILQ_INIT(dp) here — loop body missing. */
for (dp
= bufqueues
; dp
< &bufqueues
[BQUEUES
]; dp
++)
/* Allocate the hash-chain array; hashinit also fills in the mask. */
bufhashtbl
= hashinit(nbuf
, M_CACHE
, &bufhash
);
/* Leftover pages after dividing bufpages evenly among nbuf buffers. */
residual
= bufpages
% nbuf
;
for (i
= 0; i
< nbuf
; i
++) {
/* Start each header zeroed. */
bzero((char *)bp
, sizeof *bp
);
/* Not on any vnode's buffer list yet. */
bp
->b_vnbufs
.le_next
= NOLIST
;
/* Each header owns a MAXBSIZE window of the buffer arena. */
bp
->b_data
= buffers
+ i
* MAXBSIZE
;
/* First "residual" buffers get one extra page — condition missing. */
bp
->b_bufsize
= (base
+ 1) * CLBYTES
;
bp
->b_bufsize
= base
* CLBYTES
;
/* Buffers with memory go on AGE; memoryless headers go on EMPTY. */
dp
= bp
->b_bufsize
? &bufqueues
[BQ_AGE
] : &bufqueues
[BQ_EMPTY
];
/* No identity yet, so hash onto the invalid chain. */
binshash(bp
, &invalhash
);
/*
 * bread fragment: read a block into the cache, starting device I/O only
 * on a miss.
 * NOTE(review): two getblk call variants appear below (a dev/secsize form
 * and a vnode form) — this extraction seems to interleave two conditional
 * compilations of the same routine; K&R parameter declarations and braces
 * are missing.
 */
* Find the block in the buffer pool.
* If the buffer is not present, allocate a new buffer and load
* its contents according to the filesystem fill routine.
bread(vp
, blkno
, size
, cred
, bpp
)
struct proc
*p
= curproc
; /* XXX */
/* Device-addressed variant of the lookup — presumably #ifdef'd. */
bp
= getblk(dev
, blkno
, size
, secsize
);
/* Vnode-addressed lookup: returns the buffer locked (B_BUSY). */
*bpp
= bp
= getblk(vp
, blkno
, size
, 0, 0);
/* Cache hit: buffer already holds valid (done or dirty) data. */
if (bp
->b_flags
& (B_DONE
| B_DELWRI
)) {
trace(TR_BREADHIT
, pack(vp
, size
), blkno
);
/* Sanity: requested transfer must fit the allocated memory. */
if (bp
->b_bcount
> bp
->b_bufsize
)
/* Record the reading credential if none is held yet. */
if (bp
->b_rcred
== NOCRED
&& cred
!= NOCRED
) {
trace(TR_BREADMISS
, pack(vp
, size
), blkno
);
/* Charge the read to the current process's rusage. */
p
->p_stats
->p_ru
.ru_inblock
++; /* pay for read */
/*
 * breadn fragment: like bread, but additionally fires off asynchronous
 * read-ahead on "num" extra blocks given by rablkno[]/rabsize[].
 * NOTE(review): function braces, VOP_STRATEGY calls, and the read-ahead
 * in-core skip logic are incomplete in this extraction.
 */
* Operates like bread, but also starts I/O on the N specified
breadn(vp
, blkno
, size
, rablkno
, rabsize
, num
, cred
, bpp
)
daddr_t rablkno
[]; int rabsize
[];
struct proc
*p
= curproc
; /* XXX */
register struct buf
*bp
, *rabp
;
* If the block is not memory resident,
* allocate a buffer and start I/O.
if (!incore(vp
, blkno
)) {
*bpp
= bp
= getblk(vp
, blkno
, size
, 0, 0);
/* Miss: buffer has no valid data, so a real read must be issued. */
if ((bp
->b_flags
& (B_DONE
| B_DELWRI
)) == 0) {
if (bp
->b_bcount
> bp
->b_bufsize
)
/* Record the reading credential if none held yet. */
if (bp
->b_rcred
== NOCRED
&& cred
!= NOCRED
) {
trace(TR_BREADMISS
, pack(vp
, size
), blkno
);
p
->p_stats
->p_ru
.ru_inblock
++; /* pay for read */
trace(TR_BREADHIT
, pack(vp
, size
), blkno
);
* If there's read-ahead block(s), start I/O
* on them also (as above).
for (i
= 0; i
< num
; i
++) {
/* Already cached read-ahead blocks are skipped — "continue"
 * presumably follows; body missing from extraction. */
if (incore(vp
, rablkno
[i
]))
rabp
= getblk(vp
, rablkno
[i
], rabsize
[i
], 0, 0);
/* Read-ahead hit: just release it (brelse presumably follows). */
if (rabp
->b_flags
& (B_DONE
| B_DELWRI
)) {
trace(TR_BREADHITRA
, pack(vp
, rabsize
[i
]), rablkno
[i
]);
/* Miss: mark async read and start I/O; completion releases it. */
rabp
->b_flags
|= B_ASYNC
| B_READ
;
if (rabp
->b_bcount
> rabp
->b_bufsize
)
if (rabp
->b_rcred
== NOCRED
&& cred
!= NOCRED
) {
trace(TR_BREADMISSRA
, pack(vp
, rabsize
[i
]), rablkno
[i
]);
p
->p_stats
->p_ru
.ru_inblock
++; /* pay in advance */
* If block was memory resident, let bread get it.
* If block was not memory resident, the read was
* started above, so just wait for the read to complete.
/* Device-addressed fallthrough variant — presumably #ifdef'd. */
return (bread(dev
, blkno
, size
, secsize
));
return (bread(vp
, blkno
, size
, cred
, bpp
));
/*
 * bwrite fragment: write a buffer's contents to backing store.
 * Synchronous unless B_ASYNC/B_DELWRI were set; on async mounts a
 * delayed write may be converted (conversion body missing here).
 * NOTE(review): function header, VOP_STRATEGY, biowait, and brelse calls
 * are missing from this extraction.
 */
* Release buffer on completion.
struct proc
*p
= curproc
; /* XXX */
/* On an async-mounted filesystem, a sync write may be demoted —
 * the body of this conversion is missing from the extraction. */
if ((bp
->b_flags
& B_ASYNC
) == 0 &&
bp
->b_vp
&& (bp
->b_vp
->v_mount
->mnt_flag
& MNT_ASYNC
)) {
/* Snapshot-and-clear of state flags; prior flags presumably saved
 * in "flag" first — assignment missing from extraction. */
bp
->b_flags
&= ~(B_READ
| B_DONE
| B_ERROR
| B_DELWRI
);
/* First write of this data: charge the process now. */
if ((flag
& B_DELWRI
) == 0)
p
->p_stats
->p_ru
.ru_oublock
++; /* no one paid yet */
/* Buffer was dirty; move it to the vnode's clean list. */
reassignbuf(bp
, bp
->b_vp
);
trace(TR_BWRITE
, pack(bp
->b_vp
, bp
->b_bcount
), bp
->b_lblkno
);
if (bp
->b_bcount
> bp
->b_bufsize
)
bp
->b_flags
|= B_WRITEINPROG
;
* If the write was synchronous, then await I/O completion.
* If the write was "delayed", then we put the buffer on
* the queue of blocks awaiting I/O completion status.
if ((flag
& B_ASYNC
) == 0) {
if ((flag
&B_DELWRI
) == 0)
p
->p_stats
->p_ru
.ru_oublock
++; /* no one paid yet */
reassignbuf(bp
, bp
->b_vp
);
/* Interrupted (e.g. NFS intr mount): report EINTR — body missing. */
if (bp
->b_flags
& B_EINTR
) {
} else if (flag
& B_DELWRI
) {
/*
 * vn_bwrite fragment: default VOP_BWRITE implementation — simply
 * forwards the buffer from the vnode-op argument struct to bwrite().
 * NOTE(review): function header and braces missing from this extraction.
 */
struct vop_bwrite_args
*ap
;
return (bwrite(ap
->a_bp
));
/*
 * bdwrite fragment: delayed write — mark the buffer dirty and release it
 * without starting I/O, except for tape devices which must write in order.
 * NOTE(review): function header, the bawrite fallback for tapes, and the
 * final brelse are missing from this extraction.
 */
* The buffer is marked dirty, but is not queued for I/O.
* This routine should be used when the buffer is expected
* to be modified again soon, typically a small write that
* partially fills a buffer.
* NB: magnetic tapes cannot be delayed; they must be
* written in the order that the writes are requested.
struct proc
*p
= curproc
; /* XXX */
/* First time this buffer goes dirty: account and reassign. */
if ((bp
->b_flags
& B_DELWRI
) == 0) {
/* Move buffer onto the vnode's dirty list. */
reassignbuf(bp
, bp
->b_vp
);
p
->p_stats
->p_ru
.ru_oublock
++; /* no one paid yet */
* If this is a tape drive, the write must be initiated.
/* Tape devices cannot reorder; presumably falls through to bawrite. */
if (bdevsw
[major(bp
->b_dev
)].d_flags
& B_TAPE
)
/* Otherwise mark valid-and-dirty and (presumably) brelse it. */
bp
->b_flags
|= (B_DONE
| B_DELWRI
);
/*
 * bawrite/brelse fragment. The first comment lines describe bawrite
 * (asynchronous write); the code below is brelse: release a buffer back
 * to the appropriate free list, invalidating it on error/no-cache.
 * NOTE(review): function headers, splbio protection, wakeups on bp
 * itself, and the final queue insertion calls are missing from this
 * extraction.
 */
* Start I/O on a buffer, but do not wait for it to complete.
* The buffer is released when the I/O completes.
* Setting the ASYNC flag causes bwrite to return
* after starting the I/O.
* Even if the buffer is dirty, no I/O is started.
register struct bqueues
*flist
;
trace(TR_BRELSE
, pack(bp
->b_vp
, bp
->b_bufsize
), bp
->b_lblkno
);
* If a process is waiting for the buffer, or
* is waiting for a free buffer, awaken it.
if (bp
->b_flags
& B_WANTED
)
/* Wake any process sleeping for a free buffer header. */
wakeup((caddr_t
)&needbuffer
);
* Retry I/O for locked buffers rather than invalidating them.
if ((bp
->b_flags
& B_ERROR
) && (bp
->b_flags
& B_LOCKED
))
* Disassociate buffers that are no longer valid.
/* Presumably brelvp(bp) here — call missing from extraction. */
if (bp
->b_flags
& (B_NOCACHE
| B_ERROR
))
/* Invalid or errored buffers lose their identity and dirty state. */
if ((bp
->b_bufsize
<= 0) || (bp
->b_flags
& (B_ERROR
| B_INVAL
))) {
bp
->b_flags
&= ~B_DELWRI
;
* Stick the buffer back on a free list.
if (bp
->b_bufsize
<= 0) {
/* block has no buffer ... put at front of unused buffer list */
flist
= &bufqueues
[BQ_EMPTY
];
} else if (bp
->b_flags
& (B_ERROR
| B_INVAL
)) {
/* block has no info ... put at front of most free list */
flist
= &bufqueues
[BQ_AGE
];
/* Valid buffers are classified LOCKED / AGE / LRU by flags. */
if (bp
->b_flags
& B_LOCKED
)
flist
= &bufqueues
[BQ_LOCKED
];
else if (bp
->b_flags
& B_AGE
)
flist
= &bufqueues
[BQ_AGE
];
flist
= &bufqueues
[BQ_LRU
];
/* Clear transient flags before the buffer goes back on a queue. */
bp
->b_flags
&= ~(B_WANTED
| B_BUSY
| B_ASYNC
| B_AGE
| B_NOCACHE
);
/*
 * incore fragment: walk the hash chain for (vp, blkno) and report whether
 * a valid cached buffer exists; presumably returns the buffer or NULL —
 * return statements missing from this extraction.
 */
* Check to see if a block is currently memory resident.
for (bp
= BUFHASH(vp
, blkno
)->lh_first
; bp
; bp
= bp
->b_hash
.le_next
)
/* Match on identity and skip buffers marked invalid. */
if (bp
->b_lblkno
== blkno
&& bp
->b_vp
== vp
&&
(bp
->b_flags
& B_INVAL
) == 0)
/*
 * getblk fragment: find or create a locked buffer for (vp, blkno).
 * Two signatures appear — a dev/secsize form and the vnode form; the
 * extraction appears to interleave two conditional compilations.
 * NOTE(review): the "loop:" label, bremfree/SET(B_BUSY) on hit, allocbuf
 * on size change, and the final return are missing from this extraction.
 */
* Check to see if a block is currently memory resident.
* If it is resident, return it. If it is not resident,
* allocate a new buffer and assign it to the block.
getblk(dev
, blkno
, size
, secsize
)
getblk(vp
, blkno
, size
, slpflag
, slptimeo
)
register struct vnode
*vp
;
int size
, slpflag
, slptimeo
;
/* Requests above the per-buffer arena window cannot be honored. */
panic("getblk: size too big");
* Search the cache for the block. If the buffer is found,
* but it is currently locked, the we must wait for it to
/* dp presumably holds BUFHASH(vp, blkno) — assignment missing. */
for (bp
= dp
->lh_first
; bp
; bp
= bp
->b_hash
.le_next
) {
if (bp
->b_lblkno
!= blkno
|| bp
->b_vp
!= vp
)
/* Buffer busy: sleep until released, honoring slpflag/slptimeo. */
if (bp
->b_flags
& B_BUSY
) {
error
= tsleep((caddr_t
)bp
, slpflag
| (PRIBIO
+ 1),
* The test for B_INVAL is moved down here, since there
* are cases where B_INVAL is set before VOP_BWRITE() is
* called and for NFS, the process cannot be allowed to
* allocate a new buffer for the same block until the write
* back to the server has been completed. (ie. B_BUSY clears)
if (bp
->b_flags
& B_INVAL
) {
/* Caller asked for a different size than what is cached. */
if (bp
->b_bcount
!= size
) {
printf("getblk: stray size");
* The loop back to the top when getnewbuf() fails is because
* stateless filesystems like NFS have no node locks. Thus,
* there is a slight chance that more than one process will
* try and getnewbuf() for the same block concurrently when
* the first sleeps in getnewbuf(). So after a sleep, go back
* up to the top to check the hash lists again.
if ((bp
= getnewbuf(slpflag
, slptimeo
)) == 0)
/*
 * geteblk fragment: allocate an anonymous buffer of the given size for
 * the caller to attach to a block; loops until getnewbuf succeeds.
 * NOTE(review): function header, allocbuf call, and return are missing
 * from this extraction.
 */
* The caller will assign it to a block.
panic("geteblk: size too big");
while ((bp
= getnewbuf(0, 0)) == NULL
)
/* No identity, so it hashes to the invalid chain. */
binshash(bp
, &invalhash
);
bp
->b_blksize
= DEV_BSIZE
;
/*
 * allocbuf fragment: grow or shrink the memory attached to buffer "tp"
 * to roundup(size, CLBYTES), stealing or donating pages via pagemove()
 * against buffers taken from the free lists.
 * NOTE(review): function header, "out:" epilogue setting b_bcount, and
 * several queue insertions are missing from this extraction.
 */
register struct buf
*bp
, *ep
;
/* Work in click-rounded units; partial pages cannot be moved. */
sizealloc
= roundup(size
, CLBYTES
);
* Buffer size does not change
if (sizealloc
== tp
->b_bufsize
)
* Buffer size is shrinking.
* Place excess space in a buffer header taken from the
* BQ_EMPTY buffer list and placed on the "most free" list.
* If no extra buffer headers are available, leave the
* extra space in the present buffer.
if (sizealloc
< tp
->b_bufsize
) {
/* Need a spare header to park the excess pages on. */
if ((ep
= bufqueues
[BQ_EMPTY
].tqh_first
) == NULL
)
/* Shift the tail of tp's memory into ep. */
pagemove((char *)tp
->b_data
+ sizealloc
, ep
->b_data
,
(int)tp
->b_bufsize
- sizealloc
);
ep
->b_bufsize
= tp
->b_bufsize
- sizealloc
;
tp
->b_bufsize
= sizealloc
;
* More buffer space is needed. Get it out of buffers on
* the "most free" list, placing the empty headers on the
* BQ_EMPTY buffer header list.
while (tp
->b_bufsize
< sizealloc
) {
take
= sizealloc
- tp
->b_bufsize
;
/* Victim buffer supplies up to "take" bytes of memory. */
while ((bp
= getnewbuf(0, 0)) == NULL
)
/* Cap the steal at what the victim actually has. */
if (take
>= bp
->b_bufsize
)
/* Move the victim's tail pages onto the end of tp's memory. */
pagemove(&((char *)bp
->b_data
)[bp
->b_bufsize
- take
],
&((char *)tp
->b_data
)[tp
->b_bufsize
], take
);
bp
->b_bufsize
= bp
->b_bufsize
- take
;
/* Keep the victim's byte count within its shrunken memory. */
if (bp
->b_bcount
> bp
->b_bufsize
)
bp
->b_bcount
= bp
->b_bufsize
;
/* Fully drained victims become headers on the invalid chain. */
if (bp
->b_bufsize
<= 0) {
binshash(bp
, &invalhash
);
/*
 * getnewbuf fragment: claim a free buffer header, preferring BQ_AGE then
 * BQ_LRU; sleeps on "needbuffer" when every queue is empty, and pushes
 * out delayed-write buffers it encounters.
 * NOTE(review): function header, splbio bracketing, the bawrite of a
 * chosen dirty buffer, credential releases (crfree), and the return are
 * missing from this extraction. The qe_next accessors below also differ
 * from the tqe_next naming used elsewhere in this file — TODO confirm
 * which queue-macro generation this compiled against.
 */
* Find a buffer which is available for use.
* Select something from a free list.
* Preference is to AGE list, then LRU list.
getnewbuf(slpflag
, slptimeo
)
register struct bqueues
*dp
;
register struct ucred
*cred
;
/* Scan AGE first, then LRU; BQ_LOCKED and BQ_EMPTY are never raided. */
for (dp
= &bufqueues
[BQ_AGE
]; dp
> bufqueues
; dp
--) {
for (bp
= dp
->qe_next
; bp
; bp
= bp
->b_freelist
.qe_next
) {
/* Skip dirty buffers whose vnode is locked (can't flush now). */
if ((bp
->b_flags
& B_DELWRI
) &&
bp
->b_vp
&& VOP_ISLOCKED(bp
->b_vp
))
/* Exhausted every queue: nothing usable is free. */
if (dp
== bufqueues
) { /* no free blocks */
vprint("skipping blkno check", bp
->b_vp
);
printf("\tlblkno %d, blkno %d\n",
bp
->b_lblkno
, bp
->b_blkno
);
/* Sleep until brelse wakes needbuffer, then caller retries. */
(void) tsleep((caddr_t
)&needbuffer
, slpflag
| (PRIBIO
+ 1),
/* Chosen buffer is dirty: it must be written before reuse. */
if (bp
->b_flags
& B_DELWRI
) {
trace(TR_BRELSE
, pack(bp
->b_vp
, bp
->b_bufsize
), bp
->b_lblkno
);
/* Drop cached credentials — release calls missing from extraction. */
if (bp
->b_rcred
!= NOCRED
) {
if (bp
->b_wcred
!= NOCRED
) {
/* Reset valid/dirty byte ranges for the new owner. */
bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
bp
->b_validoff
= bp
->b_validend
= 0;
/*
 * biowait fragment: sleep until the buffer's I/O completes (B_DONE set),
 * then report any error.
 * NOTE(review): function header, splbio bracketing, and the errno
 * selection logic are missing from this extraction.
 */
* Wait for I/O to complete.
* Extract and return any errors associated with the I/O.
* If the error flag is set, but no specific error is
while ((bp
->b_flags
& B_DONE
) == 0)
/* biodone() wakes sleepers on the buffer address. */
sleep((caddr_t
)bp
, PRIBIO
);
/* Success path — presumably "return (0);" follows. */
if ((bp
->b_flags
& B_ERROR
) == 0)
/*
 * biodone fragment: mark a buffer's I/O complete; invoke the registered
 * callback (B_CALL), release async buffers, or wake synchronous waiters.
 * NOTE(review): the B_DONE set, vwakeup for writes, b_iodone invocation,
 * brelse, and wakeup(bp) are missing from this extraction.
 */
* Mark I/O complete on a buffer.
* If a callback has been requested, e.g. the pageout
* daemon, do so. Otherwise, awaken waiting processes.
/* Completing twice indicates a driver bug — panic presumably follows. */
if (bp
->b_flags
& B_DONE
)
/* Writes decrement the vnode's pending-output count (vwakeup). */
if ((bp
->b_flags
& B_READ
) == 0)
/* Callback-style completion, e.g. pageout daemon. */
if (bp
->b_flags
& B_CALL
) {
/* Async: nobody is waiting, release the buffer directly. */
if (bp
->b_flags
& B_ASYNC
)
/* Sync: clear WANTED and (presumably) wakeup(bp). */
bp
->b_flags
&= ~B_WANTED
;
/*
 * count_lock_queue fragment: count buffers on the BQ_LOCKED free list.
 * NOTE(review): function header, the "++ret" loop body, and return are
 * missing from this extraction.
 */
for (ret
= 0, bp
= (struct buf
*)bufqueues
[BQ_LOCKED
].tqh_first
;
bp
; bp
= (struct buf
*)bp
->b_freelist
.tqe_next
)
/*
 * vfs_bufstats fragment: print, per free queue, the number of buffers
 * bucketed by allocated size in CLBYTES units.
 * NOTE(review): function header, the counts[] zeroing loop body, the
 * per-buffer count increment context, and the trailing newline printf
 * are incomplete in this extraction. The "count" printed at the total
 * also appears elsewhere undeclared — TODO confirm it is accumulated in
 * the missing loop body.
 */
* Print out statistics on the current allocation of the buffer pool.
* Can be enabled to print out on every ``sync'' by setting "syncprt"
* in vfs_syscalls.c using sysctl.
register struct bqueues
*dp
;
/* One bucket per possible buffer size, 0..MAXBSIZE in CLBYTES steps. */
int counts
[MAXBSIZE
/CLBYTES
+1];
static char *bname
[BQUEUES
] = { "LOCKED", "LRU", "AGE", "EMPTY" };
for (dp
= bufqueues
, i
= 0; dp
< &bufqueues
[BQUEUES
]; dp
++, i
++) {
/* Presumably zeroes counts[] here — loop body missing. */
for (j
= 0; j
<= MAXBSIZE
/CLBYTES
; j
++)
/* Tally each buffer on this queue into its size bucket. */
for (bp
= dp
->tqh_first
; bp
; bp
= bp
->b_freelist
.tqe_next
) {
counts
[bp
->b_bufsize
/CLBYTES
]++;
printf("%s: total-%d", bname
[i
], count
);
/* Emit only non-empty buckets — the guard appears to be missing. */
for (j
= 0; j
<= MAXBSIZE
/CLBYTES
; j
++)
printf(", %d-%d", j
* CLBYTES
, counts
[j
]);