/*
 * Copyright (c) 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)nfs_bio.c	7.37 (Berkeley) %G%
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>

#include <nfs/nfsv2.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nqnfs.h>
struct buf *incore(), *nfs_getcacheblk();
extern struct queue_entry nfs_bufq;
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff;
	struct buf *bp, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp;
	daddr_t lbn, bn, rabn;
	caddr_t baddr;
	int got_buf, len, nra, error = 0, n, on, not_readin;
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0 && vp->v_type != VDIR)
		return (EINVAL);
	p = uio->uio_procp;
	nmp = VFSTONFS(vp->v_mount);
	biosize = nmp->nm_rsize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * The mount flag NFSMNT_MYWRITE says "Assume that my writes are
	 * the ones changing the modify time".
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if ((nmp->nm_flag & NFSMNT_MYWRITE) == 0 ||
			    vp->v_type != VREG) {
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
			}
			np->n_attrstamp = 0;
			if (error = VOP_GETATTR(vp, &vattr, cred, p))
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			if (error = VOP_GETATTR(vp, &vattr, cred, p))
				return (error);
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, NQL_READ)) {
		    do {
			error = nqnfs_getlease(vp, NQL_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR) {
			    np->n_direofoffset = 0;
			    cache_purge(vp);
			}
			if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    np->n_direofoffset = 0;
		    cache_purge(vp);
		    if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			error = nfs_readrpc(vp, uio, cred);
			break;
		case VLNK:
			error = nfs_readlinkrpc(vp, uio, cred);
			break;
		case VDIR:
			error = nfs_readdirrpc(vp, uio, cred);
			break;
		};
		return (error);
	    }
	    baddr = (caddr_t)0;
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		bn = lbn * (biosize / DEV_BSIZE);
		not_readin = 1;
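
		/*
		 * Worked example of the mapping above, assuming biosize is
		 * 8192 and DEV_BSIZE is 512 (illustrative values only):
		 * a read at uio_offset 20000 gives lbn = 20000 / 8192 = 2,
		 * on = 20000 & 8191 = 3616, and bn = 2 * (8192 / 512) = 32,
		 * i.e. byte 3616 within the third 8K cache block, with bn
		 * expressed in 512-byte device block units.
		 */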

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    lbn == vp->v_lastr + 1) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			 (lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL;
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
		if ((bp = incore(vp, bn)) &&
		    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
		    (B_BUSY | B_WRITEINPROG))
			got_buf = 0;
		else {
again:
			bp = nfs_getcacheblk(vp, bn, biosize, p);
			if (!bp)
				return (EINTR);
			got_buf = 1;
			if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
				bp->b_flags |= B_READ;
				not_readin = 0;
				if (error = nfs_doio(bp, cred, p)) {
					brelse(bp);
					return (error);
				}
			}
		}
		n = min((unsigned)(biosize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				if (!got_buf) {
					bp = nfs_getcacheblk(vp, bn, biosize, p);
					if (!bp)
						return (EINTR);
					got_buf = 1;
				}
				if (bp->b_dirtyend > 0) {
					if ((bp->b_flags & B_DELWRI) == 0)
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (error = nfs_doio(bp, cred, p)) {
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		got_buf = 1;
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		bn = (daddr_t)uio->uio_offset;
		bp = nfs_getcacheblk(vp, bn, NFS_DIRBLKSIZ, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (error = nfs_doio(bp, cred, p)) {
				brelse(bp);
				return (error);
			}
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		rabn = bp->b_blkno;
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    rabn != 0 && rabn != np->n_direofoffset &&
		    !incore(vp, rabn)) {
			rabp = nfs_getcacheblk(vp, rabn, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				if (nfs_asyncio(rabp, cred)) {
					rabp->b_flags |= B_INVAL;
					brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		}
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid);
		got_buf = 1;
		on = 0;
		break;
	    };
	    if (n > 0) {
		if (!baddr)
			baddr = bp->b_un.b_addr;
		error = uiomove(baddr + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		if (n + on == biosize || uio->uio_offset == np->n_size)
			bp->b_flags |= B_AGE;
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		uio->uio_offset = bp->b_blkno;
		break;
	    };
	    if (got_buf)
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
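
/*
 * For context: nfs_bioread() is entered from the read vnode op.  A
 * minimal sketch of such a wrapper follows; the real one lives in
 * nfs_vnops.c, and the vop_read_args field names here are assumed from
 * the usual convention rather than copied from this tree.
 */
#ifdef notdef
nfs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (nfs_bioread(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
}
#endif /* notdef */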

/*
 * Vnode op for write using bio
 */
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp;
	daddr_t lbn, bn;
	int n, on, error = 0;
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			if (error = VOP_GETATTR(vp, &vattr, cred, p))
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	nmp = VFSTONFS(vp->v_mount);
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * I use nm_rsize, not nm_wsize, so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * Check for a valid write lease.
		 * If non-cachable, just do the rpc.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, NQL_WRITE)) {
			do {
				error = nqnfs_getlease(vp, NQL_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if (np->n_flag & NQNFSNONCACHE)
			return (nfs_writerpc(vp, uio, cred, ioflag));
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for a valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, NQL_WRITE)) {
			do {
				error = nqnfs_getlease(vp, NQL_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		if (error = uiomove(bp->b_un.b_addr + on, n, uio)) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
#ifndef notdef
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
#else
		bp->b_validoff = bp->b_dirtyoff;
		bp->b_validend = bp->b_dirtyend;
#endif
		if (ioflag & IO_APPEND)
			bp->b_flags |= B_APPENDWRITE;

		/*
		 * If the lease is non-cachable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			if (error = VOP_BWRITE(bp))
				return (error);
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bawrite(bp);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}
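
/*
 * Usage sketch for the convention above (it mirrors the callers in this
 * file and is not additional live code): a NULL return means the wait
 * for a buffer was interrupted on an interruptible mount, and the
 * caller backs out with EINTR.
 */
#ifdef notdef
	bp = nfs_getcacheblk(vp, bn, biosize, p);
	if (!bp)
		return (EINTR);
#endif /* notdef */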

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;

	if (nfs_numasync == 0)
		return (EIO);
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
	    if (nfs_iodwant[i]) {
		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}
		queue_enter_tail(&nfs_bufq, bp, struct buf *, b_freelist);
		nfs_iodwant[i] = (struct proc *)0;
		wakeup((caddr_t)&nfs_iodwant[i]);
		return (0);
	    }
	return (EIO);
}
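
/*
 * For reference, the consumer side of this handshake lives in the nfsiod
 * daemons (nfs_syscalls.c), not in this file.  A minimal sketch follows;
 * the variable names (myiod, bp, p), the queue_remove_first() macro, and
 * the empty-queue test are assumptions for illustration, not code from
 * this tree.  Each idle daemon parks itself in nfs_iodwant[] and sleeps;
 * nfs_asyncio() queues the buffer and wakes it; the daemon then performs
 * the I/O via nfs_doio().
 */
#ifdef notdef
	for (;;) {
		nfs_iodwant[myiod] = p;
		(void) tsleep((caddr_t)&nfs_iodwant[myiod],
			PWAIT | PCATCH, "nfsidl", 0);
		while (nfs_bufq.qe_next != 0) {
			/* dequeue the head buffer queued by nfs_asyncio() */
			queue_remove_first(&nfs_bufq, bp, struct buf *,
				b_freelist);
			(void) nfs_doio(bp, bp->b_rcred, (struct proc *)0);
		}
	}
#endif /* notdef */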

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS)
		panic("doio phys");
	if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_un.b_addr;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = bp->b_blkno * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (bp->b_blkno * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero(bp->b_un.b_addr + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
		}
		break;
	    case VLNK:
		uiop->uio_offset = 0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		uiop->uio_offset = bp->b_lblkno;
		nfsstats.readdir_bios++;
		if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS)
			error = nfs_readdirlookrpc(vp, uiop, cr);
		else
			error = nfs_readdirrpc(vp, uiop, cr);

		/*
		 * Save offset cookie in b_blkno.
		 */
		bp->b_blkno = uiop->uio_offset;
		break;
	    };
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    io.iov_len = uiop->uio_resid = bp->b_dirtyend
		- bp->b_dirtyoff;
	    uiop->uio_offset = (bp->b_blkno * DEV_BSIZE)
		+ bp->b_dirtyoff;
	    io.iov_base = bp->b_un.b_addr + bp->b_dirtyoff;
	    uiop->uio_rw = UIO_WRITE;
	    nfsstats.write_bios++;
	    if (bp->b_flags & B_APPENDWRITE)
		error = nfs_writerpc(vp, uiop, cr, IO_APPEND);
	    else
		error = nfs_writerpc(vp, uiop, cr, 0);
	    bp->b_flags &= ~(B_WRITEINPROG | B_APPENDWRITE);

	    /*
	     * For an interrupted write, the buffer is still valid and the
	     * write hasn't been pushed to the server yet, so we can't set
	     * B_ERROR and report the interruption by setting B_EINTR. For
	     * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt
	     * is essentially a noop.
	     * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
	     * buffer to the clean list, we have to reassign it back to the
	     * dirty one. Ugh.
	     */
	    if (error == EINTR) {
		bp->b_flags &= ~B_INVAL;
		bp->b_flags |= B_DELWRI;
		if (bp->b_flags & B_ASYNC)
			reassignbuf(bp, vp);
		else
			bp->b_flags |= B_EINTR;
	    } else {
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = np->n_error = error;
			np->n_flag |= NWRITEERR;
		}
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	    }
	}
	bp->b_resid = uiop->uio_resid;
	biodone(bp);
	return (error);
}