* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991 The Regents of the University of California.
* Copyright (c) 1993 John S. Dyson
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
* $Id: vnode_pager.c,v 1.10 1994/01/31 04:22:01 davidg Exp $
* Page to/from files (vnodes).
* fix credential use (uses current process credentials now)
* John S. Dyson 08 Dec 93
* This file, in conjunction with some vm_fault mods, eliminates the performance
* advantage of using the buffer cache and minimizes memory copies.
* 1) Supports multi-block reads
* 2) Bypasses buffer cache for reads
* Still to be done:
* 1) Totally bypass buffer cache for reads
* (Currently will still sometimes use buffer cache for reads)
* 2) Bypass buffer cache for writes
* (Code does not support it, but mods are simple)
struct pagerops vnodepagerops
= {
static int vnode_pager_io(vn_pager_t vnp
, vm_page_t
*m
, int count
, int reqpage
,
void relpbuf(struct buf
*bp
) ;
extern vm_map_t pager_map
;
queue_head_t vnode_pager_list
; /* list of managed vnodes */
if (vpagerdebug
& VDB_FOLLOW
)
printf("vnode_pager_init()\n");
queue_init(&vnode_pager_list
);
* Allocate (or lookup) pager for a vnode.
* Handle is a vnode pointer.
vnode_pager_alloc(handle
, size
, prot
, offset
)
register vm_pager_t pager
;
struct proc
*p
= curproc
; /* XXX */
if (vpagerdebug
& (VDB_FOLLOW
|VDB_ALLOC
))
printf("vnode_pager_alloc(%x, %x, %x)\n", handle
, size
, prot
);
* Pageout to vnode, no can do yet.
* Vnodes keep a pointer to any associated pager so no need to
* lookup with vm_pager_lookup.
vp
= (struct vnode
*)handle
;
pager
= (vm_pager_t
)vp
->v_vmdata
;
* Allocate pager structures
pager
= (vm_pager_t
)malloc(sizeof *pager
, M_VMPAGER
, M_WAITOK
);
vnp
= (vn_pager_t
)malloc(sizeof *vnp
, M_VMPGDATA
, M_WAITOK
);
free((caddr_t
)pager
, M_VMPAGER
);
* And an object of the appropriate size
if (VOP_GETATTR(vp
, &vattr
, p
->p_ucred
, p
) == 0) {
object
= vm_object_allocate(round_page(vattr
.va_size
));
vm_object_enter(object
, pager
);
vm_object_setpager(object
, pager
, 0, TRUE
);
free((caddr_t
)vnp
, M_VMPGDATA
);
free((caddr_t
)pager
, M_VMPAGER
);
* Hold a reference to the vnode and initialize pager data.
vnp
->vnp_size
= vattr
.va_size
;
queue_enter(&vnode_pager_list
, pager
, vm_pager_t
, pg_list
);
pager
->pg_handle
= handle
;
pager
->pg_type
= PG_VNODE
;
pager
->pg_ops
= &vnodepagerops
;
pager
->pg_data
= (caddr_t
)vnp
;
vp
->v_vmdata
= (caddr_t
)pager
;
* vm_object_lookup() will remove the object from the
* cache if found and also gain a reference to the object.
object
= vm_object_lookup(pager
);
vnp
= (vn_pager_t
)pager
->pg_data
;
if (vpagerdebug
& VDB_ALLOC
)
printf("vnode_pager_setup: vp %x sz %x pager %x object %x\n",
vp
, vnp
->vnp_size
, pager
, object
);
vnode_pager_dealloc(pager
)
register vn_pager_t vnp
= (vn_pager_t
)pager
->pg_data
;
register struct vnode
*vp
;
struct proc
*p
= curproc
; /* XXX */
if (vpagerdebug
& VDB_FOLLOW
)
printf("vnode_pager_dealloc(%x)\n", pager
);
/* can hang if done at reboot on NFS FS */
(void) VOP_FSYNC(vp
, p
->p_ucred
, p
);
queue_remove(&vnode_pager_list
, pager
, vm_pager_t
, pg_list
);
free((caddr_t
)vnp
, M_VMPGDATA
);
free((caddr_t
)pager
, M_VMPAGER
);
vnode_pager_getmulti(pager
, m
, count
, reqpage
, sync
)
return vnode_pager_io((vn_pager_t
) pager
->pg_data
, m
, count
, reqpage
, UIO_READ
);
vnode_pager_getpage(pager
, m
, sync
)
if (vpagerdebug
& VDB_FOLLOW
)
printf("vnode_pager_getpage(%x, %x)\n", pager
, m
);
return vnode_pager_io((vn_pager_t
)pager
->pg_data
, marray
, 1, 0, UIO_READ
);
vnode_pager_putpage(pager
, m
, sync
)
if (vpagerdebug
& VDB_FOLLOW
)
printf("vnode_pager_putpage(%x, %x)\n", pager
, m
);
err
= vnode_pager_io((vn_pager_t
)pager
->pg_data
, marray
, 1, 0, UIO_WRITE
);
vnode_pager_haspage(pager
, offset
)
register vn_pager_t vnp
= (vn_pager_t
)pager
->pg_data
;
if (vpagerdebug
& VDB_FOLLOW
)
printf("vnode_pager_haspage(%x, %x)\n", pager
, offset
);
* Offset beyond end of file, do not have the page
if (offset
>= vnp
->vnp_size
) {
if (vpagerdebug
& (VDB_FAIL
|VDB_SIZE
))
printf("vnode_pager_haspage: pg %x, off %x, size %x\n",
pager
, offset
, vnp
->vnp_size
);
* Read the index to find the disk block to read
* from. If there is no block, report that we do not have the page.
* Assumes that the vnode has a whole page or nothing.
err
= VOP_BMAP(vnp
->vnp_vp
,
offset
/ vnp
->vnp_vp
->v_mount
->mnt_stat
.f_bsize
,
(struct vnode
**)0, &bn
);
if (vpagerdebug
& VDB_FAIL
)
printf("vnode_pager_haspage: BMAP err %d, pg %x, off %x\n",
return((long)bn
< 0 ? FALSE
: TRUE
);
* Lets the VM system know about a change in size for a file.
* If this vnode is mapped into some address space (i.e. we have a pager
* for it) we adjust our own internal size and flush any cached pages in
* the associated object that are affected by the size change.
* Note: this routine may be invoked as a result of a pager put
* operation (possibly at object termination time), so we must be careful.
vnode_pager_setsize(vp
, nsize
)
register vm_object_t object
;
if (vp
== NULL
|| vp
->v_type
!= VREG
|| vp
->v_vmdata
== NULL
)
pager
= (vm_pager_t
)vp
->v_vmdata
;
vnp
= (vn_pager_t
)pager
->pg_data
;
if (nsize
== vnp
->vnp_size
)
* This can happen during object termination since
* vm_object_page_clean is called after the object
* has been removed from the hash table, and clean
* may cause vnode write operations which can wind up back in this routine.
object
= vm_object_lookup(pager
);
if (vpagerdebug
& (VDB_FOLLOW
|VDB_SIZE
))
printf("vnode_pager_setsize: vp %x obj %x osz %d nsz %d\n",
vp
, object
, vnp
->vnp_size
, nsize
);
* Toss any cached pages beyond the new EOF.
nsize
= round_page(nsize
);
if (nsize
< vnp
->vnp_size
) {
vm_object_page_remove(object
,
(vm_offset_t
)nsize
, vnp
->vnp_size
);
vm_object_unlock(object
);
vnp
->vnp_size
= (vm_offset_t
)nsize
;
vm_object_deallocate(object
);
register struct mount
*mp
;
register vm_pager_t pager
, npager
;
pager
= (vm_pager_t
) queue_first(&vnode_pager_list
);
while (!queue_end(&vnode_pager_list
, (queue_entry_t
)pager
)) {
* Save the next pointer now since uncaching may
* terminate the object and render pager invalid
vp
= ((vn_pager_t
)pager
->pg_data
)->vnp_vp
;
npager
= (vm_pager_t
) queue_next(&pager
->pg_list
);
if (mp
== (struct mount
*)0 || vp
->v_mount
== mp
)
(void) vnode_pager_uncache(vp
);
* Remove vnode associated object from the object cache.
* Note: this routine may be invoked as a result of a pager put
* operation (possibly at object termination time), so we must be careful.
register struct vnode
*vp
;
register vm_object_t object
;
boolean_t uncached
, locked
;
pager
= (vm_pager_t
)vp
->v_vmdata
;
* Unlock the vnode if it is currently locked.
* We do this since uncaching the object may result
* in its destruction which may initiate paging
* activity which may necessitate locking the vnode.
locked
= VOP_ISLOCKED(vp
);
* Must use vm_object_lookup() as it actually removes
* the object from the cache list.
object
= vm_object_lookup(pager
);
uncached
= (object
->ref_count
<= 1);
pager_cache(object
, FALSE
);
* Calculate the linear (byte) disk address of the specified virtual file address.
vnode_pager_addr(vp
, address
)
bsize
= vp
->v_mount
->mnt_stat
.f_bsize
;
vblock
= address
/ bsize
;
voffset
= address
% bsize
;
err
= VOP_BMAP(vp
,vblock
,&rtvp
,&block
);
rtaddress
= block
* DEV_BSIZE
+ voffset
;
* interrupt routine for I/O completion
* Perform read or write operation for vnode_paging
* vnp -- pointer to vnode pager data structure
* containing size and vnode pointer, etc
* m -- pointer to array of vm_page_t entries to
* do I/O to. It is not necessary to fill any
* pages except for the reqpage entry. If a
* page is not filled, it needs to be removed
* count -- number of pages for I/O
* reqpage -- fault requested page for I/O
* (index into vm_page_t entries above)
* rw -- UIO_READ or UIO_WRITE
* NOTICE!!!! direct writes look like they are close to being
* implemented. They are not really; several things need
* to be done to make them work (subtle things). Hack at
* your own risk (direct writes are scary).
* We currently only support direct I/O to filesystems whose
* contiguously allocated blocksize is at least a vm page.
* Changes will be made in the future to support more flexibility.
vnode_pager_io(vnp
, m
, count
, reqpage
, rw
)
struct proc
*p
= curproc
; /* XXX */
vm_offset_t paging_offset
;
int errtype
=0; /* 0 is file type otherwise vm type */
object
= m
[reqpage
]->object
; /* all vm_page_t items are in same object */
paging_offset
= object
->paging_offset
;
* get the UNDERLYING device for the file
bsize
= vp
->v_mount
->mnt_stat
.f_bsize
;
* trim off unnecessary pages
for (i
= reqpage
- 1; i
>= 0; --i
) {
if (m
[i
]->object
!= object
) {
vnode_pager_freepage(m
[j
]);
for (j
= i
+ 1; j
< count
; j
++) {
for (i
= reqpage
+ 1; i
< count
; i
++) {
if ((m
[i
]->object
!= object
) ||
(m
[i
]->offset
+ paging_offset
>= vnp
->vnp_size
)) {
for (j
= i
; j
< count
; j
++)
vnode_pager_freepage(m
[j
]);
* we only do direct I/O if the file is on a local
* BLOCK device and currently if it is a read operation only.
if (rw
== UIO_READ
&& dp
->v_type
== VBLK
&&
vp
->v_mount
->mnt_stat
.f_type
== MOUNT_UFS
) {
* we do not block for a kva; notice that we default to kva-conservative behavior
kva
= kmem_alloc_pageable(pager_map
, (mapsize
= count
*NBPG
));
* here on I/O through VFS
for (i
= 0; i
< count
; i
++) {
vnode_pager_freepage(m
[i
]);
foff
= m
[0]->offset
+ paging_offset
;
* Return failure if beyond current EOF
if (foff
>= vnp
->vnp_size
) {
if (foff
+ NBPG
> vnp
->vnp_size
)
size
= vnp
->vnp_size
- foff
;
* Allocate a kernel virtual address and initialize so that
* we can use VOP_READ/WRITE routines.
kva
= vm_pager_map_page(m
[0]);
aiov
.iov_base
= (caddr_t
)kva
;
auio
.uio_segflg
= UIO_SYSSPACE
;
auio
.uio_procp
= (struct proc
*)0;
error
= VOP_READ(vp
, &auio
, IO_PAGER
, p
->p_ucred
);
error
= VOP_WRITE(vp
, &auio
, IO_PAGER
, p
->p_ucred
);
register int count
= size
- auio
.uio_resid
;
else if (count
!= NBPG
&& rw
== UIO_READ
)
bzero((caddr_t
)kva
+ count
, NBPG
- count
);
vm_pager_unmap_page(kva
);
* here on direct device I/O
foff
= m
[reqpage
]->offset
+ paging_offset
;
* This pathetic hack gets data from the buffer cache, if it's there.
* I believe that this is not really necessary, and the ends can
* be gotten by defaulting to the normal vfs read behavior, but this
* might be more efficient, because this will NOT invoke read-aheads
* and one of the purposes of this code is to bypass the buffer
* cache and keep from flushing it by reading in a program.
* calculate logical block and offset
* if we have a buffer in core, then try to use it
while (bp
= incore(vp
, block
)) {
* wait until the buffer is avail or gone
if (bp
->b_flags
& B_BUSY
) {
tsleep ((caddr_t
)bp
, PVM
, "vnwblk", 0);
if ((foff
+ amount
) > vnp
->vnp_size
)
amount
= vnp
->vnp_size
- foff
;
* make sure that this page is in the buffer
if ((amount
> 0) && (offset
+ amount
) <= bp
->b_bcount
) {
pmap_enter(vm_map_pmap(pager_map
),
kva
, VM_PAGE_TO_PHYS(m
[reqpage
]),
* copy the data from the buffer
bcopy(bp
->b_un
.b_addr
+ offset
, (caddr_t
)kva
, amount
);
bzero((caddr_t
)kva
+ amount
, NBPG
- amount
);
* unmap the page and free the kva
pmap_remove(vm_map_pmap(pager_map
), kva
, kva
+ NBPG
);
kmem_free_wakeup(pager_map
, kva
, mapsize
);
* release the buffer back to the block subsystem
* we did not have to do any work to get the requested
* page, the read behind/ahead does not justify a read
for (i
= 0; i
< count
; i
++) {
vnode_pager_freepage(m
[i
]);
* buffer is nowhere to be found, read from the disk
foff
= m
[reqpage
]->offset
+ paging_offset
;
reqaddr
= vnode_pager_addr(vp
, foff
);
* Make sure that our I/O request is contiguous.
* Scan backward and stop for the first discontiguous
* entry or stop for a page being in buffer cache.
for (i
= reqpage
- 1; i
>= 0; --i
) {
incore(vp
, (foff
+ (i
- reqpage
) * NBPG
) / bsize
) ||
(myaddr
= vnode_pager_addr(vp
, m
[i
]->offset
+ paging_offset
))
!= reqaddr
+ (i
- reqpage
) * NBPG
) {
vnode_pager_freepage(m
[i
]);
* Scan forward and stop for the first discontiguous
* entry or stop for a page being in buffer cache.
for (i
= reqpage
+ 1; i
< count
; i
++) {
incore(vp
, (foff
+ (i
- reqpage
) * NBPG
) / bsize
) ||
(myaddr
= vnode_pager_addr(vp
, m
[i
]->offset
+ paging_offset
))
!= reqaddr
+ (i
- reqpage
) * NBPG
) {
vnode_pager_freepage(m
[i
]);
* the first and last page have been calculated now, move input
* pages to be zero based...
for (i
= first
; i
< count
; i
++) {
* calculate the file virtual address for the transfer
foff
= m
[0]->offset
+ paging_offset
;
* and get the disk physical address (in bytes)
firstaddr
= vnode_pager_addr(vp
, foff
);
* calculate the size of the transfer
if ((m
[count
- 1]->offset
+ paging_offset
) + NBPG
> vnp
->vnp_size
)
size
= vnp
->vnp_size
- foff
;
* and map the pages to be read into the kva
for (i
= 0; i
< count
; i
++)
pmap_enter(vm_map_pmap(pager_map
),
kva
+ NBPG
* i
, VM_PAGE_TO_PHYS(m
[i
]),
/* build a minimal buffer header */
bzero((caddr_t
)bp
, sizeof(struct buf
));
bp
->b_flags
= B_BUSY
| B_READ
| B_CALL
;
bp
->b_iodone
= vnode_pager_iodone
;
/* B_PHYS is not set, but it is nice to fill this in */
bp
->b_un
.b_addr
= (caddr_t
) kva
;
bp
->b_blkno
= firstaddr
/ DEV_BSIZE
;
/* Should be a BLOCK or character DEVICE if we get here */
bp
->b_bcount
= NBPG
* count
;
/* we definitely need to be at splbio here */
while ((bp
->b_flags
& B_DONE
) == 0) {
tsleep((caddr_t
)bp
, PVM
, "vnread", 0);
if ((bp
->b_flags
& B_ERROR
) != 0)
if (size
!= count
* NBPG
)
bzero((caddr_t
)kva
+ size
, NBPG
* count
- size
);
pmap_remove(vm_map_pmap(pager_map
), kva
, kva
+ NBPG
* count
);
kmem_free_wakeup(pager_map
, kva
, mapsize
);
* free the buffer header back to the swap buffer pool
for (i
= 0; i
< count
; i
++) {
* we don't mess with pages that have been already
pmap_clear_modify(VM_PAGE_TO_PHYS(m
[i
]));
m
[i
]->flags
&= ~PG_LAUNDRY
;
* whether or not to leave the page activated
* is up in the air, but we should put the page
* on a page queue somewhere. (it already is in
* Result: It appears that empirical results show
* that deactivating pages is best.
* just in case someone was asking for this
* page we now tell them that it is ok to use
vm_page_deactivate(m
[i
]);
vnode_pager_freepage(m
[i
]);
if (!error
&& rw
== UIO_WRITE
) {
pmap_clear_modify(VM_PAGE_TO_PHYS(m
[reqpage
]));
m
[reqpage
]->flags
|= PG_CLEAN
;
m
[reqpage
]->flags
&= ~PG_LAUNDRY
;
printf("vnode pager error: %d\n", error
);
return (error
? VM_PAGER_FAIL
: VM_PAGER_OK
);