/* kern_physio.c 4.23 81/07/25 */
/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	baddr		(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 * the release routines.
 */
/*
 * NOTE(review): these declarations were reassembled from a fragmented
 * extraction; token content is unchanged, only layout and comments added.
 */

/* Heads of the buffer free lists, one per queue (BQ_LOCKED/BQ_LRU/BQ_AGE...). */
struct buf bfreelist[BQUEUES];

/*
 * bswlist heads the free list of swap I/O headers (see swap());
 * bclnlist apparently chains cleaned pages for the pageout daemon —
 * TODO(review): confirm against the biodone()/pageout code.
 */
struct buf bswlist, *bclnlist;

/* Buffer-cache hash table: chains of buffers hashed by (dev, blkno). */
struct bufhd bufhash[BUFHSZ];

/* Map a (device, disk block number) pair to its hash chain head. */
#define BUFHASH(dev, dblkno) \
	((struct buf *)&bufhash[((int)(dev)+(int)(dblkno)) % BUFHSZ])
/*
 * Initialize hash links for buffers.
 */
register struct bufhd
*bp
;
for (bp
= bufhash
, i
= 0; i
< BUFHSZ
; i
++, bp
++)
bp
->b_forw
= bp
->b_back
= (struct buf
*)bp
;
/*
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be in three
 * different lists.  When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
short *swsize
; /* CAN WE JUST USE B_BCOUNT? */
(bp)->av_back->av_forw = (bp)->av_forw; \
(bp)->av_forw->av_back = (bp)->av_back; \
(bp)->b_flags |= B_BUSY; \
/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
if (bp
->b_flags
&B_DONE
) {
trace(TR_BREADHIT
, dev
, blkno
);
(*bdevsw
[major(dev
)].d_strategy
)(bp
);
trace(TR_BREADMISS
, dev
, blkno
);
u
.u_vm
.vm_inblk
++; /* pay for read */
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
breada(dev
, blkno
, rablkno
)
register struct buf
*bp
, *rabp
;
if (!incore(dev
, blkno
)) {
if ((bp
->b_flags
&B_DONE
) == 0) {
(*bdevsw
[major(dev
)].d_strategy
)(bp
);
trace(TR_BREADMISS
, dev
, blkno
);
u
.u_vm
.vm_inblk
++; /* pay for read */
trace(TR_BREADHIT
, dev
, blkno
);
if (rablkno
&& !incore(dev
, rablkno
)) {
rabp
= getblk(dev
, rablkno
);
if (rabp
->b_flags
& B_DONE
) {
trace(TR_BREADHITRA
, dev
, blkno
);
rabp
->b_flags
|= B_READ
|B_ASYNC
;
(*bdevsw
[major(dev
)].d_strategy
)(rabp
);
trace(TR_BREADMISSRA
, dev
, rablock
);
u
.u_vm
.vm_inblk
++; /* pay in advance */
return(bread(dev
, blkno
));
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bp
->b_flags
&= ~(B_READ
| B_DONE
| B_ERROR
| B_DELWRI
| B_AGE
);
if ((flag
&B_DELWRI
) == 0)
u
.u_vm
.vm_oublk
++; /* noone paid yet */
trace(TR_BWRITE
, bp
->b_dev
, bp
->b_blkno
);
(*bdevsw
[major(bp
->b_dev
)].d_strategy
)(bp
);
if ((flag
&B_ASYNC
) == 0) {
} else if (flag
& B_DELWRI
)
/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
if ((bp
->b_flags
&B_DELWRI
) == 0)
u
.u_vm
.vm_oublk
++; /* noone paid yet */
flags
= bdevsw
[major(bp
->b_dev
)].d_flags
;
bp
->b_flags
|= B_DELWRI
| B_DONE
;
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
/*
 * Release the buffer, with no I/O implied.
 */
register struct buf
*flist
;
if (bp
->b_flags
&B_WANTED
)
if (bfreelist
[0].b_flags
&B_WANTED
) {
bfreelist
[0].b_flags
&= ~B_WANTED
;
wakeup((caddr_t
)bfreelist
);
if (bp
->b_flags
& B_LOCKED
)
bp
->b_flags
&= ~B_ERROR
; /* try again later */
bp
->b_dev
= NODEV
; /* no assoc */
if (bp
->b_flags
& (B_ERROR
|B_INVAL
)) {
/* block has no info ... put at front of most free list */
flist
= &bfreelist
[BQUEUES
-1];
flist
->av_forw
->av_back
= bp
;
bp
->av_forw
= flist
->av_forw
;
if (bp
->b_flags
& B_LOCKED
)
flist
= &bfreelist
[BQ_LOCKED
];
else if (bp
->b_flags
& B_AGE
)
flist
= &bfreelist
[BQ_AGE
];
flist
= &bfreelist
[BQ_LRU
];
flist
->av_back
->av_forw
= bp
;
bp
->av_back
= flist
->av_back
;
bp
->b_flags
&= ~(B_WANTED
|B_BUSY
|B_ASYNC
|B_AGE
);
/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
register int dblkno
= fsbtodb(blkno
);
dp
= BUFHASH(dev
, dblkno
);
for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
)
if (bp
->b_blkno
== dblkno
&& bp
->b_dev
== dev
&&
!(bp
->b_flags
& B_INVAL
))
return (bread(dev
, blkno
));
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
register struct buf
*bp
, *dp
, *ep
;
register int dblkno
= fsbtodb(blkno
);
if ((unsigned)blkno
>= 1 << (sizeof(int)*NBBY
-PGSHIFT
))
blkno
= 1 << ((sizeof(int)*NBBY
-PGSHIFT
) + 1);
dp
= BUFHASH(dev
, dblkno
);
for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
if (bp
->b_blkno
!= dblkno
|| bp
->b_dev
!= dev
||
if (bp
->b_flags
&B_BUSY
) {
sleep((caddr_t
)bp
, PRIBIO
+1);
while ((dp
->b_flags
& B_HEAD
) == 0) {
if (major(dev
) >= nblkdev
)
for (ep
= &bfreelist
[BQUEUES
-1]; ep
> bfreelist
; ep
--)
if (ep
== bfreelist
) { /* no free blocks at all */
sleep((caddr_t
)ep
, PRIBIO
+1);
if (bp
->b_flags
& B_DELWRI
) {
trace(TR_BRELSE
, bp
->b_dev
, bp
->b_blkno
);
bp
->b_back
->b_forw
= bp
->b_forw
;
bp
->b_forw
->b_back
= bp
->b_back
;
/*
 * Get a buffer not assigned to any particular device.
 */
register struct buf
*bp
, *dp
;
for (dp
= &bfreelist
[BQUEUES
-1]; dp
> bfreelist
; dp
--)
if (dp
== bfreelist
) { /* no free blocks */
sleep((caddr_t
)dp
, PRIBIO
+1);
if (bp
->b_flags
& B_DELWRI
) {
trace(TR_BRELSE
, bp
->b_dev
, bp
->b_blkno
);
bp
->b_flags
= B_BUSY
|B_INVAL
;
bp
->b_back
->b_forw
= bp
->b_forw
;
bp
->b_forw
->b_back
= bp
->b_back
;
bp
->b_dev
= (dev_t
)NODEV
;
/*
 * Wait for I/O completion on the buffer; return errors
 * to the caller.
 */
while ((bp
->b_flags
&B_DONE
)==0)
sleep((caddr_t
)bp
, PRIBIO
);
/*
 * Unlink a buffer from the available list and mark it busy.
 */
bp
->av_back
->av_forw
= bp
->av_forw
;
bp
->av_forw
->av_back
= bp
->av_back
;
/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
if (bp
->b_flags
& B_DONE
)
if (bp
->b_flags
& B_DIRTY
) {
if (bp
->b_flags
& B_ERROR
)
bp
->b_bcount
= swsize
[bp
- swbuf
];
bp
->b_pfcent
= swpf
[bp
- swbuf
];
cnt
.v_pgpgout
+= bp
->b_bcount
/ NBPG
;
if (bswlist
.b_flags
& B_WANTED
)
wakeup((caddr_t
)&proc
[2]);
bp
->b_flags
&= ~B_WANTED
;
/*
 * Zero the core associated with a buffer.
 */
/*
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p
, dblkno
, addr
, nbytes
, rdflg
, flag
, dev
, pfcent
)
register struct pte
*dpte
, *vpte
;
while (bswlist
.av_forw
== NULL
) {
bswlist
.b_flags
|= B_WANTED
;
sleep((caddr_t
)&bswlist
, PSWP
+1);
bswlist
.av_forw
= bp
->av_forw
;
bp
->b_flags
= B_BUSY
| B_PHYS
| rdflg
| flag
;
if ((bp
->b_flags
& (B_DIRTY
|B_PGIN
)) == 0)
sum
.v_pswpin
+= btoc(nbytes
);
sum
.v_pswpout
+= btoc(nbytes
);
p2dp
= ((bp
- swbuf
) * CLSIZE
) * KLMAX
;
dpte
= dptopte(&proc
[2], p2dp
);
vpte
= vtopte(p
, btop(addr
));
for (c
= 0; c
< nbytes
; c
+= NBPG
) {
if (vpte
->pg_pfnum
== 0 || vpte
->pg_fod
)
bp
->b_un
.b_addr
= (caddr_t
)ctob(p2dp
);
c
= imin(ctob(120), nbytes
);
swpf
[bp
- swbuf
] = pfcent
;
swsize
[bp
- swbuf
] = nbytes
;
trace(TR_SWAPIO
, dev
, bp
->b_blkno
);
(*bdevsw
[major(dev
)].d_strategy
)(bp
);
while((bp
->b_flags
&B_DONE
)==0)
sleep((caddr_t
)bp
, PSWP
);
if (bp
->b_flags
& B_ERROR
) {
if ((flag
& (B_UAREA
|B_PAGET
)) || rdflg
== B_WRITE
)
panic("hard IO err in swap");
bp
->b_flags
&= ~(B_BUSY
|B_WANTED
|B_PHYS
|B_PAGET
|B_UAREA
|B_DIRTY
);
bp
->av_forw
= bswlist
.av_forw
;
if (bswlist
.b_flags
& B_WANTED
) {
bswlist
.b_flags
&= ~B_WANTED
;
wakeup((caddr_t
)&bswlist
);
wakeup((caddr_t
)&proc
[2]);
/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
printf("pid %d: ", p
->p_pid
);
printf(mesg
= "killed due to no swap space\n");
printf(mesg
= "killed on swap error\n");
uprintf("sorry, pid %d was %s", p
->p_pid
, mesg
);
/*
 * To be sure no looping (e.g. in vmsched trying to
 * swap out) mark process locked in core (as though
 * done by user) after killing it so no one will try
 * to swap it out.
 */
/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
register struct buf
*flist
;
for (flist
= bfreelist
; flist
< &bfreelist
[BQUEUES
]; flist
++)
for (bp
= flist
->av_forw
; bp
!= flist
; bp
= bp
->av_forw
) {
if (bp
->b_flags
&B_DELWRI
&& (dev
== NODEV
||dev
==bp
->b_dev
)) {
/*
 * Raw I/O.  The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat
, bp
, dev
, rw
, mincnt
)
if (useracc(u
.u_base
,u
.u_count
,rw
==B_READ
?B_WRITE
:B_READ
) == NULL
) {
while (bp
->b_flags
&B_BUSY
) {
sleep((caddr_t
)bp
, PRIBIO
+1);
bp
->b_un
.b_addr
= u
.u_base
;
bp
->b_flags
= B_BUSY
| B_PHYS
| rw
;
bp
->b_blkno
= u
.u_offset
>> PGSHIFT
;
bp
->b_bcount
= u
.u_count
;
u
.u_procp
->p_flag
|= SPHYSIO
;
vslock(a
= bp
->b_un
.b_addr
, c
);
while ((bp
->b_flags
&B_DONE
) == 0)
sleep((caddr_t
)bp
, PRIBIO
);
u
.u_procp
->p_flag
&= ~SPHYSIO
;
if (bp
->b_flags
&B_WANTED
)
bp
->b_flags
&= ~(B_BUSY
|B_WANTED
|B_PHYS
);
if (bp
->b_bcount
> 60 * 1024)
bp
->b_bcount
= 60 * 1024;
/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
if ((u
.u_error
= bp
->b_error
)==0)
/*
 * Invalidate in core blocks belonging to closed or umounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness.
 */
register struct bufhd
*hp
;
#define dp ((struct buf *)hp)
for (hp
= bufhash
; hp
< &bufhash
[BUFHSZ
]; hp
++)
for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
)