/* vfs_bio.c 3.1 %H% */

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/* #define DISKMON 1 */

#ifdef DISKMON
struct {
        int nbuf;
        long nread;
        long nreada;
        long ncache;
        long nwrite;
        long bufcount[NBUF];
} io_info;
#endif
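
/*
 * When DISKMON is defined, the counters above are maintained by the
 * routines below: ncache and nread in bread, nreada in breada, nwrite
 * in bwrite, and bufcount (a histogram of where in the free list
 * getblk finds its cache hits) in getblk.
 */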

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be on one of three
 * different lists. When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct buf swbuf[NSWBUF];
short swsize[NSWBUF]; /* CAN WE JUST USE B_BCOUNT? */
int swpf[NSWBUF];
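
/*
 * swsize and swpf parallel swbuf: they are indexed by (bp - swbuf) and
 * record the transfer size and the pfcent value of a dirty page push,
 * set in swap() and read back in iodone().
 */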

/*
 * The following several routines allocate and free
 * buffers with various side effects. In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it. If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *      getblk
 *      bread
 *      breada
 *      baddr (if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *      bwrite
 *      bdwrite
 *      bawrite
 *      brelse
 */
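
#ifdef notdef
/*
 * Purely illustrative sketch, not part of the original source: a typical
 * consumer of the routines above reads a block, checks for an error, uses
 * the data, and releases the buffer.  The name "readexample" is hypothetical.
 */
readexample(dev, blkno)
dev_t dev;
daddr_t blkno;
{
        register struct buf *bp;

        bp = bread(dev, blkno);
        if ((bp->b_flags&B_ERROR) == 0) {
                /* bp->b_un.b_addr points at BSIZE bytes of valid data */
        }
        brelse(bp);
}
#endif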

#ifdef FASTVAX
#define notavail(bp) \
{ \
        int s = spl6(); \
        (bp)->av_back->av_forw = (bp)->av_forw; \
        (bp)->av_forw->av_back = (bp)->av_back; \
        (bp)->b_flags |= B_BUSY; \
        splx(s); \
}
#endif
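
/*
 * On a FASTVAX kernel the macro above replaces the notavail() function
 * defined later in this file (under #ifndef FASTVAX), expanding the
 * free-list unlink in line to avoid the call overhead.
 */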

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
dev_t dev;
daddr_t blkno;
{
        register struct buf *bp;

        bp = getblk(dev, blkno);
        if (bp->b_flags&B_DONE) {
#ifdef DISKMON
                io_info.ncache++;
#endif
                return(bp);
        }
        bp->b_flags |= B_READ;
        bp->b_bcount = BSIZE;
        (*bdevsw[major(dev)].d_strategy)(bp);
#ifdef DISKMON
        io_info.nread++;
#endif
        u.u_vm.vm_inblk++;              /* pay for read */
        iowait(bp);
        return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
struct buf *
breada(dev, blkno, rablkno)
dev_t dev;
daddr_t blkno, rablkno;
{
        register struct buf *bp, *rabp;

        bp = NULL;
        if (!incore(dev, blkno)) {
                bp = getblk(dev, blkno);
                if ((bp->b_flags&B_DONE) == 0) {
                        bp->b_flags |= B_READ;
                        bp->b_bcount = BSIZE;
                        (*bdevsw[major(dev)].d_strategy)(bp);
#ifdef DISKMON
                        io_info.nread++;
#endif
                        u.u_vm.vm_inblk++;              /* pay for read */
                }
        }
        if (rablkno && !incore(dev, rablkno)) {
                rabp = getblk(dev, rablkno);
                if (rabp->b_flags & B_DONE)
                        brelse(rabp);
                else {
                        rabp->b_flags |= B_READ|B_ASYNC;
                        rabp->b_bcount = BSIZE;
                        (*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef DISKMON
                        io_info.nreada++;
#endif
                        u.u_vm.vm_inblk++;              /* pay in advance */
                }
        }
        if(bp == NULL)
                return(bread(dev, blkno));
        iowait(bp);
        return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
        register flag;

        flag = bp->b_flags;
        bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
        bp->b_bcount = BSIZE;
#ifdef DISKMON
        io_info.nwrite++;
#endif
        if ((flag&B_DELWRI) == 0)
                u.u_vm.vm_oublk++;              /* no one paid yet */
        (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
        if ((flag&B_ASYNC) == 0) {
                iowait(bp);
                brelse(bp);
        } else if (flag & B_DELWRI)
                bp->b_flags |= B_AGE;
        else
                geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
        register struct buf *dp;

        if ((bp->b_flags&B_DELWRI) == 0)
                u.u_vm.vm_oublk++;              /* no one paid yet */
        dp = bdevsw[major(bp->b_dev)].d_tab;
        if(dp->b_flags & B_TAPE)
                bawrite(bp);
        else {
                bp->b_flags |= B_DELWRI | B_DONE;
                brelse(bp);
        }
}
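
#ifdef notdef
/*
 * Illustrative sketch only, not part of the original source: the
 * partial-block write pattern the comment above alludes to.  The block
 * is read, part of it is updated in core, and bdwrite() releases it with
 * B_DELWRI set so the physical write is deferred.  The names "writepart",
 * "cp", "off" and "n" are hypothetical.
 */
writepart(dev, blkno, cp, off, n)
dev_t dev;
daddr_t blkno;
caddr_t cp;
int off, n;
{
        register struct buf *bp;

        bp = bread(dev, blkno);
        /* copy n bytes from cp into bp->b_un.b_addr + off here */
        bdwrite(bp);
}
#endif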

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

        bp->b_flags |= B_ASYNC;
        bwrite(bp);
}

/*
 * release the buffer, with no I/O implied.
 */
brelse(bp)
register struct buf *bp;
{
        register struct buf **backp;
        register s;

        if (bp->b_flags&B_WANTED)
                wakeup((caddr_t)bp);
        if (bfreelist.b_flags&B_WANTED) {
                bfreelist.b_flags &= ~B_WANTED;
                wakeup((caddr_t)&bfreelist);
        }
        if (bp->b_flags&B_ERROR)
                bp->b_dev = NODEV;              /* no assoc. on error */
        s = spl6();
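        /*
         * Buffers marked B_AGE or B_ERROR are put at the head of the
         * free list so they are reused first; all other buffers go to
         * the tail, giving LRU-style replacement.
         */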
        if(bp->b_flags & (B_AGE|B_ERROR)) {
                backp = &bfreelist.av_forw;
                (*backp)->av_back = bp;
                bp->av_forw = *backp;
                *backp = bp;
                bp->av_back = &bfreelist;
        } else {
                backp = &bfreelist.av_back;
                (*backp)->av_forw = bp;
                bp->av_back = *backp;
                *backp = bp;
                bp->av_forw = &bfreelist;
        }
        bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
        splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;
        register int dblkno = fsbtodb(blkno);

        dp = bdevsw[major(dev)].d_tab;
        for (bp=dp->b_forw; bp != dp; bp = bp->b_forw)
                if (bp->b_blkno==dblkno && bp->b_dev==dev)
                        return(1);
        return(0);
}

struct buf *
baddr(dev, blkno)
dev_t dev;
daddr_t blkno;
{

        if (incore(dev, blkno))
                return (bread(dev, blkno));
        return (0);
}

/*
 * Assign a buffer for the given block. If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
dev_t dev;
daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;
#ifdef DISKMON
        register i;
#endif
        register int dblkno = fsbtodb(blkno);

        if(major(dev) >= nblkdev)
                panic("blkdev");

loop:
        VOID spl0();
        dp = bdevsw[major(dev)].d_tab;
        if(dp == NULL)
                panic("devtab");
        for (bp=dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno!=dblkno || bp->b_dev!=dev)
                        continue;
                VOID spl6();
                if (bp->b_flags&B_BUSY) {
                        bp->b_flags |= B_WANTED;
                        sleep((caddr_t)bp, PRIBIO+1);
                        goto loop;
                }
                VOID spl0();
#ifdef DISKMON
                i = 0;
                dp = bp->av_forw;
                while (dp != &bfreelist) {
                        i++;
                        dp = dp->av_forw;
                }
                if (i<NBUF)
                        io_info.bufcount[i]++;
#endif
                notavail(bp);
                bp->b_flags |= B_CACHE;
                return(bp);
        }
        VOID spl6();
        if (bfreelist.av_forw == &bfreelist) {
                bfreelist.b_flags |= B_WANTED;
                sleep((caddr_t)&bfreelist, PRIBIO+1);
                goto loop;
        }
        spl0();
        bp = bfreelist.av_forw;
        notavail(bp);
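        /*
         * A delayed-write buffer holds data that has not yet reached the
         * disk, so it must be written out (asynchronously) before the
         * header can be reassigned; then the search starts over.
         */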
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags |= B_ASYNC;
                bwrite(bp);
                goto loop;
        }
        bp->b_flags = B_BUSY;
        bp->b_back->b_forw = bp->b_forw;
        bp->b_forw->b_back = bp->b_back;
        bp->b_forw = dp->b_forw;
        bp->b_back = dp;
        dp->b_forw->b_back = bp;
        dp->b_forw = bp;
        bp->b_dev = dev;
        bp->b_blkno = dblkno;
        return(bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk()
{
        register struct buf *bp;
        register struct buf *dp;

loop:
        VOID spl6();
        while (bfreelist.av_forw == &bfreelist) {
                bfreelist.b_flags |= B_WANTED;
                sleep((caddr_t)&bfreelist, PRIBIO+1);
        }
        VOID spl0();
        dp = &bfreelist;
        bp = bfreelist.av_forw;
        notavail(bp);
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags |= B_ASYNC;
                bwrite(bp);
                goto loop;
        }
        bp->b_flags = B_BUSY;
        bp->b_back->b_forw = bp->b_forw;
        bp->b_forw->b_back = bp->b_back;
        bp->b_forw = dp->b_forw;
        bp->b_back = dp;
        dp->b_forw->b_back = bp;
        dp->b_forw = bp;
        bp->b_dev = (dev_t)NODEV;
        return(bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
register struct buf *bp;
{

        VOID spl6();
        while ((bp->b_flags&B_DONE)==0)
                sleep((caddr_t)bp, PRIBIO);
        VOID spl0();
        geterror(bp);
}

#ifndef FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
register struct buf *bp;
{
        register s;

        s = spl6();
        bp->av_back->av_forw = bp->av_forw;
        bp->av_forw->av_back = bp->av_back;
        bp->b_flags |= B_BUSY;
        splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer. If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon. Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
        register int s;

        bp->b_flags |= B_DONE;
        if (bp->b_flags & B_DIRTY) {
                if (bp->b_flags & B_ERROR)
                        panic("IO err in push");
                s = spl6();
                cnt.v_pgout++;
                bp->av_forw = bclnlist;
                bp->b_bcount = swsize[bp - swbuf];
                bp->b_pfcent = swpf[bp - swbuf];
                bclnlist = bp;
                if (bswlist.b_flags & B_WANTED)
                        wakeup((caddr_t)&proc[2]);
                splx(s);
        }
        if (bp->b_flags&B_ASYNC)
                brelse(bp);
        else {
                bp->b_flags &= ~B_WANTED;
                wakeup((caddr_t)bp);
        }
}

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
struct buf *bp;
{
        register *p;
        register c;

        p = bp->b_un.b_words;
        c = BSIZE/sizeof(int);
        do
                *p++ = 0;
        while (--c);
        bp->b_resid = 0;
}

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i'th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion. When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
        struct proc *p;
        swblk_t dblkno;
        caddr_t addr;
        int flag, nbytes;
        dev_t dev;
        unsigned pfcent;
{
        register struct buf *bp;
        register int c;
        int p2dp;
        register struct pte *dpte, *vpte;

        VOID spl6();
        while (bswlist.av_forw == NULL) {
                bswlist.b_flags |= B_WANTED;
                sleep((caddr_t)&bswlist, PSWP+1);
        }
        bp = bswlist.av_forw;
        bswlist.av_forw = bp->av_forw;
        VOID spl0();

        bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
        if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
                if (rdflg == B_READ)
                        sum.v_pswpin += btoc(nbytes);
                else
                        sum.v_pswpout += btoc(nbytes);
        bp->b_proc = p;
        if (flag & B_DIRTY) {
                p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
                dpte = dptopte(&proc[2], p2dp);
                vpte = vtopte(p, btop(addr));
                for (c = 0; c < nbytes; c += NBPG) {
                        if (vpte->pg_pfnum == 0 || vpte->pg_fod)
                                panic("swap bad pte");
                        *dpte++ = *vpte++;
                }
                bp->b_un.b_addr = (caddr_t)ctob(p2dp);
        } else
                bp->b_un.b_addr = addr;
        while (nbytes > 0) {
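                /*
                 * Each pass transfers at most 120 clicks (about 60 Kbytes
                 * with 512-byte pages), presumably to stay within the same
                 * transfer-size limit that minphys() imposes on raw I/O.
                 */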
                c = imin(ctob(120), nbytes);
                bp->b_bcount = c;
                bp->b_blkno = dblkno;
                bp->b_dev = dev;
                if (dev == swapdev)
                        bp->b_blkno += swplo;
                (*bdevsw[major(dev)].d_strategy)(bp);
                if (flag & B_DIRTY) {
                        if (c < nbytes)
                                panic("big push");
                        swsize[bp - swbuf] = nbytes;
                        swpf[bp - swbuf] = pfcent;
                        return;
                }
                VOID spl6();
                while((bp->b_flags&B_DONE)==0)
                        sleep((caddr_t)bp, PSWP);
                VOID spl0();
                bp->b_un.b_addr += c;
                bp->b_flags &= ~B_DONE;
                if (bp->b_flags & B_ERROR) {
                        if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
                                panic("hard IO err in swap");
                        swkill(p, (char *)0);
                }
                nbytes -= c;
                dblkno += btoc(c);
        }
        VOID spl6();
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
        bp->av_forw = bswlist.av_forw;
        bswlist.av_forw = bp;
        if (bswlist.b_flags & B_WANTED) {
                bswlist.b_flags &= ~B_WANTED;
                wakeup((caddr_t)&bswlist);
                wakeup((caddr_t)&proc[2]);
        }
        VOID spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
        struct proc *p;
        char *rout;
{

        printf("%d: ", p->p_pid);
        if (rout)
                printf("out of swap space in %s\n", rout);
        else
                printf("killed on swap error\n");
        /*
         * To be sure no looping (e.g. in vmsched trying to
         * swap out) mark process locked in core (as though
         * done by user) after killing it so no one will try
         * to swap it out.
         */
        psignal(p, SIGKIL);
        p->p_flag |= SULOCK;
}

/*
 * make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
dev_t dev;
{
        register struct buf *bp;

loop:
        VOID spl6();
        for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
                if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
                        bp->b_flags |= B_ASYNC;
                        notavail(bp);
                        bwrite(bp);
                        goto loop;
                }
        }
        VOID spl0();
}

/*
 * Raw I/O. The arguments are
 *      The strategy routine for the device
 *      A buffer, which will always be a special buffer
 *        header owned exclusively by the device for this purpose
 *      The device number
 *      Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked. After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
int (*strat)();
register struct buf *bp;
unsigned (*mincnt)();
{
        register int c;
        char *a;

        if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
                u.u_error = EFAULT;
                return;
        }
        VOID spl6();
        while (bp->b_flags&B_BUSY) {
                bp->b_flags |= B_WANTED;
                sleep((caddr_t)bp, PRIBIO+1);
        }
        bp->b_error = 0;
        bp->b_proc = u.u_procp;
        bp->b_un.b_addr = u.u_base;
        while (u.u_count != 0 && bp->b_error==0) {
                bp->b_flags = B_BUSY | B_PHYS | rw;
                bp->b_dev = dev;
                bp->b_blkno = u.u_offset >> PGSHIFT;
                bp->b_bcount = u.u_count;
                (*mincnt)(bp);
                c = bp->b_bcount;
                u.u_procp->p_flag |= SPHYSIO;
                vslock(a = bp->b_un.b_addr, c);
                (*strat)(bp);
                VOID spl6();
                while ((bp->b_flags&B_DONE) == 0)
                        sleep((caddr_t)bp, PRIBIO);
                vsunlock(a, c, rw);
                u.u_procp->p_flag &= ~SPHYSIO;
                if (bp->b_flags&B_WANTED)
                        wakeup((caddr_t)bp);
                VOID spl0();
                bp->b_un.b_addr += c;
                u.u_count -= c;
                u.u_offset += c;
        }
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
        u.u_count = bp->b_resid;
        geterror(bp);
}
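
#ifdef notdef
/*
 * Illustrative sketch only, not part of the original source: the raw
 * read entry of a character device driver typically just hands its
 * private buffer header and strategy routine to physio().  The names
 * "xxread", "xxstrategy" and "rxxbuf" are hypothetical.
 */
xxread(dev)
dev_t dev;
{

        physio(xxstrategy, &rxxbuf, dev, B_READ, minphys);
}
#endif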

/*ARGSUSED*/
unsigned
minphys(bp)
struct buf *bp;
{

        if (bp->b_bcount > 60 * 1024)
                bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code. Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

        if (bp->b_flags&B_ERROR)
                if ((u.u_error = bp->b_error)==0)
                        u.u_error = EIO;
}