merge in vnode changes — usr/src/sys/kern/kern_physio.c (unix-history)
/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)kern_physio.c	7.5 (Berkeley) %G%
 */
#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"
#include "map.h"
#include "uio.h"

#include "machine/pte.h"

#ifdef SECSIZE
#include "file.h"
#include "ioctl.h"
#include "disklabel.h"
#endif /* SECSIZE */
663dbc72 28
663dbc72
BJ
/*
 * Swap I/O headers -
 * They carry the information needed for a swap transfer.  At any
 * given time a header lives on one of three lists: the free list
 * when idle, the swap device's queue while its I/O is pending, and
 * (for a dirty-page push, once the I/O completes) the list of
 * cleaned pages awaiting the pageout daemon.
 */
struct buf *swbuf;		/* base of the swap header array */
663dbc72 40
663dbc72
BJ
41/*
42 * swap I/O -
43 *
44 * If the flag indicates a dirty page push initiated
45 * by the pageout daemon, we map the page into the i th
46 * virtual page of process 2 (the daemon itself) where i is
47 * the index of the swap header that has been allocated.
48 * We simply initialize the header and queue the I/O but
49 * do not wait for completion. When the I/O completes,
ec67a3ce 50 * biodone() will link the header to a list of cleaned
663dbc72
BJ
51 * pages to be processed by the pageout daemon.
52 */
53swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
54 struct proc *p;
55 swblk_t dblkno;
56 caddr_t addr;
39d536e6 57 int nbytes, rdflg, flag;
663dbc72 58 dev_t dev;
39d536e6 59 u_int pfcent;
663dbc72
BJ
60{
61 register struct buf *bp;
663dbc72 62 register struct pte *dpte, *vpte;
c5648f55
KB
63 register u_int c;
64 int p2dp, s, error = 0;
65 struct buf *getswbuf();
66 int swdone();
663dbc72 67
c5648f55 68 bp = getswbuf(PSWP+1);
663dbc72 69 bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
ec67a3ce
MK
70#ifdef SECSIZE
71 bp->b_blksize = DEV_BSIZE;
72#endif SECSIZE
663dbc72
BJ
73 if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
74 if (rdflg == B_READ)
75 sum.v_pswpin += btoc(nbytes);
76 else
77 sum.v_pswpout += btoc(nbytes);
78 bp->b_proc = p;
79 if (flag & B_DIRTY) {
80 p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
81 dpte = dptopte(&proc[2], p2dp);
82 vpte = vtopte(p, btop(addr));
83 for (c = 0; c < nbytes; c += NBPG) {
84 if (vpte->pg_pfnum == 0 || vpte->pg_fod)
85 panic("swap bad pte");
86 *dpte++ = *vpte++;
87 }
d668d9ba
SL
88 bp->b_un.b_addr = (caddr_t)ctob(dptov(&proc[2], p2dp));
89 bp->b_flags |= B_CALL;
90 bp->b_iodone = swdone;
91 bp->b_pfcent = pfcent;
663dbc72
BJ
92 } else
93 bp->b_un.b_addr = addr;
94 while (nbytes > 0) {
e438ed8e
BJ
95 bp->b_bcount = nbytes;
96 minphys(bp);
97 c = bp->b_bcount;
663dbc72
BJ
98 bp->b_blkno = dblkno;
99 bp->b_dev = dev;
53f9ca20
BJ
100#ifdef TRACE
101 trace(TR_SWAPIO, dev, bp->b_blkno);
102#endif
c5648f55
KB
103 (*bdevsw[major(dev)].d_strategy)(bp);
104 /* pageout daemon doesn't wait for pushed pages */
663dbc72
BJ
105 if (flag & B_DIRTY) {
106 if (c < nbytes)
107 panic("big push");
ec67a3ce 108 return (0);
663dbc72 109 }
663dbc72
BJ
110 bp->b_un.b_addr += c;
111 bp->b_flags &= ~B_DONE;
112 if (bp->b_flags & B_ERROR) {
113 if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
114 panic("hard IO err in swap");
d03b3d84 115 swkill(p, "swap: read error from swap device");
699e2902 116 error = EIO;
663dbc72
BJ
117 }
118 nbytes -= c;
ec67a3ce
MK
119#ifdef SECSIZE
120 if (flag & B_PGIN && nbytes > 0)
121 panic("big pgin");
122#endif SECSIZE
919fe934 123 dblkno += btodb(c);
663dbc72 124 }
663dbc72 125 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
c5648f55 126 freeswbuf(bp);
699e2902 127 return (error);
663dbc72
BJ
128}
129
d668d9ba
SL
130/*
131 * Put a buffer on the clean list after I/O is done.
132 * Called from biodone.
133 */
134swdone(bp)
135 register struct buf *bp;
136{
137 register int s;
138
139 if (bp->b_flags & B_ERROR)
140 panic("IO err in push");
d95fc990 141 s = splbio();
d668d9ba
SL
142 bp->av_forw = bclnlist;
143 cnt.v_pgout++;
144 cnt.v_pgpgout += bp->b_bcount / NBPG;
145 bclnlist = bp;
146 if (bswlist.b_flags & B_WANTED)
147 wakeup((caddr_t)&proc[2]);
148 splx(s);
149}
150
663dbc72
BJ
151/*
152 * If rout == 0 then killed on swap error, else
153 * rout is the name of the routine where we ran out of
154 * swap space.
155 */
156swkill(p, rout)
157 struct proc *p;
158 char *rout;
159{
160
7cd10076
JB
161 printf("pid %d: %s\n", p->p_pid, rout);
162 uprintf("sorry, pid %d was killed in %s\n", p->p_pid, rout);
663dbc72
BJ
163 /*
164 * To be sure no looping (e.g. in vmsched trying to
165 * swap out) mark process locked in core (as though
166 * done by user) after killing it so noone will try
167 * to swap it out.
168 */
a30d2e97 169 psignal(p, SIGKILL);
663dbc72
BJ
170 p->p_flag |= SULOCK;
171}
172
663dbc72
BJ
173/*
174 * Raw I/O. The arguments are
175 * The strategy routine for the device
c5648f55
KB
176 * A buffer, which will either be a special buffer header owned
177 * exclusively by the device for this purpose, or NULL,
178 * indicating that we should use a swap buffer
663dbc72
BJ
179 * The device number
180 * Read/write flag
181 * Essentially all the work is computing physical addresses and
182 * validating them.
183 * If the user has the proper access privilidges, the process is
184 * marked 'delayed unlock' and the pages involved in the I/O are
185 * faulted and locked. After the completion of the I/O, the above pages
186 * are unlocked.
187 */
d6d7360b
BJ
188physio(strat, bp, dev, rw, mincnt, uio)
189 int (*strat)();
190 register struct buf *bp;
191 dev_t dev;
192 int rw;
c5648f55 193 u_int (*mincnt)();
d6d7360b 194 struct uio *uio;
663dbc72 195{
a196746e 196 register struct iovec *iov;
663dbc72
BJ
197 register int c;
198 char *a;
c5648f55
KB
199 int s, allocbuf = 0, error = 0;
200 struct buf *getswbuf();
ec67a3ce
MK
201#ifdef SECSIZE
202 int bsize;
203 struct partinfo dpart;
204#endif SECSIZE
663dbc72 205
ec67a3ce
MK
206#ifdef SECSIZE
207 if ((unsigned)major(dev) < nchrdev &&
208 (*cdevsw[major(dev)].d_ioctl)(dev, DIOCGPART, (caddr_t)&dpart,
209 FREAD) == 0)
210 bsize = dpart.disklab->d_secsize;
211 else
212 bsize = DEV_BSIZE;
213#endif SECSIZE
214 for (;;) {
215 if (uio->uio_iovcnt == 0)
216 return (0);
217 iov = uio->uio_iov;
218 if (useracc(iov->iov_base, (u_int)iov->iov_len,
219 rw==B_READ? B_WRITE : B_READ) == NULL)
220 return (EFAULT);
221 s = splbio();
222 while (bp->b_flags&B_BUSY) {
223 bp->b_flags |= B_WANTED;
224 sleep((caddr_t)bp, PRIBIO+1);
225 }
c5648f55
KB
226 if (!allocbuf) { /* only if sharing caller's buffer */
227 s = splbio();
228 while (bp->b_flags&B_BUSY) {
229 bp->b_flags |= B_WANTED;
230 sleep((caddr_t)bp, PRIBIO+1);
231 }
232 splx(s);
233 }
ec67a3ce
MK
234 bp->b_error = 0;
235 bp->b_proc = u.u_procp;
236#ifdef SECSIZE
237 bp->b_blksize = bsize;
238#endif SECSIZE
239 bp->b_un.b_addr = iov->iov_base;
240 while (iov->iov_len > 0) {
241 bp->b_flags = B_BUSY | B_PHYS | rw;
242 bp->b_dev = dev;
243#ifdef SECSIZE
244 bp->b_blkno = uio->uio_offset / bsize;
245#else SECSIZE
246 bp->b_blkno = btodb(uio->uio_offset);
247#endif SECSIZE
248 bp->b_bcount = iov->iov_len;
249 (*mincnt)(bp);
250 c = bp->b_bcount;
251 u.u_procp->p_flag |= SPHYSIO;
252 vslock(a = bp->b_un.b_addr, c);
253 physstrat(bp, strat, PRIBIO);
254 (void) splbio();
255 vsunlock(a, c, rw);
256 u.u_procp->p_flag &= ~SPHYSIO;
257 if (bp->b_flags&B_WANTED)
258 wakeup((caddr_t)bp);
259 splx(s);
260 c -= bp->b_resid;
261 bp->b_un.b_addr += c;
262 iov->iov_len -= c;
263 uio->uio_resid -= c;
264 uio->uio_offset += c;
265 /* temp kludge for tape drives */
266 if (bp->b_resid || (bp->b_flags&B_ERROR))
267 break;
268 }
269 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
270 error = geterror(bp);
961945a8 271 /* temp kludge for tape drives */
ec67a3ce
MK
272 if (bp->b_resid || error)
273 return (error);
274 uio->uio_iov++;
275 uio->uio_iovcnt--;
663dbc72 276 }
c5648f55
KB
277 if (allocbuf)
278 freeswbuf(bp);
279 return (error);
663dbc72
BJ
280}
281
c5648f55 282u_int
663dbc72 283minphys(bp)
d6d7360b 284 struct buf *bp;
663dbc72 285{
35a494b8
SL
286 if (bp->b_bcount > MAXPHYS)
287 bp->b_bcount = MAXPHYS;
663dbc72 288}
c5648f55
KB
289
290static
291struct buf *
292getswbuf(prio)
293 int prio;
294{
295 int s;
296 struct buf *bp;
297
298 s = splbio();
299 while (bswlist.av_forw == NULL) {
300 bswlist.b_flags |= B_WANTED;
301 sleep((caddr_t)&bswlist, prio);
302 }
303 bp = bswlist.av_forw;
304 bswlist.av_forw = bp->av_forw;
305 splx(s);
306 return (bp);
307}
308
309static
310freeswbuf(bp)
311 struct buf *bp;
312{
313 int s;
314
315 s = splbio();
316 bp->av_forw = bswlist.av_forw;
317 bswlist.av_forw = bp;
318 if (bswlist.b_flags & B_WANTED) {
319 bswlist.b_flags &= ~B_WANTED;
320 wakeup((caddr_t)&bswlist);
321 wakeup((caddr_t)&proc[2]);
322 }
323 splx(s);
324}
325
326rawread(dev, uio)
327 dev_t dev;
328 struct uio *uio;
329{
330 return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL,
331 dev, B_READ, minphys, uio));
332}
333
334rawwrite(dev, uio)
335 dev_t dev;
336 struct uio *uio;
337{
338 return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL,
339 dev, B_WRITE, minphys, uio));
340}