Commit | Line | Data |
---|---|---|
da7c5cc6 | 1 | /* |
0880b18e | 2 | * Copyright (c) 1982, 1986 Regents of the University of California. |
da7c5cc6 KM |
3 | * All rights reserved. The Berkeley software License Agreement |
4 | * specifies the terms and conditions for redistribution. | |
5 | * | |
26bd0870 | 6 | * @(#)kern_physio.c 7.12 (Berkeley) %G% |
da7c5cc6 | 7 | */ |
961945a8 | 8 | |
94368568 JB |
9 | #include "param.h" |
10 | #include "systm.h" | |
94368568 JB |
11 | #include "user.h" |
12 | #include "buf.h" | |
13 | #include "conf.h" | |
14 | #include "proc.h" | |
15 | #include "seg.h" | |
16 | #include "vm.h" | |
17 | #include "trace.h" | |
18 | #include "map.h" | |
c4ec2128 | 19 | #include "vnode.h" |
d301d150 KM |
20 | |
21 | #include "machine/pte.h" | |
ec67a3ce MK |
22 | #ifdef SECSIZE |
23 | #include "file.h" | |
24 | #include "ioctl.h" | |
25 | #include "disklabel.h" | |
26 | #endif SECSIZE | |
663dbc72 | 27 | |
663dbc72 BJ |
28 | /* |
29 | * Swap IO headers - | |
30 | * They contain the necessary information for the swap I/O. | |
31 | * At any given time, a swap header can be in three | |
32 | * different lists. When free it is in the free list, | |
33 | * when allocated and the I/O queued, it is on the swap | |
34 | * device list, and finally, if the operation was a dirty | |
35 | * page push, when the I/O completes, it is inserted | |
36 | * in a list of cleaned pages to be processed by the pageout daemon. | |
37 | */ | |
4c05b581 | 38 | struct buf *swbuf; |
663dbc72 | 39 | |
663dbc72 BJ |
40 | /* |
41 | * swap I/O - | |
42 | * | |
43 | * If the flag indicates a dirty page push initiated | |
44 | * by the pageout daemon, we map the page into the i th | |
45 | * virtual page of process 2 (the daemon itself) where i is | |
46 | * the index of the swap header that has been allocated. | |
47 | * We simply initialize the header and queue the I/O but | |
48 | * do not wait for completion. When the I/O completes, | |
ec67a3ce | 49 | * biodone() will link the header to a list of cleaned |
663dbc72 BJ |
50 | * pages to be processed by the pageout daemon. |
51 | */ | |
swap(p, dblkno, addr, nbytes, rdflg, flag, vp, pfcent)
	struct proc *p;
	swblk_t dblkno;		/* starting swap block number */
	caddr_t addr;		/* user virtual address of transfer */
	int nbytes, rdflg, flag;	/* size; B_READ/B_WRITE; B_DIRTY/B_PGIN/... */
	struct vnode *vp;	/* swap device vnode */
	u_int pfcent;		/* page-frame/cluster id, recorded for swdone */
{
	register struct buf *bp;
	register struct pte *dpte, *vpte;
	register u_int c;
	int p2dp, s, error = 0;
	struct buf *getswbuf();
	int swdone();

	/* Borrow a swap I/O header; may sleep at PSWP+1 until one is free. */
	bp = getswbuf(PSWP+1);
	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
#ifdef SECSIZE
	bp->b_blksize = DEV_BSIZE;
#endif /* SECSIZE */
	/* Count plain swap traffic; dirty-push and pagein I/O is counted elsewhere. */
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		/*
		 * Dirty page push: map the source pages into process 2's
		 * (the pageout daemon's) address space, using the slot that
		 * corresponds to this swap header, and arrange for swdone()
		 * to run at completion instead of waiting here.
		 */
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			/* every page pushed must be resident and real */
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(dptov(&proc[2], p2dp));
		bp->b_flags |= B_CALL;		/* biodone() calls b_iodone */
		bp->b_iodone = swdone;
		bp->b_pfcent = pfcent;
	} else
		bp->b_un.b_addr = addr;
	/* Issue the transfer in chunks no larger than minphys() allows. */
	while (nbytes > 0) {
		bp->b_blkno = dblkno;
		if (bp->b_vp)		/* drop vnode left from a prior use */
			brelvp(bp);
		VHOLD(vp);
		bp->b_vp = vp;
		bp->b_dev = vp->v_rdev;
		bp->b_bcount = nbytes;
		if ((bp->b_flags & B_READ) == 0)
			vp->v_numoutput++;
		minphys(bp);		/* clamp b_bcount to MAXPHYS */
		c = bp->b_bcount;
#ifdef TRACE
		trace(TR_SWAPIO, vp, bp->b_blkno);
#endif
		VOP_STRATEGY(bp);
		/* pageout daemon doesn't wait for pushed pages */
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			return (0);
		}
		/* Synchronous path: advance past the chunk just transferred. */
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			/* u-area/page-table or write errors are unrecoverable */
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, "swap: read error from swap device");
			error = EIO;
		}
		nbytes -= c;
#ifdef SECSIZE
		if (flag & B_PGIN && nbytes > 0)
			panic("big pgin");
#endif /* SECSIZE */
		dblkno += btodb(c);
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	freeswbuf(bp);
	return (error);
}
134 | ||
d668d9ba SL |
135 | /* |
136 | * Put a buffer on the clean list after I/O is done. | |
137 | * Called from biodone. | |
138 | */ | |
139 | swdone(bp) | |
140 | register struct buf *bp; | |
141 | { | |
142 | register int s; | |
143 | ||
144 | if (bp->b_flags & B_ERROR) | |
145 | panic("IO err in push"); | |
d95fc990 | 146 | s = splbio(); |
d668d9ba SL |
147 | bp->av_forw = bclnlist; |
148 | cnt.v_pgout++; | |
149 | cnt.v_pgpgout += bp->b_bcount / NBPG; | |
150 | bclnlist = bp; | |
151 | if (bswlist.b_flags & B_WANTED) | |
152 | wakeup((caddr_t)&proc[2]); | |
153 | splx(s); | |
154 | } | |
155 | ||
663dbc72 BJ |
156 | /* |
157 | * If rout == 0 then killed on swap error, else | |
158 | * rout is the name of the routine where we ran out of | |
159 | * swap space. | |
160 | */ | |
161 | swkill(p, rout) | |
162 | struct proc *p; | |
163 | char *rout; | |
164 | { | |
165 | ||
7cd10076 JB |
166 | printf("pid %d: %s\n", p->p_pid, rout); |
167 | uprintf("sorry, pid %d was killed in %s\n", p->p_pid, rout); | |
663dbc72 BJ |
168 | /* |
169 | * To be sure no looping (e.g. in vmsched trying to | |
170 | * swap out) mark process locked in core (as though | |
171 | * done by user) after killing it so noone will try | |
172 | * to swap it out. | |
173 | */ | |
a30d2e97 | 174 | psignal(p, SIGKILL); |
663dbc72 BJ |
175 | p->p_flag |= SULOCK; |
176 | } | |
177 | ||
663dbc72 BJ |
178 | /* |
179 | * Raw I/O. The arguments are | |
180 | * The strategy routine for the device | |
c5648f55 KB |
181 | * A buffer, which will either be a special buffer header owned |
182 | * exclusively by the device for this purpose, or NULL, | |
183 | * indicating that we should use a swap buffer | |
663dbc72 BJ |
184 | * The device number |
185 | * Read/write flag | |
186 | * Essentially all the work is computing physical addresses and | |
187 | * validating them. | |
188 | * If the user has the proper access privilidges, the process is | |
189 | * marked 'delayed unlock' and the pages involved in the I/O are | |
190 | * faulted and locked. After the completion of the I/O, the above pages | |
191 | * are unlocked. | |
192 | */ | |
d6d7360b BJ |
193 | physio(strat, bp, dev, rw, mincnt, uio) |
194 | int (*strat)(); | |
195 | register struct buf *bp; | |
196 | dev_t dev; | |
197 | int rw; | |
c5648f55 | 198 | u_int (*mincnt)(); |
d6d7360b | 199 | struct uio *uio; |
663dbc72 | 200 | { |
a196746e | 201 | register struct iovec *iov; |
58c3cad7 | 202 | register int requested, done; |
663dbc72 | 203 | char *a; |
c5648f55 KB |
204 | int s, allocbuf = 0, error = 0; |
205 | struct buf *getswbuf(); | |
ec67a3ce MK |
206 | #ifdef SECSIZE |
207 | int bsize; | |
208 | struct partinfo dpart; | |
209 | #endif SECSIZE | |
663dbc72 | 210 | |
ec67a3ce MK |
211 | #ifdef SECSIZE |
212 | if ((unsigned)major(dev) < nchrdev && | |
213 | (*cdevsw[major(dev)].d_ioctl)(dev, DIOCGPART, (caddr_t)&dpart, | |
214 | FREAD) == 0) | |
215 | bsize = dpart.disklab->d_secsize; | |
216 | else | |
217 | bsize = DEV_BSIZE; | |
218 | #endif SECSIZE | |
219 | for (;;) { | |
220 | if (uio->uio_iovcnt == 0) | |
221 | return (0); | |
222 | iov = uio->uio_iov; | |
223 | if (useracc(iov->iov_base, (u_int)iov->iov_len, | |
224 | rw==B_READ? B_WRITE : B_READ) == NULL) | |
225 | return (EFAULT); | |
226 | s = splbio(); | |
227 | while (bp->b_flags&B_BUSY) { | |
228 | bp->b_flags |= B_WANTED; | |
229 | sleep((caddr_t)bp, PRIBIO+1); | |
230 | } | |
c5648f55 KB |
231 | if (!allocbuf) { /* only if sharing caller's buffer */ |
232 | s = splbio(); | |
233 | while (bp->b_flags&B_BUSY) { | |
234 | bp->b_flags |= B_WANTED; | |
235 | sleep((caddr_t)bp, PRIBIO+1); | |
236 | } | |
237 | splx(s); | |
238 | } | |
ec67a3ce MK |
239 | bp->b_error = 0; |
240 | bp->b_proc = u.u_procp; | |
241 | #ifdef SECSIZE | |
242 | bp->b_blksize = bsize; | |
243 | #endif SECSIZE | |
244 | bp->b_un.b_addr = iov->iov_base; | |
245 | while (iov->iov_len > 0) { | |
246 | bp->b_flags = B_BUSY | B_PHYS | rw; | |
247 | bp->b_dev = dev; | |
248 | #ifdef SECSIZE | |
249 | bp->b_blkno = uio->uio_offset / bsize; | |
250 | #else SECSIZE | |
251 | bp->b_blkno = btodb(uio->uio_offset); | |
252 | #endif SECSIZE | |
253 | bp->b_bcount = iov->iov_len; | |
254 | (*mincnt)(bp); | |
255 | c = bp->b_bcount; | |
256 | u.u_procp->p_flag |= SPHYSIO; | |
257 | vslock(a = bp->b_un.b_addr, c); | |
258 | physstrat(bp, strat, PRIBIO); | |
259 | (void) splbio(); | |
260 | vsunlock(a, c, rw); | |
261 | u.u_procp->p_flag &= ~SPHYSIO; | |
262 | if (bp->b_flags&B_WANTED) | |
263 | wakeup((caddr_t)bp); | |
264 | splx(s); | |
265 | c -= bp->b_resid; | |
266 | bp->b_un.b_addr += c; | |
267 | iov->iov_len -= c; | |
268 | uio->uio_resid -= c; | |
269 | uio->uio_offset += c; | |
270 | /* temp kludge for tape drives */ | |
271 | if (bp->b_resid || (bp->b_flags&B_ERROR)) | |
272 | break; | |
273 | } | |
274 | bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS); | |
275 | error = geterror(bp); | |
ec67a3ce MK |
276 | if (bp->b_resid || error) |
277 | return (error); | |
278 | uio->uio_iov++; | |
279 | uio->uio_iovcnt--; | |
663dbc72 | 280 | } |
c5648f55 KB |
281 | if (allocbuf) |
282 | freeswbuf(bp); | |
283 | return (error); | |
663dbc72 BJ |
284 | } |
285 | ||
c5648f55 | 286 | u_int |
663dbc72 | 287 | minphys(bp) |
d6d7360b | 288 | struct buf *bp; |
663dbc72 | 289 | { |
35a494b8 SL |
290 | if (bp->b_bcount > MAXPHYS) |
291 | bp->b_bcount = MAXPHYS; | |
663dbc72 | 292 | } |
c5648f55 KB |
293 | |
294 | static | |
295 | struct buf * | |
296 | getswbuf(prio) | |
297 | int prio; | |
298 | { | |
299 | int s; | |
300 | struct buf *bp; | |
301 | ||
302 | s = splbio(); | |
303 | while (bswlist.av_forw == NULL) { | |
304 | bswlist.b_flags |= B_WANTED; | |
305 | sleep((caddr_t)&bswlist, prio); | |
306 | } | |
307 | bp = bswlist.av_forw; | |
308 | bswlist.av_forw = bp->av_forw; | |
309 | splx(s); | |
310 | return (bp); | |
311 | } | |
312 | ||
313 | static | |
314 | freeswbuf(bp) | |
315 | struct buf *bp; | |
316 | { | |
317 | int s; | |
318 | ||
319 | s = splbio(); | |
320 | bp->av_forw = bswlist.av_forw; | |
321 | bswlist.av_forw = bp; | |
343a57bd KM |
322 | if (bp->b_vp) |
323 | brelvp(bp); | |
c5648f55 KB |
324 | if (bswlist.b_flags & B_WANTED) { |
325 | bswlist.b_flags &= ~B_WANTED; | |
326 | wakeup((caddr_t)&bswlist); | |
327 | wakeup((caddr_t)&proc[2]); | |
328 | } | |
329 | splx(s); | |
330 | } | |
331 | ||
332 | rawread(dev, uio) | |
333 | dev_t dev; | |
334 | struct uio *uio; | |
335 | { | |
336 | return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, | |
337 | dev, B_READ, minphys, uio)); | |
338 | } | |
339 | ||
340 | rawwrite(dev, uio) | |
341 | dev_t dev; | |
342 | struct uio *uio; | |
343 | { | |
344 | return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, | |
345 | dev, B_WRITE, minphys, uio)); | |
346 | } |