/*
 * Implementation of select and poll
 *
 * Copyright 2011-2012 Intel Corporation.
 *
 * This file is a derivative of fs/select.c from within the Linux kernel
 * source distribution, version 2.6.34; it has been modified (starting
 * in May 2011) to work within the context of the SCIF driver.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA.
 *
 * Initial comment from fs/select.c:
 *
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 * 4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 * 24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/file.h>
#include <linux/hrtimer.h>
#include <linux/module.h>

#include "mic/micscif.h"

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
#include <linux/sched/rt.h>
#endif

struct poll_table_page {
	struct poll_table_page *next;
	struct poll_table_entry *entry;
	struct poll_table_entry entries[0];
};
/*
 * Estimate expected accuracy in ns from a timespec.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

#define MAX_SLACK	(100 * NSEC_PER_MSEC)

static long __estimate_accuracy(struct timespec *tv)
{
	long slack;
	int divfactor = 1000;

	if (tv->tv_sec < 0)
		return 0;

	if (task_nice(current) > 0)
		divfactor = divfactor / 5;

	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > MAX_SLACK)
		return MAX_SLACK;

	return slack;
}

static long estimate_accuracy(struct timespec *tv)
{
	unsigned long ret;
	struct timespec now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

	if (rt_task(current))
		return 0;

	ktime_get_ts(&now);
	now = timespec_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}
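
/*
 * Worked example (added for illustration, not in the original source):
 * for a timeout 2s in the future, a normal task gets 0.1% slack, i.e.
 * 2s / 1000 = 2ms; a "nice" task (divfactor 200) gets 2s / 200 = 10ms.
 * Anything beyond 100s hits the MAX_SLACK cap of 100ms, and a task whose
 * timer_slack_ns exceeds the estimate keeps its own slack instead.
 */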

#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait(), do all the
 * work. poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
static void __pollwait(struct file *filp __attribute__((unused)),
		       wait_queue_head_t *wait_address, poll_table *p);

static void scif_poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}

static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
}

static void scif_poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page *p = pwq->table;
	int i;

	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry *entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}

static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}
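
/*
 * Capacity note (added for illustration, not in the original source): the
 * first N_INLINE_POLL_ENTRIES waiters live inline in the poll_wqueues
 * structure itself, so small polls never allocate. After that, each
 * page-sized poll_table_page holds roughly
 *
 *	(PAGE_SIZE - sizeof(struct poll_table_page)) / sizeof(struct poll_table_entry)
 *
 * entries; POLL_TABLE_FULL() detects when ->entry would step past the end
 * of the page, at which point a fresh page is chained onto ->next.
 */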

static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions. The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync. @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}
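
/*
 * Keyed-wakeup example (added for illustration, not in the original
 * source): entry->key is the event mask registered in __pollwait() below.
 * If an endpoint issues wake_up_poll(wq, POLLOUT) but this waiter
 * registered only POLLIN | POLLERR | POLLHUP, the key test above returns 0
 * and the polling task stays asleep; a NULL key (a plain wake_up()) always
 * wakes it.
 */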

/* Add a new entry */
static void __pollwait(struct file *filp __attribute__((unused)),
		       wait_queue_head_t *wait_address, poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);

	if (!entry)
		return;
	entry->filp = NULL;
	entry->wait_address = wait_address;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	entry->key = p->_key;
#else
	entry->key = p->key;
#endif
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes. First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration. Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0))
	smp_store_mb(pwq->triggered, 0);
#else
	set_mb(pwq->triggered, 0);
#endif

	return rc;
}
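
/*
 * Barrier pairing sketch (added for illustration, not in the original
 * source):
 *
 *	waker (pollwake)		sleeper (poll_schedule_timeout)
 *	----------------		-------------------------------
 *	make event visible		set_current_state(state)
 *	smp_wmb()			if (!pwq->triggered) sleep
 *	pwq->triggered = 1		... woken ...
 *	wake polling task		smp_store_mb(pwq->triggered, 0)
 *					re-check events next iteration
 *
 * If the wakeup fires after the ->triggered check but before the task
 * actually sleeps, default_wake_function() has already put the task back
 * in the runnable state, so schedule_hrtimeout_range() returns promptly
 * rather than losing the wakeup.
 */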

static unsigned int scif_poll_kernel(poll_table *pwait, struct endpt *ep)
{
	return __scif_pollfd(NULL, pwait, ep);
}

/*
 * Fish for pollable events on the pollfd->epd endpoint. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the endpoint's poll handler for waiting,
 * if non-NULL.
 */
static inline unsigned int do_pollfd(struct scif_pollepd *pollfd, poll_table *pwait)
{
	unsigned int mask;
	scif_epd_t epd;

	mask = 0;
	epd = pollfd->epd;
	if (epd) {
		/* POLLERR and POLLHUP are always reported, per poll(2). */
		if (pwait)
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
			pwait->_key = pollfd->events | POLLERR | POLLHUP;
#else
			pwait->key = pollfd->events | POLLERR | POLLHUP;
#endif
		mask = scif_poll_kernel(pwait, epd);
		/* Mask out unneeded events. */
		mask &= pollfd->events | POLLERR | POLLHUP;
	}
	pollfd->revents = mask;

	return mask;
}
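
/*
 * Example of the masking above (added for illustration, not in the
 * original source): a caller that sets pollfd->events = POLLOUT on an
 * endpoint that is also readable gets revents = POLLOUT, the POLLIN bit
 * being filtered out, while an error or hangup surfaces as POLLERR/POLLHUP
 * even though the caller never asked for those bits.
 */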

static int do_poll(unsigned int nfds, struct scif_pollepd *ufds,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	poll_table *pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0, i = 0;
	unsigned long slack = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = estimate_accuracy(end_time);

	for (;;) {
		for (i = 0; i < nfds; i++) {
			/*
			 * Fish for events. If we found one, record it
			 * and kill the poll_table, so we don't
			 * needlessly register any other waiters after
			 * this. They'll get immediately deregistered
			 * when we break out and return.
			 */
			if (do_pollfd(ufds + i, pt)) {
				count++;
				pt = NULL;
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table to them on the next loop iteration.
		 */
		pt = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

static int do_scif_poll(struct scif_pollepd *ufds, unsigned int nfds,
			struct timespec *end_time)
{
	struct poll_wqueues table;
	int epdcount;

	scif_poll_initwait(&table);
	epdcount = do_poll(nfds, ufds, &table, end_time);
	scif_poll_freewait(&table);

	return epdcount;
}

/*
 * Add two timespec values and do a safety check for overflow.
 * It's assumed that both values are valid (>= 0).
 */
static struct timespec scif_timespec_add_safe(const struct timespec lhs,
					      const struct timespec rhs)
{
	struct timespec res;

	set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
				lhs.tv_nsec + rhs.tv_nsec);

	if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
		res.tv_sec = TIME_T_MAX;

	return res;
}
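
/*
 * Overflow example (added for illustration, not in the original source):
 * if lhs.tv_sec is already near TIME_T_MAX, the addition can wrap and
 * res.tv_sec come out smaller than either input; the check above then
 * pins res.tv_sec to TIME_T_MAX, so an absurdly large timeout saturates
 * instead of expiring immediately.
 */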

/**
 * scif_poll_select_set_timeout - helper function to set up the timeout value
 * @to: pointer to timespec variable for the final timeout
 * @sec: seconds (from user space)
 * @nsec: nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
static int scif_poll_select_set_timeout(struct timespec *to, long sec, long nsec)
{
	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		ktime_get_ts(to);
		*to = scif_timespec_add_safe(*to, ts);
	}
	return 0;
}
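
/*
 * Worked example (added for illustration, not in the original source):
 * scif_poll_select_set_timeout(&ts, 1, 500000000) stores "now + 1.5s" in
 * ts; (0, 0) leaves ts zeroed so do_poll() can take the no-wait fast path;
 * and (0, NSEC_PER_SEC) fails with -EINVAL because the nanosecond part is
 * not normalized (it must be < NSEC_PER_SEC).
 */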

int scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs)
{
	struct timespec end_time, *to = NULL;

	if (timeout_msecs >= 0) {
		to = &end_time;
		scif_poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
					     NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	return do_scif_poll(ufds, nfds, to);
}
EXPORT_SYMBOL(scif_poll);
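
/*
 * Usage sketch (added for illustration, not in the original source): how a
 * kernel-mode SCIF client might wait for incoming data on a connected
 * endpoint. The scif_pollepd layout and scif_recv() follow the public SCIF
 * API; error handling is abbreviated and "epd" is assumed to come from a
 * prior scif_connect() or scif_accept().
 *
 *	struct scif_pollepd pollepd;
 *	int ret;
 *
 *	pollepd.epd = epd;
 *	pollepd.events = POLLIN;
 *	pollepd.revents = 0;
 *
 *	// Block for up to 2000 ms. scif_poll() returns the number of
 *	// endpoints with pending events, 0 on timeout, or -EINTR if a
 *	// signal arrived while waiting.
 *	ret = scif_poll(&pollepd, 1, 2000);
 *	if (ret > 0 && (pollepd.revents & POLLIN))
 *		scif_recv(epd, buf, len, 0);
 */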