/*
 * Implementation of select and poll
 *
 * Copyright 2011-2012 Intel Corporation.
 *
 * This file is a derivative of fs/select.c from within the Linux kernel
 * source distribution, version 2.6.34; it has been modified (starting
 * in May 2011) to work within the context of the SCIF driver.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Initial comment from fs/select.c:
 *
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 * flag set in its personality we do *not* modify the given timeout
 * parameter to reflect time remaining.
 *
 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
 * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/hrtimer.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/poll.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
#include <linux/sched/rt.h>	/* rt_task() lives here from 3.10 on */
#endif
/*
 * SCIF-private declarations (struct endpt, struct scif_pollepd,
 * __scif_pollfd()) come from the driver's own headers, included elsewhere.
 */
struct poll_table_page {
	struct poll_table_page *next;
	struct poll_table_entry *entry;
	struct poll_table_entry entries[0];
};
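
/*
 * Note: each poll_table_page occupies exactly one page.  'entry' points at
 * the next free slot, 'entries[]' is the in-page storage, and 'next' chains
 * in a further page once this one fills up (see POLL_TABLE_FULL below).
 */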
/*
 * Estimate expected accuracy in ns from a timespec.
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 *
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */
#define MAX_SLACK (100 * NSEC_PER_MSEC)
static long __estimate_accuracy(struct timespec *tv)
{
	long slack;
	int divfactor = 1000;

	if (task_nice(current) > 0)
		divfactor = divfactor / 5;

	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > MAX_SLACK)
		return MAX_SLACK;

	return slack;
}
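
/*
 * Worked example (illustrative): with a 1 s timeout, divfactor is 1000, so
 * slack = 10^9 ns / 1000 = 1 ms; a "nice" task (divfactor 200) gets 5 ms.
 * The MAX_SLACK cap only bites for timeouts beyond ~100 s (~20 s if nice).
 */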
static long estimate_accuracy(struct timespec *tv)
{
	unsigned long ret;
	struct timespec now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */
	if (rt_task(current))
		return 0;

	ktime_get_ts(&now);
	now = timespec_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}
#define POLL_TABLE_FULL(table) \
((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
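
/*
 * That is, a table page is "full" once one more entry would extend past the
 * end of the page; with 4 KiB pages this leaves room for several dozen
 * poll_table_entry slots before poll_get_entry() must chain a new page.
 */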
/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() make all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
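
/*
 * Rough flow (sketch): do_poll() -> do_pollfd() -> the endpoint's poll
 * handler -> poll_wait() -> __pollwait(), which queues a poll_table_entry
 * on the endpoint's waitqueue.  A later event wakes that queue, running
 * pollwake() -> __pollwake(), which sets pwq->triggered and wakes the
 * polling task so do_poll() rescans the endpoints.
 */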
static void __pollwait(struct file *filp __attribute__((unused)),
		       wait_queue_head_t *wait_address, poll_table *p);
static void scif_poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	/* Zero the bookkeeping the poll loop relies on (as poll_initwait() does) */
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
}
static void scif_poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page *p = pwq->table;
	int i;

	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry *entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync,
		      void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync,
		    void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}
/* Add a new entry */
static void __pollwait(struct file *filp __attribute__((unused)),
		       wait_queue_head_t *wait_address, poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->wait_address = wait_address;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	entry->key = p->_key;
#else
	entry->key = p->key;
#endif
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}
/* Returns 0 if the timeout expired, nonzero (-EINTR) if woken early. */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0))
	smp_store_mb(pwq->triggered, 0);
#else
	set_mb(pwq->triggered, 0);
#endif

	return rc;
}
static unsigned int scif_poll_kernel(poll_table *pwait, struct endpt *ep)
{
	return __scif_pollfd(NULL, pwait, ep);
}
/*
 * Fish for pollable events on the pollfd->epd endpoint descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the endpoint-provided poll handler for
 * waiting, if non-NULL.
 */
static inline unsigned int do_pollfd(struct scif_pollepd *pollfd,
				     poll_table *pwait)
{
	unsigned int mask = 0;
	struct endpt *epd = (struct endpt *)pollfd->epd;

	if (epd) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		pwait->_key = pollfd->events | POLLERR | POLLHUP;
#else
		pwait->key = pollfd->events | POLLERR | POLLHUP;
#endif
		mask = scif_poll_kernel(pwait, epd);
		/* Mask out unneeded events. */
		mask &= pollfd->events | POLLERR | POLLHUP;
	}
	pollfd->revents = mask;
	return mask;
}
static int do_poll(unsigned int nfds, struct scif_pollepd *ufds,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	poll_table *pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0, i = 0;
	unsigned long slack = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = estimate_accuracy(end_time);

	for (;;) {
		for (i = 0; i < nfds; i++) {
			/*
			 * Fish for events. If we found one, record it
			 * and kill the poll_table, so we don't
			 * needlessly register any other waiters after
			 * this. They'll get immediately deregistered
			 * when we break out and return.
			 */
			if (do_pollfd(ufds + i, pt)) {
				count++;
				pt = NULL;
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table to them on the next loop iteration.
		 */
		pt = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}
static int do_scif_poll(struct scif_pollepd *ufds, unsigned int nfds,
			struct timespec *end_time)
{
	struct poll_wqueues table;
	int epdcount;

	scif_poll_initwait(&table);
	epdcount = do_poll(nfds, ufds, &table, end_time);
	scif_poll_freewait(&table);

	return epdcount;
}
/*
 * Add two timespec values and do a safety check for overflow.
 * It's assumed that both values are valid (>= 0).
 */
#define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)

static struct timespec scif_timespec_add_safe(const struct timespec lhs,
					      const struct timespec rhs)
{
	struct timespec res;

	set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
				lhs.tv_nsec + rhs.tv_nsec);

	/* tv_sec wrapped: saturate instead of returning a negative time */
	if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
		res.tv_sec = TIME_T_MAX;

	return res;
}
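
/*
 * Illustrative overflow case: if one operand already holds a time near
 * TIME_T_MAX and a large timeout is added, tv_sec wraps negative; the
 * comparison above catches the wrap and clamps the result to TIME_T_MAX.
 */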
/**
 * scif_poll_select_set_timeout - helper function to setup the timeout value
 * @to:   pointer to timespec variable for the final timeout
 * @sec:  seconds (from user space)
 * @nsec: nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
static int scif_poll_select_set_timeout(struct timespec *to, long sec,
					long nsec)
{
	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		ktime_get_ts(to);
		*to = scif_timespec_add_safe(*to, ts);
	}
	return 0;
}
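
/*
 * Example (illustrative): a caller passing sec=2 and nsec=500 * NSEC_PER_MSEC
 * leaves *to holding "now + 2.5 s" on the monotonic clock, which do_poll()
 * later converts to an absolute ktime_t expiry for the hrtimer.
 */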
int scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs)
{
	struct timespec end_time, *to = NULL;

	if (timeout_msecs >= 0) {
		to = &end_time;
		scif_poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	return do_scif_poll(ufds, nfds, to);
}
EXPORT_SYMBOL(scif_poll);
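
#if 0
/*
 * Usage sketch (not compiled): waiting up to 2 s for data on two SCIF
 * endpoints from another kernel module.  'ep0', 'ep1', and the function
 * name are hypothetical placeholders, assuming endpoints were obtained
 * elsewhere via scif_open()/scif_connect().
 */
static int example_wait_for_data(scif_epd_t ep0, scif_epd_t ep1)
{
	struct scif_pollepd polls[] = {
		{ .epd = ep0, .events = POLLIN },
		{ .epd = ep1, .events = POLLIN },
	};
	int ready = scif_poll(polls, ARRAY_SIZE(polls), 2000 /* ms */);

	if (ready < 0)
		return ready;		/* signal or internal error */
	if (!ready)
		return -ETIMEDOUT;	/* no events within 2 s */
	if (polls[0].revents & POLLIN)
		;			/* ep0 is readable */
	return 0;
}
#endif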