| 1 | /* |
| 2 | * Implementation of select and poll |
| 3 | * |
| 4 | * Copyright 2011-2012 Intel Corporation. |
| 5 | * |
| 6 | * This file is a derivative of fs/select.c from within the Linux kernel |
| 7 | * source distribution, version 2.6.34; it has been modified (starting |
| 8 | * in May 2011) to work within the context of the SCIF driver. |
| 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License, version 2, as |
| 12 | * published by the Free Software Foundation. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but |
| 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 17 | * General Public License for more details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU General Public License |
| 20 | * along with this program; if not, write to the Free Software |
| 21 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 |
| 22 | * USA. |
| 23 | * |
| 24 | * Initial comment from fs/select.c: |
| 25 | * |
| 26 | * This file contains the procedures for the handling of select and poll |
| 27 | * |
| 28 | * Created for Linux based loosely upon Mathius Lattner's minix |
| 29 | * patches by Peter MacDonald. Heavily edited by Linus. |
| 30 | * |
| 31 | * 4 February 1994 |
| 32 | * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS |
| 33 | * flag set in its personality we do *not* modify the given timeout |
| 34 | * parameter to reflect time remaining. |
| 35 | * |
| 36 | * 24 January 2000 |
| 37 | * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation |
| 38 | * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). |
| 39 | */ |
| 40 | |
| 41 | #include <linux/kernel.h> |
| 42 | #include <linux/sched.h> |
| 43 | #include <linux/file.h> |
| 44 | #include <linux/hrtimer.h> |
| 45 | #include <linux/module.h> |
| 46 | |
| 47 | #include "mic/micscif.h" |
| 48 | |
| 49 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) |
| 50 | #include <linux/sched/rt.h> |
| 51 | #endif |
| 52 | |
| 53 | struct poll_table_page { |
| 54 | struct poll_table_page *next; |
| 55 | struct poll_table_entry *entry; |
	struct poll_table_entry entries[];
| 57 | }; |
| 58 | |
| 59 | /* |
 * Estimate expected accuracy in ns from a timespec.
| 61 | * |
| 62 | * After quite a bit of churning around, we've settled on |
| 63 | * a simple thing of taking 0.1% of the timeout as the |
| 64 | * slack, with a cap of 100 msec. |
| 65 | * "nice" tasks get a 0.5% slack instead. |
| 66 | * |
| 67 | * Consider this comment an open invitation to come up with even |
 * better solutions.
| 69 | */ |
| 70 | |
| 71 | #define MAX_SLACK (100 * NSEC_PER_MSEC) |
| 72 | |
| 73 | static long __estimate_accuracy(struct timespec *tv) |
| 74 | { |
| 75 | long slack; |
| 76 | int divfactor = 1000; |
| 77 | |
| 78 | if (tv->tv_sec < 0) |
| 79 | return 0; |
| 80 | |
| 81 | if (task_nice(current) > 0) |
| 82 | divfactor = divfactor / 5; |
| 83 | |
| 84 | if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) |
| 85 | return MAX_SLACK; |
| 86 | |
| 87 | slack = tv->tv_nsec / divfactor; |
| 88 | slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); |
| 89 | |
| 90 | if (slack > MAX_SLACK) |
| 91 | return MAX_SLACK; |
| 92 | |
| 93 | return slack; |
| 94 | } |
| 95 | |
| 96 | static long estimate_accuracy(struct timespec *tv) |
| 97 | { |
| 98 | unsigned long ret; |
| 99 | struct timespec now; |
| 100 | |
| 101 | /* |
| 102 | * Realtime tasks get a slack of 0 for obvious reasons. |
| 103 | */ |
| 104 | |
| 105 | if (rt_task(current)) |
| 106 | return 0; |
| 107 | |
| 108 | ktime_get_ts(&now); |
| 109 | now = timespec_sub(*tv, now); |
| 110 | ret = __estimate_accuracy(&now); |
| 111 | if (ret < current->timer_slack_ns) |
| 112 | return current->timer_slack_ns; |
| 113 | return ret; |
| 114 | } |
| 115 | |
| 116 | #define POLL_TABLE_FULL(table) \ |
| 117 | ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) |
| 118 | |
| 119 | /* |
| 120 | * Ok, Peter made a complicated, but straightforward multiple_wait() function. |
| 121 | * I have rewritten this, taking some shortcuts: This code may not be easy to |
| 122 | * follow, but it should be free of race-conditions, and it's practical. If you |
| 123 | * understand what I'm doing here, then you understand how the linux |
| 124 | * sleep/wakeup mechanism works. |
| 125 | * |
 * Two very simple procedures, poll_wait() and poll_freewait() do all the
 * work. poll_wait() is an inline-function defined in <linux/poll.h>,
| 128 | * as all select/poll functions have to call it to add an entry to the |
| 129 | * poll table. |
| 130 | */ |
| 131 | static void __pollwait(struct file *filp __attribute__((unused)), wait_queue_head_t *wait_address, |
| 132 | poll_table *p); |
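
/*
 * For illustration only: a handler reached through this poll table is
 * expected to look roughly like the sketch below (made-up helper and
 * field names; not the actual __scif_pollfd() implementation):
 *
 *	unsigned int example_poll(struct file *f, poll_table *wait,
 *				  struct endpt *ep)
 *	{
 *		unsigned int mask = 0;
 *
 *		poll_wait(f, &ep->example_wq, wait); // ends up in __pollwait()
 *		if (example_data_ready(ep))
 *			mask |= POLLIN | POLLRDNORM;
 *		if (example_space_free(ep))
 *			mask |= POLLOUT | POLLWRNORM;
 *		return mask;
 *	}
 */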
| 133 | |
| 134 | static void scif_poll_initwait(struct poll_wqueues *pwq) |
| 135 | { |
| 136 | init_poll_funcptr(&pwq->pt, __pollwait); |
| 137 | pwq->polling_task = current; |
| 138 | pwq->triggered = 0; |
| 139 | pwq->error = 0; |
| 140 | pwq->table = NULL; |
| 141 | pwq->inline_index = 0; |
| 142 | } |
| 143 | |
| 144 | static void free_poll_entry(struct poll_table_entry *entry) |
| 145 | { |
| 146 | remove_wait_queue(entry->wait_address, &entry->wait); |
| 147 | } |
| 148 | |
| 149 | static void scif_poll_freewait(struct poll_wqueues *pwq) |
| 150 | { |
	struct poll_table_page *p = pwq->table;
| 152 | int i; |
| 153 | for (i = 0; i < pwq->inline_index; i++) |
| 154 | free_poll_entry(pwq->inline_entries + i); |
| 155 | while (p) { |
| 156 | struct poll_table_entry *entry; |
| 157 | struct poll_table_page *old; |
| 158 | |
| 159 | entry = p->entry; |
| 160 | do { |
| 161 | entry--; |
| 162 | free_poll_entry(entry); |
| 163 | } while (entry > p->entries); |
| 164 | old = p; |
| 165 | p = p->next; |
| 166 | free_page((unsigned long) old); |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) |
| 171 | { |
| 172 | struct poll_table_page *table = p->table; |
| 173 | |
| 174 | if (p->inline_index < N_INLINE_POLL_ENTRIES) |
| 175 | return p->inline_entries + p->inline_index++; |
| 176 | |
| 177 | if (!table || POLL_TABLE_FULL(table)) { |
| 178 | struct poll_table_page *new_table; |
| 179 | |
| 180 | new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); |
| 181 | if (!new_table) { |
| 182 | p->error = -ENOMEM; |
| 183 | return NULL; |
| 184 | } |
| 185 | new_table->entry = new_table->entries; |
| 186 | new_table->next = table; |
| 187 | p->table = new_table; |
| 188 | table = new_table; |
| 189 | } |
| 190 | |
| 191 | return table->entry++; |
| 192 | } |
| 193 | |
| 194 | static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) |
| 195 | { |
| 196 | struct poll_wqueues *pwq = wait->private; |
| 197 | DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); |
| 198 | |
| 199 | /* |
| 200 | * Although this function is called under waitqueue lock, LOCK |
| 201 | * doesn't imply write barrier and the users expect write |
| 202 | * barrier semantics on wakeup functions. The following |
| 203 | * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() |
	 * and is paired with set_mb()/smp_store_mb() in
	 * poll_schedule_timeout().
| 205 | */ |
| 206 | smp_wmb(); |
| 207 | pwq->triggered = 1; |
| 208 | |
| 209 | /* |
| 210 | * Perform the default wake up operation using a dummy |
| 211 | * waitqueue. |
| 212 | * |
| 213 | * TODO: This is hacky but there currently is no interface to |
| 214 | * pass in @sync. @sync is scheduled to be removed and once |
| 215 | * that happens, wake_up_process() can be used directly. |
| 216 | */ |
| 217 | return default_wake_function(&dummy_wait, mode, sync, key); |
| 218 | } |
| 219 | |
| 220 | static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) |
| 221 | { |
| 222 | struct poll_table_entry *entry; |
| 223 | |
| 224 | entry = container_of(wait, struct poll_table_entry, wait); |
| 225 | if (key && !((unsigned long)key & entry->key)) |
| 226 | return 0; |
| 227 | return __pollwake(wait, mode, sync, key); |
| 228 | } |
| 229 | |
| 230 | /* Add a new entry */ |
| 231 | static void __pollwait(struct file *filp __attribute__((unused)), wait_queue_head_t *wait_address, |
| 232 | poll_table *p) |
| 233 | { |
| 234 | struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); |
| 235 | struct poll_table_entry *entry = poll_get_entry(pwq); |
| 236 | if (!entry) |
| 237 | return; |
| 238 | entry->filp = NULL; |
| 239 | entry->wait_address = wait_address; |
| 240 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) |
| 241 | entry->key = p->_key; |
| 242 | #else |
| 243 | entry->key = p->key; |
| 244 | #endif |
| 245 | init_waitqueue_func_entry(&entry->wait, pollwake); |
| 246 | entry->wait.private = pwq; |
| 247 | add_wait_queue(wait_address, &entry->wait); |
| 248 | } |
| 249 | |
| 250 | int poll_schedule_timeout(struct poll_wqueues *pwq, int state, |
| 251 | ktime_t *expires, unsigned long slack) |
| 252 | { |
| 253 | int rc = -EINTR; |
| 254 | |
| 255 | set_current_state(state); |
| 256 | if (!pwq->triggered) |
| 257 | rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); |
| 258 | __set_current_state(TASK_RUNNING); |
| 259 | |
| 260 | /* |
| 261 | * Prepare for the next iteration. |
| 262 | * |
	 * The following set_mb()/smp_store_mb() serves two purposes.
	 * First, it's the counterpart rmb of the wmb in pollwake() such
	 * that data written before wake up is always visible after wake
	 * up.
| 266 | * Second, the full barrier guarantees that triggered clearing |
| 267 | * doesn't pass event check of the next iteration. Note that |
| 268 | * this problem doesn't exist for the first iteration as |
| 269 | * add_wait_queue() has full barrier semantics. |
| 270 | */ |
| 271 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0)) |
| 272 | smp_store_mb(pwq->triggered, 0); |
| 273 | #else |
| 274 | set_mb(pwq->triggered, 0); |
| 275 | #endif |
| 276 | |
| 277 | return rc; |
| 278 | } |
| 279 | |
| 280 | static unsigned int scif_poll_kernel(poll_table *pwait, struct endpt *ep) |
| 281 | { |
| 282 | return __scif_pollfd(NULL, pwait, ep); |
| 283 | } |
| 284 | |
| 285 | /* |
| 286 | * Fish for pollable events on the pollfd->fd file descriptor. We're only |
| 287 | * interested in events matching the pollfd->events mask, and the result |
| 288 | * matching that mask is both recorded in pollfd->revents and returned. The |
| 289 | * pwait poll_table will be used by the fd-provided poll handler for waiting, |
| 290 | * if non-NULL. |
| 291 | */ |
| 292 | static inline unsigned int do_pollfd(struct scif_pollepd *pollfd, poll_table *pwait) |
| 293 | { |
| 294 | unsigned int mask; |
| 295 | scif_epd_t epd; |
| 296 | |
| 297 | mask = 0; |
| 298 | epd = pollfd->epd; |
| 299 | if (epd) { |
| 302 | if (pwait) |
| 303 | #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) |
| 304 | pwait->_key = pollfd->events | POLLERR | POLLHUP; |
| 305 | #else |
| 306 | pwait->key = pollfd->events | POLLERR | POLLHUP; |
| 307 | #endif |
| 308 | mask = scif_poll_kernel(pwait, epd); |
| 309 | /* Mask out unneeded events. */ |
| 310 | mask &= pollfd->events | POLLERR | POLLHUP; |
| 311 | } |
| 312 | pollfd->revents = mask; |
| 313 | |
| 314 | return mask; |
| 315 | } |
| 316 | |
| 317 | static int do_poll(unsigned int nfds, struct scif_pollepd *ufds, |
| 318 | struct poll_wqueues *wait, struct timespec *end_time) |
| 319 | { |
	poll_table *pt = &wait->pt;
| 321 | ktime_t expire, *to = NULL; |
	int timed_out = 0, count = 0;
	unsigned int i;
	unsigned long slack = 0;
| 324 | |
| 325 | /* Optimise the no-wait case */ |
| 326 | if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { |
| 327 | pt = NULL; |
| 328 | timed_out = 1; |
| 329 | } |
| 330 | |
| 331 | if (end_time && !timed_out) |
| 332 | slack = estimate_accuracy(end_time); |
| 333 | |
| 334 | for (;;) { |
| 335 | for (i = 0; i < nfds; i++) { |
| 336 | /* |
| 337 | * Fish for events. If we found one, record it |
| 338 | * and kill the poll_table, so we don't |
| 339 | * needlessly register any other waiters after |
| 340 | * this. They'll get immediately deregistered |
| 341 | * when we break out and return. |
| 342 | */ |
| 343 | if (do_pollfd(ufds + i, pt)) { |
| 344 | count++; |
| 345 | pt = NULL; |
| 346 | } |
| 347 | } |
| 348 | /* |
| 349 | * All waiters have already been registered, so don't provide |
| 350 | * a poll_table to them on the next loop iteration. |
| 351 | */ |
| 352 | pt = NULL; |
| 353 | if (!count) { |
| 354 | count = wait->error; |
| 355 | if (signal_pending(current)) |
| 356 | count = -EINTR; |
| 357 | } |
| 358 | if (count || timed_out) |
| 359 | break; |
| 360 | |
| 361 | /* |
| 362 | * If this is the first loop and we have a timeout |
| 363 | * given, then we convert to ktime_t and set the to |
| 364 | * pointer to the expiry value. |
| 365 | */ |
| 366 | if (end_time && !to) { |
| 367 | expire = timespec_to_ktime(*end_time); |
| 368 | to = &expire; |
| 369 | } |
| 370 | |
| 371 | if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) |
| 372 | timed_out = 1; |
| 373 | } |
| 374 | return count; |
| 375 | } |
| 376 | |
| 377 | static int do_scif_poll(struct scif_pollepd *ufds, unsigned int nfds, |
| 378 | struct timespec *end_time) |
| 379 | { |
| 380 | struct poll_wqueues table; |
| 381 | int epdcount; |
| 382 | |
| 383 | scif_poll_initwait(&table); |
| 384 | epdcount = do_poll(nfds, ufds, &table, end_time); |
| 385 | scif_poll_freewait(&table); |
| 386 | |
| 387 | return epdcount; |
| 388 | } |
| 389 | |
| 390 | /* |
| 391 | * Add two timespec values and do a safety check for overflow. |
| 392 | * It's assumed that both values are valid (>= 0) |
| 393 | */ |
| 394 | static struct timespec scif_timespec_add_safe(const struct timespec lhs, |
| 395 | const struct timespec rhs) |
| 396 | { |
| 397 | struct timespec res; |
| 398 | |
| 399 | set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, |
| 400 | lhs.tv_nsec + rhs.tv_nsec); |
| 401 | |
| 402 | if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) |
| 403 | res.tv_sec = TIME_T_MAX; |
| 404 | |
| 405 | return res; |
| 406 | } |
| 407 | /** |
| 408 | * poll_select_set_timeout - helper function to setup the timeout value |
| 409 | * @to: pointer to timespec variable for the final timeout |
| 410 | * @sec: seconds (from user space) |
| 411 | * @nsec: nanoseconds (from user space) |
| 412 | * |
 * Note: we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
| 415 | * |
| 416 | * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. |
| 417 | */ |
| 418 | static int scif_poll_select_set_timeout(struct timespec *to, long sec, long nsec) |
| 419 | { |
| 420 | struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; |
| 421 | |
| 422 | if (!timespec_valid(&ts)) |
| 423 | return -EINVAL; |
| 424 | |
| 425 | /* Optimize for the zero timeout value here */ |
| 426 | if (!sec && !nsec) { |
| 427 | to->tv_sec = to->tv_nsec = 0; |
| 428 | } else { |
| 429 | ktime_get_ts(to); |
| 430 | *to = scif_timespec_add_safe(*to, ts); |
| 431 | } |
| 432 | return 0; |
| 433 | } |
| 434 | |
| 435 | int scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs) |
| 436 | { |
| 437 | struct timespec end_time, *to = NULL; |
| 438 | if (timeout_msecs >= 0) { |
| 439 | to = &end_time; |
| 440 | scif_poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, |
| 441 | NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); |
| 442 | } |
| 443 | |
| 444 | return do_scif_poll(ufds, nfds, to); |
| 445 | } |
| 446 | EXPORT_SYMBOL(scif_poll); |
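
/*
 * Example usage from another kernel module (a sketch; assumes "ep" is
 * a connected endpoint obtained through the SCIF kernel API):
 *
 *	struct scif_pollepd pollepd = {
 *		.epd	= ep,
 *		.events	= POLLIN,
 *	};
 *	int rc = scif_poll(&pollepd, 1, 2000);	// wait up to 2 s
 *
 *	if (rc > 0 && (pollepd.revents & POLLIN))
 *		; // ep is readable without blocking
 *	else if (rc == 0)
 *		; // the 2 s timeout expired
 *	else
 *		; // error, e.g. -EINTR on a pending signal
 */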