micscif/micscif_api.c
/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */

#include <linux/poll.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/kref.h>
#include <linux/module.h>
#include "scif.h"
#include "mic/micscif.h"
#ifndef _MIC_SCIF_
#include "mic_common.h"
#endif
#include "mic/micscif_map.h"

#define SCIF_MAP_ULIMIT 0x40

bool mic_ulimit_check = 0;

char *scif_ep_states[] = {
	"Closed",
	"Unbound",
	"Bound",
	"Listening",
	"Connected",
	"Connecting",
	"Mapping",
	"Closing",
	"Close Listening",
	"Disconnected",
	"Zombie"};

enum conn_async_state {
	ASYNC_CONN_IDLE = 1,	/* ep setup for async connect */
	ASYNC_CONN_INPROGRESS,	/* async connect in progress */
	ASYNC_CONN_FLUSH_WORK	/* async work flush in progress */
};
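
/*
 * Illustrative sketch (not part of this file): how the states above surface
 * to a caller of the internal non-blocking connect path. The helper name is
 * hypothetical; __scif_connect() is defined later in this file. The first
 * call moves the endpoint to ASYNC_CONN_INPROGRESS and returns -EINPROGRESS;
 * once the connection work completes, a later call flushes the work queue
 * (ASYNC_CONN_FLUSH_WORK), collects the final status and returns the
 * endpoint to ASYNC_CONN_IDLE.
 */
#if 0
static int example_async_connect(scif_epd_t epd, struct scif_portID *dst)
{
	int err;

	do {
		/* A real caller would sleep or poll instead of spinning. */
		err = __scif_connect(epd, dst, true);
	} while (err == -EINPROGRESS);
	return err;
}
#endif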

/**
 * scif_open() - Create a SCIF end point
 *
 * Create a SCIF end point and set the state to UNBOUND. This function
 * returns the address of the end point data structure.
 */
scif_epd_t
__scif_open(void)
{
	struct endpt *ep;

	might_sleep();
	if ((ep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL)) == NULL) {
		printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point descriptor\n");
		goto err_ep_alloc;
	}

	if ((ep->qp_info.qp = (struct micscif_qp *)
			kzalloc(sizeof(struct micscif_qp), GFP_KERNEL)) == NULL) {
		printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point queue pointer\n");
		goto err_qp_alloc;
	}

	spin_lock_init(&ep->lock);
	mutex_init(&ep->sendlock);
	mutex_init(&ep->recvlock);

	if (micscif_rma_ep_init(ep) < 0) {
		printk(KERN_ERR "SCIFAPI _open: RMA EP Init failed\n");
		goto err_rma_init;
	}

	ep->state = SCIFEP_UNBOUND;
	pr_debug("SCIFAPI open: ep %p success\n", ep);
	return (scif_epd_t)ep;

err_rma_init:
	kfree(ep->qp_info.qp);
err_qp_alloc:
	kfree(ep);
err_ep_alloc:
	return NULL;
}

scif_epd_t
scif_open(void)
{
	struct endpt *ep;
	ep = (struct endpt *)__scif_open();
	if (ep)
		kref_init(&(ep->ref_count));
	return (scif_epd_t)ep;
}
EXPORT_SYMBOL(scif_open);
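
/*
 * Illustrative sketch (not part of this driver): the minimal kernel-mode
 * lifecycle of an endpoint using the exported API above. The function name
 * is hypothetical and error handling is reduced to the bare minimum.
 */
#if 0
static int example_open_close(void)
{
	scif_epd_t epd = scif_open();	/* endpoint starts SCIFEP_UNBOUND */

	if (!epd)
		return -ENOMEM;
	/* ... scif_bind()/scif_listen()/scif_connect() would go here ... */
	return scif_close(epd);	/* drops the reference taken in scif_open() */
}
#endif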

/**
 * scif_close() - Terminate a SCIF end point
 * @epd: The end point address returned from scif_open()
 *
 * The function terminates a scif connection. It must ensure all traffic on
 * the connection is finished before removing it.
 *
 * On connections with mapped memory this becomes more difficult. Once normal
 * DMA and message traffic has ended, the end point must be placed in a zombie
 * state where it waits for the other side to also release its memory
 * references.
 */
int
__scif_close(scif_epd_t epd)
{
	struct endpt *ep = (struct endpt *)epd;
	struct endpt *tmpep;
	struct list_head *pos, *tmpq;
	unsigned long sflags;
	enum endptstate oldstate;
	int err;
	bool flush_conn;

	pr_debug("SCIFAPI close: ep %p %s\n", ep, scif_ep_states[ep->state]);

	might_sleep();

	spin_lock(&ep->lock);
	flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS);
	spin_unlock(&ep->lock);

	if (flush_conn)
		flush_workqueue(ms_info.mi_conn_wq);

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	spin_lock_irqsave(&ep->lock, sflags);
	oldstate = ep->state;

	ep->state = SCIFEP_CLOSING;

	switch (oldstate) {
	case SCIFEP_ZOMBIE:
		BUG_ON(SCIFEP_ZOMBIE == oldstate);
	case SCIFEP_CLOSED:
	case SCIFEP_DISCONNECTED:
		spin_unlock_irqrestore(&ep->lock, sflags);
		micscif_unregister_all_windows(epd);
		// Remove from the disconnected list
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
				break;
			}
		}
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		break;
	case SCIFEP_UNBOUND:
	case SCIFEP_BOUND:
	case SCIFEP_CONNECTING:
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	case SCIFEP_MAPPING:
	case SCIFEP_CONNECTED:
	case SCIFEP_CLOSING:
	{
		struct nodemsg msg;
		struct endpt *fep = NULL;
		struct endpt *tmpep;
		unsigned long ts = jiffies;
		struct list_head *pos, *tmpq;

		// There is a very short window before mapping completes, the
		// state becomes connected and a standard teardown applies.
		ts = jiffies;
		while (ep->state == SCIFEP_MAPPING) {
			cpu_relax();
			if (time_after((unsigned long)jiffies, ts + NODE_ALIVE_TIMEOUT)) {
				printk(KERN_ERR "%s %d ep->state %d\n", __func__, __LINE__, ep->state);
				ep->state = SCIFEP_BOUND;
				break;
			}
		}

		init_waitqueue_head(&ep->disconwq);	// Wait for connection queue
		spin_unlock_irqrestore(&ep->lock, sflags);

		micscif_unregister_all_windows(epd);

		// Remove from the connected list
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
				put_conn_count(ep->remote_dev);
				fep = tmpep;
				spin_lock(&ep->lock);
				break;
			}
		}

		if (fep == NULL) {
			// The other side has completed the disconnect before
			// the end point could be removed from the list, so the
			// ep lock is not held. Traverse the disconnected list
			// to find the endpoint, release the conn lock and
			// proceed to tear down the end point below.
			list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == ep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			break;
		}

		spin_unlock(&ms_info.mi_connlock);

		// Now we are free to close out the connection
		msg.uop = SCIF_DISCNCT;
		msg.src = ep->port;
		msg.dst = ep->peer;
		msg.payload[0] = (uint64_t)ep;
		msg.payload[1] = ep->remote_ep;

		err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);
		spin_unlock_irqrestore(&ep->lock, sflags);

		if (!err)
			/* Now wait for the remote node to respond */
			wait_event_timeout(ep->disconwq,
				(ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
		/*
		 * Grab and release the ep lock to synchronize with the
		 * thread waking us up. If we don't grab this lock, then
		 * the ep might be freed before the wakeup completes
		 * resulting in potential memory corruption.
		 */
		spin_lock_irqsave(&ep->lock, sflags);
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	case SCIFEP_LISTENING:
	case SCIFEP_CLLISTEN:
	{
		struct conreq *conreq;
		struct nodemsg msg;
		struct endpt *aep;

		spin_unlock_irqrestore(&ep->lock, sflags);
		spin_lock_irqsave(&ms_info.mi_eplock, sflags);

		// Remove from the listen list
		list_for_each_safe(pos, tmpq, &ms_info.mi_listen) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
			}
		}
		// Remove any dangling accepts
		while (ep->acceptcnt) {
			aep = list_first_entry(&ep->li_accept, struct endpt, liacceptlist);
			BUG_ON(!aep);
			list_del(&aep->liacceptlist);
			if (aep->port.port && !aep->accepted_ep)
				put_scif_port(aep->port.port);
			list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) {
				tmpep = list_entry(pos, struct endpt, miacceptlist);
				if (tmpep == aep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
			spin_lock_irqsave(&ms_info.mi_connlock, sflags);
			list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == aep) {
					list_del(pos);
					put_conn_count(aep->remote_dev);
					break;
				}
			}
			list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == aep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			micscif_teardown_ep(aep);
			spin_lock_irqsave(&ms_info.mi_eplock, sflags);
			micscif_add_epd_to_zombie_list(aep, MI_EPLOCK_HELD);
			ep->acceptcnt--;
		}

		spin_lock(&ep->lock);
		spin_unlock(&ms_info.mi_eplock);

		// Remove and reject any pending connection requests.
		while (ep->conreqcnt) {
			conreq = list_first_entry(&ep->conlist, struct conreq, list);
			list_del(&conreq->list);

			msg.uop = SCIF_CNCT_REJ;
			msg.dst.node = conreq->msg.src.node;
			msg.dst.port = conreq->msg.src.port;
			msg.payload[0] = conreq->msg.payload[0];
			msg.payload[1] = conreq->msg.payload[1];
			/*
			 * No error handling on purpose for micscif_nodeqp_send().
			 * If the remote node is lost we still want to free the
			 * connection requests on the self node.
			 */
			micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, ep);

			ep->conreqcnt--;
			kfree(conreq);
		}

		// If a kSCIF accept is waiting wake it up
		wake_up_interruptible(&ep->conwq);
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	}
	if (ep->port.port && !ep->accepted_ep)
		put_scif_port(ep->port.port);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	micscif_teardown_ep(ep);
	micscif_add_epd_to_zombie_list(ep, !MI_EPLOCK_HELD);
	return 0;
}

void
scif_ref_rel(struct kref *kref_count)
{
	struct endpt *epd;
	epd = container_of(kref_count, struct endpt, ref_count);
	__scif_close((scif_epd_t)epd);
}

int
scif_close(scif_epd_t epd)
{
	__scif_flush(epd);
	put_kref_count(epd);
	return 0;
}
EXPORT_SYMBOL(scif_close);

/**
 * scif_flush() - Flush the endpoint
 * @epd: The end point address returned from scif_open()
 *
 */
int
__scif_flush(scif_epd_t epd)
{
	struct endpt *ep = (struct endpt *)epd;
	struct endpt *tmpep;
	struct list_head *pos, *tmpq;
	unsigned long sflags;
	int err;

	might_sleep();

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	spin_lock_irqsave(&ep->lock, sflags);

	switch (ep->state) {
	case SCIFEP_CONNECTED:
	{
		struct nodemsg msg;
		struct endpt *fep = NULL;

		init_waitqueue_head(&ep->disconwq);	// Wait for connection queue
		WARN_ON(ep->files);	// files should never be set while connected
		spin_unlock_irqrestore(&ep->lock, sflags);
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);

		list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
				put_conn_count(ep->remote_dev);
				fep = tmpep;
				spin_lock(&ep->lock);
				break;
			}
		}

		if (fep == NULL) {
			// The other side has completed the disconnect before
			// the end point could be removed from the list, so the
			// ep lock is not held. Traverse the disconnected list
			// to find the endpoint and release the conn lock.
			list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == ep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			break;
		}

		spin_unlock(&ms_info.mi_connlock);

		msg.uop = SCIF_DISCNCT;
		msg.src = ep->port;
		msg.dst = ep->peer;
		msg.payload[0] = (uint64_t)ep;
		msg.payload[1] = ep->remote_ep;

		err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);

		spin_unlock_irqrestore(&ep->lock, sflags);
		if (!err)
			/* Now wait for the remote node to respond */
			wait_event_timeout(ep->disconwq,
				(ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		spin_lock(&ep->lock);
		list_add_tail(&ep->list, &ms_info.mi_disconnected);
		ep->state = SCIFEP_DISCONNECTED;
		spin_unlock(&ep->lock);
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		// Wake up threads blocked in send and recv
		wake_up_interruptible(&ep->sendwq);
		wake_up_interruptible(&ep->recvwq);
		break;
	}
	case SCIFEP_LISTENING:
	{
		ep->state = SCIFEP_CLLISTEN;

		// If an accept is waiting wake it up
		wake_up_interruptible(&ep->conwq);
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	default:
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	return 0;
}

/**
 * scif_bind() - Bind a SCIF end point to a port ID.
 * @epd: The end point address returned from scif_open()
 * @pn: Port ID (number) to bind to
 *
 * Set the port ID associated with the end point and place it in the bound
 * state. If a port ID of zero is requested, a non-zero port ID is allocated
 * for it.
 *
 * Upon successful completion the port ID (number) will be returned.
 *
 * If the end point is not in the unbound state then -EISCONN is returned.
 *
 * If port ID zero is specified and allocation of a port ID fails, -ENOSPC
 * will be returned.
 */
int
__scif_bind(scif_epd_t epd, uint16_t pn)
{
	struct endpt *ep = (struct endpt *)epd;
	unsigned long sflags;
	int ret = 0;
	int tmp;

	pr_debug("SCIFAPI bind: ep %p %s requested port number %d\n",
		ep, scif_ep_states[ep->state], pn);

	might_sleep();

	if (pn) {
		/*
		 * Modeled on http://www.ietf.org/rfc/rfc1700.txt?number=1700
		 * SCIF ports below SCIF_ADMIN_PORT_END can only be bound by
		 * system (or root) processes or by processes executed by
		 * privileged users.
		 */
		if (pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN)) {
			ret = -EACCES;
			goto scif_bind_admin_exit;
		}
	}

	spin_lock_irqsave(&ep->lock, sflags);
	if (ep->state == SCIFEP_BOUND) {
		ret = -EINVAL;
		goto scif_bind_exit;
	} else if (ep->state != SCIFEP_UNBOUND) {
		ret = -EISCONN;
		goto scif_bind_exit;
	}

	if (pn) {
		if ((tmp = rsrv_scif_port(pn)) != pn) {
			ret = -EINVAL;
			goto scif_bind_exit;
		}
	} else {
		pn = get_scif_port();
		if (!pn) {
			ret = -ENOSPC;
			goto scif_bind_exit;
		}
	}

	ep->state = SCIFEP_BOUND;
	ep->port.node = ms_info.mi_nodeid;
	ep->port.port = pn;
	ep->conn_async_state = ASYNC_CONN_IDLE;
	ret = pn;
	pr_debug("SCIFAPI bind: bound to port number %d\n", pn);

scif_bind_exit:
	spin_unlock_irqrestore(&ep->lock, sflags);
scif_bind_admin_exit:
	return ret;
}

int
scif_bind(scif_epd_t epd, uint16_t pn)
{
	int ret;
	get_kref_count(epd);
	ret = __scif_bind(epd, pn);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_bind);
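
/*
 * Illustrative sketch (not part of this driver): binding to an explicit
 * port versus letting SCIF pick one. Ports below SCIF_ADMIN_PORT_END need
 * CAP_SYS_ADMIN, as enforced in __scif_bind() above. The port number 2000
 * here is an arbitrary example value.
 */
#if 0
static int example_bind(scif_epd_t epd)
{
	int pn = scif_bind(epd, 2000);	/* request a specific port */

	if (pn == -EINVAL)		/* port already reserved */
		pn = scif_bind(epd, 0);	/* let SCIF allocate one */
	return pn;			/* >= 0: the bound port number */
}
#endif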

/**
 * scif_listen() - Place the end point in the listening state
 * @epd: The end point address returned from scif_open()
 * @backlog: Maximum number of pending connection requests.
 *
 * The end point is placed in the listening state, ready to accept connection
 * requests. The backlog parameter is saved to bound the maximum number of
 * connection requests from the remote node that will be held. The end point
 * is placed on a list of listening end points to allow a connection request
 * to find it.
 *
 * Upon successful completion a zero is returned.
 *
 * If the end point is not in the bound state, -EINVAL or -EISCONN is returned.
 *
 */
int
__scif_listen(scif_epd_t epd, int backlog)
{
	struct endpt *ep = (struct endpt *)epd;
	unsigned long sflags;

	pr_debug("SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]);

	might_sleep();
	spin_lock_irqsave(&ep->lock, sflags);
	switch (ep->state) {
	case SCIFEP_ZOMBIE:
		BUG_ON(SCIFEP_ZOMBIE == ep->state);
	case SCIFEP_CLOSED:
	case SCIFEP_CLOSING:
	case SCIFEP_CLLISTEN:
	case SCIFEP_UNBOUND:
	case SCIFEP_DISCONNECTED:
		spin_unlock_irqrestore(&ep->lock, sflags);
		return -EINVAL;
	case SCIFEP_LISTENING:
	case SCIFEP_CONNECTED:
	case SCIFEP_CONNECTING:
	case SCIFEP_MAPPING:
		spin_unlock_irqrestore(&ep->lock, sflags);
		return -EISCONN;
	case SCIFEP_BOUND:
		break;
	}

	ep->state = SCIFEP_LISTENING;
	ep->backlog = backlog;

	ep->conreqcnt = 0;
	ep->acceptcnt = 0;
	INIT_LIST_HEAD(&ep->conlist);	// List of connection requests
	init_waitqueue_head(&ep->conwq);	// Wait for connection queue
	INIT_LIST_HEAD(&ep->li_accept);	// User ep list for ACCEPTREG calls
	spin_unlock_irqrestore(&ep->lock, sflags);

	// Listen setup is complete, so delete the qp information that is not
	// needed on a listen before placing it on the list of listening eps.
	micscif_teardown_ep((void *)ep);
	ep->qp_info.qp = NULL;

	spin_lock_irqsave(&ms_info.mi_eplock, sflags);
	list_add_tail(&ep->list, &ms_info.mi_listen);
	spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
	return 0;
}

int
scif_listen(scif_epd_t epd, int backlog)
{
	int ret;
	get_kref_count(epd);
	ret = __scif_listen(epd, backlog);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_listen);
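
/*
 * Illustrative sketch (not part of this driver): a server-side endpoint set
 * up with the calls exported above. The port number 2000 and backlog of 16
 * are arbitrary example values; the helper name is hypothetical.
 */
#if 0
static int example_listen(scif_epd_t *out)
{
	scif_epd_t epd = scif_open();
	int err;

	if (!epd)
		return -ENOMEM;
	err = scif_bind(epd, 2000);	/* returns the bound port on success */
	if (err < 0)
		goto out_close;
	err = scif_listen(epd, 16);	/* endpoint joins ms_info.mi_listen */
	if (err)
		goto out_close;
	*out = epd;
	return 0;
out_close:
	scif_close(epd);
	return err;
}
#endif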

#ifdef _MIC_SCIF_
/*
 * scif_p2p_connect:
 * @node: destination node id
 *
 * Try to set up a p2p connection between the current node and the
 * destination node. We need the host to set up the initial p2p
 * connections, so we send this message to the host, which acts as a
 * proxy in setting up the p2p connection.
 */
static int scif_p2p_connect(int node)
{
	struct micscif_dev *remote_dev = &scif_dev[node];
	struct nodemsg msg;
	int err;

	pr_debug("%s:%d SCIF_NODE_CONNECT to host\n", __func__, __LINE__);
	micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);

	msg.dst.node = SCIF_HOST_NODE;
	msg.payload[0] = node;
	msg.uop = SCIF_NODE_CONNECT;

	if ((err = micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE],
			&msg, NULL))) {
		printk(KERN_ERR "%s:%d error while sending SCIF_NODE_CONNECT to"
			" node %d\n", __func__, __LINE__, node);
		micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
		goto error;
	}

	wait_event_interruptible_timeout(remote_dev->sd_p2p_wq,
		(remote_dev->sd_state == SCIFDEV_RUNNING) ||
		(remote_dev->sd_state == SCIFDEV_NOTPRESENT), NODE_ALIVE_TIMEOUT);

	pr_debug("%s:%d SCIF_NODE_CONNECT state:%d\n", __func__, __LINE__,
		remote_dev->sd_state);
	micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
error:
	return err;
}
#endif

static int scif_conn_func(struct endpt *ep)
{
	int err = 0;
	struct nodemsg msg;
	unsigned long sflags;
	int term_sent = 0;

	if ((err = micscif_reserve_dma_chan(ep))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		ep->state = SCIFEP_BOUND;
		goto connect_error_simple;
	}
	// Initiate the first part of the endpoint QP setup
	err = micscif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset,
		ENDPT_QP_SIZE, ep->remote_dev);
	if (err) {
		printk(KERN_ERR "%s err %d qp_offset 0x%llx\n",
			__func__, err, ep->qp_info.qp_offset);
		ep->state = SCIFEP_BOUND;
		goto connect_error_simple;
	}

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	// Format connect message and send it
	msg.src = ep->port;
	msg.dst = ep->conn_port;
	msg.uop = SCIF_CNCT_REQ;
	msg.payload[0] = (uint64_t)ep;
	msg.payload[1] = ep->qp_info.qp_offset;
	if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;
	}
	// Wait for request to be processed.
	while ((err = wait_event_interruptible_timeout(ep->conwq,
		(ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT)) <= 0) {
		if (!err)
			err = -ENODEV;

		pr_debug("SCIFAPI connect: ep %p ^C detected\n", ep);
		// interrupted out of the wait
		if (!term_sent++) {
			int bak_err = err;
			msg.uop = SCIF_CNCT_TERM;
			if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
retry:
				err = wait_event_timeout(ep->diswq,
					(ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT);
				if (!err && scifdev_alive(ep))
					goto retry;
				if (!err)
					err = -ENODEV;
				if (err > 0)
					err = 0;
			}
			if (ep->state == SCIFEP_MAPPING) {
				micscif_setup_qp_connect_response(ep->remote_dev,
					ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);
				// Send grant nack
				msg.uop = SCIF_CNCT_GNTNACK;
				msg.payload[0] = ep->remote_ep;
				/* No error handling for notification messages */
				micscif_nodeqp_send(ep->remote_dev, &msg, ep);
			}
			// Ensure that even after a timeout the state of the
			// end point is bound.
			ep->state = SCIFEP_BOUND;
			if (bak_err)
				err = bak_err;
			break;
		}
	}

	if (err > 0)
		err = 0;

	if (term_sent || err) {
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;
	}

	if (ep->state == SCIFEP_MAPPING) {
		err = micscif_setup_qp_connect_response(ep->remote_dev,
			ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);

		// If the resources to map the queue are not available then we
		// need to tell the other side to terminate the accept.
		if (err) {
			printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

			// Send grant nack
			msg.uop = SCIF_CNCT_GNTNACK;
			msg.payload[0] = ep->remote_ep;
			/* No error handling for notification messages */
			micscif_nodeqp_send(ep->remote_dev, &msg, ep);

			ep->state = SCIFEP_BOUND;
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			goto connect_error_simple;
		}

		// Send a grant ack to inform the accept we are done mapping its resources.
		msg.uop = SCIF_CNCT_GNTACK;
		msg.payload[0] = ep->remote_ep;
		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
			ep->state = SCIFEP_CONNECTED;
			spin_lock_irqsave(&ms_info.mi_connlock, sflags);
			list_add_tail(&ep->list, &ms_info.mi_connected);
			get_conn_count(ep->remote_dev);
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			pr_debug("SCIFAPI connect: ep %p connected\n", ep);
		} else
			ep->state = SCIFEP_BOUND;
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;

	} else if (ep->state == SCIFEP_BOUND) {
		pr_debug("SCIFAPI connect: ep %p connection refused\n", ep);
		err = -ECONNREFUSED;
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;

	} else {
		pr_debug("SCIFAPI connect: ep %p connection interrupted\n", ep);
		err = -EINTR;
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;
	}
	micscif_dec_node_refcnt(ep->remote_dev, 1);
connect_error_simple:
	return err;
}

/*
 * micscif_conn_handler:
 *
 * Workqueue handler for servicing non-blocking SCIF connect
 *
 */
void micscif_conn_handler(struct work_struct *work)
{
	struct endpt *ep;

	do {
		ep = NULL;
		spin_lock(&ms_info.mi_nb_connect_lock);
		if (!list_empty(&ms_info.mi_nb_connect_list)) {
			ep = list_first_entry(&ms_info.mi_nb_connect_list,
					struct endpt, conn_list);
			list_del(&ep->conn_list);
		}
		spin_unlock(&ms_info.mi_nb_connect_lock);
		if (ep) {
			ep->conn_err = scif_conn_func(ep);
			wake_up_interruptible(&ep->conn_pend_wq);
		}
	} while (ep);
}

/**
 * scif_connect() - Request a connection to a remote node
 * @epd: The end point address returned from scif_open()
 * @dst: Remote node address information
 *
 * The function requests a scif connection to the remote node
 * identified by the dst parameter. "dst" contains the remote node and
 * port ids.
 *
 * Upon successful completion a zero will be returned.
 *
 * If the end point is not in the bound state, -EINVAL will be returned.
 *
 * If resource allocation fails during the connection sequence, -ENOMEM
 * will be returned.
 *
 * If the remote side is not responding to connection requests the caller may
 * terminate this function with a signal. If so, -EINTR will be returned.
 */
int
__scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block)
{
	struct endpt *ep = (struct endpt *)epd;
	unsigned long sflags;
	int err = 0;
#ifdef _MIC_SCIF_
	struct micscif_dev *remote_dev;
#endif

	pr_debug("SCIFAPI connect: ep %p %s\n", ep,
		scif_ep_states[ep->state]);

	if (dst->node > MAX_BOARD_SUPPORTED)
		return -ENODEV;

	might_sleep();

#ifdef _MIC_SCIF_
	remote_dev = &scif_dev[dst->node];
	if ((SCIFDEV_INIT == remote_dev->sd_state ||
		SCIFDEV_STOPPED == remote_dev->sd_state) && mic_p2p_enable)
		if ((err = scif_p2p_connect(dst->node)))
			return err;
#endif

	if (SCIFDEV_RUNNING != scif_dev[dst->node].sd_state &&
		SCIFDEV_SLEEPING != scif_dev[dst->node].sd_state)
		return -ENODEV;

	spin_lock_irqsave(&ep->lock, sflags);
	switch (ep->state) {
	case SCIFEP_ZOMBIE:
		BUG_ON(SCIFEP_ZOMBIE == ep->state);

	case SCIFEP_CLOSED:
	case SCIFEP_CLOSING:
		err = -EINVAL;
		break;

	case SCIFEP_DISCONNECTED:
		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
		else
			err = -EINVAL;
		break;

	case SCIFEP_LISTENING:
	case SCIFEP_CLLISTEN:
		err = -EOPNOTSUPP;
		break;

	case SCIFEP_CONNECTING:
	case SCIFEP_MAPPING:
		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			err = -EINPROGRESS;
		else
			err = -EISCONN;
		break;

	case SCIFEP_CONNECTED:
		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
		else
			err = -EISCONN;
		break;

	case SCIFEP_UNBOUND:
		if ((ep->port.port = get_scif_port()) == 0)
			err = -ENOSPC;
		else {
			ep->port.node = ms_info.mi_nodeid;
			ep->conn_async_state = ASYNC_CONN_IDLE;
		}
		/* Fall through */
	case SCIFEP_BOUND:
		/*
		 * If a non-blocking connect has already been initiated
		 * (conn_async_state is either ASYNC_CONN_INPROGRESS or
		 * ASYNC_CONN_FLUSH_WORK), the end point could end up in
		 * SCIFEP_BOUND due to an error in the connection process
		 * (e.g., connection refused).
		 * If conn_async_state is ASYNC_CONN_INPROGRESS, transition to
		 * ASYNC_CONN_FLUSH_WORK so that the error status can be
		 * collected. If the state is already ASYNC_CONN_FLUSH_WORK,
		 * set the error to EINPROGRESS since some other thread is
		 * waiting to collect the error status.
		 */
		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
		else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
			err = -EINPROGRESS;
		else {
			ep->conn_port = *dst;
			init_waitqueue_head(&ep->sendwq);
			init_waitqueue_head(&ep->recvwq);
			init_waitqueue_head(&ep->conwq);
			init_waitqueue_head(&ep->diswq);
			ep->conn_async_state = 0;

			if (unlikely(non_block))
				ep->conn_async_state = ASYNC_CONN_INPROGRESS;
		}
		break;
	}

	if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
		goto connect_simple_unlock1;

	ep->state = SCIFEP_CONNECTING;
	ep->remote_dev = &scif_dev[dst->node];
	ep->sd_state = SCIFDEV_RUNNING;
	ep->qp_info.qp->magic = SCIFEP_MAGIC;
	ep->qp_info.qp->ep = (uint64_t)ep;
	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
		init_waitqueue_head(&ep->conn_pend_wq);
		spin_lock(&ms_info.mi_nb_connect_lock);
		list_add_tail(&ep->conn_list,
			&ms_info.mi_nb_connect_list);
		spin_unlock(&ms_info.mi_nb_connect_lock);
		err = -EINPROGRESS;
		queue_work(ms_info.mi_conn_wq, &ms_info.mi_conn_work);
	}
connect_simple_unlock1:
	spin_unlock_irqrestore(&ep->lock, sflags);

	if (err)
		return err;
	else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) {
		flush_workqueue(ms_info.mi_conn_wq);
		err = ep->conn_err;
		spin_lock_irqsave(&ep->lock, sflags);
		ep->conn_async_state = ASYNC_CONN_IDLE;
		spin_unlock_irqrestore(&ep->lock, sflags);
	} else {
		err = scif_conn_func(ep);
	}
	return err;
}

int
scif_connect(scif_epd_t epd, struct scif_portID *dst)
{
	int ret;
	get_kref_count(epd);
	ret = __scif_connect(epd, dst, false);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_connect);
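
/*
 * Illustrative sketch (not part of this driver): a blocking client-side
 * connect to the listener sketched earlier. The node/port values are
 * arbitrary examples; -ECONNREFUSED indicates no listener on the peer.
 */
#if 0
static int example_connect(scif_epd_t epd)
{
	struct scif_portID dst = { .node = 0, .port = 2000 };

	return scif_connect(epd, &dst);	/* 0 on success */
}
#endif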

/**
 * scif_accept() - Accept a connection request from the remote node
 * @epd: The end point address returned from scif_open()
 * @peer: Filled in with peer node and port information
 * @newepd: New end point created for connection
 * @flags: Indicates synchronous or asynchronous mode
 *
 * The function accepts a connection request from the remote node. Successful
 * completion is indicated by a new end point being created and passed back
 * to the caller for future reference.
 *
 * Upon successful completion a zero will be returned and the peer information
 * will be filled in.
 *
 * If the end point is not in the listening state, -EINVAL will be returned.
 *
 * If resource allocation fails during the connection sequence, -ENOMEM
 * will be returned.
 *
 * If the function is called asynchronously and no connection requests are
 * pending, it will return -EAGAIN.
 *
 * If the remote side is not sending any connection requests the caller may
 * terminate this function with a signal. If so, -EINTR will be returned.
 */
int
__scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
{
	struct endpt *lep = (struct endpt *)epd;
	struct endpt *cep;
	struct conreq *conreq;
	struct nodemsg msg;
	unsigned long sflags;
	int err;

	pr_debug("SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]);

	// Error if flags other than SCIF_ACCEPT_SYNC are set
	if (flags & ~SCIF_ACCEPT_SYNC) {
		pr_debug("SCIFAPI accept: ep %p invalid flags %x\n", lep, flags & ~SCIF_ACCEPT_SYNC);
		return -EINVAL;
	}

	if (!peer || !newepd) {
		pr_debug("SCIFAPI accept: ep %p peer %p or newepd %p NULL\n",
			lep, peer, newepd);
		return -EINVAL;
	}

	might_sleep();
	spin_lock_irqsave(&lep->lock, sflags);
	if (lep->state != SCIFEP_LISTENING) {
		pr_debug("SCIFAPI accept: ep %p not listening\n", lep);
		spin_unlock_irqrestore(&lep->lock, sflags);
		return -EINVAL;
	}

	if (!lep->conreqcnt && !(flags & SCIF_ACCEPT_SYNC)) {
		// No connection request present and we do not want to wait
		pr_debug("SCIFAPI accept: ep %p async request with nothing pending\n", lep);
		spin_unlock_irqrestore(&lep->lock, sflags);
		return -EAGAIN;
	}

retry_connection:
	spin_unlock_irqrestore(&lep->lock, sflags);
	lep->files = current ? current->files : NULL;
	if ((err = wait_event_interruptible(lep->conwq,
		(lep->conreqcnt || (lep->state != SCIFEP_LISTENING)))) != 0) {
		// wait was interrupted
		pr_debug("SCIFAPI accept: ep %p ^C detected\n", lep);
		return err;	// -ERESTARTSYS
	}

	if (lep->state != SCIFEP_LISTENING) {
		return -EINTR;
	}

	spin_lock_irqsave(&lep->lock, sflags);

	if (!lep->conreqcnt) {
		goto retry_connection;
	}

	// Get the first connect request off the list
	conreq = list_first_entry(&lep->conlist, struct conreq, list);
	list_del(&conreq->list);
	lep->conreqcnt--;
	spin_unlock_irqrestore(&lep->lock, sflags);

	// Fill in the peer information
	peer->node = conreq->msg.src.node;
	peer->port = conreq->msg.src.port;

	// Create the connection endpoint
	cep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL);
	if (!cep) {
		pr_debug("SCIFAPI accept: ep %p new end point allocation failed\n", lep);
		err = -ENOMEM;
		goto scif_accept_error_epalloc;
	}
	spin_lock_init(&cep->lock);
	mutex_init(&cep->sendlock);
	mutex_init(&cep->recvlock);
	cep->state = SCIFEP_CONNECTING;
	cep->remote_dev = &scif_dev[peer->node];
	cep->remote_ep = conreq->msg.payload[0];
	cep->sd_state = SCIFDEV_RUNNING;

	if (!scifdev_alive(cep)) {
		err = -ENODEV;
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto scif_accept_error_qpalloc;
	}

	if (micscif_rma_ep_init(cep) < 0) {
		pr_debug("SCIFAPI accept: ep %p new %p RMA EP init failed\n", lep, cep);
		err = -ENOMEM;
		goto scif_accept_error_qpalloc;
	}

	if ((err = micscif_reserve_dma_chan(cep))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto scif_accept_error_qpalloc;
	}

	cep->qp_info.qp = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL);
	if (!cep->qp_info.qp) {
		printk(KERN_ERR "Port Qp Allocation Failed\n");
		err = -ENOMEM;
		goto scif_accept_error_qpalloc;
	}

	cep->qp_info.qp->magic = SCIFEP_MAGIC;
	cep->qp_info.qp->ep = (uint64_t)cep;
	micscif_inc_node_refcnt(cep->remote_dev, 1);
	err = micscif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset,
		conreq->msg.payload[1], ENDPT_QP_SIZE, cep->remote_dev);
	if (err) {
		pr_debug("SCIFAPI accept: ep %p new %p micscif_setup_qp_accept %d qp_offset 0x%llx\n",
			lep, cep, err, cep->qp_info.qp_offset);
		micscif_dec_node_refcnt(cep->remote_dev, 1);
		goto scif_accept_error_map;
	}

	cep->port.node = lep->port.node;
	cep->port.port = lep->port.port;
	cep->peer.node = peer->node;
	cep->peer.port = peer->port;
	cep->accepted_ep = true;
	init_waitqueue_head(&cep->sendwq);	// Wait for data to be consumed
	init_waitqueue_head(&cep->recvwq);	// Wait for data to be produced
	init_waitqueue_head(&cep->conwq);	// Wait for connection request

	// Return the grant message
	msg.uop = SCIF_CNCT_GNT;
	msg.src = cep->port;
	msg.payload[0] = cep->remote_ep;
	msg.payload[1] = cep->qp_info.qp_offset;
	msg.payload[2] = (uint64_t)cep;

	err = micscif_nodeqp_send(cep->remote_dev, &msg, cep);

	micscif_dec_node_refcnt(cep->remote_dev, 1);
	if (err)
		goto scif_accept_error_map;
retry:
	err = wait_event_timeout(cep->conwq,
		(cep->state != SCIFEP_CONNECTING), NODE_ACCEPT_TIMEOUT);
	if (!err && scifdev_alive(cep))
		goto retry;

	if (!err) {
		err = -ENODEV;
		goto scif_accept_error_map;
	}

	if (err > 0)
		err = 0;

	kfree(conreq);

	spin_lock_irqsave(&cep->lock, sflags);

	if (cep->state == SCIFEP_CONNECTED) {
		// Connect sequence complete; return new endpoint information
		*newepd = (scif_epd_t)cep;
		spin_unlock_irqrestore(&cep->lock, sflags);
		pr_debug("SCIFAPI accept: ep %p new %p returning new end point\n", lep, cep);
		return 0;
	}

	if (cep->state == SCIFEP_CLOSING) {
		// Remote failed to allocate resources and NAKed the grant.
		// There is at this point nothing referencing the new end point.
		spin_unlock_irqrestore(&cep->lock, sflags);
		micscif_teardown_ep((void *)cep);
		kfree(cep);

		// If called with the sync flag then go back and wait.
		if (flags & SCIF_ACCEPT_SYNC) {
			spin_lock_irqsave(&lep->lock, sflags);
			goto retry_connection;
		}

		pr_debug("SCIFAPI accept: ep %p new %p remote failed to allocate resources\n", lep, cep);
		return -EAGAIN;
	}

	// While the connect was in progress the other side closed and sent a
	// disconnect, so set the end point status to closed but return anyway.
	// This allows the caller to drain anything the other side may have
	// put in the message queue.
	*newepd = (scif_epd_t)cep;
	spin_unlock_irqrestore(&cep->lock, sflags);
	return 0;

	// Error allocating or mapping resources
scif_accept_error_map:
	kfree(cep->qp_info.qp);

scif_accept_error_qpalloc:
	kfree(cep);

scif_accept_error_epalloc:
	micscif_inc_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
	// Now reject the connection request due to lack of resources
	msg.uop = SCIF_CNCT_REJ;
	msg.dst.node = conreq->msg.src.node;
	msg.dst.port = conreq->msg.src.port;
	msg.payload[0] = conreq->msg.payload[0];
	msg.payload[1] = conreq->msg.payload[1];
	/* No error handling for notification messages */
	micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, NULL);
	micscif_dec_node_refcnt(&scif_dev[conreq->msg.src.node], 1);

	kfree(conreq);
	return err;
}

int
scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
{
	int ret;
	get_kref_count(epd);
	ret = __scif_accept(epd, peer, newepd, flags);
	if (ret == 0) {
		kref_init(&((*newepd)->ref_count));
	}
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_accept);
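
/*
 * Illustrative sketch (not part of this driver): a blocking accept on a
 * listening endpoint. SCIF_ACCEPT_SYNC makes the call wait for a pending
 * connection request; without it the call returns -EAGAIN when none is
 * queued.
 */
#if 0
static int example_accept(scif_epd_t listen_ep, scif_epd_t *newepd)
{
	struct scif_portID peer;

	return scif_accept(listen_ep, &peer, newepd, SCIF_ACCEPT_SYNC);
}
#endif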

/*
 * scif_msg_param_check:
 * @epd: The end point address returned from scif_open()
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * Validate parameters for the messaging APIs scif_send()/scif_recv().
 */
static inline int
scif_msg_param_check(scif_epd_t epd, int len, int flags)
{
	int ret = -EINVAL;

	if (len < 0)
		goto err_ret;

	if (flags && (!(flags & SCIF_RECV_BLOCK)))
		goto err_ret;

	ret = 0;

err_ret:
	return ret;
}

#define SCIF_BLAST	(1 << 1)	/* Use bit 1 of flags field */

#ifdef SCIF_BLAST
/*
 * Added a temporary implementation of the exception path.
 * The cost to the normal path is 1 local variable (set once and
 * tested once) plus 2 tests for the 'blast' flag.
 * This only applies to the card side kernel API.
 */
#ifndef _MIC_SCIF_
#undef SCIF_BLAST
#endif
#endif
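
/*
 * Illustrative sketch (an assumption, not part of this file): what a
 * card-side caller of the SCIF_BLAST exception path could look like.
 * SCIF_BLAST is mutually exclusive with SCIF_SEND_BLOCK (enforced in
 * __scif_send() below) and only tries briefly to take the endpoint lock
 * instead of sleeping, so the send may be partial.
 */
#if 0
static int example_blast_send(scif_epd_t epd, uint64_t token)
{
	/* Returns the number of bytes queued, possibly < sizeof(token). */
	return scif_send(epd, &token, sizeof(token), SCIF_BLAST);
}
#endif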

/**
 * _scif_send() - Send data to connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address of data to send
 * @len: Length to send
 * @flags: Synchronous or asynchronous access
 *
 * This function sends a packet of data to the queue created by the
 * connection establishment sequence. It returns when the packet has
 * been completely sent.
 *
 * Successful completion returns the number of bytes sent.
 *
 * If the end point is not in the connected state, -ENOTCONN is returned.
 *
 * This function may be interrupted by a signal and will return -EINTR.
 */
int
_scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	struct nodemsg notif_msg;
	unsigned long sflags;
	size_t curr_xfer_len = 0;
	size_t sent_len = 0;
	size_t write_count;
	int ret;
#ifdef SCIF_BLAST
	int tl;
#endif

	if (flags & SCIF_SEND_BLOCK)
		might_sleep();

#ifdef SCIF_BLAST
	if (flags & SCIF_BLAST) {
		/*
		 * Do a decent try to acquire lock (~100 uSec)
		 */
		for (ret = tl = 0; ret < 100 && !tl; ret++) {
			tl = spin_trylock_irqsave(&ep->lock, sflags);
			cpu_relax();
		}
	} else {
		tl = 1;
		spin_lock_irqsave(&ep->lock, sflags);
	}
#else
	spin_lock_irqsave(&ep->lock, sflags);
#endif

	while (sent_len != len) {
		if (ep->state == SCIFEP_DISCONNECTED) {
			ret = (int)(sent_len ? sent_len : -ECONNRESET);
			goto unlock_dec_return;
		}
		if (ep->state != SCIFEP_CONNECTED) {
			ret = (int)(sent_len ? sent_len : -ENOTCONN);
			goto unlock_dec_return;
		}
		if (!scifdev_alive(ep)) {
			ret = (int)(sent_len ? sent_len : -ENODEV);
			goto unlock_dec_return;
		}
		write_count = micscif_rb_space(&ep->qp_info.qp->outbound_q);
		if (write_count) {
			/*
			 * Best effort to send as much data as there
			 * is space in the RB, particularly important for the
			 * non blocking case.
			 */
			curr_xfer_len = min(len - sent_len, write_count);
			ret = micscif_rb_write(&ep->qp_info.qp->outbound_q, msg,
					(uint32_t)curr_xfer_len);
			if (ret < 0) {
				ret = -EFAULT;
				goto unlock_dec_return;
			}
			if (ret) {
				spin_unlock_irqrestore(&ep->lock, sflags);
				/*
				 * If there is space in the RB and we have the
				 * EP lock held then writing to the RB should
				 * succeed. Releasing the spin lock before
				 * asserting to avoid deadlocking the system.
				 */
				BUG_ON(ret);
			}
			/*
			 * Success. Update write pointer.
			 */
			micscif_rb_commit(&ep->qp_info.qp->outbound_q);
#ifdef SCIF_BLAST
			if (flags & SCIF_BLAST) {
				/*
				 * Bypass path; set the flag in the host side
				 * node_qp and ring the doorbell. The host will
				 * wake up all listeners, such that the message
				 * will be seen. Needs micscif_send_host_intr()
				 * to be non-static.
				 */
				extern int micscif_send_host_intr(struct micscif_dev *, uint32_t);
				ep->remote_dev->qpairs->remote_qp->blast = 1;
				smp_wmb();	/* Sufficient or need sfence? */
				micscif_send_host_intr(ep->remote_dev, 0);
			} else {
				/*
				 * Normal path: send notification on the
				 * node_qp ring buffer and ring the doorbell.
				 */
				notif_msg.src = ep->port;
				notif_msg.uop = SCIF_CLIENT_SENT;
				notif_msg.payload[0] = ep->remote_ep;
				if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
					ret = sent_len ? sent_len : ret;
					goto unlock_dec_return;
				}
			}
#else
			/*
			 * Send a notification to the peer about the
			 * produced data message.
			 */
			notif_msg.src = ep->port;
			notif_msg.uop = SCIF_CLIENT_SENT;
			notif_msg.payload[0] = ep->remote_ep;
			if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
				ret = (int)(sent_len ? sent_len : ret);
				goto unlock_dec_return;
			}
#endif
			sent_len += curr_xfer_len;
			msg = (char *)msg + curr_xfer_len;
			continue;
		}
		curr_xfer_len = min(len - sent_len, (size_t)(ENDPT_QP_SIZE - 1));
		/*
		 * Not enough space in the RB. Return in the non blocking case.
		 */
		if (!(flags & SCIF_SEND_BLOCK)) {
			ret = (int)sent_len;
			goto unlock_dec_return;
		}
#ifdef SCIF_BLAST
		/*
		 * Flags SCIF_BLAST and SCIF_SEND_BLOCK are mutually
		 * exclusive, so if we get here we know that SCIF_BLAST
		 * was not set and thus we _do_ have the spinlock.
		 * No need to check variable tl here.
		 */
#endif
		spin_unlock_irqrestore(&ep->lock, sflags);
		/*
		 * Wait for a message now in the blocking case.
		 */
		if ((ret = wait_event_interruptible(ep->sendwq,
			(SCIFEP_CONNECTED != ep->state) ||
			(micscif_rb_space(&ep->qp_info.qp->outbound_q)
				>= curr_xfer_len) || (!scifdev_alive(ep))))) {
			ret = (int)(sent_len ? sent_len : ret);
			goto dec_return;
		}
		spin_lock_irqsave(&ep->lock, sflags);
	}
	ret = len;
unlock_dec_return:
#ifdef SCIF_BLAST
	if (tl)
#endif
		spin_unlock_irqrestore(&ep->lock, sflags);
dec_return:
	return ret;
}

/**
 * _scif_recv() - Receive data from connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function requests to receive a packet of data from the queue
 * created by the connection establishment sequence. It reads the amount
 * of data requested before returning.
 *
 * This function differs from scif_send() in that it also returns data if
 * the end point is in the disconnected state and data is present.
 *
 * Successful completion returns the number of bytes read.
 *
 * If the end point is not in the connected state, or is in the disconnected
 * state with no data present, it returns -ENOTCONN.
 *
 * This function may be interrupted by a signal and will return -EINTR.
 */
int
_scif_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	int read_size;
	struct endpt *ep = (struct endpt *)epd;
	unsigned long sflags;
	struct nodemsg notif_msg;
	size_t curr_recv_len = 0;
	size_t remaining_len = len;
	size_t read_count;
	int ret;

	if (flags & SCIF_RECV_BLOCK)
		might_sleep();

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	spin_lock_irqsave(&ep->lock, sflags);
	while (remaining_len) {
		if (ep->state != SCIFEP_CONNECTED &&
			ep->state != SCIFEP_DISCONNECTED) {
			ret = (int)(len - remaining_len) ?
				(int)(len - remaining_len) : -ENOTCONN;
			goto unlock_dec_return;
		}
		read_count = micscif_rb_count(&ep->qp_info.qp->inbound_q,
				(int)remaining_len);
		if (read_count) {
			/*
			 * Best effort to recv as much data as there
			 * are bytes to read in the RB, particularly
			 * important for the non blocking case.
			 */
			curr_recv_len = min(remaining_len, read_count);
			read_size = micscif_rb_get_next(
					&ep->qp_info.qp->inbound_q,
					msg, (int)curr_recv_len);
			if (read_size < 0) {
				/* Can only happen when copying to a USER
				 * buffer.
				 */
				ret = -EFAULT;
				goto unlock_dec_return;
			}
			if (read_size != curr_recv_len) {
				spin_unlock_irqrestore(&ep->lock, sflags);
				/*
				 * If there are bytes to be read from the RB
				 * and we have the EP lock held then reading
				 * from the RB should succeed. Releasing the
				 * spin lock before asserting to avoid
				 * deadlocking the system.
				 */
				BUG_ON(read_size != curr_recv_len);
			}
			if (ep->state == SCIFEP_CONNECTED) {
				/*
				 * Update the read pointer only if the endpoint is
				 * still connected, else the read pointer might no
				 * longer exist since the peer has freed resources!
				 */
				micscif_rb_update_read_ptr(&ep->qp_info.qp->inbound_q);
				/*
				 * Send a notification to the peer about the
				 * consumed data message only if the EP is in
				 * SCIFEP_CONNECTED state.
				 */
				notif_msg.src = ep->port;
				notif_msg.uop = SCIF_CLIENT_RCVD;
				notif_msg.payload[0] = ep->remote_ep;
				if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
					ret = (len - (int)remaining_len) ?
						(len - (int)remaining_len) : ret;
					goto unlock_dec_return;
				}
			}
			remaining_len -= curr_recv_len;
			msg = (char *)msg + curr_recv_len;
			continue;
		}
		curr_recv_len = min(remaining_len, (size_t)(ENDPT_QP_SIZE - 1));
		/*
		 * Bail out now if the EP is in SCIFEP_DISCONNECTED state else
		 * we will keep looping forever.
		 */
		if (ep->state == SCIFEP_DISCONNECTED) {
			ret = (len - (int)remaining_len) ?
				(len - (int)remaining_len) : -ECONNRESET;
			goto unlock_dec_return;
		}
		/*
		 * Return in the non blocking case if there is no data
		 * to read in this iteration.
		 */
		if (!(flags & SCIF_RECV_BLOCK)) {
			ret = len - (int)remaining_len;
			goto unlock_dec_return;
		}
		spin_unlock_irqrestore(&ep->lock, sflags);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		/*
		 * Wait for a message now in the blocking case,
		 * or until the other side disconnects.
		 */
		if ((ret = wait_event_interruptible(ep->recvwq,
			(SCIFEP_CONNECTED != ep->state) ||
			(micscif_rb_count(&ep->qp_info.qp->inbound_q,
				curr_recv_len) >= curr_recv_len) || (!scifdev_alive(ep))))) {
			ret = (len - remaining_len) ?
				(len - (int)remaining_len) : ret;
			goto dec_return;
		}
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		spin_lock_irqsave(&ep->lock, sflags);
	}
	ret = len;
unlock_dec_return:
	spin_unlock_irqrestore(&ep->lock, sflags);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
dec_return:
	return ret;
}


/**
 * scif_user_send() - Send data to connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address of data to send
 * @len: Length to send
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from the driver IOCTL entry point
 * only and is a wrapper for _scif_send().
 */
int
scif_user_send(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;
	int sent_len = 0;
	char *tmp;
	int loop_len;
	int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));

	pr_debug("SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]);

	if (!len)
		return 0;

	if ((err = scif_msg_param_check(epd, len, flags)))
		goto send_err;

	if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {
		err = -ENOMEM;
		goto send_err;
	}
	err = 0;
	micscif_inc_node_refcnt(ep->remote_dev, 1);
	/*
	 * Grabbing the lock before breaking up the transfer in
	 * multiple chunks is required to ensure that messages do
	 * not get fragmented and reordered.
	 */
	mutex_lock(&ep->sendlock);

	while (sent_len != len) {
		msg = (void *)((char *)msg + err);
		loop_len = len - sent_len;
		loop_len = min(chunk_len, loop_len);
		if (copy_from_user(tmp, msg, loop_len)) {
			err = -EFAULT;
			goto send_free_err;
		}
		err = _scif_send(epd, (void *)tmp, loop_len, flags);
		if (err < 0) {
			goto send_free_err;
		}
		sent_len += err;
		if (err != loop_len) {
			goto send_free_err;
		}
	}
send_free_err:
	mutex_unlock(&ep->sendlock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	kfree(tmp);
send_err:
	return err < 0 ? err : sent_len;
}

/**
 * scif_user_recv() - Receive data from connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from the driver IOCTL entry point
 * only and is a wrapper for _scif_recv().
 */
int
scif_user_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;
	int recv_len = 0;
	char *tmp;
	int loop_len;
	int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));

	pr_debug("SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]);

	if (!len)
		return 0;

	if ((err = scif_msg_param_check(epd, len, flags)))
		goto recv_err;

	if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {
		err = -ENOMEM;
		goto recv_err;
	}
	err = 0;
	/*
	 * Grabbing the lock before breaking up the transfer in
	 * multiple chunks is required to ensure that messages do
	 * not get fragmented and reordered.
	 */
	mutex_lock(&ep->recvlock);

	while (recv_len != len) {
		msg = (void *)((char *)msg + err);
		loop_len = len - recv_len;
		loop_len = min(chunk_len, loop_len);
		if ((err = _scif_recv(epd, tmp, loop_len, flags)) < 0)
			goto recv_free_err;
		if (copy_to_user(msg, tmp, err)) {
			err = -EFAULT;
			goto recv_free_err;
		}
		recv_len += err;
		if (err != loop_len) {
			goto recv_free_err;
		}
	}
recv_free_err:
	mutex_unlock(&ep->recvlock);
	kfree(tmp);
recv_err:
	return err < 0 ? err : recv_len;
}

#ifdef SCIF_BLAST
/*
 * Added a temporary implementation of the exception path.
 * The cost to the normal path is testing 2 flag bits instead
 * of just one, and a changed condition for node wakeup.
 */
#endif

/**
 * scif_send() - Send data to connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address of data to send
 * @len: Length to send
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from kernel mode only and is
 * a wrapper for _scif_send().
 */
int
__scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int ret;

	pr_debug("SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]);
	if (!len)
		return 0;

#ifdef SCIF_BLAST
	/*
	 * KAA: this is the same code as scif_msg_param_check(),
	 * but since that routine is shared with scif_recv
	 * I thought it is safer to replicate the code here.
	 */
	if (len < 0)
		return -EINVAL;

	if (flags && !(flags & (SCIF_SEND_BLOCK | SCIF_BLAST)))
		return -EINVAL;

	if ((flags & (SCIF_SEND_BLOCK | SCIF_BLAST)) ==
		(SCIF_SEND_BLOCK | SCIF_BLAST))
		return -EINVAL;
#else
	if ((ret = scif_msg_param_check(epd, len, flags)))
		return ret;
#endif
	/*
	 * Cannot block while waiting for the node to wake up
	 * if non blocking messaging mode is requested. Return
	 * ENODEV if the remote node is idle.
	 */
	if (!(flags & SCIF_SEND_BLOCK) && ep->remote_dev &&
		SCIF_NODE_IDLE == atomic_long_read(
			&ep->remote_dev->scif_ref_cnt))
		return -ENODEV;

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	/*
	 * Grab the mutex lock in the blocking case only
	 * to ensure messages do not get fragmented/reordered.
	 * The non blocking mode is protected using spin locks
	 * in _scif_send().
	 */
	if (flags & SCIF_SEND_BLOCK)
		mutex_lock(&ep->sendlock);

	ret = _scif_send(epd, msg, len, flags);

	if (flags & SCIF_SEND_BLOCK)
		mutex_unlock(&ep->sendlock);

	micscif_dec_node_refcnt(ep->remote_dev, 1);
	return ret;
}

int
scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	int ret;
	get_kref_count(epd);
	ret = __scif_send(epd, msg, len, flags);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_send);
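
/*
 * Illustrative sketch (not part of this driver): a kernel-mode blocking
 * send. With SCIF_SEND_BLOCK the call returns len once all bytes are
 * queued; a shorter return indicates an interrupted or failed transfer.
 * The -EINTR mapping for short sends is an assumption of this sketch.
 */
#if 0
static int example_send(scif_epd_t epd, void *buf, int len)
{
	int ret = scif_send(epd, buf, len, SCIF_SEND_BLOCK);

	return (ret == len) ? 0 : (ret < 0 ? ret : -EINTR);
}
#endif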

/**
 * scif_recv() - Receive data from connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from kernel mode only and is
 * a wrapper for _scif_recv().
 */
1852int
1853__scif_recv(scif_epd_t epd, void *msg, int len, int flags)
1854{
1855 struct endpt *ep = (struct endpt *)epd;
1856 int ret;
1857
1858 pr_debug("SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]);
1859
1860 if (!len)
1861 return 0;
1862
1863 if ((ret = scif_msg_param_check(epd, len, flags)))
1864 return ret;
1865
1866 /*
1867 * Cannot block while waiting for node to wake up
1868 * if non blocking messaging mode is requested. Return
1869 * ENODEV if the remote node is idle.
1870 */
1871 if (!flags && ep->remote_dev &&
1872 SCIF_NODE_IDLE == atomic_long_read(
1873 &ep->remote_dev->scif_ref_cnt))
1874 return -ENODEV;
1875
1876 /*
1877 * Grab the mutex lock in the blocking case only
1878 * to ensure messages do not get fragmented/reordered.
	 * The non blocking mode is protected using spin locks
	 * in _scif_recv().
1881 */
1882 if (flags & SCIF_RECV_BLOCK)
1883 mutex_lock(&ep->recvlock);
1884
1885 ret = _scif_recv(epd, msg, len, flags);
1886
1887 if (flags & SCIF_RECV_BLOCK)
1888 mutex_unlock(&ep->recvlock);
1889
1890 return ret;
1891}
1892
1893int
1894scif_recv(scif_epd_t epd, void *msg, int len, int flags)
1895{
1896 int ret;
1897 get_kref_count(epd);
1898 ret = __scif_recv(epd, msg, len, flags);
1899 put_kref_count(epd);
1900 return ret;
1901}
1902EXPORT_SYMBOL(scif_recv);
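
/*
 * Matching receive sketch (illustrative only): drain exactly len bytes
 * from the connection queue with blocking semantics. Pairs with the
 * example_blocking_send() sketch above.
 */
#ifdef SCIF_API_EXAMPLES
static int example_blocking_recv(scif_epd_t epd, void *buf, int len)
{
	int got = scif_recv(epd, buf, len, SCIF_RECV_BLOCK);

	if (got < 0)
		printk(KERN_ERR "example: scif_recv failed %d\n", got);
	return got;
}
#endif /* SCIF_API_EXAMPLES */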
1903
1904/**
1905 * __scif_pin_pages - __scif_pin_pages() pins the physical pages which back
1906 * the range of virtual address pages starting at addr and continuing for
1907 * len bytes. addr and len are constrained to be multiples of the page size.
 * A successful __scif_pin_pages() call returns an opaque pointer value
 * which may be used in subsequent calls to scif_register_pinned_pages().
 *
 * Return Values
 * Upon successful completion, __scif_pin_pages() returns 0 and the
 * scif_pinned_pages_t value is returned by reference via pages; else an
 * apt error is returned as documented in scif.h. Protections of the set
 * of pinned pages are also returned by reference via out_prot.
1916 */
1917int
1918__scif_pin_pages(void *addr, size_t len, int *out_prot,
1919 int map_flags, scif_pinned_pages_t *pages)
1920{
1921 struct scif_pinned_pages *pinned_pages;
1922 int nr_pages, err = 0, i;
1923 bool vmalloc_addr = false;
1924 bool try_upgrade = false;
1925 int prot = *out_prot;
1926 int ulimit = 0;
1927 struct mm_struct *mm = NULL;
1928
1929 /* Unsupported flags */
1930 if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT))
1931 return -EINVAL;
1932 ulimit = !!(map_flags & SCIF_MAP_ULIMIT);
1933
1934 /* Unsupported protection requested */
1935 if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
1936 return -EINVAL;
1937
1938 /* addr/len must be page aligned. len should be non zero */
1939 if ((!len) ||
1940 (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
1941 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
1942 return -EINVAL;
1943
1944 might_sleep();
1945
1946 nr_pages = (int)(len >> PAGE_SHIFT);
1947
1948 /* Allocate a set of pinned pages */
1949 if (!(pinned_pages = micscif_create_pinned_pages(nr_pages, prot)))
1950 return -ENOMEM;
1951
1952 if (unlikely(map_flags & SCIF_MAP_KERNEL)) {
1953 if (is_vmalloc_addr(addr))
1954 vmalloc_addr = true;
1955
1956 for (i = 0; i < nr_pages; i++) {
1957 if (unlikely(vmalloc_addr))
1958 pinned_pages->pages[i] =
1959 vmalloc_to_page((char *)addr + (i * PAGE_SIZE) );
1960 else
1961 pinned_pages->pages[i] =
1962 virt_to_page((char *)addr + (i * PAGE_SIZE) );
1963 pinned_pages->num_pages[i] = 1;
1964 pinned_pages->nr_contig_chunks++;
1965 }
1966 pinned_pages->nr_pages = nr_pages;
1967 pinned_pages->map_flags = SCIF_MAP_KERNEL;
1968 } else {
1969 if (prot == SCIF_PROT_READ)
1970 try_upgrade = true;
1971 prot |= SCIF_PROT_WRITE;
1972retry:
1973 mm = current->mm;
1974 down_write(&mm->mmap_sem);
1975 if (ulimit) {
1976 err = __scif_check_inc_pinned_vm(mm, nr_pages);
1977 if (err) {
1978 up_write(&mm->mmap_sem);
1979 pinned_pages->nr_pages = 0;
1980 goto error_unmap;
1981 }
1982 }
1983
1984 pinned_pages->nr_pages = get_user_pages(
1985 current,
1986 mm,
1987 (uint64_t)addr,
1988 nr_pages,
1989 !!(prot & SCIF_PROT_WRITE),
1990 0,
1991 pinned_pages->pages,
1992 pinned_pages->vma);
1993 up_write(&mm->mmap_sem);
1994 if (nr_pages == pinned_pages->nr_pages) {
1995#ifdef RMA_DEBUG
1996 atomic_long_add_return(nr_pages, &ms_info.rma_pin_cnt);
1997#endif
1998 micscif_detect_large_page(pinned_pages, addr);
1999 } else {
2000 if (try_upgrade) {
2001 if (ulimit)
2002 __scif_dec_pinned_vm_lock(mm, nr_pages, 0);
2003#ifdef RMA_DEBUG
2004 WARN_ON(atomic_long_sub_return(1,
2005 &ms_info.rma_mm_cnt) < 0);
2006#endif
2007 /* Roll back any pinned pages */
2008 for (i = 0; i < pinned_pages->nr_pages; i++) {
2009 if (pinned_pages->pages[i])
2010 page_cache_release(pinned_pages->pages[i]);
2011 }
2012 prot &= ~SCIF_PROT_WRITE;
2013 try_upgrade = false;
2014 goto retry;
2015 }
2016 }
2017 pinned_pages->map_flags = 0;
2018 }
2019
2020 if (pinned_pages->nr_pages < nr_pages) {
2021 err = -EFAULT;
2022 pinned_pages->nr_pages = nr_pages;
2023 goto dec_pinned;
2024 }
2025
2026 *out_prot = prot;
2027 atomic_set(&pinned_pages->ref_count, nr_pages);
2028 *pages = pinned_pages;
2029 return err;
2030dec_pinned:
2031 if (ulimit)
2032 __scif_dec_pinned_vm_lock(mm, nr_pages, 0);
2033 /* Something went wrong! Rollback */
2034error_unmap:
2035 pinned_pages->nr_pages = nr_pages;
2036 micscif_destroy_pinned_pages(pinned_pages);
2037 *pages = NULL;
2038 pr_debug("%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
2039 return err;
2040
2041}
2042
2043/**
2044 * scif_pin_pages - scif_pin_pages() pins the physical pages which back
2045 * the range of virtual address pages starting at addr and continuing for
2046 * len bytes. addr and len are constrained to be multiples of the page size.
 * A successful scif_pin_pages() call returns an opaque pointer value
 * which may be used in subsequent calls to scif_register_pinned_pages().
 *
 * Return Values
 * Upon successful completion, scif_pin_pages() returns 0 and the
 * scif_pinned_pages_t value is returned by reference via pages; else
 * an apt error is returned as documented in scif.h
2054 */
2055int
2056scif_pin_pages(void *addr, size_t len, int prot,
2057 int map_flags, scif_pinned_pages_t *pages)
2058{
2059 return __scif_pin_pages(addr, len, &prot, map_flags, pages);
2060}
2061EXPORT_SYMBOL(scif_pin_pages);
2062
2063/**
2064 * scif_unpin_pages: Unpin a set of pages
2065 *
2066 * Return Values:
2067 * Upon successful completion, scif_unpin_pages() returns 0;
2068 * else an apt error is returned as documented in scif.h
2069 */
2070int
2071scif_unpin_pages(scif_pinned_pages_t pinned_pages)
2072{
2073 int err = 0, ret;
2074
2075 if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic)
2076 return -EINVAL;
2077
2078 ret = atomic_sub_return((int32_t)pinned_pages->nr_pages,
2079 &pinned_pages->ref_count);
2080 BUG_ON(ret < 0);
2081
2082 /*
2083 * Destroy the window if the ref count for this set of pinned
2084 * pages has dropped to zero. If it is positive then there is
2085 * a valid registered window which is backed by these pages and
2086 * it will be destroyed once all such windows are unregistered.
2087 */
2088 if (!ret)
2089 err = micscif_destroy_pinned_pages(pinned_pages);
2090
2091 return err;
2092}
2093EXPORT_SYMBOL(scif_unpin_pages);
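
/*
 * Pin/unpin lifecycle sketch (illustrative only): pin a page-aligned
 * kernel buffer, then drop the pin. In real use the pinned_pages handle
 * would be passed to scif_register_pinned_pages() between the two calls.
 */
#ifdef SCIF_API_EXAMPLES
static int example_pin_cycle(void *kbuf, size_t len)
{
	scif_pinned_pages_t pp;
	int err;

	/* kbuf and len must both be multiples of PAGE_SIZE */
	err = scif_pin_pages(kbuf, len, SCIF_PROT_READ | SCIF_PROT_WRITE,
			     SCIF_MAP_KERNEL, &pp);
	if (err)
		return err;
	return scif_unpin_pages(pp);
}
#endif /* SCIF_API_EXAMPLES */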
2094
2095/**
2096 * scif_register_pinned_pages: Mark a memory region for remote access.
2097 *
2098 * The scif_register_pinned_pages() function opens a window, a range
2099 * of whole pages of the registered address space of the endpoint epd,
2100 * starting at offset po. The value of po, further described below, is
2101 * a function of the parameters offset and pinned_pages, and the value
2102 * of map_flags. Each page of the window represents a corresponding
2103 * physical memory page of pinned_pages; the length of the window is
 * the same as the length of pinned_pages. A successful
 * scif_register_pinned_pages() call returns po as the return value.
2106 *
2107 * Return Values
2108 * Upon successful completion, scif_register_pinned_pages() returns
2109 * the offset at which the mapping was placed (po);
2110 * else an apt error is returned as documented in scif.h
2111 */
2112off_t
2113__scif_register_pinned_pages(scif_epd_t epd,
2114 scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
2115{
2116 struct endpt *ep = (struct endpt *)epd;
2117 uint64_t computed_offset;
2118 struct reg_range_t *window;
2119 int err;
2120 size_t len;
2121
2122#ifdef DEBUG
2123 /* Bad EP */
2124 if (!ep || !pinned_pages || pinned_pages->magic != SCIFEP_MAGIC)
2125 return -EINVAL;
2126#endif
2127 /* Unsupported flags */
2128 if (map_flags & ~SCIF_MAP_FIXED)
2129 return -EINVAL;
2130
2131 len = pinned_pages->nr_pages << PAGE_SHIFT;
2132
2133 /*
2134 * Offset is not page aligned/negative or offset+len
2135 * wraps around with SCIF_MAP_FIXED.
2136 */
2137 if ((map_flags & SCIF_MAP_FIXED) &&
2138 ((align_low(offset, PAGE_SIZE) != offset) ||
2139 (offset < 0) ||
2140 (offset + (off_t)len < offset)))
2141 return -EINVAL;
2142
2143 might_sleep();
2144
2145 if ((err = verify_epd(ep)))
2146 return err;
2147
2148 /* Compute the offset for this registration */
2149 if ((err = micscif_get_window_offset(ep, map_flags, offset,
2150 len, &computed_offset)))
2151 return err;
2152
2153 /* Allocate and prepare self registration window */
2154 if (!(window = micscif_create_window(ep, pinned_pages->nr_pages,
2155 computed_offset, false))) {
2156 micscif_free_window_offset(ep, computed_offset, len);
2157 return -ENOMEM;
2158 }
2159
2160 window->pinned_pages = pinned_pages;
2161 window->nr_pages = pinned_pages->nr_pages;
2162 window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
2163 window->prot = pinned_pages->prot;
2164
2165 /*
2166 * This set of pinned pages now belongs to this window as well.
2167 * Assert if the ref count is zero since it is an error to
2168 * pass pinned_pages to scif_register_pinned_pages() after
2169 * calling scif_unpin_pages().
2170 */
2171 if (!atomic_add_unless(&pinned_pages->ref_count,
2172 (int32_t)pinned_pages->nr_pages, 0))
2173 BUG_ON(1);
2174
2175 micscif_inc_node_refcnt(ep->remote_dev, 1);
2176
2177 if ((err = micscif_send_alloc_request(ep, window))) {
2178 micscif_dec_node_refcnt(ep->remote_dev, 1);
2179 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2180 goto error_unmap;
2181 }
2182
2183 /* Prepare the remote registration window */
2184 if ((err = micscif_prep_remote_window(ep, window))) {
2185 micscif_dec_node_refcnt(ep->remote_dev, 1);
2186 micscif_set_nr_pages(ep->remote_dev, window);
2187 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2188 goto error_unmap;
2189 }
2190
2191 /* Tell the peer about the new window */
2192 if ((err = micscif_send_scif_register(ep, window))) {
2193 micscif_dec_node_refcnt(ep->remote_dev, 1);
2194 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2195 goto error_unmap;
2196 }
2197
2198 micscif_dec_node_refcnt(ep->remote_dev, 1);
2199
2200 /* No further failures expected. Insert new window */
2201 mutex_lock(&ep->rma_info.rma_lock);
2202 set_window_ref_count(window, pinned_pages->nr_pages);
2203 micscif_insert_window(window, &ep->rma_info.reg_list);
2204 mutex_unlock(&ep->rma_info.rma_lock);
2205
2206 return computed_offset;
2207error_unmap:
2208 micscif_destroy_window(ep, window);
2209 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2210 return err;
2211}
2212
2213off_t
2214scif_register_pinned_pages(scif_epd_t epd,
2215 scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
2216{
2217 off_t ret;
2218 get_kref_count(epd);
2219 ret = __scif_register_pinned_pages(epd, pinned_pages, offset, map_flags);
2220 put_kref_count(epd);
2221 return ret;
2222}
2223EXPORT_SYMBOL(scif_register_pinned_pages);
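
/*
 * Sketch (illustrative only): expose an already pinned set of pages as
 * a remote-accessible window at a caller-chosen offset. SCIF_MAP_FIXED
 * asks for the window to be placed exactly at "where".
 */
#ifdef SCIF_API_EXAMPLES
static off_t example_register_pinned(scif_epd_t epd,
				     scif_pinned_pages_t pp, off_t where)
{
	off_t po = scif_register_pinned_pages(epd, pp, where, SCIF_MAP_FIXED);

	if (po < 0)
		printk(KERN_ERR "example: register_pinned_pages failed %ld\n",
		       (long)po);
	return po;
}
#endif /* SCIF_API_EXAMPLES */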
2224
2225/**
2226 * scif_get_pages - Add references to remote registered pages
2227 *
2228 * scif_get_pages() returns the addresses of the physical pages represented
2229 * by those pages of the registered address space of the peer of epd, starting
2230 * at offset offset and continuing for len bytes. offset and len are constrained
2231 * to be multiples of the page size.
2232 *
2233 * Return Values
2234 * Upon successful completion, scif_get_pages() returns 0;
2235 * else an apt error is returned as documented in scif.h.
2236 */
2237int
2238__scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
2239{
2240 struct endpt *ep = (struct endpt *)epd;
2241 struct micscif_rma_req req;
2242 struct reg_range_t *window = NULL;
2243 int nr_pages, err, i;
2244
2245 pr_debug("SCIFAPI get_pinned_pages: ep %p %s offset 0x%lx len 0x%lx\n",
2246 ep, scif_ep_states[ep->state], offset, len);
2247
2248 if ((err = verify_epd(ep)))
2249 return err;
2250
2251 if ((!len) ||
2252 (offset < 0) ||
2253 (offset + len < offset) ||
2254 (align_low((uint64_t)offset, PAGE_SIZE) != (uint64_t)offset) ||
2255 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
2256 return -EINVAL;
2257
2258 nr_pages = len >> PAGE_SHIFT;
2259
2260 req.out_window = &window;
2261 req.offset = offset;
2262 req.prot = 0;
2263 req.nr_bytes = len;
2264 req.type = WINDOW_SINGLE;
2265 req.head = &ep->rma_info.remote_reg_list;
2266
2267 mutex_lock(&ep->rma_info.rma_lock);
2268 /* Does a valid window exist? */
2269 if ((err = micscif_query_window(&req))) {
2270 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2271 goto error;
2272 }
2273 RMA_MAGIC(window);
2274
2275 /* Allocate scif_range */
2276 if (!(*pages = kzalloc(sizeof(struct scif_range), GFP_KERNEL))) {
2277 err = -ENOMEM;
2278 goto error;
2279 }
2280
2281 /* Allocate phys addr array */
2282 if (!((*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)))) {
2283 err = -ENOMEM;
2284 goto error;
2285 }
2286
2287#ifndef _MIC_SCIF_
2288 /* Allocate virtual address array */
2289 if (!((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)))) {
2290 err = -ENOMEM;
2291 goto error;
2292 }
2293#endif
2294 /* Populate the values */
2295 (*pages)->cookie = window;
2296 (*pages)->nr_pages = nr_pages;
2297 (*pages)->prot_flags = window->prot;
2298
2299 for (i = 0; i < nr_pages; i++) {
2300 (*pages)->phys_addr[i] =
2301#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
2302 is_self_scifdev(ep->remote_dev) ?
2303 micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
2304 NULL, NULL, NULL) : window->phys_addr[i];
2305#else
2306 get_phys_addr(micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
2307 NULL, NULL, NULL), ep->remote_dev);
2308#endif
2309#ifndef _MIC_SCIF_
2310 if (!is_self_scifdev(ep->remote_dev))
2311 (*pages)->va[i] =
2312 get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.va +
2313 (*pages)->phys_addr[i] -
2314 get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.pa;
2315#endif
2316 }
2317
2318 window->get_put_ref_count += nr_pages;
2319 get_window_ref_count(window, nr_pages);
2320error:
2321 mutex_unlock(&ep->rma_info.rma_lock);
2322 if (err) {
2323 if (*pages) {
2324 if ((*pages)->phys_addr)
2325 scif_free((*pages)->phys_addr, nr_pages * sizeof(dma_addr_t));
2326#ifndef _MIC_SCIF_
2327 if ((*pages)->va)
2328 scif_free((*pages)->va, nr_pages * sizeof(void *));
2329#endif
2330 kfree(*pages);
2331 *pages = NULL;
2332 }
2333 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2334 } else {
2335 micscif_create_node_dep(ep->remote_dev, nr_pages);
2336 }
2337 return err;
2338}
2339
2340int
2341scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
2342{
2343 int ret;
2344 get_kref_count(epd);
2345 ret = __scif_get_pages(epd, offset, len, pages);
2346 put_kref_count(epd);
2347 return ret;
2348}
2349EXPORT_SYMBOL(scif_get_pages);
2350
2351/**
2352 * scif_put_pages - Remove references from remote registered pages
2353 *
 * scif_put_pages() releases a scif_range structure previously obtained by
 * calling scif_get_pages(). When control returns, the physical pages may
2356 * become available for reuse if and when the window which represented
2357 * those pages is unregistered. Therefore, those pages must never be accessed.
2358 *
2359 * Return Values
2360 * Upon success, zero is returned.
2361 * else an apt error is returned as documented in scif.h.
2362 */
2363int
2364__scif_put_pages(struct scif_range *pages)
2365{
2366 struct endpt *ep;
2367 struct reg_range_t *window;
2368 struct nodemsg msg;
2369
2370 if (!pages || !pages->cookie)
2371 return -EINVAL;
2372
2373 window = pages->cookie;
2374
2375 if (!window || window->magic != SCIFEP_MAGIC ||
2376 !window->get_put_ref_count)
2377 return -EINVAL;
2378
2379 ep = (struct endpt *)window->ep;
2380
2381 /*
2382 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
2383 * callee should be allowed to release references to the pages,
2384 * else the endpoint was not connected in the first place,
2385 * hence the ENOTCONN.
2386 */
2387 if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
2388 return -ENOTCONN;
2389
2390 /*
2391 * TODO: Re-enable this check once ref counts for kernel mode APIs
2392 * have been implemented and node remove call backs are called before
2393 * the node is removed. This check results in kernel mode APIs not
2394 * being able to release pages correctly since node remove callbacks
2395 * are called after the node is removed currently.
2396 * if (!scifdev_alive(ep))
2397 * return -ENODEV;
2398 */
2399
2400 micscif_inc_node_refcnt(ep->remote_dev, 1);
2401 mutex_lock(&ep->rma_info.rma_lock);
2402
2403 /* Decrement the ref counts and check for errors */
2404 window->get_put_ref_count -= pages->nr_pages;
2405 BUG_ON(window->get_put_ref_count < 0);
2406 put_window_ref_count(window, pages->nr_pages);
2407
2408 /* Initiate window destruction if ref count is zero */
2409 if (!window->ref_count) {
2410 drain_dma_intr(ep->rma_info.dma_chan);
2411 /* Inform the peer about this window being destroyed. */
2412 msg.uop = SCIF_MUNMAP;
2413 msg.src = ep->port;
2414 msg.payload[0] = window->peer_window;
2415 /* No error handling for notification messages */
2416 micscif_nodeqp_send(ep->remote_dev, &msg, ep);
2417 list_del(&window->list_member);
2418 /* Destroy this window from the peer's registered AS */
2419 micscif_destroy_remote_window(ep, window);
2420 }
2421 mutex_unlock(&ep->rma_info.rma_lock);
2422
2423 micscif_dec_node_refcnt(ep->remote_dev, 1);
2424 micscif_destroy_node_dep(ep->remote_dev, pages->nr_pages);
2425 scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
2426#ifndef _MIC_SCIF_
2427 scif_free(pages->va, pages->nr_pages * sizeof(void*));
2428#endif
2429 kfree(pages);
2430 return 0;
2431}
2432
2433int
2434scif_put_pages(struct scif_range *pages)
2435{
	int ret;
	struct reg_range_t *window;
	struct endpt *ep;

	/* Validate before dereferencing; __scif_put_pages() re-checks */
	if (!pages || !pages->cookie)
		return -EINVAL;

	window = pages->cookie;
	ep = (struct endpt *)window->ep;
	if (atomic_read(&ep->ref_count.refcount) > 0)
		kref_get(&ep->ref_count);
	else
		WARN_ON(1);
	ret = __scif_put_pages(pages);
	if (atomic_read(&ep->ref_count.refcount) > 0)
		kref_put(&ep->ref_count, scif_ref_rel);
	/* else: ref already gone; warning intentionally disabled */
2450 return ret;
2451}
2452EXPORT_SYMBOL(scif_put_pages);
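
/*
 * Get/put pairing sketch (illustrative only): take references on a
 * page-aligned slice of the peer's registered address space, inspect
 * the physical addresses, then release the references.
 */
#ifdef SCIF_API_EXAMPLES
static int example_peek_remote_pages(scif_epd_t epd, off_t off, size_t len)
{
	struct scif_range *range;
	int err = scif_get_pages(epd, off, len, &range);

	if (err)
		return err;
	printk(KERN_INFO "example: %d page(s), first at 0x%llx\n",
	       range->nr_pages, (unsigned long long)range->phys_addr[0]);
	return scif_put_pages(range);
}
#endif /* SCIF_API_EXAMPLES */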
2453
2454int scif_event_register(scif_callback_t handler)
2455{
2456 /* Add to the list of event handlers */
2457 struct scif_callback *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
2458 if (!cb)
2459 return -ENOMEM;
2460 mutex_lock(&ms_info.mi_event_cblock);
2461 cb->callback_handler = handler;
2462 list_add_tail(&cb->list_member, &ms_info.mi_event_cb);
2463 mutex_unlock(&ms_info.mi_event_cblock);
2464 return 0;
2465}
2466EXPORT_SYMBOL(scif_event_register);
2467
2468int scif_event_unregister(scif_callback_t handler)
2469{
2470 struct list_head *pos, *unused;
2471 struct scif_callback *temp;
2472 int err = -EINVAL;
2473
2474 mutex_lock(&ms_info.mi_event_cblock);
2475 list_for_each_safe(pos, unused, &ms_info.mi_event_cb) {
2476 temp = list_entry(pos, struct scif_callback, list_member);
2477 if (temp->callback_handler == handler) {
2478 err = 0;
2479 list_del(pos);
2480 kfree(temp);
2481 break;
2482 }
2483 }
2484
2485 mutex_unlock(&ms_info.mi_event_cblock);
2486 return err;
2487}
2488EXPORT_SYMBOL(scif_event_unregister);
2489
2490/**
2491 * scif_register - Mark a memory region for remote access.
2492 * @epd: endpoint descriptor
2493 * @addr: starting virtual address
2494 * @len: length of range
2495 * @offset: offset of window
2496 * @prot: read/write protection
2497 * @map_flags: flags
2498 *
2499 * Return Values
2500 * Upon successful completion, scif_register() returns the offset
2501 * at which the mapping was placed else an apt error is returned
2502 * as documented in scif.h.
2503 */
2504off_t
2505__scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
2506 int prot, int map_flags)
2507{
2508 scif_pinned_pages_t pinned_pages;
2509 off_t err;
2510 struct endpt *ep = (struct endpt *)epd;
2511 uint64_t computed_offset;
2512 struct reg_range_t *window;
2513 struct mm_struct *mm = NULL;
2514
2515 pr_debug("SCIFAPI register: ep %p %s addr %p len 0x%lx"
2516 " offset 0x%lx prot 0x%x map_flags 0x%x\n",
2517 epd, scif_ep_states[epd->state], addr, len, offset, prot, map_flags);
2518
2519 /* Unsupported flags */
2520 if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL))
2521 return -EINVAL;
2522
2523 /* Unsupported protection requested */
2524 if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
2525 return -EINVAL;
2526
2527 /* addr/len must be page aligned. len should be non zero */
2528 if ((!len) ||
2529 (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
2530 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
2531 return -EINVAL;
2532
2533 /*
2534 * Offset is not page aligned/negative or offset+len
2535 * wraps around with SCIF_MAP_FIXED.
2536 */
2537 if ((map_flags & SCIF_MAP_FIXED) &&
2538 ((align_low(offset, PAGE_SIZE) != offset) ||
2539 (offset < 0) ||
2540 (offset + (off_t)len < offset)))
2541 return -EINVAL;
2542
2543
2544 might_sleep();
2545
2546#ifdef DEBUG
2547 /* Bad EP */
2548 if (!ep)
2549 return -EINVAL;
2550#endif
2551
2552 if ((err = verify_epd(ep)))
2553 return err;
2554
2555 /* Compute the offset for this registration */
2556 if ((err = micscif_get_window_offset(ep, map_flags, offset,
2557 len, &computed_offset)))
2558 return err;
2559
2560 /* Allocate and prepare self registration window */
2561 if (!(window = micscif_create_window(ep, len >> PAGE_SHIFT,
2562 computed_offset, false))) {
2563 micscif_free_window_offset(ep, computed_offset, len);
2564 return -ENOMEM;
2565 }
2566
2567 micscif_inc_node_refcnt(ep->remote_dev, 1);
2568
2569 window->nr_pages = len >> PAGE_SHIFT;
2570
2571 if ((err = micscif_send_alloc_request(ep, window))) {
2572 micscif_destroy_incomplete_window(ep, window);
2573 micscif_dec_node_refcnt(ep->remote_dev, 1);
2574 return err;
2575 }
2576
2577 if (!(map_flags & SCIF_MAP_KERNEL)) {
2578 mm = __scif_acquire_mm();
2579 map_flags |= SCIF_MAP_ULIMIT;
2580 }
2581 /* Pin down the pages */
2582 if ((err = scif_pin_pages(addr, len, prot,
2583 map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT),
2584 &pinned_pages))) {
2585 micscif_destroy_incomplete_window(ep, window);
2586 micscif_dec_node_refcnt(ep->remote_dev, 1);
2587 __scif_release_mm(mm);
2588 goto error;
2589 }
2590
2591 window->pinned_pages = pinned_pages;
2592 window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
2593 window->prot = pinned_pages->prot;
2594 window->mm = mm;
2595
2596 /* Prepare the remote registration window */
2597 if ((err = micscif_prep_remote_window(ep, window))) {
2598 micscif_dec_node_refcnt(ep->remote_dev, 1);
2599 micscif_set_nr_pages(ep->remote_dev, window);
2600 printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
2601 goto error_unmap;
2602 }
2603
2604 /* Tell the peer about the new window */
2605 if ((err = micscif_send_scif_register(ep, window))) {
2606 micscif_dec_node_refcnt(ep->remote_dev, 1);
2607 printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
2608 goto error_unmap;
2609 }
2610
2611 micscif_dec_node_refcnt(ep->remote_dev, 1);
2612
2613 /* No further failures expected. Insert new window */
2614 mutex_lock(&ep->rma_info.rma_lock);
2615 set_window_ref_count(window, pinned_pages->nr_pages);
2616 micscif_insert_window(window, &ep->rma_info.reg_list);
2617 mutex_unlock(&ep->rma_info.rma_lock);
2618
2619 pr_debug("SCIFAPI register: ep %p %s addr %p"
2620 " len 0x%lx computed_offset 0x%llx\n",
2621 epd, scif_ep_states[epd->state], addr, len, computed_offset);
2622 return computed_offset;
2623error_unmap:
2624 micscif_destroy_window(ep, window);
2625error:
2626 printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
2627 return err;
2628}
2629
2630off_t
2631scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
2632 int prot, int map_flags)
2633{
2634 off_t ret;
2635 get_kref_count(epd);
2636 ret = __scif_register(epd, addr, len, offset, prot, map_flags);
2637 put_kref_count(epd);
2638 return ret;
2639}
2640EXPORT_SYMBOL(scif_register);
2641
2642/**
2643 * scif_unregister - Release a memory region registered for remote access.
2644 * @epd: endpoint descriptor
2645 * @offset: start of range to unregister
2646 * @len: length of range to unregister
2647 *
2648 * Return Values
 * Upon successful completion, scif_unregister() returns zero
2650 * else an apt error is returned as documented in scif.h.
2651 */
2652int
2653__scif_unregister(scif_epd_t epd, off_t offset, size_t len)
2654{
2655 struct endpt *ep = (struct endpt *)epd;
2656 struct reg_range_t *window = NULL;
2657 struct micscif_rma_req req;
2658 int nr_pages, err;
2659
2660 pr_debug("SCIFAPI unregister: ep %p %s offset 0x%lx len 0x%lx\n",
2661 ep, scif_ep_states[ep->state], offset, len);
2662
2663 /* len must be page aligned. len should be non zero */
2664 if ((!len) ||
2665 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
2666 return -EINVAL;
2667
2668 /* Offset is not page aligned or offset+len wraps around */
2669 if ((align_low(offset, PAGE_SIZE) != offset) ||
2670 (offset + (off_t)len < offset))
2671 return -EINVAL;
2672
2673 if ((err = verify_epd(ep)))
2674 return err;
2675
2676 might_sleep();
2677 nr_pages = (int)(len >> PAGE_SHIFT);
2678
2679 req.out_window = &window;
2680 req.offset = offset;
2681 req.prot = 0;
2682 req.nr_bytes = len;
2683 req.type = WINDOW_FULL;
2684 req.head = &ep->rma_info.reg_list;
2685
2686 micscif_inc_node_refcnt(ep->remote_dev, 1);
2687 mutex_lock(&ep->rma_info.rma_lock);
2688 /* Does a valid window exist? */
2689 if ((err = micscif_query_window(&req))) {
2690 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2691 goto error;
2692 }
2693 /* Unregister all the windows in this range */
2694 if ((err = micscif_rma_list_unregister(window, offset, nr_pages)))
2695 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2696error:
2697 mutex_unlock(&ep->rma_info.rma_lock);
2698 micscif_dec_node_refcnt(ep->remote_dev, 1);
2699 return err;
2700}
2701
2702int
2703scif_unregister(scif_epd_t epd, off_t offset, size_t len)
2704{
2705 int ret;
2706 get_kref_count(epd);
2707 ret = __scif_unregister(epd, offset, len);
2708 put_kref_count(epd);
2709 return ret;
2710}
2711EXPORT_SYMBOL(scif_unregister);
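
/*
 * Register/unregister round trip sketch (illustrative only): publish a
 * page-aligned kernel buffer as a window, then tear it down. Offset 0
 * without SCIF_MAP_FIXED lets the implementation pick the offset.
 */
#ifdef SCIF_API_EXAMPLES
static int example_register_cycle(scif_epd_t epd, void *kbuf, size_t len)
{
	off_t off = scif_register(epd, kbuf, len, 0,
				  SCIF_PROT_READ | SCIF_PROT_WRITE,
				  SCIF_MAP_KERNEL);

	if (off < 0)
		return (int)off;
	/* The peer can now mmap or RMA against this window */
	return scif_unregister(epd, off, len);
}
#endif /* SCIF_API_EXAMPLES */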
2712
2713unsigned int scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd)
2714{
2715 unsigned int ret;
2716 get_kref_count(epd);
2717 ret = __scif_pollfd(f, wait, (struct endpt *)epd);
2718 put_kref_count(epd);
2719 return ret;
2720}
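
/*
 * Sketch (illustrative only): a driver's poll() file operation can
 * simply delegate to scif_pollfd(). "example_epd_of()" is a
 * hypothetical helper standing in for however the driver recovers the
 * endpoint from its file private data.
 */
#ifdef SCIF_API_EXAMPLES
static unsigned int example_driver_poll(struct file *f, poll_table *wait)
{
	scif_epd_t epd = example_epd_of(f); /* hypothetical lookup */

	return scif_pollfd(f, wait, epd);
}
#endif /* SCIF_API_EXAMPLES */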
2721
2722unsigned int __scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep)
2723{
2724 unsigned int mask = 0;
2725 unsigned long sflags;
2726
2727 pr_debug("SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]);
2728
2729 micscif_inc_node_refcnt(ep->remote_dev, 1);
2730 spin_lock_irqsave(&ep->lock, sflags);
2731
2732 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
2733#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2734 if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
2735#else
2736 if (!wait || wait->key & SCIF_POLLOUT) {
2737#endif
2738 poll_wait(f, &ep->conn_pend_wq, wait);
2739 if (ep->state == SCIFEP_CONNECTED ||
2740 ep->state == SCIFEP_DISCONNECTED ||
2741 ep->conn_err) {
2742 mask |= SCIF_POLLOUT;
2743 }
2744 goto return_scif_poll;
2745 }
2746 }
2747
	/* TODO: is it OK to use wait->key here? */
2749 if (ep->state == SCIFEP_LISTENING) {
2750#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2751 if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
2752#else
2753 if (!wait || wait->key & SCIF_POLLIN) {
2754#endif
2755 spin_unlock_irqrestore(&ep->lock, sflags);
2756 poll_wait(f, &ep->conwq, wait);
2757 spin_lock_irqsave(&ep->lock, sflags);
2758 if (ep->conreqcnt)
2759 mask |= SCIF_POLLIN;
2760 } else {
2761 mask |= SCIF_POLLERR;
2762 }
2763 goto return_scif_poll;
2764 }
2765
2766#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2767 if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
2768#else
2769 if (!wait || wait->key & SCIF_POLLIN) {
2770#endif
2771 if (ep->state != SCIFEP_CONNECTED &&
2772 ep->state != SCIFEP_LISTENING &&
2773 ep->state != SCIFEP_DISCONNECTED) {
2774 mask |= SCIF_POLLERR;
2775 goto return_scif_poll;
2776 }
2777
2778 spin_unlock_irqrestore(&ep->lock, sflags);
2779 poll_wait(f, &ep->recvwq, wait);
2780 spin_lock_irqsave(&ep->lock, sflags);
2781 if (micscif_rb_count(&ep->qp_info.qp->inbound_q, 1))
2782 mask |= SCIF_POLLIN;
2783 }
2784
2785#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2786 if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
2787#else
2788 if (!wait || wait->key & SCIF_POLLOUT) {
2789#endif
2790 if (ep->state != SCIFEP_CONNECTED &&
2791 ep->state != SCIFEP_LISTENING) {
2792 mask |= SCIF_POLLERR;
2793 goto return_scif_poll;
2794 }
2795
2796 spin_unlock_irqrestore(&ep->lock, sflags);
2797 poll_wait(f, &ep->sendwq, wait);
2798 spin_lock_irqsave(&ep->lock, sflags);
2799 if (micscif_rb_space(&ep->qp_info.qp->outbound_q))
2800 mask |= SCIF_POLLOUT;
2801 }
2802
2803return_scif_poll:
	/* If the endpoint is in the disconnected state then return hangup instead of error */
2805 if (ep->state == SCIFEP_DISCONNECTED) {
2806 mask &= ~SCIF_POLLERR;
2807 mask |= SCIF_POLLHUP;
2808 }
2809
2810 spin_unlock_irqrestore(&ep->lock, sflags);
2811 micscif_dec_node_refcnt(ep->remote_dev, 1);
2812 return mask;
2813}
2814
2815/*
2816 * The private data field of each VMA used to mmap a remote window
2817 * points to an instance of struct vma_pvt
2818 */
2819struct vma_pvt {
2820 struct endpt *ep; /* End point for remote window */
2821 uint64_t offset; /* offset within remote window */
2822 bool valid_offset; /* offset is valid only if the original
2823 * mmap request was for a single page
2824 * else the offset within the vma is
2825 * the correct offset
2826 */
2827 struct kref ref;
2828};
2829
2830static void vma_pvt_release(struct kref *ref)
2831{
2832 struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
2833 kfree(vmapvt);
2834}
2835
2836/**
2837 * scif_vma_open - VMA open driver callback
2838 * @vma: VMM memory area.
2839 * The open method is called by the kernel to allow the subsystem implementing
2840 * the VMA to initialize the area. This method is invoked any time a new
2841 * reference to the VMA is made (when a process forks, for example).
2842 * The one exception happens when the VMA is first created by mmap;
2843 * in this case, the driver's mmap method is called instead.
2844 * This function is also invoked when an existing VMA is split by the kernel
2845 * due to a call to munmap on a subset of the VMA resulting in two VMAs.
2846 * The kernel invokes this function only on one of the two VMAs.
2847 *
2848 * Return Values: None.
2849 */
2850static void scif_vma_open(struct vm_area_struct *vma)
2851{
2852 struct vma_pvt *vmapvt = ((vma)->vm_private_data);
2853 pr_debug("SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
2854 ((vma)->vm_start), ((vma)->vm_end));
2855 kref_get(&vmapvt->ref);
2856}
2857
2858/**
2859 * scif_munmap - VMA close driver callback.
2860 * @vma: VMM memory area.
2861 * When an area is destroyed, the kernel calls its close operation.
 * Note that there's no usage count associated with VMAs; the area
2863 * is opened and closed exactly once by each process that uses it.
2864 *
2865 * Return Values: None.
2866 */
2867void scif_munmap(struct vm_area_struct *vma)
2868{
2869 struct endpt *ep;
2870 struct vma_pvt *vmapvt = ((vma)->vm_private_data);
2871 int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT );
2872 uint64_t offset;
2873 struct micscif_rma_req req;
2874 struct reg_range_t *window = NULL;
2875 int err;
2876
2877 might_sleep();
2878 pr_debug("SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
2879 ((vma)->vm_start), ((vma)->vm_end));
2880 /* used to be a BUG_ON(), prefer keeping the kernel alive */
2881 if (!vmapvt) {
2882 WARN_ON(1);
2883 printk(KERN_ERR "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
2884 ((vma)->vm_start), ((vma)->vm_end));
2885 return;
2886 }
2887
2888 ep = vmapvt->ep;
2889 offset = vmapvt->valid_offset ? vmapvt->offset :
2890 ((vma)->vm_pgoff) << PAGE_SHIFT;
2891 pr_debug("SCIFAPI munmap: ep %p %s nr_pages 0x%x offset 0x%llx\n",
2892 ep, scif_ep_states[ep->state], nr_pages, offset);
2893
2894 req.out_window = &window;
2895 req.offset = offset;
2896 req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start);
2897 req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE);
2898 req.type = WINDOW_PARTIAL;
2899 req.head = &ep->rma_info.remote_reg_list;
2900
2901 micscif_inc_node_refcnt(ep->remote_dev, 1);
2902 mutex_lock(&ep->rma_info.rma_lock);
2903
2904 if ((err = micscif_query_window(&req)))
2905 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2906 else
2907 micscif_rma_list_munmap(window, offset, nr_pages);
2908
2909 mutex_unlock(&ep->rma_info.rma_lock);
2910 micscif_dec_node_refcnt(ep->remote_dev, 1);
2911
2912 micscif_destroy_node_dep(ep->remote_dev, nr_pages);
2913
2914 /*
2915 * The kernel probably zeroes these out but we still want
2916 * to clean up our own mess just in case.
2917 */
2918 vma->vm_ops = NULL;
2919 ((vma)->vm_private_data) = NULL;
2920 kref_put(&vmapvt->ref, vma_pvt_release);
2921 micscif_rma_put_task(ep, nr_pages);
2922}
2923
2924static const struct vm_operations_struct micscif_vm_ops = {
2925 .open = scif_vma_open,
2926 .close = scif_munmap,
2927};
2928
2929/**
2930 * scif_mmap - Map pages in virtual address space to a remote window.
2931 * @vma: VMM memory area.
2932 * @epd: endpoint descriptor
2933 *
2934 * Return Values
2935 * Upon successful completion, scif_mmap() returns zero
2936 * else an apt error is returned as documented in scif.h.
2937 */
2938int
2939scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
2940{
2941 struct micscif_rma_req req;
2942 struct reg_range_t *window = NULL;
2943 struct endpt *ep = (struct endpt *)epd;
2944 uint64_t start_offset = ((vma)->vm_pgoff) << PAGE_SHIFT;
2945 int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT);
2946 int err;
2947 struct vma_pvt *vmapvt;
2948
2949 pr_debug("SCIFAPI mmap: ep %p %s start_offset 0x%llx nr_pages 0x%x\n",
2950 ep, scif_ep_states[ep->state], start_offset, nr_pages);
2951
2952 if ((err = verify_epd(ep)))
2953 return err;
2954
2955 might_sleep();
2956
2957 if ((err = micscif_rma_get_task(ep, nr_pages)))
2958 return err;
2959
2960 if (!(vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL))) {
2961 micscif_rma_put_task(ep, nr_pages);
2962 return -ENOMEM;
2963 }
2964
2965 vmapvt->ep = ep;
2966 kref_init(&vmapvt->ref);
2967
2968 micscif_create_node_dep(ep->remote_dev, nr_pages);
2969
2970 req.out_window = &window;
2971 req.offset = start_offset;
2972 req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start);
2973 req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE);
2974 req.type = WINDOW_PARTIAL;
2975 req.head = &ep->rma_info.remote_reg_list;
2976
2977 micscif_inc_node_refcnt(ep->remote_dev, 1);
2978 mutex_lock(&ep->rma_info.rma_lock);
2979 /* Does a valid window exist? */
2980 if ((err = micscif_query_window(&req))) {
2981 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2982 goto error;
2983 }
2984 RMA_MAGIC(window);
2985
2986 /* Default prot for loopback */
2987 if (!is_self_scifdev(ep->remote_dev)) {
2988#ifdef _MIC_SCIF_
2989 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
2990#else
2991 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
2992#endif
2993 }
2994
2995 /*
2996 * VM_DONTCOPY - Do not copy this vma on fork
2997 * VM_DONTEXPAND - Cannot expand with mremap()
2998 * VM_RESERVED - Count as reserved_vm like IO
2999 * VM_PFNMAP - Page-ranges managed without "struct page"
3000 * VM_IO - Memory mapped I/O or similar
3001 *
3002 * We do not want to copy this VMA automatically on a fork(),
3003 * expand this VMA due to mremap() or swap out these pages since
3004 * the VMA is actually backed by physical pages in the remote
3005 * node's physical memory and not via a struct page.
3006 */
3007#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
3008 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP | VM_PFNMAP;
3009#else
3010 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP;
3011#endif
3012
3013 if (!is_self_scifdev(ep->remote_dev))
3014 ((vma)->vm_flags) |= VM_IO;
3015
3016 /* Map this range of windows */
3017 if ((err = micscif_rma_list_mmap(window,
3018 start_offset, nr_pages, vma))) {
3019 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3020 goto error;
3021 }
3022 /* Set up the driver call back */
3023 vma->vm_ops = &micscif_vm_ops;
3024 ((vma)->vm_private_data) = vmapvt;
3025 /*
3026 * For 1 page sized VMAs the kernel (remap_pfn_range) replaces the
3027 * offset in the VMA with the pfn, so in that case save off the
3028 * original offset, since the page sized VMA can't be split into
3029 * smaller VMAs the offset is not going to change.
3030 */
3031 if (nr_pages == 1) {
3032 vmapvt->offset = start_offset;
3033 vmapvt->valid_offset = true;
3034 }
3035 err = 0;
3036error:
3037 mutex_unlock(&ep->rma_info.rma_lock);
3038 micscif_dec_node_refcnt(ep->remote_dev, 1);
3039 if (err) {
3040 micscif_destroy_node_dep(ep->remote_dev, nr_pages);
3041 kfree(vmapvt);
3042 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3043 micscif_rma_put_task(ep, nr_pages);
3044 }
3045 return err;
3046}
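
/*
 * Sketch (illustrative only): a driver's mmap() file operation
 * forwarding to scif_mmap() so user space can map a peer window.
 * "example_epd_of()" is a hypothetical lookup helper, as above.
 */
#ifdef SCIF_API_EXAMPLES
static int example_driver_mmap(struct file *f, struct vm_area_struct *vma)
{
	scif_epd_t epd = example_epd_of(f); /* hypothetical lookup */

	return scif_mmap(vma, epd);
}
#endif /* SCIF_API_EXAMPLES */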
3047
3048/**
3049 * scif_readfrom() - Read SCIF offset data from remote connection
3050 * @epd: endpoint descriptor
3051 * @loffset: offset in local registered address space to which to copy
3052 * @len: length of range to copy
3053 * @roffset: offset in remote registered address space from which to copy
3054 * @flags: flags
3055 *
3056 * Return Values
3057 * Upon successful completion, scif_readfrom() returns zero
3058 * else an apt error is returned as documented in scif.h.
3059 */
3060int
3061scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
3062 off_t roffset, int flags)
3063{
3064 int ret;
3065 get_kref_count(epd);
3066 ret = __scif_readfrom(epd, loffset, len, roffset, flags);
3067 put_kref_count(epd);
3068 return ret;
3069}
3070EXPORT_SYMBOL(scif_readfrom);
3071
3072/**
3073 * scif_writeto() - Send SCIF offset data to remote connection
3074 * @epd: endpoint descriptor
3075 * @loffset: offset in local registered address space from which to copy
3076 * @len: length of range to copy
3077 * @roffset: offset in remote registered address space to which to copy
3078 * @flags: flags
3079 *
3080 * Return Values
3081 * Upon successful completion, scif_writeto() returns zero
3082 * else an apt error is returned as documented in scif.h.
3083 *
3084 */
3085int scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
3086 off_t roffset, int flags)
3087{
3088 int ret;
3089 get_kref_count(epd);
3090 ret = __scif_writeto(epd, loffset, len, roffset, flags);
3091 put_kref_count(epd);
3092 return ret;
3093}
3094EXPORT_SYMBOL(scif_writeto);
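
/*
 * RMA round trip sketch (illustrative only): pull len bytes from the
 * peer window into the local window, then push them back. Flags of 0
 * request the default DMA transfer mode.
 */
#ifdef SCIF_API_EXAMPLES
static int example_rma_copy(scif_epd_t epd, off_t loff, off_t roff,
			    size_t len)
{
	int err = scif_readfrom(epd, loff, len, roff, 0);

	if (err)
		return err;
	return scif_writeto(epd, loff, len, roff, 0);
}
#endif /* SCIF_API_EXAMPLES */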
3095
3096#define HOST_LOOPB_MAGIC_MARK 0xdead
3097
3098/**
3099 * scif_fence_mark:
3100 * @epd: endpoint descriptor
3101 * @flags: control flags
3102 * @mark: marked handle returned as output.
3103 *
3104 * scif_fence_mark() returns after marking the current set of all uncompleted
3105 * RMAs initiated through the endpoint epd or marking the current set of all
3106 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
3107 * marked with a value returned in mark. The application may subsequently
3108 * await completion of all RMAs so marked.
3109 *
3110 * Return Values
3111 * Upon successful completion, scif_fence_mark() returns 0;
3112 * else an apt error is returned as documented in scif.h.
3113 */
3114int __scif_fence_mark(scif_epd_t epd, int flags, int *mark)
3115{
3116 struct endpt *ep = (struct endpt *)epd;
3117 int err = 0;
3118
3119 pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x\n",
3120 ep, scif_ep_states[ep->state], flags, *mark);
3121
3122 if ((err = verify_epd(ep)))
3123 return err;
3124
3125 /* Invalid flags? */
3126 if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))
3127 return -EINVAL;
3128
3129 /* At least one of init self or peer RMA should be set */
3130 if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
3131 return -EINVAL;
3132
3133 /* Exactly one of init self or peer RMA should be set but not both */
3134 if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
3135 return -EINVAL;
3136
3137#ifndef _MIC_SCIF_
3138 /*
3139 * Host Loopback does not need to use DMA.
3140 * Return a valid mark to be symmetric.
3141 */
3142 if (is_self_scifdev(ep->remote_dev)) {
3143 *mark = HOST_LOOPB_MAGIC_MARK;
3144 return 0;
3145 }
3146#endif
3147
3148 if (flags & SCIF_FENCE_INIT_SELF) {
3149 if ((*mark = micscif_fence_mark(epd)) < 0)
3150 err = *mark;
3151 } else {
3152 micscif_inc_node_refcnt(ep->remote_dev, 1);
3153 err = micscif_send_fence_mark(ep, mark);
3154 micscif_dec_node_refcnt(ep->remote_dev, 1);
3155 }
3156 if (err)
3157 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3158
3159 pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x err %d\n",
3160 ep, scif_ep_states[ep->state], flags, *mark, err);
3161 return err;
3162}
3163
3164int scif_fence_mark(scif_epd_t epd, int flags, int *mark)
3165{
3166 int ret;
3167 get_kref_count(epd);
3168 ret = __scif_fence_mark(epd, flags, mark);
3169 put_kref_count(epd);
3170 return ret;
3171}
3172EXPORT_SYMBOL(scif_fence_mark);
3173
3174/**
3175 * scif_fence_wait:
3176 * @epd: endpoint descriptor
 * @mark: mark value returned by an earlier scif_fence_mark()
3178 *
3179 * scif_fence_wait() returns after all RMAs marked with mark have completed.
3180 *
3181 * Return Values
3182 * Upon successful completion, scif_fence_wait() returns 0;
3183 * else an apt error is returned as documented in scif.h.
3184 */
3185int __scif_fence_wait(scif_epd_t epd, int mark)
3186{
3187 struct endpt *ep = (struct endpt *)epd;
3188 int err = 0;
3189
3190 pr_debug("SCIFAPI fence_wait: ep %p %s mark 0x%x\n",
3191 ep, scif_ep_states[ep->state], mark);
3192
3193 if ((err = verify_epd(ep)))
3194 return err;
3195
3196#ifndef _MIC_SCIF_
3197 /*
3198 * Host Loopback does not need to use DMA.
 * The only valid mark handed out is HOST_LOOPB_MAGIC_MARK,
 * so simply return success if the mark matches.
3201 */
3202 if (is_self_scifdev(ep->remote_dev)) {
3203 if (HOST_LOOPB_MAGIC_MARK == mark)
3204 return 0;
3205 else
3206 return -EINVAL;
3207 }
3208#endif
3209 if (mark & SCIF_REMOTE_FENCE) {
3210 micscif_inc_node_refcnt(ep->remote_dev, 1);
3211 err = micscif_send_fence_wait(epd, mark);
3212 micscif_dec_node_refcnt(ep->remote_dev, 1);
3213 } else {
3214 err = dma_mark_wait(epd->rma_info.dma_chan, mark, true);
3215 if (!err && atomic_read(&ep->rma_info.tw_refcount))
3216 queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
3217 }
3218
3219 if (err < 0)
3220 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3221 return err;
3222}
3223
3224int scif_fence_wait(scif_epd_t epd, int mark)
3225{
3226 int ret;
3227 get_kref_count(epd);
3228 ret = __scif_fence_wait(epd, mark);
3229 put_kref_count(epd);
3230 return ret;
3231}
3232EXPORT_SYMBOL(scif_fence_wait);
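
/*
 * Fence sketch (illustrative only): mark every RMA this endpoint has
 * initiated so far, then block until all marked RMAs complete. This is
 * the usual way to order scif_readfrom()/scif_writeto() traffic.
 */
#ifdef SCIF_API_EXAMPLES
static int example_fence_rmas(scif_epd_t epd)
{
	int mark, err;

	err = scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark);
	if (err)
		return err;
	return scif_fence_wait(epd, mark);
}
#endif /* SCIF_API_EXAMPLES */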
3233
3234/*
 * scif_fence_signal:
 * @epd: endpoint descriptor
 * @loff: local offset
 * @lval: local value to write to loff
 * @roff: remote offset
 * @rval: remote value to write to roff
 * @flags: flags
3241 *
3242 * scif_fence_signal() returns after marking the current set of all
3243 * uncompleted RMAs initiated through the endpoint epd or marking
3244 * the current set of all uncompleted RMAs initiated through the peer
3245 * of endpoint epd.
3246 *
3247 * Return Values
3248 * Upon successful completion, scif_fence_signal() returns 0;
3249 * else an apt error is returned as documented in scif.h.
3250 */
3251int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
3252 off_t roff, uint64_t rval, int flags)
3253{
3254 struct endpt *ep = (struct endpt *)epd;
3255 int err = 0;
3256
3257 pr_debug("SCIFAPI fence_signal: ep %p %s loff 0x%lx lval 0x%llx "
3258 "roff 0x%lx rval 0x%llx flags 0x%x\n",
3259 ep, scif_ep_states[ep->state], loff, lval, roff, rval, flags);
3260
3261 if ((err = verify_epd(ep)))
3262 return err;
3263
3264 /* Invalid flags? */
3265 if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER |
3266 SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))
3267 return -EINVAL;
3268
3269 /* At least one of init self or peer RMA should be set */
3270 if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
3271 return -EINVAL;
3272
3273 /* Exactly one of init self or peer RMA should be set but not both */
3274 if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
3275 return -EINVAL;
3276
3277 /* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */
3278 if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)))
3279 return -EINVAL;
3280
	/* Only Dword aligned offsets allowed */
3282 if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(uint32_t) - 1)))
3283 return -EINVAL;
3284
3285 /* Only Dword aligned offsets allowed */
3286 if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(uint32_t) - 1)))
3287 return -EINVAL;
3288
3289 if (flags & SCIF_FENCE_INIT_PEER) {
3290 micscif_inc_node_refcnt(ep->remote_dev, 1);
3291 err = micscif_send_fence_signal(epd, roff,
3292 rval, loff, lval, flags);
3293 micscif_dec_node_refcnt(ep->remote_dev, 1);
3294 } else {
3295 /* Local Signal in Local RAS */
3296 if (flags & SCIF_SIGNAL_LOCAL)
3297 if ((err = micscif_prog_signal(epd, loff,
3298 lval, RMA_WINDOW_SELF)))
3299 goto error_ret;
3300
3301 /* Signal in Remote RAS */
3302 if (flags & SCIF_SIGNAL_REMOTE) {
3303 micscif_inc_node_refcnt(ep->remote_dev, 1);
3304 err = micscif_prog_signal(epd, roff,
3305 rval, RMA_WINDOW_PEER);
3306 micscif_dec_node_refcnt(ep->remote_dev, 1);
3307 }
3308 }
3309error_ret:
3310 if (err)
3311 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3312 else if (atomic_read(&ep->rma_info.tw_refcount))
3313 queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
3314 return err;
3315}
3316
3317int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
3318 off_t roff, uint64_t rval, int flags)
3319{
3320 int ret;
3321 get_kref_count(epd);
3322 ret = __scif_fence_signal(epd, loff, lval, roff, rval, flags);
3323 put_kref_count(epd);
3324 return ret;
3325}
3326EXPORT_SYMBOL(scif_fence_signal);
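
/*
 * Signal sketch (illustrative only): once all locally initiated RMAs
 * complete, write a completion flag into both the local and the peer
 * window. Both offsets must be dword aligned, per the checks above.
 */
#ifdef SCIF_API_EXAMPLES
static int example_fence_signal(scif_epd_t epd, off_t loff, off_t roff)
{
	return scif_fence_signal(epd, loff, 1ULL, roff, 1ULL,
				 SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL |
				 SCIF_SIGNAL_REMOTE);
}
#endif /* SCIF_API_EXAMPLES */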
3327
3328/**
3329 * scif_get_nodeIDs - Return information about online nodes
3330 * @nodes: array space reserved for returning online node IDs
 * @len: number of entries in the nodes array
3332 * @self: address to place the node ID of this system
3333 *
3334 * Return Values
3335 * scif_get_nodeIDs() returns the total number of scif nodes
3336 * (including host) in the system
3337 */
3338int
3339scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self)
3340{
3341 int online = 0;
3342 int offset = 0;
3343 int node;
3344#ifdef _MIC_SCIF_
3345 micscif_get_node_info();
3346#endif
3347
3348 *self = ms_info.mi_nodeid;
3349 mutex_lock(&ms_info.mi_conflock);
3350 len = SCIF_MIN(len, (int32_t)ms_info.mi_total);
	for (node = 0; node <= (int32_t)ms_info.mi_maxid; node++) {
3352 if (ms_info.mi_mask & (1UL << node)) {
3353 online++;
3354 if (offset < len)
3355 nodes[offset++] = node;
3356 }
3357 }
3358 pr_debug("SCIFAPI get_nodeIDs total %d online %d filled in %d nodes\n",
3359 ms_info.mi_total, online, len);
3360 mutex_unlock(&ms_info.mi_conflock);
3361
3362 return online;
3363}
3364
3365EXPORT_SYMBOL(scif_get_nodeIDs);
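
/*
 * Enumeration sketch (illustrative only): list the online SCIF nodes.
 * The return value is the total online count, which may exceed the
 * number of entries actually written when the array is too small.
 */
#ifdef SCIF_API_EXAMPLES
static void example_list_nodes(void)
{
	uint16_t nodes[32], self;
	int i, online = scif_get_nodeIDs(nodes, 32, &self);

	printk(KERN_INFO "example: %d node(s) online, self is %d\n",
	       online, self);
	for (i = 0; i < online && i < 32; i++)
		printk(KERN_INFO "example: node %d online\n", nodes[i]);
}
#endif /* SCIF_API_EXAMPLES */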
3366
3367/**
3368 * micscif_pci_dev:
3369 * @node: node ID
3370 *
3371 * Return the pci_dev associated with a node.
3372 */
3373int micscif_pci_dev(uint16_t node, struct pci_dev **pdev)
3374{
3375#ifdef _MIC_SCIF_
3376 /* This *is* a PCI device, therefore no pdev to return. */
3377 return -ENODEV;
3378#else
3379 mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);
3380 *pdev = mic_ctx->bi_pdev;
3381 return 0;
3382#endif
3383}
3384
3385#ifndef _MIC_SCIF_
3386/**
3387 * micscif_pci_info:
3388 * @node: node ID
3389 *
3390 * Populate the pci device info pointer associated with a node.
3391 */
3392int micscif_pci_info(uint16_t node, struct scif_pci_info *dev)
3393{
3394 int i;
3395 mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);
3396 struct pci_dev *pdev;
3397
3398 if (!mic_ctx)
3399 return -ENODEV;
3400
3401 dev->pdev = pdev = mic_ctx->bi_pdev;
3402 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
3403 if (!pci_resource_start(pdev, i)) {
3404 dev->va[i] = NULL;
3405 continue;
3406 }
3407 if (pci_resource_flags(pdev, i) & IORESOURCE_PREFETCH) {
3408 /* TODO: Change comparison check for KNL. */
3409 if (pci_resource_start(pdev, i) == mic_ctx->aper.pa)
3410 dev->va[i] = mic_ctx->aper.va;
3411 else
3412 dev->va[i] = NULL;
3413 } else {
3414 dev->va[i] = mic_ctx->mmio.va;
3415 }
3416 }
3417 return 0;
3418}
3419#endif
3420
3421/**
3422 * scif_pci_info - Populate the pci device info pointer associated with a node
3423 * @node: the node to query
3424 * @scif_pdev: The scif_pci_info structure to populate.
3425 *
3426 * scif_pci_info() populates the provided scif_pci_info structure
3427 * associated with a node. The requested node ID cannot be the same as
3428 * the current node. This routine may only return success when called from
3429 * the host.
3430 *
3431 * Return Values
 * Upon successful completion, scif_pci_info() returns 0; otherwise
 * an appropriate error is returned as documented in scif.h.
3434 */
3435int scif_pci_info(uint16_t node, struct scif_pci_info *dev)
3436{
3437#ifdef _MIC_SCIF_
3438 return -EINVAL;
3439#else
3440 if (node > ms_info.mi_maxid)
3441 return -EINVAL;
3442
3443 if ((scif_dev[node].sd_state == SCIFDEV_NOTPRESENT) ||
3444 is_self_scifdev(&scif_dev[node]))
3445 return -ENODEV;
3446
3447 return micscif_pci_info(node, dev);
3448#endif
3449}
3450EXPORT_SYMBOL(scif_pci_info);
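
/*
 * Host-side sketch (illustrative only): query the PCI resources of a
 * card node and print which BARs have kernel virtual mappings. Field
 * names follow struct scif_pci_info as populated above.
 */
#ifdef SCIF_API_EXAMPLES
static void example_dump_bars(uint16_t node)
{
	struct scif_pci_info info;
	int i;

	if (scif_pci_info(node, &info))
		return;
	for (i = 0; i < PCI_NUM_RESOURCES; i++)
		if (info.va[i])
			printk(KERN_INFO "example: node %d BAR%d at %p\n",
			       node, i, info.va[i]);
}
#endif /* SCIF_API_EXAMPLES */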
3451
3452/*
3453 * DEBUG helper functions
3454 */
3455void
3456print_ep_state(struct endpt *ep, char *label)
3457{
3458 if (ep)
3459 printk("%s: EP %p state %s\n",
3460 label, ep, scif_ep_states[ep->state]);
3461 else
		printk("%s: EP %p state ?\n", label, ep);
3463}
3464