Updated micscif/micscif_api.c to new location for atomic_t element.
[xeon-phi-kernel-module] / micscif / micscif_api.c
1/*
2 * Copyright 2010-2017 Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2,
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * Disclaimer: The codes contained in these modules may be specific to
14 * the Intel Software Development Platform codenamed Knights Ferry,
15 * and the Intel product codenamed Knights Corner, and are not backward
16 * compatible with other Intel products. Additionally, Intel will NOT
17 * support the codes or instruction set in future products.
18 *
19 * Intel offers no warranty of any kind regarding the code. This code is
20 * licensed on an "AS IS" basis and Intel is not obligated to provide
21 * any support, assistance, installation, training, or other services
22 * of any kind. Intel is also not obligated to provide any updates,
23 * enhancements or extensions. Intel specifically disclaims any warranty
24 * of merchantability, non-infringement, fitness for any particular
25 * purpose, and any other warranty.
26 *
27 * Further, Intel disclaims all liability of any kind, including but
28 * not limited to liability for infringement of any proprietary rights,
29 * relating to the use of the code, even if Intel is notified of the
30 * possibility of such liability. Except as expressly stated in an Intel
31 * license agreement provided with this code and agreed upon with Intel,
32 * no license, express or implied, by estoppel or otherwise, to any
33 * intellectual property rights is granted herein.
34 */
35
36#include <linux/poll.h>
37#include <linux/time.h>
38#include <linux/ktime.h>
39#include <linux/sched.h>
40#include <linux/kref.h>
41#include <linux/module.h>
42#include "scif.h"
43#include "mic/micscif.h"
44#ifndef _MIC_SCIF_
45#include "mic_common.h"
46#endif
47#include "mic/micscif_map.h"
48
49#define SCIF_MAP_ULIMIT 0x40
50
51bool mic_ulimit_check = 0;
52
53char *scif_ep_states[] = {
54 "Closed",
55 "Unbound",
56 "Bound",
57 "Listening",
58 "Connected",
59 "Connecting",
60 "Mapping",
61 "Closing",
62 "Close Listening",
63 "Disconnected",
64 "Zombie"};
65
66enum conn_async_state {
67 ASYNC_CONN_IDLE = 1, /* ep setup for async connect */
68 ASYNC_CONN_INPROGRESS, /* async connect in progress */
69 ASYNC_CONN_FLUSH_WORK /* async work flush in progress */
70};
71
72/**
73 * scif_open() - Create a SCIF end point
74 *
75 * Create a SCIF end point and set the state to UNBOUND. This function
76 * returns the address of the end point data structure.
77 */
78scif_epd_t
79__scif_open(void)
80{
81 struct endpt *ep;
82
83 might_sleep();
84 if ((ep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL)) == NULL) {
85 printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point descriptor\n");
86 goto err_ep_alloc;
87 }
88
89 if ((ep->qp_info.qp = (struct micscif_qp *)
90 kzalloc(sizeof(struct micscif_qp), GFP_KERNEL)) == NULL) {
91 printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point queue pointer\n");
92 goto err_qp_alloc;
93 }
94
95 spin_lock_init(&ep->lock);
96 mutex_init (&ep->sendlock);
97 mutex_init (&ep->recvlock);
98
99 if (micscif_rma_ep_init(ep) < 0) {
100 printk(KERN_ERR "SCIFAPI _open: RMA EP Init failed\n");
101 goto err_rma_init;
102 }
103
104 ep->state = SCIFEP_UNBOUND;
105 pr_debug("SCIFAPI open: ep %p success\n", ep);
106 return (scif_epd_t)ep;
107
108err_rma_init:
109 kfree(ep->qp_info.qp);
110err_qp_alloc:
111 kfree(ep);
112err_ep_alloc:
113 return NULL;
114}
115
116scif_epd_t
117scif_open(void)
118{
119 struct endpt *ep;
120 ep = (struct endpt *)__scif_open();
121 if (ep)
122 kref_init(&(ep->ref_count));
123 return (scif_epd_t)ep;
124}
125EXPORT_SYMBOL(scif_open);
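/*
 * Usage sketch (illustrative only, not part of this module): a kernel-mode
 * client pairs scif_open() with scif_close(). NULL is the only failure
 * indication from scif_open().
 *
 *	scif_epd_t epd = scif_open();
 *	if (!epd)
 *		return -ENOMEM;
 *	// ... bind/connect and use the end point ...
 *	scif_close(epd);
 */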
126
127/**
128 * scif_close() - Terminate a SCIF end point
129 * @epd: The end point address returned from scif_open()
130 *
131 * The function terminates a scif connection. It must ensure all traffic on
132 * the connection is finished before removing it.
133 *
 134 * On connections with mapped memory this becomes more difficult. Once normal
 135 * DMA and message traffic has ended, the end point must be placed in a zombie
 136 * state to wait for the other side to also release its memory references.
137 */
138int
139__scif_close(scif_epd_t epd)
140{
141 struct endpt *ep = (struct endpt *)epd;
142 struct endpt *tmpep;
143 struct list_head *pos, *tmpq;
144 unsigned long sflags;
145 enum endptstate oldstate;
146 int err;
147 bool flush_conn;
148
149 pr_debug("SCIFAPI close: ep %p %s\n", ep, scif_ep_states[ep->state]);
150
151 might_sleep();
152
153 spin_lock(&ep->lock);
154 flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS);
155 spin_unlock(&ep->lock);
156
157 if (flush_conn)
158 flush_workqueue(ms_info.mi_conn_wq);
159
160 micscif_inc_node_refcnt(ep->remote_dev, 1);
161
162 spin_lock_irqsave(&ep->lock, sflags);
163 oldstate = ep->state;
164
165 ep->state = SCIFEP_CLOSING;
166
167 switch (oldstate) {
168 case SCIFEP_ZOMBIE:
169 BUG_ON(SCIFEP_ZOMBIE == oldstate);
170 case SCIFEP_CLOSED:
171 case SCIFEP_DISCONNECTED:
172 spin_unlock_irqrestore(&ep->lock, sflags);
173 micscif_unregister_all_windows(epd);
174 // Remove from the disconnected list
175 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
176 list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
177 tmpep = list_entry(pos, struct endpt, list);
178 if (tmpep == ep) {
179 list_del(pos);
180 break;
181 }
182 }
183 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
184 break;
185 case SCIFEP_UNBOUND:
186 case SCIFEP_BOUND:
187 case SCIFEP_CONNECTING:
188 spin_unlock_irqrestore(&ep->lock, sflags);
189 break;
190 case SCIFEP_MAPPING:
191 case SCIFEP_CONNECTED:
192 case SCIFEP_CLOSING:
193 {
194 struct nodemsg msg;
195 struct endpt *fep = NULL;
196 struct endpt *tmpep;
197 unsigned long ts = jiffies;
198 struct list_head *pos, *tmpq;
199
 200 // There is only a very short window before mapping completes, the state
 201 // becomes connected and a standard teardown can proceed.
202 ts = jiffies;
203 while (ep->state == SCIFEP_MAPPING) {
204 cpu_relax();
 205 if (time_after((unsigned long)jiffies, ts + NODE_ALIVE_TIMEOUT)) {
206 printk(KERN_ERR "%s %d ep->state %d\n", __func__, __LINE__, ep->state);
207 ep->state = SCIFEP_BOUND;
208 break;
209 }
210 }
211
 212 init_waitqueue_head(&ep->disconwq); // Wait queue for disconnection
213 spin_unlock_irqrestore(&ep->lock, sflags);
214
215 micscif_unregister_all_windows(epd);
216
217 // Remove from the connected list
218 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
219 list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
220 tmpep = list_entry(pos, struct endpt, list);
221 if (tmpep == ep) {
222 list_del(pos);
223 put_conn_count(ep->remote_dev);
224 fep = tmpep;
225 spin_lock(&ep->lock);
226 break;
227 }
228 }
229
230 if (fep == NULL) {
 231 // The other side has completed the disconnect before
 232 // the end point could be removed from the list. Therefore
 233 // the ep lock is not held; traverse the disconnected list
 234 // to find the endpoint, release the conn lock and
 235 // proceed to tear down the end point below.
236 list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
237 tmpep = list_entry(pos, struct endpt, list);
238 if (tmpep == ep) {
239 list_del(pos);
240 break;
241 }
242 }
243 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
244 break;
245 }
246
247 spin_unlock(&ms_info.mi_connlock);
248
249 // Now we are free to close out the connection
250 msg.uop = SCIF_DISCNCT;
251 msg.src = ep->port;
252 msg.dst = ep->peer;
253 msg.payload[0] = (uint64_t)ep;
254 msg.payload[1] = ep->remote_ep;
255
256 err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);
257 spin_unlock_irqrestore(&ep->lock, sflags);
258
259 if (!err)
260 /* Now wait for the remote node to respond */
261 wait_event_timeout(ep->disconwq,
262 (ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
263 /*
264 * Grab and release the ep lock to synchronize with the
 265 * thread waking us up. If we don't grab this lock, then
266 * the ep might be freed before the wakeup completes
267 * resulting in potential memory corruption.
268 */
269 spin_lock_irqsave(&ep->lock, sflags);
270 spin_unlock_irqrestore(&ep->lock, sflags);
271 break;
272 }
273 case SCIFEP_LISTENING:
274 case SCIFEP_CLLISTEN:
275 {
276 struct conreq *conreq;
277 struct nodemsg msg;
278 struct endpt *aep;
279
280 spin_unlock_irqrestore(&ep->lock, sflags);
281 spin_lock_irqsave(&ms_info.mi_eplock, sflags);
282
283 // remove from listen list
284 list_for_each_safe(pos, tmpq, &ms_info.mi_listen) {
285 tmpep = list_entry(pos, struct endpt, list);
286 if (tmpep == ep) {
287 list_del(pos);
288 }
289 }
290 // Remove any dangling accepts
291 while (ep->acceptcnt) {
292 aep = list_first_entry(&ep->li_accept, struct endpt, liacceptlist);
293 BUG_ON(!aep);
294 list_del(&aep->liacceptlist);
295 if (aep->port.port && !aep->accepted_ep)
296 put_scif_port(aep->port.port);
297 list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) {
298 tmpep = list_entry(pos, struct endpt, miacceptlist);
299 if (tmpep == aep) {
300 list_del(pos);
301 break;
302 }
303 }
304 spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
305 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
306 list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
307 tmpep = list_entry(pos, struct endpt, list);
308 if (tmpep == aep) {
309 list_del(pos);
310 put_conn_count(aep->remote_dev);
311 break;
312 }
313 }
314 list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
315 tmpep = list_entry(pos, struct endpt, list);
316 if (tmpep == aep) {
317 list_del(pos);
318 break;
319 }
320 }
321 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
322 micscif_teardown_ep(aep);
323 spin_lock_irqsave(&ms_info.mi_eplock, sflags);
324 micscif_add_epd_to_zombie_list(aep, MI_EPLOCK_HELD);
325 ep->acceptcnt--;
326 }
327
328 spin_lock(&ep->lock);
329 spin_unlock(&ms_info.mi_eplock);
330
331 // Remove and reject any pending connection requests.
332 while (ep->conreqcnt) {
333 conreq = list_first_entry(&ep->conlist, struct conreq, list);
334 list_del(&conreq->list);
335
336 msg.uop = SCIF_CNCT_REJ;
337 msg.dst.node = conreq->msg.src.node;
338 msg.dst.port = conreq->msg.src.port;
339 msg.payload[0] = conreq->msg.payload[0];
340 msg.payload[1] = conreq->msg.payload[1];
341 /*
342 * No Error Handling on purpose for micscif_nodeqp_send().
 343 * If the remote node is lost we still want to free the connection
 344 * requests on the local node.
345 */
346 micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, ep);
347
348 ep->conreqcnt--;
349 kfree(conreq);
350 }
351
352 // If a kSCIF accept is waiting wake it up
353 wake_up_interruptible(&ep->conwq);
354 spin_unlock_irqrestore(&ep->lock, sflags);
355 break;
356 }
357 }
358 if (ep->port.port && !ep->accepted_ep)
359 put_scif_port(ep->port.port);
360 micscif_dec_node_refcnt(ep->remote_dev, 1);
361 micscif_teardown_ep(ep);
362 micscif_add_epd_to_zombie_list(ep, !MI_EPLOCK_HELD);
363 return 0;
364}
365
366void
367scif_ref_rel(struct kref *kref_count)
368{
369 struct endpt *epd;
370 epd = container_of(kref_count, struct endpt, ref_count);
371 __scif_close((scif_epd_t)epd);
372}
373
374int
375scif_close(scif_epd_t epd)
376{
377 __scif_flush(epd);
378 put_kref_count(epd);
379 return 0;
380}
381EXPORT_SYMBOL(scif_close);
382
383/**
384 * scif_flush() - Flush the endpoint
385 * @epd: The end point address returned from scif_open()
386 *
387 */
388int
389__scif_flush(scif_epd_t epd)
390{
391 struct endpt *ep = (struct endpt *)epd;
392 struct endpt *tmpep;
393 struct list_head *pos, *tmpq;
394 unsigned long sflags;
395 int err;
396
397 might_sleep();
398
399 micscif_inc_node_refcnt(ep->remote_dev, 1);
400
401 spin_lock_irqsave(&ep->lock, sflags);
402
403 switch (ep->state) {
404 case SCIFEP_CONNECTED:
405 {
406 struct nodemsg msg;
407 struct endpt *fep = NULL;
408
 409 init_waitqueue_head(&ep->disconwq); // Wait queue for disconnection
410 WARN_ON(ep->files); // files should never be set while connected
411 spin_unlock_irqrestore(&ep->lock, sflags);
412 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
413
414 list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
415 tmpep = list_entry(pos, struct endpt, list);
416 if (tmpep == ep) {
417 list_del(pos);
418 put_conn_count(ep->remote_dev);
419 fep = tmpep;
420 spin_lock(&ep->lock);
421 break;
422 }
423 }
424
425 if (fep == NULL) {
 426 // The other side has completed the disconnect before
 427 // the end point could be removed from the list. Therefore
 428 // the ep lock is not held; traverse the disconnected list
 429 // to find the endpoint, then release the conn lock.
430 list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
431 tmpep = list_entry(pos, struct endpt, list);
432 if (tmpep == ep) {
433 list_del(pos);
434 break;
435 }
436 }
437 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
438 break;
439 }
440
441 spin_unlock(&ms_info.mi_connlock);
442
443 msg.uop = SCIF_DISCNCT;
444 msg.src = ep->port;
445 msg.dst = ep->peer;
446 msg.payload[0] = (uint64_t)ep;
447 msg.payload[1] = ep->remote_ep;
448
449 err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);
450
451 spin_unlock_irqrestore(&ep->lock, sflags);
452 if (!err)
453 /* Now wait for the remote node to respond */
454 wait_event_timeout(ep->disconwq,
455 (ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
456 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
457 spin_lock(&ep->lock);
458 list_add_tail(&ep->list, &ms_info.mi_disconnected);
459 ep->state = SCIFEP_DISCONNECTED;
460 spin_unlock(&ep->lock);
461 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
462 // Wake up threads blocked in send and recv
463 wake_up_interruptible(&ep->sendwq);
464 wake_up_interruptible(&ep->recvwq);
465 break;
466 }
467 case SCIFEP_LISTENING:
468 {
469 ep->state = SCIFEP_CLLISTEN;
470
471 // If an accept is waiting wake it up
472 wake_up_interruptible(&ep->conwq);
473 spin_unlock_irqrestore(&ep->lock, sflags);
474 break;
475 }
476 default:
477 spin_unlock_irqrestore(&ep->lock, sflags);
478 break;
479 }
480 micscif_dec_node_refcnt(ep->remote_dev, 1);
481 return 0;
482}
483
484/**
485 * scif_bind() - Bind a SCIF end point to a port ID.
486 * @epd: The end point address returned from scif_open()
487 * @pn: Port ID (number) to bind to
488 *
489 * Set the port ID associated with the end point and place it in the bound state.
 490 * If a port ID of zero is requested a non-zero port ID is allocated for it.
 491 *
 492 * Upon successful completion the port ID (number) will be returned.
493 *
494 * If the end point is not in the unbound state then return -EISCONN.
495 *
496 * If port ID zero is specified and allocation of a port ID fails -ENOSPC
497 * will be returned.
498 */
499int
500__scif_bind(scif_epd_t epd, uint16_t pn)
501{
502 struct endpt *ep = (struct endpt *)epd;
503 unsigned long sflags;
504 int ret = 0;
505 int tmp;
506
507 pr_debug("SCIFAPI bind: ep %p %s requested port number %d\n",
508 ep, scif_ep_states[ep->state], pn);
509
510 might_sleep();
511
512 if (pn) {
513 /*
514 * Modeled on http://www.ietf.org/rfc/rfc1700.txt?number=1700
515 * SCIF ports below SCIF_ADMIN_PORT_END can only be bound by
516 * system (or root) processes or by processes executed by
517 * privileged users.
518 */
 519 if (pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN)) {
520 ret = -EACCES;
521 goto scif_bind_admin_exit;
522 }
523 }
524
525 spin_lock_irqsave(&ep->lock, sflags);
526 if (ep->state == SCIFEP_BOUND) {
527 ret = -EINVAL;
528 goto scif_bind_exit;
529 } else if (ep->state != SCIFEP_UNBOUND) {
530 ret = -EISCONN;
531 goto scif_bind_exit;
532 }
533
534 if (pn) {
535 if ((tmp = rsrv_scif_port(pn)) != pn) {
536 ret = -EINVAL;
537 goto scif_bind_exit;
538 }
539 } else {
540 pn = get_scif_port();
541 if (!pn) {
542 ret = -ENOSPC;
543 goto scif_bind_exit;
544 }
545 }
546
547 ep->state = SCIFEP_BOUND;
548 ep->port.node = ms_info.mi_nodeid;
549 ep->port.port = pn;
550 ep->conn_async_state = ASYNC_CONN_IDLE;
551 ret = pn;
552 pr_debug("SCIFAPI bind: bound to port number %d\n", pn);
553
554scif_bind_exit:
555 spin_unlock_irqrestore(&ep->lock, sflags);
556scif_bind_admin_exit:
557 return ret;
558}
559
560int
561scif_bind(scif_epd_t epd, uint16_t pn)
562{
563 int ret;
564 get_kref_count(epd);
565 ret = __scif_bind(epd, pn);
566 put_kref_count(epd);
567 return ret;
568}
569EXPORT_SYMBOL(scif_bind);
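/*
 * Usage sketch (illustrative only): binding with pn == 0 asks the driver
 * to allocate a free port; the bound port number is the positive return
 * value, so callers should capture it rather than test for zero.
 *
 *	int port = scif_bind(epd, 0);
 *	if (port < 0)
 *		return port;	// -EINVAL, -EISCONN, -ENOSPC or -EACCES
 *	pr_info("bound to port %d\n", port);
 */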
570
571/**
572 * scif_listen() - Place the end point in the listening state
573 * @epd: The end point address returned from scif_open()
574 * @backlog: Maximum number of pending connection requests.
575 *
576 * The end point is placed in the listening state ready to accept connection
 577 * requests. The backlog parameter is saved to indicate the maximum number of
 578 * connection requests from the remote node to queue. The end point is
579 * placed on a list of listening end points to allow a connection request to
580 * find it.
581 *
582 * Upon successful completion a zero is returned.
583 *
584 * If the end point is not in the bound state -EINVAL or -EISCONN is returned.
585 *
586 */
587int
588__scif_listen(scif_epd_t epd, int backlog)
589{
590 struct endpt *ep = (struct endpt *)epd;
591 unsigned long sflags;
592
593 pr_debug("SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]);
594
595 might_sleep();
596 spin_lock_irqsave(&ep->lock, sflags);
597 switch (ep->state) {
598 case SCIFEP_ZOMBIE:
599 BUG_ON(SCIFEP_ZOMBIE == ep->state);
600 case SCIFEP_CLOSED:
601 case SCIFEP_CLOSING:
602 case SCIFEP_CLLISTEN:
603 case SCIFEP_UNBOUND:
604 case SCIFEP_DISCONNECTED:
605 spin_unlock_irqrestore(&ep->lock, sflags);
606 return -EINVAL;
607 case SCIFEP_LISTENING:
608 case SCIFEP_CONNECTED:
609 case SCIFEP_CONNECTING:
610 case SCIFEP_MAPPING:
611 spin_unlock_irqrestore(&ep->lock, sflags);
612 return -EISCONN;
613 case SCIFEP_BOUND:
614 break;
615 }
616
617 ep->state = SCIFEP_LISTENING;
618 ep->backlog = backlog;
619
620 ep->conreqcnt = 0;
621 ep->acceptcnt = 0;
622 INIT_LIST_HEAD(&ep->conlist); // List of connection requests
623 init_waitqueue_head(&ep->conwq); // Wait for connection queue
624 INIT_LIST_HEAD(&ep->li_accept); // User ep list for ACCEPTREG calls
625 spin_unlock_irqrestore(&ep->lock, sflags);
626
 627 // Listen setup is complete, so delete the qp information (not needed
 628 // on a listener) before placing the ep on the list of listening ep's
629 micscif_teardown_ep((void *)ep);
630 ep->qp_info.qp = NULL;
631
632 spin_lock_irqsave(&ms_info.mi_eplock, sflags);
633 list_add_tail(&ep->list, &ms_info.mi_listen);
634 spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
635 return 0;
636}
637
638int
639scif_listen(scif_epd_t epd, int backlog)
640{
641 int ret;
642 get_kref_count(epd);
643 ret = __scif_listen(epd, backlog);
644 put_kref_count(epd);
645 return ret;
646}
647EXPORT_SYMBOL(scif_listen);
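/*
 * Usage sketch (illustrative only): a minimal listener setup. The
 * backlog of 16 is an arbitrary example value.
 *
 *	scif_epd_t lep = scif_open();
 *	int err, port;
 *
 *	if (!lep)
 *		return -ENOMEM;
 *	port = scif_bind(lep, 0);	// let the driver pick a port
 *	if (port < 0)
 *		return port;
 *	err = scif_listen(lep, 16);
 *	if (err)
 *		return err;
 */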
648
649#ifdef _MIC_SCIF_
650/*
651 * scif_p2p_connect:
652 * @node: destination node id
653 *
 654 * Try to set up a p2p connection between the current
 655 * node and the destination node. We need the host to
 656 * set up the initial p2p connections, so we send
 657 * this message to the host, which acts as a proxy
 658 * in setting up the p2p connection.
659 */
660static int scif_p2p_connect(int node)
661{
662 struct micscif_dev *remote_dev = &scif_dev[node];
663 struct nodemsg msg;
664 int err;
665
666 pr_debug("%s:%d SCIF_NODE_CONNECT to host\n", __func__, __LINE__);
667 micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
668
669 msg.dst.node = SCIF_HOST_NODE;
670 msg.payload[0] = node;
671 msg.uop = SCIF_NODE_CONNECT;
672
673 if ((err = micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE],
674 &msg, NULL))) {
675 printk(KERN_ERR "%s:%d error while sending SCIF_NODE_CONNECT to"
676 " node %d\n", __func__, __LINE__, node);
677 micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
678 goto error;
679 }
680
681 wait_event_interruptible_timeout(remote_dev->sd_p2p_wq,
682 (remote_dev->sd_state == SCIFDEV_RUNNING) ||
683 (remote_dev->sd_state == SCIFDEV_NOTPRESENT), NODE_ALIVE_TIMEOUT);
684
685 pr_debug("%s:%d SCIF_NODE_CONNECT state:%d\n", __func__, __LINE__,
686 remote_dev->sd_state);
687 micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
688error:
689 return err;
690}
691#endif
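/*
 * Sketch of the proxied P2P setup above (card-side view; the host-side
 * handling of SCIF_NODE_CONNECT lives outside this file):
 *
 *	this card --SCIF_NODE_CONNECT--> host --sets up p2p--> peer card
 *
 * The caller then waits on sd_p2p_wq until sd_state becomes
 * SCIFDEV_RUNNING (success) or SCIFDEV_NOTPRESENT (failure), bounded
 * by NODE_ALIVE_TIMEOUT.
 */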
692
693static int scif_conn_func(struct endpt *ep)
694{
695 int err = 0;
696 struct nodemsg msg;
697 unsigned long sflags;
698 int term_sent = 0;
699
700 if ((err = micscif_reserve_dma_chan(ep))) {
701 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
702 ep->state = SCIFEP_BOUND;
703 goto connect_error_simple;
704 }
705 // Initiate the first part of the endpoint QP setup
706 err = micscif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset,
707 ENDPT_QP_SIZE, ep->remote_dev);
708 if (err) {
709 printk(KERN_ERR "%s err %d qp_offset 0x%llx\n",
710 __func__, err, ep->qp_info.qp_offset);
711 ep->state = SCIFEP_BOUND;
712 goto connect_error_simple;
713 }
714
715 micscif_inc_node_refcnt(ep->remote_dev, 1);
716
717 // Format connect message and send it
718 msg.src = ep->port;
719 msg.dst = ep->conn_port;
720 msg.uop = SCIF_CNCT_REQ;
721 msg.payload[0] = (uint64_t)ep;
722 msg.payload[1] = ep->qp_info.qp_offset;
723 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
724 micscif_dec_node_refcnt(ep->remote_dev, 1);
725 goto connect_error_simple;
726 }
727 // Wait for request to be processed.
728 while ((err = wait_event_interruptible_timeout(ep->conwq,
729 (ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT)) <= 0) {
730 if (!err)
731 err = -ENODEV;
732
733 pr_debug("SCIFAPI connect: ep %p ^C detected\n", ep);
734 // interrupted out of the wait
735 if (!term_sent++) {
736 int bak_err = err;
737 msg.uop = SCIF_CNCT_TERM;
738 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
739retry:
740 err = wait_event_timeout(ep->diswq,
741 (ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT);
742 if (!err && scifdev_alive(ep))
743 goto retry;
744 if (!err)
745 err = -ENODEV;
746 if (err > 0)
747 err = 0;
748 }
749 if (ep->state == SCIFEP_MAPPING) {
750 micscif_setup_qp_connect_response(ep->remote_dev,
751 ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);
752 // Send grant nack
753 msg.uop = SCIF_CNCT_GNTNACK;
754 msg.payload[0] = ep->remote_ep;
755 /* No error handling for Notification messages */
756 micscif_nodeqp_send(ep->remote_dev, &msg, ep);
757 }
 758 // Ensure that even after a timeout the state of the end point is bound
759 ep->state = SCIFEP_BOUND;
760 if (bak_err)
761 err = bak_err;
762 break;
763 }
764 }
765
766 if (err > 0)
767 err = 0;
768
769 if (term_sent || err) {
770 micscif_dec_node_refcnt(ep->remote_dev, 1);
771 goto connect_error_simple;
772 }
773
774 if (ep->state == SCIFEP_MAPPING) {
775 err = micscif_setup_qp_connect_response(ep->remote_dev,
776 ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);
777
 778 // If the resources to map the queue are not available then we need
 779 // to tell the other side to terminate the accept
780 if (err) {
781 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
782
783 // Send grant nack
784 msg.uop = SCIF_CNCT_GNTNACK;
785 msg.payload[0] = ep->remote_ep;
786 /* No error handling for Notification messages */
787 micscif_nodeqp_send(ep->remote_dev, &msg, ep);
788
789 ep->state = SCIFEP_BOUND;
790 micscif_dec_node_refcnt(ep->remote_dev, 1);
791 goto connect_error_simple;
792 }
793
794 // Send a grant ack to inform the accept we are done mapping its resources.
795 msg.uop = SCIF_CNCT_GNTACK;
796 msg.payload[0] = ep->remote_ep;
797 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
798 ep->state = SCIFEP_CONNECTED;
799 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
800 list_add_tail(&ep->list, &ms_info.mi_connected);
801 get_conn_count(ep->remote_dev);
802 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
803 pr_debug("SCIFAPI connect: ep %p connected\n", ep);
804 } else
805 ep->state = SCIFEP_BOUND;
806 micscif_dec_node_refcnt(ep->remote_dev, 1);
807 goto connect_error_simple;
808
809 } else if (ep->state == SCIFEP_BOUND) {
810 pr_debug("SCIFAPI connect: ep %p connection refused\n", ep);
811 err = -ECONNREFUSED;
812 micscif_dec_node_refcnt(ep->remote_dev, 1);
813 goto connect_error_simple;
814
815 } else {
816 pr_debug("SCIFAPI connect: ep %p connection interrupted\n", ep);
817 err = -EINTR;
818 micscif_dec_node_refcnt(ep->remote_dev, 1);
819 goto connect_error_simple;
820 }
821 micscif_dec_node_refcnt(ep->remote_dev, 1);
822connect_error_simple:
823 return err;
824}
825
826/*
827 * micscif_conn_handler:
828 *
829 * Workqueue handler for servicing non-blocking SCIF connect
830 *
831 */
832void micscif_conn_handler(struct work_struct *work)
833{
834 struct endpt *ep;
835
836 do {
837 ep = NULL;
838 spin_lock(&ms_info.mi_nb_connect_lock);
839 if (!list_empty(&ms_info.mi_nb_connect_list)) {
840 ep = list_first_entry(&ms_info.mi_nb_connect_list,
841 struct endpt, conn_list);
842 list_del(&ep->conn_list);
843 }
844 spin_unlock(&ms_info.mi_nb_connect_lock);
845 if (ep) {
846 ep->conn_err = scif_conn_func(ep);
847 wake_up_interruptible(&ep->conn_pend_wq);
848 }
849 } while (ep);
850}
851
852/**
853 * scif_connect() - Request a connection to a remote node
854 * @epd: The end point address returned from scif_open()
 855 * @dst: Remote node address information
856 *
857 * The function requests a scif connection to the remote node
858 * identified by the dst parameter. "dst" contains the remote node and
859 * port ids.
860 *
 861 * Upon successful completion zero will be returned.
862 *
863 * If the end point is not in the bound state -EINVAL will be returned.
864 *
 865 * If resource allocation fails during the connection sequence, -ENOMEM
 866 * will be returned.
867 *
 868 * If the remote side is not responding to connection requests the caller may
 869 * terminate this function with a signal. If so, -EINTR will be returned.
870 */
871int
872__scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block)
873{
874 struct endpt *ep = (struct endpt *)epd;
875 unsigned long sflags;
876 int err = 0;
877#ifdef _MIC_SCIF_
878 struct micscif_dev *remote_dev;
879#endif
880
881 pr_debug("SCIFAPI connect: ep %p %s\n", ep,
882 scif_ep_states[ep->state]);
883
884 if (dst->node > MAX_BOARD_SUPPORTED)
885 return -ENODEV;
886
887 might_sleep();
888
889#ifdef _MIC_SCIF_
890 remote_dev = &scif_dev[dst->node];
891 if ((SCIFDEV_INIT == remote_dev->sd_state ||
892 SCIFDEV_STOPPED == remote_dev->sd_state) && mic_p2p_enable)
893 if ((err = scif_p2p_connect(dst->node)))
894 return err;
895#endif
896
897 if (SCIFDEV_RUNNING != scif_dev[dst->node].sd_state &&
898 SCIFDEV_SLEEPING != scif_dev[dst->node].sd_state)
899 return -ENODEV;
900
901 spin_lock_irqsave(&ep->lock, sflags);
902 switch (ep->state) {
903 case SCIFEP_ZOMBIE:
904 BUG_ON(SCIFEP_ZOMBIE == ep->state);
905
906 case SCIFEP_CLOSED:
907 case SCIFEP_CLOSING:
908 err = -EINVAL;
909 break;
910
911 case SCIFEP_DISCONNECTED:
912 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
913 ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
914 else
915 err = -EINVAL;
916 break;
917
918 case SCIFEP_LISTENING:
919 case SCIFEP_CLLISTEN:
920 err = -EOPNOTSUPP;
921 break;
922
923 case SCIFEP_CONNECTING:
924 case SCIFEP_MAPPING:
925 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
926 err = -EINPROGRESS;
927 else
928 err = -EISCONN;
929 break;
930
931 case SCIFEP_CONNECTED:
932 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
933 ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
934 else
935 err = -EISCONN;
936 break;
937
938 case SCIFEP_UNBOUND:
939 if ((ep->port.port = get_scif_port()) == 0)
940 err = -ENOSPC;
941 else {
942 ep->port.node = ms_info.mi_nodeid;
943 ep->conn_async_state = ASYNC_CONN_IDLE;
944 }
945 /* Fall through */
946 case SCIFEP_BOUND:
947 /*
 948 * If a non-blocking connect has already been initiated (conn_async_state
 949 * is either ASYNC_CONN_INPROGRESS or ASYNC_CONN_FLUSH_WORK), the end point
 950 * could end up in SCIF_BOUND due to an error in the connection
 951 * process (e.g., connection refused).
952 * If conn_async_state is ASYNC_CONN_INPROGRESS - transition to
953 * ASYNC_CONN_FLUSH_WORK so that the error status can be collected.
954 * If the state is already ASYNC_CONN_FLUSH_WORK - then set the error
955 * to EINPROGRESS since some other thread is waiting to collect error status.
956 */
957 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
958 ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
959 else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
960 err = -EINPROGRESS;
961 else {
962 ep->conn_port = *dst;
963 init_waitqueue_head(&ep->sendwq);
964 init_waitqueue_head(&ep->recvwq);
965 init_waitqueue_head(&ep->conwq);
966 init_waitqueue_head(&ep->diswq);
967 ep->conn_async_state = 0;
968
969 if (unlikely(non_block))
970 ep->conn_async_state = ASYNC_CONN_INPROGRESS;
971 }
972 break;
973 }
974
975 if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
976 goto connect_simple_unlock1;
977
978 ep->state = SCIFEP_CONNECTING;
979 ep->remote_dev = &scif_dev[dst->node];
980 ep->sd_state = SCIFDEV_RUNNING;
981 ep->qp_info.qp->magic = SCIFEP_MAGIC;
982 ep->qp_info.qp->ep = (uint64_t)ep;
983 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
984 init_waitqueue_head(&ep->conn_pend_wq);
985 spin_lock(&ms_info.mi_nb_connect_lock);
986 list_add_tail(&ep->conn_list,
987 &ms_info.mi_nb_connect_list);
988 spin_unlock(&ms_info.mi_nb_connect_lock);
989 err = -EINPROGRESS;
990 queue_work(ms_info.mi_conn_wq, &ms_info.mi_conn_work);
991 }
992connect_simple_unlock1:
993 spin_unlock_irqrestore(&ep->lock, sflags);
994
995 if (err)
996 return err;
997 else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) {
998 flush_workqueue(ms_info.mi_conn_wq);
999 err = ep->conn_err;
1000 spin_lock_irqsave(&ep->lock, sflags);
1001 ep->conn_async_state = ASYNC_CONN_IDLE;
1002 spin_unlock_irqrestore(&ep->lock, sflags);
1003 } else {
1004 err = scif_conn_func(ep);
1005 }
1006 return err;
1007}
1008
1009int
1010scif_connect(scif_epd_t epd, struct scif_portID *dst)
1011{
1012 int ret;
1013 get_kref_count(epd);
1014 ret = __scif_connect(epd, dst, false);
1015 put_kref_count(epd);
1016 return ret;
1017}
1018EXPORT_SYMBOL(scif_connect);
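/*
 * Usage sketch (illustrative only): connecting a bound end point to a
 * remote listener. The node/port values are hypothetical.
 *
 *	struct scif_portID dst = { .node = 1, .port = 2300 };
 *	int err = scif_connect(epd, &dst);
 *	if (err == -ECONNREFUSED)
 *		return err;	// no listener at dst, or it rejected us
 *	if (err)
 *		return err;	// -EINVAL, -ENODEV, -EINTR, ...
 */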
1019
1020/**
1021 * scif_accept() - Accept a connection request from the remote node
1022 * @epd: The end point address returned from scif_open()
 1023 * @peer: Filled in with peer node and port information
 1024 * @newepd: New end point created for connection
 1025 * @flags: Indicates synchronous or asynchronous mode
1026 *
1027 * The function accepts a connection request from the remote node. Successful
 1028 * completion is indicated by a new end point being created and passed back
1029 * to the caller for future reference.
1030 *
 1031 * Upon successful completion zero will be returned and the peer information
1032 * will be filled in.
1033 *
1034 * If the end point is not in the listening state -EINVAL will be returned.
1035 *
 1036 * If resource allocation fails during the connection sequence, -ENOMEM
 1037 * will be returned.
1038 *
 1039 * If the function is called asynchronously and no connection requests are
 1040 * pending it will return -EAGAIN.
 1041 *
 1042 * If the remote side is not sending any connection requests the caller may
 1043 * terminate this function with a signal. If so, -EINTR will be returned.
1044 */
1045int
1046__scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
1047{
1048 struct endpt *lep = (struct endpt *)epd;
1049 struct endpt *cep;
1050 struct conreq *conreq;
1051 struct nodemsg msg;
1052 unsigned long sflags;
1053 int err;
1054
1055 pr_debug("SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]);
1056
1057 // Error if flags other than SCIF_ACCEPT_SYNC are set
1058 if (flags & ~SCIF_ACCEPT_SYNC) {
1059 pr_debug("SCIFAPI accept: ep %p invalid flags %x\n", lep, flags & ~SCIF_ACCEPT_SYNC);
1060 return -EINVAL;
1061 }
1062
1063 if (!peer || !newepd) {
1064 pr_debug("SCIFAPI accept: ep %p peer %p or newepd %p NULL\n",
1065 lep, peer, newepd);
1066 return -EINVAL;
1067 }
1068
1069 might_sleep();
1070 spin_lock_irqsave(&lep->lock, sflags);
1071 if (lep->state != SCIFEP_LISTENING) {
 1072 pr_debug("SCIFAPI accept: ep %p not listening\n", lep);
1073 spin_unlock_irqrestore(&lep->lock, sflags);
1074 return -EINVAL;
1075 }
1076
1077 if (!lep->conreqcnt && !(flags & SCIF_ACCEPT_SYNC)) {
1078 // No connection request present and we do not want to wait
1079 pr_debug("SCIFAPI accept: ep %p async request with nothing pending\n", lep);
1080 spin_unlock_irqrestore(&lep->lock, sflags);
1081 return -EAGAIN;
1082 }
1083
1084retry_connection:
1085 spin_unlock_irqrestore(&lep->lock, sflags);
1086 lep->files = current ? current->files : NULL;
1087 if ((err = wait_event_interruptible(lep->conwq,
1088 (lep->conreqcnt || (lep->state != SCIFEP_LISTENING)))) != 0) {
1089 // wait was interrupted
1090 pr_debug("SCIFAPI accept: ep %p ^C detected\n", lep);
1091 return err; // -ERESTARTSYS
1092 }
1093
1094 if (lep->state != SCIFEP_LISTENING) {
1095 return -EINTR;
1096 }
1097
1098 spin_lock_irqsave(&lep->lock, sflags);
1099
1100 if (!lep->conreqcnt) {
1101 goto retry_connection;
1102 }
1103
1104 // Get the first connect request off the list
1105 conreq = list_first_entry(&lep->conlist, struct conreq, list);
1106 list_del(&conreq->list);
1107 lep->conreqcnt--;
1108 spin_unlock_irqrestore(&lep->lock, sflags);
1109
1110 // Fill in the peer information
1111 peer->node = conreq->msg.src.node;
1112 peer->port = conreq->msg.src.port;
1113
1114 // Create the connection endpoint
1115 cep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL);
1116 if (!cep) {
1117 pr_debug("SCIFAPI accept: ep %p new end point allocation failed\n", lep);
1118 err = -ENOMEM;
1119 goto scif_accept_error_epalloc;
1120 }
1121 spin_lock_init(&cep->lock);
1122 mutex_init (&cep->sendlock);
1123 mutex_init (&cep->recvlock);
1124 cep->state = SCIFEP_CONNECTING;
1125 cep->remote_dev = &scif_dev[peer->node];
1126 cep->remote_ep = conreq->msg.payload[0];
1127 cep->sd_state = SCIFDEV_RUNNING;
1128
1129 if (!scifdev_alive(cep)) {
1130 err = -ENODEV;
1131 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
1132 goto scif_accept_error_qpalloc;
1133 }
1134
1135 if (micscif_rma_ep_init(cep) < 0) {
1136 pr_debug("SCIFAPI accept: ep %p new %p RMA EP init failed\n", lep, cep);
1137 err = -ENOMEM;
1138 goto scif_accept_error_qpalloc;
1139 }
1140
1141 if ((err = micscif_reserve_dma_chan(cep))) {
1142 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
1143 goto scif_accept_error_qpalloc;
1144 }
1145
1146 cep->qp_info.qp = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL);
1147 if (!cep->qp_info.qp) {
1148 printk(KERN_ERR "Port Qp Allocation Failed\n");
1149 err = -ENOMEM;
1150 goto scif_accept_error_qpalloc;
1151 }
1152
1153 cep->qp_info.qp->magic = SCIFEP_MAGIC;
1154 cep->qp_info.qp->ep = (uint64_t)cep;
1155 micscif_inc_node_refcnt(cep->remote_dev, 1);
1156 err = micscif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset,
1157 conreq->msg.payload[1], ENDPT_QP_SIZE, cep->remote_dev);
1158 if (err) {
1159 pr_debug("SCIFAPI accept: ep %p new %p micscif_setup_qp_accept %d qp_offset 0x%llx\n",
1160 lep, cep, err, cep->qp_info.qp_offset);
1161 micscif_dec_node_refcnt(cep->remote_dev, 1);
1162 goto scif_accept_error_map;
1163 }
1164
1165 cep->port.node = lep->port.node;
1166 cep->port.port = lep->port.port;
1167 cep->peer.node = peer->node;
1168 cep->peer.port = peer->port;
1169 cep->accepted_ep = true;
1170 init_waitqueue_head(&cep->sendwq); // Wait for data to be consumed
1171 init_waitqueue_head(&cep->recvwq); // Wait for data to be produced
1172 init_waitqueue_head(&cep->conwq); // Wait for connection request
1173
1174 // Return the grant message
1175 msg.uop = SCIF_CNCT_GNT;
1176 msg.src = cep->port;
1177 msg.payload[0] = cep->remote_ep;
1178 msg.payload[1] = cep->qp_info.qp_offset;
1179 msg.payload[2] = (uint64_t)cep;
1180
1181 err = micscif_nodeqp_send(cep->remote_dev, &msg, cep);
1182
1183 micscif_dec_node_refcnt(cep->remote_dev, 1);
1184 if (err)
1185 goto scif_accept_error_map;
1186retry:
1187 err = wait_event_timeout(cep->conwq,
1188 (cep->state != SCIFEP_CONNECTING), NODE_ACCEPT_TIMEOUT);
1189 if (!err && scifdev_alive(cep))
1190 goto retry;
1191
1192 if (!err) {
1193 err = -ENODEV;
1194 goto scif_accept_error_map;
1195 }
1196
1197 if (err > 0)
1198 err = 0;
1199
1200 kfree(conreq);
1201
1202 spin_lock_irqsave(&cep->lock, sflags);
1203
1204 if (cep->state == SCIFEP_CONNECTED) {
1205 // Connect sequence complete return new endpoint information
1206 *newepd = (scif_epd_t)cep;
1207 spin_unlock_irqrestore(&cep->lock, sflags);
 1208 pr_debug("SCIFAPI accept: ep %p new %p returning new end point\n", lep, cep);
1209 return 0;
1210 }
1211
1212 if (cep->state == SCIFEP_CLOSING) {
1213 // Remote failed to allocate resources and NAKed the grant.
1214 // There is at this point nothing referencing the new end point.
1215 spin_unlock_irqrestore(&cep->lock, sflags);
1216 micscif_teardown_ep((void *)cep);
1217 kfree(cep);
1218
1219 // If call with sync flag then go back and wait.
1220 if (flags & SCIF_ACCEPT_SYNC) {
1221 spin_lock_irqsave(&lep->lock, sflags);
1222 goto retry_connection;
1223 }
1224
1225 pr_debug("SCIFAPI accept: ep %p new %p remote failed to allocate resources\n", lep, cep);
1226 return -EAGAIN;
1227 }
1228
1229 // While connect was in progress the other side closed and sent a disconnect
1230 // so set the end point status to closed but return anyway. This will allow
1231 // the caller to drain anything the other side may have put in the message queue.
1232 *newepd = (scif_epd_t)cep;
1233 spin_unlock_irqrestore(&cep->lock, sflags);
1234 return 0;
1235
1236 // Error allocating or mapping resources
1237scif_accept_error_map:
1238 kfree(cep->qp_info.qp);
1239
1240scif_accept_error_qpalloc:
1241 kfree(cep);
1242
1243scif_accept_error_epalloc:
1244 micscif_inc_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
 1245 // Now reject the connection request due to lack of resources
1246 msg.uop = SCIF_CNCT_REJ;
1247 msg.dst.node = conreq->msg.src.node;
1248 msg.dst.port = conreq->msg.src.port;
1249 msg.payload[0] = conreq->msg.payload[0];
1250 msg.payload[1] = conreq->msg.payload[1];
1251 /* No error handling for Notification messages */
1252 micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, NULL);
1253 micscif_dec_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
1254
1255 kfree(conreq);
1256 return err;
1257}
1258
1259int
1260scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
1261{
1262 int ret;
1263 get_kref_count(epd);
1264 ret = __scif_accept(epd, peer, newepd, flags);
1265 if (ret == 0) {
1266 kref_init(&((*newepd)->ref_count));
1267 }
1268 put_kref_count(epd);
1269 return ret;
1270}
1271EXPORT_SYMBOL(scif_accept);
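/*
 * Usage sketch (illustrative only): a blocking accept on a listening
 * end point; peer is filled in on success.
 *
 *	struct scif_portID peer;
 *	scif_epd_t cep;
 *	int err = scif_accept(lep, &peer, &cep, SCIF_ACCEPT_SYNC);
 *	if (err)
 *		return err;	// -EINTR, -EINVAL, -ENOMEM or -EAGAIN
 *	pr_info("accepted node %d port %d\n", peer.node, peer.port);
 */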
1272
1273/*
1274 * scif_msg_param_check:
1275 * @epd: The end point address returned from scif_open()
1276 * @len: Length to receive
 1277 * @flags: Synchronous or asynchronous access
1278 *
1279 * Validate parameters for messaging APIs scif_send(..)/scif_recv(..).
1280 */
1281static inline int
1282scif_msg_param_check(scif_epd_t epd, int len, int flags)
1283{
1284 int ret = -EINVAL;
1285
1286 if (len < 0)
1287 goto err_ret;
1288
1289 if (flags && (!(flags & SCIF_RECV_BLOCK)))
1290 goto err_ret;
1291
1292 ret = 0;
1293
1294err_ret:
1295 return ret;
1296}
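/*
 * Examples (illustrative): scif_msg_param_check(epd, 16, 0) and
 * scif_msg_param_check(epd, 16, SCIF_RECV_BLOCK) both return 0; a
 * negative len, or a non-zero flags value without the blocking bit
 * set, returns -EINVAL. The helper is shared by the send and recv
 * paths (see the KAA note in __scif_send() below).
 */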
1297
1298#define SCIF_BLAST (1 << 1) /* Use bit 1 of flags field */
1299
1300#ifdef SCIF_BLAST
1301/*
1302 * Added a temporary implementation of the exception path.
1303 * The cost to the normal path is 1 local variable (set once and
1304 * tested once) plus 2 tests for the 'blast' flag.
 1305 * This only applies to the card-side kernel API.
1306 */
1307#ifndef _MIC_SCIF_
1308#undef SCIF_BLAST
1309#endif
1310#endif
1311
1312/**
1313 * _scif_send() - Send data to connection queue
1314 * @epd: The end point address returned from scif_open()
 1315 * @msg: Address of data to send
 1316 * @len: Length to send
 1317 * @flags: Synchronous or asynchronous access
1318 *
 1319 * This function sends a packet of data to the queue created by the
1320 * connection establishment sequence. It returns when the packet has
1321 * been completely sent.
1322 *
1323 * Successful completion returns the number of bytes sent.
1324 *
 1325 * If the end point is not in the connected state it returns -ENOTCONN.
1326 *
1327 * This function may be interrupted by a signal and will return -EINTR.
1328 */
1329int
1330_scif_send(scif_epd_t epd, void *msg, int len, int flags)
1331{
1332 struct endpt *ep = (struct endpt *)epd;
1333 struct nodemsg notif_msg;
1334 unsigned long sflags;
1335 size_t curr_xfer_len = 0;
1336 size_t sent_len = 0;
1337 size_t write_count;
1338 int ret;
1339#ifdef SCIF_BLAST
1340 int tl;
1341#endif
1342
1343 if (flags & SCIF_SEND_BLOCK)
1344 might_sleep();
1345
1346#ifdef SCIF_BLAST
1347 if (flags & SCIF_BLAST) {
1348 /*
 1349 * Make a decent attempt to acquire the lock (~100 uSec)
1350 */
1351 for (ret = tl = 0; ret < 100 && !tl; ret++) {
1352 tl = spin_trylock_irqsave(&ep->lock, sflags);
1353 cpu_relax();
1354 }
1355 } else {
1356 tl = 1;
1357 spin_lock_irqsave(&ep->lock, sflags);
1358 }
1359#else
1360 spin_lock_irqsave(&ep->lock, sflags);
1361#endif
1362
1363 while (sent_len != len) {
1364 if (ep->state == SCIFEP_DISCONNECTED) {
1365 ret = (int)(sent_len ? sent_len : -ECONNRESET);
1366 goto unlock_dec_return;
1367 }
1368 if (ep->state != SCIFEP_CONNECTED) {
1369 ret = (int)(sent_len ? sent_len : -ENOTCONN);
1370 goto unlock_dec_return;
1371 }
1372 if (!scifdev_alive(ep)) {
1373 ret = (int) (sent_len ? sent_len : -ENODEV);
1374 goto unlock_dec_return;
1375 }
1376 write_count = micscif_rb_space(&ep->qp_info.qp->outbound_q);
1377 if (write_count) {
1378 /*
1379 * Best effort to send as much data as there
1380 * is space in the RB particularly important for the
1381 * Non Blocking case.
1382 */
1383 curr_xfer_len = min(len - sent_len, write_count);
1384 ret = micscif_rb_write(&ep->qp_info.qp->outbound_q, msg,
1385 (uint32_t)curr_xfer_len);
1386 if (ret < 0) {
1387 ret = -EFAULT;
1388 goto unlock_dec_return;
1389 }
1390 if (ret) {
1391 spin_unlock_irqrestore(&ep->lock, sflags);
1392 /*
1393 * If there is space in the RB and we have the
1394 * EP lock held then writing to the RB should
1395 * succeed. Releasing spin lock before asserting
1396 * to avoid deadlocking the system.
1397 */
1398 BUG_ON(ret);
1399 }
1400 /*
1401 * Success. Update write pointer.
1402 */
1403 micscif_rb_commit(&ep->qp_info.qp->outbound_q);
1404#ifdef SCIF_BLAST
1405 if (flags & SCIF_BLAST) {
1406 /*
 1407 * Bypass path; set flag in the host side node_qp
 1408 * and ring the doorbell. Host will wake up all
1409 * listeners, such that the message will be seen.
1410 * Need micscif_send_host_intr() to be non-static.
1411 */
1412 extern int micscif_send_host_intr(struct micscif_dev *, uint32_t);
1413 ep->remote_dev->qpairs->remote_qp->blast = 1;
1414 smp_wmb(); /* Sufficient or need sfence? */
1415 micscif_send_host_intr(ep->remote_dev, 0);
1416 } else {
1417 /*
1418 * Normal path: send notification on the
1419 * node_qp ring buffer and ring the doorbell.
1420 */
1421 notif_msg.src = ep->port;
1422 notif_msg.uop = SCIF_CLIENT_SENT;
1423 notif_msg.payload[0] = ep->remote_ep;
1424 if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
1425 ret = sent_len ? sent_len : ret;
1426 goto unlock_dec_return;
1427 }
1428 }
1429#else
1430 /*
1431 * Send a notification to the peer about the
1432 * produced data message.
1433 */
1434 notif_msg.src = ep->port;
1435 notif_msg.uop = SCIF_CLIENT_SENT;
1436 notif_msg.payload[0] = ep->remote_ep;
1437 if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
1438 ret = (int)(sent_len ? sent_len : ret);
1439 goto unlock_dec_return;
1440 }
1441#endif
1442 sent_len += curr_xfer_len;
1443 msg = (char *)msg + curr_xfer_len;
1444 continue;
1445 }
1446 curr_xfer_len = min(len - sent_len, (size_t)(ENDPT_QP_SIZE - 1));
1447 /*
1448 * Not enough space in the RB. Return in the Non Blocking case.
1449 */
1450 if (!(flags & SCIF_SEND_BLOCK)) {
1451 ret = (int)sent_len;
1452 goto unlock_dec_return;
1453 }
1454#ifdef SCIF_BLAST
1455 /*
1456 * Flags SCIF_BLAST and SCIF_SEND_BLOCK are mutually
1457 * exclusive, so if we get here we know that SCIF_BLAST
1458 * was not set and thus we _do_ have the spinlock.
1459 * No need to check variable tl here
1460 */
1461#endif
1462 spin_unlock_irqrestore(&ep->lock, sflags);
1463 /*
1464 * Wait for a message now in the Blocking case.
1465 */
1466 if ((ret = wait_event_interruptible(ep->sendwq,
1467 (SCIFEP_CONNECTED != ep->state) ||
1468 (micscif_rb_space(&ep->qp_info.qp->outbound_q)
1469 >= curr_xfer_len) || (!scifdev_alive(ep))))) {
1470 ret = (int) (sent_len ? sent_len : ret);
1471 goto dec_return;
1472 }
1473 spin_lock_irqsave(&ep->lock, sflags);
1474 }
1475 ret = len;
1476unlock_dec_return:
1477#ifdef SCIF_BLAST
1478 if (tl)
1479#endif
1480 spin_unlock_irqrestore(&ep->lock, sflags);
1481dec_return:
1482 return ret;
1483}
1484
1485/**
 1486 * _scif_recv() - Receive data from connection queue
 1487 * @epd: The end point address returned from scif_open()
 1488 * @msg: Address to place data
 1489 * @len: Length to receive
 1490 * @flags: Synchronous or asynchronous access
1492 *
1493 * This function requests to receive a packet of data from the queue
1494 * created by the connection establishment sequence. It reads the amount
1495 * of data requested before returning.
1496 *
 1497 * This function differs from scif_send() in that it also returns data if the
1498 * end point is in the disconnected state and data is present.
1499 *
1500 * Successful completion returns the number of bytes read.
1501 *
 1502 * If the end point is neither in the connected state nor in the disconnected
 1503 * state with data present, it returns -ENOTCONN.
1504 *
1505 * This function may be interrupted by a signal and will return -EINTR.
1506 */
1507int
1508_scif_recv(scif_epd_t epd, void *msg, int len, int flags)
1509{
1510 int read_size;
1511 struct endpt *ep = (struct endpt *)epd;
1512 unsigned long sflags;
1513 struct nodemsg notif_msg;
1514 size_t curr_recv_len = 0;
1515 size_t remaining_len = len;
1516 size_t read_count;
1517 int ret;
1518
1519 if (flags & SCIF_RECV_BLOCK)
1520 might_sleep();
1521
1522 micscif_inc_node_refcnt(ep->remote_dev, 1);
1523 spin_lock_irqsave(&ep->lock, sflags);
1524 while (remaining_len) {
1525 if (ep->state != SCIFEP_CONNECTED &&
1526 ep->state != SCIFEP_DISCONNECTED) {
1527 ret = (int) (len - remaining_len) ?
1528 (int) (len - remaining_len) : -ENOTCONN;
1529 goto unlock_dec_return;
1530 }
1531 read_count = micscif_rb_count(&ep->qp_info.qp->inbound_q,
1532 (int) remaining_len);
1533 if (read_count) {
1534 /*
1535 * Best effort to recv as much data as there
1536 * are bytes to read in the RB particularly
1537 * important for the Non Blocking case.
1538 */
1539 curr_recv_len = min(remaining_len, read_count);
1540 read_size = micscif_rb_get_next(
1541 &ep->qp_info.qp->inbound_q,
1542 msg, (int) curr_recv_len);
 1543 if (read_size < 0) {
 1544 /* can only happen when copying to a USER buffer
 1545 */
1546 ret = -EFAULT;
1547 goto unlock_dec_return;
1548 }
1549 if (read_size != curr_recv_len) {
1550 spin_unlock_irqrestore(&ep->lock, sflags);
1551 /*
1552 * If there are bytes to be read from the RB and
1553 * we have the EP lock held then reading from
1554 * RB should succeed. Releasing spin lock before
1555 * asserting to avoid deadlocking the system.
1556 */
1557 BUG_ON(read_size != curr_recv_len);
1558 }
1559 if (ep->state == SCIFEP_CONNECTED) {
1560 /*
1561 * Update the read pointer only if the endpoint is
1562 * still connected else the read pointer might no
1563 * longer exist since the peer has freed resources!
1564 */
1565 micscif_rb_update_read_ptr(&ep->qp_info.qp->inbound_q);
1566 /*
1567 * Send a notification to the peer about the
1568 * consumed data message only if the EP is in
1569 * SCIFEP_CONNECTED state.
1570 */
1571 notif_msg.src = ep->port;
1572 notif_msg.uop = SCIF_CLIENT_RCVD;
1573 notif_msg.payload[0] = ep->remote_ep;
1574 if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
1575 ret = (len - (int)remaining_len) ?
1576 (len - (int)remaining_len) : ret;
1577 goto unlock_dec_return;
1578 }
1579 }
1580 remaining_len -= curr_recv_len;
1581 msg = (char *)msg + curr_recv_len;
1582 continue;
1583 }
1584 curr_recv_len = min(remaining_len, (size_t)(ENDPT_QP_SIZE - 1));
1585 /*
1586 * Bail out now if the EP is in SCIFEP_DISCONNECTED state else
1587 * we will keep looping forever.
1588 */
1589 if (ep->state == SCIFEP_DISCONNECTED) {
1590 ret = (len - (int)remaining_len) ?
1591 (len - (int)remaining_len) : -ECONNRESET;
1592 goto unlock_dec_return;
1593 }
1594 /*
1595 * Return in the Non Blocking case if there is no data
1596 * to read in this iteration.
1597 */
1598 if (!(flags & SCIF_RECV_BLOCK)) {
1599 ret = len - (int)remaining_len;
1600 goto unlock_dec_return;
1601 }
1602 spin_unlock_irqrestore(&ep->lock, sflags);
1603 micscif_dec_node_refcnt(ep->remote_dev, 1);
1604 /*
 1605 * Wait for a message now in the Blocking case,
 1606 * or until the other side disconnects.
1607 */
1608 if ((ret = wait_event_interruptible(ep->recvwq,
1609 (SCIFEP_CONNECTED != ep->state) ||
1610 (micscif_rb_count(&ep->qp_info.qp->inbound_q,
1611 curr_recv_len) >= curr_recv_len) || (!scifdev_alive(ep))))) {
1612 ret = (len - remaining_len) ?
1613 (len - (int)remaining_len) : ret;
1614 goto dec_return;
1615 }
1616 micscif_inc_node_refcnt(ep->remote_dev, 1);
1617 spin_lock_irqsave(&ep->lock, sflags);
1618 }
1619 ret = len;
1620unlock_dec_return:
1621 spin_unlock_irqrestore(&ep->lock, sflags);
1622 micscif_dec_node_refcnt(ep->remote_dev, 1);
1623dec_return:
1624 return ret;
1625}
1626
1627
1628/**
1629 * scif_user_send() - Send data to connection queue
1630 * @epd: The end point address returned from scif_open()
 1631 * @msg: Address of data to send
 1632 * @len: Length to send
 1633 * @flags: Synchronous or asynchronous access
1634 *
1635 * This function is called from the driver IOCTL entry point
1636 * only and is a wrapper for _scif_send().
1637 */
1638int
1639scif_user_send(scif_epd_t epd, void *msg, int len, int flags)
1640{
1641 struct endpt *ep = (struct endpt *)epd;
1642 int err = 0;
1643 int sent_len = 0;
1644 char *tmp;
1645 int loop_len;
 1646 int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));
1647 pr_debug("SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]);
1648
1649 if (!len)
1650 return 0;
1651
1652 if ((err = scif_msg_param_check(epd, len, flags)))
1653 goto send_err;
1654
1655 if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {
1656 err = -ENOMEM;
1657 goto send_err;
1658 }
1659 err = 0;
1660 micscif_inc_node_refcnt(ep->remote_dev, 1);
1661 /*
 1662 * Grabbing the lock before breaking up the transfer into
1663 * multiple chunks is required to ensure that messages do
1664 * not get fragmented and reordered.
1665 */
1666 mutex_lock(&ep->sendlock);
1667
1668 while (sent_len != len) {
1669 msg = (void *)((char *)msg + err);
1670 loop_len = len - sent_len;
1671 loop_len = min(chunk_len, loop_len);
1672 if (copy_from_user(tmp, msg, loop_len)) {
1673 err = -EFAULT;
1674 goto send_free_err;
1675 }
1676 err = _scif_send(epd, (void *)tmp, loop_len, flags);
1677 if (err < 0) {
1678 goto send_free_err;
1679 }
1680 sent_len += err;
 1681 if (err != loop_len) {
1682 goto send_free_err;
1683 }
1684 }
1685send_free_err:
1686 mutex_unlock(&ep->sendlock);
1687 micscif_dec_node_refcnt(ep->remote_dev, 1);
1688 kfree(tmp);
1689send_err:
1690 return err < 0 ? err : sent_len;
1691}
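/*
 * Note on chunk_len above (and in scif_user_recv() below): with the
 * common x86 values PAGE_SHIFT = 12 and MAX_ORDER = 11, the bounce
 * buffer is capped at 1 << (11 + 12 - 1) = 4 MB per kmalloc/copy
 * iteration, the largest contiguous allocation the page allocator
 * can satisfy.
 */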
1692
1693/**
 1694 * scif_user_recv() - Receive data from connection queue
1695 * @epd: The end point address returned from scif_open()
1696 * @msg: Address to place data
1697 * @len: Length to receive
 1698 * @flags: Synchronous or asynchronous access
1699 *
1700 * This function is called from the driver IOCTL entry point
1701 * only and is a wrapper for _scif_recv().
1702 */
1703int
1704scif_user_recv(scif_epd_t epd, void *msg, int len, int flags)
1705{
1706 struct endpt *ep = (struct endpt *)epd;
1707 int err = 0;
1708 int recv_len = 0;
1709 char *tmp;
1710 int loop_len;
 1711 int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));
1712 pr_debug("SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]);
1713
1714 if (!len)
1715 return 0;
1716
1717 if ((err = scif_msg_param_check(epd, len, flags)))
1718 goto recv_err;
1719
1720 if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {
1721 err = -ENOMEM;
1722 goto recv_err;
1723 }
1724 err = 0;
1725 /*
 1726 * Grabbing the lock before breaking up the transfer into
1727 * multiple chunks is required to ensure that messages do
1728 * not get fragmented and reordered.
1729 */
1730 mutex_lock(&ep->recvlock);
1731
1732 while (recv_len != len) {
1733 msg = (void *)((char *)msg + err);
1734 loop_len = len - recv_len;
1735 loop_len = min(chunk_len, loop_len);
1736 if ((err = _scif_recv(epd, tmp, loop_len, flags)) < 0)
1737 goto recv_free_err;
1738 if (copy_to_user(msg, tmp, err)) {
1739 err = -EFAULT;
1740 goto recv_free_err;
1741 }
1742 recv_len += err;
 1743 if (err != loop_len) {
1744 goto recv_free_err;
1745 }
1746 }
1747recv_free_err:
1748 mutex_unlock(&ep->recvlock);
1749 kfree(tmp);
1750recv_err:
1751 return err < 0 ? err : recv_len;
1752}
1753
1754#ifdef SCIF_BLAST
1755/*
1756 * Added a temporary implementation of the exception path.
 1757 * The cost to the normal path is testing of 2 flag bits instead
 1758 * of just one, and a change to the condition for node wakeup.
1759 */
1760#endif
1761
1762/**
1763 * scif_send() - Send data to connection queue
1764 * @epd: The end point address returned from scif_open()
 1765 * @msg: Address of data to send
 1766 * @len: Length to send
 1767 * @flags: Synchronous or asynchronous access
1768 *
1769 * This function is called from the kernel mode only and is
1770 * a wrapper for _scif_send().
1771 */
1772int
1773__scif_send(scif_epd_t epd, void *msg, int len, int flags)
1774{
1775 struct endpt *ep = (struct endpt *)epd;
1776 int ret;
1777
1778 pr_debug("SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]);
1779 if (!len)
1780 return 0;
1781
1782#ifdef SCIF_BLAST
1783 /*
 1784 * KAA: this is the same code as scif_msg_param_check(),
 1785 * but since that routine is shared with scif_recv
 1786 * I thought it safer to replicate the code here.
1787 */
1788 if (len < 0)
1789 return -EINVAL;
1790
1791 if (flags && !(flags & (SCIF_SEND_BLOCK | SCIF_BLAST)))
1792 return -EINVAL;
1793
1794 if ((flags & (SCIF_SEND_BLOCK | SCIF_BLAST)) ==
1795 (SCIF_SEND_BLOCK | SCIF_BLAST))
1796 return -EINVAL;
1797#else
1798 if ((ret = scif_msg_param_check(epd, len, flags)))
1799 return ret;
1800#endif
1801 /*
1802 * Cannot block while waiting for node to wake up
1803 * if non blocking messaging mode is requested. Return
1804 * ENODEV if the remote node is idle.
1805 */
1806 if (!(flags & SCIF_SEND_BLOCK) && ep->remote_dev &&
1807 SCIF_NODE_IDLE == atomic_long_read(
1808 &ep->remote_dev->scif_ref_cnt))
1809 return -ENODEV;
1810
1811 micscif_inc_node_refcnt(ep->remote_dev, 1);
1812
1813 /*
1814 * Grab the mutex lock in the blocking case only
1815 * to ensure messages do not get fragmented/reordered.
1816 * The non blocking mode is protected using spin locks
1817 * in _scif_send().
1818 */
1819 if (flags & SCIF_SEND_BLOCK)
1820 mutex_lock(&ep->sendlock);
1821
1822 ret = _scif_send(epd, msg, len, flags);
1823
1824 if (flags & SCIF_SEND_BLOCK)
1825 mutex_unlock(&ep->sendlock);
1826
1827 micscif_dec_node_refcnt(ep->remote_dev, 1);
1828 return ret;
1829}
1830
1831int
1832scif_send(scif_epd_t epd, void *msg, int len, int flags)
1833{
1834 int ret;
1835 get_kref_count(epd);
1836 ret = __scif_send(epd, msg, len, flags);
1837 put_kref_count(epd);
1838 return ret;
1839}
1840EXPORT_SYMBOL(scif_send);
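/*
 * Usage sketch (illustrative only): a blocking send of a fixed-size
 * message from kernel mode. A short return (ret < sizeof(m)) is
 * possible if the wait is interrupted by a signal.
 *
 *	struct { int cmd; int arg; } m = { 1, 0 };	// hypothetical payload
 *	int ret = scif_send(epd, &m, sizeof(m), SCIF_SEND_BLOCK);
 *	if (ret < 0)
 *		return ret;	// -ENOTCONN, -ECONNRESET, -ENODEV, ...
 *	if (ret != sizeof(m))
 *		return -EINTR;	// partial send after a signal
 */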
1841
1842/**
 1843 * scif_recv() - Receive data from connection queue
 1844 * @epd: The end point address returned from scif_open()
 1845 * @msg: Address to place data
 1846 * @len: Length to receive
 1847 * @flags: Synchronous or asynchronous access
1848 *
1849 * This function is called from the kernel mode only and is
1850 * a wrapper for _scif_recv().
1851 */
1852int
1853__scif_recv(scif_epd_t epd, void *msg, int len, int flags)
1854{
1855 struct endpt *ep = (struct endpt *)epd;
1856 int ret;
1857
1858 pr_debug("SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]);
1859
1860 if (!len)
1861 return 0;
1862
1863 if ((ret = scif_msg_param_check(epd, len, flags)))
1864 return ret;
1865
1866 /*
1867 * Cannot block while waiting for node to wake up
1868 * if non blocking messaging mode is requested. Return
1869 * ENODEV if the remote node is idle.
1870 */
1871 if (!flags && ep->remote_dev &&
1872 SCIF_NODE_IDLE == atomic_long_read(
1873 &ep->remote_dev->scif_ref_cnt))
1874 return -ENODEV;
1875
1876 /*
1877 * Grab the mutex lock in the blocking case only
1878 * to ensure messages do not get fragmented/reordered.
1879 * The non blocking mode is protected using spin locks
 1880 * in _scif_recv().
1881 */
1882 if (flags & SCIF_RECV_BLOCK)
1883 mutex_lock(&ep->recvlock);
1884
1885 ret = _scif_recv(epd, msg, len, flags);
1886
1887 if (flags & SCIF_RECV_BLOCK)
1888 mutex_unlock(&ep->recvlock);
1889
1890 return ret;
1891}
1892
1893int
1894scif_recv(scif_epd_t epd, void *msg, int len, int flags)
1895{
1896 int ret;
1897 get_kref_count(epd);
1898 ret = __scif_recv(epd, msg, len, flags);
1899 put_kref_count(epd);
1900 return ret;
1901}
1902EXPORT_SYMBOL(scif_recv);
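/*
 * Usage sketch (illustrative, not part of the original source): the receive
 * side of the send example above. SCIF_RECV_BLOCK waits until `len` bytes
 * have arrived; without it the call returns whatever is currently available
 * in the inbound queue.
 */
#if 0 /* example only */
static int example_recv_hello(scif_epd_t epd)
{
	char buf[6];	/* sized to match the 6-byte "hello" message */
	int ret;

	ret = scif_recv(epd, buf, sizeof(buf), SCIF_RECV_BLOCK);
	if (ret < 0)
		return ret;
	pr_debug("received %d bytes: %s\n", ret, buf);
	return 0;
}
#endif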
1903
1904/**
1905 * __scif_pin_pages - __scif_pin_pages() pins the physical pages which back
1906 * the range of virtual address pages starting at addr and continuing for
1907 * len bytes. addr and len are constrained to be multiples of the page size.
 1908 * A successful __scif_pin_pages() call returns an opaque pointer value
1909 * which may be used in subsequent calls to scif_register_pinned_pages().
1910 *
1911 * Return Values
1912 * Upon successful completion, __scif_pin_pages() returns a
1913 * scif_pinned_pages_t value else an apt error is returned as documented
1914 * in scif.h. Protections of the set of pinned pages are also returned by
1915 * reference via out_prot.
1916 */
1917int
1918__scif_pin_pages(void *addr, size_t len, int *out_prot,
1919 int map_flags, scif_pinned_pages_t *pages)
1920{
1921 struct scif_pinned_pages *pinned_pages;
1922 int nr_pages, err = 0, i;
1923 bool vmalloc_addr = false;
1924 bool try_upgrade = false;
1925 int prot = *out_prot;
1926 int ulimit = 0;
1927 struct mm_struct *mm = NULL;
1928
1929 /* Unsupported flags */
1930 if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT))
1931 return -EINVAL;
1932 ulimit = !!(map_flags & SCIF_MAP_ULIMIT);
1933
1934 /* Unsupported protection requested */
1935 if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
1936 return -EINVAL;
1937
1938 /* addr/len must be page aligned. len should be non zero */
1939 if ((!len) ||
1940 (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
1941 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
1942 return -EINVAL;
1943
1944 might_sleep();
1945
1946 nr_pages = (int)(len >> PAGE_SHIFT);
1947
1948 /* Allocate a set of pinned pages */
1949 if (!(pinned_pages = micscif_create_pinned_pages(nr_pages, prot)))
1950 return -ENOMEM;
1951
1952 if (unlikely(map_flags & SCIF_MAP_KERNEL)) {
1953 if (is_vmalloc_addr(addr))
1954 vmalloc_addr = true;
1955
1956 for (i = 0; i < nr_pages; i++) {
1957 if (unlikely(vmalloc_addr))
1958 pinned_pages->pages[i] =
1959 vmalloc_to_page((char *)addr + (i * PAGE_SIZE) );
1960 else
1961 pinned_pages->pages[i] =
1962 virt_to_page((char *)addr + (i * PAGE_SIZE) );
1963 pinned_pages->num_pages[i] = 1;
1964 pinned_pages->nr_contig_chunks++;
1965 }
1966 pinned_pages->nr_pages = nr_pages;
1967 pinned_pages->map_flags = SCIF_MAP_KERNEL;
1968 } else {
1969 if (prot == SCIF_PROT_READ)
1970 try_upgrade = true;
1971 prot |= SCIF_PROT_WRITE;
1972retry:
1973 mm = current->mm;
1974 down_write(&mm->mmap_sem);
1975 if (ulimit) {
1976 err = __scif_check_inc_pinned_vm(mm, nr_pages);
1977 if (err) {
1978 up_write(&mm->mmap_sem);
1979 pinned_pages->nr_pages = 0;
1980 goto error_unmap;
1981 }
1982 }
1983
1984 pinned_pages->nr_pages = get_user_pages(
1985 (uint64_t)addr,
1986 nr_pages,
 1987 prot & SCIF_PROT_WRITE ? FOLL_WRITE : 0,
1988 pinned_pages->pages,
1989 pinned_pages->vma);
1990 up_write(&mm->mmap_sem);
1991 if (nr_pages == pinned_pages->nr_pages) {
1992#ifdef RMA_DEBUG
1993 atomic_long_add_return(nr_pages, &ms_info.rma_pin_cnt);
1994#endif
1995 micscif_detect_large_page(pinned_pages, addr);
1996 } else {
1997 if (try_upgrade) {
1998 if (ulimit)
1999 __scif_dec_pinned_vm_lock(mm, nr_pages, 0);
2000#ifdef RMA_DEBUG
2001 WARN_ON(atomic_long_sub_return(1,
2002 &ms_info.rma_mm_cnt) < 0);
2003#endif
2004 /* Roll back any pinned pages */
2005 for (i = 0; i < pinned_pages->nr_pages; i++) {
2006 if (pinned_pages->pages[i])
 2007 put_page(pinned_pages->pages[i]);
2008 }
2009 prot &= ~SCIF_PROT_WRITE;
2010 try_upgrade = false;
2011 goto retry;
2012 }
2013 }
2014 pinned_pages->map_flags = 0;
2015 }
2016
2017 if (pinned_pages->nr_pages < nr_pages) {
2018 err = -EFAULT;
2019 pinned_pages->nr_pages = nr_pages;
2020 goto dec_pinned;
2021 }
2022
2023 *out_prot = prot;
2024 atomic_set(&pinned_pages->ref_count, nr_pages);
2025 *pages = pinned_pages;
2026 return err;
2027dec_pinned:
2028 if (ulimit)
2029 __scif_dec_pinned_vm_lock(mm, nr_pages, 0);
2030 /* Something went wrong! Rollback */
2031error_unmap:
2032 pinned_pages->nr_pages = nr_pages;
2033 micscif_destroy_pinned_pages(pinned_pages);
2034 *pages = NULL;
2035 pr_debug("%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
2036 return err;
2037
2038}
2039
2040/**
2041 * scif_pin_pages - scif_pin_pages() pins the physical pages which back
2042 * the range of virtual address pages starting at addr and continuing for
2043 * len bytes. addr and len are constrained to be multiples of the page size.
 2044 * A successful scif_pin_pages() call returns an opaque pointer value
2045 * which may be used in subsequent calls to scif_register_pinned_pages().
2046 *
2047 * Return Values
 2048 * Upon successful completion, scif_pin_pages() returns a
2049 * scif_pinned_pages_t value else an apt error is returned as documented
2050 * in scif.h
2051 */
2052int
2053scif_pin_pages(void *addr, size_t len, int prot,
2054 int map_flags, scif_pinned_pages_t *pages)
2055{
2056 return __scif_pin_pages(addr, len, &prot, map_flags, pages);
2057}
2058EXPORT_SYMBOL(scif_pin_pages);
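/*
 * Usage sketch (illustrative, not part of the original source): pinning a
 * page-aligned kernel buffer so it can later be registered with
 * scif_register_pinned_pages(). The buffer comes from __get_free_pages(),
 * which is naturally page aligned, satisfying the checks above.
 */
#if 0 /* example only */
static scif_pinned_pages_t example_pin_kernel_buf(void **bufp, size_t len)
{
	scif_pinned_pages_t pp;
	void *buf = (void *)__get_free_pages(GFP_KERNEL, get_order(len));

	if (!buf)
		return NULL;
	if (scif_pin_pages(buf, len, SCIF_PROT_READ | SCIF_PROT_WRITE,
			   SCIF_MAP_KERNEL, &pp)) {
		free_pages((unsigned long)buf, get_order(len));
		return NULL;
	}
	*bufp = buf;
	return pp;
}
#endif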
2059
2060/**
2061 * scif_unpin_pages: Unpin a set of pages
2062 *
2063 * Return Values:
2064 * Upon successful completion, scif_unpin_pages() returns 0;
2065 * else an apt error is returned as documented in scif.h
2066 */
2067int
2068scif_unpin_pages(scif_pinned_pages_t pinned_pages)
2069{
2070 int err = 0, ret;
2071
2072 if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic)
2073 return -EINVAL;
2074
2075 ret = atomic_sub_return((int32_t)pinned_pages->nr_pages,
2076 &pinned_pages->ref_count);
2077 BUG_ON(ret < 0);
2078
2079 /*
2080 * Destroy the window if the ref count for this set of pinned
2081 * pages has dropped to zero. If it is positive then there is
2082 * a valid registered window which is backed by these pages and
2083 * it will be destroyed once all such windows are unregistered.
2084 */
2085 if (!ret)
2086 err = micscif_destroy_pinned_pages(pinned_pages);
2087
2088 return err;
2089}
2090EXPORT_SYMBOL(scif_unpin_pages);
2091
2092/**
2093 * scif_register_pinned_pages: Mark a memory region for remote access.
2094 *
2095 * The scif_register_pinned_pages() function opens a window, a range
2096 * of whole pages of the registered address space of the endpoint epd,
2097 * starting at offset po. The value of po, further described below, is
2098 * a function of the parameters offset and pinned_pages, and the value
2099 * of map_flags. Each page of the window represents a corresponding
2100 * physical memory page of pinned_pages; the length of the window is
 2101 * the same as the length of pinned_pages. A successful
 2102 * scif_register_pinned_pages() call returns po as the return value.
2103 *
2104 * Return Values
2105 * Upon successful completion, scif_register_pinned_pages() returns
2106 * the offset at which the mapping was placed (po);
2107 * else an apt error is returned as documented in scif.h
2108 */
2109off_t
2110__scif_register_pinned_pages(scif_epd_t epd,
2111 scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
2112{
2113 struct endpt *ep = (struct endpt *)epd;
2114 uint64_t computed_offset;
2115 struct reg_range_t *window;
2116 int err;
2117 size_t len;
2118
2119#ifdef DEBUG
2120 /* Bad EP */
2121 if (!ep || !pinned_pages || pinned_pages->magic != SCIFEP_MAGIC)
2122 return -EINVAL;
2123#endif
2124 /* Unsupported flags */
2125 if (map_flags & ~SCIF_MAP_FIXED)
2126 return -EINVAL;
2127
2128 len = pinned_pages->nr_pages << PAGE_SHIFT;
2129
2130 /*
2131 * Offset is not page aligned/negative or offset+len
2132 * wraps around with SCIF_MAP_FIXED.
2133 */
2134 if ((map_flags & SCIF_MAP_FIXED) &&
2135 ((align_low(offset, PAGE_SIZE) != offset) ||
2136 (offset < 0) ||
2137 (offset + (off_t)len < offset)))
2138 return -EINVAL;
2139
2140 might_sleep();
2141
2142 if ((err = verify_epd(ep)))
2143 return err;
2144
2145 /* Compute the offset for this registration */
2146 if ((err = micscif_get_window_offset(ep, map_flags, offset,
2147 len, &computed_offset)))
2148 return err;
2149
2150 /* Allocate and prepare self registration window */
2151 if (!(window = micscif_create_window(ep, pinned_pages->nr_pages,
2152 computed_offset, false))) {
2153 micscif_free_window_offset(ep, computed_offset, len);
2154 return -ENOMEM;
2155 }
2156
2157 window->pinned_pages = pinned_pages;
2158 window->nr_pages = pinned_pages->nr_pages;
2159 window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
2160 window->prot = pinned_pages->prot;
2161
2162 /*
2163 * This set of pinned pages now belongs to this window as well.
2164 * Assert if the ref count is zero since it is an error to
2165 * pass pinned_pages to scif_register_pinned_pages() after
2166 * calling scif_unpin_pages().
2167 */
2168 if (!atomic_add_unless(&pinned_pages->ref_count,
2169 (int32_t)pinned_pages->nr_pages, 0))
2170 BUG_ON(1);
2171
2172 micscif_inc_node_refcnt(ep->remote_dev, 1);
2173
2174 if ((err = micscif_send_alloc_request(ep, window))) {
2175 micscif_dec_node_refcnt(ep->remote_dev, 1);
2176 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2177 goto error_unmap;
2178 }
2179
2180 /* Prepare the remote registration window */
2181 if ((err = micscif_prep_remote_window(ep, window))) {
2182 micscif_dec_node_refcnt(ep->remote_dev, 1);
2183 micscif_set_nr_pages(ep->remote_dev, window);
2184 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2185 goto error_unmap;
2186 }
2187
2188 /* Tell the peer about the new window */
2189 if ((err = micscif_send_scif_register(ep, window))) {
2190 micscif_dec_node_refcnt(ep->remote_dev, 1);
2191 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2192 goto error_unmap;
2193 }
2194
2195 micscif_dec_node_refcnt(ep->remote_dev, 1);
2196
2197 /* No further failures expected. Insert new window */
2198 mutex_lock(&ep->rma_info.rma_lock);
2199 set_window_ref_count(window, pinned_pages->nr_pages);
2200 micscif_insert_window(window, &ep->rma_info.reg_list);
2201 mutex_unlock(&ep->rma_info.rma_lock);
2202
2203 return computed_offset;
2204error_unmap:
2205 micscif_destroy_window(ep, window);
2206 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2207 return err;
2208}
2209
2210off_t
2211scif_register_pinned_pages(scif_epd_t epd,
2212 scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
2213{
2214 off_t ret;
2215 get_kref_count(epd);
2216 ret = __scif_register_pinned_pages(epd, pinned_pages, offset, map_flags);
2217 put_kref_count(epd);
2218 return ret;
2219}
2220EXPORT_SYMBOL(scif_register_pinned_pages);
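/*
 * Usage sketch (illustrative, not part of the original source): exposing a
 * previously pinned set of pages through a window at a caller-chosen offset.
 * SCIF_MAP_FIXED requests that exact offset; without it the implementation
 * picks one. Per the ref counting above, the pages stay live until both the
 * window is unregistered and scif_unpin_pages() has been called.
 */
#if 0 /* example only */
static off_t example_register_pinned(scif_epd_t epd, scif_pinned_pages_t pp,
				     size_t len)
{
	off_t off = scif_register_pinned_pages(epd, pp, 0x100000,
					       SCIF_MAP_FIXED);
	if (off < 0)
		return off;	/* -errno */
	/* ... window usable for RMA until scif_unregister(epd, off, len) ... */
	return off;
}
#endif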
2221
2222/**
2223 * scif_get_pages - Add references to remote registered pages
2224 *
2225 * scif_get_pages() returns the addresses of the physical pages represented
2226 * by those pages of the registered address space of the peer of epd, starting
2227 * at offset offset and continuing for len bytes. offset and len are constrained
2228 * to be multiples of the page size.
2229 *
2230 * Return Values
2231 * Upon successful completion, scif_get_pages() returns 0;
2232 * else an apt error is returned as documented in scif.h.
2233 */
2234int
2235__scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
2236{
2237 struct endpt *ep = (struct endpt *)epd;
2238 struct micscif_rma_req req;
2239 struct reg_range_t *window = NULL;
2240 int nr_pages, err, i;
2241
2242 pr_debug("SCIFAPI get_pinned_pages: ep %p %s offset 0x%lx len 0x%lx\n",
2243 ep, scif_ep_states[ep->state], offset, len);
2244
2245 if ((err = verify_epd(ep)))
2246 return err;
2247
2248 if ((!len) ||
2249 (offset < 0) ||
2250 (offset + len < offset) ||
2251 (align_low((uint64_t)offset, PAGE_SIZE) != (uint64_t)offset) ||
2252 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
2253 return -EINVAL;
2254
2255 nr_pages = len >> PAGE_SHIFT;
2256
2257 req.out_window = &window;
2258 req.offset = offset;
2259 req.prot = 0;
2260 req.nr_bytes = len;
2261 req.type = WINDOW_SINGLE;
2262 req.head = &ep->rma_info.remote_reg_list;
2263
2264 mutex_lock(&ep->rma_info.rma_lock);
2265 /* Does a valid window exist? */
2266 if ((err = micscif_query_window(&req))) {
2267 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2268 goto error;
2269 }
2270 RMA_MAGIC(window);
2271
2272 /* Allocate scif_range */
2273 if (!(*pages = kzalloc(sizeof(struct scif_range), GFP_KERNEL))) {
2274 err = -ENOMEM;
2275 goto error;
2276 }
2277
2278 /* Allocate phys addr array */
2279 if (!((*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)))) {
2280 err = -ENOMEM;
2281 goto error;
2282 }
2283
2284#ifndef _MIC_SCIF_
2285 /* Allocate virtual address array */
2286 if (!((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)))) {
2287 err = -ENOMEM;
2288 goto error;
2289 }
2290#endif
2291 /* Populate the values */
2292 (*pages)->cookie = window;
2293 (*pages)->nr_pages = nr_pages;
2294 (*pages)->prot_flags = window->prot;
2295
2296 for (i = 0; i < nr_pages; i++) {
2297 (*pages)->phys_addr[i] =
2298#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
2299 is_self_scifdev(ep->remote_dev) ?
2300 micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
2301 NULL, NULL, NULL) : window->phys_addr[i];
2302#else
2303 get_phys_addr(micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
2304 NULL, NULL, NULL), ep->remote_dev);
2305#endif
2306#ifndef _MIC_SCIF_
2307 if (!is_self_scifdev(ep->remote_dev))
2308 (*pages)->va[i] =
2309 get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.va +
2310 (*pages)->phys_addr[i] -
2311 get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.pa;
2312#endif
2313 }
2314
2315 window->get_put_ref_count += nr_pages;
2316 get_window_ref_count(window, nr_pages);
2317error:
2318 mutex_unlock(&ep->rma_info.rma_lock);
2319 if (err) {
2320 if (*pages) {
2321 if ((*pages)->phys_addr)
2322 scif_free((*pages)->phys_addr, nr_pages * sizeof(dma_addr_t));
2323#ifndef _MIC_SCIF_
2324 if ((*pages)->va)
2325 scif_free((*pages)->va, nr_pages * sizeof(void *));
2326#endif
2327 kfree(*pages);
2328 *pages = NULL;
2329 }
2330 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2331 } else {
2332 micscif_create_node_dep(ep->remote_dev, nr_pages);
2333 }
2334 return err;
2335}
2336
2337int
2338scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
2339{
2340 int ret;
2341 get_kref_count(epd);
2342 ret = __scif_get_pages(epd, offset, len, pages);
2343 put_kref_count(epd);
2344 return ret;
2345}
2346EXPORT_SYMBOL(scif_get_pages);
2347
2348/**
2349 * scif_put_pages - Remove references from remote registered pages
2350 *
 2351 * scif_put_pages() releases a scif_range structure previously obtained by
2352 * calling scif_get_pages(). When control returns, the physical pages may
2353 * become available for reuse if and when the window which represented
2354 * those pages is unregistered. Therefore, those pages must never be accessed.
2355 *
2356 * Return Values
2357 * Upon success, zero is returned.
2358 * else an apt error is returned as documented in scif.h.
2359 */
2360int
2361__scif_put_pages(struct scif_range *pages)
2362{
2363 struct endpt *ep;
2364 struct reg_range_t *window;
2365 struct nodemsg msg;
2366
2367 if (!pages || !pages->cookie)
2368 return -EINVAL;
2369
2370 window = pages->cookie;
2371
2372 if (!window || window->magic != SCIFEP_MAGIC ||
2373 !window->get_put_ref_count)
2374 return -EINVAL;
2375
2376 ep = (struct endpt *)window->ep;
2377
2378 /*
2379 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
2380 * callee should be allowed to release references to the pages,
2381 * else the endpoint was not connected in the first place,
2382 * hence the ENOTCONN.
2383 */
2384 if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
2385 return -ENOTCONN;
2386
2387 /*
2388 * TODO: Re-enable this check once ref counts for kernel mode APIs
2389 * have been implemented and node remove call backs are called before
2390 * the node is removed. This check results in kernel mode APIs not
2391 * being able to release pages correctly since node remove callbacks
2392 * are called after the node is removed currently.
2393 * if (!scifdev_alive(ep))
2394 * return -ENODEV;
2395 */
2396
2397 micscif_inc_node_refcnt(ep->remote_dev, 1);
2398 mutex_lock(&ep->rma_info.rma_lock);
2399
2400 /* Decrement the ref counts and check for errors */
2401 window->get_put_ref_count -= pages->nr_pages;
2402 BUG_ON(window->get_put_ref_count < 0);
2403 put_window_ref_count(window, pages->nr_pages);
2404
2405 /* Initiate window destruction if ref count is zero */
2406 if (!window->ref_count) {
2407 drain_dma_intr(ep->rma_info.dma_chan);
2408 /* Inform the peer about this window being destroyed. */
2409 msg.uop = SCIF_MUNMAP;
2410 msg.src = ep->port;
2411 msg.payload[0] = window->peer_window;
2412 /* No error handling for notification messages */
2413 micscif_nodeqp_send(ep->remote_dev, &msg, ep);
2414 list_del(&window->list_member);
2415 /* Destroy this window from the peer's registered AS */
2416 micscif_destroy_remote_window(ep, window);
2417 }
2418 mutex_unlock(&ep->rma_info.rma_lock);
2419
2420 micscif_dec_node_refcnt(ep->remote_dev, 1);
2421 micscif_destroy_node_dep(ep->remote_dev, pages->nr_pages);
2422 scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
2423#ifndef _MIC_SCIF_
2424 scif_free(pages->va, pages->nr_pages * sizeof(void*));
2425#endif
2426 kfree(pages);
2427 return 0;
2428}
2429
2430int
2431scif_put_pages(struct scif_range *pages)
2432{
2433 int ret;
2434 struct reg_range_t *window = pages->cookie;
2435 struct endpt *ep = (struct endpt *)window->ep;
 2436 if (atomic_read(&(&(&(ep->ref_count))->refcount)->refs) > 0) {
2437 kref_get(&(ep->ref_count));
2438 } else {
2439 WARN_ON(1);
2440 }
2441 ret = __scif_put_pages(pages);
 2442 if (atomic_read(&(&(&(ep->ref_count))->refcount)->refs) > 0) {
2443 kref_put(&(ep->ref_count), scif_ref_rel);
2444 } else {
2445 //WARN_ON(1);
2446 }
2447 return ret;
2448}
2449EXPORT_SYMBOL(scif_put_pages);
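/*
 * Usage sketch (illustrative, not part of the original source): walking the
 * physical pages behind a peer window with scif_get_pages() and dropping the
 * references afterwards. `off` and `len` must match a registered remote
 * window and be page aligned, per the checks in __scif_get_pages().
 */
#if 0 /* example only */
static int example_walk_remote(scif_epd_t epd, off_t off, size_t len)
{
	struct scif_range *range;
	int i, err = scif_get_pages(epd, off, len, &range);

	if (err)
		return err;
	for (i = 0; i < range->nr_pages; i++)
		pr_debug("page %d at %#llx\n", i,
			 (unsigned long long)range->phys_addr[i]);
	return scif_put_pages(range);
}
#endif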
2450
2451int scif_event_register(scif_callback_t handler)
2452{
2453 /* Add to the list of event handlers */
2454 struct scif_callback *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
2455 if (!cb)
2456 return -ENOMEM;
2457 mutex_lock(&ms_info.mi_event_cblock);
2458 cb->callback_handler = handler;
2459 list_add_tail(&cb->list_member, &ms_info.mi_event_cb);
2460 mutex_unlock(&ms_info.mi_event_cblock);
2461 return 0;
2462}
2463EXPORT_SYMBOL(scif_event_register);
2464
2465int scif_event_unregister(scif_callback_t handler)
2466{
2467 struct list_head *pos, *unused;
2468 struct scif_callback *temp;
2469 int err = -EINVAL;
2470
2471 mutex_lock(&ms_info.mi_event_cblock);
2472 list_for_each_safe(pos, unused, &ms_info.mi_event_cb) {
2473 temp = list_entry(pos, struct scif_callback, list_member);
2474 if (temp->callback_handler == handler) {
2475 err = 0;
2476 list_del(pos);
2477 kfree(temp);
2478 break;
2479 }
2480 }
2481
2482 mutex_unlock(&ms_info.mi_event_cblock);
2483 return err;
2484}
2485EXPORT_SYMBOL(scif_event_unregister);
2486
2487/**
2488 * scif_register - Mark a memory region for remote access.
2489 * @epd: endpoint descriptor
2490 * @addr: starting virtual address
2491 * @len: length of range
2492 * @offset: offset of window
2493 * @prot: read/write protection
2494 * @map_flags: flags
2495 *
2496 * Return Values
2497 * Upon successful completion, scif_register() returns the offset
2498 * at which the mapping was placed else an apt error is returned
2499 * as documented in scif.h.
2500 */
2501off_t
2502__scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
2503 int prot, int map_flags)
2504{
2505 scif_pinned_pages_t pinned_pages;
2506 off_t err;
2507 struct endpt *ep = (struct endpt *)epd;
2508 uint64_t computed_offset;
2509 struct reg_range_t *window;
2510 struct mm_struct *mm = NULL;
2511
2512 pr_debug("SCIFAPI register: ep %p %s addr %p len 0x%lx"
2513 " offset 0x%lx prot 0x%x map_flags 0x%x\n",
2514 epd, scif_ep_states[epd->state], addr, len, offset, prot, map_flags);
2515
2516 /* Unsupported flags */
2517 if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL))
2518 return -EINVAL;
2519
2520 /* Unsupported protection requested */
2521 if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
2522 return -EINVAL;
2523
2524 /* addr/len must be page aligned. len should be non zero */
2525 if ((!len) ||
2526 (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
2527 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
2528 return -EINVAL;
2529
2530 /*
2531 * Offset is not page aligned/negative or offset+len
2532 * wraps around with SCIF_MAP_FIXED.
2533 */
2534 if ((map_flags & SCIF_MAP_FIXED) &&
2535 ((align_low(offset, PAGE_SIZE) != offset) ||
2536 (offset < 0) ||
2537 (offset + (off_t)len < offset)))
2538 return -EINVAL;
2539
2540
2541 might_sleep();
2542
2543#ifdef DEBUG
2544 /* Bad EP */
2545 if (!ep)
2546 return -EINVAL;
2547#endif
2548
2549 if ((err = verify_epd(ep)))
2550 return err;
2551
2552 /* Compute the offset for this registration */
2553 if ((err = micscif_get_window_offset(ep, map_flags, offset,
2554 len, &computed_offset)))
2555 return err;
2556
2557 /* Allocate and prepare self registration window */
2558 if (!(window = micscif_create_window(ep, len >> PAGE_SHIFT,
2559 computed_offset, false))) {
2560 micscif_free_window_offset(ep, computed_offset, len);
2561 return -ENOMEM;
2562 }
2563
2564 micscif_inc_node_refcnt(ep->remote_dev, 1);
2565
2566 window->nr_pages = len >> PAGE_SHIFT;
2567
2568 if ((err = micscif_send_alloc_request(ep, window))) {
2569 micscif_destroy_incomplete_window(ep, window);
2570 micscif_dec_node_refcnt(ep->remote_dev, 1);
2571 return err;
2572 }
2573
2574 if (!(map_flags & SCIF_MAP_KERNEL)) {
2575 mm = __scif_acquire_mm();
2576 map_flags |= SCIF_MAP_ULIMIT;
2577 }
2578 /* Pin down the pages */
2579 if ((err = scif_pin_pages(addr, len, prot,
2580 map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT),
2581 &pinned_pages))) {
2582 micscif_destroy_incomplete_window(ep, window);
2583 micscif_dec_node_refcnt(ep->remote_dev, 1);
2584 __scif_release_mm(mm);
2585 goto error;
2586 }
2587
2588 window->pinned_pages = pinned_pages;
2589 window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
2590 window->prot = pinned_pages->prot;
2591 window->mm = mm;
2592
2593 /* Prepare the remote registration window */
2594 if ((err = micscif_prep_remote_window(ep, window))) {
2595 micscif_dec_node_refcnt(ep->remote_dev, 1);
2596 micscif_set_nr_pages(ep->remote_dev, window);
2597 printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
2598 goto error_unmap;
2599 }
2600
2601 /* Tell the peer about the new window */
2602 if ((err = micscif_send_scif_register(ep, window))) {
2603 micscif_dec_node_refcnt(ep->remote_dev, 1);
2604 printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
2605 goto error_unmap;
2606 }
2607
2608 micscif_dec_node_refcnt(ep->remote_dev, 1);
2609
2610 /* No further failures expected. Insert new window */
2611 mutex_lock(&ep->rma_info.rma_lock);
2612 set_window_ref_count(window, pinned_pages->nr_pages);
2613 micscif_insert_window(window, &ep->rma_info.reg_list);
2614 mutex_unlock(&ep->rma_info.rma_lock);
2615
2616 pr_debug("SCIFAPI register: ep %p %s addr %p"
2617 " len 0x%lx computed_offset 0x%llx\n",
2618 epd, scif_ep_states[epd->state], addr, len, computed_offset);
2619 return computed_offset;
2620error_unmap:
2621 micscif_destroy_window(ep, window);
2622error:
2623 printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
2624 return err;
2625}
2626
2627off_t
2628scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
2629 int prot, int map_flags)
2630{
2631 off_t ret;
2632 get_kref_count(epd);
2633 ret = __scif_register(epd, addr, len, offset, prot, map_flags);
2634 put_kref_count(epd);
2635 return ret;
2636}
2637EXPORT_SYMBOL(scif_register);
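/*
 * Usage sketch (illustrative, not part of the original source): the one-shot
 * path that pins and registers in a single call, then tears the window down
 * with scif_unregister(). Without SCIF_MAP_FIXED the offset argument is
 * ignored and the implementation chooses one; the buffer must be page
 * aligned, as checked above.
 */
#if 0 /* example only */
static int example_register_buf(scif_epd_t epd, void *buf, size_t len)
{
	off_t off = scif_register(epd, buf, len, 0,
				  SCIF_PROT_READ | SCIF_PROT_WRITE,
				  SCIF_MAP_KERNEL);
	if (off < 0)
		return (int)off;
	/* ... RMA traffic against [off, off + len) ... */
	return scif_unregister(epd, off, len);
}
#endif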
2638
2639/**
2640 * scif_unregister - Release a memory region registered for remote access.
2641 * @epd: endpoint descriptor
2642 * @offset: start of range to unregister
2643 * @len: length of range to unregister
2644 *
2645 * Return Values
 2646 * Upon successful completion, scif_unregister() returns zero
2647 * else an apt error is returned as documented in scif.h.
2648 */
2649int
2650__scif_unregister(scif_epd_t epd, off_t offset, size_t len)
2651{
2652 struct endpt *ep = (struct endpt *)epd;
2653 struct reg_range_t *window = NULL;
2654 struct micscif_rma_req req;
2655 int nr_pages, err;
2656
2657 pr_debug("SCIFAPI unregister: ep %p %s offset 0x%lx len 0x%lx\n",
2658 ep, scif_ep_states[ep->state], offset, len);
2659
2660 /* len must be page aligned. len should be non zero */
2661 if ((!len) ||
2662 (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
2663 return -EINVAL;
2664
2665 /* Offset is not page aligned or offset+len wraps around */
2666 if ((align_low(offset, PAGE_SIZE) != offset) ||
2667 (offset + (off_t)len < offset))
2668 return -EINVAL;
2669
2670 if ((err = verify_epd(ep)))
2671 return err;
2672
2673 might_sleep();
2674 nr_pages = (int)(len >> PAGE_SHIFT);
2675
2676 req.out_window = &window;
2677 req.offset = offset;
2678 req.prot = 0;
2679 req.nr_bytes = len;
2680 req.type = WINDOW_FULL;
2681 req.head = &ep->rma_info.reg_list;
2682
2683 micscif_inc_node_refcnt(ep->remote_dev, 1);
2684 mutex_lock(&ep->rma_info.rma_lock);
2685 /* Does a valid window exist? */
2686 if ((err = micscif_query_window(&req))) {
2687 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2688 goto error;
2689 }
2690 /* Unregister all the windows in this range */
2691 if ((err = micscif_rma_list_unregister(window, offset, nr_pages)))
2692 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2693error:
2694 mutex_unlock(&ep->rma_info.rma_lock);
2695 micscif_dec_node_refcnt(ep->remote_dev, 1);
2696 return err;
2697}
2698
2699int
2700scif_unregister(scif_epd_t epd, off_t offset, size_t len)
2701{
2702 int ret;
2703 get_kref_count(epd);
2704 ret = __scif_unregister(epd, offset, len);
2705 put_kref_count(epd);
2706 return ret;
2707}
2708EXPORT_SYMBOL(scif_unregister);
2709
2710unsigned int scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd)
2711{
2712 unsigned int ret;
2713 get_kref_count(epd);
2714 ret = __scif_pollfd(f, wait, (struct endpt *)epd);
2715 put_kref_count(epd);
2716 return ret;
2717}
2718
2719unsigned int __scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep)
2720{
2721 unsigned int mask = 0;
2722 unsigned long sflags;
2723
2724 pr_debug("SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]);
2725
2726 micscif_inc_node_refcnt(ep->remote_dev, 1);
2727 spin_lock_irqsave(&ep->lock, sflags);
2728
2729 if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
2730#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2731 if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
2732#else
2733 if (!wait || wait->key & SCIF_POLLOUT) {
2734#endif
2735 poll_wait(f, &ep->conn_pend_wq, wait);
2736 if (ep->state == SCIFEP_CONNECTED ||
2737 ep->state == SCIFEP_DISCONNECTED ||
2738 ep->conn_err) {
2739 mask |= SCIF_POLLOUT;
2740 }
2741 goto return_scif_poll;
2742 }
2743 }
2744
2745 /* Is it OK to use wait->key?? */
2746 if (ep->state == SCIFEP_LISTENING) {
2747#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2748 if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
2749#else
2750 if (!wait || wait->key & SCIF_POLLIN) {
2751#endif
2752 spin_unlock_irqrestore(&ep->lock, sflags);
2753 poll_wait(f, &ep->conwq, wait);
2754 spin_lock_irqsave(&ep->lock, sflags);
2755 if (ep->conreqcnt)
2756 mask |= SCIF_POLLIN;
2757 } else {
2758 mask |= SCIF_POLLERR;
2759 }
2760 goto return_scif_poll;
2761 }
2762
2763#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2764 if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
2765#else
2766 if (!wait || wait->key & SCIF_POLLIN) {
2767#endif
2768 if (ep->state != SCIFEP_CONNECTED &&
2769 ep->state != SCIFEP_LISTENING &&
2770 ep->state != SCIFEP_DISCONNECTED) {
2771 mask |= SCIF_POLLERR;
2772 goto return_scif_poll;
2773 }
2774
2775 spin_unlock_irqrestore(&ep->lock, sflags);
2776 poll_wait(f, &ep->recvwq, wait);
2777 spin_lock_irqsave(&ep->lock, sflags);
2778 if (micscif_rb_count(&ep->qp_info.qp->inbound_q, 1))
2779 mask |= SCIF_POLLIN;
2780 }
2781
2782#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
2783 if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
2784#else
2785 if (!wait || wait->key & SCIF_POLLOUT) {
2786#endif
2787 if (ep->state != SCIFEP_CONNECTED &&
2788 ep->state != SCIFEP_LISTENING) {
2789 mask |= SCIF_POLLERR;
2790 goto return_scif_poll;
2791 }
2792
2793 spin_unlock_irqrestore(&ep->lock, sflags);
2794 poll_wait(f, &ep->sendwq, wait);
2795 spin_lock_irqsave(&ep->lock, sflags);
2796 if (micscif_rb_space(&ep->qp_info.qp->outbound_q))
2797 mask |= SCIF_POLLOUT;
2798 }
2799
2800return_scif_poll:
 2801 /* If the endpoint is in the disconnected state then return hangup instead of error */
2802 if (ep->state == SCIFEP_DISCONNECTED) {
2803 mask &= ~SCIF_POLLERR;
2804 mask |= SCIF_POLLHUP;
2805 }
2806
2807 spin_unlock_irqrestore(&ep->lock, sflags);
2808 micscif_dec_node_refcnt(ep->remote_dev, 1);
2809 return mask;
2810}
2811
2812/*
2813 * The private data field of each VMA used to mmap a remote window
2814 * points to an instance of struct vma_pvt
2815 */
2816struct vma_pvt {
2817 struct endpt *ep; /* End point for remote window */
2818 uint64_t offset; /* offset within remote window */
2819 bool valid_offset; /* offset is valid only if the original
2820 * mmap request was for a single page
2821 * else the offset within the vma is
2822 * the correct offset
2823 */
2824 struct kref ref;
2825};
2826
2827static void vma_pvt_release(struct kref *ref)
2828{
2829 struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
2830 kfree(vmapvt);
2831}
2832
2833/**
2834 * scif_vma_open - VMA open driver callback
2835 * @vma: VMM memory area.
2836 * The open method is called by the kernel to allow the subsystem implementing
2837 * the VMA to initialize the area. This method is invoked any time a new
2838 * reference to the VMA is made (when a process forks, for example).
2839 * The one exception happens when the VMA is first created by mmap;
2840 * in this case, the driver's mmap method is called instead.
2841 * This function is also invoked when an existing VMA is split by the kernel
2842 * due to a call to munmap on a subset of the VMA resulting in two VMAs.
2843 * The kernel invokes this function only on one of the two VMAs.
2844 *
2845 * Return Values: None.
2846 */
2847static void scif_vma_open(struct vm_area_struct *vma)
2848{
2849 struct vma_pvt *vmapvt = ((vma)->vm_private_data);
2850 pr_debug("SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
2851 ((vma)->vm_start), ((vma)->vm_end));
2852 kref_get(&vmapvt->ref);
2853}
2854
2855/**
2856 * scif_munmap - VMA close driver callback.
2857 * @vma: VMM memory area.
2858 * When an area is destroyed, the kernel calls its close operation.
2859 * Note that there's no usage count associated with VMA's; the area
2860 * is opened and closed exactly once by each process that uses it.
2861 *
2862 * Return Values: None.
2863 */
2864void scif_munmap(struct vm_area_struct *vma)
2865{
2866 struct endpt *ep;
2867 struct vma_pvt *vmapvt = ((vma)->vm_private_data);
2868 int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT );
2869 uint64_t offset;
2870 struct micscif_rma_req req;
2871 struct reg_range_t *window = NULL;
2872 int err;
2873
2874 might_sleep();
2875 pr_debug("SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
2876 ((vma)->vm_start), ((vma)->vm_end));
2877 /* used to be a BUG_ON(), prefer keeping the kernel alive */
2878 if (!vmapvt) {
2879 WARN_ON(1);
2880 printk(KERN_ERR "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
2881 ((vma)->vm_start), ((vma)->vm_end));
2882 return;
2883 }
2884
2885 ep = vmapvt->ep;
2886 offset = vmapvt->valid_offset ? vmapvt->offset :
2887 ((vma)->vm_pgoff) << PAGE_SHIFT;
2888 pr_debug("SCIFAPI munmap: ep %p %s nr_pages 0x%x offset 0x%llx\n",
2889 ep, scif_ep_states[ep->state], nr_pages, offset);
2890
2891 req.out_window = &window;
2892 req.offset = offset;
2893 req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start);
2894 req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE);
2895 req.type = WINDOW_PARTIAL;
2896 req.head = &ep->rma_info.remote_reg_list;
2897
2898 micscif_inc_node_refcnt(ep->remote_dev, 1);
2899 mutex_lock(&ep->rma_info.rma_lock);
2900
2901 if ((err = micscif_query_window(&req)))
2902 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2903 else
2904 micscif_rma_list_munmap(window, offset, nr_pages);
2905
2906 mutex_unlock(&ep->rma_info.rma_lock);
2907 micscif_dec_node_refcnt(ep->remote_dev, 1);
2908
2909 micscif_destroy_node_dep(ep->remote_dev, nr_pages);
2910
2911 /*
2912 * The kernel probably zeroes these out but we still want
2913 * to clean up our own mess just in case.
2914 */
2915 vma->vm_ops = NULL;
2916 ((vma)->vm_private_data) = NULL;
2917 kref_put(&vmapvt->ref, vma_pvt_release);
2918 micscif_rma_put_task(ep, nr_pages);
2919}
2920
2921static const struct vm_operations_struct micscif_vm_ops = {
2922 .open = scif_vma_open,
2923 .close = scif_munmap,
2924};
2925
2926/**
2927 * scif_mmap - Map pages in virtual address space to a remote window.
2928 * @vma: VMM memory area.
2929 * @epd: endpoint descriptor
2930 *
2931 * Return Values
2932 * Upon successful completion, scif_mmap() returns zero
2933 * else an apt error is returned as documented in scif.h.
2934 */
2935int
2936scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
2937{
2938 struct micscif_rma_req req;
2939 struct reg_range_t *window = NULL;
2940 struct endpt *ep = (struct endpt *)epd;
2941 uint64_t start_offset = ((vma)->vm_pgoff) << PAGE_SHIFT;
2942 int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT);
2943 int err;
2944 struct vma_pvt *vmapvt;
2945
2946 pr_debug("SCIFAPI mmap: ep %p %s start_offset 0x%llx nr_pages 0x%x\n",
2947 ep, scif_ep_states[ep->state], start_offset, nr_pages);
2948
2949 if ((err = verify_epd(ep)))
2950 return err;
2951
2952 might_sleep();
2953
2954 if ((err = micscif_rma_get_task(ep, nr_pages)))
2955 return err;
2956
2957 if (!(vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL))) {
2958 micscif_rma_put_task(ep, nr_pages);
2959 return -ENOMEM;
2960 }
2961
2962 vmapvt->ep = ep;
2963 kref_init(&vmapvt->ref);
2964
2965 micscif_create_node_dep(ep->remote_dev, nr_pages);
2966
2967 req.out_window = &window;
2968 req.offset = start_offset;
2969 req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start);
2970 req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE);
2971 req.type = WINDOW_PARTIAL;
2972 req.head = &ep->rma_info.remote_reg_list;
2973
2974 micscif_inc_node_refcnt(ep->remote_dev, 1);
2975 mutex_lock(&ep->rma_info.rma_lock);
2976 /* Does a valid window exist? */
2977 if ((err = micscif_query_window(&req))) {
2978 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
2979 goto error;
2980 }
2981 RMA_MAGIC(window);
2982
2983 /* Default prot for loopback */
2984 if (!is_self_scifdev(ep->remote_dev)) {
2985#ifdef _MIC_SCIF_
2986 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
2987#else
2988 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
2989#endif
2990 }
2991
2992 /*
2993 * VM_DONTCOPY - Do not copy this vma on fork
2994 * VM_DONTEXPAND - Cannot expand with mremap()
2995 * VM_RESERVED - Count as reserved_vm like IO
2996 * VM_PFNMAP - Page-ranges managed without "struct page"
2997 * VM_IO - Memory mapped I/O or similar
2998 *
2999 * We do not want to copy this VMA automatically on a fork(),
3000 * expand this VMA due to mremap() or swap out these pages since
3001 * the VMA is actually backed by physical pages in the remote
3002 * node's physical memory and not via a struct page.
3003 */
3004#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
3005 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP | VM_PFNMAP;
3006#else
3007 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP;
3008#endif
3009
3010 if (!is_self_scifdev(ep->remote_dev))
3011 ((vma)->vm_flags) |= VM_IO;
3012
3013 /* Map this range of windows */
3014 if ((err = micscif_rma_list_mmap(window,
3015 start_offset, nr_pages, vma))) {
3016 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3017 goto error;
3018 }
3019 /* Set up the driver call back */
3020 vma->vm_ops = &micscif_vm_ops;
3021 ((vma)->vm_private_data) = vmapvt;
3022 /*
3023 * For 1 page sized VMAs the kernel (remap_pfn_range) replaces the
3024 * offset in the VMA with the pfn, so in that case save off the
3025 * original offset, since the page sized VMA can't be split into
3026 * smaller VMAs the offset is not going to change.
3027 */
3028 if (nr_pages == 1) {
3029 vmapvt->offset = start_offset;
3030 vmapvt->valid_offset = true;
3031 }
3032 err = 0;
3033error:
3034 mutex_unlock(&ep->rma_info.rma_lock);
3035 micscif_dec_node_refcnt(ep->remote_dev, 1);
3036 if (err) {
3037 micscif_destroy_node_dep(ep->remote_dev, nr_pages);
3038 kfree(vmapvt);
3039 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3040 micscif_rma_put_task(ep, nr_pages);
3041 }
3042 return err;
3043}
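/*
 * Usage sketch (illustrative, not part of the original source): scif_mmap()
 * is designed to be called from a driver's mmap file operation once the
 * endpoint backing the file has been looked up. example_epd_from_file() is a
 * hypothetical helper standing in for that lookup.
 */
#if 0 /* example only */
static int example_mmap(struct file *f, struct vm_area_struct *vma)
{
	scif_epd_t epd = example_epd_from_file(f);	/* hypothetical */

	if (!epd)
		return -EBADF;
	/* vm_pgoff selects the remote window offset to map */
	return scif_mmap(vma, epd);
}
#endif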
3044
3045/**
3046 * scif_readfrom() - Read SCIF offset data from remote connection
3047 * @epd: endpoint descriptor
3048 * @loffset: offset in local registered address space to which to copy
3049 * @len: length of range to copy
3050 * @roffset: offset in remote registered address space from which to copy
3051 * @flags: flags
3052 *
3053 * Return Values
3054 * Upon successful completion, scif_readfrom() returns zero
3055 * else an apt error is returned as documented in scif.h.
3056 */
3057int
3058scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
3059 off_t roffset, int flags)
3060{
3061 int ret;
3062 get_kref_count(epd);
3063 ret = __scif_readfrom(epd, loffset, len, roffset, flags);
3064 put_kref_count(epd);
3065 return ret;
3066}
3067EXPORT_SYMBOL(scif_readfrom);
3068
3069/**
3070 * scif_writeto() - Send SCIF offset data to remote connection
3071 * @epd: endpoint descriptor
3072 * @loffset: offset in local registered address space from which to copy
3073 * @len: length of range to copy
3074 * @roffset: offset in remote registered address space to which to copy
3075 * @flags: flags
3076 *
3077 * Return Values
3078 * Upon successful completion, scif_writeto() returns zero
3079 * else an apt error is returned as documented in scif.h.
3080 *
3081 */
3082int scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
3083 off_t roffset, int flags)
3084{
3085 int ret;
3086 get_kref_count(epd);
3087 ret = __scif_writeto(epd, loffset, len, roffset, flags);
3088 put_kref_count(epd);
3089 return ret;
3090}
3091EXPORT_SYMBOL(scif_writeto);
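/*
 * Usage sketch (illustrative, not part of the original source): a round trip
 * between two registered windows. `loff` and `roff` are offsets previously
 * returned by scif_register() on the local and peer endpoints; flags of 0
 * request the default transfer path (see the SCIF_RMA_* flags in scif.h for
 * alternatives).
 */
#if 0 /* example only */
static int example_rma_roundtrip(scif_epd_t epd, off_t loff, off_t roff,
				 size_t len)
{
	int err = scif_writeto(epd, loff, len, roff, 0);

	if (err)
		return err;
	return scif_readfrom(epd, loff, len, roff, 0);
}
#endif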
3092
3093#define HOST_LOOPB_MAGIC_MARK 0xdead
3094
3095/**
3096 * scif_fence_mark:
3097 * @epd: endpoint descriptor
3098 * @flags: control flags
3099 * @mark: marked handle returned as output.
3100 *
3101 * scif_fence_mark() returns after marking the current set of all uncompleted
3102 * RMAs initiated through the endpoint epd or marking the current set of all
3103 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
3104 * marked with a value returned in mark. The application may subsequently
3105 * await completion of all RMAs so marked.
3106 *
3107 * Return Values
3108 * Upon successful completion, scif_fence_mark() returns 0;
3109 * else an apt error is returned as documented in scif.h.
3110 */
3111int __scif_fence_mark(scif_epd_t epd, int flags, int *mark)
3112{
3113 struct endpt *ep = (struct endpt *)epd;
3114 int err = 0;
3115
3116 pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x\n",
3117 ep, scif_ep_states[ep->state], flags, *mark);
3118
3119 if ((err = verify_epd(ep)))
3120 return err;
3121
3122 /* Invalid flags? */
3123 if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))
3124 return -EINVAL;
3125
3126 /* At least one of init self or peer RMA should be set */
3127 if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
3128 return -EINVAL;
3129
3130 /* Exactly one of init self or peer RMA should be set but not both */
3131 if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
3132 return -EINVAL;
3133
3134#ifndef _MIC_SCIF_
3135 /*
3136 * Host Loopback does not need to use DMA.
3137 * Return a valid mark to be symmetric.
3138 */
3139 if (is_self_scifdev(ep->remote_dev)) {
3140 *mark = HOST_LOOPB_MAGIC_MARK;
3141 return 0;
3142 }
3143#endif
3144
3145 if (flags & SCIF_FENCE_INIT_SELF) {
3146 if ((*mark = micscif_fence_mark(epd)) < 0)
3147 err = *mark;
3148 } else {
3149 micscif_inc_node_refcnt(ep->remote_dev, 1);
3150 err = micscif_send_fence_mark(ep, mark);
3151 micscif_dec_node_refcnt(ep->remote_dev, 1);
3152 }
3153 if (err)
3154 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3155
3156 pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x err %d\n",
3157 ep, scif_ep_states[ep->state], flags, *mark, err);
3158 return err;
3159}
3160
3161int scif_fence_mark(scif_epd_t epd, int flags, int *mark)
3162{
3163 int ret;
3164 get_kref_count(epd);
3165 ret = __scif_fence_mark(epd, flags, mark);
3166 put_kref_count(epd);
3167 return ret;
3168}
3169EXPORT_SYMBOL(scif_fence_mark);
3170
3171/**
3172 * scif_fence_wait:
3173 * @epd: endpoint descriptor
3174 * @mark: mark request.
3175 *
3176 * scif_fence_wait() returns after all RMAs marked with mark have completed.
3177 *
3178 * Return Values
3179 * Upon successful completion, scif_fence_wait() returns 0;
3180 * else an apt error is returned as documented in scif.h.
3181 */
3182int __scif_fence_wait(scif_epd_t epd, int mark)
3183{
3184 struct endpt *ep = (struct endpt *)epd;
3185 int err = 0;
3186
3187 pr_debug("SCIFAPI fence_wait: ep %p %s mark 0x%x\n",
3188 ep, scif_ep_states[ep->state], mark);
3189
3190 if ((err = verify_epd(ep)))
3191 return err;
3192
3193#ifndef _MIC_SCIF_
3194 /*
3195 * Host Loopback does not need to use DMA.
3196 * The only valid mark provided is 0 so simply
3197 * return success if the mark is valid.
3198 */
3199 if (is_self_scifdev(ep->remote_dev)) {
3200 if (HOST_LOOPB_MAGIC_MARK == mark)
3201 return 0;
3202 else
3203 return -EINVAL;
3204 }
3205#endif
3206 if (mark & SCIF_REMOTE_FENCE) {
3207 micscif_inc_node_refcnt(ep->remote_dev, 1);
3208 err = micscif_send_fence_wait(epd, mark);
3209 micscif_dec_node_refcnt(ep->remote_dev, 1);
3210 } else {
3211 err = dma_mark_wait(epd->rma_info.dma_chan, mark, true);
3212 if (!err && atomic_read(&ep->rma_info.tw_refcount))
3213 queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
3214 }
3215
3216 if (err < 0)
3217 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3218 return err;
3219}
3220
3221int scif_fence_wait(scif_epd_t epd, int mark)
3222{
3223 int ret;
3224 get_kref_count(epd);
3225 ret = __scif_fence_wait(epd, mark);
3226 put_kref_count(epd);
3227 return ret;
3228}
3229EXPORT_SYMBOL(scif_fence_wait);
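/*
 * Usage sketch (illustrative, not part of the original source): mark the
 * RMAs issued so far on this endpoint and wait for exactly that set to
 * complete. SCIF_FENCE_INIT_SELF scopes the mark to locally initiated RMAs,
 * matching the flag checks in __scif_fence_mark() above.
 */
#if 0 /* example only */
static int example_fence(scif_epd_t epd)
{
	int mark;
	int err = scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark);

	if (err)
		return err;
	return scif_fence_wait(epd, mark);
}
#endif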
3230
3231/*
3232 * scif_fence_signal:
3233 * @loff: local offset
3234 * @lval: local value to write to loffset
3235 * @roff: remote offset
3236 * @rval: remote value to write to roffset
3237 * @flags: flags
3238 *
3239 * scif_fence_signal() returns after marking the current set of all
3240 * uncompleted RMAs initiated through the endpoint epd or marking
3241 * the current set of all uncompleted RMAs initiated through the peer
3242 * of endpoint epd.
3243 *
3244 * Return Values
3245 * Upon successful completion, scif_fence_signal() returns 0;
3246 * else an apt error is returned as documented in scif.h.
3247 */
3248int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
3249 off_t roff, uint64_t rval, int flags)
3250{
3251 struct endpt *ep = (struct endpt *)epd;
3252 int err = 0;
3253
3254 pr_debug("SCIFAPI fence_signal: ep %p %s loff 0x%lx lval 0x%llx "
3255 "roff 0x%lx rval 0x%llx flags 0x%x\n",
3256 ep, scif_ep_states[ep->state], loff, lval, roff, rval, flags);
3257
3258 if ((err = verify_epd(ep)))
3259 return err;
3260
3261 /* Invalid flags? */
3262 if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER |
3263 SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))
3264 return -EINVAL;
3265
3266 /* At least one of init self or peer RMA should be set */
3267 if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
3268 return -EINVAL;
3269
3270 /* Exactly one of init self or peer RMA should be set but not both */
3271 if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
3272 return -EINVAL;
3273
3274 /* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */
3275 if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)))
3276 return -EINVAL;
3277
3278 /* Only Dword offsets allowed */
3279 if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(uint32_t) - 1)))
3280 return -EINVAL;
3281
3282 /* Only Dword aligned offsets allowed */
3283 if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(uint32_t) - 1)))
3284 return -EINVAL;
3285
3286 if (flags & SCIF_FENCE_INIT_PEER) {
3287 micscif_inc_node_refcnt(ep->remote_dev, 1);
3288 err = micscif_send_fence_signal(epd, roff,
3289 rval, loff, lval, flags);
3290 micscif_dec_node_refcnt(ep->remote_dev, 1);
3291 } else {
3292 /* Local Signal in Local RAS */
3293 if (flags & SCIF_SIGNAL_LOCAL)
3294 if ((err = micscif_prog_signal(epd, loff,
3295 lval, RMA_WINDOW_SELF)))
3296 goto error_ret;
3297
3298 /* Signal in Remote RAS */
3299 if (flags & SCIF_SIGNAL_REMOTE) {
3300 micscif_inc_node_refcnt(ep->remote_dev, 1);
3301 err = micscif_prog_signal(epd, roff,
3302 rval, RMA_WINDOW_PEER);
3303 micscif_dec_node_refcnt(ep->remote_dev, 1);
3304 }
3305 }
3306error_ret:
3307 if (err)
3308 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
3309 else if (atomic_read(&ep->rma_info.tw_refcount))
3310 queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
3311 return err;
3312}
3313
3314int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
3315 off_t roff, uint64_t rval, int flags)
3316{
3317 int ret;
3318 get_kref_count(epd);
3319 ret = __scif_fence_signal(epd, loff, lval, roff, rval, flags);
3320 put_kref_count(epd);
3321 return ret;
3322}
3323EXPORT_SYMBOL(scif_fence_signal);
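/*
 * Usage sketch (illustrative, not part of the original source): write a
 * completion flag into the local registered address space once all locally
 * initiated RMAs are done. The offset must be dword aligned, as enforced
 * above; a peer polling the same location sees the value appear when the
 * fence fires.
 */
#if 0 /* example only */
static int example_signal_done(scif_epd_t epd, off_t loff)
{
	return scif_fence_signal(epd, loff, 1ULL, 0, 0,
				 SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL);
}
#endif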
3324
3325/**
3326 * scif_get_nodeIDs - Return information about online nodes
3327 * @nodes: array space reserved for returning online node IDs
3328 * @len: number of entries on the nodes array
3329 * @self: address to place the node ID of this system
3330 *
3331 * Return Values
3332 * scif_get_nodeIDs() returns the total number of scif nodes
3333 * (including host) in the system
3334 */
3335int
3336scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self)
3337{
3338 int online = 0;
3339 int offset = 0;
3340 int node;
3341#ifdef _MIC_SCIF_
3342 micscif_get_node_info();
3343#endif
3344
3345 *self = ms_info.mi_nodeid;
3346 mutex_lock(&ms_info.mi_conflock);
3347 len = SCIF_MIN(len, (int32_t)ms_info.mi_total);
 3348 for (node = 0; node <= (int32_t)ms_info.mi_maxid; node++) {
3349 if (ms_info.mi_mask & (1UL << node)) {
3350 online++;
3351 if (offset < len)
3352 nodes[offset++] = node;
3353 }
3354 }
3355 pr_debug("SCIFAPI get_nodeIDs total %d online %d filled in %d nodes\n",
3356 ms_info.mi_total, online, len);
3357 mutex_unlock(&ms_info.mi_conflock);
3358
3359 return online;
3360}
3361
3362EXPORT_SYMBOL(scif_get_nodeIDs);
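/*
 * Usage sketch (illustrative, not part of the original source): enumerating
 * the online SCIF nodes. The return value counts every online node even when
 * the array is too small, so callers can detect truncation by comparing it
 * against the array length.
 */
#if 0 /* example only */
static void example_list_nodes(void)
{
	uint16_t nodes[16], self;
	int i, online = scif_get_nodeIDs(nodes, ARRAY_SIZE(nodes), &self);

	pr_debug("self %u, %d node(s) online\n", self, online);
	for (i = 0; i < online && i < (int)ARRAY_SIZE(nodes); i++)
		pr_debug("node %u online\n", nodes[i]);
}
#endif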
3363
3364/**
3365 * micscif_pci_dev:
3366 * @node: node ID
3367 *
3368 * Return the pci_dev associated with a node.
3369 */
3370int micscif_pci_dev(uint16_t node, struct pci_dev **pdev)
3371{
3372#ifdef _MIC_SCIF_
3373 /* This *is* a PCI device, therefore no pdev to return. */
3374 return -ENODEV;
3375#else
3376 mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);
3377 *pdev = mic_ctx->bi_pdev;
3378 return 0;
3379#endif
3380}
3381
3382#ifndef _MIC_SCIF_
3383/**
3384 * micscif_pci_info:
3385 * @node: node ID
3386 *
3387 * Populate the pci device info pointer associated with a node.
3388 */
3389int micscif_pci_info(uint16_t node, struct scif_pci_info *dev)
3390{
3391 int i;
3392 mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);
3393 struct pci_dev *pdev;
3394
3395 if (!mic_ctx)
3396 return -ENODEV;
3397
3398 dev->pdev = pdev = mic_ctx->bi_pdev;
3399 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
3400 if (!pci_resource_start(pdev, i)) {
3401 dev->va[i] = NULL;
3402 continue;
3403 }
3404 if (pci_resource_flags(pdev, i) & IORESOURCE_PREFETCH) {
3405 /* TODO: Change comparison check for KNL. */
3406 if (pci_resource_start(pdev, i) == mic_ctx->aper.pa)
3407 dev->va[i] = mic_ctx->aper.va;
3408 else
3409 dev->va[i] = NULL;
3410 } else {
3411 dev->va[i] = mic_ctx->mmio.va;
3412 }
3413 }
3414 return 0;
3415}
3416#endif
3417
3418/**
3419 * scif_pci_info - Populate the pci device info pointer associated with a node
3420 * @node: the node to query
3421 * @scif_pdev: The scif_pci_info structure to populate.
3422 *
3423 * scif_pci_info() populates the provided scif_pci_info structure
3424 * associated with a node. The requested node ID cannot be the same as
3425 * the current node. This routine may only return success when called from
3426 * the host.
3427 *
3428 * Return Values
 3429 * Upon successful completion, scif_pci_info() returns 0; otherwise
 3430 * an appropriate error is returned as documented in scif.h.
3431 */
3432int scif_pci_info(uint16_t node, struct scif_pci_info *dev)
3433{
3434#ifdef _MIC_SCIF_
3435 return -EINVAL;
3436#else
3437 if (node > ms_info.mi_maxid)
3438 return -EINVAL;
3439
3440 if ((scif_dev[node].sd_state == SCIFDEV_NOTPRESENT) ||
3441 is_self_scifdev(&scif_dev[node]))
3442 return -ENODEV;
3443
3444 return micscif_pci_info(node, dev);
3445#endif
3446}
3447EXPORT_SYMBOL(scif_pci_info);
3448
3449/*
3450 * DEBUG helper functions
3451 */
3452void
3453print_ep_state(struct endpt *ep, char *label)
3454{
3455 if (ep)
3456 printk("%s: EP %p state %s\n",
3457 label, ep, scif_ep_states[ep->state]);
3458 else
 3459 printk("%s: EP %p state ?\n", label, ep);
3460}
3461