/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */
#include <linux/module.h>
#include "mic/micscif_map.h"
#define SCIF_MAP_ULIMIT 0x40
bool mic_ulimit_check = 0;

char *scif_ep_states[] = {
	/* endpoint state-name strings elided */
};

enum conn_async_state {
	ASYNC_CONN_IDLE = 1,	/* ep setup for async connect */
	ASYNC_CONN_INPROGRESS,	/* async connect in progress */
	ASYNC_CONN_FLUSH_WORK	/* async work flush in progress */
};
/**
 * scif_open() - Create a SCIF end point
 *
 * Create a SCIF end point and set the state to UNBOUND. This function
 * returns the address of the end point data structure.
 */
	if ((ep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL)) == NULL) {
		printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point descriptor\n");

	if ((ep->qp_info.qp = (struct micscif_qp *)
	     kzalloc(sizeof(struct micscif_qp), GFP_KERNEL)) == NULL) {
		printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point queue pointer\n");

	spin_lock_init(&ep->lock);
	mutex_init(&ep->sendlock);
	mutex_init(&ep->recvlock);

	if (micscif_rma_ep_init(ep) < 0) {
		printk(KERN_ERR "SCIFAPI _open: RMA EP Init failed\n");

	ep->state = SCIFEP_UNBOUND;
	pr_debug("SCIFAPI open: ep %p success\n", ep);

	ep = (struct endpt *)__scif_open();
	kref_init(&(ep->ref_count));

EXPORT_SYMBOL(scif_open);
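/*
 * Usage sketch (illustrative only, not part of the driver): a kernel-mode
 * client pairs scif_open() with scif_close(). The NULL check below is an
 * assumption of the example; everything else is the exported API above.
 *
 *	scif_epd_t epd = scif_open();
 *	if (epd == NULL)
 *		return -ENOMEM;
 *	// ... bind/connect/send/recv ...
 *	scif_close(epd);
 */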
/**
 * scif_close() - Terminate a SCIF end point
 * @epd: The end point address returned from scif_open()
 *
 * The function terminates a scif connection. It must ensure all traffic on
 * the connection is finished before removing it.
 *
 * On a connection with memory mapped regions this becomes more difficult.
 * Once normal DMA and message traffic has ended, the end point must be placed
 * in a zombie state and wait for the other side to also release its memory
 * references.
 */
__scif_close(scif_epd_t epd)
{
	struct endpt *ep = (struct endpt *)epd;
	struct list_head *pos, *tmpq;
	enum endptstate oldstate;

	pr_debug("SCIFAPI close: ep %p %s\n", ep, scif_ep_states[ep->state]);

	flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS);
	flush_workqueue(ms_info.mi_conn_wq);

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	spin_lock_irqsave(&ep->lock, sflags);
	ep->state = SCIFEP_CLOSING;

	BUG_ON(SCIFEP_ZOMBIE == oldstate);
	case SCIFEP_DISCONNECTED:
		spin_unlock_irqrestore(&ep->lock, sflags);
		micscif_unregister_all_windows(epd);
		// Remove from the disconnected list
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
			tmpep = list_entry(pos, struct endpt, list);
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		spin_unlock_irqrestore(&ep->lock, sflags);
		struct endpt *fep = NULL;
		unsigned long ts = jiffies;
		struct list_head *pos, *tmpq;

		// Very short time before mapping completes and state becomes
		// connected and does a standard teardown.
		while (ep->state == SCIFEP_MAPPING) {
			if (time_after((unsigned long)jiffies,
				       ts + NODE_ALIVE_TIMEOUT)) {
				printk(KERN_ERR "%s %d ep->state %d\n",
				       __func__, __LINE__, ep->state);
				ep->state = SCIFEP_BOUND;

		init_waitqueue_head(&ep->disconwq);	// Wait for connection queue
		spin_unlock_irqrestore(&ep->lock, sflags);
		micscif_unregister_all_windows(epd);

		// Remove from the connected list
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
			tmpep = list_entry(pos, struct endpt, list);
		put_conn_count(ep->remote_dev);

		// The other side has completed the disconnect before
		// the end point could be removed from the list. Therefore
		// the ep lock is not locked: traverse the disconnected list
		// to find the endpoint, release the conn lock and
		// proceed to tear down the end point below.
		list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
			tmpep = list_entry(pos, struct endpt, list);
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		spin_unlock(&ms_info.mi_connlock);
		// Now we are free to close out the connection
		msg.payload[0] = (uint64_t)ep;
		msg.payload[1] = ep->remote_ep;
		err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);
		spin_unlock_irqrestore(&ep->lock, sflags);

		/* Now wait for the remote node to respond */
		wait_event_timeout(ep->disconwq,
			(ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
		/*
		 * Grab and release the ep lock to synchronize with the
		 * thread waking us up. If we don't grab this lock, then
		 * the ep might be freed before the wakeup completes,
		 * resulting in potential memory corruption.
		 */
		spin_lock_irqsave(&ep->lock, sflags);
		spin_unlock_irqrestore(&ep->lock, sflags);

		spin_unlock_irqrestore(&ep->lock, sflags);
	spin_lock_irqsave(&ms_info.mi_eplock, sflags);
	// remove from listen list
	list_for_each_safe(pos, tmpq, &ms_info.mi_listen) {
		tmpep = list_entry(pos, struct endpt, list);

	// Remove any dangling accepts
	aep = list_first_entry(&ep->li_accept, struct endpt, liacceptlist);
	list_del(&aep->liacceptlist);
	if (aep->port.port && !aep->accepted_ep)
		put_scif_port(aep->port.port);

	list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) {
		tmpep = list_entry(pos, struct endpt, miacceptlist);

	spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
	spin_lock_irqsave(&ms_info.mi_connlock, sflags);
	list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
		tmpep = list_entry(pos, struct endpt, list);
	put_conn_count(aep->remote_dev);

	list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
		tmpep = list_entry(pos, struct endpt, list);
	spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);

	micscif_teardown_ep(aep);
	spin_lock_irqsave(&ms_info.mi_eplock, sflags);
	micscif_add_epd_to_zombie_list(aep, MI_EPLOCK_HELD);
	spin_unlock(&ms_info.mi_eplock);
	// Remove and reject any pending connection requests.
	conreq = list_first_entry(&ep->conlist, struct conreq, list);
	msg.dst.node = conreq->msg.src.node;
	msg.dst.port = conreq->msg.src.port;
	msg.payload[0] = conreq->msg.payload[0];
	msg.payload[1] = conreq->msg.payload[1];
	/*
	 * No error handling on purpose for micscif_nodeqp_send().
	 * If the remote node is lost we still want to free the connection
	 * requests on the self node.
	 */
	micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, ep);
	// If a kSCIF accept is waiting wake it up
	wake_up_interruptible(&ep->conwq);
	spin_unlock_irqrestore(&ep->lock, sflags);

	if (ep->port.port && !ep->accepted_ep)
		put_scif_port(ep->port.port);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	micscif_add_epd_to_zombie_list(ep, !MI_EPLOCK_HELD);

scif_ref_rel(struct kref *kref_count)
{
	epd = container_of(kref_count, struct endpt, ref_count);
	__scif_close((scif_epd_t)epd);

scif_close(scif_epd_t epd)
{

EXPORT_SYMBOL(scif_close);
/**
 * scif_flush() - Flush the endpoint
 * @epd: The end point address returned from scif_open()
 */
__scif_flush(scif_epd_t epd)
{
	struct endpt *ep = (struct endpt *)epd;
	struct list_head *pos, *tmpq;

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	spin_lock_irqsave(&ep->lock, sflags);

		struct endpt *fep = NULL;

		init_waitqueue_head(&ep->disconwq);	// Wait for connection queue
		WARN_ON(ep->files);	// files should never be set while connected
		spin_unlock_irqrestore(&ep->lock, sflags);
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
			tmpep = list_entry(pos, struct endpt, list);
		put_conn_count(ep->remote_dev);

		// The other side has completed the disconnect before
		// the end point could be removed from the list. Therefore
		// the ep lock is not locked: traverse the disconnected list
		// to find the endpoint and release the conn lock.
		list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
			tmpep = list_entry(pos, struct endpt, list);
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		spin_unlock(&ms_info.mi_connlock);
		msg.payload[0] = (uint64_t)ep;
		msg.payload[1] = ep->remote_ep;
		err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);
		spin_unlock_irqrestore(&ep->lock, sflags);

		/* Now wait for the remote node to respond */
		wait_event_timeout(ep->disconwq,
			(ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);

		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_add_tail(&ep->list, &ms_info.mi_disconnected);
		ep->state = SCIFEP_DISCONNECTED;
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);

		// Wake up threads blocked in send and recv
		wake_up_interruptible(&ep->sendwq);
		wake_up_interruptible(&ep->recvwq);

		ep->state = SCIFEP_CLLISTEN;
		// If an accept is waiting wake it up
		wake_up_interruptible(&ep->conwq);
		spin_unlock_irqrestore(&ep->lock, sflags);

	spin_unlock_irqrestore(&ep->lock, sflags);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
/**
 * scif_bind() - Bind a SCIF end point to a port ID.
 * @epd: The end point address returned from scif_open()
 * @pn: Port ID (number) to bind to
 *
 * Set the port ID associated with the end point and place it in the bound
 * state. If a port ID of zero is requested, a non-zero port ID is allocated
 * for it.
 *
 * Upon successful completion the port ID (number) is returned.
 * If the end point is not in the unbound state then -EISCONN is returned.
 * If port ID zero is specified and allocation of a port ID fails, -ENOSPC
 * is returned.
 */
__scif_bind(scif_epd_t epd, uint16_t pn)
{
	struct endpt *ep = (struct endpt *)epd;

	pr_debug("SCIFAPI bind: ep %p %s requested port number %d\n",
		 ep, scif_ep_states[ep->state], pn);

	/*
	 * Modeled on http://www.ietf.org/rfc/rfc1700.txt?number=1700
	 * SCIF ports below SCIF_ADMIN_PORT_END can only be bound by
	 * system (or root) processes or by processes executed by
	 * privileged users.
	 */
	if (pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN)) {
		goto scif_bind_admin_exit;

	spin_lock_irqsave(&ep->lock, sflags);
	if (ep->state == SCIFEP_BOUND) {
	} else if (ep->state != SCIFEP_UNBOUND) {

	if ((tmp = rsrv_scif_port(pn)) != pn) {

	ep->state = SCIFEP_BOUND;
	ep->port.node = ms_info.mi_nodeid;
	ep->conn_async_state = ASYNC_CONN_IDLE;
	pr_debug("SCIFAPI bind: bound to port number %d\n", pn);
	spin_unlock_irqrestore(&ep->lock, sflags);

scif_bind(scif_epd_t epd, uint16_t pn)
{
	ret = __scif_bind(epd, pn);

EXPORT_SYMBOL(scif_bind);
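/*
 * Usage sketch (illustrative only): binding an endpoint before listen or
 * connect. The port number 2000 is an arbitrary example value; passing 0
 * asks SCIF to allocate any free port, and ports below SCIF_ADMIN_PORT_END
 * require CAP_SYS_ADMIN as checked above.
 *
 *	int pn = scif_bind(epd, 2000);	// bind to a specific port
 *	if (pn < 0)
 *		return pn;		// -EISCONN, -ENOSPC, ...
 *	// scif_bind(epd2, 0) would instead allocate any free port
 */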
/**
 * scif_listen() - Place the end point in the listening state
 * @epd: The end point address returned from scif_open()
 * @backlog: Maximum number of pending connection requests.
 *
 * The end point is placed in the listening state ready to accept connection
 * requests. The backlog parameter is saved to indicate the maximum number of
 * connection requests from the remote node to save. The end point is
 * placed on a list of listening end points to allow a connection request to
 * find it.
 *
 * Upon successful completion a zero is returned.
 * If the end point is not in the bound state, -EINVAL or -EISCONN is returned.
 */
__scif_listen(scif_epd_t epd, int backlog)
{
	struct endpt *ep = (struct endpt *)epd;

	pr_debug("SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]);
	spin_lock_irqsave(&ep->lock, sflags);

	BUG_ON(SCIFEP_ZOMBIE == ep->state);

	case SCIFEP_DISCONNECTED:
		spin_unlock_irqrestore(&ep->lock, sflags);

		spin_unlock_irqrestore(&ep->lock, sflags);

	ep->state = SCIFEP_LISTENING;
	INIT_LIST_HEAD(&ep->conlist);		// List of connection requests
	init_waitqueue_head(&ep->conwq);	// Wait for connection queue
	INIT_LIST_HEAD(&ep->li_accept);		// User ep list for ACCEPTREG calls
	spin_unlock_irqrestore(&ep->lock, sflags);

	// Listen status is complete so delete the qp information not needed
	// on a listen before placing on the list of listening ep's
	micscif_teardown_ep((void *)ep);
	spin_lock_irqsave(&ms_info.mi_eplock, sflags);
	list_add_tail(&ep->list, &ms_info.mi_listen);
	spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);

scif_listen(scif_epd_t epd, int backlog)
{
	ret = __scif_listen(epd, backlog);

EXPORT_SYMBOL(scif_listen);
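/*
 * Usage sketch (illustrative only): a minimal kernel-mode server setup,
 * assuming the endpoint was created and bound as above. The backlog of 5
 * is an arbitrary example value.
 *
 *	if ((err = scif_listen(epd, 5)) < 0)
 *		return err;	// -EINVAL or -EISCONN if not bound
 */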
/*
 * @node: destination node id
 *
 * Try to set up a p2p connection between the current node and the
 * destination node. We need the host to set up the initial p2p
 * connections, so we send this message to the host, which acts like a
 * proxy in setting up the p2p connection.
 */
static int scif_p2p_connect(int node)
{
	struct micscif_dev *remote_dev = &scif_dev[node];

	pr_debug("%s:%d SCIF_NODE_CONNECT to host\n", __func__, __LINE__);
	micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);

	msg.dst.node = SCIF_HOST_NODE;
	msg.uop = SCIF_NODE_CONNECT;

	if ((err = micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE],
		printk(KERN_ERR "%s:%d error while sending SCIF_NODE_CONNECT to"
		       " node %d\n", __func__, __LINE__, node);
		micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);

	wait_event_interruptible_timeout(remote_dev->sd_p2p_wq,
		(remote_dev->sd_state == SCIFDEV_RUNNING) ||
		(remote_dev->sd_state == SCIFDEV_NOTPRESENT), NODE_ALIVE_TIMEOUT);

	pr_debug("%s:%d SCIF_NODE_CONNECT state:%d\n", __func__, __LINE__,
	micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
static int scif_conn_func(struct endpt *ep)
{
	if ((err = micscif_reserve_dma_chan(ep))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		ep->state = SCIFEP_BOUND;
		goto connect_error_simple;

	// Initiate the first part of the endpoint QP setup
	err = micscif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset,
				       ENDPT_QP_SIZE, ep->remote_dev);

		printk(KERN_ERR "%s err %d qp_offset 0x%llx\n",
		       __func__, err, ep->qp_info.qp_offset);
		ep->state = SCIFEP_BOUND;
		goto connect_error_simple;

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	// Format connect message and send it
	msg.payload[0] = (uint64_t)ep;
	msg.payload[1] = ep->qp_info.qp_offset;
	if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;
	// Wait for the request to be processed.
	while ((err = wait_event_interruptible_timeout(ep->conwq,
		(ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT)) <= 0) {

		pr_debug("SCIFAPI connect: ep %p ^C detected\n", ep);
		// interrupted out of the wait
		msg.uop = SCIF_CNCT_TERM;
		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
			err = wait_event_timeout(ep->diswq,
				(ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT);
			if (!err && scifdev_alive(ep))

			if (ep->state == SCIFEP_MAPPING) {
				micscif_setup_qp_connect_response(ep->remote_dev,
					ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);
				msg.uop = SCIF_CNCT_GNTNACK;
				msg.payload[0] = ep->remote_ep;
				/* No error handling for Notification messages */
				micscif_nodeqp_send(ep->remote_dev, &msg, ep);

		// Ensure that even after a timeout the state of the end point is bound
		ep->state = SCIFEP_BOUND;
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;
	if (ep->state == SCIFEP_MAPPING) {
		err = micscif_setup_qp_connect_response(ep->remote_dev,
			ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);

		// If the resources to map the queue are not available then we need
		// to tell the other side to terminate the accept
			printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
			msg.uop = SCIF_CNCT_GNTNACK;
			msg.payload[0] = ep->remote_ep;
			/* No error handling for Notification messages */
			micscif_nodeqp_send(ep->remote_dev, &msg, ep);
			ep->state = SCIFEP_BOUND;
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			goto connect_error_simple;
		// Send a grant ack to inform the accept we are done mapping its resources.
		msg.uop = SCIF_CNCT_GNTACK;
		msg.payload[0] = ep->remote_ep;
		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
			ep->state = SCIFEP_CONNECTED;
			spin_lock_irqsave(&ms_info.mi_connlock, sflags);
			list_add_tail(&ep->list, &ms_info.mi_connected);
			get_conn_count(ep->remote_dev);
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			pr_debug("SCIFAPI connect: ep %p connected\n", ep);

			ep->state = SCIFEP_BOUND;
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			goto connect_error_simple;

	} else if (ep->state == SCIFEP_BOUND) {
		pr_debug("SCIFAPI connect: ep %p connection refused\n", ep);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;

		pr_debug("SCIFAPI connect: ep %p connection interrupted\n", ep);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		goto connect_error_simple;

	micscif_dec_node_refcnt(ep->remote_dev, 1);
/*
 * Workqueue handler for servicing non-blocking SCIF connect
 */
void micscif_conn_handler(struct work_struct *work)
{
	spin_lock(&ms_info.mi_nb_connect_lock);
	if (!list_empty(&ms_info.mi_nb_connect_list)) {
		ep = list_first_entry(&ms_info.mi_nb_connect_list,
				      struct endpt, conn_list);
		list_del(&ep->conn_list);
	spin_unlock(&ms_info.mi_nb_connect_lock);

		ep->conn_err = scif_conn_func(ep);
		wake_up_interruptible(&ep->conn_pend_wq);
/**
 * scif_connect() - Request a connection to a remote node
 * @epd: The end point address returned from scif_open()
 * @dst: Remote node address information
 *
 * The function requests a scif connection to the remote node
 * identified by the dst parameter. "dst" contains the remote node and
 * port to connect to.
 *
 * Upon successful completion a zero will be returned.
 * If the end point is not in the bound state, -EINVAL will be returned.
 * If during the connection sequence resource allocation fails, -ENOMEM
 * will be returned.
 * If the remote side is not responding to connection requests the caller may
 * terminate this function with a signal. If so, -EINTR will be returned.
 */
__scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block)
{
	struct endpt *ep = (struct endpt *)epd;
	struct micscif_dev *remote_dev;

	pr_debug("SCIFAPI connect: ep %p %s\n", ep,
		 scif_ep_states[ep->state]);

	if (dst->node > MAX_BOARD_SUPPORTED)

	remote_dev = &scif_dev[dst->node];

	if ((SCIFDEV_INIT == remote_dev->sd_state ||
	     SCIFDEV_STOPPED == remote_dev->sd_state) && mic_p2p_enable)
		if ((err = scif_p2p_connect(dst->node)))

	if (SCIFDEV_RUNNING != scif_dev[dst->node].sd_state &&
	    SCIFDEV_SLEEPING != scif_dev[dst->node].sd_state)
	spin_lock_irqsave(&ep->lock, sflags);

	BUG_ON(SCIFEP_ZOMBIE == ep->state);

	case SCIFEP_DISCONNECTED:
		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;

		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)

		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;

		if ((ep->port.port = get_scif_port()) == 0)

		ep->port.node = ms_info.mi_nodeid;
		ep->conn_async_state = ASYNC_CONN_IDLE;

		/*
		 * If a non-blocking connect has already been initiated
		 * (conn_async_state is either ASYNC_CONN_INPROGRESS or
		 * ASYNC_CONN_FLUSH_WORK), the end point could end up in
		 * SCIF_BOUND due to an error in the connection process
		 * (e.g., connection refused).
		 * If conn_async_state is ASYNC_CONN_INPROGRESS, transition to
		 * ASYNC_CONN_FLUSH_WORK so that the error status can be collected.
		 * If the state is already ASYNC_CONN_FLUSH_WORK, then set the error
		 * to EINPROGRESS since some other thread is waiting to collect
		 * the error status.
		 */
		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
		else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
	init_waitqueue_head(&ep->sendwq);
	init_waitqueue_head(&ep->recvwq);
	init_waitqueue_head(&ep->conwq);
	init_waitqueue_head(&ep->diswq);
	ep->conn_async_state = 0;

		ep->conn_async_state = ASYNC_CONN_INPROGRESS;

	if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
		goto connect_simple_unlock1;

	ep->state = SCIFEP_CONNECTING;
	ep->remote_dev = &scif_dev[dst->node];
	ep->sd_state = SCIFDEV_RUNNING;
	ep->qp_info.qp->magic = SCIFEP_MAGIC;
	ep->qp_info.qp->ep = (uint64_t)ep;
	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
		init_waitqueue_head(&ep->conn_pend_wq);
		spin_lock(&ms_info.mi_nb_connect_lock);
		list_add_tail(&ep->conn_list,
			      &ms_info.mi_nb_connect_list);
		spin_unlock(&ms_info.mi_nb_connect_lock);
		queue_work(ms_info.mi_conn_wq, &ms_info.mi_conn_work);
		spin_unlock_irqrestore(&ep->lock, sflags);

	else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) {
		flush_workqueue(ms_info.mi_conn_wq);

		spin_lock_irqsave(&ep->lock, sflags);
		ep->conn_async_state = ASYNC_CONN_IDLE;
		spin_unlock_irqrestore(&ep->lock, sflags);

		err = scif_conn_func(ep);
scif_connect(scif_epd_t epd, struct scif_portID *dst)
{
	ret = __scif_connect(epd, dst, false);

EXPORT_SYMBOL(scif_connect);
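/*
 * Usage sketch (illustrative only): a blocking kernel-mode connect. The
 * node/port values are arbitrary example values; struct scif_portID and
 * scif_connect() are the exported API above.
 *
 *	struct scif_portID dst = { .node = 1, .port = 2000 };
 *
 *	if ((err = scif_connect(epd, &dst)) < 0)
 *		return err;	// -EINVAL, -ENOMEM, -EINTR, ...
 */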
/**
 * scif_accept() - Accept a connection request from the remote node
 * @epd: The end point address returned from scif_open()
 * @peer: Filled in with peer node and port information
 * @newepd: New end point created for connection
 * @flags: Indicates synchronous or asynchronous mode
 *
 * The function accepts a connection request from the remote node. Successful
 * completion is indicated by a new end point being created and passed back
 * to the caller for future reference.
 *
 * Upon successful completion a zero will be returned and the peer information
 * will be filled in.
 * If the end point is not in the listening state, -EINVAL will be returned.
 * If during the connection sequence resource allocation fails, -ENOMEM
 * will be returned.
 * If the function is called asynchronously and no connection requests are
 * pending it will return -EAGAIN.
 * If the remote side is not sending any connection requests the caller may
 * terminate this function with a signal. If so, -EINTR will be returned.
 */
__scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
{
	struct endpt *lep = (struct endpt *)epd;

	pr_debug("SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]);

	// Error if flags other than SCIF_ACCEPT_SYNC are set
	if (flags & ~SCIF_ACCEPT_SYNC) {
		pr_debug("SCIFAPI accept: ep %p invalid flags %x\n", lep,
			 flags & ~SCIF_ACCEPT_SYNC);
pr_debug("SCIFAPI accept: ep %p peer %p or newepd %p NULL\n",
spin_lock_irqsave(&lep
->lock
, sflags
);
if (lep
->state
!= SCIFEP_LISTENING
) {
pr_debug("SCIFAPI accept: ep %p not listending\n", lep
);
spin_unlock_irqrestore(&lep
->lock
, sflags
);
if (!lep
->conreqcnt
&& !(flags
& SCIF_ACCEPT_SYNC
)) {
// No connection request present and we do not want to wait
pr_debug("SCIFAPI accept: ep %p async request with nothing pending\n", lep
);
spin_unlock_irqrestore(&lep
->lock
, sflags
);
spin_unlock_irqrestore(&lep
->lock
, sflags
);
	lep->files = current ? current->files : NULL;

	if ((err = wait_event_interruptible(lep->conwq,
		(lep->conreqcnt || (lep->state != SCIFEP_LISTENING)))) != 0) {
		pr_debug("SCIFAPI accept: ep %p ^C detected\n", lep);
		return err;	// -ERESTARTSYS

	if (lep->state != SCIFEP_LISTENING) {

	spin_lock_irqsave(&lep->lock, sflags);

	// Get the first connect request off the list
	conreq = list_first_entry(&lep->conlist, struct conreq, list);
	spin_unlock_irqrestore(&lep->lock, sflags);
	// Fill in the peer information
	peer->node = conreq->msg.src.node;
	peer->port = conreq->msg.src.port;

	// Create the connection endpoint
	cep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL);
		pr_debug("SCIFAPI accept: ep %p new end point allocation failed\n", lep);
		goto scif_accept_error_epalloc;

	spin_lock_init(&cep->lock);
	mutex_init(&cep->sendlock);
	mutex_init(&cep->recvlock);
	cep->state = SCIFEP_CONNECTING;
	cep->remote_dev = &scif_dev[peer->node];
	cep->remote_ep = conreq->msg.payload[0];
	cep->sd_state = SCIFDEV_RUNNING;

	if (!scifdev_alive(cep)) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto scif_accept_error_qpalloc;

	if (micscif_rma_ep_init(cep) < 0) {
		pr_debug("SCIFAPI accept: ep %p new %p RMA EP init failed\n", lep, cep);
		goto scif_accept_error_qpalloc;

	if ((err = micscif_reserve_dma_chan(cep))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto scif_accept_error_qpalloc;

	cep->qp_info.qp = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL);
		printk(KERN_ERR "Port Qp Allocation Failed\n");
		goto scif_accept_error_qpalloc;

	cep->qp_info.qp->magic = SCIFEP_MAGIC;
	cep->qp_info.qp->ep = (uint64_t)cep;
	micscif_inc_node_refcnt(cep->remote_dev, 1);
	err = micscif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset,
				      conreq->msg.payload[1], ENDPT_QP_SIZE,
				      cep->remote_dev);

		pr_debug("SCIFAPI accept: ep %p new %p micscif_setup_qp_accept %d qp_offset 0x%llx\n",
			 lep, cep, err, cep->qp_info.qp_offset);
		micscif_dec_node_refcnt(cep->remote_dev, 1);
		goto scif_accept_error_map;

	cep->port.node = lep->port.node;
	cep->port.port = lep->port.port;
	cep->peer.node = peer->node;
	cep->peer.port = peer->port;
	init_waitqueue_head(&cep->sendwq);	// Wait for data to be consumed
	init_waitqueue_head(&cep->recvwq);	// Wait for data to be produced
	init_waitqueue_head(&cep->conwq);	// Wait for connection request
	// Return the grant message
	msg.payload[0] = cep->remote_ep;
	msg.payload[1] = cep->qp_info.qp_offset;
	msg.payload[2] = (uint64_t)cep;

	err = micscif_nodeqp_send(cep->remote_dev, &msg, cep);
	micscif_dec_node_refcnt(cep->remote_dev, 1);
		goto scif_accept_error_map;

	err = wait_event_timeout(cep->conwq,
		(cep->state != SCIFEP_CONNECTING), NODE_ACCEPT_TIMEOUT);
	if (!err && scifdev_alive(cep))
		goto scif_accept_error_map;
	spin_lock_irqsave(&cep->lock, sflags);
	if (cep->state == SCIFEP_CONNECTED) {
		// Connect sequence complete; return new endpoint information
		*newepd = (scif_epd_t)cep;
		spin_unlock_irqrestore(&cep->lock, sflags);
		pr_debug("SCIFAPI accept: ep %p new %p returning new end point\n", lep, cep);
	if (cep->state == SCIFEP_CLOSING) {
		// Remote failed to allocate resources and NAKed the grant.
		// At this point there is nothing referencing the new end point.
		spin_unlock_irqrestore(&cep->lock, sflags);
		micscif_teardown_ep((void *)cep);

		// If called with the sync flag then go back and wait.
		if (flags & SCIF_ACCEPT_SYNC) {
			spin_lock_irqsave(&lep->lock, sflags);

		pr_debug("SCIFAPI accept: ep %p new %p remote failed to allocate resources\n", lep, cep);

	// While the connect was in progress the other side closed and sent a disconnect,
	// so set the end point status to closed but return anyway. This will allow
	// the caller to drain anything the other side may have put in the message queue.
	*newepd = (scif_epd_t)cep;
	spin_unlock_irqrestore(&cep->lock, sflags);
	// Error allocating or mapping resources
scif_accept_error_qpalloc:

scif_accept_error_epalloc:
	micscif_inc_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
	// Now reject the connection request due to lack of resources
	msg.dst.node = conreq->msg.src.node;
	msg.dst.port = conreq->msg.src.port;
	msg.payload[0] = conreq->msg.payload[0];
	msg.payload[1] = conreq->msg.payload[1];
	/* No error handling for Notification messages */
	micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, NULL);
	micscif_dec_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
{
	ret = __scif_accept(epd, peer, newepd, flags);

	kref_init(&((*newepd)->ref_count));

EXPORT_SYMBOL(scif_accept);
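/*
 * Usage sketch (illustrative only): a synchronous accept on a listening
 * endpoint. The peer structure is filled in on success; the error-handling
 * policy is an assumption of the example.
 *
 *	struct scif_portID peer;
 *	scif_epd_t newepd;
 *
 *	if ((err = scif_accept(lepd, &peer, &newepd, SCIF_ACCEPT_SYNC)) < 0)
 *		return err;	// -EINTR, -ENOMEM, ...
 *	pr_info("accepted connection from node %d port %d\n",
 *		peer.node, peer.port);
 */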
/*
 * @epd: The end point address returned from scif_open()
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * Validate parameters for the messaging APIs scif_send(..)/scif_recv(..).
 */
scif_msg_param_check(scif_epd_t epd, int len, int flags)
{
	if (flags && (!(flags & SCIF_RECV_BLOCK)))
#define SCIF_BLAST	(1 << 1)	/* Use bit 1 of flags field */

/*
 * Added a temporary implementation of the exception path.
 * The cost to the normal path is 1 local variable (set once and
 * tested once) plus 2 tests for the 'blast' flag.
 * This only applies to the card-side kernel API.
 */
/*
 * _scif_send() - Send data to connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function sends a packet of data to the queue created by the
 * connection establishment sequence. It returns when the packet has
 * been queued.
 *
 * Successful completion returns the number of bytes sent.
 * If the end point is not in the connected state, -ENOTCONN is returned.
 * This function may be interrupted by a signal and will return -EINTR.
 */
_scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	struct nodemsg notif_msg;
	size_t curr_xfer_len = 0;

	if (flags & SCIF_SEND_BLOCK)

	if (flags & SCIF_BLAST) {
		/*
		 * Do a decent try to acquire lock (~100 uSec)
		 */
		for (ret = tl = 0; ret < 100 && !tl; ret++) {
			tl = spin_trylock_irqsave(&ep->lock, sflags);

		spin_lock_irqsave(&ep->lock, sflags);

	spin_lock_irqsave(&ep->lock, sflags);
	while (sent_len != len) {
		if (ep->state == SCIFEP_DISCONNECTED) {
			ret = (int)(sent_len ? sent_len : -ECONNRESET);

		if (ep->state != SCIFEP_CONNECTED) {
			ret = (int)(sent_len ? sent_len : -ENOTCONN);

		if (!scifdev_alive(ep)) {
			ret = (int)(sent_len ? sent_len : -ENODEV);

		write_count = micscif_rb_space(&ep->qp_info.qp->outbound_q);
		/*
		 * Best effort to send as much data as there
		 * is space in the RB, particularly important for
		 * the Non Blocking case.
		 */
		curr_xfer_len = min(len - sent_len, write_count);
		ret = micscif_rb_write(&ep->qp_info.qp->outbound_q, msg,
				       (uint32_t)curr_xfer_len);

			spin_unlock_irqrestore(&ep->lock, sflags);
			/*
			 * If there is space in the RB and we have the
			 * EP lock held then writing to the RB should
			 * succeed. Releasing spin lock before asserting
			 * to avoid deadlocking the system.
			 */

		/*
		 * Success. Update write pointer.
		 */
		micscif_rb_commit(&ep->qp_info.qp->outbound_q);
		if (flags & SCIF_BLAST) {
			/*
			 * Bypass path; set flag in the host-side node_qp
			 * and ring the doorbell. Host will wake up all
			 * listeners, such that the message will be seen.
			 * Need micscif_send_host_intr() to be non-static.
			 */
			extern int micscif_send_host_intr(struct micscif_dev *, uint32_t);
			ep->remote_dev->qpairs->remote_qp->blast = 1;
			smp_wmb();	/* Sufficient or need sfence? */
			micscif_send_host_intr(ep->remote_dev, 0);
			/*
			 * Normal path: send notification on the
			 * node_qp ring buffer and ring the doorbell.
			 */
			notif_msg.src = ep->port;
			notif_msg.uop = SCIF_CLIENT_SENT;
			notif_msg.payload[0] = ep->remote_ep;
			if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
				ret = sent_len ? sent_len : ret;

		/*
		 * Send a notification to the peer about the
		 * produced data message.
		 */
		notif_msg.src = ep->port;
		notif_msg.uop = SCIF_CLIENT_SENT;
		notif_msg.payload[0] = ep->remote_ep;
		if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
			ret = (int)(sent_len ? sent_len : ret);

		sent_len += curr_xfer_len;
		msg = (char *)msg + curr_xfer_len;
		curr_xfer_len = min(len - sent_len, (size_t)(ENDPT_QP_SIZE - 1));
		/*
		 * Not enough space in the RB. Return in the Non Blocking case.
		 */
		if (!(flags & SCIF_SEND_BLOCK)) {
			/*
			 * Flags SCIF_BLAST and SCIF_SEND_BLOCK are mutually
			 * exclusive, so if we get here we know that SCIF_BLAST
			 * was not set and thus we _do_ have the spinlock.
			 * No need to check variable tl here.
			 */
			spin_unlock_irqrestore(&ep->lock, sflags);

		/*
		 * Wait for a message now in the Blocking case.
		 */
		if ((ret = wait_event_interruptible(ep->sendwq,
			(SCIFEP_CONNECTED != ep->state) ||
			(micscif_rb_space(&ep->qp_info.qp->outbound_q)
			 >= curr_xfer_len) || (!scifdev_alive(ep))))) {
			ret = (int)(sent_len ? sent_len : ret);

		spin_lock_irqsave(&ep->lock, sflags);

	spin_unlock_irqrestore(&ep->lock, sflags);
/*
 * _scif_recv() - Receive data from connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 * @touser: package sent to user buffer or kernel
 *
 * This function requests to receive a packet of data from the queue
 * created by the connection establishment sequence. It reads the amount
 * of data requested before returning.
 *
 * This function differs from scif_send() by also returning data if the
 * end point is in the disconnected state and data is present.
 *
 * Successful completion returns the number of bytes read.
 * If the end point is not in the connected state, or in the disconnected
 * state with data present, it returns -ENOTCONN.
 * This function may be interrupted by a signal and will return -EINTR.
 */
_scif_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	struct nodemsg notif_msg;
	size_t curr_recv_len = 0;
	size_t remaining_len = len;

	if (flags & SCIF_RECV_BLOCK)

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	spin_lock_irqsave(&ep->lock, sflags);

	if (ep->state != SCIFEP_CONNECTED &&
	    ep->state != SCIFEP_DISCONNECTED) {
		ret = (int)(len - remaining_len) ?
		      (int)(len - remaining_len) : -ENOTCONN;

		read_count = micscif_rb_count(&ep->qp_info.qp->inbound_q,
		/*
		 * Best effort to recv as much data as there
		 * are bytes to read in the RB, particularly
		 * important for the Non Blocking case.
		 */
		curr_recv_len = min(remaining_len, read_count);
		read_size = micscif_rb_get_next(
				&ep->qp_info.qp->inbound_q,
				msg, (int)curr_recv_len);
		/* could only happen when copying to a USER buffer */
		if (read_size != curr_recv_len) {

			spin_unlock_irqrestore(&ep->lock, sflags);
			/*
			 * If there are bytes to be read from the RB and
			 * we have the EP lock held then reading from the
			 * RB should succeed. Releasing spin lock before
			 * asserting to avoid deadlocking the system.
			 */
			BUG_ON(read_size != curr_recv_len);
		if (ep->state == SCIFEP_CONNECTED) {
			/*
			 * Update the read pointer only if the endpoint is
			 * still connected, else the read pointer might no
			 * longer exist since the peer has freed resources!
			 */
			micscif_rb_update_read_ptr(&ep->qp_info.qp->inbound_q);
			/*
			 * Send a notification to the peer about the
			 * consumed data message only if the EP is in
			 * SCIFEP_CONNECTED state.
			 */
			notif_msg.src = ep->port;
			notif_msg.uop = SCIF_CLIENT_RCVD;
			notif_msg.payload[0] = ep->remote_ep;
			if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
				ret = (len - (int)remaining_len) ?
				      (len - (int)remaining_len) : ret;

		remaining_len -= curr_recv_len;
		msg = (char *)msg + curr_recv_len;
		curr_recv_len = min(remaining_len, (size_t)(ENDPT_QP_SIZE - 1));
		/*
		 * Bail out now if the EP is in SCIFEP_DISCONNECTED state else
		 * we will keep looping forever.
		 */
		if (ep->state == SCIFEP_DISCONNECTED) {
			ret = (len - (int)remaining_len) ?
			      (len - (int)remaining_len) : -ECONNRESET;

		/*
		 * Return in the Non Blocking case if there is no data
		 * to read in this iteration.
		 */
		if (!(flags & SCIF_RECV_BLOCK)) {
			ret = len - (int)remaining_len;

		spin_unlock_irqrestore(&ep->lock, sflags);
		micscif_dec_node_refcnt(ep->remote_dev, 1);

		/*
		 * Wait for a message now in the Blocking case,
		 * or until the other side disconnects.
		 */
		if ((ret = wait_event_interruptible(ep->recvwq,
			(SCIFEP_CONNECTED != ep->state) ||
			(micscif_rb_count(&ep->qp_info.qp->inbound_q,
			 curr_recv_len) >= curr_recv_len) || (!scifdev_alive(ep))))) {
			ret = (len - remaining_len) ?
			      (len - (int)remaining_len) : ret;

		micscif_inc_node_refcnt(ep->remote_dev, 1);
		spin_lock_irqsave(&ep->lock, sflags);

	spin_unlock_irqrestore(&ep->lock, sflags);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
/*
 * scif_user_send() - Send data to connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from the driver IOCTL entry point
 * only and is a wrapper for _scif_send().
 */
scif_user_send(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));

	pr_debug("SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]);

	if ((err = scif_msg_param_check(epd, len, flags)))

	if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	/*
	 * Grabbing the lock before breaking up the transfer into
	 * multiple chunks is required to ensure that messages do
	 * not get fragmented and reordered.
	 */
	mutex_lock(&ep->sendlock);

	while (sent_len != len) {
		msg = (void *)((char *)msg + err);
		loop_len = len - sent_len;
		loop_len = min(chunk_len, loop_len);
		if (copy_from_user(tmp, msg, loop_len)) {

		err = _scif_send(epd, (void *)tmp, loop_len, flags);

	mutex_unlock(&ep->sendlock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);

	return err < 0 ? err : sent_len;
/*
 * scif_user_recv() - Receive data from connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from the driver IOCTL entry point
 * only and is a wrapper for _scif_recv().
 */
scif_user_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));

	pr_debug("SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]);

	if ((err = scif_msg_param_check(epd, len, flags)))

	if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {

	/*
	 * Grabbing the lock before breaking up the transfer into
	 * multiple chunks is required to ensure that messages do
	 * not get fragmented and reordered.
	 */
	mutex_lock(&ep->recvlock);

	while (recv_len != len) {
		msg = (void *)((char *)msg + err);
		loop_len = len - recv_len;
		loop_len = min(chunk_len, loop_len);
		if ((err = _scif_recv(epd, tmp, loop_len, flags)) < 0)

		if (copy_to_user(msg, tmp, err)) {

	mutex_unlock(&ep->recvlock);

	return err < 0 ? err : recv_len;
/*
 * Added a temporary implementation of the exception path.
 * The cost to the normal path is testing of 2 flag bits instead
 * of just one, and a change to the condition for node wakeup.
 */

/*
 * scif_send() - Send data to connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from kernel mode only and is
 * a wrapper for _scif_send().
 */
__scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;

	pr_debug("SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]);

	/*
	 * KAA: this is the same code as scif_msg_param_check(),
	 * but since that routine is shared with scif_recv
	 * I thought it safer to replicate the code here.
	 */
	if (flags && !(flags & (SCIF_SEND_BLOCK | SCIF_BLAST)))

	if ((flags & (SCIF_SEND_BLOCK | SCIF_BLAST)) ==
	    (SCIF_SEND_BLOCK | SCIF_BLAST))

	if ((ret = scif_msg_param_check(epd, len, flags)))
	/*
	 * Cannot block while waiting for node to wake up
	 * if non blocking messaging mode is requested. Return
	 * ENODEV if the remote node is idle.
	 */
	if (!(flags & SCIF_SEND_BLOCK) && ep->remote_dev &&
	    SCIF_NODE_IDLE == atomic_long_read(
	    &ep->remote_dev->scif_ref_cnt))

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	/*
	 * Grab the mutex lock in the blocking case only
	 * to ensure messages do not get fragmented/reordered.
	 * The non blocking mode is protected using spin locks.
	 */
	if (flags & SCIF_SEND_BLOCK)
		mutex_lock(&ep->sendlock);

	ret = _scif_send(epd, msg, len, flags);

	if (flags & SCIF_SEND_BLOCK)
		mutex_unlock(&ep->sendlock);

	micscif_dec_node_refcnt(ep->remote_dev, 1);

scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	ret = __scif_send(epd, msg, len, flags);

EXPORT_SYMBOL(scif_send);
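/*
 * Usage sketch (illustrative only): a blocking kernel-mode send on a
 * connected endpoint. The buffer and its size are example values.
 *
 *	char buf[64] = "hello";
 *	int sent = scif_send(epd, buf, sizeof(buf), SCIF_SEND_BLOCK);
 *
 *	if (sent < 0)
 *		return sent;	// -ENOTCONN, -ECONNRESET, -EINTR, ...
 *	// sent holds the number of bytes queued to the peer
 */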
/*
 * scif_recv() - Receive data from connection queue
 * @epd: The end point address returned from scif_open()
 * @msg: Address to place data
 * @len: Length to receive
 * @flags: Synchronous or asynchronous access
 *
 * This function is called from kernel mode only and is
 * a wrapper for _scif_recv().
 */
__scif_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	struct endpt *ep = (struct endpt *)epd;

	pr_debug("SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]);

	if ((ret = scif_msg_param_check(epd, len, flags)))

	/*
	 * Cannot block while waiting for node to wake up
	 * if non blocking messaging mode is requested. Return
	 * ENODEV if the remote node is idle.
	 */
	if (!flags && ep->remote_dev &&
	    SCIF_NODE_IDLE == atomic_long_read(
	    &ep->remote_dev->scif_ref_cnt))

	/*
	 * Grab the mutex lock in the blocking case only
	 * to ensure messages do not get fragmented/reordered.
	 * The non blocking mode is protected using spin locks.
	 */
	if (flags & SCIF_RECV_BLOCK)
		mutex_lock(&ep->recvlock);

	ret = _scif_recv(epd, msg, len, flags);

	if (flags & SCIF_RECV_BLOCK)
		mutex_unlock(&ep->recvlock);

scif_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	ret = __scif_recv(epd, msg, len, flags);

EXPORT_SYMBOL(scif_recv);
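/*
 * Usage sketch (illustrative only): a blocking receive that mirrors the
 * send sketch above; the buffer size is an example value.
 *
 *	char buf[64];
 *	int rcvd = scif_recv(epd, buf, sizeof(buf), SCIF_RECV_BLOCK);
 *
 *	if (rcvd < 0)
 *		return rcvd;	// -ENOTCONN, -ECONNRESET, -EINTR, ...
 *	// rcvd bytes of peer data are now in buf
 */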
/*
 * __scif_pin_pages - __scif_pin_pages() pins the physical pages which back
 * the range of virtual address pages starting at addr and continuing for
 * len bytes. addr and len are constrained to be multiples of the page size.
 * A successful call returns an opaque pointer value which may be used in
 * subsequent calls to scif_register_pinned_pages().
 *
 * Upon successful completion, __scif_pin_pages() returns a
 * scif_pinned_pages_t value; else an apt error is returned as documented
 * in scif.h. Protections of the set of pinned pages are also returned by
 * reference via out_prot.
 */
__scif_pin_pages(void *addr, size_t len, int *out_prot,
		 int map_flags, scif_pinned_pages_t *pages)
{
	struct scif_pinned_pages *pinned_pages;
	int nr_pages, err = 0, i;
	bool vmalloc_addr = false;
	bool try_upgrade = false;
	struct mm_struct *mm = NULL;

	if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT))

	ulimit = !!(map_flags & SCIF_MAP_ULIMIT);

	/* Unsupported protection requested */
	if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))

	/* addr/len must be page aligned. len should be non zero */
	if ((align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
	    (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))

	nr_pages = (int)(len >> PAGE_SHIFT);
	/* Allocate a set of pinned pages */
	if (!(pinned_pages = micscif_create_pinned_pages(nr_pages, prot)))

	if (unlikely(map_flags & SCIF_MAP_KERNEL)) {
		if (is_vmalloc_addr(addr))

		for (i = 0; i < nr_pages; i++) {
			if (unlikely(vmalloc_addr))
				vmalloc_to_page((char *)addr + (i * PAGE_SIZE));

				virt_to_page((char *)addr + (i * PAGE_SIZE));
			pinned_pages->num_pages[i] = 1;
			pinned_pages->nr_contig_chunks++;

		pinned_pages->nr_pages = nr_pages;
		pinned_pages->map_flags = SCIF_MAP_KERNEL;

		if (prot == SCIF_PROT_READ)

		down_write(&mm->mmap_sem);

		err = __scif_check_inc_pinned_vm(mm, nr_pages);

		pinned_pages->nr_pages = 0;

		pinned_pages->nr_pages = get_user_pages(
			prot & SCIF_PROT_WRITE ? FOLL_WRITE : 0,

		if (nr_pages == pinned_pages->nr_pages) {
			atomic_long_add_return(nr_pages, &ms_info.rma_pin_cnt);
			micscif_detect_large_page(pinned_pages, addr);

			__scif_dec_pinned_vm_lock(mm, nr_pages, 0);
			WARN_ON(atomic_long_sub_return(1,
				&ms_info.rma_mm_cnt) < 0);
			/* Roll back any pinned pages */
			for (i = 0; i < pinned_pages->nr_pages; i++) {
				if (pinned_pages->pages[i])
					put_page(pinned_pages->pages[i]);

			prot &= ~SCIF_PROT_WRITE;

		pinned_pages->map_flags = 0;

	if (pinned_pages->nr_pages < nr_pages) {

		pinned_pages->nr_pages = nr_pages;

	atomic_set(&pinned_pages->ref_count, nr_pages);

	__scif_dec_pinned_vm_lock(mm, nr_pages, 0);

	/* Something went wrong! Rollback */
	pinned_pages->nr_pages = nr_pages;
	micscif_destroy_pinned_pages(pinned_pages);

	pr_debug("%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
/*
 * scif_pin_pages - scif_pin_pages() pins the physical pages which back
 * the range of virtual address pages starting at addr and continuing for
 * len bytes. addr and len are constrained to be multiples of the page size.
 * A successful call returns an opaque pointer value which may be used in
 * subsequent calls to scif_register_pinned_pages().
 *
 * Upon successful completion, scif_pin_pages() returns a
 * scif_pinned_pages_t value; else an apt error is returned as documented
 * in scif.h.
 */
scif_pin_pages(void *addr, size_t len, int prot,
	       int map_flags, scif_pinned_pages_t *pages)
{
	return __scif_pin_pages(addr, len, &prot, map_flags, pages);
}
EXPORT_SYMBOL(scif_pin_pages);
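/*
 * Usage sketch (illustrative only): pinning a page-aligned kernel buffer
 * ahead of registration. kbuf/KBUF_LEN are hypothetical example names;
 * the prot and map_flags values are the exported API above.
 *
 *	scif_pinned_pages_t pp;
 *
 *	err = scif_pin_pages(kbuf, KBUF_LEN,
 *			     SCIF_PROT_READ | SCIF_PROT_WRITE,
 *			     SCIF_MAP_KERNEL, &pp);
 *	if (err < 0)
 *		return err;
 *	// ... scif_register_pinned_pages() ... scif_unpin_pages(pp);
 */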
/*
 * scif_unpin_pages: Unpin a set of pages
 *
 * Upon successful completion, scif_unpin_pages() returns 0;
 * else an apt error is returned as documented in scif.h
 */
scif_unpin_pages(scif_pinned_pages_t pinned_pages)
{
	if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic)

	ret = atomic_sub_return((int32_t)pinned_pages->nr_pages,
				&pinned_pages->ref_count);

	/*
	 * Destroy the window if the ref count for this set of pinned
	 * pages has dropped to zero. If it is positive then there is
	 * a valid registered window which is backed by these pages and
	 * it will be destroyed once all such windows are unregistered.
	 */
	err = micscif_destroy_pinned_pages(pinned_pages);

EXPORT_SYMBOL(scif_unpin_pages);
/*
 * scif_register_pinned_pages: Mark a memory region for remote access.
 *
 * The scif_register_pinned_pages() function opens a window, a range
 * of whole pages of the registered address space of the endpoint epd,
 * starting at offset po. The value of po, further described below, is
 * a function of the parameters offset and pinned_pages, and the value
 * of map_flags. Each page of the window represents a corresponding
 * physical memory page of pinned_pages; the length of the window is
 * the same as the length of pinned_pages. A successful call returns
 * po as the return value.
 *
 * Upon successful completion, scif_register_pinned_pages() returns
 * the offset at which the mapping was placed (po);
 * else an apt error is returned as documented in scif.h
 */
__scif_register_pinned_pages(scif_epd_t epd,
			     scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
{
	struct endpt *ep = (struct endpt *)epd;
	uint64_t computed_offset;
	struct reg_range_t *window;

	if (!ep || !pinned_pages || pinned_pages->magic != SCIFEP_MAGIC)

	if (map_flags & ~SCIF_MAP_FIXED)

	len = pinned_pages->nr_pages << PAGE_SHIFT;

	/*
	 * Offset is not page aligned/negative or offset+len
	 * wraps around with SCIF_MAP_FIXED.
	 */
	if ((map_flags & SCIF_MAP_FIXED) &&
	    ((align_low(offset, PAGE_SIZE) != offset) ||
	     (offset + (off_t)len < offset)))

	if ((err = verify_epd(ep)))

	/* Compute the offset for this registration */
	if ((err = micscif_get_window_offset(ep, map_flags, offset,

	/* Allocate and prepare self registration window */
	if (!(window = micscif_create_window(ep, pinned_pages->nr_pages,
					     computed_offset, false))) {
		micscif_free_window_offset(ep, computed_offset, len);

	window->pinned_pages = pinned_pages;
	window->nr_pages = pinned_pages->nr_pages;
	window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
	window->prot = pinned_pages->prot;
	/*
	 * This set of pinned pages now belongs to this window as well.
	 * Assert if the ref count is zero since it is an error to
	 * pass pinned_pages to scif_register_pinned_pages() after
	 * calling scif_unpin_pages().
	 */
	if (!atomic_add_unless(&pinned_pages->ref_count,
			       (int32_t)pinned_pages->nr_pages, 0))

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	if ((err = micscif_send_alloc_request(ep, window))) {
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

	/* Prepare the remote registration window */
	if ((err = micscif_prep_remote_window(ep, window))) {
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		micscif_set_nr_pages(ep->remote_dev, window);
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

	/* Tell the peer about the new window */
	if ((err = micscif_send_scif_register(ep, window))) {
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

	micscif_dec_node_refcnt(ep->remote_dev, 1);

	/* No further failures expected. Insert new window */
	mutex_lock(&ep->rma_info.rma_lock);
	set_window_ref_count(window, pinned_pages->nr_pages);
	micscif_insert_window(window, &ep->rma_info.reg_list);
	mutex_unlock(&ep->rma_info.rma_lock);

	micscif_destroy_window(ep, window);
	printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
scif_register_pinned_pages(scif_epd_t epd,
			   scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
{
	ret = __scif_register_pinned_pages(epd, pinned_pages, offset, map_flags);

EXPORT_SYMBOL(scif_register_pinned_pages);
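/*
 * Usage sketch (illustrative only): exposing a previously pinned set of
 * pages and letting SCIF choose the window offset (offset 0 without
 * SCIF_MAP_FIXED). pp comes from the scif_pin_pages() sketch above.
 *
 *	off_t po = scif_register_pinned_pages(epd, pp, 0, 0);
 *
 *	if (po < 0)
 *		return (int)po;
 *	// the peer can now address this memory at window offset po
 */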
/*
 * scif_get_pages - Add references to remote registered pages
 *
 * scif_get_pages() returns the addresses of the physical pages represented
 * by those pages of the registered address space of the peer of epd, starting
 * at offset offset and continuing for len bytes. offset and len are
 * constrained to be multiples of the page size.
 *
 * Upon successful completion, scif_get_pages() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
__scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
{
	struct endpt *ep = (struct endpt *)epd;
	struct micscif_rma_req req;
	struct reg_range_t *window = NULL;

	pr_debug("SCIFAPI get_pinned_pages: ep %p %s offset 0x%lx len 0x%lx\n",
		 ep, scif_ep_states[ep->state], offset, len);

	if ((err = verify_epd(ep)))

	if ((offset + len < offset) ||
	    (align_low((uint64_t)offset, PAGE_SIZE) != (uint64_t)offset) ||
	    (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))

	nr_pages = len >> PAGE_SHIFT;

	req.out_window = &window;
	req.type = WINDOW_SINGLE;
	req.head = &ep->rma_info.remote_reg_list;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	if ((err = micscif_query_window(&req))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	/* Allocate scif_range */
	if (!(*pages = kzalloc(sizeof(struct scif_range), GFP_KERNEL))) {

	/* Allocate phys addr array */
	if (!((*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)))) {

	/* Allocate virtual address array */
	if (!((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)))) {

	/* Populate the values */
	(*pages)->cookie = window;
	(*pages)->nr_pages = nr_pages;
	(*pages)->prot_flags = window->prot;

	for (i = 0; i < nr_pages; i++) {
#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
		is_self_scifdev(ep->remote_dev) ?
			micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
					     NULL, NULL, NULL) : window->phys_addr[i];

			get_phys_addr(micscif_get_dma_addr(window,
					offset + (i * PAGE_SIZE),
					NULL, NULL, NULL), ep->remote_dev);

		if (!is_self_scifdev(ep->remote_dev))
			get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.va +
			get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.pa;

	window->get_put_ref_count += nr_pages;
	get_window_ref_count(window, nr_pages);

	mutex_unlock(&ep->rma_info.rma_lock);
	scif_free((*pages)->phys_addr, nr_pages * sizeof(dma_addr_t));
	scif_free((*pages)->va, nr_pages * sizeof(void *));
	printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

	micscif_create_node_dep(ep->remote_dev, nr_pages);

scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
{
	ret = __scif_get_pages(epd, offset, len, pages);

EXPORT_SYMBOL(scif_get_pages);
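/*
 * Usage sketch (illustrative only): taking and dropping references on a
 * peer's registered window. peer_offset is a hypothetical name for an
 * offset the peer actually registered; the length is an example value.
 *
 *	struct scif_range *range;
 *
 *	if ((err = scif_get_pages(epd, peer_offset, PAGE_SIZE, &range)) < 0)
 *		return err;
 *	// ... use range->phys_addr[] / range->va[] ...
 *	scif_put_pages(range);
 */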
/*
 * scif_put_pages - Remove references from remote registered pages
 *
 * scif_put_pages() releases a scif_range structure previously obtained by
 * calling scif_get_pages(). When control returns, the physical pages may
 * become available for reuse if and when the window which represented
 * those pages is unregistered. Therefore, those pages must never be accessed.
 *
 * Upon success, zero is returned;
 * else an apt error is returned as documented in scif.h.
 */
__scif_put_pages(struct scif_range *pages)
{
	struct reg_range_t *window;

	if (!pages || !pages->cookie)

	if (!window || window->magic != SCIFEP_MAGIC ||
	    !window->get_put_ref_count)

	ep = (struct endpt *)window->ep;

	/*
	 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
	 * callee should be allowed to release references to the pages;
	 * else the endpoint was not connected in the first place.
	 */
	if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)

	/*
	 * TODO: Re-enable this check once ref counts for kernel mode APIs
	 * have been implemented and node remove callbacks are called before
	 * the node is removed. This check results in kernel mode APIs not
	 * being able to release pages correctly since node remove callbacks
	 * are currently called after the node is removed.
	 *
	 * if (!scifdev_alive(ep))
	 */
	micscif_inc_node_refcnt(ep->remote_dev, 1);
	mutex_lock(&ep->rma_info.rma_lock);

	/* Decrement the ref counts and check for errors */
	window->get_put_ref_count -= pages->nr_pages;
	BUG_ON(window->get_put_ref_count < 0);
	put_window_ref_count(window, pages->nr_pages);

	/* Initiate window destruction if ref count is zero */
	if (!window->ref_count) {
		drain_dma_intr(ep->rma_info.dma_chan);
		/* Inform the peer about this window being destroyed. */
		msg.payload[0] = window->peer_window;
		/* No error handling for notification messages */
		micscif_nodeqp_send(ep->remote_dev, &msg, ep);
		list_del(&window->list_member);
		/* Destroy this window from the peer's registered AS */
		micscif_destroy_remote_window(ep, window);

	mutex_unlock(&ep->rma_info.rma_lock);

	micscif_dec_node_refcnt(ep->remote_dev, 1);
	micscif_destroy_node_dep(ep->remote_dev, pages->nr_pages);
	scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
	scif_free(pages->va, pages->nr_pages * sizeof(void *));
scif_put_pages(struct scif_range *pages)
{
	struct reg_range_t *window = pages->cookie;
	struct endpt *ep = (struct endpt *)window->ep;

	if (atomic_read(&(&(&(ep->ref_count))->refcount)->refs) > 0) {
		kref_get(&(ep->ref_count));

	ret = __scif_put_pages(pages);

	if (atomic_read(&(&(&(ep->ref_count))->refcount)->refs) > 0) {
		kref_put(&(ep->ref_count), scif_ref_rel);

EXPORT_SYMBOL(scif_put_pages);
int scif_event_register(scif_callback_t handler)
{
	/* Add to the list of event handlers */
	struct scif_callback *cb = kmalloc(sizeof(*cb), GFP_KERNEL);

	mutex_lock(&ms_info.mi_event_cblock);
	cb->callback_handler = handler;
	list_add_tail(&cb->list_member, &ms_info.mi_event_cb);
	mutex_unlock(&ms_info.mi_event_cblock);

EXPORT_SYMBOL(scif_event_register);

int scif_event_unregister(scif_callback_t handler)
{
	struct list_head *pos, *unused;
	struct scif_callback *temp;

	mutex_lock(&ms_info.mi_event_cblock);
	list_for_each_safe(pos, unused, &ms_info.mi_event_cb) {
		temp = list_entry(pos, struct scif_callback, list_member);
		if (temp->callback_handler == handler) {

	mutex_unlock(&ms_info.mi_event_cblock);

EXPORT_SYMBOL(scif_event_unregister);
/*
 * scif_register - Mark a memory region for remote access.
 * @epd: endpoint descriptor
 * @addr: starting virtual address
 * @len: length of the range to register
 * @offset: offset of window
 * @prot: read/write protection
 * @map_flags: map flags
 *
 * Upon successful completion, scif_register() returns the offset
 * at which the mapping was placed; else an apt error is returned
 * as documented in scif.h.
 */
__scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
		int prot, int map_flags)
{
	scif_pinned_pages_t pinned_pages;
	struct endpt *ep = (struct endpt *)epd;
	uint64_t computed_offset;
	struct reg_range_t *window;
	struct mm_struct *mm = NULL;

	pr_debug("SCIFAPI register: ep %p %s addr %p len 0x%lx"
		 " offset 0x%lx prot 0x%x map_flags 0x%x\n",
		 epd, scif_ep_states[epd->state], addr, len, offset, prot, map_flags);

	if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL))

	/* Unsupported protection requested */
	if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))

	/* addr/len must be page aligned. len should be non zero */
	if ((align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
	    (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
	/*
	 * Offset is not page aligned/negative or offset+len
	 * wraps around with SCIF_MAP_FIXED.
	 */
	if ((map_flags & SCIF_MAP_FIXED) &&
	    ((align_low(offset, PAGE_SIZE) != offset) ||
	     (offset + (off_t)len < offset)))

	if ((err = verify_epd(ep)))

	/* Compute the offset for this registration */
	if ((err = micscif_get_window_offset(ep, map_flags, offset,

	/* Allocate and prepare self registration window */
	if (!(window = micscif_create_window(ep, len >> PAGE_SHIFT,
					     computed_offset, false))) {
		micscif_free_window_offset(ep, computed_offset, len);

	micscif_inc_node_refcnt(ep->remote_dev, 1);

	window->nr_pages = len >> PAGE_SHIFT;

	if ((err = micscif_send_alloc_request(ep, window))) {
		micscif_destroy_incomplete_window(ep, window);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
if (!(map_flags
& SCIF_MAP_KERNEL
)) {
mm
= __scif_acquire_mm();
map_flags
|= SCIF_MAP_ULIMIT
;
if ((err
= scif_pin_pages(addr
, len
, prot
,
map_flags
& (SCIF_MAP_KERNEL
| SCIF_MAP_ULIMIT
),
micscif_destroy_incomplete_window(ep
, window
);
micscif_dec_node_refcnt(ep
->remote_dev
, 1);
window
->pinned_pages
= pinned_pages
;
window
->nr_contig_chunks
= pinned_pages
->nr_contig_chunks
;
window
->prot
= pinned_pages
->prot
;
/* Prepare the remote registration window */
if ((err
= micscif_prep_remote_window(ep
, window
))) {
micscif_dec_node_refcnt(ep
->remote_dev
, 1);
micscif_set_nr_pages(ep
->remote_dev
, window
);
printk(KERN_ERR
"%s %d err %ld\n", __func__
, __LINE__
, err
);
/* Tell the peer about the new window */
if ((err
= micscif_send_scif_register(ep
, window
))) {
micscif_dec_node_refcnt(ep
->remote_dev
, 1);
printk(KERN_ERR
"%s %d err %ld\n", __func__
, __LINE__
, err
);
micscif_dec_node_refcnt(ep
->remote_dev
, 1);
/* No further failures expected. Insert new window */
mutex_lock(&ep
->rma_info
.rma_lock
);
set_window_ref_count(window
, pinned_pages
->nr_pages
);
micscif_insert_window(window
, &ep
->rma_info
.reg_list
);
mutex_unlock(&ep
->rma_info
.rma_lock
);
pr_debug("SCIFAPI register: ep %p %s addr %p"
" len 0x%lx computed_offset 0x%llx\n",
epd
, scif_ep_states
[epd
->state
], addr
, len
, computed_offset
);
micscif_destroy_window(ep
, window
);
printk(KERN_ERR
"%s %d err %ld\n", __func__
, __LINE__
, err
);
off_t
scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
		int prot, int map_flags)
{
	off_t ret;

	ret = __scif_register(epd, addr, len, offset, prot, map_flags);
	return ret;
}
EXPORT_SYMBOL(scif_register);

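/*
 * Example (illustrative sketch): registering one kernel page for remote
 * access; "epd" is assumed to be a connected endpoint.
 *
 *	void *buf = (void *)__get_free_page(GFP_KERNEL);
 *	off_t off = scif_register(epd, buf, PAGE_SIZE, 0,
 *			SCIF_PROT_READ | SCIF_PROT_WRITE, SCIF_MAP_KERNEL);
 *	if (off < 0)
 *		printk(KERN_ERR "scif_register failed %ld\n", off);
 */
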
/**
 * scif_unregister - Release a memory region registered for remote access.
 * @epd: endpoint descriptor
 * @offset: start of range to unregister
 * @len: length of range to unregister
 *
 * Upon successful completion, scif_unregister() returns zero;
 * otherwise an apt error is returned as documented in scif.h.
 */
int
__scif_unregister(scif_epd_t epd, off_t offset, size_t len)
{
	struct endpt *ep = (struct endpt *)epd;
	struct reg_range_t *window = NULL;
	struct micscif_rma_req req;
	int nr_pages, err;

	pr_debug("SCIFAPI unregister: ep %p %s offset 0x%lx len 0x%lx\n",
		ep, scif_ep_states[ep->state], offset, len);

	/* len must be page aligned. len should be non zero */
	if ((!len) ||
		(align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
		return -EINVAL;

	/* Offset is not page aligned or offset+len wraps around */
	if ((align_low(offset, PAGE_SIZE) != offset) ||
		(offset + (off_t)len < offset))
		return -EINVAL;

	if ((err = verify_epd(ep)))
		return err;

	nr_pages = (int)(len >> PAGE_SHIFT);

	req.out_window = &window;
	req.offset = offset;
	req.prot = 0;
	req.nr_bytes = len;
	req.type = WINDOW_FULL;
	req.head = &ep->rma_info.reg_list;

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	if ((err = micscif_query_window(&req))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto error;
	}
	/* Unregister all the windows in this range */
	if ((err = micscif_rma_list_unregister(window, offset, nr_pages)))
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	return err;
}

int
scif_unregister(scif_epd_t epd, off_t offset, size_t len)
{
	int ret;

	ret = __scif_unregister(epd, offset, len);
	return ret;
}
EXPORT_SYMBOL(scif_unregister);

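/*
 * Example (illustrative sketch): tearing down the window created in the
 * scif_register() example above, where "off" is the offset it returned.
 *
 *	err = scif_unregister(epd, off, PAGE_SIZE);
 *	if (err)
 *		printk(KERN_ERR "scif_unregister failed %d\n", err);
 *	free_page((unsigned long)buf);
 */
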
unsigned int
scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd)
{
	unsigned int ret;

	ret = __scif_pollfd(f, wait, (struct endpt *)epd);
	return ret;
}

unsigned int
__scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep)
{
	unsigned int mask = 0;
	unsigned long sflags;

	pr_debug("SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]);

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	spin_lock_irqsave(&ep->lock, sflags);

	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
#else
		if (!wait || wait->key & SCIF_POLLOUT) {
#endif
			poll_wait(f, &ep->conn_pend_wq, wait);
			if (ep->state == SCIFEP_CONNECTED ||
				ep->state == SCIFEP_DISCONNECTED ||
				ep->conn_err)
				mask |= SCIF_POLLOUT;
			goto return_scif_poll;
		}
	}

	/* Is it OK to use wait->key?? */
	if (ep->state == SCIFEP_LISTENING) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
#else
		if (!wait || wait->key & SCIF_POLLIN) {
#endif
			spin_unlock_irqrestore(&ep->lock, sflags);
			poll_wait(f, &ep->conwq, wait);
			spin_lock_irqsave(&ep->lock, sflags);
			if (ep->conreqcnt)
				mask |= SCIF_POLLIN;
		}
		goto return_scif_poll;
	}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
#else
	if (!wait || wait->key & SCIF_POLLIN) {
#endif
		if (ep->state != SCIFEP_CONNECTED &&
			ep->state != SCIFEP_LISTENING &&
			ep->state != SCIFEP_DISCONNECTED) {
			mask |= SCIF_POLLERR;
			goto return_scif_poll;
		}
		spin_unlock_irqrestore(&ep->lock, sflags);
		poll_wait(f, &ep->recvwq, wait);
		spin_lock_irqsave(&ep->lock, sflags);
		if (micscif_rb_count(&ep->qp_info.qp->inbound_q, 1))
			mask |= SCIF_POLLIN;
	}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
#else
	if (!wait || wait->key & SCIF_POLLOUT) {
#endif
		if (ep->state != SCIFEP_CONNECTED &&
			ep->state != SCIFEP_LISTENING) {
			mask |= SCIF_POLLERR;
			goto return_scif_poll;
		}
		spin_unlock_irqrestore(&ep->lock, sflags);
		poll_wait(f, &ep->sendwq, wait);
		spin_lock_irqsave(&ep->lock, sflags);
		if (micscif_rb_space(&ep->qp_info.qp->outbound_q))
			mask |= SCIF_POLLOUT;
	}

return_scif_poll:
	/*
	 * If the endpoint is in the disconnected state then return hangup
	 * instead of error.
	 */
	if (ep->state == SCIFEP_DISCONNECTED) {
		mask &= ~SCIF_POLLERR;
		mask |= SCIF_POLLHUP;
	}

	spin_unlock_irqrestore(&ep->lock, sflags);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	return mask;
}

/*
 * The private data field of each VMA used to mmap a remote window
 * points to an instance of struct vma_pvt.
 */
struct vma_pvt {
	struct endpt *ep;	/* End point for remote window */
	uint64_t offset;	/* offset within remote window */
	bool valid_offset;	/* offset is valid only if the original
				 * mmap request was for a single page
				 * else the offset within the vma is
				 * used instead
				 */
	struct kref ref;
};

static void vma_pvt_release(struct kref *ref)
{
	struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
	kfree(vmapvt);
}

/**
 * scif_vma_open - VMA open driver callback
 * @vma: VMM memory area.
 *
 * The open method is called by the kernel to allow the subsystem implementing
 * the VMA to initialize the area. This method is invoked any time a new
 * reference to the VMA is made (when a process forks, for example).
 * The one exception happens when the VMA is first created by mmap;
 * in this case, the driver's mmap method is called instead.
 * This function is also invoked when an existing VMA is split by the kernel
 * due to a call to munmap on a subset of the VMA resulting in two VMAs.
 * The kernel invokes this function only on one of the two VMAs.
 */
static void scif_vma_open(struct vm_area_struct *vma)
{
	struct vma_pvt *vmapvt = vma->vm_private_data;

	pr_debug("SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
		vma->vm_start, vma->vm_end);
	/* A new reference to the VMA has been made */
	kref_get(&vmapvt->ref);
}

/**
 * scif_munmap - VMA close driver callback.
 * @vma: VMM memory area.
 *
 * When an area is destroyed, the kernel calls its close operation.
 * Note that there's no usage count associated with VMA's; the area
 * is opened and closed exactly once by each process that uses it.
 */
void scif_munmap(struct vm_area_struct *vma)
{
	struct endpt *ep;
	struct vma_pvt *vmapvt = vma->vm_private_data;
	int nr_pages = (int)((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
	uint64_t offset;
	struct micscif_rma_req req;
	struct reg_range_t *window = NULL;
	int err;

	pr_debug("SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
		vma->vm_start, vma->vm_end);
	/* used to be a BUG_ON(), prefer keeping the kernel alive */
	if (!vmapvt) {
		printk(KERN_ERR "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
			vma->vm_start, vma->vm_end);
		return;
	}

	ep = vmapvt->ep;
	offset = vmapvt->valid_offset ? vmapvt->offset :
		vma->vm_pgoff << PAGE_SHIFT;
	pr_debug("SCIFAPI munmap: ep %p %s nr_pages 0x%x offset 0x%llx\n",
		ep, scif_ep_states[ep->state], nr_pages, offset);

	req.out_window = &window;
	req.offset = offset;
	req.nr_bytes = vma->vm_end - vma->vm_start;
	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
	req.type = WINDOW_PARTIAL;
	req.head = &ep->rma_info.remote_reg_list;

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	mutex_lock(&ep->rma_info.rma_lock);

	if ((err = micscif_query_window(&req)))
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	else
		micscif_rma_list_munmap(window, offset, nr_pages);

	mutex_unlock(&ep->rma_info.rma_lock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);

	micscif_destroy_node_dep(ep->remote_dev, nr_pages);

	/*
	 * The kernel probably zeroes these out but we still want
	 * to clean up our own mess just in case.
	 */
	vma->vm_private_data = NULL;
	kref_put(&vmapvt->ref, vma_pvt_release);
	micscif_rma_put_task(ep, nr_pages);
}

static const struct vm_operations_struct micscif_vm_ops = {
	.open = scif_vma_open,
	.close = scif_munmap,
};

/**
 * scif_mmap - Map pages in virtual address space to a remote window.
 * @vma: VMM memory area.
 * @epd: endpoint descriptor
 *
 * Upon successful completion, scif_mmap() returns zero;
 * otherwise an apt error is returned as documented in scif.h.
 */
int
scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
{
	struct micscif_rma_req req;
	struct reg_range_t *window = NULL;
	struct endpt *ep = (struct endpt *)epd;
	uint64_t start_offset = vma->vm_pgoff << PAGE_SHIFT;
	int nr_pages = (int)((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
	struct vma_pvt *vmapvt;
	int err;

	pr_debug("SCIFAPI mmap: ep %p %s start_offset 0x%llx nr_pages 0x%x\n",
		ep, scif_ep_states[ep->state], start_offset, nr_pages);

	if ((err = verify_epd(ep)))
		return err;

	if ((err = micscif_rma_get_task(ep, nr_pages)))
		return err;

	if (!(vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL))) {
		micscif_rma_put_task(ep, nr_pages);
		return -ENOMEM;
	}

	vmapvt->ep = ep;
	kref_init(&vmapvt->ref);

	micscif_create_node_dep(ep->remote_dev, nr_pages);

	req.out_window = &window;
	req.offset = start_offset;
	req.nr_bytes = vma->vm_end - vma->vm_start;
	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
	req.type = WINDOW_PARTIAL;
	req.head = &ep->rma_info.remote_reg_list;

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	if ((err = micscif_query_window(&req))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto error_unlock;
	}

	/* Default prot for loopback; noncached on card, writecombine on host */
	if (!is_self_scifdev(ep->remote_dev)) {
#ifdef _MIC_SCIF_
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
#else
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
#endif
	}

	/*
	 * VM_DONTCOPY - Do not copy this vma on fork
	 * VM_DONTEXPAND - Cannot expand with mremap()
	 * VM_RESERVED - Count as reserved_vm like IO
	 * VM_PFNMAP - Page-ranges managed without "struct page"
	 * VM_IO - Memory mapped I/O or similar
	 *
	 * We do not want to copy this VMA automatically on a fork(),
	 * expand this VMA due to mremap() or swap out these pages since
	 * the VMA is actually backed by physical pages in the remote
	 * node's physical memory and not via a struct page.
	 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP | VM_PFNMAP;
#else
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP;
#endif

	if (!is_self_scifdev(ep->remote_dev))
		vma->vm_flags |= VM_IO;

	/* Map this range of windows */
	if ((err = micscif_rma_list_mmap(window,
			start_offset, nr_pages, vma))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto error_unlock;
	}
	/* Set up the driver call back */
	vma->vm_ops = &micscif_vm_ops;
	vma->vm_private_data = vmapvt;
	/*
	 * For 1 page sized VMAs the kernel (remap_pfn_range) replaces the
	 * offset in the VMA with the pfn, so in that case save off the
	 * original offset, since the page sized VMA can't be split into
	 * smaller VMAs the offset is not going to change.
	 */
	if (nr_pages == 1) {
		vmapvt->offset = start_offset;
		vmapvt->valid_offset = true;
	}
	err = 0;
error_unlock:
	mutex_unlock(&ep->rma_info.rma_lock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	if (err) {
		micscif_destroy_node_dep(ep->remote_dev, nr_pages);
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		micscif_rma_put_task(ep, nr_pages);
		kfree(vmapvt);
	}
	return err;
}

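/*
 * Example (illustrative sketch): from user space a peer window is
 * reached with a plain mmap() on the endpoint file descriptor, which
 * ends up in scif_mmap() above; "scif_fd" and "roff" are assumptions
 * for the descriptor and the registered offset.
 *
 *	void *va = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, scif_fd, roff);
 */
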
/**
 * scif_readfrom() - Read SCIF offset data from remote connection
 * @epd: endpoint descriptor
 * @loffset: offset in local registered address space to which to copy
 * @len: length of range to copy
 * @roffset: offset in remote registered address space from which to copy
 * @flags: flags
 *
 * Upon successful completion, scif_readfrom() returns zero;
 * otherwise an apt error is returned as documented in scif.h.
 */
int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
		off_t roffset, int flags)
{
	int ret;

	ret = __scif_readfrom(epd, loffset, len, roffset, flags);
	return ret;
}
EXPORT_SYMBOL(scif_readfrom);

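/*
 * Example (illustrative sketch): pull one page of the peer's registered
 * address space into the local one; "loff" and "roff" are assumed to
 * fall inside windows registered on each side.
 *
 *	err = scif_readfrom(epd, loff, PAGE_SIZE, roff, 0);
 *	if (err)
 *		printk(KERN_ERR "scif_readfrom failed %d\n", err);
 */
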
/**
 * scif_writeto() - Send SCIF offset data to remote connection
 * @epd: endpoint descriptor
 * @loffset: offset in local registered address space from which to copy
 * @len: length of range to copy
 * @roffset: offset in remote registered address space to which to copy
 * @flags: flags
 *
 * Upon successful completion, scif_writeto() returns zero;
 * otherwise an apt error is returned as documented in scif.h.
 */
int scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
		off_t roffset, int flags)
{
	int ret;

	ret = __scif_writeto(epd, loffset, len, roffset, flags);
	return ret;
}
EXPORT_SYMBOL(scif_writeto);

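/*
 * Example (illustrative sketch): the mirror of the scif_readfrom()
 * example, pushing a page to the peer; SCIF_RMA_USECPU requests a CPU
 * copy instead of DMA.
 *
 *	err = scif_writeto(epd, loff, PAGE_SIZE, roff, SCIF_RMA_USECPU);
 */
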
#define HOST_LOOPB_MAGIC_MARK 0xdead

/**
 * scif_fence_mark - Mark the current set of uncompleted RMAs
 * @epd: endpoint descriptor
 * @flags: control flags
 * @mark: marked handle returned as output.
 *
 * scif_fence_mark() returns after marking the current set of all uncompleted
 * RMAs initiated through the endpoint epd or marking the current set of all
 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
 * marked with a value returned in mark. The application may subsequently
 * await completion of all RMAs so marked.
 *
 * Upon successful completion, scif_fence_mark() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
int __scif_fence_mark(scif_epd_t epd, int flags, int *mark)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;

	pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x\n",
		ep, scif_ep_states[ep->state], flags, *mark);

	if ((err = verify_epd(ep)))
		return err;

	/* Unsupported flags */
	if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))
		return -EINVAL;

	/* At least one of init self or peer RMA should be set */
	if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
		return -EINVAL;

	/* Exactly one of init self or peer RMA should be set but not both */
	if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
		return -EINVAL;

	/*
	 * Host Loopback does not need to use DMA.
	 * Return a valid mark to be symmetric.
	 */
	if (is_self_scifdev(ep->remote_dev)) {
		*mark = HOST_LOOPB_MAGIC_MARK;
		return 0;
	}

	if (flags & SCIF_FENCE_INIT_SELF) {
		if ((*mark = micscif_fence_mark(epd)) < 0)
			err = *mark;
	} else {
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		err = micscif_send_fence_mark(ep, mark);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
	}
	if (err)
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

	pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x err %d\n",
		ep, scif_ep_states[ep->state], flags, *mark, err);
	return err;
}

int scif_fence_mark(scif_epd_t epd, int flags, int *mark)
{
	int ret;

	ret = __scif_fence_mark(epd, flags, mark);
	return ret;
}
EXPORT_SYMBOL(scif_fence_mark);

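/*
 * Example (illustrative sketch): obtain a mark covering the RMAs
 * initiated locally through "epd" so far.
 *
 *	int mark;
 *	err = scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark);
 */
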
/**
 * scif_fence_wait - Wait for completion of marked RMAs
 * @epd: endpoint descriptor
 * @mark: mark request
 *
 * scif_fence_wait() returns after all RMAs marked with mark have completed.
 * Upon successful completion, scif_fence_wait() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
int __scif_fence_wait(scif_epd_t epd, int mark)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;

	pr_debug("SCIFAPI fence_wait: ep %p %s mark 0x%x\n",
		ep, scif_ep_states[ep->state], mark);

	if ((err = verify_epd(ep)))
		return err;

	/*
	 * Host Loopback does not need to use DMA.
	 * The only valid mark provided is 0 so simply
	 * return success if the mark is valid.
	 */
	if (is_self_scifdev(ep->remote_dev)) {
		if (HOST_LOOPB_MAGIC_MARK == mark)
			return 0;
		else
			return -EINVAL;
	}

	if (mark & SCIF_REMOTE_FENCE) {
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		err = micscif_send_fence_wait(epd, mark);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
	} else {
		err = dma_mark_wait(epd->rma_info.dma_chan, mark, true);
		if (!err && atomic_read(&ep->rma_info.tw_refcount))
			queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
	}

	if (err)
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	return err;
}

int scif_fence_wait(scif_epd_t epd, int mark)
{
	int ret;

	ret = __scif_fence_wait(epd, mark);
	return ret;
}
EXPORT_SYMBOL(scif_fence_wait);

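/*
 * Example (illustrative sketch): wait for the set of RMAs marked in the
 * scif_fence_mark() example above to complete.
 *
 *	err = scif_fence_wait(epd, mark);
 *	if (err)
 *		printk(KERN_ERR "scif_fence_wait failed %d\n", err);
 */
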
/**
 * scif_fence_signal - Request a signal on completion of RMAs
 * @epd: endpoint descriptor
 * @loff: local offset to write lval to
 * @lval: local value to write to loffset
 * @roff: remote offset to write rval to
 * @rval: remote value to write to roffset
 * @flags: flags
 *
 * scif_fence_signal() returns after marking the current set of all
 * uncompleted RMAs initiated through the endpoint epd or marking
 * the current set of all uncompleted RMAs initiated through the peer
 * of endpoint epd.
 *
 * Upon successful completion, scif_fence_signal() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
		off_t roff, uint64_t rval, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;

	pr_debug("SCIFAPI fence_signal: ep %p %s loff 0x%lx lval 0x%llx "
		"roff 0x%lx rval 0x%llx flags 0x%x\n",
		ep, scif_ep_states[ep->state], loff, lval, roff, rval, flags);

	if ((err = verify_epd(ep)))
		return err;

	/* Unsupported flags */
	if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER |
			SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))
		return -EINVAL;

	/* At least one of init self or peer RMA should be set */
	if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
		return -EINVAL;

	/* Exactly one of init self or peer RMA should be set but not both */
	if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
		return -EINVAL;

	/* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */
	if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)))
		return -EINVAL;

	/* Only Dword aligned offsets allowed */
	if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(uint32_t) - 1)))
		return -EINVAL;

	/* Only Dword aligned offsets allowed */
	if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(uint32_t) - 1)))
		return -EINVAL;

	if (flags & SCIF_FENCE_INIT_PEER) {
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		err = micscif_send_fence_signal(epd, roff,
			rval, loff, lval, flags);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
	} else {
		/* Local Signal in Local RAS */
		if (flags & SCIF_SIGNAL_LOCAL)
			if ((err = micscif_prog_signal(epd, loff,
					lval, RMA_WINDOW_SELF)))
				goto error_ret;

		/* Signal in Remote RAS */
		if (flags & SCIF_SIGNAL_REMOTE) {
			micscif_inc_node_refcnt(ep->remote_dev, 1);
			err = micscif_prog_signal(epd, roff,
				rval, RMA_WINDOW_PEER);
			micscif_dec_node_refcnt(ep->remote_dev, 1);
		}
	}
error_ret:
	if (err)
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	else if (atomic_read(&ep->rma_info.tw_refcount))
		queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
	return err;
}

int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
		off_t roff, uint64_t rval, int flags)
{
	int ret;

	ret = __scif_fence_signal(epd, loff, lval, roff, rval, flags);
	return ret;
}
EXPORT_SYMBOL(scif_fence_signal);

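/*
 * Example (illustrative sketch): write the value 1 to dword aligned
 * local offset "loff" once all RMAs initiated through "epd" complete.
 *
 *	err = scif_fence_signal(epd, loff, 1, 0, 0,
 *			SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL);
 */
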
/**
 * scif_get_nodeIDs - Return information about online nodes
 * @nodes: array space reserved for returning online node IDs
 * @len: number of entries on the nodes array
 * @self: address to place the node ID of this system
 *
 * scif_get_nodeIDs() returns the total number of scif nodes
 * (including host) in the system.
 */
int
scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self)
{
	int online = 0;
	int offset = 0;
	int node;

	*self = ms_info.mi_nodeid;
	mutex_lock(&ms_info.mi_conflock);
	len = SCIF_MIN(len, (int32_t)ms_info.mi_total);
	for (node = 0; node <= (int32_t)ms_info.mi_maxid; node++) {
		if (ms_info.mi_mask & (1UL << node)) {
			online++;
			if (offset < len)
				nodes[offset++] = node;
		}
	}
	pr_debug("SCIFAPI get_nodeIDs total %d online %d filled in %d nodes\n",
		ms_info.mi_total, online, len);
	mutex_unlock(&ms_info.mi_conflock);

	return online;
}
EXPORT_SYMBOL(scif_get_nodeIDs);

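/*
 * Example (illustrative sketch): enumerate the online nodes.
 *
 *	uint16_t self, nodes[32];
 *	int online = scif_get_nodeIDs(nodes, 32, &self);
 *	printk(KERN_INFO "%d SCIF nodes online, self %d\n", online, self);
 */
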
/*
 * Return the pci_dev associated with a node.
 */
int micscif_pci_dev(uint16_t node, struct pci_dev **pdev)
{
#ifdef _MIC_SCIF_
	/* This *is* a PCI device, therefore no pdev to return. */
	return -ENODEV;
#else
	mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);

	*pdev = mic_ctx->bi_pdev;
	return 0;
#endif
}

/*
 * Populate the pci device info pointer associated with a node.
 */
int micscif_pci_info(uint16_t node, struct scif_pci_info *dev)
{
	int i;
	struct pci_dev *pdev;
	mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);

	if (!mic_ctx)
		return -ENODEV;

	dev->pdev = pdev = mic_ctx->bi_pdev;
	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
		if (!pci_resource_start(pdev, i)) {
			dev->va[i] = NULL;
			continue;
		}
		if (pci_resource_flags(pdev, i) & IORESOURCE_PREFETCH) {
			/* TODO: Change comparison check for KNL. */
			if (pci_resource_start(pdev, i) == mic_ctx->aper.pa)
				dev->va[i] = mic_ctx->aper.va;
			else
				dev->va[i] = NULL;
		} else {
			dev->va[i] = mic_ctx->mmio.va;
		}
	}
	return 0;
}

/**
 * scif_pci_info - Populate the pci device info pointer associated with a node
 * @node: the node to query
 * @dev: The scif_pci_info structure to populate.
 *
 * scif_pci_info() populates the provided scif_pci_info structure
 * associated with a node. The requested node ID cannot be the same as
 * the current node. This routine may only return success when called from
 * the host.
 *
 * Upon successful completion, scif_pci_info() returns 0; otherwise
 * an appropriate error is returned as documented in scif.h.
 */
int scif_pci_info(uint16_t node, struct scif_pci_info *dev)
{
	if (node > ms_info.mi_maxid)
		return -EINVAL;

	if ((scif_dev[node].sd_state == SCIFDEV_NOTPRESENT) ||
	    is_self_scifdev(&scif_dev[node]))
		return -ENODEV;

	return micscif_pci_info(node, dev);
}
EXPORT_SYMBOL(scif_pci_info);

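/*
 * Example (illustrative sketch): from the host, look up the PCI info of
 * node 1 and use the populated pci_dev; which va[] slot holds the
 * aperture depends on the BAR layout.
 *
 *	struct scif_pci_info info;
 *	if (!scif_pci_info(1, &info))
 *		dev_info(&info.pdev->dev, "BAR0 va %p\n", info.va[0]);
 */
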
void print_ep_state(struct endpt *ep, char *label)
{
	if (ep)
		printk("%s: EP %p state %s\n",
			label, ep, scif_ep_states[ep->state]);
	else
		printk("%s: EP %p state ?\n", label, ep);
}