Updated micscif/micscif_rma.c to address changes in callback functions stored in...
[xeon-phi-kernel-module] / micscif / micscif_rma.c
1/*
2 * Copyright 2010-2017 Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2,
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * Disclaimer: The codes contained in these modules may be specific to
14 * the Intel Software Development Platform codenamed Knights Ferry,
15 * and the Intel product codenamed Knights Corner, and are not backward
16 * compatible with other Intel products. Additionally, Intel will NOT
17 * support the codes or instruction set in future products.
18 *
19 * Intel offers no warranty of any kind regarding the code. This code is
20 * licensed on an "AS IS" basis and Intel is not obligated to provide
21 * any support, assistance, installation, training, or other services
22 * of any kind. Intel is also not obligated to provide any updates,
23 * enhancements or extensions. Intel specifically disclaims any warranty
24 * of merchantability, non-infringement, fitness for any particular
25 * purpose, and any other warranty.
26 *
27 * Further, Intel disclaims all liability of any kind, including but
28 * not limited to liability for infringement of any proprietary rights,
29 * relating to the use of the code, even if Intel is notified of the
30 * possibility of such liability. Except as expressly stated in an Intel
31 * license agreement provided with this code and agreed upon with Intel,
32 * no license, express or implied, by estoppel or otherwise, to any
33 * intellectual property rights is granted herein.
34 */
35
36#include "mic/micscif.h"
37#include "mic/micscif_smpt.h"
38#include "mic/micscif_kmem_cache.h"
39#include "mic/micscif_rma_list.h"
40#ifndef _MIC_SCIF_
41#include "mic_common.h"
42#endif
43#include "mic/mic_dma_api.h"
44#include "mic/micscif_map.h"
45
46bool mic_reg_cache_enable = 0;
47
48bool mic_huge_page_enable = 1;
49
50#ifdef _MIC_SCIF_
51mic_dma_handle_t mic_dma_handle;
52#endif
53static inline
54void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
55 struct endpt *ep, bool inrange,
56 uint64_t start, uint64_t len);
57#ifdef CONFIG_MMU_NOTIFIER
58static void scif_mmu_notifier_release(struct mmu_notifier *mn,
59 struct mm_struct *mm);
60static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
61 struct mm_struct *mm,
62 unsigned long start, unsigned long end,
63 bool blockable);
64static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
65 struct mm_struct *mm,
66 unsigned long start, unsigned long end);
67static const struct mmu_notifier_ops scif_mmu_notifier_ops = {
68 .release = scif_mmu_notifier_release,
69 .clear_flush_young = NULL,
70 .change_pte = NULL,/*TODO*/
71 .invalidate_range_start = scif_mmu_notifier_invalidate_range_start,
72 .invalidate_range_end = scif_mmu_notifier_invalidate_range_end};
73
74static void scif_mmu_notifier_release(struct mmu_notifier *mn,
75 struct mm_struct *mm)
76{
77 struct endpt *ep;
78 struct rma_mmu_notifier *mmn;
79 mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
80 ep = mmn->ep;
81 micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
82 pr_debug("%s\n", __func__);
83 return;
84}
85
86static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
87 struct mm_struct *mm,
88 unsigned long start, unsigned long end,
89 bool blockable)
90{
91 struct endpt *ep;
92 struct rma_mmu_notifier *mmn;
93 mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
94 ep = mmn->ep;
95 /*
96 * The kernel file `include/linux/mmu_notifier.h` states the following
97 * regarding the invalidate_range_start() callback:
98 *
99 * If blockable argument is set to false then the callback cannot
100 * sleep and has to return with -EAGAIN. 0 should be returned
101 * otherwise.
102 *
103 * The function called below takes spin_lock_irqsave(). A spinlock does not
104 * sleep, but it can spin-wait, and it is unclear whether that meets the
105 * non-blocking requirement. Returning -EAGAIN would also require knowing
106 * every caller of this callback, and I do not yet have that understanding.
107 *
108 * For now, maintain the original behavior of calling
109 * micscif_rma_destroy_tcw() every time, accepting the spinlock. If this
110 * becomes problematic, either figure out all the code that can call this
111 * function and teach it to understand -EAGAIN, or investigate the `#ifdef
112 * CONFIG_MMU_NOTIFIER`.
113 *
114 * If you ended up here while tracking down a bug and pulling your hair
115 * out, sorry. :-(
116 */
117 micscif_rma_destroy_tcw(mmn, ep, true, (uint64_t)start, (uint64_t)(end - start));
118 pr_debug("%s start=%lx, end=%lx\n", __func__, start, end);
119 return 0;
120}
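/*
 * Sketch of how the blockable contract described above could be honored if
 * the teardown were ever taught to bail out: return -EAGAIN instead of
 * calling micscif_rma_destroy_tcw() when the caller cannot block. Kept under
 * #if 0 because, as noted above, the callers have not been audited.
 */
#if 0
static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end,
						    bool blockable)
{
	struct rma_mmu_notifier *mmn =
		container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);

	if (!blockable)
		return -EAGAIN;	/* caller cannot tolerate any waiting */
	micscif_rma_destroy_tcw(mmn, mmn->ep, true,
				(uint64_t)start, (uint64_t)(end - start));
	return 0;
}
#endif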
121
122static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
123 struct mm_struct *mm,
124 unsigned long start, unsigned long end)
125{
126 /* Nothing to do here, everything needed was done in invalidate_range_start */
127 pr_debug("%s\n", __func__);
128 return;
129}
130#endif
131
132#ifdef CONFIG_MMU_NOTIFIER
133void ep_unregister_mmu_notifier(struct endpt *ep)
134{
135 struct endpt_rma_info *rma = &ep->rma_info;
136 struct rma_mmu_notifier *mmn = NULL;
137 struct list_head *item, *tmp;
138 mutex_lock(&ep->rma_info.mmn_lock);
139 list_for_each_safe(item, tmp, &rma->mmn_list) {
140 mmn = list_entry(item,
141 struct rma_mmu_notifier, list_member);
142 mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm);
143#ifdef RMA_DEBUG
144 BUG_ON(atomic_long_sub_return(1, &ms_info.mmu_notif_cnt) < 0);
145#endif
146 list_del(item);
147 kfree(mmn);
148 }
149 mutex_unlock(&ep->rma_info.mmn_lock);
150}
151
152static void init_mmu_notifier(struct rma_mmu_notifier *mmn, struct mm_struct *mm, struct endpt *ep)
153{
154 mmn->ep = ep;
155 mmn->mm = mm;
156 mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
157 INIT_LIST_HEAD(&mmn->list_member);
158 INIT_LIST_HEAD(&mmn->tc_reg_list);
159}
160
161static struct rma_mmu_notifier *find_mmu_notifier(struct mm_struct *mm, struct endpt_rma_info *rma)
162{
163 struct rma_mmu_notifier *mmn;
164 struct list_head *item;
165 list_for_each(item, &rma->mmn_list) {
166 mmn = list_entry(item,
167 struct rma_mmu_notifier, list_member);
168 if (mmn->mm == mm)
169 return mmn;
170 }
171 return NULL;
172}
173#endif
174
175/**
176 * micscif_rma_ep_init:
177 * @ep: end point
178 *
179 * Initialize RMA per EP data structures.
180 */
181int micscif_rma_ep_init(struct endpt *ep)
182{
183 int ret;
184 struct endpt_rma_info *rma = &ep->rma_info;
185
186 mutex_init (&rma->rma_lock);
187 if ((ret = va_gen_init(&rma->va_gen,
188 VA_GEN_MIN, VA_GEN_RANGE)) < 0)
189 goto init_err;
190 spin_lock_init(&rma->tc_lock);
191 mutex_init (&rma->mmn_lock);
192 mutex_init (&rma->va_lock);
193 INIT_LIST_HEAD(&rma->reg_list);
194 INIT_LIST_HEAD(&rma->remote_reg_list);
195 atomic_set(&rma->tw_refcount, 0);
196 atomic_set(&rma->tw_total_pages, 0);
197 atomic_set(&rma->tcw_refcount, 0);
198 atomic_set(&rma->tcw_total_pages, 0);
199 init_waitqueue_head(&rma->fence_wq);
200 rma->fence_refcount = 0;
201 rma->async_list_del = 0;
202 rma->dma_chan = NULL;
203 INIT_LIST_HEAD(&rma->mmn_list);
204 INIT_LIST_HEAD(&rma->task_list);
205init_err:
206 return ret;
207}
208
209/**
210 * micscif_rma_ep_can_uninit:
211 * @ep: end point
212 *
213 * Returns 1 if an endpoint can be uninitialized and 0 otherwise.
214 */
215int micscif_rma_ep_can_uninit(struct endpt *ep)
216{
217 int ret = 0;
218
219 /* Destroy RMA Info only if both lists are empty */
220 if (list_empty(&ep->rma_info.reg_list) &&
221 list_empty(&ep->rma_info.remote_reg_list) &&
222#ifdef CONFIG_MMU_NOTIFIER
223 list_empty(&ep->rma_info.mmn_list) &&
224#endif
225 !atomic_read(&ep->rma_info.tw_refcount) &&
226 !atomic_read(&ep->rma_info.tcw_refcount))
227 ret = 1;
228 return ret;
229}
230
231#ifdef _MIC_SCIF_
232/**
233 * __micscif_setup_proxy_dma:
234 * @ep: SCIF endpoint descriptor.
235 *
236 * Sets up data structures for P2P Proxy DMAs.
237 */
238static int __micscif_setup_proxy_dma(struct endpt *ep)
239{
240 struct endpt_rma_info *rma = &ep->rma_info;
241 int err = 0;
242 uint64_t *tmp = NULL;
243
244 mutex_lock(&rma->rma_lock);
245 if (is_p2p_scifdev(ep->remote_dev) && !rma->proxy_dma_va) {
246 if (!(tmp = scif_zalloc(PAGE_SIZE))) {
247 err = -ENOMEM;
248 goto error;
249 }
250 if ((err = map_virt_into_aperture(&rma->proxy_dma_phys,
251 tmp,
252 ep->remote_dev, PAGE_SIZE))) {
253 scif_free(tmp, PAGE_SIZE);
254 goto error;
255 }
256 *tmp = OP_IDLE;
257 rma->proxy_dma_va = tmp;
258 }
259error:
260 mutex_unlock(&rma->rma_lock);
261 return err;
262}
263
264static __always_inline int micscif_setup_proxy_dma(struct endpt *ep)
265{
266 if (ep->rma_info.proxy_dma_va)
267 return 0;
268
269 return __micscif_setup_proxy_dma(ep);
270}
271
272/**
273 * micscif_teardown_proxy_dma:
274 * @ep: SCIF endpoint descriptor.
275 *
276 * Tears down data structures setup for P2P Proxy DMAs.
277 */
278void micscif_teardown_proxy_dma(struct endpt *ep)
279{
280 struct endpt_rma_info *rma = &ep->rma_info;
281 mutex_lock(&rma->rma_lock);
282 if (rma->proxy_dma_va) {
283 unmap_from_aperture(rma->proxy_dma_phys, ep->remote_dev, PAGE_SIZE);
284 scif_free(rma->proxy_dma_va, PAGE_SIZE);
285 rma->proxy_dma_va = NULL;
286 }
287 mutex_unlock(&rma->rma_lock);
288}
289
290/**
291 * micscif_proxy_dma:
292 * @epd: SCIF endpoint descriptor.
293 * @work: DMA copy work information.
294 *
295 * This API does the following:
296 * 1) Sends the peer a SCIF Node QP message with the information
297 * required to program a proxy DMA that converts a P2P read into a write,
298 * which initiates a DMA transfer from the peer card to self.
299 * The reason for this special code path is that KNF and KNC P2P read
300 * performance is much lower than P2P write performance on Crown
301 * Pass platforms.
302 * 2) Polls for an update of the known proxy DMA VA to OP_COMPLETED
303 * via a SUD from the peer.
304 */
305static int micscif_proxy_dma(scif_epd_t epd, struct mic_copy_work *work)
306{
307 struct endpt *ep = (struct endpt *)epd;
308 struct nodemsg msg;
309 unsigned long ts = jiffies;
310 struct endpt_rma_info *rma = &ep->rma_info;
311 int err;
312 volatile uint64_t *proxy_dma_va = rma->proxy_dma_va;
313
314 mutex_lock(&ep->rma_info.rma_lock);
315 /*
316 * Bail out if there is a Proxy DMA already in progress
317 * for this endpoint. The caller will fall back on self
318 * DMAs upon an error.
319 */
320 if (*proxy_dma_va != OP_IDLE) {
321 mutex_unlock(&ep->rma_info.rma_lock);
322 err = -EBUSY;
323 goto error;
324 }
325 *proxy_dma_va = OP_IN_PROGRESS;
326 mutex_unlock(&ep->rma_info.rma_lock);
327
328 msg.src = ep->port;
329 msg.uop = work->ordered ? SCIF_PROXY_ORDERED_DMA : SCIF_PROXY_DMA;
330 msg.payload[0] = ep->remote_ep;
331 msg.payload[1] = work->src_offset;
332 msg.payload[2] = work->dst_offset;
333 msg.payload[3] = work->len;
334
335 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
336 goto error_init_va;
337
338 while (*proxy_dma_va != OP_COMPLETED) {
339 schedule();
340 if (time_after(jiffies,
341 ts + NODE_ALIVE_TIMEOUT)) {
342 err = -EBUSY;
343 goto error_init_va;
344 }
345 }
346 err = 0;
347error_init_va:
348 *proxy_dma_va = OP_IDLE;
349error:
350 return err;
351}
352#endif
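/*
 * Typical calling pattern for micscif_proxy_dma(), as used further below in
 * micscif_rma_copy(): attempt the proxy path first and, on any error (e.g.
 * -EBUSY because a proxy DMA is already in flight), simply fall through to
 * the regular self-DMA path. Fragment shown for illustration only.
 */
#if 0
	copy_work.len = len;
	copy_work.src_offset = roffset;
	copy_work.dst_offset = loffset;
	if (!(err = micscif_proxy_dma(epd, &copy_work)))
		return 0;	/* peer completed the transfer */
	/* otherwise fall back to programming the DMA locally */
#endif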
353
354/**
355 * micscif_create_pinned_pages:
356 * @nr_pages: number of pages in window
357 * @prot: read/write protection
358 *
359 * Allocate and prepare a set of pinned pages.
360 */
361struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot)
362{
363 struct scif_pinned_pages *pinned_pages;
364
365 might_sleep();
366 if (!(pinned_pages = scif_zalloc(sizeof(*pinned_pages))))
367 goto error;
368
369 if (!(pinned_pages->pages = scif_zalloc(nr_pages *
370 sizeof(*(pinned_pages->pages)))))
371 goto error_free_pinned_pages;
372
373 if (!(pinned_pages->num_pages = scif_zalloc(nr_pages *
374 sizeof(*(pinned_pages->num_pages)))))
375 goto error_free_pages;
376
377#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
378 if (!(pinned_pages->vma = scif_zalloc(nr_pages *
379 sizeof(*(pinned_pages->vma)))))
380 goto error_free_num_pages;
381#endif
382
383 pinned_pages->prot = prot;
384 pinned_pages->magic = SCIFEP_MAGIC;
385 pinned_pages->nr_contig_chunks = 0;
386 return pinned_pages;
387
388#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
389error_free_num_pages:
390 scif_free(pinned_pages->num_pages,
391 pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages)));
392#endif
393error_free_pages:
394 scif_free(pinned_pages->pages,
395 pinned_pages->nr_pages * sizeof(*(pinned_pages->pages)));
396error_free_pinned_pages:
397 scif_free(pinned_pages, sizeof(*pinned_pages));
398error:
399 return NULL;
400}
401
402/**
403 * micscif_destroy_pinned_pages:
404 * @pinned_pages: A set of pinned pages.
405 *
406 * Deallocate resources for pinned pages.
407 */
408int micscif_destroy_pinned_pages(struct scif_pinned_pages *pinned_pages)
409{
410 int j;
411 int writeable = pinned_pages->prot & SCIF_PROT_WRITE;
412 int kernel = SCIF_MAP_KERNEL & pinned_pages->map_flags;
413
414 for (j = 0; j < pinned_pages->nr_pages; j++) {
415 if (pinned_pages->pages[j]) {
416 if (!kernel) {
417 if (writeable)
418 SetPageDirty(pinned_pages->pages[j]);
419#ifdef RMA_DEBUG
420 BUG_ON(!page_count(pinned_pages->pages[j]));
421 BUG_ON(atomic_long_sub_return(1, &ms_info.rma_pin_cnt) < 0);
422#endif
423 put_page(pinned_pages->pages[j]);
424 }
425 }
426 }
427
428#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
429 scif_free(pinned_pages->vma,
430 pinned_pages->nr_pages * sizeof(*(pinned_pages->vma)));
431#endif
432 scif_free(pinned_pages->pages,
433 pinned_pages->nr_pages * sizeof(*(pinned_pages->pages)));
434 scif_free(pinned_pages->num_pages,
435 pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages)));
436 scif_free(pinned_pages, sizeof(*pinned_pages));
437 return 0;
438}
439
440/*
441 * micscif_create_window:
442 * @ep: end point
443 * @nr_pages: number of pages that will back this window
444 * @offset: offset hint
445 *
446 * Allocate and prepare a self registration window.
447 */
448struct reg_range_t *micscif_create_window(struct endpt *ep,
449 int64_t nr_pages, uint64_t offset, bool temp)
450{
451 struct reg_range_t *window;
452
453 might_sleep();
454 if (!(window = scif_zalloc(sizeof(struct reg_range_t))))
455 goto error;
456
457#ifdef CONFIG_ML1OM
458 if (!temp) {
459 if (!(window->phys_addr = scif_zalloc(nr_pages *
460 sizeof(*(window->phys_addr)))))
461 goto error_free_window;
462
463 if (!(window->temp_phys_addr = scif_zalloc(nr_pages *
464 sizeof(*(window->temp_phys_addr)))))
465 goto error_free_window;
466 }
467#endif
468
469 if (!(window->dma_addr = scif_zalloc(nr_pages *
470 sizeof(*(window->dma_addr)))))
471 goto error_free_window;
472
473 if (!(window->num_pages = scif_zalloc(nr_pages *
474 sizeof(*(window->num_pages)))))
475 goto error_free_window;
476
477 window->offset = offset;
478 window->ep = (uint64_t)ep;
479 window->magic = SCIFEP_MAGIC;
480 window->reg_state = OP_IDLE;
481 init_waitqueue_head(&window->regwq);
482 window->unreg_state = OP_IDLE;
483 init_waitqueue_head(&window->unregwq);
484 INIT_LIST_HEAD(&window->list_member);
485 window->type = RMA_WINDOW_SELF;
486 window->temp = temp;
487#ifdef _MIC_SCIF_
488 micscif_setup_proxy_dma(ep);
489#endif
490 return window;
491
492error_free_window:
493 if (window->dma_addr)
494 scif_free(window->dma_addr, nr_pages * sizeof(*(window->dma_addr)));
495#ifdef CONFIG_ML1OM
496 if (window->temp_phys_addr)
497 scif_free(window->temp_phys_addr, nr_pages * sizeof(*(window->temp_phys_addr)));
498 if (window->phys_addr)
499 scif_free(window->phys_addr, nr_pages * sizeof(*(window->phys_addr)));
500#endif
501 scif_free(window, sizeof(*window));
502error:
503 return NULL;
504}
505
506/**
507 * micscif_destroy_incomplete_window:
508 * @ep: end point
509 * @window: registration window
510 *
511 * Deallocate resources for self window.
512 */
513int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window)
514{
515 int err;
516 int64_t nr_pages = window->nr_pages;
517 struct allocmsg *alloc = &window->alloc_handle;
518 struct nodemsg msg;
519
520 RMA_MAGIC(window);
521retry:
522 err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
523 if (!err && scifdev_alive(ep))
524 goto retry;
525
526 if (OP_COMPLETED == alloc->state) {
527 msg.uop = SCIF_FREE_VIRT;
528 msg.src = ep->port;
529 msg.payload[0] = ep->remote_ep;
530 msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
531 msg.payload[2] = (uint64_t)window;
532 msg.payload[3] = SCIF_REGISTER;
533 micscif_nodeqp_send(ep->remote_dev, &msg, ep);
534 }
535
536 micscif_free_window_offset(ep, window->offset,
537 window->nr_pages << PAGE_SHIFT);
538 if (window->dma_addr)
539 scif_free(window->dma_addr, nr_pages *
540 sizeof(*(window->dma_addr)));
541 if (window->num_pages)
542 scif_free(window->num_pages, nr_pages *
543 sizeof(*(window->num_pages)));
544#ifdef CONFIG_ML1OM
545 if (window->phys_addr)
546 scif_free(window->phys_addr, window->nr_pages *
547 sizeof(*(window->phys_addr)));
548 if (window->temp_phys_addr)
549 scif_free(window->temp_phys_addr, nr_pages *
550 sizeof(*(window->temp_phys_addr)));
551#endif
552 scif_free(window, sizeof(*window));
553 return 0;
554}
555
556/**
557 * micscif_destroy_window:
558 * @ep: end point
559 * @window: registration window
560 *
561 * Deallocate resources for self window.
562 */
563int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window)
564{
565 int j;
566 struct scif_pinned_pages *pinned_pages = window->pinned_pages;
567 int64_t nr_pages = window->nr_pages;
568
569 might_sleep();
570 RMA_MAGIC(window);
571 if (!window->temp && window->mm) {
572 __scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0);
573 __scif_release_mm(window->mm);
574 window->mm = NULL;
575 }
576
577 if (!window->offset_freed)
578 micscif_free_window_offset(ep, window->offset,
579 window->nr_pages << PAGE_SHIFT);
580 for (j = 0; j < window->nr_contig_chunks; j++) {
581 if (window->dma_addr[j]) {
582 unmap_from_aperture(
583 window->dma_addr[j],
584 ep->remote_dev,
585 window->num_pages[j] << PAGE_SHIFT);
586 }
587 }
588
589 /*
590 * Decrement references for this set of pinned pages from
591 * this window.
592 */
593 j = atomic_sub_return((int32_t)pinned_pages->nr_pages,
594 &pinned_pages->ref_count);
595 BUG_ON(j < 0);
596 /*
597 * If the ref count for pinned_pages is zero then someone
598 * has already called scif_unpin_pages() for it and we should
599 * destroy the page cache.
600 */
601 if (!j)
602 micscif_destroy_pinned_pages(window->pinned_pages);
603 if (window->dma_addr)
604 scif_free(window->dma_addr, nr_pages *
605 sizeof(*(window->dma_addr)));
606 if (window->num_pages)
607 scif_free(window->num_pages, nr_pages *
608 sizeof(*(window->num_pages)));
609#ifdef CONFIG_ML1OM
610 if (window->phys_addr)
611 scif_free(window->phys_addr, window->nr_pages *
612 sizeof(*(window->phys_addr)));
613 if (window->temp_phys_addr)
614 scif_free(window->temp_phys_addr, nr_pages *
615 sizeof(*(window->temp_phys_addr)));
616#endif
617 window->magic = 0;
618 scif_free(window, sizeof(*window));
619 return 0;
620}
621
622/**
623 * micscif_create_remote_lookup:
624 * @ep: end point
625 * @window: remote window
626 *
627 * Allocate and prepare lookup entries for the remote
628 * end to copy over the physical addresses.
629 * Returns 0 on success and appropriate errno on failure.
630 */
631int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window)
632{
633 int i, j, err = 0;
634 int64_t nr_pages = window->nr_pages;
635 bool vmalloc_dma_phys;
636#ifdef CONFIG_ML1OM
637 bool vmalloc_temp_phys = false;
638 bool vmalloc_phys = false;
639#endif
640 might_sleep();
641
642 /* Map window */
643 err = map_virt_into_aperture(&window->mapped_offset,
644 window, ep->remote_dev, sizeof(*window));
645 if (err)
646 goto error_window;
647
648 /* Compute the number of lookup entries. 21 == 2MB Shift */
649 window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE,
650 ((2) * 1024 * 1024)) >> 21;
651
652 if (!(window->dma_addr_lookup.lookup =
653 scif_zalloc(window->nr_lookup *
654 sizeof(*(window->dma_addr_lookup.lookup)))))
655 goto error_window;
656
657 /* Map DMA physical address lookup array */
658 err = map_virt_into_aperture(&window->dma_addr_lookup.offset,
659 window->dma_addr_lookup.lookup, ep->remote_dev,
660 window->nr_lookup *
661 sizeof(*window->dma_addr_lookup.lookup));
662 if (err)
663 goto error_window;
664
665 vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]);
666
667#ifdef CONFIG_ML1OM
668 if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) {
669 if (!(window->temp_phys_addr_lookup.lookup =
670 scif_zalloc(window->nr_lookup *
671 sizeof(*(window->temp_phys_addr_lookup.lookup)))))
672 goto error_window;
673
674 /* Map physical address lookup array */
675 err = map_virt_into_aperture(&window->temp_phys_addr_lookup.offset,
676 window->temp_phys_addr_lookup.lookup, ep->remote_dev,
677 window->nr_lookup *
678 sizeof(*window->temp_phys_addr_lookup.lookup));
679 if (err)
680 goto error_window;
681
682 if (!(window->phys_addr_lookup.lookup =
683 scif_zalloc(window->nr_lookup *
684 sizeof(*(window->phys_addr_lookup.lookup)))))
685 goto error_window;
686
687 /* Map physical address lookup array */
688 err = map_virt_into_aperture(&window->phys_addr_lookup.offset,
689 window->phys_addr_lookup.lookup, ep->remote_dev,
690 window->nr_lookup *
691 sizeof(*window->phys_addr_lookup.lookup));
692 if (err)
693 goto error_window;
694
695 vmalloc_phys = is_vmalloc_addr(&window->phys_addr[0]);
696 vmalloc_temp_phys = is_vmalloc_addr(&window->temp_phys_addr[0]);
697 }
698#endif
699
700 /* Now map each of the pages containing physical addresses */
701 for (i = 0, j = 0; i < nr_pages; i += NR_PHYS_ADDR_IN_PAGE, j++) {
702#ifdef CONFIG_ML1OM
703 if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) {
704 err = map_page_into_aperture(
705 &window->temp_phys_addr_lookup.lookup[j],
706 vmalloc_temp_phys ?
707 vmalloc_to_page(&window->temp_phys_addr[i]) :
708 virt_to_page(&window->temp_phys_addr[i]),
709 ep->remote_dev);
710 if (err)
711 goto error_window;
712
713 err = map_page_into_aperture(
714 &window->phys_addr_lookup.lookup[j],
715 vmalloc_phys ?
716 vmalloc_to_page(&window->phys_addr[i]) :
717 virt_to_page(&window->phys_addr[i]),
718 ep->remote_dev);
719 if (err)
720 goto error_window;
721 }
722#endif
723 err = map_page_into_aperture(
724 &window->dma_addr_lookup.lookup[j],
725 vmalloc_dma_phys ?
726 vmalloc_to_page(&window->dma_addr[i]) :
727 virt_to_page(&window->dma_addr[i]),
728 ep->remote_dev);
729 if (err)
730 goto error_window;
731 }
732 return 0;
733error_window:
734 return err;
735}
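/*
 * Worked example of the lookup sizing above, assuming 4KB pages and
 * NR_PHYS_ADDR_IN_PAGE == 512 (one page of 8-byte addresses): a window of
 * 1000 pages spans 4,096,000 bytes, which rounds up to two 2MB chunks, so
 * nr_lookup == 2 and each lookup slot covers up to 512 physical addresses.
 */
#if 0
	int64_t nr_pages = 1000;
	int nr_lookup = ALIGN(nr_pages * PAGE_SIZE, 2 * 1024 * 1024) >> 21;
	/* nr_lookup == 2; lookup[0] covers pages 0..511, lookup[1] the rest */
#endif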
736
737/**
738 * micscif_destroy_remote_lookup:
739 * @ep: end point
740 * @window: remote window
741 *
742 * Destroy lookup entries used for the remote
743 * end to copy over the physical addresses.
744 */
745void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window)
746{
747 int i, j;
748
749 RMA_MAGIC(window);
750 if (window->nr_lookup) {
751 for (i = 0, j = 0; i < window->nr_pages;
752 i += NR_PHYS_ADDR_IN_PAGE, j++) {
753 if (window->dma_addr_lookup.lookup &&
754 window->dma_addr_lookup.lookup[j]) {
755 unmap_from_aperture(
756 window->dma_addr_lookup.lookup[j],
757 ep->remote_dev, PAGE_SIZE);
758 }
759 }
760 if (window->dma_addr_lookup.offset) {
761 unmap_from_aperture(
762 window->dma_addr_lookup.offset,
763 ep->remote_dev, window->nr_lookup *
764 sizeof(*window->dma_addr_lookup.lookup));
765 }
766 if (window->dma_addr_lookup.lookup)
767 scif_free(window->dma_addr_lookup.lookup, window->nr_lookup *
768 sizeof(*(window->dma_addr_lookup.lookup)));
769 if (window->mapped_offset) {
770 unmap_from_aperture(window->mapped_offset,
771 ep->remote_dev, sizeof(*window));
772 }
773 window->nr_lookup = 0;
774 }
775}
776
777/**
778 * micscif_create_remote_window:
779 * @ep: end point
780 * @nr_pages: number of pages in window
781 *
782 * Allocate and prepare a remote registration window.
783 */
784struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages)
785{
786 struct reg_range_t *window;
787
788 might_sleep();
789 if (!(window = scif_zalloc(sizeof(struct reg_range_t))))
790 goto error_ret;
791
792 window->magic = SCIFEP_MAGIC;
793 window->nr_pages = nr_pages;
794
795#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
796 if (!(window->page_ref_count = scif_zalloc(nr_pages *
797 sizeof(*(window->page_ref_count)))))
798 goto error_window;
799#endif
800
801 if (!(window->dma_addr = scif_zalloc(nr_pages *
802 sizeof(*(window->dma_addr)))))
803 goto error_window;
804
805 if (!(window->num_pages = scif_zalloc(nr_pages *
806 sizeof(*(window->num_pages)))))
807 goto error_window;
808
809#ifdef CONFIG_ML1OM
810 if (!(window->phys_addr = scif_zalloc(nr_pages *
811 sizeof(*(window->phys_addr)))))
812 goto error_window;
813
814 if (!(window->temp_phys_addr = scif_zalloc(nr_pages *
815 sizeof(*(window->temp_phys_addr)))))
816 goto error_window;
817#endif
818
819 if (micscif_create_remote_lookup(ep, window))
820 goto error_window;
821
822 window->ep = (uint64_t)ep;
823 window->type = RMA_WINDOW_PEER;
824 set_window_ref_count(window, nr_pages);
825 window->get_put_ref_count = 0;
826 window->unreg_state = OP_IDLE;
827#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
828 window->gttmap_state = OP_IDLE;
829 init_waitqueue_head(&window->gttmapwq);
830#endif
831#ifdef _MIC_SCIF_
832 micscif_setup_proxy_dma(ep);
833 window->proxy_dma_phys = ep->rma_info.proxy_dma_phys;
834#endif
835 return window;
836error_window:
837 micscif_destroy_remote_window(ep, window);
838error_ret:
839 return NULL;
840}
841
842/**
843 * micscif_destroy_remote_window:
844 * @ep: end point
845 * @window: remote registration window
846 *
847 * Deallocate resources for remote window.
848 */
849void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window)
850{
851 RMA_MAGIC(window);
852 micscif_destroy_remote_lookup(ep, window);
853 if (window->dma_addr)
854 scif_free(window->dma_addr, window->nr_pages *
855 sizeof(*(window->dma_addr)));
856 if (window->num_pages)
857 scif_free(window->num_pages, window->nr_pages *
858 sizeof(*(window->num_pages)));
859#ifdef CONFIG_ML1OM
860 if (window->phys_addr)
861 scif_free(window->phys_addr, window->nr_pages *
862 sizeof(*(window->phys_addr)));
863 if (window->temp_phys_addr)
864 scif_free(window->temp_phys_addr, window->nr_pages *
865 sizeof(*(window->temp_phys_addr)));
866#endif
867
868#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
869 if (window->page_ref_count)
870 scif_free(window->page_ref_count, window->nr_pages *
871 sizeof(*(window->page_ref_count)));
872#endif
873 window->magic = 0;
874 scif_free(window, sizeof(*window));
875}
876
877/**
878 * micscif_map_window_pages:
879 * @ep: end point
880 * @window: self registration window
881 * @tmp_wnd: is a temporary window?
882 *
883 * Map pages of a window into the aperture/PCI.
884 * Also compute physical addresses required for DMA.
885 */
886int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool tmp_wnd)
887{
888 int j, i, err = 0, nr_pages;
889 scif_pinned_pages_t pinned_pages;
890
891 might_sleep();
892 RMA_MAGIC(window);
893
894 pinned_pages = window->pinned_pages;
895 for (j = 0, i = 0; j < window->nr_contig_chunks; j++, i += nr_pages) {
896 nr_pages = pinned_pages->num_pages[i];
897#ifdef _MIC_SCIF_
898#ifdef CONFIG_ML1OM
899 /* phys_addr[] holds addresses as seen from the remote node;
900 * these addresses are then copied into the remote card's
901 * window structure.
902 * When the remote node is the host and the card is KNF,
903 * these addresses are only created at the point of mapping
904 * the card physical address into the GTT (for KNC the
905 * GTT code path returns the local address).
906 * When the remote node is loopback, the address remains
907 * the same.
908 * When the remote node is a kn*, the base address of the local
909 * card as seen from the remote node is added in.
910 */
911 if (!tmp_wnd) {
912 if(ep->remote_dev != &scif_dev[SCIF_HOST_NODE]) {
913 if ((err = map_virt_into_aperture(
914 &window->temp_phys_addr[j],
915 phys_to_virt(page_to_phys(pinned_pages->pages[i])),
916 ep->remote_dev,
917 nr_pages << PAGE_SHIFT))) {
918 int k,l;
919
920 for (l = k = 0; k < i; l++) {
921 nr_pages = pinned_pages->num_pages[k];
922 window->temp_phys_addr[l]
923 &= ~RMA_HUGE_NR_PAGE_MASK;
924 unmap_from_aperture(
925 window->temp_phys_addr[l],
926 ep->remote_dev,
927 nr_pages << PAGE_SHIFT);
928 k += nr_pages;
929 window->temp_phys_addr[l] = 0;
930 }
931 return err;
932 }
933 if (!tmp_wnd)
934 RMA_SET_NR_PAGES(window->temp_phys_addr[j], nr_pages);
935 }
936 }
937#endif
938 window->dma_addr[j] =
939 page_to_phys(pinned_pages->pages[i]);
940 if (!tmp_wnd)
941 RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages);
942#else
943 err = map_virt_into_aperture(&window->dma_addr[j],
944 phys_to_virt(page_to_phys(pinned_pages->pages[i])),
945 ep->remote_dev, nr_pages << PAGE_SHIFT);
946 if (err)
947 return err;
948 if (!tmp_wnd)
949 RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages);
950#endif
951 window->num_pages[j] = nr_pages;
952 }
953 return err;
954}
955
956
957/**
958 * micscif_unregister_window:
959 * @window: self registration window
960 *
961 * Send an unregistration request and wait for a response.
962 */
963int micscif_unregister_window(struct reg_range_t *window)
964{
965 int err = 0;
966 struct endpt *ep = (struct endpt *)window->ep;
967 bool send_msg = false;
968
969 might_sleep();
970 BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock));
971
972 switch (window->unreg_state) {
973 case OP_IDLE:
974 {
975 window->unreg_state = OP_IN_PROGRESS;
976 send_msg = true;
977 /* fall through */
978 }
979 case OP_IN_PROGRESS:
980 {
981 get_window_ref_count(window, 1);
982 mutex_unlock(&ep->rma_info.rma_lock);
983 if (send_msg && (err = micscif_send_scif_unregister(ep, window))) {
984 window->unreg_state = OP_COMPLETED;
985 goto done;
986 }
987retry:
988 err = wait_event_timeout(window->unregwq,
989 window->unreg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
990 if (!err && scifdev_alive(ep))
991 goto retry;
992 if (!err) {
993 err = -ENODEV;
994 window->unreg_state = OP_COMPLETED;
995 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
996 }
997 if (err > 0)
998 err = 0;
999done:
1000 mutex_lock(&ep->rma_info.rma_lock);
1001 put_window_ref_count(window, 1);
1002 break;
1003 }
1004 case OP_FAILED:
1005 {
1006 if (!scifdev_alive(ep)) {
1007 err = -ENODEV;
1008 window->unreg_state = OP_COMPLETED;
1009 }
1010 break;
1011 }
1012 case OP_COMPLETED:
1013 break;
1014 default:
1015 /* Invalid opcode? */
1016 BUG_ON(1);
1017 }
1018
1019 if (OP_COMPLETED == window->unreg_state &&
1020 window->ref_count)
1021 put_window_ref_count(window, window->nr_pages);
1022
1023 if (!window->ref_count) {
1024 atomic_inc(&ep->rma_info.tw_refcount);
1025 atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
1026 list_del(&window->list_member);
1027 micscif_free_window_offset(ep, window->offset,
1028 window->nr_pages << PAGE_SHIFT);
1029 window->offset_freed = true;
1030 mutex_unlock(&ep->rma_info.rma_lock);
1031 if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL))
1032 && scifdev_alive(ep)) {
1033 drain_dma_intr(ep->rma_info.dma_chan);
1034 } else {
1035 if (!__scif_dec_pinned_vm_lock(window->mm,
1036 window->nr_pages, 1)) {
1037 __scif_release_mm(window->mm);
1038 window->mm = NULL;
1039 }
1040 }
1041 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
1042 mutex_lock(&ep->rma_info.rma_lock);
1043 }
1044 return err;
1045}
1046
1047/**
1048 * micscif_send_alloc_request:
1049 * @ep: end point
1050 * @window: self registration window
1051 *
1052 * Send a remote window allocation request
1053 */
1054int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window)
1055{
1056 struct nodemsg msg;
1057 struct allocmsg *alloc = &window->alloc_handle;
1058
1059 /* Set up the Alloc Handle */
1060 alloc->uop = SCIF_REGISTER;
1061 alloc->state = OP_IN_PROGRESS;
1062 init_waitqueue_head(&alloc->allocwq);
1063
1064 /* Send out an allocation request */
1065 msg.uop = SCIF_ALLOC_REQ;
1066 msg.src = ep->port;
1067 msg.payload[0] = ep->remote_ep;
1068 msg.payload[1] = window->nr_pages;
1069 msg.payload[2] = (uint64_t)&window->alloc_handle;
1070 msg.payload[3] = SCIF_REGISTER;
1071 return micscif_nodeqp_send(ep->remote_dev, &msg, ep);
1072}
1073
1074/**
1075 * micscif_prep_remote_window:
1076 * @ep: end point
1077 * @window: self registration window
1078 *
1079 * Send a remote window allocation request, wait for an allocation response,
1080 * prepare the remote window and notify the peer to unmap it once done.
1081 */
1082int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window)
1083{
1084 struct nodemsg msg;
1085 struct reg_range_t *remote_window;
1086 struct allocmsg *alloc = &window->alloc_handle;
1087 dma_addr_t *dma_phys_lookup, *tmp;
1088 int i = 0, j = 0;
1089 int nr_contig_chunks, loop_nr_contig_chunks, remaining_nr_contig_chunks, nr_lookup;
1090#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1091 dma_addr_t *phys_lookup = 0;
1092#endif
1093 int err, map_err;
1094
1095 nr_contig_chunks = remaining_nr_contig_chunks = (int)window->nr_contig_chunks;
1096
1097 if ((map_err = micscif_map_window_pages(ep, window, false))) {
1098 printk(KERN_ERR "%s %d map_err %d\n", __func__, __LINE__, map_err);
1099 }
1100retry:
1101 /* Now wait for the response */
1102 err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
1103 if (!err && scifdev_alive(ep))
1104 goto retry;
1105
1106 if (!err)
1107 err = -ENODEV;
1108
1109 if (err > 0)
1110 err = 0;
1111 else
1112 return err;
1113
1114 /* Bail out. The remote end rejected this request */
1115 if (OP_FAILED == alloc->state)
1116 return -ENOMEM;
1117
1118 if (map_err) {
1119 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, map_err);
1120 msg.uop = SCIF_FREE_VIRT;
1121 msg.src = ep->port;
1122 msg.payload[0] = ep->remote_ep;
1123 msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
1124 msg.payload[2] = (uint64_t)window;
1125 msg.payload[3] = SCIF_REGISTER;
1126 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1127 err = -ENOTCONN;
1128 else
1129 err = map_err;
1130 return err;
1131 }
1132
1133
1134 remote_window = scif_ioremap(alloc->phys_addr,
1135 sizeof(*window), ep->remote_dev);
1136
1137 RMA_MAGIC(remote_window);
1138
1139 /* Compute the number of lookup entries. 21 == 2MB Shift */
1140 nr_lookup = ALIGN(nr_contig_chunks * PAGE_SIZE, ((2) * 1024 * 1024)) >> 21;
1141#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1142 if (is_p2p_scifdev(ep->remote_dev))
1143 phys_lookup = scif_ioremap(remote_window->temp_phys_addr_lookup.offset,
1144 nr_lookup *
1145 sizeof(*remote_window->temp_phys_addr_lookup.lookup),
1146 ep->remote_dev);
1147#endif
1148
1149 dma_phys_lookup = scif_ioremap(remote_window->dma_addr_lookup.offset,
1150 nr_lookup *
1151 sizeof(*remote_window->dma_addr_lookup.lookup),
1152 ep->remote_dev);
1153
1154 while (remaining_nr_contig_chunks) {
1155 loop_nr_contig_chunks = min(remaining_nr_contig_chunks, (int)NR_PHYS_ADDR_IN_PAGE);
1156 /* #1/2 - Copy physical addresses over to the remote side */
1157
1158#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1159 /* If the remote dev is self or any node except the host,
1160 * it's OK to copy the bus address to the remote window.
1161 * In the case of the host (for KNF only) the bus address
1162 * is generated at the time of mmap(..) into card memory
1163 * and does not exist at this time.
1164 */
1165 /* Note:
1166 * the phys_addr[] holds MIC address for remote cards
1167 * -> GTT offset for the host (KNF)
1168 * -> local address for the host (KNC)
1169 * -> local address for loopback
1170 * this is done in map_window_pages(..) except for GTT
1171 * offset for KNF
1172 */
1173 if (is_p2p_scifdev(ep->remote_dev)) {
1174 tmp = scif_ioremap(phys_lookup[j],
1175 loop_nr_contig_chunks * sizeof(*window->temp_phys_addr),
1176 ep->remote_dev);
1177 memcpy_toio(tmp, &window->temp_phys_addr[i],
1178 loop_nr_contig_chunks * sizeof(*window->temp_phys_addr));
1179 serializing_request(tmp);
1180 smp_mb();
1181 scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev);
1182 }
1183#endif
1184 /* #2/2 - Copy DMA addresses (addresses that are fed into the DMA engine)
1185 * We transfer bus addresses which are then converted into a MIC physical
1186 * address on the remote side if it is a MIC, if the remote node is a host
1187 * we transfer the MIC physical address
1188 */
1189 tmp = scif_ioremap(
1190 dma_phys_lookup[j],
1191 loop_nr_contig_chunks * sizeof(*window->dma_addr),
1192 ep->remote_dev);
1193#ifdef _MIC_SCIF_
1194 if (is_p2p_scifdev(ep->remote_dev)) {
1195 /* knf:
1196 * send the address as mapped through the GTT (the remote node's
1197 * base address for this node is already added in)
1198 * knc:
1199 * add remote node's base address for this node to convert it
1200 * into a MIC address
1201 */
1202 int m;
1203 dma_addr_t dma_addr;
1204 for (m = 0; m < loop_nr_contig_chunks; m++) {
1205#ifdef CONFIG_ML1OM
1206 dma_addr = window->temp_phys_addr[i + m];
1207#else
1208 dma_addr = window->dma_addr[i + m] +
1209 ep->remote_dev->sd_base_addr;
1210#endif
1211 writeq(dma_addr, &tmp[m]);
1212 }
1213 } else
1214 /* Host node or loopback - transfer DMA addresses as is, this is
1215 * the same as a MIC physical address (we use the dma_addr
1216 * and not the phys_addr array since the phys_addr is only setup
1217 * if there is a mmap() request from the host)
1218 */
1219 memcpy_toio(tmp, &window->dma_addr[i],
1220 loop_nr_contig_chunks * sizeof(*window->dma_addr));
1221#else
1222 /* Transfer the physical address array - this is the MIC address
1223 * as seen by the card
1224 */
1225 memcpy_toio(tmp, &window->dma_addr[i],
1226 loop_nr_contig_chunks * sizeof(*window->dma_addr));
1227#endif
1228 remaining_nr_contig_chunks -= loop_nr_contig_chunks;
1229 i += loop_nr_contig_chunks;
1230 j++;
1231 serializing_request(tmp);
1232 smp_mb();
1233 scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev);
1234 }
1235
1236 /* Prepare the remote window for the peer */
1237 remote_window->peer_window = (uint64_t)window;
1238 remote_window->offset = window->offset;
1239 remote_window->prot = window->prot;
1240 remote_window->nr_contig_chunks = nr_contig_chunks;
1241#ifdef _MIC_SCIF_
1242 if (!ep->rma_info.proxy_dma_peer_phys)
1243 ep->rma_info.proxy_dma_peer_phys = remote_window->proxy_dma_phys;
1244#endif
1245#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1246 if (is_p2p_scifdev(ep->remote_dev))
1247 scif_iounmap(phys_lookup,
1248 nr_lookup *
1249 sizeof(*remote_window->temp_phys_addr_lookup.lookup),
1250 ep->remote_dev);
1251#endif
1252 scif_iounmap(dma_phys_lookup,
1253 nr_lookup *
1254 sizeof(*remote_window->dma_addr_lookup.lookup),
1255 ep->remote_dev);
1256 scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev);
1257 window->peer_window = (uint64_t)alloc->vaddr;
1258 return err;
1259}
1260
1261/**
1262 * micscif_send_scif_register:
1263 * @ep: end point
1264 * @window: self registration window
1265 *
1266 * Send a SCIF_REGISTER message if EP is connected and wait for a
1267 * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT
1268 * message so that the peer can free its remote window allocated earlier.
1269 */
1270int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window)
1271{
1272 int err = 0;
1273 struct nodemsg msg;
1274
1275 msg.src = ep->port;
1276 msg.payload[0] = ep->remote_ep;
1277 msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
1278 msg.payload[2] = (uint64_t)window;
1279 if (SCIFEP_CONNECTED == ep->state) {
1280 msg.uop = SCIF_REGISTER;
1281 window->reg_state = OP_IN_PROGRESS;
1282 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
1283 micscif_set_nr_pages(ep->remote_dev, window);
1284retry:
1285 err = wait_event_timeout(window->regwq,
1286 window->reg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
1287 if (!err && scifdev_alive(ep))
1288 goto retry;
1289 if (!err)
1290 err = -ENODEV;
1291 if (err > 0)
1292 err = 0;
1293 if (OP_FAILED == window->reg_state)
1294 err = -ENOTCONN;
1295 } else {
1296 micscif_set_nr_pages(ep->remote_dev, window);
1297 }
1298 } else {
1299 msg.uop = SCIF_FREE_VIRT;
1300 msg.payload[3] = SCIF_REGISTER;
1301 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1302 err = -ENOTCONN;
1303 micscif_set_nr_pages(ep->remote_dev, window);
1304 }
1305 return err;
1306}
1307
1308/**
1309 * micscif_send_scif_unregister:
1310 * @ep: end point
1311 * @window: self registration window
1312 *
1313 * Send a SCIF_UNREGISTER message.
1314 */
1315int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window)
1316{
1317 struct nodemsg msg;
1318
1319 RMA_MAGIC(window);
1320 msg.uop = SCIF_UNREGISTER;
1321 msg.src = ep->port;
1322 msg.payload[0] = (uint64_t)window->alloc_handle.vaddr;
1323 msg.payload[1] = (uint64_t)window;
1324 return micscif_nodeqp_send(ep->remote_dev, &msg, ep);
1325}
1326
1327/**
1328 * micscif_get_window_offset:
1329 * @ep: end point
1330 * @flags: flags
1331 * @offset: offset hint
1332 * @len: length of range
1333 * @out_offset: computed offset returned by reference.
1334 *
1335 * Compute/Claim a new offset for this EP. The caller is supposed to grab
1336 * the RMA mutex before calling this API.
1337 */
1338int micscif_get_window_offset(struct endpt *ep, int flags,
1339 uint64_t offset, size_t len, uint64_t *out_offset)
1340{
1341 uint64_t computed_offset;
1342 int err = 0;
1343
1344 might_sleep();
1345 mutex_lock(&ep->rma_info.va_lock);
1346 if (flags & SCIF_MAP_FIXED) {
1347 computed_offset = va_gen_claim(&ep->rma_info.va_gen,
1348 (uint64_t)offset, len);
1349 if (INVALID_VA_GEN_ADDRESS == computed_offset)
1350 err = -EADDRINUSE;
1351 } else {
1352 computed_offset = va_gen_alloc(&ep->rma_info.va_gen,
1353 len, PAGE_SIZE);
1354 if (INVALID_VA_GEN_ADDRESS == computed_offset)
1355 err = -ENOMEM;
1356 }
1357 *out_offset = computed_offset;
1358 mutex_unlock(&ep->rma_info.va_lock);
1359 return err;
1360}
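/*
 * Example of the two allocation modes above, assuming the RMA mutex is
 * already held as the comment requires: with SCIF_MAP_FIXED the requested
 * offset itself is claimed (or -EADDRINUSE is returned), otherwise any
 * page-aligned offset large enough for the range is handed back.
 */
#if 0
	uint64_t off;

	/* claim exactly 0x100000 for a two-page window */
	err = micscif_get_window_offset(ep, SCIF_MAP_FIXED, 0x100000,
					2 * PAGE_SIZE, &off);
	/* or let the generator pick any free offset */
	err = micscif_get_window_offset(ep, 0, 0, 2 * PAGE_SIZE, &off);
#endif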
1361
1362/**
1363 * micscif_free_window_offset:
1364 * @offset: offset to free
1365 * @len: length of range
1366 *
1367 * Free an offset for this EP. The caller is supposed to grab
1368 * the RMA mutex before calling this API.
1369 */
1370void micscif_free_window_offset(struct endpt *ep,
1371 uint64_t offset, size_t len)
1372{
1373 mutex_lock(&ep->rma_info.va_lock);
1374 va_gen_free(&ep->rma_info.va_gen, offset, len);
1375 mutex_unlock(&ep->rma_info.va_lock);
1376}
1377
1378/**
1379 * micscif_register_temp:
1380 * @epd: End Point Descriptor.
1381 * @addr: virtual address to/from which to copy
1382 * @len: length of range to copy
1383 * @out_offset: computed offset returned by reference.
1384 * @out_window: allocated registered window returned by reference.
1385 *
1386 * Create a temporary registered window. The peer will not know about this
1387 * window. This API is used for the scif_vreadfrom()/scif_vwriteto() APIs.
1388 */
1389static int
1390micscif_register_temp(scif_epd_t epd, void *addr, size_t len, int prot,
1391 off_t *out_offset, struct reg_range_t **out_window)
1392{
1393 struct endpt *ep = (struct endpt *)epd;
1394 int err;
1395 scif_pinned_pages_t pinned_pages;
1396 size_t aligned_len;
1397
1398 aligned_len = ALIGN(len, PAGE_SIZE);
1399
1400 if ((err = __scif_pin_pages((void *)((uint64_t)addr &
1401 PAGE_MASK),
1402 aligned_len, &prot, 0, &pinned_pages)))
1403 return err;
1404
1405 pinned_pages->prot = prot;
1406
1407 /* Compute the offset for this registration */
1408 if ((err = micscif_get_window_offset(ep, 0, 0,
1409 aligned_len, (uint64_t *)out_offset)))
1410 goto error_unpin;
1411
1412 /* Allocate and prepare self registration window */
1413 if (!(*out_window = micscif_create_window(ep, aligned_len >> PAGE_SHIFT,
1414 *out_offset, true))) {
1415 micscif_free_window_offset(ep, *out_offset, aligned_len);
1416 err = -ENOMEM;
1417 goto error_unpin;
1418 }
1419
1420 (*out_window)->pinned_pages = pinned_pages;
1421 (*out_window)->nr_pages = pinned_pages->nr_pages;
1422 (*out_window)->nr_contig_chunks = pinned_pages->nr_contig_chunks;
1423 (*out_window)->prot = pinned_pages->prot;
1424
1425 (*out_window)->va_for_temp = (void*)((uint64_t)addr & PAGE_MASK);
1426 if ((err = micscif_map_window_pages(ep, *out_window, true))) {
1427 /* Something went wrong! Rollback */
1428 micscif_destroy_window(ep, *out_window);
1429 *out_window = NULL;
1430 } else
1431 *out_offset |= ((uint64_t)addr & ~PAGE_MASK);
1432
1433 return err;
1434error_unpin:
1435 if (err)
1436 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
1437 scif_unpin_pages(pinned_pages);
1438 return err;
1439}
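/*
 * Sketch of how the temporary-window path is consumed by the copy code
 * below: register the user buffer, use the returned offset (which keeps the
 * sub-page offset of addr) as the local offset of the transfer, and queue
 * the window for cleanup once the copy has been submitted.
 */
#if 0
	off_t loffset;
	struct reg_range_t *window;

	err = micscif_register_temp(epd, addr, len, VM_READ,
				    &loffset, &window);
	if (!err) {
		/* ... perform the DMA/CPU copy using loffset ... */
		micscif_queue_for_cleanup(window, &ms_info.mi_rma);
	}
#endif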
1440
1441/**
1442 * micscif_rma_completion_cb:
1443 * @data: RMA cookie
1444 *
1445 * RMA interrupt completion callback.
1446 */
1447void micscif_rma_completion_cb(uint64_t data)
1448{
1449 struct dma_completion_cb *comp_cb = (struct dma_completion_cb *)data;
1450#ifndef _MIC_SCIF_
1451 struct pci_dev *pdev;
1452#endif
1453
1454 /* Free DMA Completion CB. */
1455 if (comp_cb && comp_cb->temp_buf) {
1456 if (comp_cb->dst_window) {
1457 micscif_rma_local_cpu_copy(comp_cb->dst_offset,
1458 comp_cb->dst_window, comp_cb->temp_buf + comp_cb->header_padding,
1459 comp_cb->len, false);
1460 }
1461#ifndef _MIC_SCIF_
1462 micscif_pci_dev(comp_cb->remote_node, &pdev);
1463 mic_ctx_unmap_single(get_per_dev_ctx(comp_cb->remote_node - 1),
1464 comp_cb->temp_phys, KMEM_UNALIGNED_BUF_SIZE);
1465#endif
1466 if (comp_cb->is_cache)
1467 micscif_kmem_cache_free(comp_cb->temp_buf_to_free);
1468 else
1469 kfree(comp_cb->temp_buf_to_free);
1470 }
1471 kfree(comp_cb);
1472}
1473
1474static void __micscif_rma_destroy_tcw_ep(struct endpt *ep);
1475static
1476bool micscif_rma_tc_can_cache(struct endpt *ep, size_t cur_bytes)
1477{
1478 if ((cur_bytes >> PAGE_SHIFT) > ms_info.mi_rma_tc_limit)
1479 return false;
1480 if ((atomic_read(&ep->rma_info.tcw_total_pages)
1481 + (cur_bytes >> PAGE_SHIFT)) >
1482 ms_info.mi_rma_tc_limit) {
1483 printk(KERN_ALERT "%s %d total=%d, current=%zu reached max\n",
1484 __func__, __LINE__,
1485 atomic_read(&ep->rma_info.tcw_total_pages),
1486 (1 + (cur_bytes >> PAGE_SHIFT)));
1487 micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
1488 __micscif_rma_destroy_tcw_ep(ep);
1489 }
1490 return true;
1491}
1492
1493/**
1494 * micscif_rma_copy:
1495 * @epd: end point descriptor.
1496 * @loffset: offset in local registered address space to/from which to copy
1497 * @addr: user virtual address to/from which to copy
1498 * @len: length of range to copy
1499 * @roffset: offset in remote registered address space to/from which to copy
1500 * @flags: flags
1501 * @dir: LOCAL->REMOTE or vice versa.
1502 *
1503 * Validate parameters, check if src/dst registered ranges requested for copy
1504 * are valid and initiate either CPU or DMA copy.
1505 */
1506int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len,
1507 off_t roffset, int flags, enum rma_direction dir, bool last_chunk)
1508{
1509 struct endpt *ep = (struct endpt *)epd;
1510 struct micscif_rma_req remote_req;
1511 struct micscif_rma_req req;
1512 struct reg_range_t *window = NULL;
1513 struct reg_range_t *remote_window = NULL;
1514 struct mic_copy_work copy_work;
1515 bool loopback;
1516 int err = 0;
1517 struct dma_channel *chan;
1518 struct rma_mmu_notifier *mmn = NULL;
1519 bool insert_window = false;
1520 bool cache = false;
1521
1522 if ((err = verify_epd(ep)))
1523 return err;
1524
1525 if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | SCIF_RMA_SYNC | SCIF_RMA_ORDERED)))
1526 return -EINVAL;
1527
1528 if (!len)
1529 return -EINVAL;
1530 loopback = is_self_scifdev(ep->remote_dev) ? true : false;
1531 copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? DO_DMA_POLLING : 0;
1532 copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk);
1533
1534#ifdef CONFIG_MMU_NOTIFIER
1535 if (!mic_reg_cache_enable)
1536 flags &= ~SCIF_RMA_USECACHE;
1537#else
1538 flags &= ~SCIF_RMA_USECACHE;
1539#endif
1540#ifndef _MIC_SCIF_
1541#ifdef CONFIG_ML1OM
1542 /* Use DMA Copies even if CPU copy is requested on KNF MIC from Host */
1543 if (flags & SCIF_RMA_USECPU) {
1544 flags &= ~SCIF_RMA_USECPU;
1545 if (last_chunk)
1546 copy_work.fence_type = DO_DMA_POLLING;
1547 }
1548#endif
1549 /* Use CPU for Host<->Host Copies */
1550 if (loopback) {
1551 flags |= SCIF_RMA_USECPU;
1552 copy_work.fence_type = 0x0;
1553 }
1554#endif
1555
1556 cache = flags & SCIF_RMA_USECACHE;
1557
1558 /* Trying to wrap around */
1559 if ((loffset && (loffset + (off_t)len < loffset)) ||
1560 (roffset + (off_t)len < roffset))
1561 return -EINVAL;
1562
1563 remote_req.out_window = &remote_window;
1564 remote_req.offset = roffset;
1565 remote_req.nr_bytes = len;
1566 /*
1567 * If transfer is from local to remote then the remote window
1568 * must be writeable and vice versa.
1569 */
1570 remote_req.prot = LOCAL_TO_REMOTE == dir ? VM_WRITE : VM_READ;
1571 remote_req.type = WINDOW_PARTIAL;
1572 remote_req.head = &ep->rma_info.remote_reg_list;
1573
1574#ifdef CONFIG_MMU_NOTIFIER
1575 if (addr && cache) {
1576 mutex_lock(&ep->rma_info.mmn_lock);
1577 mmn = find_mmu_notifier(current->mm, &ep->rma_info);
1578 if (!mmn) {
1579 mmn = kzalloc(sizeof(*mmn), GFP_KERNEL);
1580 if (!mmn) {
1581 mutex_unlock(&ep->rma_info.mmn_lock);
1582 return -ENOMEM;
1583 }
1584 init_mmu_notifier(mmn, current->mm, ep);
1585 if (mmu_notifier_register(&mmn->ep_mmu_notifier, current->mm)) {
1586 mutex_unlock(&ep->rma_info.mmn_lock);
1587 kfree(mmn);
1588 return -EBUSY;
1589 }
1590#ifdef RMA_DEBUG
1591 atomic_long_add_return(1, &ms_info.mmu_notif_cnt);
1592#endif
1593 list_add(&mmn->list_member, &ep->rma_info.mmn_list);
1594 }
1595 mutex_unlock(&ep->rma_info.mmn_lock);
1596 }
1597#endif
1598
1599 micscif_inc_node_refcnt(ep->remote_dev, 1);
1600#ifdef _MIC_SCIF_
1601 if (!(flags & SCIF_RMA_USECPU)) {
1602 /*
1603 * Proxy the DMA only for P2P reads with transfer size
1604 * greater than proxy DMA threshold. scif_vreadfrom(..)
1605 * and scif_vwriteto(..) is not supported since the peer
1606 * does not have the page lists required to perform the
1607 * proxy DMA.
1608 */
1609 if (ep->remote_dev->sd_proxy_dma_reads &&
1610 !addr && dir == REMOTE_TO_LOCAL &&
1611 ep->rma_info.proxy_dma_va &&
1612 len >= ms_info.mi_proxy_dma_threshold) {
1613 copy_work.len = len;
1614 copy_work.src_offset = roffset;
1615 copy_work.dst_offset = loffset;
1616 /* Fall through if there were errors */
1617 if (!(err = micscif_proxy_dma(epd, &copy_work)))
1618 goto error;
1619 }
1620 }
1621#endif
1622 mutex_lock(&ep->rma_info.rma_lock);
1623 if (addr) {
1624 req.out_window = &window;
1625 req.nr_bytes = ALIGN(len + ((uint64_t)addr & ~PAGE_MASK), PAGE_SIZE);
1626 if (mmn)
1627 req.head = &mmn->tc_reg_list;
1628 req.va_for_temp = (void*)((uint64_t)addr & PAGE_MASK);
1629 req.prot = (LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE | VM_READ);
1630 /* Does a valid local window exist? */
1631
1632 pr_debug("%s %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n",
1633 __func__, __LINE__, req.va_for_temp, addr, req.nr_bytes, len);
1634 spin_lock(&ep->rma_info.tc_lock);
1635 if (!mmn || (err = micscif_query_tcw(ep, &req))) {
1636 pr_debug("%s %d err %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n",
1637 __func__, __LINE__, err, req.va_for_temp, addr, req.nr_bytes, len);
1638 spin_unlock(&ep->rma_info.tc_lock);
1639 mutex_unlock(&ep->rma_info.rma_lock);
1640 if (cache)
1641 if (!micscif_rma_tc_can_cache(ep, req.nr_bytes))
1642 cache = false;
1643 if ((err = micscif_register_temp(epd, req.va_for_temp, req.nr_bytes,
1644 req.prot,
1645 &loffset, &window))) {
1646 goto error;
1647 }
1648 mutex_lock(&ep->rma_info.rma_lock);
1649 pr_debug("New temp window created addr %p\n", addr);
1650 if (cache) {
1651 atomic_inc(&ep->rma_info.tcw_refcount);
1652 atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tcw_total_pages);
1653 if (mmn) {
1654 spin_lock(&ep->rma_info.tc_lock);
1655 micscif_insert_tcw(window, &mmn->tc_reg_list);
1656 spin_unlock(&ep->rma_info.tc_lock);
1657 }
1658 }
1659 insert_window = true;
1660 } else {
1661 spin_unlock(&ep->rma_info.tc_lock);
1662 pr_debug("window found for addr %p\n", addr);
1663 BUG_ON(window->va_for_temp > addr);
1664 }
1665 loffset = window->offset + ((uint64_t)addr - (uint64_t)window->va_for_temp);
1666 pr_debug("%s %d addr %p loffset 0x%lx window->nr_pages 0x%llx"
1667 " window->va_for_temp %p\n", __func__, __LINE__,
1668 addr, loffset, window->nr_pages, window->va_for_temp);
1669 RMA_MAGIC(window);
1670 }
1671
1672 /* Does a valid remote window exist? */
1673 if ((err = micscif_query_window(&remote_req))) {
1674 pr_debug("%s %d err %d roffset 0x%lx len 0x%lx\n",
1675 __func__, __LINE__, err, roffset, len);
1676 mutex_unlock(&ep->rma_info.rma_lock);
1677 goto error;
1678 }
1679 RMA_MAGIC(remote_window);
1680 if (!addr) {
1681 req.out_window = &window;
1682 req.offset = loffset;
1683 /*
1684 * If transfer is from local to remote then the self window
1685 * must be readable and vice versa.
1686 */
1687 req.prot = LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE;
1688 req.nr_bytes = len;
1689 req.type = WINDOW_PARTIAL;
1690 req.head = &ep->rma_info.reg_list;
1691 /* Does a valid local window exist? */
1692 if ((err = micscif_query_window(&req))) {
1693 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
1694 mutex_unlock(&ep->rma_info.rma_lock);
1695 goto error;
1696 }
1697 RMA_MAGIC(window);
1698 }
1699
1700 /*
1701 * Prepare copy_work for submitting work to the DMA kernel thread
1702 * or CPU copy routine.
1703 */
1704 copy_work.len = len;
1705 copy_work.loopback = loopback;
1706 copy_work.remote_dev = ep->remote_dev;
1707 copy_work.dma_chan_released = false;
1708 if (LOCAL_TO_REMOTE == dir) {
1709 copy_work.src_offset = loffset;
1710 copy_work.src_window = window;
1711 copy_work.dst_offset = roffset;
1712 copy_work.dst_window = remote_window;
1713 } else {
1714 copy_work.src_offset = roffset;
1715 copy_work.src_window = remote_window;
1716 copy_work.dst_offset = loffset;
1717 copy_work.dst_window = window;
1718 }
1719
1720 if (!(flags & SCIF_RMA_USECPU)) {
1721 chan = ep->rma_info.dma_chan;
1722 if ((err = request_dma_channel(chan))) {
1723 mutex_unlock(&ep->rma_info.rma_lock);
1724 goto error;
1725 }
1726 err = micscif_rma_list_dma_copy_wrapper(epd, &copy_work,
1727 chan, loffset);
1728 if (!copy_work.dma_chan_released)
1729 free_dma_channel(chan);
1730 }
1731 if (flags & SCIF_RMA_USECPU) {
1732 /* Initiate synchronous CPU copy */
1733 micscif_rma_list_cpu_copy(&copy_work);
1734 }
1735 if (insert_window && !cache) {
1736 atomic_inc(&ep->rma_info.tw_refcount);
1737 atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
1738 }
1739
1740 mutex_unlock(&ep->rma_info.rma_lock);
1741
1742 if (last_chunk) {
1743 if (DO_DMA_POLLING == copy_work.fence_type)
1744 err = drain_dma_poll(ep->rma_info.dma_chan);
1745 else if (DO_DMA_INTR == copy_work.fence_type)
1746 err = drain_dma_intr(ep->rma_info.dma_chan);
1747 }
1748
1749 micscif_dec_node_refcnt(ep->remote_dev, 1);
1750 if (insert_window && !cache)
1751 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
1752 return err;
1753error:
1754 if (err) {
1755 if (addr && window && !cache)
1756 micscif_destroy_window(ep, window);
1757 printk(KERN_ERR "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
1758 }
1759 micscif_dec_node_refcnt(ep->remote_dev, 1);
1760 return err;
1761}
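/*
 * The two ways this routine is driven, per the parameter description above:
 * registered-to-registered copies pass addr == NULL and a valid loffset,
 * while the scif_vreadfrom()/scif_vwriteto() style paths pass a virtual
 * address and let a temporary window supply loffset.
 */
#if 0
	/* both sides already registered */
	err = micscif_rma_copy(epd, loffset, NULL, len, roffset, flags,
			       REMOTE_TO_LOCAL, true);
	/* local side is a plain user buffer */
	err = micscif_rma_copy(epd, 0, addr, len, roffset, flags,
			       REMOTE_TO_LOCAL, true);
#endif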
1762
1763/**
1764 * micscif_send_fence_mark:
1765 * @epd: end point descriptor.
1766 * @out_mark: Output DMA mark reported by peer.
1767 *
1768 * Send a remote fence mark request.
1769 */
1770int micscif_send_fence_mark(scif_epd_t epd, int *out_mark)
1771{
1772 int err;
1773 struct nodemsg msg;
1774 struct fence_info *fence_req;
1775 struct endpt *ep = (struct endpt *)epd;
1776
1777 if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
1778 err = -ENOMEM;
1779 goto error;
1780 }
1781
1782 fence_req->state = OP_IN_PROGRESS;
1783 init_waitqueue_head(&fence_req->wq);
1784
1785 msg.src = ep->port;
1786 msg.uop = SCIF_MARK;
1787 msg.payload[0] = ep->remote_ep;
1788 msg.payload[1] = (uint64_t)fence_req;
1789
1790 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1791 goto error;
1792
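	/*
	 * Wait for the peer to update fence_req. The wait is re-armed for as
	 * long as the remote node is still alive, so a slow peer is not
	 * misreported as a dead one; only a timeout with a dead node fails.
	 */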
1793retry:
1794 err = wait_event_timeout(fence_req->wq,
1795 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1796 if (!err && scifdev_alive(ep))
1797 goto retry;
1798 if (!err)
1799 err = -ENODEV;
1800 if (err > 0)
1801 err = 0;
1802 if (err < 0) {
1803 mutex_lock(&ep->rma_info.rma_lock);
1804 if (OP_IN_PROGRESS == fence_req->state)
1805 fence_req->state = OP_FAILED;
1806 mutex_unlock(&ep->rma_info.rma_lock);
1807 }
1808 if (OP_COMPLETED == fence_req->state)
1809 *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark;
1810
1811 if (OP_FAILED == fence_req->state && !err)
1812 err = -ENOMEM;
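	/*
	 * Empty lock/unlock pair: make sure any message handler that may
	 * still be updating fence_req under rma_lock has finished before the
	 * request is freed.
	 */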
1813 mutex_lock(&ep->rma_info.rma_lock);
1814 mutex_unlock(&ep->rma_info.rma_lock);
1815 kfree(fence_req);
1816error:
1817 return err;
1818}
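
/*
 * Illustrative usage sketch (an assumption, not a call site in this file):
 * a remote fence is typically a mark followed by a wait on that mark, as
 * the scif_fence_mark()/scif_fence_wait() paths would do for a peer fence:
 *
 *	int mark;
 *
 *	if (!(err = micscif_send_fence_mark(epd, &mark)))
 *		err = micscif_send_fence_wait(epd, mark);
 */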
1819
1820/**
1821 * micscif_send_fence_wait:
1822 * @epd: end point descriptor.
1823 * @mark: DMA mark to wait for.
1824 *
1825 * Send a remote fence wait request.
1826 */
1827int micscif_send_fence_wait(scif_epd_t epd, int mark)
1828{
1829 int err;
1830 struct nodemsg msg;
1831 struct fence_info *fence_req;
1832 struct endpt *ep = (struct endpt *)epd;
1833
1834 if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
1835 err = -ENOMEM;
1836 goto error;
1837 }
1838
1839 fence_req->state = OP_IN_PROGRESS;
1840 init_waitqueue_head(&fence_req->wq);
1841
1842 msg.src = ep->port;
1843 msg.uop = SCIF_WAIT;
1844 msg.payload[0] = ep->remote_ep;
1845 msg.payload[1] = (uint64_t)fence_req;
1846 msg.payload[2] = mark;
1847
1848 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1849 goto error;
1850retry:
1851 err = wait_event_timeout(fence_req->wq,
1852 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1853 if (!err && scifdev_alive(ep))
1854 goto retry;
1855 if (!err)
1856 err = -ENODEV;
1857 if (err > 0)
1858 err = 0;
1859 if (err < 0) {
1860 mutex_lock(&ep->rma_info.rma_lock);
1861 if (OP_IN_PROGRESS == fence_req->state)
1862 fence_req->state = OP_FAILED;
1863 mutex_unlock(&ep->rma_info.rma_lock);
1864 }
1865 if (OP_FAILED == fence_req->state && !err)
1866 err = -ENOMEM;
1867 mutex_lock(&ep->rma_info.rma_lock);
1868 mutex_unlock(&ep->rma_info.rma_lock);
1869 kfree(fence_req);
1870error:
1871 return err;
1872}
1873
1874/**
1875 * micscif_send_fence_signal:
1876 * @epd - endpoint descriptor
1877 * @roff - remote offset
1878 * @rval - remote value to write at roff
1879 * @loff - local offset
1880 * @lval - local value to write at loff
1881 * @flags - flags
1882 *
1883 * Sends a remote fence signal request
1884 */
1885int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval,
1886 off_t loff, uint64_t lval, int flags)
1887{
1888 int err = 0;
1889 struct nodemsg msg;
1890 struct fence_info *fence_req;
1891 struct endpt *ep = (struct endpt *)epd;
1892
1893 if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
1894 err = -ENOMEM;
1895 goto error;
1896 }
1897
1898 fence_req->state = OP_IN_PROGRESS;
1899 init_waitqueue_head(&fence_req->wq);
1900
1901 msg.src = ep->port;
1902 if (flags & SCIF_SIGNAL_LOCAL) {
1903 msg.uop = SCIF_SIG_LOCAL;
1904 msg.payload[0] = ep->remote_ep;
1905 msg.payload[1] = roff;
1906 msg.payload[2] = rval;
1907 msg.payload[3] = (uint64_t)fence_req;
1908 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1909 goto error_free;
1910retry1:
1911 err = wait_event_timeout(fence_req->wq,
1912 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1913 if (!err && scifdev_alive(ep))
1914 goto retry1;
1915 if (!err)
1916 err = -ENODEV;
1917 if (err > 0)
1918 err = 0;
1919 if (err < 0) {
1920 mutex_lock(&ep->rma_info.rma_lock);
1921 if (OP_IN_PROGRESS == fence_req->state)
1922 fence_req->state = OP_FAILED;
1923 mutex_unlock(&ep->rma_info.rma_lock);
1924 }
1925 if (OP_FAILED == fence_req->state && !err) {
1926 err = -ENXIO;
1927 goto error_free;
1928 }
1929 }
1930 fence_req->state = OP_IN_PROGRESS;
1931
1932 if (flags & SCIF_SIGNAL_REMOTE) {
1933 msg.uop = SCIF_SIG_REMOTE;
1934 msg.payload[0] = ep->remote_ep;
1935 msg.payload[1] = loff;
1936 msg.payload[2] = lval;
1937 msg.payload[3] = (uint64_t)fence_req;
1938 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1939 goto error_free;
1940retry2:
1941 err = wait_event_timeout(fence_req->wq,
1942 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1943 if (!err && scifdev_alive(ep))
1944 goto retry2;
1945 if (!err)
1946 err = -ENODEV;
1947 if (err > 0)
1948 err = 0;
1949 if (err < 0) {
1950 mutex_lock(&ep->rma_info.rma_lock);
1951 if (OP_IN_PROGRESS == fence_req->state)
1952 fence_req->state = OP_FAILED;
1953 mutex_unlock(&ep->rma_info.rma_lock);
1954 }
1955 if (OP_FAILED == fence_req->state && !err) {
1956 err = -ENXIO;
1957 goto error_free;
1958 }
1959 }
1960error_free:
1961 mutex_lock(&ep->rma_info.rma_lock);
1962 mutex_unlock(&ep->rma_info.rma_lock);
1963 kfree(fence_req);
1964error:
1965 return err;
1966}
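
/*
 * Illustrative sketch (an assumption about the caller): scif_fence_signal()
 * is expected to forward a peer fence request through this routine, e.g.:
 *
 *	err = micscif_send_fence_signal(epd, roff, rval, loff, lval,
 *					SCIF_SIGNAL_LOCAL);
 */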
1967
1968/*
1969 * micscif_fence_mark:
1970 *
1971 * @epd - endpoint descriptor
1972 * Set up a mark for this endpoint and return the value of the mark.
1973 */
1974int micscif_fence_mark(scif_epd_t epd)
1975{
1976 int mark = 0;
1977 struct endpt *ep = (struct endpt *)epd;
1978 struct dma_channel *chan = ep->rma_info.dma_chan;
1979
1980 if ((mark = request_dma_channel(chan)))
1981 goto error;
1982
1983 mark = program_dma_mark(chan);
1984
1985 free_dma_channel(chan);
1986error:
1987 return mark;
1988}
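
/*
 * Illustrative sketch (an assumption): a locally generated mark can be
 * combined with dma_mark_wait() on the same channel to implement a local
 * fence, along the lines of:
 *
 *	mark = micscif_fence_mark(epd);
 *	if (mark >= 0)
 *		err = dma_mark_wait(ep->rma_info.dma_chan, mark, false);
 */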
1989
1990/**
1991 * micscif_rma_destroy_temp_windows:
1992 *
1993 * This routine destroys temporary registered windows created
1994 * by scif_vreadfrom() and scif_vwriteto().
1995 */
1996void micscif_rma_destroy_temp_windows(void)
1997{
1998 struct list_head *item, *tmp;
1999 struct reg_range_t *window;
2000 struct endpt *ep;
2001 struct dma_channel *chan;
2002 might_sleep();
2003restart:
2004 spin_lock(&ms_info.mi_rmalock);
2005 list_for_each_safe(item, tmp, &ms_info.mi_rma) {
2006 window = list_entry(item,
2007 struct reg_range_t, list_member);
2008 ep = (struct endpt *)window->ep;
2009 chan = ep->rma_info.dma_chan;
2010
2011 list_del(&window->list_member);
2012 spin_unlock(&ms_info.mi_rmalock);
2013 micscif_inc_node_refcnt(ep->remote_dev, 1);
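		/*
		 * The window may be destroyed once its DMA mark has retired:
		 * either the channel is gone, the remote node is dead, the
		 * mark has already been processed, or draining the channel
		 * succeeds here.
		 */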
2014 if (!chan ||
2015 !scifdev_alive(ep) ||
2016 (!is_current_dma_mark(chan, window->dma_mark) &&
2017 is_dma_mark_processed(chan, window->dma_mark)) ||
2018 !drain_dma_intr(chan)) {
2019 micscif_dec_node_refcnt(ep->remote_dev, 1);
2020 			/* DMA for this window has retired; mark it safe to destroy */
2021 window->unreg_state = OP_COMPLETED;
2022 } else {
2023 micscif_dec_node_refcnt(ep->remote_dev, 1);
2024 /* DMA engine hung ?? */
2025 printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d "
2026 "window->dma_mark 0x%x channel_mark 0x%x\n",
2027 __func__, __LINE__, get_chan_num(chan),
2028 ep->sd_state, window->dma_mark, get_dma_mark(chan));
2029 WARN_ON(1);
2030 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
2031 goto restart;
2032 }
2033
2034 if (OP_COMPLETED == window->unreg_state) {
2035 BUG_ON(atomic_sub_return((int32_t)window->nr_pages,
2036 &ep->rma_info.tw_total_pages) < 0);
2037 if (RMA_WINDOW_SELF == window->type)
2038 micscif_destroy_window(ep, window);
2039 else
2040 micscif_destroy_remote_window(ep, window);
2041 BUG_ON(atomic_dec_return(
2042 &ep->rma_info.tw_refcount) < 0);
2043 }
2044 goto restart;
2045 }
2046 spin_unlock(&ms_info.mi_rmalock);
2047}
2048
2049/**
2050 * __micscif_rma_destroy_tcw:
2051 *
2052 * Destroys the temporary cached windows created by scif_vreadfrom() and
2053 * scif_vwriteto() that overlap the given VA range, or all of them if inrange is false.
2054 */
2055static
2056void __micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
2057 struct endpt *ep, bool inrange,
2058 uint64_t start, uint64_t len)
2059{
2060 struct list_head *item, *tmp;
2061 struct reg_range_t *window;
2062 uint64_t start_va, end_va;
2063 uint64_t end = start + len;
2064 list_for_each_safe(item, tmp, &mmn->tc_reg_list) {
2065 window = list_entry(item,
2066 struct reg_range_t, list_member);
2067 ep = (struct endpt *)window->ep;
2068 if (inrange) {
2069 if (0 == len)
2070 break;
2071 start_va = (uint64_t)window->va_for_temp;
2072 			end_va = start_va + (window->nr_pages << PAGE_SHIFT);
2073 			if (start < start_va) {
2074 				/*
2075 				 * Windows are kept in increasing VA order, so
2076 				 * no later window can overlap the range either.
2077 				 */
2078 				if (end <= start_va)
2079 					break;
2080 			} else if (start >= end_va) {
2081 				/* Range lies entirely above this window. */
2082 				continue;
2083 			}
2084 			/* Otherwise the window overlaps the range; destroy it. */
2085 }
2086 __micscif_rma_destroy_tcw_helper(window);
2087 }
2088}
2089
2090static inline
2091void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
2092 struct endpt *ep, bool inrange,
2093 uint64_t start, uint64_t len)
2094{
2095 unsigned long sflags;
2096
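	/* tc_lock protects the per-notifier list of temporary cached windows */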
2097 spin_lock_irqsave(&ep->rma_info.tc_lock, sflags);
2098 __micscif_rma_destroy_tcw(mmn, ep, inrange, start, len);
2099 spin_unlock_irqrestore(&ep->rma_info.tc_lock, sflags);
2100}
2101
2102static void __micscif_rma_destroy_tcw_ep(struct endpt *ep)
2103{
2104 struct list_head *item, *tmp;
2105 struct rma_mmu_notifier *mmn;
2106 spin_lock(&ep->rma_info.tc_lock);
2107 list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
2108 mmn = list_entry(item,
2109 struct rma_mmu_notifier, list_member);
2110 __micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
2111 }
2112 spin_unlock(&ep->rma_info.tc_lock);
2113}
2114
2115void micscif_rma_destroy_tcw_ep(struct endpt *ep)
2116{
2117 struct list_head *item, *tmp;
2118 struct rma_mmu_notifier *mmn;
2119 list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
2120 mmn = list_entry(item,
2121 struct rma_mmu_notifier, list_member);
2122 micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
2123 }
2124}
2125
2126/**
2127 * micscif_rma_destroy_tcw_invalid:
2128 *
2129 * Destroys temporary cached windows that have been invalidated and queued
2130 * on the passed list, once their DMA transfers have retired.
2131 */
2132void micscif_rma_destroy_tcw_invalid(struct list_head *list)
2133{
2134 struct list_head *item, *tmp;
2135 struct reg_range_t *window;
2136 struct endpt *ep;
2137 struct dma_channel *chan;
2138 might_sleep();
2139restart:
2140 spin_lock(&ms_info.mi_rmalock);
2141 list_for_each_safe(item, tmp, list) {
2142 window = list_entry(item,
2143 struct reg_range_t, list_member);
2144 ep = (struct endpt *)window->ep;
2145 chan = ep->rma_info.dma_chan;
2146 list_del(&window->list_member);
2147 spin_unlock(&ms_info.mi_rmalock);
2148 micscif_inc_node_refcnt(ep->remote_dev, 1);
2149 mutex_lock(&ep->rma_info.rma_lock);
2150 if (!chan ||
2151 !scifdev_alive(ep) ||
2152 (!is_current_dma_mark(chan, window->dma_mark) &&
2153 is_dma_mark_processed(chan, window->dma_mark)) ||
2154 !drain_dma_intr(chan)) {
2155 micscif_dec_node_refcnt(ep->remote_dev, 1);
2156 BUG_ON(atomic_sub_return((int32_t)window->nr_pages,
2157 &ep->rma_info.tcw_total_pages) < 0);
2158 micscif_destroy_window(ep, window);
2159 BUG_ON(atomic_dec_return(
2160 &ep->rma_info.tcw_refcount) < 0);
2161 } else {
2162 /* DMA engine hung ?? */
2163 printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d "
2164 "window->dma_mark 0x%x channel_mark 0x%x\n",
2165 __func__, __LINE__, get_chan_num(chan),
2166 ep->sd_state, window->dma_mark, get_dma_mark(chan));
2167 WARN_ON(1);
2168 mutex_unlock(&ep->rma_info.rma_lock);
2169 micscif_dec_node_refcnt(ep->remote_dev, 1);
2170 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
2171 goto restart;
2172 }
2173 mutex_unlock(&ep->rma_info.rma_lock);
2174 goto restart;
2175 }
2176 spin_unlock(&ms_info.mi_rmalock);
2177}
2178
2179/**
2180 * micscif_rma_handle_remote_fences:
2181 *
2182 * This routine services remote fence requests.
2183 */
2184void micscif_rma_handle_remote_fences(void)
2185{
2186 struct list_head *item, *tmp;
2187 struct remote_fence_info *fence;
2188 struct endpt *ep;
2189 int mark;
2190
2191 might_sleep();
2192 mutex_lock(&ms_info.mi_fencelock);
2193 list_for_each_safe(item, tmp, &ms_info.mi_fence) {
2194 fence = list_entry(item,
2195 struct remote_fence_info, list_member);
2196 /* Remove fence from global list */
2197 list_del(&fence->list_member);
2198
2199 /* Initiate the fence operation */
2200 ep = (struct endpt *)fence->msg.payload[0];
2201 mark = (int)fence->msg.payload[2];
2202 BUG_ON(!(mark & SCIF_REMOTE_FENCE));
2203 if (dma_mark_wait(ep->rma_info.dma_chan,
2204 mark & ~SCIF_REMOTE_FENCE, false)) {
2205 printk(KERN_ERR "%s %d err\n", __func__, __LINE__);
2206 fence->msg.uop = SCIF_WAIT_NACK;
2207 } else {
2208 fence->msg.uop = SCIF_WAIT_ACK;
2209 }
2210 micscif_inc_node_refcnt(ep->remote_dev, 1);
2211 fence->msg.payload[0] = ep->remote_ep;
2212 /* No error handling for Notification messages. */
2213 micscif_nodeqp_send(ep->remote_dev, &fence->msg, ep);
2214 micscif_dec_node_refcnt(ep->remote_dev, 1);
2215 kfree(fence);
2216 /*
2217 * Decrement ref count and wake up
2218 * any thread blocked in the EP close routine waiting
2219 * for all such remote fence requests to complete.
2220 */
2221 ep->rma_info.fence_refcount--;
2222 wake_up(&ep->rma_info.fence_wq);
2223 }
2224 mutex_unlock(&ms_info.mi_fencelock);
2225}
2226
2227#ifdef CONFIG_MMU_NOTIFIER
2228void micscif_mmu_notif_handler(struct work_struct *work)
2229{
2230 struct list_head *pos, *tmpq;
2231 struct endpt *ep;
2232restart:
2233 micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
2234 spin_lock(&ms_info.mi_rmalock);
2235 list_for_each_safe(pos, tmpq, &ms_info.mi_mmu_notif_cleanup) {
2236 ep = list_entry(pos, struct endpt, mmu_list);
2237 list_del(&ep->mmu_list);
2238 spin_unlock(&ms_info.mi_rmalock);
2239 BUG_ON(list_empty(&ep->rma_info.mmn_list));
2240
2241 micscif_rma_destroy_tcw_ep(ep);
2242 ep_unregister_mmu_notifier(ep);
2243 queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
2244 goto restart;
2245 }
2246 spin_unlock(&ms_info.mi_rmalock);
2247}
2248#endif
2249
2250/**
2251 * micscif_reserve_dma_chan:
2252 * @ep: Endpoint Descriptor.
2253 *
2254 * This routine reserves a DMA channel for a particular
2255 * endpoint. All DMA transfers for an endpoint are always
2256 * programmed on the same DMA channel.
2257 */
2258int micscif_reserve_dma_chan(struct endpt *ep)
2259{
2260 int err = 0;
2261#ifndef _MIC_SCIF_
2262 /*
2263 * Host Loopback cannot use DMA by design and hence
2264 * reserving DMA channels is a nop.
2265 */
2266 if (is_self_scifdev(ep->remote_dev))
2267 return 0;
2268#endif
2269 mutex_lock(&ep->rma_info.rma_lock);
2270 if (!ep->rma_info.dma_chan) {
2271 struct dma_channel **chan = &ep->rma_info.dma_chan;
2272 unsigned long ts = jiffies;
2273#ifndef _MIC_SCIF_
2274 mic_ctx_t *mic_ctx =
2275 get_per_dev_ctx(ep->remote_dev->sd_node - 1);
2276 BUG_ON(!ep->remote_dev->sd_node);
2277#endif
2278 while (true) {
2279 if (!(err = allocate_dma_channel((struct mic_dma_ctx_t *)
2280#ifdef _MIC_SCIF_
2281 mic_dma_handle,
2282#else
2283 mic_ctx->dma_handle,
2284#endif
2285 chan)))
2286 break;
2287 schedule();
2288 if (time_after(jiffies,
2289 ts + NODE_ALIVE_TIMEOUT)) {
2290 err = -EBUSY;
2291 goto error;
2292 }
2293 }
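		/*
		 * The chosen channel is recorded in ep->rma_info.dma_chan and is
		 * re-acquired with request_dma_channel() when a transfer is
		 * actually programmed; it is not held while the endpoint is idle.
		 */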
2294 mic_dma_thread_free_chan(*chan);
2295 }
2296error:
2297 mutex_unlock(&ep->rma_info.rma_lock);
2298 return err;
2299}
2300
2301/*
2302 * micscif_prog_signal:
2303 * @epd - Endpoint Descriptor
2304 * @offset - registered address
2305 * @val - Value to be programmed in SUD.
2306 * @type - Type of the window.
2307 *
2308 * Program a status update descriptor after ensuring that the offset
2309 * provided is indeed valid.
2310 */
2311int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val,
2312 enum rma_window_type type)
2313{
2314 struct endpt *ep = (struct endpt *)epd;
2315 struct dma_channel *chan = ep->rma_info.dma_chan;
2316 struct reg_range_t *window = NULL;
2317 struct micscif_rma_req req;
2318 int err;
2319 dma_addr_t phys;
2320
2321 mutex_lock(&ep->rma_info.rma_lock);
2322 req.out_window = &window;
2323 req.offset = offset;
2324 req.nr_bytes = sizeof(uint64_t);
2325 req.prot = SCIF_PROT_WRITE;
2326 req.type = WINDOW_SINGLE;
2327 if (RMA_WINDOW_SELF == type)
2328 req.head = &ep->rma_info.reg_list;
2329 else
2330 req.head = &ep->rma_info.remote_reg_list;
2331 /* Does a valid window exist? */
2332 if ((err = micscif_query_window(&req))) {
2333 printk(KERN_ERR "%s %d err %d\n",
2334 __func__, __LINE__, err);
2335 goto unlock_ret;
2336 }
2337 RMA_MAGIC(window);
2338
2339#ifndef _MIC_SCIF_
2340 if (unlikely(is_self_scifdev(ep->remote_dev))) {
2341 void *dst_virt;
2342 if (RMA_WINDOW_SELF == type)
2343 dst_virt = get_local_va(offset, window,
2344 						sizeof(uint64_t));
2345 else {
2346 struct page **pages = ((struct reg_range_t *)
2347 (window->peer_window))->pinned_pages->pages;
2348 			int page_nr = (int)((offset - window->offset) >> PAGE_SHIFT);
2349 off_t page_off = offset & ~PAGE_MASK;
2350 dst_virt = (void *)((uint64_t)phys_to_virt(page_to_phys(
2351 pages[page_nr])) | page_off);
2352 }
2353 *(uint64_t*)dst_virt = val;
2354 goto unlock_ret;
2355 }
2356#endif
2357 phys = micscif_get_dma_addr(window, offset, NULL, NULL, NULL);
2358 if ((err = request_dma_channel(chan)))
2359 goto unlock_ret;
2360 err = do_status_update(chan, phys, val);
2361 free_dma_channel(chan);
2362unlock_ret:
2363 mutex_unlock(&ep->rma_info.rma_lock);
2364 return err;
2365}
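
/*
 * Illustrative sketch (an assumption about the callers): the handlers for
 * SCIF_SIG_LOCAL/SCIF_SIG_REMOTE and the local scif_fence_signal() path are
 * expected to perform the requested store through this routine, e.g.:
 *
 *	err = micscif_prog_signal(epd, offset, val, RMA_WINDOW_SELF);
 */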
2366
2367/*
2368 * __micscif_kill_apps_with_mmaps:
2369 * @ep - The SCIF endpoint
2370 *
2371 * Kill the applications which have valid remote memory mappings
2372 * created via scif_mmap(..).
2373 */
2374static void __micscif_kill_apps_with_mmaps(struct endpt *ep)
2375{
2376 struct list_head *item;
2377 struct rma_task_info *info;
2378
2379 spin_lock(&ep->lock);
2380 list_for_each(item, &ep->rma_info.task_list) {
2381 info = list_entry(item, struct rma_task_info, list_member);
2382 kill_pid(info->pid, SIGKILL, 1);
2383 pr_debug("%s ep %p pid %p ref %d\n",
2384 __func__, ep, info->pid, info->ref_count);
2385 }
2386 spin_unlock(&ep->lock);
2387}
2388
2389/*
2390 * _micscif_kill_apps_with_mmaps:
2391 * @node - remote node id.
2392 * @head - head of the list of endpoints to kill.
2393 *
2394 * Traverse the list of endpoints for a particular remote node and
2395 * kill applications with valid remote memory mappings.
2396 */
2397static void _micscif_kill_apps_with_mmaps(int node, struct list_head *head)
2398{
2399 struct endpt *ep;
2400 unsigned long sflags;
2401 struct list_head *item;
2402
2403 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
2404 list_for_each(item, head) {
2405 ep = list_entry(item, struct endpt, list);
2406 if (ep->remote_dev->sd_node == node)
2407 __micscif_kill_apps_with_mmaps(ep);
2408 }
2409 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
2410}
2411
2412/*
2413 * micscif_kill_apps_with_mmaps:
2414 * @node - remote node id.
2415 *
2416 * Wrapper for killing applications with valid remote memory mappings
2417 * for a particular node. This API is called by peer nodes as part of
2418 * handling a lost node.
2419 */
2420void micscif_kill_apps_with_mmaps(int node)
2421{
2422 _micscif_kill_apps_with_mmaps(node, &ms_info.mi_connected);
2423 _micscif_kill_apps_with_mmaps(node, &ms_info.mi_disconnected);
2424}
2425
2426/*
2427 * micscif_query_apps_with_mmaps:
2428 * @node - remote node id.
2429 * @head - head of the list of endpoints to query.
2430 *
2431 * Query if any applications for a remote node have valid remote memory
2432 * mappings.
2433 */
2434static bool micscif_query_apps_with_mmaps(int node, struct list_head *head)
2435{
2436 struct endpt *ep;
2437 unsigned long sflags;
2438 struct list_head *item;
2439 bool ret = false;
2440
2441 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
2442 list_for_each(item, head) {
2443 ep = list_entry(item, struct endpt, list);
2444 if (ep->remote_dev->sd_node == node &&
2445 !list_empty(&ep->rma_info.task_list)) {
2446 ret = true;
2447 break;
2448 }
2449 }
2450 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
2451 return ret;
2452}
2453
2454/*
2455 * micscif_rma_do_apps_have_mmaps:
2456 * @node - remote node id.
2457 *
2458 * Wrapper for querying if any applications have remote memory mappings
2459 * for a particular node.
2460 */
2461bool micscif_rma_do_apps_have_mmaps(int node)
2462{
2463 return (micscif_query_apps_with_mmaps(node, &ms_info.mi_connected) ||
2464 micscif_query_apps_with_mmaps(node, &ms_info.mi_disconnected));
2465}
2466
2467/*
2468 * __micscif_cleanup_rma_for_zombies:
2469 * @ep - The SCIF endpoint
2470 *
2471 * This API is only called while handling a lost node:
2472 * a) Remote node is dead.
2473 * b) All endpoints with remote memory mappings have been killed.
2474 * So we can traverse the remote_reg_list without any locks. Since
2475 * the window has not yet been unregistered we can drop the ref count
2476 * and queue it to the cleanup thread.
2477 */
2478static void __micscif_cleanup_rma_for_zombies(struct endpt *ep)
2479{
2480 struct list_head *pos, *tmp;
2481 struct reg_range_t *window;
2482
2483 list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
2484 window = list_entry(pos, struct reg_range_t, list_member);
2485 /* If unregistration is complete then why is it on the list? */
2486 WARN_ON(window->unreg_state == OP_COMPLETED);
2487 if (window->ref_count)
2488 put_window_ref_count(window, window->nr_pages);
2489 if (!window->ref_count) {
2490 atomic_inc(&ep->rma_info.tw_refcount);
2491 atomic_add_return((int32_t)window->nr_pages,
2492 &ep->rma_info.tw_total_pages);
2493 list_del(&window->list_member);
2494 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
2495 }
2496 }
2497}
2498
2499/*
2500 * micscif_cleanup_rma_for_zombies:
2501 * @node - remote node id.
2502 *
2503 * Cleanup remote registration lists for zombie endpoints.
2504 */
2505void micscif_cleanup_rma_for_zombies(int node)
2506{
2507 struct endpt *ep;
2508 unsigned long sflags;
2509 struct list_head *item;
2510
2511 spin_lock_irqsave(&ms_info.mi_eplock, sflags);
2512 list_for_each(item, &ms_info.mi_zombie) {
2513 ep = list_entry(item, struct endpt, list);
2514 if (ep->remote_dev && ep->remote_dev->sd_node == node) {
2515 /*
2516 * If the zombie endpoint remote node matches the lost
2517 * node then the scifdev should not be alive.
2518 */
2519 WARN_ON(scifdev_alive(ep));
2520 __micscif_cleanup_rma_for_zombies(ep);
2521 }
2522 }
2523 spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
2524}
2525
2526/*
2527 * micscif_rma_get_task:
2528 *
2529 * Store the parent task's pid and bump up the number of remote mappings.
2530 * If this is the first remote memory mapping for this endpoint then
2531 * create a new rma_task_info entry in the epd task list.
2532 */
2533int micscif_rma_get_task(struct endpt *ep, int nr_pages)
2534{
2535 struct list_head *item;
2536 struct rma_task_info *info;
2537 int err = 0;
2538
2539 spin_lock(&ep->lock);
2540 list_for_each(item, &ep->rma_info.task_list) {
2541 info = list_entry(item, struct rma_task_info, list_member);
2542 if (info->pid == task_tgid(current)) {
2543 info->ref_count += nr_pages;
2544 pr_debug("%s ep %p existing pid %p ref %d\n",
2545 __func__, ep, info->pid, info->ref_count);
2546 goto unlock;
2547 }
2548 }
2549 spin_unlock(&ep->lock);
2550
2551 /* A new task is mapping this window. Create a new entry */
2552 if (!(info = kzalloc(sizeof(*info), GFP_KERNEL))) {
2553 err = -ENOMEM;
2554 goto done;
2555 }
2556
2557 info->pid = get_pid(task_tgid(current));
2558 info->ref_count = nr_pages;
2559 pr_debug("%s ep %p new pid %p ref %d\n",
2560 __func__, ep, info->pid, info->ref_count);
2561 spin_lock(&ep->lock);
2562 list_add_tail(&info->list_member, &ep->rma_info.task_list);
2563unlock:
2564 spin_unlock(&ep->lock);
2565done:
2566 return err;
2567}
2568
2569/*
2570 * micscif_rma_put_task:
2571 *
2572 * Bump down the number of remote mappings. If the ref count for this
2573 * particular task drops to zero then remove the rma_task_info from
2574 * the epd task list.
2575 */
2576void micscif_rma_put_task(struct endpt *ep, int nr_pages)
2577{
2578 struct list_head *item;
2579 struct rma_task_info *info;
2580
2581 spin_lock(&ep->lock);
2582 list_for_each(item, &ep->rma_info.task_list) {
2583 info = list_entry(item, struct rma_task_info, list_member);
2584 if (info->pid == task_tgid(current)) {
2585 info->ref_count -= nr_pages;
2586 pr_debug("%s ep %p pid %p ref %d\n",
2587 __func__, ep, info->pid, info->ref_count);
2588 if (!info->ref_count) {
2589 list_del(&info->list_member);
2590 put_pid(info->pid);
2591 kfree(info);
2592 }
2593 goto done;
2594 }
2595 }
2596 /* Why was the task not found? This is a bug. */
2597 WARN_ON(1);
2598done:
2599 spin_unlock(&ep->lock);
2600 return;
2601}
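
/*
 * Illustrative pairing sketch (an assumption about the scif_mmap() paths):
 * a remote mapping is expected to take a task reference before the pages
 * are mapped and drop it again on teardown:
 *
 *	err = micscif_rma_get_task(ep, nr_pages);	(before mapping)
 *	...
 *	micscif_rma_put_task(ep, nr_pages);		(after unmapping)
 */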
2602
2603/* Only debug APIs below */
2604void micscif_display_window(struct reg_range_t *window, const char *s, int line)
2605{
2606 int j;
2607
2608 printk("%s %d window %p type %d temp %d offset 0x%llx"
2609 " nr_pages 0x%llx nr_contig_chunks 0x%llx"
2610 " prot %d ref_count %d magic 0x%llx peer_window 0x%llx"
2611 " unreg_state 0x%x va_for_temp %p\n",
2612 s, line, window, window->type, window->temp,
2613 window->offset, window->nr_pages, window->nr_contig_chunks,
2614 window->prot, window->ref_count, window->magic,
2615 window->peer_window, window->unreg_state, window->va_for_temp);
2616
2617 for (j = 0; j < window->nr_contig_chunks; j++)
2618 pr_debug("page[%d] = dma_addr 0x%llx num_pages 0x%x\n",
2619 j,
2620 window->dma_addr[j],
2621 window->num_pages[j]);
2622
2623 if (RMA_WINDOW_SELF == window->type && window->pinned_pages)
2624 for (j = 0; j < window->nr_pages; j++)
2625 pr_debug("page[%d] = pinned_pages %p address %p\n",
2626 j, window->pinned_pages->pages[j],
2627 page_address(window->pinned_pages->pages[j]));
2628
2629#ifdef CONFIG_ML1OM
2630 if (window->temp_phys_addr)
2631 for (j = 0; j < window->nr_contig_chunks; j++)
2632 pr_debug("page[%d] = temp_phys_addr 0x%llx\n",
2633 j, window->temp_phys_addr[j]);
2634 if (window->phys_addr)
2635 for (j = 0; j < window->nr_pages; j++)
2636 pr_debug("page[%d] = phys_addr 0x%llx\n",
2637 j, window->phys_addr[j]);
2638#endif
2639 RMA_MAGIC(window);
2640}