Updated micscif/micscif_rma.c to use put_page() instead of page_cache_release().
[xeon-phi-kernel-module] / micscif / micscif_rma.c
1/*
2 * Copyright 2010-2017 Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2,
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * Disclaimer: The codes contained in these modules may be specific to
14 * the Intel Software Development Platform codenamed Knights Ferry,
15 * and the Intel product codenamed Knights Corner, and are not backward
16 * compatible with other Intel products. Additionally, Intel will NOT
17 * support the codes or instruction set in future products.
18 *
19 * Intel offers no warranty of any kind regarding the code. This code is
20 * licensed on an "AS IS" basis and Intel is not obligated to provide
21 * any support, assistance, installation, training, or other services
22 * of any kind. Intel is also not obligated to provide any updates,
23 * enhancements or extensions. Intel specifically disclaims any warranty
24 * of merchantability, non-infringement, fitness for any particular
25 * purpose, and any other warranty.
26 *
27 * Further, Intel disclaims all liability of any kind, including but
28 * not limited to liability for infringement of any proprietary rights,
29 * relating to the use of the code, even if Intel is notified of the
30 * possibility of such liability. Except as expressly stated in an Intel
31 * license agreement provided with this code and agreed upon with Intel,
32 * no license, express or implied, by estoppel or otherwise, to any
33 * intellectual property rights is granted herein.
34 */
35
36#include "mic/micscif.h"
37#include "mic/micscif_smpt.h"
38#include "mic/micscif_kmem_cache.h"
39#include "mic/micscif_rma_list.h"
40#ifndef _MIC_SCIF_
41#include "mic_common.h"
42#endif
43#include "mic/mic_dma_api.h"
44#include "mic/micscif_map.h"
45
46bool mic_reg_cache_enable = 0;
47
48bool mic_huge_page_enable = 1;
49
50#ifdef _MIC_SCIF_
51mic_dma_handle_t mic_dma_handle;
52#endif
53static inline
54void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
55 struct endpt *ep, bool inrange,
56 uint64_t start, uint64_t len);
57#ifdef CONFIG_MMU_NOTIFIER
58static void scif_mmu_notifier_release(struct mmu_notifier *mn,
59 struct mm_struct *mm);
60static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
61 struct mm_struct *mm,
62 unsigned long address);
63static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
64 struct mm_struct *mm,
65 unsigned long start, unsigned long end);
66static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
67 struct mm_struct *mm,
68 unsigned long start, unsigned long end);
69static const struct mmu_notifier_ops scif_mmu_notifier_ops = {
70 .release = scif_mmu_notifier_release,
71 .clear_flush_young = NULL,
72 .change_pte = NULL,/*TODO*/
73 .invalidate_page = scif_mmu_notifier_invalidate_page,
74 .invalidate_range_start = scif_mmu_notifier_invalidate_range_start,
75 .invalidate_range_end = scif_mmu_notifier_invalidate_range_end};
76
77static void scif_mmu_notifier_release(struct mmu_notifier *mn,
78 struct mm_struct *mm)
79{
80 struct endpt *ep;
81 struct rma_mmu_notifier *mmn;
82 mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
83 ep = mmn->ep;
84 micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
85 pr_debug("%s\n", __func__);
86 return;
87}
88
89static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
90 struct mm_struct *mm,
91 unsigned long address)
92{
93 struct endpt *ep;
94 struct rma_mmu_notifier *mmn;
95 mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
96 ep = mmn->ep;
97 micscif_rma_destroy_tcw(mmn, ep, true, address, PAGE_SIZE);
98 pr_debug("%s address 0x%lx\n", __func__, address);
99 return;
100}
101
102static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
103 struct mm_struct *mm,
104 unsigned long start, unsigned long end)
105{
106 struct endpt *ep;
107 struct rma_mmu_notifier *mmn;
108 mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
109 ep = mmn->ep;
110 micscif_rma_destroy_tcw(mmn, ep, true, (uint64_t)start, (uint64_t)(end - start));
111 pr_debug("%s start=%lx, end=%lx\n", __func__, start, end);
112 return;
113}
114
115static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
116 struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 /* Nothing to do here, everything needed was done in invalidate_range_start */
120 pr_debug("%s\n", __func__);
121 return;
122}
123#endif
124
125#ifdef CONFIG_MMU_NOTIFIER
126void ep_unregister_mmu_notifier(struct endpt *ep)
127{
128 struct endpt_rma_info *rma = &ep->rma_info;
129 struct rma_mmu_notifier *mmn = NULL;
130 struct list_head *item, *tmp;
131 mutex_lock(&ep->rma_info.mmn_lock);
132 list_for_each_safe(item, tmp, &rma->mmn_list) {
133 mmn = list_entry(item,
134 struct rma_mmu_notifier, list_member);
135 mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm);
136#ifdef RMA_DEBUG
137 BUG_ON(atomic_long_sub_return(1, &ms_info.mmu_notif_cnt) < 0);
138#endif
139 list_del(item);
140 kfree(mmn);
141 }
142 mutex_unlock(&ep->rma_info.mmn_lock);
143}
144
145static void init_mmu_notifier(struct rma_mmu_notifier *mmn, struct mm_struct *mm, struct endpt *ep)
146{
147 mmn->ep = ep;
148 mmn->mm = mm;
149 mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
150 INIT_LIST_HEAD(&mmn->list_member);
151 INIT_LIST_HEAD(&mmn->tc_reg_list);
152}
153
154static struct rma_mmu_notifier *find_mmu_notifier(struct mm_struct *mm, struct endpt_rma_info *rma)
155{
156 struct rma_mmu_notifier *mmn;
157 struct list_head *item;
158 list_for_each(item, &rma->mmn_list) {
159 mmn = list_entry(item,
160 struct rma_mmu_notifier, list_member);
161 if (mmn->mm == mm)
162 return mmn;
163 }
164 return NULL;
165}
166#endif
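/*
 * Editor's note (sketch, not part of the original file): micscif_rma_copy()
 * below uses init_mmu_notifier()/find_mmu_notifier() in a find-or-create
 * pattern, one notifier per mm_struct, guarded by rma_info.mmn_lock.
 * A minimal sketch of that pattern, with a hypothetical helper name:
 */
#if 0	/* illustrative only */
static struct rma_mmu_notifier *get_or_create_mmn(struct endpt *ep,
						  struct mm_struct *mm)
{
	struct rma_mmu_notifier *mmn;

	mutex_lock(&ep->rma_info.mmn_lock);
	mmn = find_mmu_notifier(mm, &ep->rma_info);
	if (!mmn) {
		mmn = kzalloc(sizeof(*mmn), GFP_KERNEL);
		if (mmn) {
			init_mmu_notifier(mmn, mm, ep);
			if (mmu_notifier_register(&mmn->ep_mmu_notifier, mm)) {
				kfree(mmn);
				mmn = NULL;
			} else {
				list_add(&mmn->list_member,
					 &ep->rma_info.mmn_list);
			}
		}
	}
	mutex_unlock(&ep->rma_info.mmn_lock);
	return mmn;
}
#endif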
167
168/**
169 * micscif_rma_ep_init:
170 * @ep: end point
171 *
172 * Initialize RMA per EP data structures.
173 */
174int micscif_rma_ep_init(struct endpt *ep)
175{
176 int ret;
177 struct endpt_rma_info *rma = &ep->rma_info;
178
179 mutex_init (&rma->rma_lock);
180 if ((ret = va_gen_init(&rma->va_gen,
181 VA_GEN_MIN, VA_GEN_RANGE)) < 0)
182 goto init_err;
183 spin_lock_init(&rma->tc_lock);
184 mutex_init (&rma->mmn_lock);
185 mutex_init (&rma->va_lock);
186 INIT_LIST_HEAD(&rma->reg_list);
187 INIT_LIST_HEAD(&rma->remote_reg_list);
188 atomic_set(&rma->tw_refcount, 0);
189 atomic_set(&rma->tw_total_pages, 0);
190 atomic_set(&rma->tcw_refcount, 0);
191 atomic_set(&rma->tcw_total_pages, 0);
192 init_waitqueue_head(&rma->fence_wq);
193 rma->fence_refcount = 0;
194 rma->async_list_del = 0;
195 rma->dma_chan = NULL;
196 INIT_LIST_HEAD(&rma->mmn_list);
197 INIT_LIST_HEAD(&rma->task_list);
198init_err:
199 return ret;
200}
201
202/**
203 * micscif_rma_ep_can_uninit:
204 * @ep: end point
205 *
206 * Returns 1 if an endpoint can be uninitialized and 0 otherwise.
207 */
208int micscif_rma_ep_can_uninit(struct endpt *ep)
209{
210 int ret = 0;
211
212 /* Destroy RMA Info only if both lists are empty */
213 if (list_empty(&ep->rma_info.reg_list) &&
214 list_empty(&ep->rma_info.remote_reg_list) &&
215#ifdef CONFIG_MMU_NOTIFIER
216 list_empty(&ep->rma_info.mmn_list) &&
217#endif
218 !atomic_read(&ep->rma_info.tw_refcount) &&
219 !atomic_read(&ep->rma_info.tcw_refcount))
220 ret = 1;
221 return ret;
222}
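/*
 * Editor's note: micscif_rma_ep_can_uninit() is a teardown gate; per-EP RMA
 * state may only be destroyed once both registration lists (and, with
 * CONFIG_MMU_NOTIFIER, the notifier list) are empty and no temporary (tw)
 * or cached temporary (tcw) windows hold references. A hypothetical caller:
 *
 *	if (micscif_rma_ep_can_uninit(ep))
 *		free_ep_rma_state(ep);	// hypothetical helper
 */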
223
224#ifdef _MIC_SCIF_
225/**
226 * __micscif_setup_proxy_dma:
227 * @ep: SCIF endpoint descriptor.
228 *
229 * Sets up data structures for P2P Proxy DMAs.
230 */
231static int __micscif_setup_proxy_dma(struct endpt *ep)
232{
233 struct endpt_rma_info *rma = &ep->rma_info;
234 int err = 0;
235 uint64_t *tmp = NULL;
236
237 mutex_lock(&rma->rma_lock);
238 if (is_p2p_scifdev(ep->remote_dev) && !rma->proxy_dma_va) {
239 if (!(tmp = scif_zalloc(PAGE_SIZE))) {
240 err = -ENOMEM;
241 goto error;
242 }
243 if ((err = map_virt_into_aperture(&rma->proxy_dma_phys,
244 tmp,
245 ep->remote_dev, PAGE_SIZE))) {
246 scif_free(tmp, PAGE_SIZE);
247 goto error;
248 }
249 *tmp = OP_IDLE;
250 rma->proxy_dma_va = tmp;
251 }
252error:
253 mutex_unlock(&rma->rma_lock);
254 return err;
255}
256
257static __always_inline int micscif_setup_proxy_dma(struct endpt *ep)
258{
259 if (ep->rma_info.proxy_dma_va)
260 return 0;
261
262 return __micscif_setup_proxy_dma(ep);
263}
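/*
 * Editor's note: micscif_setup_proxy_dma() is the lock-free fast path; it
 * only peeks at proxy_dma_va. The slow path, __micscif_setup_proxy_dma(),
 * re-checks the field under rma_lock, so two racing callers cannot both
 * allocate and map the proxy DMA page.
 */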
264
265/**
266 * micscif_teardown_proxy_dma:
267 * @ep: SCIF endpoint descriptor.
268 *
269 * Tears down data structures setup for P2P Proxy DMAs.
270 */
271void micscif_teardown_proxy_dma(struct endpt *ep)
272{
273 struct endpt_rma_info *rma = &ep->rma_info;
274 mutex_lock(&rma->rma_lock);
275 if (rma->proxy_dma_va) {
276 unmap_from_aperture(rma->proxy_dma_phys, ep->remote_dev, PAGE_SIZE);
277 scif_free(rma->proxy_dma_va, PAGE_SIZE);
278 rma->proxy_dma_va = NULL;
279 }
280 mutex_unlock(&rma->rma_lock);
281}
282
283/**
284 * micscif_proxy_dma:
285 * @ep: SCIF endpoint descriptor.
286 * @copy_work: DMA copy work information.
287 *
288 * This API does the following:
289 * 1) Sends the peer a SCIF Node QP message with the information
290 * required to program a proxy DMA to convert a P2P Read to a Write
291 * which will initiate a DMA transfer from the peer card to self.
292 * The reason for this special code path is KNF and KNC P2P read
293 * performance being much lower than P2P write performance on Crown
294 * Pass platforms.
295 * 2) Poll for an update of the known proxy dma VA to OP_COMPLETED
296 * via a SUD by the peer.
297 */
298static int micscif_proxy_dma(scif_epd_t epd, struct mic_copy_work *work)
299{
300 struct endpt *ep = (struct endpt *)epd;
301 struct nodemsg msg;
302 unsigned long ts = jiffies;
303 struct endpt_rma_info *rma = &ep->rma_info;
304 int err;
305 volatile uint64_t *proxy_dma_va = rma->proxy_dma_va;
306
307 mutex_lock(&ep->rma_info.rma_lock);
308 /*
309 * Bail out if there is a Proxy DMA already in progress
310 * for this endpoint. The callee will fallback on self
311 * DMAs upon an error.
312 */
313 if (*proxy_dma_va != OP_IDLE) {
314 mutex_unlock(&ep->rma_info.rma_lock);
315 err = -EBUSY;
316 goto error;
317 }
318 *proxy_dma_va = OP_IN_PROGRESS;
319 mutex_unlock(&ep->rma_info.rma_lock);
320
321 msg.src = ep->port;
322 msg.uop = work->ordered ? SCIF_PROXY_ORDERED_DMA : SCIF_PROXY_DMA;
323 msg.payload[0] = ep->remote_ep;
324 msg.payload[1] = work->src_offset;
325 msg.payload[2] = work->dst_offset;
326 msg.payload[3] = work->len;
327
328 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
329 goto error_init_va;
330
331 while (*proxy_dma_va != OP_COMPLETED) {
332 schedule();
333 if (time_after(jiffies,
334 ts + NODE_ALIVE_TIMEOUT)) {
335 err = -EBUSY;
336 goto error_init_va;
337 }
338 }
339 err = 0;
340error_init_va:
341 *proxy_dma_va = OP_IDLE;
342error:
343 return err;
344}
345#endif
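/*
 * Editor's note: as the comment above micscif_proxy_dma() says, callers are
 * expected to fall back to a regular self DMA when this routine fails
 * (e.g. -EBUSY while another proxy DMA is in flight). micscif_rma_copy()
 * below does exactly that: it only proxies P2P reads whose length is at
 * least ms_info.mi_proxy_dma_threshold and continues with the normal DMA
 * path if proxying returns an error.
 */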
346
347/**
348 * micscif_create_pinned_pages:
349 * @nr_pages: number of pages in window
350 * @prot: read/write protection
351 *
352 * Allocate and prepare a set of pinned pages.
353 */
354struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot)
355{
356 struct scif_pinned_pages *pinned_pages;
357
358 might_sleep();
359 if (!(pinned_pages = scif_zalloc(sizeof(*pinned_pages))))
360 goto error;
361
362 if (!(pinned_pages->pages = scif_zalloc(nr_pages *
363 sizeof(*(pinned_pages->pages)))))
364 goto error_free_pinned_pages;
365
366 if (!(pinned_pages->num_pages = scif_zalloc(nr_pages *
367 sizeof(*(pinned_pages->num_pages)))))
368 goto error_free_pages;
369
370#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
371 if (!(pinned_pages->vma = scif_zalloc(nr_pages *
372 sizeof(*(pinned_pages->vma)))))
373 goto error_free_num_pages;
374#endif
375
376 pinned_pages->prot = prot;
377 pinned_pages->magic = SCIFEP_MAGIC;
378 pinned_pages->nr_contig_chunks = 0;
379 return pinned_pages;
380
381#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
382error_free_num_pages:
383 scif_free(pinned_pages->num_pages,
384 nr_pages * sizeof(*(pinned_pages->num_pages)));
385#endif
386error_free_pages:
387 scif_free(pinned_pages->pages,
388 nr_pages * sizeof(*(pinned_pages->pages)));
389error_free_pinned_pages:
390 scif_free(pinned_pages, sizeof(*pinned_pages));
391error:
392 return NULL;
393}
394
395/**
396 * micscif_destroy_pinned_pages:
397 * @pinned_pages: A set of pinned pages.
398 *
399 * Deallocate resources for pinned pages.
400 */
401int micscif_destroy_pinned_pages(struct scif_pinned_pages *pinned_pages)
402{
403 int j;
404 int writeable = pinned_pages->prot & SCIF_PROT_WRITE;
405 int kernel = SCIF_MAP_KERNEL & pinned_pages->map_flags;
406
407 for (j = 0; j < pinned_pages->nr_pages; j++) {
408 if (pinned_pages->pages[j]) {
409 if (!kernel) {
410 if (writeable)
411 SetPageDirty(pinned_pages->pages[j]);
412#ifdef RMA_DEBUG
413 BUG_ON(!page_count(pinned_pages->pages[j]));
414 BUG_ON(atomic_long_sub_return(1, &ms_info.rma_pin_cnt) < 0);
415#endif
416 put_page(pinned_pages->pages[j]);
417 }
418 }
419 }
420
421#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
422 scif_free(pinned_pages->vma,
423 pinned_pages->nr_pages * sizeof(*(pinned_pages->vma)));
424#endif
425 scif_free(pinned_pages->pages,
426 pinned_pages->nr_pages * sizeof(*(pinned_pages->pages)));
427 scif_free(pinned_pages->num_pages,
428 pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages)));
429 scif_free(pinned_pages, sizeof(*pinned_pages));
430 return 0;
431}
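/*
 * Editor's note: the put_page() call above is the change described at the
 * top of this listing (previously page_cache_release(), which was removed
 * from mainline around v4.6). A hypothetical compatibility shim for
 * building against older kernels could look like this; it is not part of
 * the original file:
 */
#if 0	/* illustrative only */
#include <linux/version.h>
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0)
#define scif_put_user_page(p)	page_cache_release(p)
#else
#define scif_put_user_page(p)	put_page(p)
#endif
#endif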
432
433/*
434 * micscif_create_window:
435 * @ep: end point
436 * @nr_pages: number of pages which will back this window.
437 * @offset: offset hint
438 *
439 * Allocate and prepare a self registration window.
440 */
441struct reg_range_t *micscif_create_window(struct endpt *ep,
442 int64_t nr_pages, uint64_t offset, bool temp)
443{
444 struct reg_range_t *window;
445
446 might_sleep();
447 if (!(window = scif_zalloc(sizeof(struct reg_range_t))))
448 goto error;
449
450#ifdef CONFIG_ML1OM
451 if (!temp) {
452 if (!(window->phys_addr = scif_zalloc(nr_pages *
453 sizeof(*(window->phys_addr)))))
454 goto error_free_window;
455
456 if (!(window->temp_phys_addr = scif_zalloc(nr_pages *
457 sizeof(*(window->temp_phys_addr)))))
458 goto error_free_window;
459 }
460#endif
461
462 if (!(window->dma_addr = scif_zalloc(nr_pages *
463 sizeof(*(window->dma_addr)))))
464 goto error_free_window;
465
466 if (!(window->num_pages = scif_zalloc(nr_pages *
467 sizeof(*(window->num_pages)))))
468 goto error_free_window;
469
470 window->offset = offset;
471 window->ep = (uint64_t)ep;
472 window->magic = SCIFEP_MAGIC;
473 window->reg_state = OP_IDLE;
474 init_waitqueue_head(&window->regwq);
475 window->unreg_state = OP_IDLE;
476 init_waitqueue_head(&window->unregwq);
477 INIT_LIST_HEAD(&window->list_member);
478 window->type = RMA_WINDOW_SELF;
479 window->temp = temp;
480#ifdef _MIC_SCIF_
481 micscif_setup_proxy_dma(ep);
482#endif
483 return window;
484
485error_free_window:
486 if (window->dma_addr)
487 scif_free(window->dma_addr, nr_pages * sizeof(*(window->dma_addr)));
488#ifdef CONFIG_ML1OM
489 if (window->temp_phys_addr)
490 scif_free(window->temp_phys_addr, nr_pages * sizeof(*(window->temp_phys_addr)));
491 if (window->phys_addr)
492 scif_free(window->phys_addr, nr_pages * sizeof(*(window->phys_addr)));
493#endif
494 scif_free(window, sizeof(*window));
495error:
496 return NULL;
497}
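/*
 * Editor's note: the usual sequence for setting up a self window, as used
 * by micscif_register_temp() later in this file, is roughly (sketch only,
 * error handling omitted):
 *
 *	micscif_get_window_offset(ep, 0, 0, aligned_len, &offset);
 *	window = micscif_create_window(ep, aligned_len >> PAGE_SHIFT,
 *				       offset, true);
 *	window->pinned_pages = pinned_pages;	// from __scif_pin_pages()
 *	micscif_map_window_pages(ep, window, true);
 */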
498
499/**
500 * micscif_destroy_incomplete_window:
501 * @ep: end point
502 * @window: registration window
503 *
504 * Deallocate resources for self window.
505 */
506int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window)
507{
508 int err;
509 int64_t nr_pages = window->nr_pages;
510 struct allocmsg *alloc = &window->alloc_handle;
511 struct nodemsg msg;
512
513 RMA_MAGIC(window);
514retry:
515 err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
516 if (!err && scifdev_alive(ep))
517 goto retry;
518
519 if (OP_COMPLETED == alloc->state) {
520 msg.uop = SCIF_FREE_VIRT;
521 msg.src = ep->port;
522 msg.payload[0] = ep->remote_ep;
523 msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
524 msg.payload[2] = (uint64_t)window;
525 msg.payload[3] = SCIF_REGISTER;
526 micscif_nodeqp_send(ep->remote_dev, &msg, ep);
527 }
528
529 micscif_free_window_offset(ep, window->offset,
530 window->nr_pages << PAGE_SHIFT);
531 if (window->dma_addr)
532 scif_free(window->dma_addr, nr_pages *
533 sizeof(*(window->dma_addr)));
534 if (window->num_pages)
535 scif_free(window->num_pages, nr_pages *
536 sizeof(*(window->num_pages)));
537#ifdef CONFIG_ML1OM
538 if (window->phys_addr)
539 scif_free(window->phys_addr, window->nr_pages *
540 sizeof(*(window->phys_addr)));
541 if (window->temp_phys_addr)
542 scif_free(window->temp_phys_addr, nr_pages *
543 sizeof(*(window->temp_phys_addr)));
544#endif
545 scif_free(window, sizeof(*window));
546 return 0;
547}
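/*
 * Editor's note: the wait/retry loop above is an idiom repeated throughout
 * this file: wait_event_timeout() returning 0 means the condition did not
 * become true within NODE_ALIVE_TIMEOUT, and the wait is retried for as
 * long as scifdev_alive(ep) still reports the peer node as alive; only when
 * the peer is really gone does the caller give up (typically with -ENODEV).
 * The generic form is:
 *
 *	retry:
 *		err = wait_event_timeout(wq, condition, NODE_ALIVE_TIMEOUT);
 *		if (!err && scifdev_alive(ep))
 *			goto retry;
 *		if (!err)
 *			err = -ENODEV;
 */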
548
549/**
550 * micscif_destroy_window:
551 * @ep: end point
552 * @window: registration window
553 *
554 * Deallocate resources for self window.
555 */
556int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window)
557{
558 int j;
559 struct scif_pinned_pages *pinned_pages = window->pinned_pages;
560 int64_t nr_pages = window->nr_pages;
561
562 might_sleep();
563 RMA_MAGIC(window);
564 if (!window->temp && window->mm) {
565 __scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0);
566 __scif_release_mm(window->mm);
567 window->mm = NULL;
568 }
569
570 if (!window->offset_freed)
571 micscif_free_window_offset(ep, window->offset,
572 window->nr_pages << PAGE_SHIFT);
573 for (j = 0; j < window->nr_contig_chunks; j++) {
574 if (window->dma_addr[j]) {
575 unmap_from_aperture(
576 window->dma_addr[j],
577 ep->remote_dev,
578 window->num_pages[j] << PAGE_SHIFT);
579 }
580 }
581
582 /*
583 * Decrement references for this set of pinned pages from
584 * this window.
585 */
586 j = atomic_sub_return((int32_t)pinned_pages->nr_pages,
587 &pinned_pages->ref_count);
588 BUG_ON(j < 0);
589 /*
590 * If the ref count for pinned_pages is zero then someone
591 * has already called scif_unpin_pages() for it and we should
592 * destroy the page cache.
593 */
594 if (!j)
595 micscif_destroy_pinned_pages(window->pinned_pages);
596 if (window->dma_addr)
597 scif_free(window->dma_addr, nr_pages *
598 sizeof(*(window->dma_addr)));
599 if (window->num_pages)
600 scif_free(window->num_pages, nr_pages *
601 sizeof(*(window->num_pages)));
602#ifdef CONFIG_ML1OM
603 if (window->phys_addr)
604 scif_free(window->phys_addr, window->nr_pages *
605 sizeof(*(window->phys_addr)));
606 if (window->temp_phys_addr)
607 scif_free(window->temp_phys_addr, nr_pages *
608 sizeof(*(window->temp_phys_addr)));
609#endif
610 window->magic = 0;
611 scif_free(window, sizeof(*window));
612 return 0;
613}
614
615/**
616 * micscif_create_remote_lookup:
617 * @ep: end point
618 * @window: remote window
619 *
620 * Allocate and prepare lookup entries for the remote
621 * end to copy over the physical addresses.
622 * Returns 0 on success and appropriate errno on failure.
623 */
624int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window)
625{
626 int i, j, err = 0;
627 int64_t nr_pages = window->nr_pages;
628 bool vmalloc_dma_phys;
629#ifdef CONFIG_ML1OM
630 bool vmalloc_temp_phys = false;
631 bool vmalloc_phys = false;
632#endif
633 might_sleep();
634
635 /* Map window */
636 err = map_virt_into_aperture(&window->mapped_offset,
637 window, ep->remote_dev, sizeof(*window));
638 if (err)
639 goto error_window;
640
641 /* Compute the number of lookup entries. 21 == 2MB Shift */
642 window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE,
643 ((2) * 1024 * 1024)) >> 21;
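	/*
	 * Editor's note: each lookup page holds NR_PHYS_ADDR_IN_PAGE
	 * addresses (presumably PAGE_SIZE / sizeof(dma_addr_t), i.e. 512
	 * with 4KB pages and 8-byte addresses), so one lookup entry covers
	 * 512 * 4KB = 2MB of registered memory; hence the 2MB alignment and
	 * the shift by 21 above. For example, a 5MB window needs
	 * ALIGN(5MB, 2MB) >> 21 = 3 lookup entries.
	 */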
644
645 if (!(window->dma_addr_lookup.lookup =
646 scif_zalloc(window->nr_lookup *
647 sizeof(*(window->dma_addr_lookup.lookup)))))
648 goto error_window;
649
650 /* Map DMA physical address lookup array */
651 err = map_virt_into_aperture(&window->dma_addr_lookup.offset,
652 window->dma_addr_lookup.lookup, ep->remote_dev,
653 window->nr_lookup *
654 sizeof(*window->dma_addr_lookup.lookup));
655 if (err)
656 goto error_window;
657
658 vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]);
659
660#ifdef CONFIG_ML1OM
661 if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) {
662 if (!(window->temp_phys_addr_lookup.lookup =
663 scif_zalloc(window->nr_lookup *
664 sizeof(*(window->temp_phys_addr_lookup.lookup)))))
665 goto error_window;
666
667 /* Map physical address lookup array */
668 err = map_virt_into_aperture(&window->temp_phys_addr_lookup.offset,
669 window->temp_phys_addr_lookup.lookup, ep->remote_dev,
670 window->nr_lookup *
671 sizeof(*window->temp_phys_addr_lookup.lookup));
672 if (err)
673 goto error_window;
674
675 if (!(window->phys_addr_lookup.lookup =
676 scif_zalloc(window->nr_lookup *
677 sizeof(*(window->phys_addr_lookup.lookup)))))
678 goto error_window;
679
680 /* Map physical address lookup array */
681 err = map_virt_into_aperture(&window->phys_addr_lookup.offset,
682 window->phys_addr_lookup.lookup, ep->remote_dev,
683 window->nr_lookup *
684 sizeof(*window->phys_addr_lookup.lookup));
685 if (err)
686 goto error_window;
687
688 vmalloc_phys = is_vmalloc_addr(&window->phys_addr[0]);
689 vmalloc_temp_phys = is_vmalloc_addr(&window->temp_phys_addr[0]);
690 }
691#endif
692
693 /* Now map each of the pages containing physical addresses */
694 for (i = 0, j = 0; i < nr_pages; i += NR_PHYS_ADDR_IN_PAGE, j++) {
695#ifdef CONFIG_ML1OM
696 if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) {
697 err = map_page_into_aperture(
698 &window->temp_phys_addr_lookup.lookup[j],
699 vmalloc_temp_phys ?
700 vmalloc_to_page(&window->temp_phys_addr[i]) :
701 virt_to_page(&window->temp_phys_addr[i]),
702 ep->remote_dev);
703 if (err)
704 goto error_window;
705
706 err = map_page_into_aperture(
707 &window->phys_addr_lookup.lookup[j],
708 vmalloc_phys ?
709 vmalloc_to_page(&window->phys_addr[i]) :
710 virt_to_page(&window->phys_addr[i]),
711 ep->remote_dev);
712 if (err)
713 goto error_window;
714 }
715#endif
716 err = map_page_into_aperture(
717 &window->dma_addr_lookup.lookup[j],
718 vmalloc_dma_phys ?
719 vmalloc_to_page(&window->dma_addr[i]) :
720 virt_to_page(&window->dma_addr[i]),
721 ep->remote_dev);
722 if (err)
723 goto error_window;
724 }
725 return 0;
726error_window:
727 return err;
728}
729
730/**
731 * micscif_destroy_remote_lookup:
732 * @ep: end point
733 * @window: remote window
734 *
735 * Destroy lookup entries used for the remote
736 * end to copy over the physical addresses.
737 */
738void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window)
739{
740 int i, j;
741
742 RMA_MAGIC(window);
743 if (window->nr_lookup) {
744 for (i = 0, j = 0; i < window->nr_pages;
745 i += NR_PHYS_ADDR_IN_PAGE, j++) {
746 if (window->dma_addr_lookup.lookup &&
747 window->dma_addr_lookup.lookup[j]) {
748 unmap_from_aperture(
749 window->dma_addr_lookup.lookup[j],
750 ep->remote_dev, PAGE_SIZE);
751 }
752 }
753 if (window->dma_addr_lookup.offset) {
754 unmap_from_aperture(
755 window->dma_addr_lookup.offset,
756 ep->remote_dev, window->nr_lookup *
757 sizeof(*window->dma_addr_lookup.lookup));
758 }
759 if (window->dma_addr_lookup.lookup)
760 scif_free(window->dma_addr_lookup.lookup, window->nr_lookup *
761 sizeof(*(window->dma_addr_lookup.lookup)));
762 if (window->mapped_offset) {
763 unmap_from_aperture(window->mapped_offset,
764 ep->remote_dev, sizeof(*window));
765 }
766 window->nr_lookup = 0;
767 }
768}
769
770/**
771 * micscif_create_remote_window:
772 * @ep: end point
773 * @nr_pages: number of pages in window
774 *
775 * Allocate and prepare a remote registration window.
776 */
777struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages)
778{
779 struct reg_range_t *window;
780
781 might_sleep();
782 if (!(window = scif_zalloc(sizeof(struct reg_range_t))))
783 goto error_ret;
784
785 window->magic = SCIFEP_MAGIC;
786 window->nr_pages = nr_pages;
787
788#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
789 if (!(window->page_ref_count = scif_zalloc(nr_pages *
790 sizeof(*(window->page_ref_count)))))
791 goto error_window;
792#endif
793
794 if (!(window->dma_addr = scif_zalloc(nr_pages *
795 sizeof(*(window->dma_addr)))))
796 goto error_window;
797
798 if (!(window->num_pages = scif_zalloc(nr_pages *
799 sizeof(*(window->num_pages)))))
800 goto error_window;
801
802#ifdef CONFIG_ML1OM
803 if (!(window->phys_addr = scif_zalloc(nr_pages *
804 sizeof(*(window->phys_addr)))))
805 goto error_window;
806
807 if (!(window->temp_phys_addr = scif_zalloc(nr_pages *
808 sizeof(*(window->temp_phys_addr)))))
809 goto error_window;
810#endif
811
812 if (micscif_create_remote_lookup(ep, window))
813 goto error_window;
814
815 window->ep = (uint64_t)ep;
816 window->type = RMA_WINDOW_PEER;
817 set_window_ref_count(window, nr_pages);
818 window->get_put_ref_count = 0;
819 window->unreg_state = OP_IDLE;
820#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
821 window->gttmap_state = OP_IDLE;
822 init_waitqueue_head(&window->gttmapwq);
823#endif
824#ifdef _MIC_SCIF_
825 micscif_setup_proxy_dma(ep);
826 window->proxy_dma_phys = ep->rma_info.proxy_dma_phys;
827#endif
828 return window;
829error_window:
830 micscif_destroy_remote_window(ep, window);
831error_ret:
832 return NULL;
833}
834
835/**
836 * micscif_destroy_remote_window:
837 * @ep: end point
838 * @window: remote registration window
839 *
840 * Deallocate resources for remote window.
841 */
842void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window)
843{
844 RMA_MAGIC(window);
845 micscif_destroy_remote_lookup(ep, window);
846 if (window->dma_addr)
847 scif_free(window->dma_addr, window->nr_pages *
848 sizeof(*(window->dma_addr)));
849 if (window->num_pages)
850 scif_free(window->num_pages, window->nr_pages *
851 sizeof(*(window->num_pages)));
852#ifdef CONFIG_ML1OM
853 if (window->phys_addr)
854 scif_free(window->phys_addr, window->nr_pages *
855 sizeof(*(window->phys_addr)));
856 if (window->temp_phys_addr)
857 scif_free(window->temp_phys_addr, window->nr_pages *
858 sizeof(*(window->temp_phys_addr)));
859#endif
860
861#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
862 if (window->page_ref_count)
863 scif_free(window->page_ref_count, window->nr_pages *
864 sizeof(*(window->page_ref_count)));
865#endif
866 window->magic = 0;
867 scif_free(window, sizeof(*window));
868}
869
870/**
871 * micscif_map_window_pages:
872 * @ep: end point
873 * @window: self registration window
874 * @tmp_wnd: is a temporary window?
875 *
876 * Map pages of a window into the aperture/PCI.
877 * Also compute physical addresses required for DMA.
878 */
879int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool tmp_wnd)
880{
881 int j, i, err = 0, nr_pages;
882 scif_pinned_pages_t pinned_pages;
883
884 might_sleep();
885 RMA_MAGIC(window);
886
887 pinned_pages = window->pinned_pages;
888 for (j = 0, i = 0; j < window->nr_contig_chunks; j++, i += nr_pages) {
889 nr_pages = pinned_pages->num_pages[i];
890#ifdef _MIC_SCIF_
891#ifdef CONFIG_ML1OM
892 /* phys_addr[] holds addresses as seen from the remote node;
893 * these addresses are then copied into the remote card's
894 * window structure.
895 * When the remote node is the host and the card is KNF,
896 * these addresses are only created at the point of mapping
897 * the card physical address into the GTT (for KNC the
898 * GTT code path returns the local address).
899 * When the remote node is loopback, the address remains
900 * the same.
901 * When the remote node is a kn*, the base address of the local
902 * card as seen from the remote node is added in.
903 */
904 if (!tmp_wnd) {
905 if(ep->remote_dev != &scif_dev[SCIF_HOST_NODE]) {
906 if ((err = map_virt_into_aperture(
907 &window->temp_phys_addr[j],
908 phys_to_virt(page_to_phys(pinned_pages->pages[i])),
909 ep->remote_dev,
910 nr_pages << PAGE_SHIFT))) {
911 int k,l;
912
913 for (l = k = 0; k < i; l++) {
914 nr_pages = pinned_pages->num_pages[k];
915 window->temp_phys_addr[l]
916 &= ~RMA_HUGE_NR_PAGE_MASK;
917 unmap_from_aperture(
918 window->temp_phys_addr[l],
919 ep->remote_dev,
920 nr_pages << PAGE_SHIFT);
921 k += nr_pages;
922 window->temp_phys_addr[l] = 0;
923 }
924 return err;
925 }
926 if (!tmp_wnd)
927 RMA_SET_NR_PAGES(window->temp_phys_addr[j], nr_pages);
928 }
929 }
930#endif
931 window->dma_addr[j] =
932 page_to_phys(pinned_pages->pages[i]);
933 if (!tmp_wnd)
934 RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages);
935#else
936 err = map_virt_into_aperture(&window->dma_addr[j],
937 phys_to_virt(page_to_phys(pinned_pages->pages[i])),
938 ep->remote_dev, nr_pages << PAGE_SHIFT);
939 if (err)
940 return err;
941 if (!tmp_wnd)
942 RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages);
943#endif
944 window->num_pages[j] = nr_pages;
945 }
946 return err;
947}
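/*
 * Editor's note: RMA_SET_NR_PAGES() packs the page count of a contiguous
 * chunk into the low, otherwise-unused bits of the page-aligned bus/DMA
 * address; the error path above strips it again with ~RMA_HUGE_NR_PAGE_MASK
 * before unmapping. Temporary windows (tmp_wnd) skip this encoding.
 */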
948
949
950/**
951 * micscif_unregister_window:
952 * @window: self registration window
953 *
954 * Send an unregistration request and wait for a response.
955 */
956int micscif_unregister_window(struct reg_range_t *window)
957{
958 int err = 0;
959 struct endpt *ep = (struct endpt *)window->ep;
960 bool send_msg = false;
961
962 might_sleep();
963 BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock));
964
965 switch (window->unreg_state) {
966 case OP_IDLE:
967 {
968 window->unreg_state = OP_IN_PROGRESS;
969 send_msg = true;
970 /* fall through */
971 }
972 case OP_IN_PROGRESS:
973 {
974 get_window_ref_count(window, 1);
975 mutex_unlock(&ep->rma_info.rma_lock);
976 if (send_msg && (err = micscif_send_scif_unregister(ep, window))) {
977 window->unreg_state = OP_COMPLETED;
978 goto done;
979 }
980retry:
981 err = wait_event_timeout(window->unregwq,
982 window->unreg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
983 if (!err && scifdev_alive(ep))
984 goto retry;
985 if (!err) {
986 err = -ENODEV;
987 window->unreg_state = OP_COMPLETED;
988 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
989 }
990 if (err > 0)
991 err = 0;
992done:
993 mutex_lock(&ep->rma_info.rma_lock);
994 put_window_ref_count(window, 1);
995 break;
996 }
997 case OP_FAILED:
998 {
999 if (!scifdev_alive(ep)) {
1000 err = -ENODEV;
1001 window->unreg_state = OP_COMPLETED;
1002 }
1003 break;
1004 }
1005 case OP_COMPLETED:
1006 break;
1007 default:
1008 /* Invalid opcode? */
1009 BUG_ON(1);
1010 }
1011
1012 if (OP_COMPLETED == window->unreg_state &&
1013 window->ref_count)
1014 put_window_ref_count(window, window->nr_pages);
1015
1016 if (!window->ref_count) {
1017 atomic_inc(&ep->rma_info.tw_refcount);
1018 atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
1019 list_del(&window->list_member);
1020 micscif_free_window_offset(ep, window->offset,
1021 window->nr_pages << PAGE_SHIFT);
1022 window->offset_freed = true;
1023 mutex_unlock(&ep->rma_info.rma_lock);
1024 if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL))
1025 && scifdev_alive(ep)) {
1026 drain_dma_intr(ep->rma_info.dma_chan);
1027 } else {
1028 if (!__scif_dec_pinned_vm_lock(window->mm,
1029 window->nr_pages, 1)) {
1030 __scif_release_mm(window->mm);
1031 window->mm = NULL;
1032 }
1033 }
1034 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
1035 mutex_lock(&ep->rma_info.rma_lock);
1036 }
1037 return err;
1038}
1039
1040/**
1041 * micscif_send_alloc_request:
1042 * @ep: end point
1043 * @window: self registration window
1044 *
1045 * Send a remote window allocation request
1046 */
1047int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window)
1048{
1049 struct nodemsg msg;
1050 struct allocmsg *alloc = &window->alloc_handle;
1051
1052 /* Set up the Alloc Handle */
1053 alloc->uop = SCIF_REGISTER;
1054 alloc->state = OP_IN_PROGRESS;
1055 init_waitqueue_head(&alloc->allocwq);
1056
1057 /* Send out an allocation request */
1058 msg.uop = SCIF_ALLOC_REQ;
1059 msg.src = ep->port;
1060 msg.payload[0] = ep->remote_ep;
1061 msg.payload[1] = window->nr_pages;
1062 msg.payload[2] = (uint64_t)&window->alloc_handle;
1063 msg.payload[3] = SCIF_REGISTER;
1064 return micscif_nodeqp_send(ep->remote_dev, &msg, ep);
1065}
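/*
 * Editor's note: this is the first half of the window allocation handshake.
 * The peer answers SCIF_ALLOC_REQ by allocating a remote window and
 * (presumably via the node QP message handlers, which are not in this file)
 * updating alloc->state; micscif_prep_remote_window() below then waits on
 * alloc->allocwq for the state to leave OP_IN_PROGRESS before ioremap'ing
 * the physical address returned in the response.
 */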
1066
1067/**
1068 * micscif_prep_remote_window:
1069 * @ep: end point
1070 * @window: self registration window
1071 *
1072 * Send a remote window allocation request, wait for an allocation response,
1073 * prepare the remote window and notify the peer to unmap it once done.
1074 */
1075int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window)
1076{
1077 struct nodemsg msg;
1078 struct reg_range_t *remote_window;
1079 struct allocmsg *alloc = &window->alloc_handle;
1080 dma_addr_t *dma_phys_lookup, *tmp;
1081 int i = 0, j = 0;
1082 int nr_contig_chunks, loop_nr_contig_chunks, remaining_nr_contig_chunks, nr_lookup;
1083#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1084 dma_addr_t *phys_lookup = 0;
1085#endif
1086 int err, map_err;
1087
1088 nr_contig_chunks = remaining_nr_contig_chunks = (int)window->nr_contig_chunks;
1089
1090 if ((map_err = micscif_map_window_pages(ep, window, false))) {
1091 printk(KERN_ERR "%s %d map_err %d\n", __func__, __LINE__, map_err);
1092 }
1093retry:
1094 /* Now wait for the response */
1095 err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
1096 if (!err && scifdev_alive(ep))
1097 goto retry;
1098
1099 if (!err)
1100 err = -ENODEV;
1101
1102 if (err > 0)
1103 err = 0;
1104 else
1105 return err;
1106
1107 /* Bail out. The remote end rejected this request */
1108 if (OP_FAILED == alloc->state)
1109 return -ENOMEM;
1110
1111 if (map_err) {
1112 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, map_err);
1113 msg.uop = SCIF_FREE_VIRT;
1114 msg.src = ep->port;
1115 msg.payload[0] = ep->remote_ep;
1116 msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
1117 msg.payload[2] = (uint64_t)window;
1118 msg.payload[3] = SCIF_REGISTER;
1119 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1120 err = -ENOTCONN;
1121 else
1122 err = map_err;
1123 return err;
1124 }
1125
1126
1127 remote_window = scif_ioremap(alloc->phys_addr,
1128 sizeof(*window), ep->remote_dev);
1129
1130 RMA_MAGIC(remote_window);
1131
1132 /* Compute the number of lookup entries. 21 == 2MB Shift */
1133 nr_lookup = ALIGN(nr_contig_chunks * PAGE_SIZE, ((2) * 1024 * 1024)) >> 21;
1134#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1135 if (is_p2p_scifdev(ep->remote_dev))
1136 phys_lookup = scif_ioremap(remote_window->temp_phys_addr_lookup.offset,
1137 nr_lookup *
1138 sizeof(*remote_window->temp_phys_addr_lookup.lookup),
1139 ep->remote_dev);
1140#endif
1141
1142 dma_phys_lookup = scif_ioremap(remote_window->dma_addr_lookup.offset,
1143 nr_lookup *
1144 sizeof(*remote_window->dma_addr_lookup.lookup),
1145 ep->remote_dev);
1146
1147 while (remaining_nr_contig_chunks) {
1148 loop_nr_contig_chunks = min(remaining_nr_contig_chunks, (int)NR_PHYS_ADDR_IN_PAGE);
1149 /* #1/2 - Copy physical addresses over to the remote side */
1150
1151#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1152 /* If the remote dev is self, or any node other than the host,
1153 * it's OK to copy the bus address to the remote window.
1154 * In the case of the host (for KNF only) the bus address
1155 * is generated at the time of mmap(..) into card memory
1156 * and does not exist at this time.
1157 */
1158 /* Note:
1159 * the phys_addr[] holds MIC address for remote cards
1160 * -> GTT offset for the host (KNF)
1161 * -> local address for the host (KNC)
1162 * -> local address for loopback
1163 * this is done in map_window_pages(..) except for GTT
1164 * offset for KNF
1165 */
1166 if (is_p2p_scifdev(ep->remote_dev)) {
1167 tmp = scif_ioremap(phys_lookup[j],
1168 loop_nr_contig_chunks * sizeof(*window->temp_phys_addr),
1169 ep->remote_dev);
1170 memcpy_toio(tmp, &window->temp_phys_addr[i],
1171 loop_nr_contig_chunks * sizeof(*window->temp_phys_addr));
1172 serializing_request(tmp);
1173 smp_mb();
1174 scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev);
1175 }
1176#endif
1177 /* #2/2 - Copy DMA addresses (addresses that are fed into the DMA engine)
1178 * We transfer bus addresses which are then converted into a MIC physical
1179 * address on the remote side if it is a MIC, if the remote node is a host
1180 * we transfer the MIC physical address
1181 */
1182 tmp = scif_ioremap(
1183 dma_phys_lookup[j],
1184 loop_nr_contig_chunks * sizeof(*window->dma_addr),
1185 ep->remote_dev);
1186#ifdef _MIC_SCIF_
1187 if (is_p2p_scifdev(ep->remote_dev)) {
1188 /* knf:
1189 * send the address as mapped through the GTT (the remote node's
1190 * base address for this node is already added in)
1191 * knc:
1192 * add remote node's base address for this node to convert it
1193 * into a MIC address
1194 */
1195 int m;
1196 dma_addr_t dma_addr;
1197 for (m = 0; m < loop_nr_contig_chunks; m++) {
1198#ifdef CONFIG_ML1OM
1199 dma_addr = window->temp_phys_addr[i + m];
1200#else
1201 dma_addr = window->dma_addr[i + m] +
1202 ep->remote_dev->sd_base_addr;
1203#endif
1204 writeq(dma_addr, &tmp[m]);
1205 }
1206 } else
1207 /* Host node or loopback - transfer DMA addresses as is, this is
1208 * the same as a MIC physical address (we use the dma_addr
1209 * and not the phys_addr array since the phys_addr is only setup
1210 * if there is a mmap() request from the host)
1211 */
1212 memcpy_toio(tmp, &window->dma_addr[i],
1213 loop_nr_contig_chunks * sizeof(*window->dma_addr));
1214#else
1215 /* Transfer the physical address array - this is the MIC address
1216 * as seen by the card
1217 */
1218 memcpy_toio(tmp, &window->dma_addr[i],
1219 loop_nr_contig_chunks * sizeof(*window->dma_addr));
1220#endif
1221 remaining_nr_contig_chunks -= loop_nr_contig_chunks;
1222 i += loop_nr_contig_chunks;
1223 j++;
1224 serializing_request(tmp);
1225 smp_mb();
1226 scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev);
1227 }
1228
1229 /* Prepare the remote window for the peer */
1230 remote_window->peer_window = (uint64_t)window;
1231 remote_window->offset = window->offset;
1232 remote_window->prot = window->prot;
1233 remote_window->nr_contig_chunks = nr_contig_chunks;
1234#ifdef _MIC_SCIF_
1235 if (!ep->rma_info.proxy_dma_peer_phys)
1236 ep->rma_info.proxy_dma_peer_phys = remote_window->proxy_dma_phys;
1237#endif
1238#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
1239 if (is_p2p_scifdev(ep->remote_dev))
1240 scif_iounmap(phys_lookup,
1241 nr_lookup *
1242 sizeof(*remote_window->temp_phys_addr_lookup.lookup),
1243 ep->remote_dev);
1244#endif
1245 scif_iounmap(dma_phys_lookup,
1246 nr_lookup *
1247 sizeof(*remote_window->dma_addr_lookup.lookup),
1248 ep->remote_dev);
1249 scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev);
1250 window->peer_window = (uint64_t)alloc->vaddr;
1251 return err;
1252}
1253
1254/**
1255 * micscif_send_scif_register:
1256 * @ep: end point
1257 * @window: self registration window
1258 *
1259 * Send a SCIF_REGISTER message if EP is connected and wait for a
1260 * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT
1261 * message so that the peer can free its remote window allocated earlier.
1262 */
1263int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window)
1264{
1265 int err = 0;
1266 struct nodemsg msg;
1267
1268 msg.src = ep->port;
1269 msg.payload[0] = ep->remote_ep;
1270 msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
1271 msg.payload[2] = (uint64_t)window;
1272 if (SCIFEP_CONNECTED == ep->state) {
1273 msg.uop = SCIF_REGISTER;
1274 window->reg_state = OP_IN_PROGRESS;
1275 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
1276 micscif_set_nr_pages(ep->remote_dev, window);
1277retry:
1278 err = wait_event_timeout(window->regwq,
1279 window->reg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
1280 if (!err && scifdev_alive(ep))
1281 goto retry;
1282 if (!err)
1283 err = -ENODEV;
1284 if (err > 0)
1285 err = 0;
1286 if (OP_FAILED == window->reg_state)
1287 err = -ENOTCONN;
1288 } else {
1289 micscif_set_nr_pages(ep->remote_dev, window);
1290 }
1291 } else {
1292 msg.uop = SCIF_FREE_VIRT;
1293 msg.payload[3] = SCIF_REGISTER;
1294 if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1295 err = -ENOTCONN;
1296 micscif_set_nr_pages(ep->remote_dev, window);
1297 }
1298 return err;
1299}
1300
1301/**
1302 * micscif_send_scif_unregister:
1303 * @ep: end point
1304 * @window: self registration window
1305 *
1306 * Send a SCIF_UNREGISTER message.
1307 */
1308int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window)
1309{
1310 struct nodemsg msg;
1311
1312 RMA_MAGIC(window);
1313 msg.uop = SCIF_UNREGISTER;
1314 msg.src = ep->port;
1315 msg.payload[0] = (uint64_t)window->alloc_handle.vaddr;
1316 msg.payload[1] = (uint64_t)window;
1317 return micscif_nodeqp_send(ep->remote_dev, &msg, ep);
1318}
1319
1320/**
1321 * micscif_get_window_offset:
1322 * @epd: end point descriptor
1323 * @flags: flags
1324 * @offset: offset hint
1325 * @len: length of range
1326 * @out_offset: computed offset returned by reference.
1327 *
1328 * Compute/Claim a new offset for this EP. The caller is supposed to grab
1329 * the RMA mutex before calling this API.
1330 */
1331int micscif_get_window_offset(struct endpt *ep, int flags,
1332 uint64_t offset, size_t len, uint64_t *out_offset)
1333{
1334 uint64_t computed_offset;
1335 int err = 0;
1336
1337 might_sleep();
1338 mutex_lock(&ep->rma_info.va_lock);
1339 if (flags & SCIF_MAP_FIXED) {
1340 computed_offset = va_gen_claim(&ep->rma_info.va_gen,
1341 (uint64_t)offset, len);
1342 if (INVALID_VA_GEN_ADDRESS == computed_offset)
1343 err = -EADDRINUSE;
1344 } else {
1345 computed_offset = va_gen_alloc(&ep->rma_info.va_gen,
1346 len, PAGE_SIZE);
1347 if (INVALID_VA_GEN_ADDRESS == computed_offset)
1348 err = -ENOMEM;
1349 }
1350 *out_offset = computed_offset;
1351 mutex_unlock(&ep->rma_info.va_lock);
1352 return err;
1353}
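/*
 * Editor's note: with SCIF_MAP_FIXED the requested offset is claimed as-is
 * (failing with -EADDRINUSE if the range is already taken); otherwise a
 * free, page-aligned range is allocated from the endpoint's VA generator.
 * For example, micscif_register_temp() below passes flags == 0 to obtain a
 * fresh offset for a temporary window.
 */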
1354
1355/**
1356 * micscif_free_window_offset:
1357 * @offset: offset hint
1358 * @len: length of range
1359 *
1360 * Free offset for this EP. The caller is supposed to grab
1361 * the RMA mutex before calling this API.
1362 */
1363void micscif_free_window_offset(struct endpt *ep,
1364 uint64_t offset, size_t len)
1365{
1366 mutex_lock(&ep->rma_info.va_lock);
1367 va_gen_free(&ep->rma_info.va_gen, offset, len);
1368 mutex_unlock(&ep->rma_info.va_lock);
1369}
1370
1371/**
1372 * micscif_register_temp:
1373 * @epd: End Point Descriptor.
1374 * @addr: virtual address to/from which to copy
1375 * @len: length of range to copy
1376 * @out_offset: computed offset returned by reference.
1377 * @out_window: allocated registered window returned by reference.
1378 *
1379 * Create a temporary registered window. The peer will not know about this
1380 * window. This API is used for the scif_vreadfrom()/scif_vwriteto() APIs.
1381 */
1382static int
1383micscif_register_temp(scif_epd_t epd, void *addr, size_t len, int prot,
1384 off_t *out_offset, struct reg_range_t **out_window)
1385{
1386 struct endpt *ep = (struct endpt *)epd;
1387 int err;
1388 scif_pinned_pages_t pinned_pages;
1389 size_t aligned_len;
1390
1391 aligned_len = ALIGN(len, PAGE_SIZE);
1392
1393 if ((err = __scif_pin_pages((void *)((uint64_t)addr &
1394 PAGE_MASK),
1395 aligned_len, &prot, 0, &pinned_pages)))
1396 return err;
1397
1398 pinned_pages->prot = prot;
1399
1400 /* Compute the offset for this registration */
1401 if ((err = micscif_get_window_offset(ep, 0, 0,
1402 aligned_len, (uint64_t *)out_offset)))
1403 goto error_unpin;
1404
1405 /* Allocate and prepare self registration window */
1406 if (!(*out_window = micscif_create_window(ep, aligned_len >> PAGE_SHIFT,
1407 *out_offset, true))) {
1408 micscif_free_window_offset(ep, *out_offset, aligned_len);
1409 err = -ENOMEM;
1410 goto error_unpin;
1411 }
1412
1413 (*out_window)->pinned_pages = pinned_pages;
1414 (*out_window)->nr_pages = pinned_pages->nr_pages;
1415 (*out_window)->nr_contig_chunks = pinned_pages->nr_contig_chunks;
1416 (*out_window)->prot = pinned_pages->prot;
1417
1418 (*out_window)->va_for_temp = (void*)((uint64_t)addr & PAGE_MASK);
1419 if ((err = micscif_map_window_pages(ep, *out_window, true))) {
1420 /* Something went wrong! Rollback */
1421 micscif_destroy_window(ep, *out_window);
1422 *out_window = NULL;
1423 } else
1424 *out_offset |= ((uint64_t)addr & ~PAGE_MASK);
1425
1426 return err;
1427error_unpin:
1428 if (err)
1429 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
1430 scif_unpin_pages(pinned_pages);
1431 return err;
1432}
1433
1434/**
1435 * micscif_rma_completion_cb:
1436 * @data: RMA cookie
1437 *
1438 * RMA interrupt completion callback.
1439 */
1440void micscif_rma_completion_cb(uint64_t data)
1441{
1442 struct dma_completion_cb *comp_cb = (struct dma_completion_cb *)data;
1443#ifndef _MIC_SCIF_
1444 struct pci_dev *pdev;
1445#endif
1446
1447 /* Free DMA Completion CB. */
1448 if (comp_cb && comp_cb->temp_buf) {
1449 if (comp_cb->dst_window) {
1450 micscif_rma_local_cpu_copy(comp_cb->dst_offset,
1451 comp_cb->dst_window, comp_cb->temp_buf + comp_cb->header_padding,
1452 comp_cb->len, false);
1453 }
1454#ifndef _MIC_SCIF_
1455 micscif_pci_dev(comp_cb->remote_node, &pdev);
1456 mic_ctx_unmap_single(get_per_dev_ctx(comp_cb->remote_node - 1),
1457 comp_cb->temp_phys, KMEM_UNALIGNED_BUF_SIZE);
1458#endif
1459 if (comp_cb->is_cache)
1460 micscif_kmem_cache_free(comp_cb->temp_buf_to_free);
1461 else
1462 kfree(comp_cb->temp_buf_to_free);
1463 }
1464 kfree(comp_cb);
1465}
1466
1467static void __micscif_rma_destroy_tcw_ep(struct endpt *ep);
1468static
1469bool micscif_rma_tc_can_cache(struct endpt *ep, size_t cur_bytes)
1470{
1471 if ((cur_bytes >> PAGE_SHIFT) > ms_info.mi_rma_tc_limit)
1472 return false;
1473 if ((atomic_read(&ep->rma_info.tcw_total_pages)
1474 + (cur_bytes >> PAGE_SHIFT)) >
1475 ms_info.mi_rma_tc_limit) {
1476 printk(KERN_ALERT "%s %d total=%d, current=%zu reached max\n",
1477 __func__, __LINE__,
1478 atomic_read(&ep->rma_info.tcw_total_pages),
1479 (1 + (cur_bytes >> PAGE_SHIFT)));
1480 micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
1481 __micscif_rma_destroy_tcw_ep(ep);
1482 }
1483 return true;
1484}
1485
1486/**
1487 * micscif_rma_copy:
1488 * @epd: end point descriptor.
1489 * @loffset: offset in local registered address space to/from which to copy
1490 * @addr: user virtual address to/from which to copy
1491 * @len: length of range to copy
1492 * @roffset: offset in remote registered address space to/from which to copy
1493 * @flags: flags
1494 * @dir: LOCAL->REMOTE or vice versa.
1495 *
1496 * Validate parameters, check if src/dst registered ranges requested for copy
1497 * are valid and initiate either CPU or DMA copy.
1498 */
1499int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len,
1500 off_t roffset, int flags, enum rma_direction dir, bool last_chunk)
1501{
1502 struct endpt *ep = (struct endpt *)epd;
1503 struct micscif_rma_req remote_req;
1504 struct micscif_rma_req req;
1505 struct reg_range_t *window = NULL;
1506 struct reg_range_t *remote_window = NULL;
1507 struct mic_copy_work copy_work;
1508 bool loopback;
1509 int err = 0;
1510 struct dma_channel *chan;
1511 struct rma_mmu_notifier *mmn = NULL;
1512 bool insert_window = false;
1513 bool cache = false;
1514
1515 if ((err = verify_epd(ep)))
1516 return err;
1517
1518 if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | SCIF_RMA_SYNC | SCIF_RMA_ORDERED)))
1519 return -EINVAL;
1520
1521 if (!len)
1522 return -EINVAL;
1523 loopback = is_self_scifdev(ep->remote_dev) ? true : false;
1524 copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? DO_DMA_POLLING : 0;
1525 copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk);
1526
1527#ifdef CONFIG_MMU_NOTIFIER
1528 if (!mic_reg_cache_enable)
1529 flags &= ~SCIF_RMA_USECACHE;
1530#else
1531 flags &= ~SCIF_RMA_USECACHE;
1532#endif
1533#ifndef _MIC_SCIF_
1534#ifdef CONFIG_ML1OM
1535 /* Use DMA Copies even if CPU copy is requested on KNF MIC from Host */
1536 if (flags & SCIF_RMA_USECPU) {
1537 flags &= ~SCIF_RMA_USECPU;
1538 if (last_chunk)
1539 copy_work.fence_type = DO_DMA_POLLING;
1540 }
1541#endif
1542 /* Use CPU for Host<->Host Copies */
1543 if (loopback) {
1544 flags |= SCIF_RMA_USECPU;
1545 copy_work.fence_type = 0x0;
1546 }
1547#endif
1548
1549 cache = flags & SCIF_RMA_USECACHE;
1550
1551 /* Trying to wrap around */
1552 if ((loffset && (loffset + (off_t)len < loffset)) ||
1553 (roffset + (off_t)len < roffset))
1554 return -EINVAL;
1555
1556 remote_req.out_window = &remote_window;
1557 remote_req.offset = roffset;
1558 remote_req.nr_bytes = len;
1559 /*
1560 * If transfer is from local to remote then the remote window
1561 * must be writeable and vice versa.
1562 */
1563 remote_req.prot = LOCAL_TO_REMOTE == dir ? VM_WRITE : VM_READ;
1564 remote_req.type = WINDOW_PARTIAL;
1565 remote_req.head = &ep->rma_info.remote_reg_list;
1566
1567#ifdef CONFIG_MMU_NOTIFIER
1568 if (addr && cache) {
1569 mutex_lock(&ep->rma_info.mmn_lock);
1570 mmn = find_mmu_notifier(current->mm, &ep->rma_info);
1571 if (!mmn) {
1572 mmn = kzalloc(sizeof(*mmn), GFP_KERNEL);
1573 if (!mmn) {
1574 mutex_unlock(&ep->rma_info.mmn_lock);
1575 return -ENOMEM;
1576 }
1577 init_mmu_notifier(mmn, current->mm, ep);
1578 if (mmu_notifier_register(&mmn->ep_mmu_notifier, current->mm)) {
1579 mutex_unlock(&ep->rma_info.mmn_lock);
1580 kfree(mmn);
1581 return -EBUSY;
1582 }
1583#ifdef RMA_DEBUG
1584 atomic_long_add_return(1, &ms_info.mmu_notif_cnt);
1585#endif
1586 list_add(&mmn->list_member, &ep->rma_info.mmn_list);
1587 }
1588 mutex_unlock(&ep->rma_info.mmn_lock);
1589 }
1590#endif
1591
1592 micscif_inc_node_refcnt(ep->remote_dev, 1);
1593#ifdef _MIC_SCIF_
1594 if (!(flags & SCIF_RMA_USECPU)) {
1595 /*
1596 * Proxy the DMA only for P2P reads with transfer size
1597 * greater than proxy DMA threshold. scif_vreadfrom(..)
1598 * and scif_vwriteto(..) are not supported since the peer
1599 * does not have the page lists required to perform the
1600 * proxy DMA.
1601 */
1602 if (ep->remote_dev->sd_proxy_dma_reads &&
1603 !addr && dir == REMOTE_TO_LOCAL &&
1604 ep->rma_info.proxy_dma_va &&
1605 len >= ms_info.mi_proxy_dma_threshold) {
1606 copy_work.len = len;
1607 copy_work.src_offset = roffset;
1608 copy_work.dst_offset = loffset;
1609 /* Fall through if there were errors */
1610 if (!(err = micscif_proxy_dma(epd, &copy_work)))
1611 goto error;
1612 }
1613 }
1614#endif
1615 mutex_lock(&ep->rma_info.rma_lock);
1616 if (addr) {
1617 req.out_window = &window;
1618 req.nr_bytes = ALIGN(len + ((uint64_t)addr & ~PAGE_MASK), PAGE_SIZE);
1619 if (mmn)
1620 req.head = &mmn->tc_reg_list;
1621 req.va_for_temp = (void*)((uint64_t)addr & PAGE_MASK);
1622 req.prot = (LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE | VM_READ);
1623 /* Does a valid local window exist? */
1624
1625 pr_debug("%s %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n",
1626 __func__, __LINE__, req.va_for_temp, addr, req.nr_bytes, len);
1627 spin_lock(&ep->rma_info.tc_lock);
1628 if (!mmn || (err = micscif_query_tcw(ep, &req))) {
1629 pr_debug("%s %d err %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n",
1630 __func__, __LINE__, err, req.va_for_temp, addr, req.nr_bytes, len);
1631 spin_unlock(&ep->rma_info.tc_lock);
1632 mutex_unlock(&ep->rma_info.rma_lock);
1633 if (cache)
1634 if (!micscif_rma_tc_can_cache(ep, req.nr_bytes))
1635 cache = false;
1636 if ((err = micscif_register_temp(epd, req.va_for_temp, req.nr_bytes,
1637 req.prot,
1638 &loffset, &window))) {
1639 goto error;
1640 }
1641 mutex_lock(&ep->rma_info.rma_lock);
1642 pr_debug("New temp window created addr %p\n", addr);
1643 if (cache) {
1644 atomic_inc(&ep->rma_info.tcw_refcount);
1645 atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tcw_total_pages);
1646 if (mmn) {
1647 spin_lock(&ep->rma_info.tc_lock);
1648 micscif_insert_tcw(window, &mmn->tc_reg_list);
1649 spin_unlock(&ep->rma_info.tc_lock);
1650 }
1651 }
1652 insert_window = true;
1653 } else {
1654 spin_unlock(&ep->rma_info.tc_lock);
1655 pr_debug("window found for addr %p\n", addr);
1656 BUG_ON(window->va_for_temp > addr);
1657 }
1658 loffset = window->offset + ((uint64_t)addr - (uint64_t)window->va_for_temp);
1659 pr_debug("%s %d addr %p loffset 0x%lx window->nr_pages 0x%llx"
1660 " window->va_for_temp %p\n", __func__, __LINE__,
1661 addr, loffset, window->nr_pages, window->va_for_temp);
1662 RMA_MAGIC(window);
1663 }
1664
1665 /* Does a valid remote window exist? */
1666 if ((err = micscif_query_window(&remote_req))) {
1667 pr_debug("%s %d err %d roffset 0x%lx len 0x%lx\n",
1668 __func__, __LINE__, err, roffset, len);
1669 mutex_unlock(&ep->rma_info.rma_lock);
1670 goto error;
1671 }
1672 RMA_MAGIC(remote_window);
1673 if (!addr) {
1674 req.out_window = &window;
1675 req.offset = loffset;
1676 /*
1677 * If transfer is from local to remote then the self window
1678 * must be readable and vice versa.
1679 */
1680 req.prot = LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE;
1681 req.nr_bytes = len;
1682 req.type = WINDOW_PARTIAL;
1683 req.head = &ep->rma_info.reg_list;
1684 /* Does a valid local window exist? */
1685 if ((err = micscif_query_window(&req))) {
1686 printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
1687 mutex_unlock(&ep->rma_info.rma_lock);
1688 goto error;
1689 }
1690 RMA_MAGIC(window);
1691 }
1692
1693 /*
1694 * Prepare copy_work for submitting work to the DMA kernel thread
1695 * or CPU copy routine.
1696 */
1697 copy_work.len = len;
1698 copy_work.loopback = loopback;
1699 copy_work.remote_dev = ep->remote_dev;
1700 copy_work.dma_chan_released = false;
1701 if (LOCAL_TO_REMOTE == dir) {
1702 copy_work.src_offset = loffset;
1703 copy_work.src_window = window;
1704 copy_work.dst_offset = roffset;
1705 copy_work.dst_window = remote_window;
1706 } else {
1707 copy_work.src_offset = roffset;
1708 copy_work.src_window = remote_window;
1709 copy_work.dst_offset = loffset;
1710 copy_work.dst_window = window;
1711 }
1712
1713 if (!(flags & SCIF_RMA_USECPU)) {
1714 chan = ep->rma_info.dma_chan;
1715 if ((err = request_dma_channel(chan))) {
1716 mutex_unlock(&ep->rma_info.rma_lock);
1717 goto error;
1718 }
1719 err = micscif_rma_list_dma_copy_wrapper(epd, &copy_work,
1720 chan, loffset);
1721 if (!copy_work.dma_chan_released)
1722 free_dma_channel(chan);
1723 }
1724 if (flags & SCIF_RMA_USECPU) {
1725 /* Initiate synchronous CPU copy */
1726 micscif_rma_list_cpu_copy(&copy_work);
1727 }
1728 if (insert_window && !cache) {
1729 atomic_inc(&ep->rma_info.tw_refcount);
1730 atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
1731 }
1732
1733 mutex_unlock(&ep->rma_info.rma_lock);
1734
1735 if (last_chunk) {
1736 if (DO_DMA_POLLING == copy_work.fence_type)
1737 err = drain_dma_poll(ep->rma_info.dma_chan);
1738 else if (DO_DMA_INTR == copy_work.fence_type)
1739 err = drain_dma_intr(ep->rma_info.dma_chan);
1740 }
1741
1742 micscif_dec_node_refcnt(ep->remote_dev, 1);
1743 if (insert_window && !cache)
1744 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
1745 return err;
1746error:
1747 if (err) {
1748 if (addr && window && !cache)
1749 micscif_destroy_window(ep, window);
1750 printk(KERN_ERR "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
1751 }
1752 micscif_dec_node_refcnt(ep->remote_dev, 1);
1753 return err;
1754}
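/*
 * Editor's note (sketch, not part of the original file): the public
 * scif_readfrom()/scif_vreadfrom() entry points live elsewhere; based on
 * the parameter handling above, a remote-to-local copy with a user virtual
 * address (vreadfrom style) versus one with a registered local offset
 * (readfrom style) would be driven roughly as follows:
 */
#if 0	/* illustrative only; wrapper names are assumptions */
/* vreadfrom style: addr != NULL, a temporary window is registered on the fly */
err = micscif_rma_copy(epd, 0, uva, len, roffset, flags,
		       REMOTE_TO_LOCAL, true);
/* readfrom style: addr == NULL, loffset must fall inside an existing self window */
err = micscif_rma_copy(epd, loffset, NULL, len, roffset, flags,
		       REMOTE_TO_LOCAL, true);
#endif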
1755
1756/**
1757 * micscif_send_fence_mark:
1758 * @epd: end point descriptor.
1759 * @out_mark: Output DMA mark reported by peer.
1760 *
1761 * Send a remote fence mark request.
1762 */
1763int micscif_send_fence_mark(scif_epd_t epd, int *out_mark)
1764{
1765 int err;
1766 struct nodemsg msg;
1767 struct fence_info *fence_req;
1768 struct endpt *ep = (struct endpt *)epd;
1769
1770 if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
1771 err = -ENOMEM;
1772 goto error;
1773 }
1774
1775 fence_req->state = OP_IN_PROGRESS;
1776 init_waitqueue_head(&fence_req->wq);
1777
1778 msg.src = ep->port;
1779 msg.uop = SCIF_MARK;
1780 msg.payload[0] = ep->remote_ep;
1781 msg.payload[1] = (uint64_t)fence_req;
1782
1783 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1784 goto error;
1785
1786retry:
1787 err = wait_event_timeout(fence_req->wq,
1788 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1789 if (!err && scifdev_alive(ep))
1790 goto retry;
1791 if (!err)
1792 err = -ENODEV;
1793 if (err > 0)
1794 err = 0;
1795 if (err < 0) {
1796 mutex_lock(&ep->rma_info.rma_lock);
1797 if (OP_IN_PROGRESS == fence_req->state)
1798 fence_req->state = OP_FAILED;
1799 mutex_unlock(&ep->rma_info.rma_lock);
1800 }
1801 if (OP_COMPLETED == fence_req->state)
1802 *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark;
1803
1804 if (OP_FAILED == fence_req->state && !err)
1805 err = -ENOMEM;
1806 mutex_lock(&ep->rma_info.rma_lock);
1807 mutex_unlock(&ep->rma_info.rma_lock);
1808 kfree(fence_req);
1809error:
1810 return err;
1811}
1812
1813/**
1814 * micscif_send_fence_wait:
1815 * @epd: end point descriptor.
1816 * @mark: DMA mark to wait for.
1817 *
1818 * Send a remote fence wait request.
1819 */
1820int micscif_send_fence_wait(scif_epd_t epd, int mark)
1821{
1822 int err;
1823 struct nodemsg msg;
1824 struct fence_info *fence_req;
1825 struct endpt *ep = (struct endpt *)epd;
1826
1827 if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
1828 err = -ENOMEM;
1829 goto error;
1830 }
1831
1832 fence_req->state = OP_IN_PROGRESS;
1833 init_waitqueue_head(&fence_req->wq);
1834
1835 msg.src = ep->port;
1836 msg.uop = SCIF_WAIT;
1837 msg.payload[0] = ep->remote_ep;
1838 msg.payload[1] = (uint64_t)fence_req;
1839 msg.payload[2] = mark;
1840
1841 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1842 goto error;
1843retry:
1844 err = wait_event_timeout(fence_req->wq,
1845 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1846 if (!err && scifdev_alive(ep))
1847 goto retry;
1848 if (!err)
1849 err = -ENODEV;
1850 if (err > 0)
1851 err = 0;
1852 if (err < 0) {
1853 mutex_lock(&ep->rma_info.rma_lock);
1854 if (OP_IN_PROGRESS == fence_req->state)
1855 fence_req->state = OP_FAILED;
1856 mutex_unlock(&ep->rma_info.rma_lock);
1857 }
1858 if (OP_FAILED == fence_req->state && !err)
1859 err = -ENOMEM;
1860 mutex_lock(&ep->rma_info.rma_lock);
1861 mutex_unlock(&ep->rma_info.rma_lock);
1862 kfree(fence_req);
1863error:
1864 return err;
1865}
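
/*
 * Illustrative sketch, not part of the original driver: a caller needing a
 * remote fence would typically pair the two requests above, first asking the
 * peer to program a DMA mark and later asking it to wait on that mark. The
 * helper name below is hypothetical.
 */
static inline int example_remote_fence(scif_epd_t epd)
{
	int mark, err;

	/* Peer programs a mark; it is returned with SCIF_REMOTE_FENCE set. */
	if ((err = micscif_send_fence_mark(epd, &mark)))
		return err;
	/* Hand the same mark back so the peer waits on its own DMA channel. */
	return micscif_send_fence_wait(epd, mark);
}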
1866
1867/**
1868 * micscif_send_fence_signal:
1869 * @epd - endpoint descriptor
1870 * @roff - remote offset
1871 * @rval - value to write at the remote offset
1872 * @loff - local offset
1873 * @lval - value to write at the local offset
1874 * @flags - flags
1875 *
1876 * Sends a remote fence signal request
1877 */
1878int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval,
1879 off_t loff, uint64_t lval, int flags)
1880{
1881 int err = 0;
1882 struct nodemsg msg;
1883 struct fence_info *fence_req;
1884 struct endpt *ep = (struct endpt *)epd;
1885
1886 if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
1887 err = -ENOMEM;
1888 goto error;
1889 }
1890
1891 fence_req->state = OP_IN_PROGRESS;
1892 init_waitqueue_head(&fence_req->wq);
1893
1894 msg.src = ep->port;
1895 if (flags & SCIF_SIGNAL_LOCAL) {
1896 msg.uop = SCIF_SIG_LOCAL;
1897 msg.payload[0] = ep->remote_ep;
1898 msg.payload[1] = roff;
1899 msg.payload[2] = rval;
1900 msg.payload[3] = (uint64_t)fence_req;
1901 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1902 goto error_free;
1903retry1:
1904 err = wait_event_timeout(fence_req->wq,
1905 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1906 if (!err && scifdev_alive(ep))
1907 goto retry1;
1908 if (!err)
1909 err = -ENODEV;
1910 if (err > 0)
1911 err = 0;
1912 if (err < 0) {
1913 mutex_lock(&ep->rma_info.rma_lock);
1914 if (OP_IN_PROGRESS == fence_req->state)
1915 fence_req->state = OP_FAILED;
1916 mutex_unlock(&ep->rma_info.rma_lock);
1917 }
1918 if (OP_FAILED == fence_req->state && !err) {
1919 err = -ENXIO;
1920 goto error_free;
1921 }
1922 }
1923 fence_req->state = OP_IN_PROGRESS;
1924
1925 if (flags & SCIF_SIGNAL_REMOTE) {
1926 msg.uop = SCIF_SIG_REMOTE;
1927 msg.payload[0] = ep->remote_ep;
1928 msg.payload[1] = loff;
1929 msg.payload[2] = lval;
1930 msg.payload[3] = (uint64_t)fence_req;
1931 if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
1932 goto error_free;
1933retry2:
1934 err = wait_event_timeout(fence_req->wq,
1935 (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
1936 if (!err && scifdev_alive(ep))
1937 goto retry2;
1938 if (!err)
1939 err = -ENODEV;
1940 if (err > 0)
1941 err = 0;
1942 if (err < 0) {
1943 mutex_lock(&ep->rma_info.rma_lock);
1944 if (OP_IN_PROGRESS == fence_req->state)
1945 fence_req->state = OP_FAILED;
1946 mutex_unlock(&ep->rma_info.rma_lock);
1947 }
1948 if (OP_FAILED == fence_req->state && !err) {
1949 err = -ENXIO;
1950 goto error_free;
1951 }
1952 }
1953error_free:
1954 mutex_lock(&ep->rma_info.rma_lock);
1955 mutex_unlock(&ep->rma_info.rma_lock);
1956 kfree(fence_req);
1957error:
1958 return err;
1959}
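
/*
 * Illustrative sketch, not part of the original driver: a caller wanting the
 * peer to perform both the local and the remote write would set both flag
 * bits. The helper name, offsets and values below are placeholders.
 */
static inline int example_fence_signal(scif_epd_t epd, off_t roff, off_t loff)
{
	/* Request that 1 be written at roff and at loff. */
	return micscif_send_fence_signal(epd, roff, 1, loff, 1,
					 SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE);
}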
1960
1961/*
1962 * micscif_fence_mark:
1963 *
1964 * @epd - endpoint descriptor
1965 * Set up a mark for this endpoint and return the value of the mark.
1966 */
1967int micscif_fence_mark(scif_epd_t epd)
1968{
1969 int mark = 0;
1970 struct endpt *ep = (struct endpt *)epd;
1971 struct dma_channel *chan = ep->rma_info.dma_chan;
1972
1973 if ((mark = request_dma_channel(chan)))
1974 goto error;
1975
1976 mark = program_dma_mark(chan);
1977
1978 free_dma_channel(chan);
1979error:
1980 return mark;
1981}
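
/*
 * Illustrative sketch, not part of the original driver: a fence-wait routine
 * would presumably use the SCIF_REMOTE_FENCE bit to tell a peer-generated
 * mark (micscif_send_fence_mark) apart from a local one (micscif_fence_mark
 * above). The helper name is hypothetical and dma_mark_wait() is assumed to
 * return an error code, as suggested by its use further below.
 */
static inline int example_fence_wait(scif_epd_t epd, int mark)
{
	struct endpt *ep = (struct endpt *)epd;

	if (mark & SCIF_REMOTE_FENCE)
		/* Ask the peer to wait on the mark it programmed. */
		return micscif_send_fence_wait(epd, mark);
	/* Local mark: wait for this endpoint's DMA channel to reach it. */
	return dma_mark_wait(ep->rma_info.dma_chan, mark, false);
}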
1982
1983/**
1984 * micscif_rma_destroy_temp_windows:
1985 *
1986 * This routine destroys temporary registered windows created
1987 * by scif_vreadfrom() and scif_vwriteto().
1988 */
1989void micscif_rma_destroy_temp_windows(void)
1990{
1991 struct list_head *item, *tmp;
1992 struct reg_range_t *window;
1993 struct endpt *ep;
1994 struct dma_channel *chan;
1995 might_sleep();
1996restart:
1997 spin_lock(&ms_info.mi_rmalock);
1998 list_for_each_safe(item, tmp, &ms_info.mi_rma) {
1999 window = list_entry(item,
2000 struct reg_range_t, list_member);
2001 ep = (struct endpt *)window->ep;
2002 chan = ep->rma_info.dma_chan;
2003
2004 list_del(&window->list_member);
2005 spin_unlock(&ms_info.mi_rmalock);
2006 micscif_inc_node_refcnt(ep->remote_dev, 1);
2007 if (!chan ||
2008 !scifdev_alive(ep) ||
2009 (!is_current_dma_mark(chan, window->dma_mark) &&
2010 is_dma_mark_processed(chan, window->dma_mark)) ||
2011 !drain_dma_intr(chan)) {
2012 micscif_dec_node_refcnt(ep->remote_dev, 1);
2013 /* Remove window from global list */
2014 window->unreg_state = OP_COMPLETED;
2015 } else {
2016 micscif_dec_node_refcnt(ep->remote_dev, 1);
2017 /* DMA engine hung ?? */
2018 printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d "
2019 "window->dma_mark 0x%x channel_mark 0x%x\n",
2020 __func__, __LINE__, get_chan_num(chan),
2021 ep->sd_state, window->dma_mark, get_dma_mark(chan));
2022 WARN_ON(1);
2023 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
2024 goto restart;
2025 }
2026
2027 if (OP_COMPLETED == window->unreg_state) {
2028 BUG_ON(atomic_sub_return((int32_t)window->nr_pages,
2029 &ep->rma_info.tw_total_pages) < 0);
2030 if (RMA_WINDOW_SELF == window->type)
2031 micscif_destroy_window(ep, window);
2032 else
2033 micscif_destroy_remote_window(ep, window);
2034 BUG_ON(atomic_dec_return(
2035 &ep->rma_info.tw_refcount) < 0);
2036 }
2037 goto restart;
2038 }
2039 spin_unlock(&ms_info.mi_rmalock);
2040}
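
/*
 * Illustrative note, not part of the original driver: the producer side of
 * this list is visible in the copy path above, which does roughly
 *
 *	atomic_inc(&ep->rma_info.tw_refcount);
 *	atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
 *	micscif_queue_for_cleanup(window, &ms_info.mi_rma);
 *
 * and this routine undoes both counters once the window's DMA mark has been
 * processed or the DMA channel has been drained.
 */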
2041
2042/**
2043 * __micscif_rma_destroy_tcw:
2044 *
2045 * This routine destroys temporary cached registered windows created
2046 * by scif_vreadfrom() and scif_vwriteto().
2047 */
2048static
2049void __micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
2050 struct endpt *ep, bool inrange,
2051 uint64_t start, uint64_t len)
2052{
2053 struct list_head *item, *tmp;
2054 struct reg_range_t *window;
2055 uint64_t start_va, end_va;
2056 uint64_t end = start + len;
2057 list_for_each_safe(item, tmp, &mmn->tc_reg_list) {
2058 window = list_entry(item,
2059 struct reg_range_t, list_member);
2060 ep = (struct endpt *)window->ep;
2061 if (inrange) {
2062 if (0 == len)
2063 break;
2064 start_va = (uint64_t)window->va_for_temp;
2065 end_va = start_va + (window->nr_pages << PAGE_SHIFT);
2066 if (start < start_va) {
2067 /*
2068 * Invalidated range ends before this window
2069 * begins: no overlap, stop scanning.
2070 */
2071 if (end <= start_va)
2072 break;
2073 } else {
2074 /* Range starts at or after this window ends: skip it. */
2075 if (start >= end_va)
2076 continue;
2077 }
2078 }
2079 __micscif_rma_destroy_tcw_helper(window);
2080 }
2081}
2082
2083static inline
2084void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
2085 struct endpt *ep, bool inrange,
2086 uint64_t start, uint64_t len)
2087{
2088 unsigned long sflags;
2089
2090 spin_lock_irqsave(&ep->rma_info.tc_lock, sflags);
2091 __micscif_rma_destroy_tcw(mmn, ep, inrange, start, len);
2092 spin_unlock_irqrestore(&ep->rma_info.tc_lock, sflags);
2093}
2094
2095static void __micscif_rma_destroy_tcw_ep(struct endpt *ep)
2096{
2097 struct list_head *item, *tmp;
2098 struct rma_mmu_notifier *mmn;
2099 spin_lock(&ep->rma_info.tc_lock);
2100 list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
2101 mmn = list_entry(item,
2102 struct rma_mmu_notifier, list_member);
2103 __micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
2104 }
2105 spin_unlock(&ep->rma_info.tc_lock);
2106}
2107
2108void micscif_rma_destroy_tcw_ep(struct endpt *ep)
2109{
2110 struct list_head *item, *tmp;
2111 struct rma_mmu_notifier *mmn;
2112 list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
2113 mmn = list_entry(item,
2114 struct rma_mmu_notifier, list_member);
2115 micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
2116 }
2117}
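
/*
 * Illustrative note, not part of the original driver: an MMU-notifier
 * invalidation callback would presumably tear down only the cached windows
 * overlapping the invalidated VA range, e.g.
 *
 *	micscif_rma_destroy_tcw(mmn, ep, true, start, end - start);
 *
 * whereas endpoint teardown uses micscif_rma_destroy_tcw_ep() above to drop
 * every cached window regardless of range.
 */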
2118
2119/**
2120 * micscif_rma_destroy_tcw_invalid:
2121 *
2122 * This routine destroys temporary cached windows which have been
2123 * invalidated and queued on the list passed in for destruction.
2124 */
2125void micscif_rma_destroy_tcw_invalid(struct list_head *list)
2126{
2127 struct list_head *item, *tmp;
2128 struct reg_range_t *window;
2129 struct endpt *ep;
2130 struct dma_channel *chan;
2131 might_sleep();
2132restart:
2133 spin_lock(&ms_info.mi_rmalock);
2134 list_for_each_safe(item, tmp, list) {
2135 window = list_entry(item,
2136 struct reg_range_t, list_member);
2137 ep = (struct endpt *)window->ep;
2138 chan = ep->rma_info.dma_chan;
2139 list_del(&window->list_member);
2140 spin_unlock(&ms_info.mi_rmalock);
2141 micscif_inc_node_refcnt(ep->remote_dev, 1);
2142 mutex_lock(&ep->rma_info.rma_lock);
2143 if (!chan ||
2144 !scifdev_alive(ep) ||
2145 (!is_current_dma_mark(chan, window->dma_mark) &&
2146 is_dma_mark_processed(chan, window->dma_mark)) ||
2147 !drain_dma_intr(chan)) {
2148 micscif_dec_node_refcnt(ep->remote_dev, 1);
2149 BUG_ON(atomic_sub_return((int32_t)window->nr_pages,
2150 &ep->rma_info.tcw_total_pages) < 0);
2151 micscif_destroy_window(ep, window);
2152 BUG_ON(atomic_dec_return(
2153 &ep->rma_info.tcw_refcount) < 0);
2154 } else {
2155 /* DMA engine hung ?? */
2156 printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d "
2157 "window->dma_mark 0x%x channel_mark 0x%x\n",
2158 __func__, __LINE__, get_chan_num(chan),
2159 ep->sd_state, window->dma_mark, get_dma_mark(chan));
2160 WARN_ON(1);
2161 mutex_unlock(&ep->rma_info.rma_lock);
2162 micscif_dec_node_refcnt(ep->remote_dev, 1);
2163 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
2164 goto restart;
2165 }
2166 mutex_unlock(&ep->rma_info.rma_lock);
2167 goto restart;
2168 }
2169 spin_unlock(&ms_info.mi_rmalock);
2170}
2171
2172/**
2173 * micscif_rma_handle_remote_fences:
2174 *
2175 * This routine services remote fence requests.
2176 */
2177void micscif_rma_handle_remote_fences(void)
2178{
2179 struct list_head *item, *tmp;
2180 struct remote_fence_info *fence;
2181 struct endpt *ep;
2182 int mark;
2183
2184 might_sleep();
2185 mutex_lock(&ms_info.mi_fencelock);
2186 list_for_each_safe(item, tmp, &ms_info.mi_fence) {
2187 fence = list_entry(item,
2188 struct remote_fence_info, list_member);
2189 /* Remove fence from global list */
2190 list_del(&fence->list_member);
2191
2192 /* Initiate the fence operation */
2193 ep = (struct endpt *)fence->msg.payload[0];
2194 mark = (int)fence->msg.payload[2];
2195 BUG_ON(!(mark & SCIF_REMOTE_FENCE));
2196 if (dma_mark_wait(ep->rma_info.dma_chan,
2197 mark & ~SCIF_REMOTE_FENCE, false)) {
2198 printk(KERN_ERR "%s %d err\n", __func__, __LINE__);
2199 fence->msg.uop = SCIF_WAIT_NACK;
2200 } else {
2201 fence->msg.uop = SCIF_WAIT_ACK;
2202 }
2203 micscif_inc_node_refcnt(ep->remote_dev, 1);
2204 fence->msg.payload[0] = ep->remote_ep;
2205 /* No error handling for Notification messages. */
2206 micscif_nodeqp_send(ep->remote_dev, &fence->msg, ep);
2207 micscif_dec_node_refcnt(ep->remote_dev, 1);
2208 kfree(fence);
2209 /*
2210 * Decrement ref count and wake up
2211 * any thread blocked in the EP close routine waiting
2212 * for all such remote fence requests to complete.
2213 */
2214 ep->rma_info.fence_refcount--;
2215 wake_up(&ep->rma_info.fence_wq);
2216 }
2217 mutex_unlock(&ms_info.mi_fencelock);
2218}
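
/*
 * Illustrative note, not part of the original driver: the endpoint close
 * routine referred to above would presumably block until every queued remote
 * fence request has been serviced, e.g.
 *
 *	wait_event(ep->rma_info.fence_wq, !ep->rma_info.fence_refcount);
 */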
2219
2220#ifdef CONFIG_MMU_NOTIFIER
2221void micscif_mmu_notif_handler(struct work_struct *work)
2222{
2223 struct list_head *pos, *tmpq;
2224 struct endpt *ep;
2225restart:
2226 micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
2227 spin_lock(&ms_info.mi_rmalock);
2228 list_for_each_safe(pos, tmpq, &ms_info.mi_mmu_notif_cleanup) {
2229 ep = list_entry(pos, struct endpt, mmu_list);
2230 list_del(&ep->mmu_list);
2231 spin_unlock(&ms_info.mi_rmalock);
2232 BUG_ON(list_empty(&ep->rma_info.mmn_list));
2233
2234 micscif_rma_destroy_tcw_ep(ep);
2235 ep_unregister_mmu_notifier(ep);
2236 queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
2237 goto restart;
2238 }
2239 spin_unlock(&ms_info.mi_rmalock);
2240}
2241#endif
2242
2243/**
2244 * micscif_reserve_dma_chan:
2245 * @ep: Endpoint Descriptor.
2246 *
2247 * This routine reserves a DMA channel for a particular
2248 * endpoint. All DMA transfers for an endpoint are always
2249 * programmed on the same DMA channel.
2250 */
2251int micscif_reserve_dma_chan(struct endpt *ep)
2252{
2253 int err = 0;
2254#ifndef _MIC_SCIF_
2255 /*
2256 * Host Loopback cannot use DMA by design and hence
2257 * reserving DMA channels is a nop.
2258 */
2259 if (is_self_scifdev(ep->remote_dev))
2260 return 0;
2261#endif
2262 mutex_lock(&ep->rma_info.rma_lock);
2263 if (!ep->rma_info.dma_chan) {
2264 struct dma_channel **chan = &ep->rma_info.dma_chan;
2265 unsigned long ts = jiffies;
2266#ifndef _MIC_SCIF_
2267 mic_ctx_t *mic_ctx =
2268 get_per_dev_ctx(ep->remote_dev->sd_node - 1);
2269 BUG_ON(!ep->remote_dev->sd_node);
2270#endif
2271 while (true) {
2272 if (!(err = allocate_dma_channel((struct mic_dma_ctx_t *)
2273#ifdef _MIC_SCIF_
2274 mic_dma_handle,
2275#else
2276 mic_ctx->dma_handle,
2277#endif
2278 chan)))
2279 break;
2280 schedule();
2281 if (time_after(jiffies,
2282 ts + NODE_ALIVE_TIMEOUT)) {
2283 err = -EBUSY;
2284 goto error;
2285 }
2286 }
2287 mic_dma_thread_free_chan(*chan);
2288 }
2289error:
2290 mutex_unlock(&ep->rma_info.rma_lock);
2291 return err;
2292}
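
/*
 * Illustrative sketch, not part of the original driver: an RMA setup path
 * would reserve the endpoint's DMA channel once, before the first transfer,
 * so later copies can simply pick up ep->rma_info.dma_chan. The call is
 * idempotent; the helper name is hypothetical.
 */
static inline int example_rma_setup(struct endpt *ep)
{
	int err;

	if ((err = micscif_reserve_dma_chan(ep)))
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	return err;
}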
2293
2294/*
2295 * micscif_prog_signal:
2296 * @epd - Endpoint Descriptor
2297 * @offset - registered address
2298 * @val - Value to be programmed in SUD.
2299 * @type - Type of the window.
2300 *
2301 * Program a status update descriptor after ensuring that the offset
2302 * provided is indeed valid.
2303 */
2304int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val,
2305 enum rma_window_type type)
2306{
2307 struct endpt *ep = (struct endpt *)epd;
2308 struct dma_channel *chan = ep->rma_info.dma_chan;
2309 struct reg_range_t *window = NULL;
2310 struct micscif_rma_req req;
2311 int err;
2312 dma_addr_t phys;
2313
2314 mutex_lock(&ep->rma_info.rma_lock);
2315 req.out_window = &window;
2316 req.offset = offset;
2317 req.nr_bytes = sizeof(uint64_t);
2318 req.prot = SCIF_PROT_WRITE;
2319 req.type = WINDOW_SINGLE;
2320 if (RMA_WINDOW_SELF == type)
2321 req.head = &ep->rma_info.reg_list;
2322 else
2323 req.head = &ep->rma_info.remote_reg_list;
2324 /* Does a valid window exist? */
2325 if ((err = micscif_query_window(&req))) {
2326 printk(KERN_ERR "%s %d err %d\n",
2327 __func__, __LINE__, err);
2328 goto unlock_ret;
2329 }
2330 RMA_MAGIC(window);
2331
2332#ifndef _MIC_SCIF_
2333 if (unlikely(is_self_scifdev(ep->remote_dev))) {
2334 void *dst_virt;
2335 if (RMA_WINDOW_SELF == type)
2336 dst_virt = get_local_va(offset, window,
2337 sizeof(uint64_t));
2338 else {
2339 struct page **pages = ((struct reg_range_t *)
2340 (window->peer_window))->pinned_pages->pages;
2341 int page_nr = (int)((offset - window->offset) >> PAGE_SHIFT);
2342 off_t page_off = offset & ~PAGE_MASK;
2343 dst_virt = (void *)((uint64_t)phys_to_virt(page_to_phys(
2344 pages[page_nr])) | page_off);
2345 }
2346 *(uint64_t*)dst_virt = val;
2347 goto unlock_ret;
2348 }
2349#endif
2350 phys = micscif_get_dma_addr(window, offset, NULL, NULL, NULL);
2351 if ((err = request_dma_channel(chan)))
2352 goto unlock_ret;
2353 err = do_status_update(chan, phys, val);
2354 free_dma_channel(chan);
2355unlock_ret:
2356 mutex_unlock(&ep->rma_info.rma_lock);
2357 return err;
2358}
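
/*
 * Illustrative sketch, not part of the original driver: a completion path
 * could use micscif_prog_signal() above to post a caller-supplied value at a
 * registered offset behind previously queued DMA on the channel. The helper
 * name, offset and value are placeholders.
 */
static inline int example_signal_completion(scif_epd_t epd, off_t offset)
{
	/* Write 1 into the local (self) registered window at offset. */
	return micscif_prog_signal(epd, offset, 1, RMA_WINDOW_SELF);
}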
2359
2360/*
2361 * __micscif_kill_apps_with_mmaps:
2362 * @ep - The SCIF endpoint
2363 *
2364 * Kill the applications which have valid remote memory mappings
2365 * created via scif_mmap(..).
2366 */
2367static void __micscif_kill_apps_with_mmaps(struct endpt *ep)
2368{
2369 struct list_head *item;
2370 struct rma_task_info *info;
2371
2372 spin_lock(&ep->lock);
2373 list_for_each(item, &ep->rma_info.task_list) {
2374 info = list_entry(item, struct rma_task_info, list_member);
2375 kill_pid(info->pid, SIGKILL, 1);
2376 pr_debug("%s ep %p pid %p ref %d\n",
2377 __func__, ep, info->pid, info->ref_count);
2378 }
2379 spin_unlock(&ep->lock);
2380}
2381
2382/*
2383 * _micscif_kill_apps_with_mmaps:
2384 * @node - remote node id.
2385 * @head - head of the list of endpoints to kill.
2386 *
2387 * Traverse the list of endpoints for a particular remote node and
2388 * kill applications with valid remote memory mappings.
2389 */
2390static void _micscif_kill_apps_with_mmaps(int node, struct list_head *head)
2391{
2392 struct endpt *ep;
2393 unsigned long sflags;
2394 struct list_head *item;
2395
2396 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
2397 list_for_each(item, head) {
2398 ep = list_entry(item, struct endpt, list);
2399 if (ep->remote_dev->sd_node == node)
2400 __micscif_kill_apps_with_mmaps(ep);
2401 }
2402 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
2403}
2404
2405/*
2406 * micscif_kill_apps_with_mmaps:
2407 * @node - remote node id.
2408 *
2409 * Wrapper for killing applications with valid remote memory mappings
2410 * for a particular node. This API is called by peer nodes as part of
2411 * handling a lost node.
2412 */
2413void micscif_kill_apps_with_mmaps(int node)
2414{
2415 _micscif_kill_apps_with_mmaps(node, &ms_info.mi_connected);
2416 _micscif_kill_apps_with_mmaps(node, &ms_info.mi_disconnected);
2417}
2418
2419/*
2420 * micscif_query_apps_with_mmaps:
2421 * @node - remote node id.
2422 * @head - head of the list of endpoints to query.
2423 *
2424 * Query if any applications for a remote node have valid remote memory
2425 * mappings.
2426 */
2427static bool micscif_query_apps_with_mmaps(int node, struct list_head *head)
2428{
2429 struct endpt *ep;
2430 unsigned long sflags;
2431 struct list_head *item;
2432 bool ret = false;
2433
2434 spin_lock_irqsave(&ms_info.mi_connlock, sflags);
2435 list_for_each(item, head) {
2436 ep = list_entry(item, struct endpt, list);
2437 if (ep->remote_dev->sd_node == node &&
2438 !list_empty(&ep->rma_info.task_list)) {
2439 ret = true;
2440 break;
2441 }
2442 }
2443 spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
2444 return ret;
2445}
2446
2447/*
2448 * micscif_rma_do_apps_have_mmaps:
2449 * @node - remote node id.
2450 *
2451 * Wrapper for querying if any applications have remote memory mappings
2452 * for a particular node.
2453 */
2454bool micscif_rma_do_apps_have_mmaps(int node)
2455{
2456 return (micscif_query_apps_with_mmaps(node, &ms_info.mi_connected) ||
2457 micscif_query_apps_with_mmaps(node, &ms_info.mi_disconnected));
2458}
2459
2460/*
2461 * __micscif_cleanup_rma_for_zombies:
2462 * @ep - The SCIF endpoint
2463 *
2464 * This API is only called while handling a lost node:
2465 * a) Remote node is dead.
2466 * b) All endpoints with remote memory mappings have been killed.
2467 * So we can traverse the remote_reg_list without any locks. Since
2468 * the window has not yet been unregistered we can drop the ref count
2469 * and queue it to the cleanup thread.
2470 */
2471static void __micscif_cleanup_rma_for_zombies(struct endpt *ep)
2472{
2473 struct list_head *pos, *tmp;
2474 struct reg_range_t *window;
2475
2476 list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
2477 window = list_entry(pos, struct reg_range_t, list_member);
2478 /* If unregistration is complete then why is it on the list? */
2479 WARN_ON(window->unreg_state == OP_COMPLETED);
2480 if (window->ref_count)
2481 put_window_ref_count(window, window->nr_pages);
2482 if (!window->ref_count) {
2483 atomic_inc(&ep->rma_info.tw_refcount);
2484 atomic_add_return((int32_t)window->nr_pages,
2485 &ep->rma_info.tw_total_pages);
2486 list_del(&window->list_member);
2487 micscif_queue_for_cleanup(window, &ms_info.mi_rma);
2488 }
2489 }
2490}
2491
2492/*
2493 * micscif_cleanup_rma_for_zombies:
2494 * @node - remote node id.
2495 *
2496 * Cleanup remote registration lists for zombie endpoints.
2497 */
2498void micscif_cleanup_rma_for_zombies(int node)
2499{
2500 struct endpt *ep;
2501 unsigned long sflags;
2502 struct list_head *item;
2503
2504 spin_lock_irqsave(&ms_info.mi_eplock, sflags);
2505 list_for_each(item, &ms_info.mi_zombie) {
2506 ep = list_entry(item, struct endpt, list);
2507 if (ep->remote_dev && ep->remote_dev->sd_node == node) {
2508 /*
2509 * If the zombie endpoint remote node matches the lost
2510 * node then the scifdev should not be alive.
2511 */
2512 WARN_ON(scifdev_alive(ep));
2513 __micscif_cleanup_rma_for_zombies(ep);
2514 }
2515 }
2516 spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
2517}
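
/*
 * Illustrative sketch, not part of the original driver: lost-node handling
 * presumably strings the helpers above together, first killing the mapping
 * tasks, then waiting for their mappings to disappear, and finally reclaiming
 * windows left behind on zombie endpoints. The helper name is hypothetical.
 */
static inline void example_handle_lost_node(int node)
{
	micscif_kill_apps_with_mmaps(node);
	/* Give the killed tasks time to tear down their mappings. */
	while (micscif_rma_do_apps_have_mmaps(node))
		schedule();
	micscif_cleanup_rma_for_zombies(node);
}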
2518
2519/*
2520 * micscif_rma_get_task:
2521 *
2522 * Store the mapping task's tgid and bump up the number of remote mappings.
2523 * If this is the first remote memory mapping for this endpoint then
2524 * create a new rma_task_info entry in the epd task list.
2525 */
2526int micscif_rma_get_task(struct endpt *ep, int nr_pages)
2527{
2528 struct list_head *item;
2529 struct rma_task_info *info;
2530 int err = 0;
2531
2532 spin_lock(&ep->lock);
2533 list_for_each(item, &ep->rma_info.task_list) {
2534 info = list_entry(item, struct rma_task_info, list_member);
2535 if (info->pid == task_tgid(current)) {
2536 info->ref_count += nr_pages;
2537 pr_debug("%s ep %p existing pid %p ref %d\n",
2538 __func__, ep, info->pid, info->ref_count);
2539 goto unlock;
2540 }
2541 }
2542 spin_unlock(&ep->lock);
2543
2544 /* A new task is mapping this window. Create a new entry */
2545 if (!(info = kzalloc(sizeof(*info), GFP_KERNEL))) {
2546 err = -ENOMEM;
2547 goto done;
2548 }
2549
2550 info->pid = get_pid(task_tgid(current));
2551 info->ref_count = nr_pages;
2552 pr_debug("%s ep %p new pid %p ref %d\n",
2553 __func__, ep, info->pid, info->ref_count);
2554 spin_lock(&ep->lock);
2555 list_add_tail(&info->list_member, &ep->rma_info.task_list);
2556unlock:
2557 spin_unlock(&ep->lock);
2558done:
2559 return err;
2560}
2561
2562/*
2563 * micscif_rma_put_task:
2564 *
2565 * Bump down the number of remote mappings. If the ref count for this
2566 * particular task drops to zero then remove the rma_task_info from
2567 * the epd task list.
2568 */
2569void micscif_rma_put_task(struct endpt *ep, int nr_pages)
2570{
2571 struct list_head *item;
2572 struct rma_task_info *info;
2573
2574 spin_lock(&ep->lock);
2575 list_for_each(item, &ep->rma_info.task_list) {
2576 info = list_entry(item, struct rma_task_info, list_member);
2577 if (info->pid == task_tgid(current)) {
2578 info->ref_count -= nr_pages;
2579 pr_debug("%s ep %p pid %p ref %d\n",
2580 __func__, ep, info->pid, info->ref_count);
2581 if (!info->ref_count) {
2582 list_del(&info->list_member);
2583 put_pid(info->pid);
2584 kfree(info);
2585 }
2586 goto done;
2587 }
2588 }
2589 /* Why was the task not found? This is a bug. */
2590 WARN_ON(1);
2591done:
2592 spin_unlock(&ep->lock);
2593 return;
2594}
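
/*
 * Illustrative sketch, not part of the original driver: the mmap and munmap
 * paths would presumably pair the two helpers above so each mapping task is
 * tracked for the lifetime of its mapping. The helper name is hypothetical;
 * the actual mapping work is elided.
 */
static inline int example_track_mapping(struct endpt *ep, int nr_pages)
{
	int err;

	/* On mmap: record current's tgid and charge it nr_pages. */
	if ((err = micscif_rma_get_task(ep, nr_pages)))
		return err;
	/* ... establish the mapping here ... */
	/* On munmap (or on failure): give the same nr_pages back. */
	micscif_rma_put_task(ep, nr_pages);
	return 0;
}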
2595
2596/* Only debug API's below */
2597void micscif_display_window(struct reg_range_t *window, const char *s, int line)
2598{
2599 int j;
2600
2601 printk("%s %d window %p type %d temp %d offset 0x%llx"
2602 " nr_pages 0x%llx nr_contig_chunks 0x%llx"
2603 " prot %d ref_count %d magic 0x%llx peer_window 0x%llx"
2604 " unreg_state 0x%x va_for_temp %p\n",
2605 s, line, window, window->type, window->temp,
2606 window->offset, window->nr_pages, window->nr_contig_chunks,
2607 window->prot, window->ref_count, window->magic,
2608 window->peer_window, window->unreg_state, window->va_for_temp);
2609
2610 for (j = 0; j < window->nr_contig_chunks; j++)
2611 pr_debug("page[%d] = dma_addr 0x%llx num_pages 0x%x\n",
2612 j,
2613 window->dma_addr[j],
2614 window->num_pages[j]);
2615
2616 if (RMA_WINDOW_SELF == window->type && window->pinned_pages)
2617 for (j = 0; j < window->nr_pages; j++)
2618 pr_debug("page[%d] = pinned_pages %p address %p\n",
2619 j, window->pinned_pages->pages[j],
2620 page_address(window->pinned_pages->pages[j]));
2621
2622#ifdef CONFIG_ML1OM
2623 if (window->temp_phys_addr)
2624 for (j = 0; j < window->nr_contig_chunks; j++)
2625 pr_debug("page[%d] = temp_phys_addr 0x%llx\n",
2626 j, window->temp_phys_addr[j]);
2627 if (window->phys_addr)
2628 for (j = 0; j < window->nr_pages; j++)
2629 pr_debug("page[%d] = phys_addr 0x%llx\n",
2630 j, window->phys_addr[j]);
2631#endif
2632 RMA_MAGIC(window);
2633}
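
/*
 * Illustrative usage note, not part of the original driver: callers would
 * typically pass their own context for the tag and line arguments, e.g.
 *
 *	micscif_display_window(window, __func__, __LINE__);
 */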