/* include/mic/micscif_rma.h, from mpss-modules-3.8.6.tar.bz2 (Intel Xeon Phi kernel module) */
/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */

#ifndef MICSCIF_RMA_H
#define MICSCIF_RMA_H

#ifdef CONFIG_MMU_NOTIFIER
#include <linux/mmu_notifier.h>
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#include <linux/huge_mm.h>
#endif
#ifdef CONFIG_HUGETLB_PAGE
#include <linux/hugetlb.h>
#endif
#endif
#include "scif.h"
#include <linux/errno.h>
#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <asm/io.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/irqflags.h>
#include <linux/time.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/semaphore.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <asm/bug.h>
#include <linux/pci.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <asm/atomic.h>
#include <linux/netdevice.h>
#include <linux/debugfs.h>
#include "mic/micscif_kmem_cache.h"

struct rma_mmu_notifier {
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier ep_mmu_notifier;
#endif
	bool ep_mn_registered;
	/* List of temp registration windows for self */
	struct list_head tc_reg_list;
	struct mm_struct *mm;
	struct endpt *ep;
	struct list_head list_member;
};

/* Per endpoint remote memory access information */
struct endpt_rma_info {
	/* List of registration windows for self */
	struct list_head reg_list;
	/* List of registration windows for peer */
	struct list_head remote_reg_list;
	/* Offset generator */
	struct va_gen_addr va_gen;
	/*
	 * Synchronizes access to the self/remote lists and also
	 * protects the window from being destroyed while
	 * RMAs are in progress.
	 */
	struct mutex rma_lock;
	/*
	 * Synchronizes access to the temporary cached windows list
	 * for SCIF registration caching.
	 */
	spinlock_t tc_lock;
	/*
	 * Synchronizes access to the list of MMU notifiers
	 * registered for this SCIF endpoint.
	 */
	struct mutex mmn_lock;
	/*
	 * Synchronizes access to the SCIF registered address space
	 * offset generator.
	 */
	struct mutex va_lock;
	/*
	 * Keeps track of the number of outstanding temporary registered
	 * windows created by scif_vreadfrom/scif_vwriteto which have
	 * not been destroyed. tcw refers to the number of temporary
	 * cached windows and the total number of pages pinned.
	 */
	atomic_t tw_refcount;
	atomic_t tw_total_pages;
	atomic_t tcw_refcount;
	atomic_t tcw_total_pages;
	/*
	 * List of MMU notifiers, so that the windows can be destroyed
	 * when the underlying address space mappings change.
	 */
	struct list_head mmn_list;
	/*
	 * Keeps track of the number of outstanding remote fence requests
	 * which have been received by the peer.
	 */
	int fence_refcount;
	/*
	 * The close routine blocks on this wait queue to ensure that all
	 * remote fence requests have been serviced.
	 */
	wait_queue_head_t fence_wq;
	/* DMA channel used for all DMA transfers for this endpoint */
	struct dma_channel *dma_chan;
	/* Detect asynchronous list entry deletion */
	int async_list_del;
#ifdef _MIC_SCIF_
	/* Local P2P proxy DMA virtual address for SUD updates by peer */
	void *proxy_dma_va;
	/* Local P2P proxy DMA physical address location for SUD updates */
	dma_addr_t proxy_dma_phys;
	/* Remote P2P proxy DMA physical address location for SUD updates */
	dma_addr_t proxy_dma_peer_phys;
#endif
	/* List of tasks which have remote memory mappings */
	struct list_head task_list;
};

/* Information used for tracking remote fence requests */
struct fence_info {
	/* State of this transfer */
	enum micscif_msg_state state;

	/* Fences wait on this queue */
	wait_queue_head_t wq;

	/* Used for storing the DMA mark */
	int dma_mark;
};

/* Per remote fence wait request */
struct remote_fence_info {
	/* The SCIF_WAIT message */
	struct nodemsg msg;

	struct list_head list_member;
};

/* Self or Peer window */
enum rma_window_type {
	RMA_WINDOW_SELF = 0x1,
	RMA_WINDOW_PEER
};

/* The number of physical addresses that can be stored in a page. */
#define NR_PHYS_ADDR_IN_PAGE	(PAGE_SIZE >> 3)

/*
 * Stores an array of lookup offsets. Each offset in this array maps
 * one 4K page containing 512 physical addresses, i.e. 2MB. 512 such
 * offsets in a 4K page will correspond to 1GB of registered address space.
 */
struct rma_lookup {
	/* Array of offsets */
	dma_addr_t *lookup;
	/* Offset used to map lookup array */
	dma_addr_t offset;
};
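
/*
 * Worked example (not part of the original header): with 4K pages,
 * NR_PHYS_ADDR_IN_PAGE is 512, so physical-address entry 'page_nr' of a
 * registered range lives in lookup page page_nr / 512 at slot
 * page_nr % 512. A minimal sketch of that arithmetic; 'lookup_va' is a
 * hypothetical array of kernel virtual addresses shadowing the DMA
 * offsets stored in struct rma_lookup, not a field defined in this header.
 */
static __always_inline dma_addr_t
rma_lookup_example(dma_addr_t **lookup_va, uint64_t page_nr)
{
	uint64_t lookup_page = page_nr / NR_PHYS_ADDR_IN_PAGE;	/* which 4K lookup page */
	uint64_t entry = page_nr % NR_PHYS_ADDR_IN_PAGE;	/* slot within that page */

	return lookup_va[lookup_page][entry];
}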

/*
 * A set of pinned pages obtained with scif_pin_pages() which could be part
 * of multiple registered windows across different end points.
 */
struct scif_pinned_pages {
	int64_t nr_pages;
	int prot;
	int map_flags;
	atomic_t ref_count;
	uint64_t magic;
	/*
	 * Array of pointers to struct page populated
	 * with get_user_pages(..)
	 */
	struct page **pages;
	int *num_pages;
	int64_t nr_contig_chunks;
	/* Only for hosts without THP but with hugetlbfs, like SuSE 11 SP1 */
	struct vm_area_struct **vma;
};

/*
 * Information about a particular task which has remote memory mappings
 * created via scif_mmap(..).
 */
struct rma_task_info {
	/*
	 * Stores the pid struct of the grp_leader task structure which
	 * scif_mmap(..)'d the remote window.
	 */
	struct pid *pid;
	int ref_count;
	struct list_head list_member;
};

/* Registration Window for Self */
struct reg_range_t {
	int64_t nr_pages;
	/* Number of contiguous physical chunks */
	int64_t nr_contig_chunks;
	int prot;
	int ref_count;
	/* Cookie to detect corruption */
	uint64_t magic;
	uint64_t offset;
	/*
	 * VA that this window represents.
	 * Useful only for temporary windows.
	 */
	void *va_for_temp;
	/* Used for temporary windows */
	int dma_mark;
	/*
	 * Pointer to EP. Useful for passing EP around
	 * with messages to avoid expensive list
	 * traversals.
	 */
	uint64_t ep;

	struct list_head list_member;

	enum rma_window_type type;

	/*
	 * Pointer to peer window. Useful for sending
	 * messages to peer without requiring an
	 * extra list traversal.
	 */
	uint64_t peer_window;

	/* Unregistration state */
	enum micscif_msg_state unreg_state;

	/*
	 * True for temporary windows created via
	 * scif_vreadfrom/scif_vwriteto.
	 */
	bool temp;

	bool offset_freed;

	/* Local P2P proxy DMA physical address location for SUD updates */
	dma_addr_t proxy_dma_phys;

	union {
		/* Self RAS */
		struct {
			/* The set of pinned_pages backing this window */
			struct scif_pinned_pages *pinned_pages;

			/* Handle for sending ALLOC_REQ */
			struct allocmsg alloc_handle;

			/* Wait Queue for a registration (N)ACK */
			wait_queue_head_t regwq;

			/* Registration state */
			enum micscif_msg_state reg_state;

			/* Wait Queue for an unregistration (N)ACK */
			wait_queue_head_t unregwq;
		};
		/* Peer RAS specific window elements */
		struct {
#ifdef CONFIG_ML1OM
			/* Lookup for physical addresses used for mmap */
			struct rma_lookup phys_addr_lookup;

			/* Lookup for temp physical addresses used for mmap */
			struct rma_lookup temp_phys_addr_lookup;

			/* Mmap state */
			enum micscif_msg_state gttmap_state;

			/* Wait Queue for an mmap (N)ACK */
			wait_queue_head_t gttmapwq;

			/* Ref count per page */
			int *page_ref_count;
#endif
			/* Lookup for physical addresses used for DMA */
			struct rma_lookup dma_addr_lookup;

			/* Number of entries in lookup */
			int nr_lookup;

			/* Offset used to map the window by the peer */
			dma_addr_t mapped_offset;

			/* Ref count for tracking scif_get_pages */
			int get_put_ref_count;
		};
	};
#ifdef CONFIG_ML1OM
	/*
	 * Array of physical addresses used for creating VtoP mappings.
	 * FIXME: these are the phys_addrs as seen by the peer node, the node
	 * at the opposite end of the endpt.
	 */
	dma_addr_t *phys_addr;

	/* Temporary array for storing physical addresses for performance */
	dma_addr_t *temp_phys_addr;
#endif

	/* Array of physical addresses used for Host & MIC initiated DMA */
	dma_addr_t *dma_addr;

	/* Array specifying number of pages for each physical address */
	int *num_pages;
	struct mm_struct *mm;
} __attribute__ ((packed));

#define RMA_MAGIC(x) BUG_ON((x)->magic != SCIFEP_MAGIC)

/* If this bit is set then the mark is a remote fence mark */
#define SCIF_REMOTE_FENCE_BIT	30
/* Magic value used to indicate a remote fence request */
#define SCIF_REMOTE_FENCE	(1ULL << SCIF_REMOTE_FENCE_BIT)
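
/*
 * Illustrative helper (not part of the original header): a mark value with
 * SCIF_REMOTE_FENCE_BIT set denotes a remote fence request rather than a
 * plain local DMA mark. A minimal sketch, assuming the mark is carried in
 * a 64-bit payload; the helper name is hypothetical.
 */
static __always_inline bool is_remote_fence_mark(uint64_t mark)
{
	return (mark & SCIF_REMOTE_FENCE) != 0;
}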

enum rma_direction {
	LOCAL_TO_REMOTE,
	REMOTE_TO_LOCAL
};

/* Initialize RMA for this EP */
int micscif_rma_ep_init(struct endpt *ep);

/* Check if epd can be uninitialized */
int micscif_rma_ep_can_uninit(struct endpt *ep);

/* Obtain a new offset. Callee must grab the RMA lock */
int micscif_get_window_offset(struct endpt *ep, int flags,
		uint64_t offset, size_t len, uint64_t *out_offset);

/* Free an offset. Callee must grab the RMA lock */
void micscif_free_window_offset(struct endpt *ep,
		uint64_t offset, size_t len);

/* Create self registration window */
struct reg_range_t *micscif_create_window(struct endpt *ep,
		int64_t nr_pages, uint64_t offset, bool temp);

/* Create a set of pinned pages */
struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot);

/* Destroy a set of pinned pages */
int micscif_destroy_pinned_pages(struct scif_pinned_pages *pages);

/* Destroy self registration window */
int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window);

int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window);

/* Map pages of self window to Aperture/PCI */
int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool temp);

/* Unregister a self window */
int micscif_unregister_window(struct reg_range_t *window);

/* Create remote registration window */
struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages);

/* Destroy remote registration window */
void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window);

int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window);

/* Prepare a remote registration window */
int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window);

/* Create remote lookup entries for physical addresses */
int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window);

/* Destroy remote lookup entries for physical addresses */
void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window);

/* Send a SCIF_REGISTER message and wait for an ACK */
int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window);

/* Send a SCIF_UNREGISTER message */
int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window);

/* RMA copy API */
int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len,
		off_t roffset, int flags, enum rma_direction dir, bool last_chunk);

/* Sends a remote fence mark request */
int micscif_send_fence_mark(scif_epd_t epd, int *out_mark);

/* Sends a remote fence wait request */
int micscif_send_fence_wait(scif_epd_t epd, int mark);

/* Sends a remote fence signal request */
int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval,
		off_t loff, uint64_t lval, int flags);

/* Set up a DMA mark for an endpoint */
int micscif_fence_mark(scif_epd_t epd);

void ep_unregister_mmu_notifier(struct endpt *ep);
#ifdef CONFIG_MMU_NOTIFIER
void micscif_mmu_notif_handler(struct work_struct *work);
#endif

void micscif_rma_destroy_temp_windows(void);
void micscif_rma_destroy_tcw_ep(struct endpt *ep);
void micscif_rma_destroy_tcw_invalid(struct list_head *list);

void micscif_rma_handle_remote_fences(void);

/* Reserve a DMA channel for a particular endpoint */
int micscif_reserve_dma_chan(struct endpt *ep);

/* Program DMA SUDs after verifying the registered offset */
int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val,
		enum rma_window_type type);

/* Kill any applications which have valid remote memory mappings */
void micscif_kill_apps_with_mmaps(int node);

/* Query whether any applications have remote memory mappings */
bool micscif_rma_do_apps_have_mmaps(int node);

/* Get a reference to the current task which is creating a remote memory mapping */
int micscif_rma_get_task(struct endpt *ep, int nr_pages);

/* Release a reference to the current task which is destroying a remote memory mapping */
void micscif_rma_put_task(struct endpt *ep, int nr_pages);

/* Clean up remote registration lists for zombie endpoints */
void micscif_cleanup_rma_for_zombies(int node);

#ifdef _MIC_SCIF_
void micscif_teardown_proxy_dma(struct endpt *ep);
#endif

static __always_inline
bool is_unaligned(off_t src_offset, off_t dst_offset)
{
	src_offset = src_offset & (L1_CACHE_BYTES - 1);
	dst_offset = dst_offset & (L1_CACHE_BYTES - 1);
	return src_offset != dst_offset;
}

static __always_inline
int __scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
		off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx"
		 " roffset 0x%lx flags 0x%x\n",
		 epd, loffset, len, roffset, flags);

	if (is_unaligned(loffset, roffset)) {
		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, loffset, NULL,
					MAX_UNALIGNED_BUF_SIZE,
					roffset, flags, REMOTE_TO_LOCAL, false);
			if (err)
				goto readfrom_err;
			loffset += MAX_UNALIGNED_BUF_SIZE;
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, loffset, NULL, len,
			roffset, flags, REMOTE_TO_LOCAL, true);
readfrom_err:
	return err;
}

static __always_inline
int __scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
		off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx"
		 " roffset 0x%lx flags 0x%x\n",
		 epd, loffset, len, roffset, flags);

	if (is_unaligned(loffset, roffset)) {
		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, loffset, NULL,
					MAX_UNALIGNED_BUF_SIZE,
					roffset, flags, LOCAL_TO_REMOTE, false);
			if (err)
				goto writeto_err;
			loffset += MAX_UNALIGNED_BUF_SIZE;
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, loffset, NULL, len,
			roffset, flags, LOCAL_TO_REMOTE, true);
writeto_err:
	return err;
}

static __always_inline
int __scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI vreadfrom: ep %p addr %p len 0x%lx"
		 " roffset 0x%lx flags 0x%x\n",
		 epd, addr, len, roffset, flags);

	if (is_unaligned((off_t)addr, roffset)) {
		if (len > MAX_UNALIGNED_BUF_SIZE)
			flags &= ~SCIF_RMA_USECACHE;

		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, 0, addr,
					MAX_UNALIGNED_BUF_SIZE,
					roffset, flags, REMOTE_TO_LOCAL, false);
			if (err)
				goto vreadfrom_err;
			addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE);
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, 0, addr, len,
			roffset, flags, REMOTE_TO_LOCAL, true);
vreadfrom_err:
	return err;
}

static __always_inline
int __scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI vwriteto: ep %p addr %p len 0x%lx"
		 " roffset 0x%lx flags 0x%x\n",
		 epd, addr, len, roffset, flags);

	if (is_unaligned((off_t)addr, roffset)) {
		if (len > MAX_UNALIGNED_BUF_SIZE)
			flags &= ~SCIF_RMA_USECACHE;

		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, 0, addr,
					MAX_UNALIGNED_BUF_SIZE,
					roffset, flags, LOCAL_TO_REMOTE, false);
			if (err)
				goto vwriteto_err;
			addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE);
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, 0, addr, len,
			roffset, flags, LOCAL_TO_REMOTE, true);
vwriteto_err:
	return err;
}

void micscif_rma_completion_cb(uint64_t data);

int micscif_pci_dev(uint16_t node, struct pci_dev **pdev);
#ifndef _MIC_SCIF_
int micscif_pci_info(uint16_t node, struct scif_pci_info *dev);
#endif

/*
 * nr_pages in a 2MB page is specified via the top 12 bits of the
 * physical address.
 */

/*
 * TODO: double-check the parentheses in these macros, and consider
 * whether encoding nr_pages in the low bits would make more sense.
 */
#define RMA_HUGE_NR_PAGE_SHIFT	(52)
#define RMA_HUGE_NR_PAGE_MASK	(0xFFFULL << RMA_HUGE_NR_PAGE_SHIFT)
#define RMA_GET_NR_PAGES(addr)	((addr) >> RMA_HUGE_NR_PAGE_SHIFT)
#define RMA_SET_NR_PAGES(addr, nr_pages) ((addr) = (((nr_pages) & 0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT) | ((uint64_t)(addr)))
#define RMA_GET_ADDR(addr)	((addr) & ~RMA_HUGE_NR_PAGE_MASK)
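
/*
 * Worked example (illustrative only, not part of the original header):
 * RMA_SET_NR_PAGES folds a page count into bits [63:52] of a physical
 * address, and RMA_GET_NR_PAGES/RMA_GET_ADDR recover the two halves.
 * A minimal round-trip sketch, assuming a physical address that fits in
 * the low 52 bits:
 */
static __always_inline void rma_nr_pages_encoding_example(void)
{
	uint64_t addr = 0x200000ULL;		/* 2MB-aligned physical address */

	RMA_SET_NR_PAGES(addr, 512);		/* chunk spans 512 4K pages */
	BUG_ON(RMA_GET_NR_PAGES(addr) != 512);	/* top 12 bits decode back */
	BUG_ON(RMA_GET_ADDR(addr) != 0x200000ULL); /* low bits are untouched */
}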

extern bool mic_huge_page_enable;

#define SCIF_HUGE_PAGE_SHIFT	21

/*
 * micscif_is_huge_page:
 * @pinned_pages: A set of pinned pages.
 * @index: Index into the pinned pages array.
 */
static __always_inline int
micscif_is_huge_page(struct scif_pinned_pages *pinned_pages, int index)
{
	int huge = 0;
	struct page *page = pinned_pages->pages[index];

	if (compound_order(page) + PAGE_SHIFT == SCIF_HUGE_PAGE_SHIFT)
		huge = 1;
	if (huge)
		ms_info.nr_2mb_pages++;
	if (!mic_huge_page_enable)
		huge = 0;
#ifdef RMA_DEBUG
	WARN_ON(!page_count(page));
	WARN_ON(page_mapcount(page) < 0);
#endif
	return huge;
}

/*
 * micscif_detect_large_page:
 * @pinned_pages: A set of pinned pages.
 * @addr: The virtual address the pages were pinned at.
 */
static __always_inline int
micscif_detect_large_page(struct scif_pinned_pages *pinned_pages, char *addr)
{
	int i = 0, nr_pages, huge;
	char *next_huge, *end;
	char *end_addr = addr + (pinned_pages->nr_pages << PAGE_SHIFT);

	while (addr < end_addr) {
		huge = micscif_is_huge_page(pinned_pages, i);
		if (huge) {
			next_huge = (char *)ALIGN(
					(unsigned long)(addr + 1),
					PMD_SIZE);
			end = next_huge > end_addr ? end_addr : next_huge;
			nr_pages = (int)((end - addr) >> PAGE_SHIFT);
			pinned_pages->num_pages[i] = nr_pages;
			addr = end;
			i += nr_pages;
		} else {
			pinned_pages->num_pages[i] = 1;
			i++;
			addr += PAGE_SIZE;
			ms_info.nr_4k_pages++;
		}
		pinned_pages->nr_contig_chunks++;
	}
	return 0;
}

/**
 * micscif_set_nr_pages:
 * @dev: The MIC SCIF device
 * @window: self registration window
 *
 * Set nr_pages in every entry of the physical address/DMA address arrays
 * and also remove the nr_pages information from the physical addresses.
 */
static __always_inline void
micscif_set_nr_pages(struct micscif_dev *dev, struct reg_range_t *window)
{
	int j;
#ifdef CONFIG_ML1OM
	int l = 0, k;
#endif

	for (j = 0; j < window->nr_contig_chunks; j++) {
		window->num_pages[j] = RMA_GET_NR_PAGES(window->dma_addr[j]);
		if (window->num_pages[j])
			window->dma_addr[j] = RMA_GET_ADDR(window->dma_addr[j]);
		else
			break;
#ifdef CONFIG_ML1OM
		for (k = 0; k < window->num_pages[j]; k++)
			if (window->temp_phys_addr[j])
				window->phys_addr[l + k] =
					RMA_GET_ADDR(window->temp_phys_addr[j]) + (k << PAGE_SHIFT);
		l += window->num_pages[j];
#endif
	}
}

#ifdef CONFIG_ML1OM
/*
 * micscif_get_phys_addr:
 * Obtain the phys_addr given the window and the offset.
 * @window: Registered window.
 * @off: Window offset.
 */
static __always_inline dma_addr_t
micscif_get_phys_addr(struct reg_range_t *window, uint64_t off)
{
	int page_nr = (off - window->offset) >> PAGE_SHIFT;
	off_t page_off = off & ~PAGE_MASK;
	return window->phys_addr[page_nr] | page_off;
}
#endif

#define RMA_ERROR_CODE	(~(dma_addr_t)0x0)

/*
 * micscif_get_dma_addr:
 * Obtain the dma_addr given the window and the offset.
 * @window: Registered window.
 * @off: Window offset.
 * @nr_bytes: Return the number of contiguous bytes until the next entry
 *	in the dma_addr array.
 * @index: Return the index into the dma_addr array that was found.
 * @start_off: Start offset of the dma_addr array index that was found.
 * nr_bytes gives the caller an estimate of the maximum possible DMA
 * transfer, while index/start_off provide faster lookups for the next
 * iteration.
 */
static __always_inline dma_addr_t
micscif_get_dma_addr(struct reg_range_t *window, uint64_t off, size_t *nr_bytes, int *index, uint64_t *start_off)
{
	if (window->nr_pages == window->nr_contig_chunks) {
		int page_nr = (int)((off - window->offset) >> PAGE_SHIFT);
		off_t page_off = off & ~PAGE_MASK;

		if (nr_bytes)
			*nr_bytes = PAGE_SIZE - page_off;
		if (page_nr >= window->nr_pages) {
			printk(KERN_ERR "%s dma_addr out of bounds\n", __func__);
			return RMA_ERROR_CODE;
		}
		return window->dma_addr[page_nr] | page_off;
	} else {
		int i = index ? *index : 0;
		uint64_t end;
		uint64_t start = start_off ? *start_off : window->offset;

		for (; i < window->nr_contig_chunks; i++) {
			end = start + (window->num_pages[i] << PAGE_SHIFT);
			if (off >= start && off < end) {
				if (index)
					*index = i;
				if (start_off)
					*start_off = start;
				if (nr_bytes)
					*nr_bytes = end - off;
				return window->dma_addr[i] + (off - start);
			}
			start += (window->num_pages[i] << PAGE_SHIFT);
		}
	}
#ifdef CONFIG_MK1OM
	printk(KERN_ERR "%s %d BUG. Addr not found? window %p off 0x%llx\n", __func__, __LINE__, window, off);
	BUG_ON(1);
#endif
	return RMA_ERROR_CODE;
}
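
/*
 * Usage sketch (illustrative, not part of the original header): a DMA loop
 * can walk a registered window by letting micscif_get_dma_addr() cache the
 * chunk index and its start offset across iterations, so each lookup after
 * the first is O(1) instead of a scan. The transfer step below is
 * hypothetical; only the lookup pattern is the point.
 */
static __always_inline void
micscif_window_walk_example(struct reg_range_t *window, uint64_t off, size_t len)
{
	size_t nr_bytes = 0;
	int index = 0;
	uint64_t start_off = window->offset;
	dma_addr_t addr;

	while (len) {
		addr = micscif_get_dma_addr(window, off, &nr_bytes, &index, &start_off);
		if (addr == RMA_ERROR_CODE)
			break;
		nr_bytes = min(nr_bytes, len);	/* clamp to the remaining length */
		/* program_dma(addr, nr_bytes); -- hypothetical transfer step */
		off += nr_bytes;
		len -= nr_bytes;
	}
}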

/*
 * scif_memset:
 * @va: kernel virtual address
 * @c: The byte used to fill the memory
 * @size: Buffer size
 *
 * Helper API which fills size bytes of memory pointed to by va with the
 * constant byte c. This API fills the memory in chunks of 4GB - 1 bytes
 * per invocation of memset(..) to work around an x86_64 kernel bug
 * (https://bugzilla.kernel.org/show_bug.cgi?id=27732) where memset(..)
 * does no work at all for sizes >= 4GB. The bug was fixed upstream in
 * v3.2 via the commit titled "x86-64: Fix memset() to support sizes of
 * 4Gb and above" but has not been backported to distributions like
 * RHEL 6.3 yet.
 */
static __always_inline void scif_memset(char *va, int c, size_t size)
{
	size_t loop_size;
	const size_t four_gb = 4 * 1024 * 1024 * 1024ULL;

	while (size) {
		loop_size = min(size, four_gb - 1);
		memset(va, c, loop_size);
		size -= loop_size;
		va += loop_size;
	}
}

/*
 * scif_zalloc:
 * @size: Size of the allocation request.
 *
 * Helper API which attempts to allocate zeroed pages via
 * __get_free_pages(..) first and then falls back on
 * vmalloc(..) if that fails. This is required because
 * vmalloc(..) is *slow*.
 */
static __always_inline void *scif_zalloc(size_t size)
{
	void *ret;
	size_t align = ALIGN(size, PAGE_SIZE);

	if (!align)
		return NULL;

	if (align <= (1 << (MAX_ORDER + PAGE_SHIFT - 1)))
		if ((ret = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
						    get_order(align))))
			goto done;
	if (!(ret = vmalloc(align)))
		return NULL;

	/* TODO: Use vzalloc once the kernel supports it */
	scif_memset(ret, 0, size);
done:
#ifdef RMA_DEBUG
	atomic_long_add_return(align, &ms_info.rma_alloc_cnt);
#endif
	return ret;
}

/*
 * scif_free:
 * @addr: Address to be freed.
 * @size: Size of the allocation.
 * Helper API which frees memory allocated via scif_zalloc().
 */
static __always_inline void scif_free(void *addr, size_t size)
{
	size_t align = ALIGN(size, PAGE_SIZE);

	if (unlikely(is_vmalloc_addr(addr)))
		vfree(addr);
	else
		free_pages((unsigned long)addr, get_order(align));
#ifdef RMA_DEBUG
	WARN_ON(atomic_long_sub_return(align, &ms_info.rma_alloc_cnt) < 0);
#endif
}
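
/*
 * Usage sketch (illustrative, not part of the original header): callers
 * must pass the original request size back to scif_free() so that the
 * same page-aligned size is used for the free (and for the RMA_DEBUG
 * accounting) as was used for the allocation.
 */
static __always_inline int scif_zalloc_free_example(void)
{
	size_t nbytes = 3 * PAGE_SIZE + 1;	/* rounded up to 4 pages internally */
	void *buf = scif_zalloc(nbytes);

	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	scif_free(buf, nbytes);	/* same size as the allocation request */
	return 0;
}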

static __always_inline void
get_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
{
	window->ref_count += (int)nr_pages;
}

static __always_inline void
put_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
{
	window->ref_count -= (int)nr_pages;
	BUG_ON(window->ref_count < 0);
}

static __always_inline void
set_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
{
	window->ref_count = (int)nr_pages;
}

/* Debug APIs */
void micscif_display_window(struct reg_range_t *window, const char *s, int line);

static inline struct mm_struct *__scif_acquire_mm(void)
{
	if (mic_ulimit_check) {
#ifdef RMA_DEBUG
		atomic_long_add_return(1, &ms_info.rma_mm_cnt);
#endif
		return get_task_mm(current);
	}
	return NULL;
}

static inline void __scif_release_mm(struct mm_struct *mm)
{
	if (mic_ulimit_check && mm) {
#ifdef RMA_DEBUG
		WARN_ON(atomic_long_sub_return(1, &ms_info.rma_mm_cnt) < 0);
#endif
		mmput(mm);
	}
}

static inline int __scif_dec_pinned_vm_lock(struct mm_struct *mm,
		int64_t nr_pages, bool try_lock)
{
	if (mm && nr_pages && mic_ulimit_check) {
		if (try_lock) {
			if (!down_write_trylock(&mm->mmap_sem))
				return -1;
		} else {
			down_write(&mm->mmap_sem);
		}
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
		mm->pinned_vm -= nr_pages;
#else
		mm->locked_vm -= nr_pages;
#endif
		up_write(&mm->mmap_sem);
	}
	return 0;
}

static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm,
		int64_t nr_pages)
{
	if (mm && mic_ulimit_check && nr_pages) {
		unsigned long locked, lock_limit;

		locked = nr_pages;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
		locked += mm->pinned_vm;
#else
		locked += mm->locked_vm;
#endif
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
			pr_debug("locked(%lu) > lock_limit(%lu)\n",
				 locked, lock_limit);
			return -ENOMEM;
		}
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
		mm->pinned_vm = locked;
#else
		mm->locked_vm = locked;
#endif
	}
	return 0;
}
#endif /* MICSCIF_RMA_H */