/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */

#ifndef MICSCIF_RMA_H
#define MICSCIF_RMA_H

#ifdef CONFIG_MMU_NOTIFIER
#include <linux/mmu_notifier.h>
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#include <linux/huge_mm.h>
#endif
#ifdef CONFIG_HUGETLB_PAGE
#include <linux/hugetlb.h>
#endif
#endif
#include "scif.h"
#include <linux/errno.h>
#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <asm/io.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/irqflags.h>
#include <linux/time.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/semaphore.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <asm/bug.h>
#include <linux/pci.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <asm/atomic.h>
#include <linux/netdevice.h>
#include <linux/debugfs.h>
#include "mic/micscif_kmem_cache.h"

struct rma_mmu_notifier {
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier ep_mmu_notifier;
#endif
	bool ep_mn_registered;
	/* List of temp registration windows for self */
	struct list_head tc_reg_list;
	struct mm_struct *mm;
	struct endpt *ep;
	struct list_head list_member;
};

/* Per Endpoint Remote Memory Access Information */
struct endpt_rma_info {
	/* List of registration windows for self */
	struct list_head reg_list;
	/* List of registration windows for peer */
	struct list_head remote_reg_list;
	/* Offset generator */
	struct va_gen_addr va_gen;
	/*
	 * Synchronizes access to self/remote list and also
	 * protects the window from being destroyed while
	 * RMAs are in progress.
	 */
	struct mutex rma_lock;
	/*
	 * Synchronizes access to temporary cached windows list
	 * for SCIF Registration Caching.
	 */
	spinlock_t tc_lock;
	/*
	 * Synchronizes access to the list of MMU notifiers
	 * registered for this SCIF endpoint.
	 */
	struct mutex mmn_lock;
	/*
	 * Synchronizes access to the SCIF registered address space
	 * offset generator.
	 */
	struct mutex va_lock;
	/*
	 * Keeps track of the number of outstanding temporary registered
	 * windows created by scif_vreadfrom/scif_vwriteto which have
	 * not been destroyed. tcw refers to the number of temporary
	 * cached windows and the total number of pages pinned.
	 */
	atomic_t tw_refcount;
	atomic_t tw_total_pages;
	atomic_t tcw_refcount;
	atomic_t tcw_total_pages;
	/*
	 * List of MMU notifiers so that the windows can be destroyed
	 * when the underlying address space mappings change.
	 */
	struct list_head mmn_list;
	/*
	 * Keeps track of number of outstanding remote fence requests
	 * which have been received by the peer.
	 */
	int fence_refcount;
	/*
	 * The close routine blocks on this wait queue to ensure that all
	 * remote fence requests have been serviced.
	 */
	wait_queue_head_t fence_wq;
	/*
	 * DMA channel used for all DMA transfers for this endpoint.
	 */
	struct dma_channel *dma_chan;
	/* Detect asynchronous list entry deletion */
	int async_list_del;
#ifdef _MIC_SCIF_
	/* Local P2P proxy DMA virtual address for SUD updates by peer */
	void *proxy_dma_va;
	/* Local P2P proxy DMA physical address location for SUD updates */
	dma_addr_t proxy_dma_phys;
	/* Remote P2P proxy DMA physical address location for SUD updates */
	dma_addr_t proxy_dma_peer_phys;
#endif
	/* List of tasks which have remote memory mappings */
	struct list_head task_list;
};

/* Information used for tracking remote fence requests */
struct fence_info {
	/* State of this transfer */
	enum micscif_msg_state state;

	/* Fences wait on this queue */
	wait_queue_head_t wq;

	/* Used for storing the DMA mark */
	int dma_mark;
};

/* Per remote fence wait request */
struct remote_fence_info {
	/* The SCIF_WAIT message */
	struct nodemsg msg;

	struct list_head list_member;
};

/* Self or Peer window */
enum rma_window_type {
	RMA_WINDOW_SELF = 0x1,
	RMA_WINDOW_PEER
};

/* The number of physical addresses that can be stored in a PAGE. */
#define NR_PHYS_ADDR_IN_PAGE	(PAGE_SIZE >> 3)

/*
 * Store an array of lookup offsets. Each offset in this array maps
 * one 4K page containing 512 physical addresses, i.e. 2MB. 512 such
 * offsets in a 4K page will correspond to 1GB of registered address space.
 */
struct rma_lookup {
	/* Array of offsets */
	dma_addr_t *lookup;
	/* Offset used to map lookup array */
	dma_addr_t offset;
};
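
/*
 * Worked example of the sizing above (illustrative, assuming 4K pages and
 * 8-byte DMA addresses): NR_PHYS_ADDR_IN_PAGE == 4096 >> 3 == 512, so one
 * lookup page describes 512 * 4K == 2MB of registered memory, and one page
 * worth of lookup offsets (512 entries) covers 512 * 2MB == 1GB.
 */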


/*
 * A set of pinned pages obtained with scif_pin_pages() which could be part
 * of multiple registered windows across different end points.
 */
struct scif_pinned_pages {
	int64_t nr_pages;
	int prot;
	int map_flags;
	atomic_t ref_count;
	uint64_t magic;
	/*
	 * Array of pointers to struct page populated
	 * with get_user_pages(..)
	 */
	struct page **pages;
	int *num_pages;
	int64_t nr_contig_chunks;
	/* Only for hosts without THP but with a Huge TLB FS, like SuSE 11 SP1 */
	struct vm_area_struct **vma;
};
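
/*
 * Note: pages[] above holds one struct page pointer per pinned 4K page,
 * while num_pages[] and nr_contig_chunks are filled in later by
 * micscif_detect_large_page() so that each contiguous chunk records how
 * many 4K pages it spans.
 */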

/*
 * Information about a particular task which has remote memory mappings
 * created via scif_mmap(..).
 */
struct rma_task_info {
	/*
	 * Stores the pid struct of the grp_leader task structure which
	 * scif_mmap(..)'d the remote window.
	 */
	struct pid *pid;
	int ref_count;
	struct list_head list_member;
};

/* Registration Window for Self */
struct reg_range_t {
	int64_t nr_pages;
	/* Number of contiguous physical chunks */
	int64_t nr_contig_chunks;
	int prot;
	int ref_count;
	/* Cookie to detect corruption */
	uint64_t magic;
	uint64_t offset;
	/*
	 * VA address that this window represents.
	 * Useful only for temporary windows.
	 */
	void *va_for_temp;
	/* Used for temporary windows */
	int dma_mark;
	/*
	 * Pointer to EP. Useful for passing EP around
	 * with messages to avoid expensive list
	 * traversals.
	 */
	uint64_t ep;

	struct list_head list_member;

	enum rma_window_type type;

	/*
	 * Pointer to peer window. Useful for sending
	 * messages to peer without requiring an
	 * extra list traversal.
	 */
	uint64_t peer_window;

	/* Unregistration state */
	enum micscif_msg_state unreg_state;

	/*
	 * True for temporary windows created via
	 * scif_vreadfrom/scif_vwriteto.
	 */
	bool temp;

	bool offset_freed;

	/* Local P2P proxy DMA physical address location for SUD updates */
	dma_addr_t proxy_dma_phys;

	union {
		/* Self RAS specific window elements */
		struct {
			/* The set of pinned_pages backing this window */
			struct scif_pinned_pages *pinned_pages;

			/* Handle for sending ALLOC_REQ */
			struct allocmsg alloc_handle;

			/* Wait queue for a registration (N)ACK */
			wait_queue_head_t regwq;

			/* Registration state */
			enum micscif_msg_state reg_state;

			/* Wait queue for an unregistration (N)ACK */
			wait_queue_head_t unregwq;
		};
		/* Peer RAS specific window elements */
		struct {
#ifdef CONFIG_ML1OM
			/* Lookup for physical addresses used for mmap */
			struct rma_lookup phys_addr_lookup;

			/* Lookup for temp physical addresses used for mmap */
			struct rma_lookup temp_phys_addr_lookup;

			/* Mmap state */
			enum micscif_msg_state gttmap_state;

			/* Wait queue for a GTT map (N)ACK */
			wait_queue_head_t gttmapwq;

			/* Ref count per page */
			int *page_ref_count;
#endif
			/* Lookup for physical addresses used for DMA */
			struct rma_lookup dma_addr_lookup;

			/* Number of entries in lookup */
			int nr_lookup;

			/* Offset used to map the window by the peer */
			dma_addr_t mapped_offset;

			/* Ref count for tracking scif_get_pages */
			int get_put_ref_count;
		};
	};
#ifdef CONFIG_ML1OM
	/* Array of physical addresses used for creating VtoP mappings */
	/* FIXME: these are phys_addr as seen by the peer node, the node at
	 * the opposite end of the endpt
	 */
	dma_addr_t *phys_addr;

	/* Temporary array for storing physical addresses for performance */
	dma_addr_t *temp_phys_addr;
#endif

	/* Array of physical addresses used for Host & MIC initiated DMA */
	dma_addr_t *dma_addr;

	/* Array specifying number of pages for each physical address */
	int *num_pages;
	struct mm_struct *mm;
} __attribute__ ((packed));
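
/*
 * The anonymous union above is discriminated by the 'type' field:
 * RMA_WINDOW_SELF windows use the pinned_pages/alloc_handle/reg_state
 * members, while RMA_WINDOW_PEER windows use the lookup tables and
 * mapped_offset describing how the peer exposed its memory.
 */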


#define RMA_MAGIC(x) BUG_ON((x)->magic != SCIFEP_MAGIC)

/* If this bit is set then the mark is a remote fence mark */
#define SCIF_REMOTE_FENCE_BIT	30
/* Magic value used to indicate a remote fence request */
#define SCIF_REMOTE_FENCE	(1ULL << SCIF_REMOTE_FENCE_BIT)

enum rma_direction {
	LOCAL_TO_REMOTE,
	REMOTE_TO_LOCAL
};

/* Initialize RMA for this EP */
int micscif_rma_ep_init(struct endpt *ep);

/* Check if epd can be uninitialized */
int micscif_rma_ep_can_uninit(struct endpt *ep);

/* Obtain a new offset. Callee must grab RMA lock */
int micscif_get_window_offset(struct endpt *ep, int flags,
		uint64_t offset, size_t len, uint64_t *out_offset);

/* Free offset. Callee must grab RMA lock */
void micscif_free_window_offset(struct endpt *ep,
		uint64_t offset, size_t len);

/* Create self registration window */
struct reg_range_t *micscif_create_window(struct endpt *ep,
		int64_t nr_pages, uint64_t offset, bool temp);

/* Create a set of pinned pages */
struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot);

/* Destroy a set of pinned pages */
int micscif_destroy_pinned_pages(struct scif_pinned_pages *pages);

/* Destroy self registration window */
int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window);

int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window);

/* Map pages of self window to Aperture/PCI */
int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool temp);

/* Unregister a self window */
int micscif_unregister_window(struct reg_range_t *window);

/* Create remote registration window */
struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages);

/* Destroy remote registration window */
void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window);

int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window);

/* Prepare a remote registration window */
int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window);

/* Create remote lookup entries for physical addresses */
int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window);

/* Destroy remote lookup entries for physical addresses */
void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window);

/* Send a SCIF_REGISTER message and wait for an ACK */
int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window);

/* Send a SCIF_UNREGISTER message */
int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window);

/* RMA copy API */
int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len,
		off_t roffset, int flags, enum rma_direction dir, bool last_chunk);

/* Sends a remote fence mark request */
int micscif_send_fence_mark(scif_epd_t epd, int *out_mark);

/* Sends a remote fence wait request */
int micscif_send_fence_wait(scif_epd_t epd, int mark);

/* Sends a remote fence signal request */
int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval,
		off_t loff, uint64_t lval, int flags);

/* Setup a DMA mark for an endpoint */
int micscif_fence_mark(scif_epd_t epd);

void ep_unregister_mmu_notifier(struct endpt *ep);
#ifdef CONFIG_MMU_NOTIFIER
void micscif_mmu_notif_handler(struct work_struct *work);
#endif

void micscif_rma_destroy_temp_windows(void);
void micscif_rma_destroy_tcw_ep(struct endpt *ep);
void micscif_rma_destroy_tcw_invalid(struct list_head *list);

void micscif_rma_handle_remote_fences(void);

/* Reserve a DMA channel for a particular endpoint */
int micscif_reserve_dma_chan(struct endpt *ep);

/* Program DMA SUDs after verifying the registered offset */
int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val,
		enum rma_window_type type);

/* Kill any applications which have valid remote memory mappings */
void micscif_kill_apps_with_mmaps(int node);

/* Query if any applications have remote memory mappings */
bool micscif_rma_do_apps_have_mmaps(int node);

/* Get a reference to the current task which is creating a remote memory mapping */
int micscif_rma_get_task(struct endpt *ep, int nr_pages);

/* Release a reference to the current task which is destroying a remote memory mapping */
void micscif_rma_put_task(struct endpt *ep, int nr_pages);

/* Cleanup remote registration lists for zombie endpoints */
void micscif_cleanup_rma_for_zombies(int node);

#ifdef _MIC_SCIF_
void micscif_teardown_proxy_dma(struct endpt *ep);
#endif

static __always_inline
bool is_unaligned(off_t src_offset, off_t dst_offset)
{
	src_offset = src_offset & (L1_CACHE_BYTES - 1);
	dst_offset = dst_offset & (L1_CACHE_BYTES - 1);
	if (src_offset == dst_offset)
		return false;
	else
		return true;
}
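
/*
 * Example (illustrative, assuming L1_CACHE_BYTES == 64): offsets 0x1000 and
 * 0x2040 fall on the same byte of a cache line, so is_unaligned() returns
 * false; 0x1000 and 0x2044 fall on different bytes within a line, so it
 * returns true and the helpers below copy in chunks of at most
 * MAX_UNALIGNED_BUF_SIZE.
 */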

static __always_inline
int __scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
		off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx"
		" roffset 0x%lx flags 0x%x\n",
		epd, loffset, len, roffset, flags);

	if (is_unaligned(loffset, roffset)) {
		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, loffset, NULL,
				MAX_UNALIGNED_BUF_SIZE,
				roffset, flags, REMOTE_TO_LOCAL, false);
			if (err)
				goto readfrom_err;
			loffset += MAX_UNALIGNED_BUF_SIZE;
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, loffset, NULL, len,
		roffset, flags, REMOTE_TO_LOCAL, true);
readfrom_err:
	return err;
}
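
/*
 * Chunking example for the helper above (illustrative; the real value of
 * MAX_UNALIGNED_BUF_SIZE is defined elsewhere in the driver): assuming a
 * 1MB MAX_UNALIGNED_BUF_SIZE, an unaligned 2.5MB read is issued as two 1MB
 * micscif_rma_copy() calls with last_chunk == false followed by a final
 * 0.5MB call with last_chunk == true, while an aligned read of any size is
 * issued as a single call with last_chunk == true.
 */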

static __always_inline
int __scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
		off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx"
		" roffset 0x%lx flags 0x%x\n",
		epd, loffset, len, roffset, flags);

	if (is_unaligned(loffset, roffset)) {
		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, loffset, NULL,
				MAX_UNALIGNED_BUF_SIZE,
				roffset, flags, LOCAL_TO_REMOTE, false);
			if (err)
				goto writeto_err;
			loffset += MAX_UNALIGNED_BUF_SIZE;
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, loffset, NULL, len,
		roffset, flags, LOCAL_TO_REMOTE, true);
writeto_err:
	return err;
}

static __always_inline
int __scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI vreadfrom: ep %p addr %p len 0x%lx"
		" roffset 0x%lx flags 0x%x\n",
		epd, addr, len, roffset, flags);

	if (is_unaligned((off_t)addr, roffset)) {
		if (len > MAX_UNALIGNED_BUF_SIZE)
			flags &= ~SCIF_RMA_USECACHE;

		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, 0, addr,
				MAX_UNALIGNED_BUF_SIZE,
				roffset, flags, REMOTE_TO_LOCAL, false);
			if (err)
				goto vreadfrom_err;
			addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE);
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, 0, addr, len,
		roffset, flags, REMOTE_TO_LOCAL, true);
vreadfrom_err:
	return err;
}

static __always_inline
int __scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags)
{
	int err;

	pr_debug("SCIFAPI vwriteto: ep %p addr %p len 0x%lx"
		" roffset 0x%lx flags 0x%x\n",
		epd, addr, len, roffset, flags);

	if (is_unaligned((off_t)addr, roffset)) {
		if (len > MAX_UNALIGNED_BUF_SIZE)
			flags &= ~SCIF_RMA_USECACHE;

		while (len > MAX_UNALIGNED_BUF_SIZE) {
			err = micscif_rma_copy(epd, 0, addr,
				MAX_UNALIGNED_BUF_SIZE,
				roffset, flags, LOCAL_TO_REMOTE, false);
			if (err)
				goto vwriteto_err;
			addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE);
			roffset += MAX_UNALIGNED_BUF_SIZE;
			len -= MAX_UNALIGNED_BUF_SIZE;
		}
	}
	err = micscif_rma_copy(epd, 0, addr, len,
		roffset, flags, LOCAL_TO_REMOTE, true);
vwriteto_err:
	return err;
}

void micscif_rma_completion_cb(uint64_t data);

int micscif_pci_dev(uint16_t node, struct pci_dev **pdev);
#ifndef _MIC_SCIF_
int micscif_pci_info(uint16_t node, struct scif_pci_info *dev);
#endif

/*
 * nr_pages in a 2MB page is specified via the top 12 bits in the
 * physical address.
 */

/* TODO: audit the parentheses in these macros and see if moving them to the
 * bottom of the file makes sense.
 */
#define RMA_HUGE_NR_PAGE_SHIFT	(52)
#define RMA_HUGE_NR_PAGE_MASK	((0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT)
#define RMA_GET_NR_PAGES(addr)	((addr) >> RMA_HUGE_NR_PAGE_SHIFT)
#define RMA_SET_NR_PAGES(addr, nr_pages) \
	((addr) = (((nr_pages) & 0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT) | ((uint64_t)(addr)))
#define RMA_GET_ADDR(addr)	((addr) & ~(RMA_HUGE_NR_PAGE_MASK))
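
/*
 * Encoding example (illustrative): for a 2MB huge page at physical address
 * 0x380000000, RMA_SET_NR_PAGES(addr, 512) stores 512 (the number of 4K
 * pages) in bits 63:52 of addr, RMA_GET_NR_PAGES(addr) then returns 512,
 * and RMA_GET_ADDR(addr) recovers the original 0x380000000. This assumes
 * physical addresses fit below bit 52.
 */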

extern bool mic_huge_page_enable;

#define SCIF_HUGE_PAGE_SHIFT	21

/*
 * micscif_is_huge_page:
 * @pinned_pages: A set of pinned pages.
 * @index: Index of the page to check within @pinned_pages.
 *
 * Returns 1 if the pinned page at @index is part of a 2MB huge page and
 * huge page support is enabled, else 0.
 */
static __always_inline int
micscif_is_huge_page(struct scif_pinned_pages *pinned_pages, int index)
{
	int huge = 0;
	struct page *page = pinned_pages->pages[index];

	if (compound_order(page) + PAGE_SHIFT == SCIF_HUGE_PAGE_SHIFT)
		huge = 1;
	if (huge)
		ms_info.nr_2mb_pages++;
	if (!mic_huge_page_enable)
		huge = 0;
#ifdef RMA_DEBUG
	WARN_ON(!page_count(page));
	WARN_ON(page_mapcount(page) < 0);
#endif
	return huge;
}

/*
 * micscif_detect_large_page:
 * @pinned_pages: A set of pinned pages.
 * @addr: Starting virtual address of the pinned range.
 *
 * Walks the pinned pages and records, per contiguous chunk, how many 4K
 * pages it spans (1 for a normal page, up to 512 for a 2MB huge page),
 * updating num_pages[] and nr_contig_chunks as it goes.
 */
static __always_inline int
micscif_detect_large_page(struct scif_pinned_pages *pinned_pages, char *addr)
{
	int i = 0, nr_pages, huge;
	char *next_huge, *end;
	char *end_addr = addr + (pinned_pages->nr_pages << PAGE_SHIFT);

	while (addr < end_addr) {
		huge = micscif_is_huge_page(pinned_pages, i);
		if (huge) {
			next_huge = (char *)ALIGN(
				(unsigned long)(addr + 1),
				PMD_SIZE);
			end = next_huge > end_addr ? end_addr : next_huge;
			nr_pages = (int)((end - addr) >> PAGE_SHIFT);
			pinned_pages->num_pages[i] = (int)nr_pages;
			addr = end;
			i += (int)nr_pages;
		} else {
			pinned_pages->num_pages[i] = 1;
			i++;
			addr += PAGE_SIZE;
			ms_info.nr_4k_pages++;
		}
		pinned_pages->nr_contig_chunks++;
	}
	return 0;
}
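
/*
 * Example (illustrative): pinning 3MB that starts 1MB into a 2MB huge page
 * and is followed by a full huge page produces two chunks: num_pages[0] ==
 * 256 (the 1MB up to the next PMD boundary) and num_pages[256] == 512 (the
 * full 2MB page), with nr_contig_chunks == 2. A purely 4K-backed range
 * instead produces one chunk per page with num_pages[i] == 1.
 */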

/**
 * micscif_set_nr_pages:
 * @dev: The micscif device (not referenced by this helper)
 * @window: self registration window
 *
 * Set nr_pages in every entry of the physical address/DMA address arrays
 * and also remove the nr_pages information from the physical addresses.
 */
static __always_inline void
micscif_set_nr_pages(struct micscif_dev *dev, struct reg_range_t *window)
{
	int j;
#ifdef CONFIG_ML1OM
	int l = 0, k;
#endif

	for (j = 0; j < window->nr_contig_chunks; j++) {
		window->num_pages[j] = RMA_GET_NR_PAGES(window->dma_addr[j]);
		if (window->num_pages[j])
			window->dma_addr[j] = RMA_GET_ADDR(window->dma_addr[j]);
		else
			break;
#ifdef CONFIG_ML1OM
		for (k = 0; k < window->num_pages[j]; k++)
			if (window->temp_phys_addr[j])
				window->phys_addr[l + k] =
					RMA_GET_ADDR(window->temp_phys_addr[j]) + (k << PAGE_SHIFT);
		l += window->num_pages[j];
#endif
	}
}

#ifdef CONFIG_ML1OM
/*
 * micscif_get_phys_addr:
 * Obtain the phys_addr given the window and the offset.
 * @window: Registered window.
 * @off: Window offset.
 */
static __always_inline dma_addr_t
micscif_get_phys_addr(struct reg_range_t *window, uint64_t off)
{
	int page_nr = (off - window->offset) >> PAGE_SHIFT;
	off_t page_off = off & ~PAGE_MASK;
	return window->phys_addr[page_nr] | page_off;
}
#endif

#define RMA_ERROR_CODE	(~(dma_addr_t)0x0)

/*
 * micscif_get_dma_addr:
 * Obtain the dma_addr given the window and the offset.
 * @window: Registered window.
 * @off: Window offset.
 * @nr_bytes: Return the number of contiguous bytes till the next DMA addr index.
 * @index: Return the index of the dma_addr array entry found.
 * @start_off: Return the start offset of the dma_addr array entry found.
 * The nr_bytes return value gives the caller an estimate of the maximum
 * possible DMA transfer, while index/start_off provide faster lookups
 * for the next iteration.
 */
static __always_inline dma_addr_t
micscif_get_dma_addr(struct reg_range_t *window, uint64_t off, size_t *nr_bytes, int *index, uint64_t *start_off)
{
	if (window->nr_pages == window->nr_contig_chunks) {
		int page_nr = (int)((off - window->offset) >> PAGE_SHIFT);
		off_t page_off = off & ~PAGE_MASK;
		if (nr_bytes)
			*nr_bytes = PAGE_SIZE - page_off;
		if (page_nr >= window->nr_pages) {
			printk(KERN_ERR "%s dma_addr out of boundary\n", __func__);
			return RMA_ERROR_CODE;
		}
		return window->dma_addr[page_nr] | page_off;
	} else {
		int i = index ? *index : 0;
		uint64_t end;
		uint64_t start = start_off ? *start_off : window->offset;
		for (; i < window->nr_contig_chunks; i++) {
			end = start + (window->num_pages[i] << PAGE_SHIFT);
			if (off >= start && off < end) {
				if (index)
					*index = i;
				if (start_off)
					*start_off = start;
				if (nr_bytes)
					*nr_bytes = end - off;
				return (window->dma_addr[i] + (off - start));
			}
			start += (window->num_pages[i] << PAGE_SHIFT);
		}
	}
#ifdef CONFIG_MK1OM
	printk(KERN_ERR "%s %d BUG. Addr not found? window %p off 0x%llx\n", __func__, __LINE__, window, off);
	BUG_ON(1);
#endif
	return RMA_ERROR_CODE;
}
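
/*
 * Usage sketch for micscif_get_dma_addr() (illustrative, not lifted from
 * the driver): the index/start_off hints let successive lookups resume the
 * chunk scan instead of restarting it from the first chunk.
 *
 *	size_t nr_bytes, chunk, remaining = total_len;
 *	int idx = 0;
 *	uint64_t chunk_start = window->offset;
 *	uint64_t off = window->offset;
 *	while (remaining) {
 *		dma_addr_t dma = micscif_get_dma_addr(window, off, &nr_bytes,
 *						      &idx, &chunk_start);
 *		if (dma == RMA_ERROR_CODE)
 *			break;
 *		chunk = min(remaining, nr_bytes);
 *		... program a DMA descriptor of length chunk at dma ...
 *		off += chunk;
 *		remaining -= chunk;
 *	}
 */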

/*
 * scif_memset:
 * @va: kernel virtual address
 * @c: The byte used to fill the memory
 * @size: Buffer size
 *
 * Helper API which fills size bytes of memory pointed to by va with the
 * constant byte c. It fills the memory in chunks of 4GB - 1 bytes per
 * invocation of memset(..) to work around an x86_64 kernel bug
 * (https://bugzilla.kernel.org/show_bug.cgi?id=27732) where memset(..)
 * does not do any work for sizes >= 4GB.
 * This kernel bug was fixed upstream in v3.2 via the commit titled
 * "x86-64: Fix memset() to support sizes of 4Gb and above" but has not
 * been backported to distributions like RHEL 6.3 yet.
 */
static __always_inline void scif_memset(char *va, int c, size_t size)
{
	size_t loop_size;
	const size_t four_gb = 4 * 1024 * 1024 * 1024ULL;

	while (size) {
		loop_size = min(size, four_gb - 1);
		memset(va, c, loop_size);
		size -= loop_size;
		va += loop_size;
	}
}
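
/*
 * Example of the chunking above: a 6GB fill is performed as one memset() of
 * (4GB - 1) bytes followed by one of (2GB + 1) bytes, keeping every call
 * strictly below the 4GB limit of the affected kernels.
 */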

/*
 * scif_zalloc:
 * @size: Size of the allocation request.
 *
 * Helper API which tries to allocate zeroed pages via
 * __get_free_pages(..) first, because vmalloc(..) is *slow*,
 * and falls back on vmalloc(..) if that fails.
 */
static __always_inline void *scif_zalloc(size_t size)
{
	void *ret;
	size_t align = ALIGN(size, PAGE_SIZE);

	if (!align)
		return NULL;

	if (align <= (1 << (MAX_ORDER + PAGE_SHIFT - 1)))
		if ((ret = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
				get_order(align))))
			goto done;
	if (!(ret = vmalloc(align)))
		return NULL;

	/* TODO: Use vzalloc once kernel supports it */
	scif_memset(ret, 0, size);
done:
#ifdef RMA_DEBUG
	atomic_long_add_return(align, &ms_info.rma_alloc_cnt);
#endif
	return ret;
}
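
/*
 * Usage sketch (illustrative): buffers from scif_zalloc() must be released
 * with scif_free() below, passing the same size, since the free path
 * re-derives the page order from the size for the __get_free_pages() case.
 *
 *	void *buf = scif_zalloc(nr_pages << PAGE_SHIFT);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	scif_free(buf, nr_pages << PAGE_SHIFT);
 */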

/*
 * scif_free:
 * @addr: Address to be freed.
 * @size: Size of the allocation.
 * Helper API which frees memory allocated via scif_zalloc().
 */
static __always_inline void scif_free(void *addr, size_t size)
{
	size_t align = ALIGN(size, PAGE_SIZE);

	if (unlikely(is_vmalloc_addr(addr)))
		vfree(addr);
	else
		free_pages((unsigned long)addr, get_order(align));
#ifdef RMA_DEBUG
	WARN_ON(atomic_long_sub_return(align, &ms_info.rma_alloc_cnt) < 0);
#endif
}

static __always_inline void
get_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
{
	window->ref_count += (int)nr_pages;
}

static __always_inline void
put_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
{
	window->ref_count -= (int)nr_pages;
	BUG_ON(window->ref_count < 0);
}

static __always_inline void
set_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
{
	window->ref_count = (int)nr_pages;
}

/* Debug APIs */
void micscif_display_window(struct reg_range_t *window, const char *s, int line);
static inline struct mm_struct *__scif_acquire_mm(void)
{
	if (mic_ulimit_check) {
#ifdef RMA_DEBUG
		atomic_long_add_return(1, &ms_info.rma_mm_cnt);
#endif
		return get_task_mm(current);
	}
	return NULL;
}

static inline void __scif_release_mm(struct mm_struct *mm)
{
	if (mic_ulimit_check && mm) {
#ifdef RMA_DEBUG
		WARN_ON(atomic_long_sub_return(1, &ms_info.rma_mm_cnt) < 0);
#endif
		mmput(mm);
	}
}

static inline int __scif_dec_pinned_vm_lock(struct mm_struct *mm,
		int64_t nr_pages, bool try_lock)
{
	if (mm && nr_pages && mic_ulimit_check) {
		if (try_lock) {
			if (!down_write_trylock(&mm->mmap_sem)) {
				return -1;
			}
		} else {
			down_write(&mm->mmap_sem);
		}
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
		mm->pinned_vm -= nr_pages;
#else
		mm->locked_vm -= nr_pages;
#endif
		up_write(&mm->mmap_sem);
	}
	return 0;
}

static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm,
		int64_t nr_pages)
{
	if (mm && mic_ulimit_check && nr_pages) {
		unsigned long locked, lock_limit;
		locked = nr_pages;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
		locked += mm->pinned_vm;
#else
		locked += mm->locked_vm;
#endif
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
			pr_debug("locked(%lu) > lock_limit(%lu)\n",
				locked, lock_limit);
			return -ENOMEM;
		} else {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
			mm->pinned_vm = locked;
#else
			mm->locked_vm = locked;
#endif
		}
	}
	return 0;
}
#endif /* MICSCIF_RMA_H */