From 800f879a77716ad833d229ccc058e700c698b039 Mon Sep 17 00:00:00 2001 From: Aaron Taylor Date: Sun, 25 Apr 2021 18:24:22 -0700 Subject: [PATCH] Initial commit of files contained in `mpss-modules-3.8.6.tar.bz2` for Intel Xeon Phi. --- .mpss-metadata | 2 + COPYING | 339 +++ Kbuild | 106 + Makefile | 106 + dma/Kbuild | 5 + dma/mic_dma_lib.c | 1792 ++++++++++++++ dma/mic_dma_md.c | 522 +++++ dma/mic_sbox_md.c | 57 + host/Makefile | 47 + host/acptboot.c | 194 ++ host/ioctl.c | 186 ++ host/linpm.c | 232 ++ host/linpsmi.c | 152 ++ host/linscif_host.c | 315 +++ host/linsysfs.c | 766 ++++++ host/linux.c | 796 +++++++ host/linvcons.c | 687 ++++++ host/linvnet.c | 802 +++++++ host/micpsmi.c | 184 ++ host/micscif_pm.c | 1062 +++++++++ host/pm_ioctl.c | 603 +++++ host/pm_pcstate.c | 1107 +++++++++ host/tools_support.c | 978 ++++++++ host/uos_download.c | 1950 ++++++++++++++++ host/vhost/mic_blk.c | 665 ++++++ host/vhost/mic_vhost.c | 697 ++++++ host/vhost/vhost.h | 261 +++ host/vmcore.c | 821 +++++++ include/mic/bootparams.h | 170 ++ include/mic/compl_buf_ring.h | 220 ++ include/mic/io_interface.h | 217 ++ include/mic/mic_dma_api.h | 170 ++ include/mic/mic_dma_lib.h | 207 ++ include/mic/mic_dma_md.h | 462 ++++ include/mic/mic_macaddr.h | 104 + include/mic/mic_pm.h | 442 ++++ include/mic/mic_sbox_md.h | 90 + include/mic/mic_virtio.h | 70 + include/mic/micbaseaddressdefine.h | 111 + include/mic/micdboxdefine.h | 48 + include/mic/micpsmi.h | 62 + include/mic/micsboxdefine.h | 255 ++ include/mic/micscif.h | 900 ++++++++ include/mic/micscif_intr.h | 52 + include/mic/micscif_kmem_cache.h | 62 + include/mic/micscif_map.h | 276 +++ include/mic/micscif_nm.h | 234 ++ include/mic/micscif_nodeqp.h | 200 ++ include/mic/micscif_rb.h | 170 ++ include/mic/micscif_rma.h | 960 ++++++++ include/mic/micscif_rma_list.h | 151 ++ include/mic/micscif_smpt.h | 120 + include/mic/micscif_va_gen.h | 86 + include/mic/micscif_va_node.h | 115 + include/mic/micvcons.h | 164 ++ include/mic/micveth.h | 145 ++ include/mic/micveth_common.h | 69 + include/mic/micveth_dma.h | 279 +++ include/mic/ringbuffer.h | 195 ++ include/mic_common.h | 769 ++++++ include/mic_interrupts.h | 118 + include/micint.h | 114 + include/scif.h | 1743 ++++++++++++++ include/scif_ioctl.h | 225 ++ mic.conf | 32 + mic.modules | 5 + micscif/Kbuild | 21 + micscif/micscif_api.c | 3464 ++++++++++++++++++++++++++++ micscif/micscif_debug.c | 1005 ++++++++ micscif/micscif_fd.c | 528 +++++ micscif/micscif_intr.c | 159 ++ micscif/micscif_main.c | 606 +++++ micscif/micscif_nm.c | 1740 ++++++++++++++ micscif/micscif_nodeqp.c | 2902 +++++++++++++++++++++++ micscif/micscif_ports.c | 376 +++ micscif/micscif_rb.c | 372 +++ micscif/micscif_rma.c | 2633 +++++++++++++++++++++ micscif/micscif_rma_dma.c | 982 ++++++++ micscif/micscif_rma_list.c | 533 +++++ micscif/micscif_select.c | 446 ++++ micscif/micscif_smpt.c | 457 ++++ micscif/micscif_sysfs.c | 234 ++ micscif/micscif_va_gen.c | 480 ++++ micscif/micscif_va_node.c | 187 ++ mpssboot/Kbuild | 1 + mpssboot/mpssboot.c | 238 ++ pm_scif/Kbuild | 1 + pm_scif/pm_scif.c | 439 ++++ pm_scif/pm_scif.h | 48 + ramoops/Kbuild | 1 + ramoops/ramoops.c | 163 ++ ras/Kbuild | 6 + ras/Makefile | 210 ++ ras/micmca_api.h | 135 ++ ras/micpm_api.h | 307 +++ ras/micras.h | 536 +++++ ras/micras_api.h | 1006 ++++++++ ras/micras_common.c | 968 ++++++++ ras/micras_core.c | 973 ++++++++ ras/micras_elog.c | 3136 +++++++++++++++++++++++++ ras/micras_knc.c | 2794 ++++++++++++++++++++++ ras/micras_knf.c | 1432 ++++++++++++ ras/micras_main.c | 2650 
+++++++++++++++++++++ ras/micras_pm.c | 1050 +++++++++ ras/micras_uncore.c | 1194 ++++++++++ ras/monahan.h | 201 ++ trace_capture/Kbuild | 1 + trace_capture/Makefile | 34 + trace_capture/docapture.c | 70 + trace_capture/tc_host.c | 366 +++ trace_capture/tc_memcvt.c | 85 + trace_capture/trace_capture.c | 2031 ++++++++++++++++ trace_capture/trace_capture.h | 245 ++ udev-mic.rules | 9 + vcons/Kbuild | 3 + vcons/hvc_console.h | 119 + vcons/hvc_mic.c | 341 +++ virtio/Kbuild | 2 + virtio/mic_virtblk.c | 862 +++++++ vnet/Kbuild | 3 + vnet/mic.h | 108 + vnet/micveth.c | 869 +++++++ vnet/micveth_dma.c | 1642 +++++++++++++ vnet/micveth_param.c | 95 + 124 files changed, 66745 insertions(+) create mode 100644 .mpss-metadata create mode 100644 COPYING create mode 100644 Kbuild create mode 100644 Makefile create mode 100644 dma/Kbuild create mode 100644 dma/mic_dma_lib.c create mode 100644 dma/mic_dma_md.c create mode 100644 dma/mic_sbox_md.c create mode 100644 host/Makefile create mode 100644 host/acptboot.c create mode 100644 host/ioctl.c create mode 100644 host/linpm.c create mode 100644 host/linpsmi.c create mode 100644 host/linscif_host.c create mode 100644 host/linsysfs.c create mode 100644 host/linux.c create mode 100644 host/linvcons.c create mode 100644 host/linvnet.c create mode 100644 host/micpsmi.c create mode 100644 host/micscif_pm.c create mode 100644 host/pm_ioctl.c create mode 100644 host/pm_pcstate.c create mode 100644 host/tools_support.c create mode 100644 host/uos_download.c create mode 100644 host/vhost/mic_blk.c create mode 100644 host/vhost/mic_vhost.c create mode 100644 host/vhost/vhost.h create mode 100644 host/vmcore.c create mode 100644 include/mic/bootparams.h create mode 100644 include/mic/compl_buf_ring.h create mode 100644 include/mic/io_interface.h create mode 100644 include/mic/mic_dma_api.h create mode 100644 include/mic/mic_dma_lib.h create mode 100644 include/mic/mic_dma_md.h create mode 100644 include/mic/mic_macaddr.h create mode 100644 include/mic/mic_pm.h create mode 100644 include/mic/mic_sbox_md.h create mode 100644 include/mic/mic_virtio.h create mode 100644 include/mic/micbaseaddressdefine.h create mode 100644 include/mic/micdboxdefine.h create mode 100644 include/mic/micpsmi.h create mode 100644 include/mic/micsboxdefine.h create mode 100644 include/mic/micscif.h create mode 100644 include/mic/micscif_intr.h create mode 100644 include/mic/micscif_kmem_cache.h create mode 100644 include/mic/micscif_map.h create mode 100644 include/mic/micscif_nm.h create mode 100644 include/mic/micscif_nodeqp.h create mode 100644 include/mic/micscif_rb.h create mode 100644 include/mic/micscif_rma.h create mode 100644 include/mic/micscif_rma_list.h create mode 100644 include/mic/micscif_smpt.h create mode 100644 include/mic/micscif_va_gen.h create mode 100644 include/mic/micscif_va_node.h create mode 100644 include/mic/micvcons.h create mode 100644 include/mic/micveth.h create mode 100644 include/mic/micveth_common.h create mode 100644 include/mic/micveth_dma.h create mode 100644 include/mic/ringbuffer.h create mode 100644 include/mic_common.h create mode 100644 include/mic_interrupts.h create mode 100644 include/micint.h create mode 100644 include/scif.h create mode 100644 include/scif_ioctl.h create mode 100644 mic.conf create mode 100755 mic.modules create mode 100644 micscif/Kbuild create mode 100644 micscif/micscif_api.c create mode 100644 micscif/micscif_debug.c create mode 100644 micscif/micscif_fd.c create mode 100644 micscif/micscif_intr.c create mode 100644 
micscif/micscif_main.c create mode 100644 micscif/micscif_nm.c create mode 100644 micscif/micscif_nodeqp.c create mode 100644 micscif/micscif_ports.c create mode 100644 micscif/micscif_rb.c create mode 100644 micscif/micscif_rma.c create mode 100644 micscif/micscif_rma_dma.c create mode 100644 micscif/micscif_rma_list.c create mode 100644 micscif/micscif_select.c create mode 100644 micscif/micscif_smpt.c create mode 100644 micscif/micscif_sysfs.c create mode 100644 micscif/micscif_va_gen.c create mode 100644 micscif/micscif_va_node.c create mode 100644 mpssboot/Kbuild create mode 100644 mpssboot/mpssboot.c create mode 100644 pm_scif/Kbuild create mode 100644 pm_scif/pm_scif.c create mode 100644 pm_scif/pm_scif.h create mode 100644 ramoops/Kbuild create mode 100644 ramoops/ramoops.c create mode 100644 ras/Kbuild create mode 100644 ras/Makefile create mode 100644 ras/micmca_api.h create mode 100644 ras/micpm_api.h create mode 100644 ras/micras.h create mode 100644 ras/micras_api.h create mode 100644 ras/micras_common.c create mode 100644 ras/micras_core.c create mode 100644 ras/micras_elog.c create mode 100644 ras/micras_knc.c create mode 100644 ras/micras_knf.c create mode 100644 ras/micras_main.c create mode 100644 ras/micras_pm.c create mode 100644 ras/micras_uncore.c create mode 100644 ras/monahan.h create mode 100644 trace_capture/Kbuild create mode 100644 trace_capture/Makefile create mode 100644 trace_capture/docapture.c create mode 100644 trace_capture/tc_host.c create mode 100644 trace_capture/tc_memcvt.c create mode 100644 trace_capture/trace_capture.c create mode 100644 trace_capture/trace_capture.h create mode 100644 udev-mic.rules create mode 100644 vcons/Kbuild create mode 100644 vcons/hvc_console.h create mode 100644 vcons/hvc_mic.c create mode 100644 virtio/Kbuild create mode 100644 virtio/mic_virtblk.c create mode 100644 vnet/Kbuild create mode 100644 vnet/mic.h create mode 100644 vnet/micveth.c create mode 100644 vnet/micveth_dma.c create mode 100644 vnet/micveth_param.c diff --git a/.mpss-metadata b/.mpss-metadata new file mode 100644 index 0000000..66c84b0 --- /dev/null +++ b/.mpss-metadata @@ -0,0 +1,2 @@ +3.8.6-1 +e8ef53c4fa26582ac37b5e0101b7451a70263f6c diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/Kbuild b/Kbuild new file mode 100644 index 0000000..f56e01c --- /dev/null +++ b/Kbuild @@ -0,0 +1,106 @@ +not-y := n +not-n := y +m-not-y := n +m-not-n := m + +ifeq ($(CONFIG_X86_MICPCI),) +CONFIG_X86_MICPCI := n +endif +ifeq ($(CONFIG_X86_MICPCI)$(MIC_CARD_ARCH),n) +$(error building for host, but $$(MIC_CARD_ARCH) is unset) +endif +ifneq ($(MIC_CARD_ARCH),$(firstword $(filter l1om k1om,$(MIC_CARD_ARCH)))) +$(error $$(MIC_CARD_ARCH) must be l1om or k1om) +endif + +# Force optimization to -O2 in case the kernel was configured to use +# -Os. The main reason is pretty dumb -- -Os has a warning -O2 doesn't, +# and we compile with -Werror internally. Another reason is that -O2 is +# what we're used to in terms of validation and performance analysis. We +# should probably get rid of this, though. +subdir-ccflags-y += -O2 + +# Makes it easy to inject "-Werror" from the environment +subdir-ccflags-y += $(KERNWARNFLAGS) + +# Bake some information about who built the module(s), and what version +# of the source code they started with. Possibly useful during debug. +subdir-ccflags-y += -DBUILD_NUMBER=\"'$(MPSS_BUILDNO)'\" +subdir-ccflags-y += -DBUILD_BYWHOM=\"'$(MPSS_BUILTBY)'\" +subdir-ccflags-y += -DBUILD_ONDATE=\"'$(MPSS_BUILTON)'\" +subdir-ccflags-y += -DBUILD_SCMVER=\"'$(MPSS_COMMIT)'\" +subdir-ccflags-y += -DBUILD_VERSION=\"'$(or $(MPSS_VERSION),0.0) ($(MPSS_BUILTBY))'\" + +# Code common with the host mustn't use CONFIG_M[LK]1OM directly. +# But of course it does anyway. Arrgh. +subdir-ccflags-$(CONFIG_ML1OM) += -DMIC_IS_L1OM +subdir-ccflags-$(CONFIG_MK1OM) += -DMIC_IS_K1OM +ifeq ($(MIC_CARD_ARCH),l1om) +subdir-ccflags-y += -DMIC_IS_L1OM -DCONFIG_ML1OM +endif +ifeq ($(MIC_CARD_ARCH),k1om) +subdir-ccflags-y += -DMIC_IS_K1OM -DCONFIG_MK1OM +endif + +# a shorthand for "runs on the card"? +subdir-ccflags-$(CONFIG_X86_MICPCI) += -D_MIC_SCIF_ + +# "runs on the host" +subdir-ccflags-$(not-$(CONFIG_X86_MICPCI)) += -DHOST -DUSE_VCONSOLE + +# always set? what's this thing's purpose? 
+subdir-ccflags-y += -D__LINUX_GPL__ -D_MODULE_SCIF_ + +subdir-ccflags-y += -I$(M)/include + +obj-$(CONFIG_X86_MICPCI) += dma/ micscif/ pm_scif/ ras/ +obj-$(CONFIG_X86_MICPCI) += vcons/ vnet/ mpssboot/ ramoops/ virtio/ + +obj-$(m-not-$(CONFIG_X86_MICPCI)) += mic.o + +mic-objs := +mic-objs += dma/mic_dma_lib.o +mic-objs += dma/mic_dma_md.o +mic-objs += host/acptboot.o +mic-objs += host/ioctl.o +mic-objs += host/linpm.o +mic-objs += host/linpsmi.o +mic-objs += host/linscif_host.o +mic-objs += host/linsysfs.o +mic-objs += host/linux.o +mic-objs += host/linvcons.o +mic-objs += host/linvnet.o +mic-objs += host/micpsmi.o +mic-objs += host/micscif_pm.o +mic-objs += host/pm_ioctl.o +mic-objs += host/pm_pcstate.o +mic-objs += host/tools_support.o +mic-objs += host/uos_download.o +mic-objs += host/vhost/mic_vhost.o +mic-objs += host/vhost/mic_blk.o +mic-objs += host/vmcore.o +mic-objs += micscif/micscif_api.o +mic-objs += micscif/micscif_debug.o +mic-objs += micscif/micscif_fd.o +mic-objs += micscif/micscif_intr.o +mic-objs += micscif/micscif_nm.o +mic-objs += micscif/micscif_nodeqp.o +mic-objs += micscif/micscif_ports.o +mic-objs += micscif/micscif_rb.o +mic-objs += micscif/micscif_rma_dma.o +mic-objs += micscif/micscif_rma_list.o +mic-objs += micscif/micscif_rma.o +mic-objs += micscif/micscif_select.o +mic-objs += micscif/micscif_smpt.o +mic-objs += micscif/micscif_sysfs.o +mic-objs += micscif/micscif_va_gen.o +mic-objs += micscif/micscif_va_node.o +mic-objs += vnet/micveth_dma.o +mic-objs += vnet/micveth_param.o + +version-le = $(shell printf '%s\n' $(1) | sort -t. -k 1,1n -k 2,2n -k 3,3n -k 4,4n -c >/dev/null 2>&1 && echo t) +ifeq ($(call version-le, 2.6.23 $(KERNELRELEASE)),t) +ccflags-y += $(mic-cflags) +else +$(error building against kernels <= 2.6.23 is broken) +endif diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..fa50e3d --- /dev/null +++ b/Makefile @@ -0,0 +1,106 @@ +# Copyright 2010-2017 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License, version 2, +# as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Disclaimer: The codes contained in these modules may be specific to +# the Intel Software Development Platform codenamed Knights Ferry, +# and the Intel product codenamed Knights Corner, and are not backward +# compatible with other Intel products. Additionally, Intel will NOT +# support the codes or instruction set in future products. +# +# Intel offers no warranty of any kind regarding the code. This code is +# licensed on an "AS IS" basis and Intel is not obligated to provide +# any support, assistance, installation, training, or other services +# of any kind. Intel is also not obligated to provide any updates, +# enhancements or extensions. Intel specifically disclaims any warranty +# of merchantability, non-infringement, fitness for any particular +# purpose, and any other warranty. +# +# Further, Intel disclaims all liability of any kind, including but +# not limited to liability for infringement of any proprietary rights, +# relating to the use of the code, even if Intel is notified of the +# possibility of such liability. 
Except as expressly stated in an Intel +# license agreement provided with this code and agreed upon with Intel, +# no license, express or implied, by estoppel or otherwise, to any +# intellectual property rights is granted herein. + +MPSS_COMMIT ?= $(or $(shell sed -ne '2 p' .mpss-metadata 2>/dev/null), \ + $(error .mpss-metadata file is missing or incorrect)) +MPSS_VERSION ?= $(or $(shell sed -ne '1 p' .mpss-metadata 2>/dev/null), \ + $(error .mpss-metadata file is missing or incorrect)) +MPSS_BUILDNO ?= 0 +export MPSS_COMMIT := $(MPSS_COMMIT) +export MPSS_VERSION := $(MPSS_VERSION) +export MPSS_BUILDNO := $(MPSS_BUILDNO) +export MPSS_BUILTBY := $(shell echo "`whoami`@`uname -n`") +export MPSS_BUILTON := $(shell date +'%F %T %z') + +KERNEL_VERSION := $(shell uname -r) +KERNEL_SRC = /lib/modules/$(KERNEL_VERSION)/build + +INSTALL = install +INSTALL_d = $(INSTALL) -d +INSTALL_x = $(INSTALL) +INSTALL_f = $(INSTALL) -m644 + +prefix = /usr/local +sysconfdir = $(prefix)/etc +includedir = $(prefix)/include + +kmodinstalldir = /lib/modules/$(KERNEL_VERSION) +kmodincludedir = $(realpath $(KERNEL_SRC))/include/modules + +# If building the host's driver for a MIC co-processor card, which card +# $(ARCH) it should support +export MIC_CARD_ARCH + +.PHONY: all install modules +.PHONY: modules_install conf_install dev_install kdev_install + +all: modules + +install: modules_install conf_install kdev_install + +modules modules_install: %: + $(MAKE) -C $(KERNEL_SRC) M=$(CURDIR) $* \ + INSTALL_MOD_PATH=$(DESTDIR) + +conf_install: +ifneq ($(MIC_CARD_ARCH),) + $(INSTALL_d) $(DESTDIR)$(sysconfdir)/sysconfig/modules + $(INSTALL_x) mic.modules $(DESTDIR)$(sysconfdir)/sysconfig/modules + $(INSTALL_d) $(DESTDIR)$(sysconfdir)/modprobe.d + $(INSTALL_f) mic.conf $(DESTDIR)$(sysconfdir)/modprobe.d +endif + $(INSTALL_d) $(DESTDIR)$(sysconfdir)/udev/rules.d + $(INSTALL_f) udev-mic.rules $(DESTDIR)$(sysconfdir)/udev/rules.d/50-udev-mic.rules + +dev_install: + $(INSTALL_d) $(DESTDIR)$(includedir)/mic + $(INSTALL_f) include/scif_ioctl.h $(DESTDIR)$(includedir) + $(INSTALL_f) include/mic/io_interface.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) include/mic/mic_pm.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) ras/micras_api.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) ras/micmca_api.h $(DESTDIR)$(includedir)/mic +ifeq ($(MIC_CARD_ARCH),) # Card side + $(INSTALL_f) ras/micpm_api.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) ras/micras.h $(DESTDIR)$(includedir)/mic +else # Host side + $(INSTALL_f) include/mic/micbaseaddressdefine.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) include/mic/micsboxdefine.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) include/mic/micdboxdefine.h $(DESTDIR)$(includedir)/mic + $(INSTALL_f) ras/micpm_api.h $(DESTDIR)$(includedir)/mic +endif + +kdev_install: + $(INSTALL_d) $(DESTDIR)$(kmodinstalldir) + $(INSTALL_f) Module.symvers $(DESTDIR)$(kmodinstalldir)/scif.symvers + $(INSTALL_d) $(DESTDIR)$(kmodincludedir) + $(INSTALL_f) include/scif.h $(DESTDIR)$(kmodincludedir) diff --git a/dma/Kbuild b/dma/Kbuild new file mode 100644 index 0000000..4db196d --- /dev/null +++ b/dma/Kbuild @@ -0,0 +1,5 @@ +ccflags-y += -DDMA_CHAN_MIC_OWNER=0 + +obj-m := dma_module.o + +dma_module-objs := mic_dma_lib.o mic_dma_md.o mic_sbox_md.o diff --git a/dma/mic_dma_lib.c b/dma/mic_dma_lib.c new file mode 100644 index 0000000..80a4a0b --- /dev/null +++ b/dma/mic_dma_lib.c @@ -0,0 +1,1792 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _MIC_SCIF_ +#include +#ifdef CONFIG_PAGE_CACHE_DMA +#include +#endif +#endif + +#ifndef _MIC_SCIF_ +#include +#include "mic_common.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +#ifdef MIC_IS_EMULATION +#define DMA_TO (INT_MAX) +#define DMA_FENCE_TIMEOUT_CNT (INT_MAX) +#else +#define DMA_TO (5 * HZ) +#define DMA_SLOWEST_BW (300) // 300Mbps +// the maximum size for each decriptor entry is 2M +#define DMA_FENCE_TIMEOUT_CNT (2 * MIC_MAX_NUM_DESC_PER_RING /DMA_SLOWEST_BW/ (DMA_TO/HZ)) +#endif + +#ifdef _MIC_SCIF_ +#define MAX_DMA_XFER_SIZE MIC_MAX_DMA_XFER_SIZE +#else +/* Use 512K as the maximum descriptor transfer size for Host */ +#define MAX_DMA_XFER_SIZE (((1U) * 1024 * 1024) >> 1) +#endif +#ifndef KASSERT +#define KASSERT(x, y, ...) \ + do { \ + if(!x) \ + printk(y, ##__VA_ARGS__);\ + BUG_ON(!x); \ + } while(0) +#endif +/* + * Arrary of per device DMA contexts. The card only uses index 0. The host uses one + * context per card starting from 0. + */ +static struct mic_dma_ctx_t *mic_dma_context[MAX_BOARD_SUPPORTED + 1]; +static struct mutex lock_dma_dev_init[MAX_BOARD_SUPPORTED + 1]; + +enum mic_desc_format_type { + NOP, + MEMCOPY, + STATUS, + GENERAL, + KEYNONCECNT, + KEY +}; +char proc_dma_reg[]="mic_dma_registers_"; +char proc_dma_ring[]="mic_dma_ring_"; + +#define PR_PREFIX "DMA_LIB_MI:" +#define DMA_DESC_RING_SIZE MIC_MAX_NUM_DESC_PER_RING +#define MAX_POLLING_BUFFERS DMA_DESC_RING_SIZE + +#define DMA_PROC +static void mic_dma_proc_init(struct mic_dma_ctx_t *dma_ctx); +static void mic_dma_proc_uninit(struct mic_dma_ctx_t *dma_ctx); + +/* + * TODO: This is size of s/w interrupt ring. 
+ * We need to figure out a value so that we don't run out of memory in + * interrupt ring and at the same time don't waste memory + */ +#define NUM_COMP_BUFS (((PAGE_SIZE/sizeof(struct dma_completion_cb*)) - 10) * 10) + +struct intr_compl_buf_ring { + struct dma_completion_cb **comp_cb_array; + struct compl_buf_ring ring; + int old_tail; +}; + +struct mic_dma_ctx_t; /* Forward Declaration */ + +struct dma_channel { + int ch_num;/*Duplicated in md_mic_dma_chan struct too*/ + struct md_mic_dma_chan *chan; + atomic_t flags; + wait_queue_head_t intr_wq; + wait_queue_head_t access_wq; + union md_mic_dma_desc *desc_ring_bak; + union md_mic_dma_desc *desc_ring; + phys_addr_t desc_ring_phys; + uint64_t next_write_index; /* next write index into desc ring */ + struct intr_compl_buf_ring intr_ring; + struct compl_buf_ring poll_ring; + struct mic_dma_ctx_t *dma_ctx; /* Pointer to parent DMA context */ +}; + +/* Per MIC device (per MIC board) DMA context */ +struct mic_dma_ctx_t { + struct dma_channel dma_channels[MAX_NUM_DMA_CHAN]; + int last_allocated_dma_channel_num; + struct mic_dma_device dma_dev; + int device_num; + atomic_t ref_count; /* Reference count */ + atomic_t ch_num; +}; + +/* DMA Library Init/Uninit Routines */ +static int mic_dma_lib_init(uint8_t *mmio_va_base, struct mic_dma_ctx_t *dma_ctx); +static void mic_dma_lib_uninit(struct mic_dma_ctx_t *dma_ctx); + +int get_chan_num(struct dma_channel *chan) +{ + return chan->ch_num; +} +EXPORT_SYMBOL(get_chan_num); + +void initdmaglobalvar(void) +{ + memset(mic_dma_context, 0, sizeof(struct mic_dma_ctx_t *) * (MAX_BOARD_SUPPORTED + 1)); +} + +static void +ack_dma_interrupt(struct dma_channel *ch) +{ + md_mic_dma_chan_mask_intr(&ch->dma_ctx->dma_dev, ch->chan); + md_mic_dma_chan_unmask_intr(&ch->dma_ctx->dma_dev, ch->chan); +} + +/* Returns true if the next write index is "within" bounds */ +static inline bool verify_next_write_index(struct dma_channel *ch) +{ + bool ret = false; + + if (ch->next_write_index < DMA_DESC_RING_SIZE) + ret = true; + else + printk(KERN_ERR "%s %d OOB ch_num 0x%x next_write_index 0x%llx\n", + __func__, __LINE__, + ch->ch_num, ch->next_write_index); + return ret; +} + +/* TODO: + * See if we can use __get_free_pages or something similar + * get_free_pages expects a power of 2 number of pages + */ +static void +alloc_dma_desc_ring_mem(struct dma_channel *ch, struct mic_dma_ctx_t *dma_ctx) +{ +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + /* Is there any kernel allocator which provides the + * option to give the alignment?? 
+ */ + ch->desc_ring = kzalloc( + (DMA_DESC_RING_SIZE * sizeof(*ch->desc_ring)) + PAGE_SIZE, GFP_KERNEL); + ch->desc_ring_bak = ch->desc_ring; + ch->desc_ring = (union md_mic_dma_desc *)ALIGN( + (uint64_t)ch->desc_ring, PAGE_SIZE); +#ifdef _MIC_SCIF_ + ch->desc_ring_phys = virt_to_phys(ch->desc_ring); +#else + micscif_pci_dev(dma_ctx->device_num, &pdev); + ch->desc_ring_phys = mic_map_single(dma_ctx->device_num - 1, pdev, (void *)ch->desc_ring, + (DMA_DESC_RING_SIZE * sizeof(*ch->desc_ring)) + PAGE_SIZE); + BUG_ON(pci_dma_mapping_error(pdev, ch->desc_ring_phys)); +#endif +} + +/* + * Call completion cb functions: + * Take care of case where we allocated temp buf + */ +static void +mic_dma_lib_interrupt_handler(struct dma_channel *chan) +{ + int i = 0; + int ring_size = chan->intr_ring.ring.size; + struct dma_completion_cb **temp = chan->intr_ring.comp_cb_array; + struct dma_completion_cb *cb; + int new_tail, old_tail; + + if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(chan->dma_ctx->device_num) >= KNC_B0_STEP) { + unsigned long error = *((uint32_t*)chan->chan->dstat_wb_loc); + if (unlikely(test_bit(31, &error))) + printk(KERN_ERR "DMA h/w error - %s %d, dstatwb=%lx\n", + __func__, __LINE__, error); + } + new_tail = read_tail(&chan->intr_ring.ring); + old_tail = chan->intr_ring.old_tail; + + for (; i < ring_size && old_tail != new_tail; + old_tail = incr_rb_index(old_tail, ring_size), i++) { + cb = (struct dma_completion_cb *)xchg(&temp[old_tail], NULL); + if (cb) { + cb->dma_completion_func(cb->cb_cookie); + } + } + chan->intr_ring.old_tail = new_tail; + update_tail(&chan->intr_ring.ring, new_tail); + wake_up(&chan->intr_wq); + if (i == ring_size && old_tail != new_tail) { + printk(KERN_ERR PR_PREFIX "Something went wrong, old tail = %d, new tail = %d\n", + old_tail, new_tail); + } +} + +#ifdef _MIC_SCIF_ +/* + * TODO; + * Maybe move the logic into slow interrupt handler + */ +static irqreturn_t +dma_interrupt_handler(int irq, void *dev_id) +{ + struct dma_channel *chan = ((struct dma_channel*)dev_id); + + ack_dma_interrupt(chan); + mic_dma_lib_interrupt_handler(chan); + + return IRQ_HANDLED; +} +#else + +#define SBOX_SICR0_DMA(x) (((x) >> 8) & 0xff) + +/* + * TODO; + * Maybe move the logic into slow interrupt handler + */ +void +host_dma_interrupt_handler(mic_dma_handle_t dma_handle, uint32_t sboxSicr0reg) +{ + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *) dma_handle; + uint32_t dma_chan_id; + struct dma_channel *ch; + + for (dma_chan_id = 0; dma_chan_id < 8; dma_chan_id++) { + if (SBOX_SICR0_DMA(sboxSicr0reg) & (0x1 << dma_chan_id)) { + ch = &dma_ctx->dma_channels[dma_chan_id]; + if (ch->desc_ring) + host_dma_lib_interrupt_handler(ch); + } + } +} + +void +host_dma_lib_interrupt_handler(struct dma_channel *chan) +{ + ack_dma_interrupt(chan); + mic_dma_lib_interrupt_handler(chan); +} +#endif + +static void +mi_mic_dma_chan_setup(struct dma_channel *ch, struct mic_dma_ctx_t *dma_ctx) +{ + ch->next_write_index = ch->chan->cached_tail; + + init_ring(&ch->poll_ring, MAX_POLLING_BUFFERS, dma_ctx->device_num); + + ch->intr_ring.comp_cb_array = + kzalloc(sizeof(*ch->intr_ring.comp_cb_array) * NUM_COMP_BUFS, GFP_KERNEL); + init_ring(&ch->intr_ring.ring, NUM_COMP_BUFS, dma_ctx->device_num); + ch->intr_ring.old_tail = 0; +} + +static void +mi_mic_dma_chan_destroy(struct dma_channel *ch, struct mic_dma_ctx_t *dma_ctx) +{ + uninit_ring(&ch->intr_ring.ring, dma_ctx->device_num); + kfree(ch->intr_ring.comp_cb_array); + uninit_ring(&ch->poll_ring, 
dma_ctx->device_num); +} + +int +open_dma_device(int device_num, uint8_t *mmio_va_base, mic_dma_handle_t* dma_handle) +{ + int result = 0; + + if (device_num >= MAX_BOARD_SUPPORTED) + return -EINVAL; + + mutex_lock(&lock_dma_dev_init[device_num]); + if (!mic_dma_context[device_num]) { + mic_dma_context[device_num] = kzalloc(sizeof(struct mic_dma_ctx_t), GFP_KERNEL); + BUG_ON(!mic_dma_context[device_num]); + + mic_dma_context[device_num]->device_num = device_num; + + result = mic_dma_lib_init(mmio_va_base, mic_dma_context[device_num]); + BUG_ON(result); + } + + atomic_inc(&mic_dma_context[device_num]->ref_count); + *dma_handle = mic_dma_context[device_num]; + mutex_unlock(&lock_dma_dev_init[device_num]); + + return result; +} +EXPORT_SYMBOL(open_dma_device); + +void +close_dma_device(int device_num, mic_dma_handle_t *dma_handle) +{ + struct mic_dma_ctx_t *dma_ctx; + + if (device_num >= MAX_BOARD_SUPPORTED) + return; + + mutex_lock(&lock_dma_dev_init[device_num]); + dma_ctx = (struct mic_dma_ctx_t *) *dma_handle; + if (dma_ctx && + atomic_read(&dma_ctx->ref_count) && + atomic_dec_and_test(&dma_ctx->ref_count)) { + mic_dma_lib_uninit(dma_ctx); + mic_dma_context[dma_ctx->device_num] = 0; + *dma_handle = NULL; + kfree(dma_ctx); + } + mutex_unlock(&lock_dma_dev_init[device_num]); +} +EXPORT_SYMBOL(close_dma_device); + +void mi_mic_dma_chan_set_dstat_wb(struct mic_dma_ctx_t *dma_ctx, + struct md_mic_dma_chan *chan) +{ +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + if (!chan->dstat_wb_phys) { + chan->dstat_wb_loc = kzalloc(sizeof(uint32_t), GFP_KERNEL); + +#ifdef _MIC_SCIF_ + chan->dstat_wb_phys = virt_to_phys(chan->dstat_wb_loc); +#else + micscif_pci_dev(dma_ctx->device_num, &pdev); + chan->dstat_wb_phys = mic_map_single(dma_ctx->device_num - 1, pdev, chan->dstat_wb_loc, + sizeof(uint32_t)); + BUG_ON(pci_dma_mapping_error(pdev, chan->dstat_wb_phys)); +#endif + } + md_mic_dma_chan_set_dstat_wb(&dma_ctx->dma_dev, chan); +} + +void +md_mic_dma_chan_setup(struct mic_dma_ctx_t *dma_ctx, struct dma_channel *ch) +{ + md_mic_dma_chan_unmask_intr(&dma_ctx->dma_dev, ch->chan); + + /* + * Disable the channel, update desc ring base and size, write new head + * and then enable the channel. + */ + if (mic_hw_family(ch->dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(ch->dma_ctx->device_num) >= KNC_B0_STEP) { + mi_mic_dma_chan_set_dstat_wb(dma_ctx, ch->chan); + md_mic_dma_chan_set_dcherr_msk(&dma_ctx->dma_dev, ch->chan, 0); + } + md_mic_dma_chan_set_desc_ring(&dma_ctx->dma_dev, ch->chan, + ch->desc_ring_phys, + DMA_DESC_RING_SIZE); + + wmb(); + + md_mic_dma_chan_unmask_intr(&dma_ctx->dma_dev, ch->chan); +} + +int +mic_dma_lib_init(uint8_t *mmio_va_base, struct mic_dma_ctx_t *dma_ctx) +{ + int i; +#ifdef _MIC_SCIF_ + int ret_value; +#endif + struct dma_channel *ch; + enum md_mic_dma_chan_owner owner, currentOwner; + + //pr_debug(PR_PREFIX "Initialized the dma mmio va=%p\n", mmio_va_base); + // Using this to check where the DMA lib is at for now. + currentOwner = mmio_va_base == 0 ? MIC_DMA_CHAN_MIC_OWNED : MIC_DMA_CHAN_HOST_OWNED; + + // TODO: multi-card support + md_mic_dma_init(&dma_ctx->dma_dev, mmio_va_base); + + for (i = 0 ; i < MAX_NUM_DMA_CHAN; i++) { + ch = &dma_ctx->dma_channels[i]; + + /* Initialize pointer to parent */ + ch->dma_ctx = dma_ctx; + + owner = i > __LAST_HOST_CHAN_NUM ? 
MIC_DMA_CHAN_MIC_OWNED + : MIC_DMA_CHAN_HOST_OWNED; + + // This has to be done from card side + ch->chan = md_mic_dma_request_chan(&dma_ctx->dma_dev, owner); + KASSERT((ch->chan != NULL), "dummy\n"); + ch->ch_num = ch->chan->ch_num; + +#ifdef _MIC_SCIF_ + /* + * Host driver would have executed by now and thus setup the + * desc. ring + */ + if (ch->chan->owner == MIC_DMA_CHAN_HOST_OWNED) + md_mic_dma_enable_chan(&dma_ctx->dma_dev, i, true); +#endif + + atomic_set(&(ch->flags), CHAN_INUSE); // Mark as used by default + if (currentOwner == owner) { + alloc_dma_desc_ring_mem(ch, dma_ctx); + +#ifdef _MIC_SCIF_ // DMA now shares the IRQ handler with other system interrupts + ret_value = request_irq(i, dma_interrupt_handler, IRQF_DISABLED, + "dma channel", ch); + ret_value = ret_value; + //pr_debug(PR_PREFIX "Interrupt handler ret value for chan %d = %d\n", i, ret_value); +#endif + md_mic_dma_chan_setup(dma_ctx, ch); + + mi_mic_dma_chan_setup(ch, dma_ctx); + + init_waitqueue_head(&ch->intr_wq); + init_waitqueue_head(&ch->access_wq); + // Only mark owned channel to be available + atomic_set(&(ch->flags), CHAN_AVAILABLE); + md_mic_dma_print_debug(&dma_ctx->dma_dev, ch->chan); + } else { + ch->desc_ring = NULL; + } + } + + /* Initialize last_allocated_dma_channel */ + dma_ctx->last_allocated_dma_channel_num = -1; + //pr_debug(PR_PREFIX "Initialized the dma channels\n"); + mic_dma_proc_init(dma_ctx); + return 0; +} + +void +mic_dma_lib_uninit(struct mic_dma_ctx_t *dma_ctx) +{ + int i; + struct dma_channel *ch; +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + + mic_dma_proc_uninit(dma_ctx); + for (i = 0 ; i < MAX_NUM_DMA_CHAN; i++) { + ch = &dma_ctx->dma_channels[i]; + if (!ch->desc_ring) + continue; + drain_dma_intr(ch); + /* Request the channel but don't free it. Errors are okay */ + request_dma_channel(ch); +#ifdef _MIC_SCIF_ // DMA now shares the IRQ handler with other system interrupts + free_irq(i, ch); +#endif + mi_mic_dma_chan_destroy(ch, dma_ctx); +#ifndef _MIC_SCIF_ + micscif_pci_dev(dma_ctx->device_num, &pdev); + mic_unmap_single(dma_ctx->device_num - 1, pdev, ch->desc_ring_phys, + (DMA_DESC_RING_SIZE * sizeof(*ch->desc_ring)) + PAGE_SIZE); +#endif + + kfree(ch->desc_ring_bak); + ch->desc_ring_bak = NULL; + ch->desc_ring = NULL; + if (mic_hw_family(ch->dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP) { +#ifndef _MIC_SCIF_ + mic_unmap_single(dma_ctx->device_num - 1, pdev, ch->chan->dstat_wb_phys, + sizeof(uint32_t)); +#endif + kfree(ch->chan->dstat_wb_loc); + ch->chan->dstat_wb_loc = NULL; + ch->chan->dstat_wb_phys = 0; + } + md_mic_dma_free_chan(&dma_ctx->dma_dev, ch->chan); + } +#ifndef MIC_IS_EMULATION + /* Ensure that all waiters for DMA channels time out */ + msleep(DMA_TO/HZ * 1000); +#endif + md_mic_dma_uninit(&dma_ctx->dma_dev); + //pr_debug(PR_PREFIX "Uninitialized the dma channels\n"); +} + +/* + * reserve_dma_channel - reserve a given dma channel for exclusive use + * + * @dma_handle - handle to DMA device returned by open_dma_device + * @chan_num - Channel number to be reserved + * @chan - set to point to the dma channel reserved by the call + * + * Returns < 1 on error (errorno) + * Returns 0 on success + * + * NOTES: Should this function sleep waiting for the lock? 
+ * TODO: + * Maybe there should be a blocking and non-blocking versions of this function + */ +int +reserve_dma_channel(mic_dma_handle_t dma_handle, int chan_num, struct dma_channel **chan) +{ + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *) dma_handle; + + /* + * Do we need to do acquire the lock for statically allocated channels? + * I am assuming we dont have to lock + */ + if (CHAN_AVAILABLE == atomic_cmpxchg(&(dma_ctx->dma_channels[chan_num].flags), + CHAN_AVAILABLE, CHAN_INUSE)) { + *chan = &dma_ctx->dma_channels[chan_num]; + return 0; + } + return -1; +} +EXPORT_SYMBOL(reserve_dma_channel); + +/* + * allocate_dma_channel - dynamically allocate a dma channel (for a short while). Will + * search for, choose, and lock down one channel for use by the calling thread. + * + * @dma_handle - handle to DMA device returned by open_dma_device + * @chan - Returns the dma_channel pointer that was allocated by the call + * + * Returns < 1 on error + * Returns 0 on success + * + * NOTE: This function grabs a lock before exiting -- the calling thread MUST NOT + * sleep, and must call free_dma_channel before returning to user-space or switching + * volantarily to another thread. Similarly, this function cannot be called from + * an interrupt context at this time. + * + * TODO: How do we pick a dma channel? + * For now I am doing it in round robin fashion. + */ +int +allocate_dma_channel(mic_dma_handle_t dma_handle, struct dma_channel **chan) +{ + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *) dma_handle; + int i, j; + + if (!dma_ctx) + return -ENODEV; + + j = dma_ctx->last_allocated_dma_channel_num + 1; + + for (i = 0; i < MAX_NUM_DMA_CHAN; i++, j++) { + if (CHAN_AVAILABLE == atomic_cmpxchg(&(dma_ctx->dma_channels[j % + MAX_NUM_DMA_CHAN].flags), + CHAN_AVAILABLE, CHAN_INUSE)) { + *chan = &(dma_ctx->dma_channels[j % MAX_NUM_DMA_CHAN]); + dma_ctx->last_allocated_dma_channel_num = j % MAX_NUM_DMA_CHAN; + return 0; + } + } + return -1; +} +EXPORT_SYMBOL(allocate_dma_channel); + +/* + * request_dma_channel - Request a specific DMA channel. + * + * @chan - Returns the dma_channel pointer that was requested + * + * Returns: 0 on success and -ERESTARTSYS if the wait was interrupted + * or -EBUSY if the channel was not available. + * + * NOTE: This function must call free_dma_channel before returning to + * user-space. + */ +int request_dma_channel(struct dma_channel *chan) +{ + int ret; + + ret = wait_event_interruptible_timeout(chan->access_wq, + CHAN_AVAILABLE == atomic_cmpxchg(&chan->flags, + CHAN_AVAILABLE, CHAN_INUSE), DMA_TO); + if (!ret) { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + ret = -EBUSY; + } + if (ret > 0) + ret = 0; + return ret; +} +EXPORT_SYMBOL(request_dma_channel); + +/* + * free_dma_channel - after allocating a channel, used to + * free the channel after DMAs are submitted + * + * @chan - pointer to the dma_channel struct that was allocated + * + * Returns 0 on success, < 1 on error (errorno) + * + * NOTE: This function must be called after all do_dma calls are finished, + * but can be called before the DMAs actually complete (as long as the comp_cb() + * handler in do_dma don't refer to the dma_channel struct). If called with a + * dynamically allocated dma_chan, the caller must be the thread that called + * allocate_dma_chan. When operating on a dynamic channel, free unlocks the + * mutex locked in allocate. 
Statically allocated channels cannot be freed, + * and calling this function with that type of channel will return an error. + */ +int +free_dma_channel(struct dma_channel *chan) +{ + /* + * Why can't we use this function with channels that were statically allocated?? + */ + BUG_ON(CHAN_INUSE != + atomic_cmpxchg(&chan->flags, CHAN_INUSE, CHAN_AVAILABLE)); + wake_up(&chan->access_wq); + return 0; +} +EXPORT_SYMBOL(free_dma_channel); + +static __always_inline uint32_t +get_dma_tail_pointer(struct dma_channel *chan) +{ + struct mic_dma_device *dma_dev; + dma_dev = &chan->dma_ctx->dma_dev; + return md_mic_dma_chan_read_tail(dma_dev, chan->chan); +} +/* + * Return -1 in case of error + */ +static int +program_memcpy_descriptors(struct dma_channel *chan, uint64_t src, uint64_t dst, size_t len) +{ + size_t current_transfer_len; + bool is_astep = false; + unsigned long ts = jiffies; + + if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) { + if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP) + is_astep = true; + } else { + is_astep = true; + } + do { + current_transfer_len = (len > MAX_DMA_XFER_SIZE) ? + MAX_DMA_XFER_SIZE : len; + + ts = jiffies; + while (!md_avail_desc_ring_space(&chan->dma_ctx->dma_dev, is_astep, chan->chan, + (uint32_t)chan->next_write_index, 1)) { + if (time_after(jiffies,ts + DMA_TO)) { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + return -ENOMEM; + } + } + + //pr_debug("src_phys=0x%llx, dst_phys=0x%llx, size=0x%zx\n", src_phys_addr, dst_phys_addr, current_transfer_len); + md_mic_dma_memcpy_desc(&chan->desc_ring[chan->next_write_index], + src, dst, current_transfer_len); + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); + len -= current_transfer_len; + dst = dst + current_transfer_len; + src = src + current_transfer_len; + } while(len > 0); + + return 0; +} + +/* + * do_dma - main dma function: perform a dma memcpy, len bytes from src to dst + * + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_chan, or statically by + * reserve_dma_chan. Using a channel not allocated in this way will + * result in undefined behavior. + * @flags - ATOMIC, called from an interrupt context (no blocking) + * @src - src physical address + * @dst - dst physical address + * @len - Length of the dma + * @comp_cb - When the DMA is complete, the struct's function will be called. NOTE! + * comp_cb(cb_cookie) is called from an interrupt context, so the + * function must not sleep or block. + * + * TODO: Figure out proper value instead of -2 + * Return < 0 on error + * Return = -2 copy was done successfully, no need to wait + * Return >= 0: DMA has been queued. Return value can be polled on for completion + * if DO_DMA_POLLING was sent in flags + * (poll cookie). An example (simplified w/ no error handling). + * int cookie = do_dma(...); + * while (poll_dma_completion(cookie) == 0); + * printf("DMA now complete\n"); + */ +int +do_dma(struct dma_channel *chan, int flags, uint64_t src, + uint64_t dst, size_t len, struct dma_completion_cb *comp_cb) +{ + /* + * TODO: + * Do we need to assert the ownership of channel?? 
+ */ + int poll_ring_index = -1; + int intr_ring_index = -1; + uint32_t num_status_desc = 0; + bool is_astep = false; + unsigned long ts = jiffies; + + might_sleep(); + if (flags & DO_DMA_INTR && !comp_cb) + return -EINVAL; + + if (!verify_next_write_index(chan)) + return -ENODEV; + + //pr_debug(PR_PREFIX "Current transfer src = 0x%llx,dst = 0x%llx, len = 0x%zx\n", src, dst, len); + if (flags & DO_DMA_INTR) { + int err; + err = wait_event_interruptible_timeout(chan->intr_wq, + (-1 != (intr_ring_index = allocate_buffer(&chan->intr_ring.ring))), + DMA_TO); + if (!err) { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + err = -ENOMEM; + } + if (err > 0) + err = 0; + if (!err) { + chan->intr_ring.comp_cb_array[intr_ring_index] = comp_cb; + num_status_desc++; +#ifdef CONFIG_MK1OM + num_status_desc++; +#endif + } else { + return err; + } + //pr_debug(PR_PREFIX "INTR intr_ring_index=%d, chan_num=%lx\n", intr_ring_index, (chan - dma_channels)); + } + + if (flags & DO_DMA_POLLING) { + poll_ring_index = allocate_buffer(&chan->poll_ring); + if (-1 == poll_ring_index) + return -ENOMEM; + num_status_desc++; + //pr_debug(PR_PREFIX "polling poll_ring_index=%d\n", poll_ring_index); + } + if (len && -ENOMEM == program_memcpy_descriptors(chan, src, dst, len)) { + //pr_debug(PR_PREFIX "ERROR: do_dma: No available space from program_memcpy_descriptors\n"); + return -ENOMEM; + } + + if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) { + if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP) + is_astep = true; + } else { + is_astep = true; + } + + ts = jiffies; + + while (num_status_desc && num_status_desc > md_avail_desc_ring_space(&chan->dma_ctx->dma_dev, + is_astep, chan->chan, (uint32_t)chan->next_write_index, num_status_desc)) { + if (time_after(jiffies,ts + DMA_TO)) { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + return -ENOMEM; + } + //pr_debug(PR_PREFIX "ERROR: do_dma: No available space from md_avail_desc_ring_space\n"); + } + + if (flags & DO_DMA_POLLING) { + incr_head(&chan->poll_ring); + md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index], + poll_ring_index, + chan->poll_ring.tail_phys, + false); + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); + } + + if (flags & DO_DMA_INTR) { + incr_head(&chan->intr_ring.ring); +#ifdef CONFIG_MK1OM + md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index], + intr_ring_index, + chan->intr_ring.ring.tail_phys, + false); + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); +#endif + md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index], + intr_ring_index, + chan->intr_ring.ring.tail_phys, + true); + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); + } + + /* + * TODO: + * Maybe it is better if we update the head pointer for every descriptor?? 
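+ * (A single update below means one MMIO doorbell write per do_dma() call;
+ * a per-descriptor update could let the engine start earlier at the cost
+ * of one MMIO write per descriptor.)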
+ */ + md_mic_dma_chan_write_head(&chan->dma_ctx->dma_dev, chan->chan, (uint32_t)chan->next_write_index); + //pr_debug(PR_PREFIX "in HW chan->next_write_index=%lld\n", chan->next_write_index); + + if (DO_DMA_POLLING & flags) + return poll_ring_index; + return 0; +} +EXPORT_SYMBOL(do_dma); + +/* + * poll_dma_completion - check if a DMA is complete + * + * @poll_cookie - value returned from do_dma + * + * Returns + * 0 -> DMA pending + * 1 -> DMA completed + * + * Note: This is mostly useful after calling do_dma with a NULL comp_cb parameter, as + * it will allow the caller to wait for DMA completion. + */ +int +poll_dma_completion(int poll_cookie, struct dma_channel *chan) +{ + if (!chan) + return -EINVAL; + /* + * In case of interrupts the ISR runs and reads the value + * of the tail location. If we are polling then we need + * to read the value of the tail location before checking + * if the entry is processed. + */ + chan->poll_ring.tail = read_tail(&chan->poll_ring); + return is_entry_processed(&chan->poll_ring, poll_cookie); +} +EXPORT_SYMBOL(poll_dma_completion); + +/* + * do_status_update: Update physical address location with the value provided. + * Ensures all previous DMA descriptors submitted on this DMA + * channel are executed. + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + * @phys - physical address + * @value - Value to be programmed + */ +int do_status_update(struct dma_channel *chan, uint64_t phys, uint64_t value) +{ + unsigned long ts = jiffies; + bool is_astep = false; + + if (!verify_next_write_index(chan)) + return -ENODEV; + + if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) { + if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP) + is_astep = true; + } else { + is_astep = true; + } + /* + * TODO: + * Do we need to assert the ownership of channel?? + */ + ts = jiffies; + while (!md_avail_desc_ring_space(&chan->dma_ctx->dma_dev, + is_astep, chan->chan, (uint32_t) chan->next_write_index, 1)) { + cpu_relax(); + if (time_after(jiffies,ts + DMA_TO)) { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + return -EBUSY; + } + } + + md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index], + value, + phys, + false); + + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); + + md_mic_dma_chan_write_head(&chan->dma_ctx->dma_dev, + chan->chan, (uint32_t)chan->next_write_index); + return 0; +} +EXPORT_SYMBOL(do_status_update); + +/* + * get_dma_mark: Obtain current value of DMA mark + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + */ +int get_dma_mark(struct dma_channel *chan) +{ + if (chan) + return chan->intr_ring.ring.head; + else + return -1; +} +EXPORT_SYMBOL(get_dma_mark); + +/* + * program_dma_mark: Increment the current value of the DMA mark for a DMA channel + * and program an interrupt status update descriptor which ensures that all DMA + * descriptors programmed uptil this point in time are completed. + * @chan - DMA channel to use for the transfer. 
The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + */ +int program_dma_mark(struct dma_channel *chan) +{ + /* + * TODO: + * Do we need to assert the ownership of channel?? + */ + int intr_ring_index; + int err; + unsigned long ts = jiffies; + uint32_t num_status_desc = 1; + bool is_astep = false; + + if (!verify_next_write_index(chan)) + return -ENODEV; + + if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) { + if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP) + is_astep = true; + } else { + is_astep = true; + } + might_sleep(); + err = wait_event_interruptible_timeout(chan->intr_wq, + (-1 != (intr_ring_index = allocate_buffer(&chan->intr_ring.ring))), + DMA_TO); + if (!err) + err = -EBUSY; + if (err > 0) + err = 0; + if (err) + return err; + +#ifdef CONFIG_MK1OM + num_status_desc++; +#endif + ts = jiffies; + while (num_status_desc > md_avail_desc_ring_space(&chan->dma_ctx->dma_dev, + is_astep, chan->chan, (uint32_t)chan->next_write_index, num_status_desc)) { + cpu_relax(); + if (time_after(jiffies,ts + DMA_TO)) { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + return -EBUSY; + } + } + + chan->intr_ring.comp_cb_array[intr_ring_index] = NULL; + + incr_head(&chan->intr_ring.ring); +#ifdef CONFIG_MK1OM + md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index], + intr_ring_index, + chan->intr_ring.ring.tail_phys, + false); + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); +#endif + md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index], + intr_ring_index, + chan->intr_ring.ring.tail_phys, + true); + chan->next_write_index = incr_rb_index((int)chan->next_write_index, + chan->chan->num_desc_in_ring); + + md_mic_dma_chan_write_head(&chan->dma_ctx->dma_dev, chan->chan, (uint32_t)chan->next_write_index); + return intr_ring_index; +} +EXPORT_SYMBOL(program_dma_mark); + +/* + * is_current_dma_mark: Check if the dma mark provided is the current DMA mark. + * @chan - DMA channel + * @mark - DMA mark + * + * Return true on success and false on failure. + */ +bool is_current_dma_mark(struct dma_channel *chan, int mark) +{ + return (get_dma_mark(chan) == mark); +} +EXPORT_SYMBOL(is_current_dma_mark); + +/* + * is_dma_mark_processed: Check if the dma mark provided has been processed. + * @chan - DMA channel + * @mark - DMA mark + * + * Return true on success and false on failure. + */ +bool is_dma_mark_processed(struct dma_channel *chan, int mark) +{ + return is_entry_processed(&chan->intr_ring.ring, mark); +} +EXPORT_SYMBOL(is_dma_mark_processed); + +/* + * dma_mark_wait: + * @chan - DMA channel + * @mark - DMA mark + * @is_interruptible - Use wait_event_interruptible() or not. + * + * Wait for the dma mark to complete. + * Return 0 on success and appropriate error value on error. 
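+ *
+ * A typical fence sequence, mirroring drain_dma_intr() below (error
+ * handling omitted):
+ *
+ *	if (!request_dma_channel(chan)) {
+ *		int mark = program_dma_mark(chan);
+ *		free_dma_channel(chan);
+ *		if (mark >= 0)
+ *			dma_mark_wait(chan, mark, false);
+ *	}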
+ */ +int dma_mark_wait(struct dma_channel *chan, int mark, bool is_interruptible) +{ + int err = 0; + uint32_t prev_tail = 0, new_tail; + uint32_t count = 0; + + if (chan) { + might_sleep(); +__retry: + if (is_interruptible) + err = wait_event_interruptible_timeout( + chan->intr_wq, + is_dma_mark_processed(chan, mark), + DMA_TO); + else + err = wait_event_timeout(chan->intr_wq, + is_dma_mark_processed(chan, mark), DMA_TO); + + if (!err) { // 0 is timeout + new_tail = get_dma_tail_pointer(chan); + if ((count <= DMA_FENCE_TIMEOUT_CNT) && + (!count || new_tail != prev_tail)) { // For performance, prev_tail is not read at the begining + prev_tail = new_tail; + count++; + pr_debug("DMA fence wating is still ongoing, waiting for %d seconds\n", DMA_TO/HZ *count); + goto __retry; + } else { + printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num); + err = -EBUSY; + } + } + if (err > 0) + err = 0; + } + return err; +} +EXPORT_SYMBOL(dma_mark_wait); + +/* + * drain_dma_poll - Drain all outstanding DMA operations for a particular + * DMA channel via polling. + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +int drain_dma_poll(struct dma_channel *chan) +{ + int cookie, err; + unsigned long ts; + uint32_t prev_tail = 0, new_tail, count = 0; + if (chan) { + if ((err = request_dma_channel(chan))) + goto error; + if ((cookie = do_dma(chan, + DO_DMA_POLLING, 0, 0, 0, NULL)) < 0) { + err = cookie; + free_dma_channel(chan); + goto error; + } + free_dma_channel(chan); + ts = jiffies; + while (1 != poll_dma_completion(cookie, chan)) { + cpu_relax(); + if (time_after(jiffies,ts + DMA_TO)) { + new_tail = get_dma_tail_pointer(chan); + if ((!count || new_tail != prev_tail) && (count <= DMA_FENCE_TIMEOUT_CNT)) { + prev_tail = new_tail; + ts = jiffies; + count++; + pr_debug("polling DMA is still ongoing, wating for %d seconds\n", DMA_TO/HZ * count); + } else { + err = -EBUSY; + break; + } + } + } +error: + if (err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + } else { + err = -EINVAL; + } + return err; +} +EXPORT_SYMBOL(drain_dma_poll); + +/* + * drain_dma_intr - Drain all outstanding DMA operations for a particular + * DMA channel via interrupt based blocking wait. + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +int drain_dma_intr(struct dma_channel *chan) +{ + int cookie, err; + + if (chan) { + if ((err = request_dma_channel(chan))) + goto error; + if ((cookie = program_dma_mark(chan)) < 0) { + err = cookie; + free_dma_channel(chan); + goto error; + } + free_dma_channel(chan); + err = dma_mark_wait(chan, cookie, false); +error: + if (err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + } else { + err = -EINVAL; + } + return err; +} +EXPORT_SYMBOL(drain_dma_intr); + +/* + * drain_dma_global - Drain all outstanding DMA operations for + * all online DMA channel. + * Return none + */ +int drain_dma_global(mic_dma_handle_t dma_handle) +{ + int i, err = -EINVAL; + struct dma_channel *chan; + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle; + + if (!dma_ctx) + return err; + + might_sleep(); + for (i = 0 ; i < MAX_NUM_DMA_CHAN; i++) { + chan = &dma_ctx->dma_channels[i]; + if (chan->desc_ring == NULL) + continue; + if ((err = drain_dma_intr(chan))) + break; + } + return err; +} +EXPORT_SYMBOL(drain_dma_global); + +#ifdef _MIC_SCIF_ +/* + * dma_suspend: DMA tasks before transition to low power state. + * @dma_handle: Handle for a DMA driver context. 
+ * + * Perform the following tasks before the device transitions + * to a low power state: + * 1) Store away the DMA descriptor ring physical address base for + * all DMA channels (both host/uOS owned) since the value would be + * required to reinitialize the DMA channels upon transition from + * low power to active state. + * + * Return: none + * Notes: Invoked only on MIC. + */ +void dma_suspend(mic_dma_handle_t dma_handle) +{ + int i; + struct dma_channel *ch; + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle; + struct mic_dma_device *dma_dev = &dma_ctx->dma_dev; + + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { + ch = &dma_ctx->dma_channels[i]; + ch->desc_ring_phys = + md_mic_dma_chan_get_desc_ring_phys(dma_dev, ch->chan); + ch->chan->dstat_wb_phys = + md_mic_dma_chan_get_dstatwb_phys(dma_dev, ch->chan); + } +} +EXPORT_SYMBOL(dma_suspend); + +/* + * dma_resume: DMA tasks after wake up from low power state. + * @dma_handle: Handle for a DMA driver context. + * + * Performs the following tasks before the device transitions + * from a low power state to active state: + * 1) As a test, reset the value in DMA configuration register. + * 2) Reset the next_write_index for the DMA descriptor ring to 0 + * since the DMA channel will be reset shortly. + * 3) Reinitialize the DMA MD layer for the channel. + * + * Return: none + * Notes: + * Notes: Invoked only on MIC. + */ +void dma_resume(mic_dma_handle_t dma_handle) +{ + int i; + struct dma_channel *ch; + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle; + struct mic_dma_device *dma_dev = &dma_ctx->dma_dev; + + /* TODO: Remove test write to SBOX_DCR */ + mic_sbox_write_mmio(dma_dev->mm_sbox, SBOX_DCR, 0); + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { + ch = &dma_ctx->dma_channels[i]; + ch->next_write_index = 0; + md_mic_dma_chan_init_attr(dma_dev, ch->chan); + md_mic_dma_chan_setup(dma_ctx, ch); + } +} +EXPORT_SYMBOL(dma_resume); + +#else + +/* + * dma_prep_suspend: DMA tasks required on host before a device can transition + * to a low power state. + * @dma_handle: Handle for a DMA driver context. + * + * Performs the following tasks on the host before the device can be allowed + * to transiti to a low power state. + * 1) Reset the next_Write_index for the DMA descriptor ring to 0 + * since the DMA channel will be reset shortly. This is required primarily + * for Host owned DMA channels since MIC does not have access to this + * information. + * Return: none + * Invoked only on Host. 
+ */ +void dma_prep_suspend(mic_dma_handle_t dma_handle) +{ + int i; + struct dma_channel *ch; + struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle; + + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { + ch = &dma_ctx->dma_channels[i]; + ch->next_write_index = 0; + } +} +EXPORT_SYMBOL(dma_prep_suspend); +#endif + +#ifdef CONFIG_PAGE_CACHE_DMA +#ifdef _MIC_SCIF_ +static const struct dma_operations dma_operations_fast_copy = { + .do_dma = do_dma, + .poll_dma_completion = poll_dma_completion, + .free_dma_channel = free_dma_channel, + .open_dma_device = open_dma_device, + .close_dma_device = close_dma_device, + .allocate_dma_channel = allocate_dma_channel, + .program_descriptors = program_memcpy_descriptors, + .do_dma_polling = DO_DMA_POLLING, +}; + +static const struct file_dma fdma_callback = { + .dmaops = &dma_operations_fast_copy, +}; +#endif +#endif + +#ifdef _MIC_SCIF_ +static int +#else +int +#endif +mic_dma_init(void) +{ + int i; + + for (i = 0; i < MAX_BOARD_SUPPORTED; i++) + mutex_init (&lock_dma_dev_init[i]); +#ifdef CONFIG_PAGE_CACHE_DMA +#ifdef _MIC_SCIF_ + register_dma_for_fast_copy(&fdma_callback); +#endif +#endif + return 0; +} + +#ifdef _MIC_SCIF_ +static void mic_dma_uninit(void) +{ +#ifdef CONFIG_PAGE_CACHE_DMA + unregister_dma_for_fast_copy(); +#endif +} + +module_init(mic_dma_init); +module_exit(mic_dma_uninit); +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +static int +mic_dma_proc_ring_show(struct seq_file *m, void *data) +{ + struct mic_dma_ctx_t *dma_ctx = m->private; + mic_ctx_t *mic_ctx = get_per_dev_ctx(dma_ctx->device_num - 1); + int i, err; + struct compl_buf_ring *ring; + + if ((err = micpm_get_reference(mic_ctx, true))) { + printk(KERN_ERR "%s %d: unable to get micpm reference: %d\n", + __func__, __LINE__, err); + return err; + } + + seq_printf(m, "Intr rings\n"); + seq_printf(m, "%-10s%-12s%-12s%-12s%-25s%-18s%-25s\n", + "Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail", "In Use"); + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { + ring = &dma_ctx->dma_channels[i].intr_ring.ring; + seq_printf(m, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x%-#18x\n", + i, ring->head, ring->tail, ring->size, + ring->tail_location, *(int*)ring->tail_location, + atomic_read(&dma_ctx->dma_channels[i].flags)); + } + seq_printf(m, "Poll rings\n"); + seq_printf(m, "%-10s%-12s%-12s%-12s%-25s%-18s\n", + "Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail"); + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { + ring = &dma_ctx->dma_channels[i].poll_ring; + seq_printf(m, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x\n", + i, ring->head, ring->tail, ring->size, + ring->tail_location, *(int*)ring->tail_location); + } + seq_printf(m, "Next_Write_Index\n"); + seq_printf(m, "%-10s%-12s\n", "Chan", "Next_Write_Index"); + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { + seq_printf(m, "%-#10x%-#12llx\n", + i, dma_ctx->dma_channels[i].next_write_index); + } + micpm_put_reference(mic_ctx); + return 0; +} + +static int +mic_dma_proc_ring_open(struct inode *inode, struct file *file) +{ + return single_open(file, mic_dma_proc_ring_show, PDE_DATA(inode)); +} + +static int +mic_dma_proc_reg_show(struct seq_file *m, void *data) +{ + int i, j, chan_num, size, dtpr, err; + struct mic_dma_ctx_t *dma_ctx = m->private; + mic_ctx_t *mic_ctx = get_per_dev_ctx(dma_ctx->device_num - 1); + struct mic_dma_device *dma_dev = &dma_ctx->dma_dev; + struct dma_channel *curr_chan; + union md_mic_dma_desc desc; + + if ((err = micpm_get_reference(mic_ctx, true))) { + printk(KERN_ERR "%s %d: 
unable to get micpm reference: %d\n", + __func__, __LINE__, err); + return err; + } + + seq_printf(m, "========================================" + "=======================================\n"); + seq_printf(m, "SBOX_DCR: %#x\n", + mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR)); + seq_printf(m, "DMA Channel Registers\n"); + seq_printf(m, "========================================" + "=======================================\n"); + seq_printf(m, "%-10s| %-10s %-10s %-10s %-10s %-10s %-10s" +#ifdef CONFIG_MK1OM + " %-10s %-11s %-14s %-10s" +#endif + "\n", "Channel", "DCAR", "DTPR", "DHPR", + "DRAR_HI", "DRAR_LO", +#ifdef CONFIG_MK1OM + "DSTATWB_LO", "DSTATWB_HI", "DSTAT_CHERR", "DSTAT_CHERRMSK", +#endif + "DSTAT"); + seq_printf(m, "========================================" + "=======================================\n"); + +#ifdef _MIC_SCIF_ + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { +#else + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { +#endif + curr_chan = &dma_ctx->dma_channels[i]; + chan_num = curr_chan->ch_num; + seq_printf(m, "%-10i| %-#10x %-#10x %-#10x %-#10x" + " %-#10x" +#ifdef CONFIG_MK1OM + " %-#10x %-#11x %-#10x %-#14x" +#endif + " %-#10x\n", chan_num, + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO), +#ifdef CONFIG_MK1OM + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_LO), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_HI), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERRMSK), +#endif + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTAT)); + } + + seq_printf(m, "\nDMA Channel Descriptor Rings\n"); + seq_printf(m, "========================================" + "=======================================\n"); + + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { + curr_chan = &dma_ctx->dma_channels[i]; + chan_num = curr_chan->ch_num; + dtpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR); + seq_printf(m, "Channel %i: [", chan_num); + size = ((int) md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR) + - dtpr) % curr_chan->chan->num_desc_in_ring; + /* + * In KNC B0, empty condition is tail = head -1 + */ + if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP) + size -= 1; + + for (j = 0; j < size; j++) { + desc = curr_chan->desc_ring[(j+dtpr) % + curr_chan->chan->num_desc_in_ring]; + + switch (desc.desc.nop.type){ + case NOP: + seq_printf(m," {Type: NOP, 0x%#llx" + " %#llx} ", desc.qwords.qw0, + desc.qwords.qw1); + case MEMCOPY: + seq_printf(m," {Type: MEMCOPY, SAP:" + " 0x%#llx, DAP: %#llx, length: %#llx} ", + (uint64_t) desc.desc.memcopy.sap, + (uint64_t) desc.desc.memcopy.dap, + (uint64_t) desc.desc.memcopy.length); + break; + case STATUS: + seq_printf(m," {Type: STATUS, data:" + " 0x%#llx, DAP: %#llx, intr: %lli} ", + (uint64_t) desc.desc.status.data, + (uint64_t) desc.desc.status.dap, + (uint64_t) desc.desc.status.intr); + break; + case GENERAL: + seq_printf(m," {Type: GENERAL, " + "DAP: %#llx, dword: %#llx} ", + (uint64_t) desc.desc.general.dap, + (uint64_t) desc.desc.general.data); + break; + case KEYNONCECNT: + seq_printf(m," {Type: KEYNONCECNT, sel: " + "%lli, h: %lli, index: %lli, cs: %lli," + " value: %#llx} ", + (uint64_t) desc.desc.keynoncecnt.sel, + (uint64_t) desc.desc.keynoncecnt.h, + (uint64_t) 
desc.desc.keynoncecnt.index, + (uint64_t) desc.desc.keynoncecnt.cs, + (uint64_t) desc.desc.keynoncecnt.data); + break; + case KEY: + seq_printf(m," {Type: KEY, dest_ind" + "ex: %lli, ski: %lli, skap: %#llx ", + (uint64_t) desc.desc.key.di, + (uint64_t) desc.desc.key.ski, + (uint64_t) desc.desc.key.skap); + break; + default: + seq_printf(m," {Uknown Type=%lli ," + "%#llx %#llx} ",(uint64_t) desc.desc.nop.type, + (uint64_t) desc.qwords.qw0, + (uint64_t) desc.qwords.qw1); + } + } + seq_printf(m, "]\n"); + if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP && + curr_chan->chan->dstat_wb_loc) + seq_printf(m, "DSTAT_WB = 0x%x\n", + *((uint32_t*)curr_chan->chan->dstat_wb_loc)); + } + micpm_put_reference(mic_ctx); + + return 0; +} + +static int +mic_dma_proc_reg_open(struct inode *inode, struct file *file) +{ + return single_open(file, mic_dma_proc_reg_show, PDE_DATA(inode)); +} + +struct file_operations micdma_ring_fops = { + .open = mic_dma_proc_ring_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +struct file_operations micdma_reg_fops = { + .open = mic_dma_proc_reg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void +mic_dma_proc_init(struct mic_dma_ctx_t *dma_ctx) +{ + char name[64]; + + snprintf(name, 63, "%s%d", proc_dma_ring, dma_ctx->device_num); + if (!proc_create_data(name, S_IFREG | S_IRUGO, NULL, &micdma_ring_fops, dma_ctx)) + printk("micdma: unable to register /proc/%s\n", name); + + snprintf(name, 63, "%s%d", proc_dma_reg, dma_ctx->device_num); + if (!proc_create_data(name, S_IFREG | S_IRUGO, NULL, &micdma_reg_fops, dma_ctx)) + printk("micdma: unable to register /proc/%s\n", name); + +} +#else // LINUX VERSION +static int +mic_dma_proc_read_fn(char *buf, char **start, off_t offset, int count, int *eof, void *data) +{ + struct mic_dma_ctx_t *dma_ctx = data; + int i, len = 0; + struct compl_buf_ring *ring; + + len += sprintf(buf + len, "Intr rings\n"); + len += sprintf(buf + len, "%-10s%-12s%-12s%-12s%-25s%-18s%-25s\n", + "Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail", "In Use"); + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { + ring = &dma_ctx->dma_channels[i].intr_ring.ring; + len += sprintf(buf + len, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x%-#18x\n", + i, ring->head, ring->tail, ring->size, + ring->tail_location, *(int*)ring->tail_location, + atomic_read(&dma_ctx->dma_channels[i].flags)); + } + len += sprintf(buf + len, "Poll rings\n"); + len += sprintf(buf + len, "%-10s%-12s%-12s%-12s%-25s%-18s\n", + "Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail"); + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { + ring = &dma_ctx->dma_channels[i].poll_ring; + len += sprintf(buf + len, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x\n", + i, ring->head, ring->tail, ring->size, + ring->tail_location, *(int*)ring->tail_location); + } + len += sprintf(buf + len, "Next_Write_Index\n"); + len += sprintf(buf + len, "%-10s%-12s\n", "Chan", "Next_Write_Index"); + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { + len += sprintf(buf + len, "%-#10x%-#12llx\n", + i, dma_ctx->dma_channels[i].next_write_index); + } + return len; +} + +static int +mic_dma_proc_read_registers_fn(char *buf, char **start, off_t offset, int count, + int *eof, void *data) +{ + int i, j, chan_num, size, dtpr, len = 0; + struct mic_dma_ctx_t *dma_ctx = data; + struct mic_dma_device *dma_dev = &dma_ctx->dma_dev; + struct dma_channel *curr_chan; + union md_mic_dma_desc desc; 
+ + len += sprintf(buf + len, "========================================" + "=======================================\n"); + len += sprintf(buf + len, "SBOX_DCR: %#x\n", + mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR)); + len += sprintf(buf + len, "DMA Channel Registers\n"); + len += sprintf(buf + len, "========================================" + "=======================================\n"); + len += sprintf(buf + len, "%-10s| %-10s %-10s %-10s %-10s %-10s %-10s" +#ifdef CONFIG_MK1OM + " %-10s %-11s %-14s %-10s" +#endif + "\n", "Channel", "DCAR", "DTPR", "DHPR", + "DRAR_HI", "DRAR_LO", +#ifdef CONFIG_MK1OM + "DSTATWB_LO", "DSTATWB_HI", "DSTAT_CHERR", "DSTAT_CHERRMSK", +#endif + "DSTAT"); + len += sprintf(buf + len, "========================================" + "=======================================\n"); + +#ifdef _MIC_SCIF_ + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { +#else + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { +#endif + curr_chan = &dma_ctx->dma_channels[i]; + chan_num = curr_chan->ch_num; + len += sprintf(buf + len, "%-10i| %-#10x %-#10x %-#10x %-#10x" + " %-#10x" +#ifdef CONFIG_MK1OM + " %-#10x %-#11x %-#10x %-#14x" +#endif + " %-#10x\n", chan_num, + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO), +#ifdef CONFIG_MK1OM + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_LO), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_HI), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERR), + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERRMSK), +#endif + md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTAT)); + } + + len += sprintf(buf + len, "\nDMA Channel Descriptor Rings\n"); + len += sprintf(buf + len, "========================================" + "=======================================\n"); + + for (i = first_dma_chan(); i <= last_dma_chan(); i++) { + curr_chan = &dma_ctx->dma_channels[i]; + chan_num = curr_chan->ch_num; + dtpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR); + len += sprintf(buf + len, "Channel %i: [", chan_num); + size = ((int) md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR) + - dtpr) % curr_chan->chan->num_desc_in_ring; + /* + * In KNC B0, empty condition is tail = head -1 + */ + if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP) + size -= 1; + + for (j = 0; j < size; j++) { + desc = curr_chan->desc_ring[(j+dtpr) % + curr_chan->chan->num_desc_in_ring]; + + switch (desc.desc.nop.type){ + case NOP: + len += sprintf(buf + len," {Type: NOP, 0x%#llx" + " %#llx} ", desc.qwords.qw0, + desc.qwords.qw1); + case MEMCOPY: + len += sprintf(buf + len," {Type: MEMCOPY, SAP:" + " 0x%#llx, DAP: %#llx, length: %#llx} ", + (uint64_t) desc.desc.memcopy.sap, + (uint64_t) desc.desc.memcopy.dap, + (uint64_t) desc.desc.memcopy.length); + break; + case STATUS: + len += sprintf(buf + len," {Type: STATUS, data:" + " 0x%#llx, DAP: %#llx, intr: %lli} ", + (uint64_t) desc.desc.status.data, + (uint64_t) desc.desc.status.dap, + (uint64_t) desc.desc.status.intr); + break; + case GENERAL: + len += sprintf(buf + len," {Type: GENERAL, " + "DAP: %#llx, dword: %#llx} ", + (uint64_t) desc.desc.general.dap, + (uint64_t) desc.desc.general.data); + break; + case KEYNONCECNT: + len += sprintf(buf + len," {Type: KEYNONCECNT, sel: " + "%lli, h: %lli, index: %lli, cs: %lli," + " value: %#llx} ", + (uint64_t) 
desc.desc.keynoncecnt.sel, + (uint64_t) desc.desc.keynoncecnt.h, + (uint64_t) desc.desc.keynoncecnt.index, + (uint64_t) desc.desc.keynoncecnt.cs, + (uint64_t) desc.desc.keynoncecnt.data); + break; + case KEY: + len += sprintf(buf + len," {Type: KEY, dest_ind" + "ex: %lli, ski: %lli, skap: %#llx ", + (uint64_t) desc.desc.key.di, + (uint64_t) desc.desc.key.ski, + (uint64_t) desc.desc.key.skap); + break; + default: + len += sprintf(buf + len," {Uknown Type=%lli ," + "%#llx %#llx} ",(uint64_t) desc.desc.nop.type, + (uint64_t) desc.qwords.qw0, + (uint64_t) desc.qwords.qw1); + } + } + len += sprintf(buf + len, "]\n"); + if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC && + mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP && + curr_chan->chan->dstat_wb_loc) + len += sprintf(buf + len, "DSTAT_WB = 0x%x\n", + *((uint32_t*)curr_chan->chan->dstat_wb_loc)); + } + return len; +} + +static void +mic_dma_proc_init(struct mic_dma_ctx_t *dma_ctx) +{ + struct proc_dir_entry *dma_proc; + char name[64]; + + snprintf(name, 63, "%s%d", proc_dma_ring, dma_ctx->device_num); + if ((dma_proc = create_proc_entry(name, S_IFREG | S_IRUGO, NULL)) != NULL) { + dma_proc->read_proc = mic_dma_proc_read_fn; + dma_proc->data = dma_ctx; + } + snprintf(name, 63, "%s%d", proc_dma_reg, dma_ctx->device_num); + if ((dma_proc = create_proc_entry(name, S_IFREG | S_IRUGO, NULL)) != NULL) { + dma_proc->read_proc = mic_dma_proc_read_registers_fn; + dma_proc->data = dma_ctx; + } + +} +#endif // LINUX VERSION + +static void +mic_dma_proc_uninit(struct mic_dma_ctx_t *dma_ctx) +{ + char name[64]; + + snprintf(name, 63, "%s%d", proc_dma_reg, dma_ctx->device_num); + remove_proc_entry(name, NULL); + snprintf(name, 63, "%s%d", proc_dma_ring, dma_ctx->device_num); + remove_proc_entry(name, NULL); +} diff --git a/dma/mic_dma_md.c b/dma/mic_dma_md.c new file mode 100644 index 0000000..705c504 --- /dev/null +++ b/dma/mic_dma_md.c @@ -0,0 +1,522 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#define PR_PREFIX "DMA_LIB_MD:" + +#ifdef CONFIG_ML1OM +#define MIC_DMA_AES_CHAN_NUM 7 +#define is_AES_channel(n) ((n) == MIC_DMA_AES_CHAN_NUM) +#else +#define is_AES_channel(n) ((void)(n), 0) +#endif + +#define DMA_CHAN_COOKIE 0xdeadc0d + +#define SBOX_DCAR_IM0 (0x1 << 24) // APIC Interrupt mask bit +#define SBOX_DCAR_IM1 (0x1 << 25) // MSI-X Interrupt mask bit +#define SBOX_DCAR_IS0 (0x1 << 26) // Interrupt status + +#define SBOX_DRARHI_SYS_MASK (0x1 << 26) + +#ifdef _MIC_SCIF_ +static inline uint32_t chan_to_dcr_mask(uint32_t dcr, struct md_mic_dma_chan *chan, struct mic_dma_device *dma_dev) +{ + uint32_t chan_num = chan->ch_num; + uint32_t owner; + + if (!is_AES_channel(chan_num)) + owner = chan->owner; + else + owner = chan->endianness; + + return ((dcr & ~(0x1 << (chan_num * 2))) | (owner << (chan_num * 2))); +} +#endif + +static inline uint32_t drar_hi_to_ba_bits(uint32_t drar_hi) +{ + /* + * Setting bits 3:2 should generate a DESC_ADDR_ERR but the hardware ignores + * these bits currently and doesn't generate the error. + */ +#ifdef _MIC_SCIF_ + return drar_hi & 0xf; +#else + return drar_hi & 0x3; +#endif +} + +static inline uint32_t physaddr_to_drarhi_ba(phys_addr_t phys_addr) +{ + return drar_hi_to_ba_bits((uint32_t)(phys_addr >> 32)); +} + +static inline uint32_t size_to_drar_hi_size(uint32_t size) +{ + return (size & 0x1ffff) << 4; +} + +static inline uint32_t addr_to_drar_hi_smpt_bits(phys_addr_t mic_phys_addr) +{ + return ((mic_phys_addr >> MIC_SYSTEM_PAGE_SHIFT) & 0x1f) << 21; +} + +static inline uint32_t drar_hi_to_smpt(uint32_t drar_hi, uint32_t chan_num) +{ + return ((drar_hi >> 21) & 0x1f); +} + +void md_mic_dma_enable_chan(struct mic_dma_device *dma_dev, uint32_t chan_num, bool enable); + + +#ifdef _MIC_SCIF_ +/** + * md_mic_dma_chan_init_attr - Set channel attributes like owner and endianness + * @chan: The DMA channel handle + */ +void md_mic_dma_chan_init_attr(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan) +{ + uint32_t dcr; + + CHECK_CHAN(chan); + + dcr = mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR); + dcr = chan_to_dcr_mask(dcr, chan, dma_dev); + mic_sbox_write_mmio(dma_dev->mm_sbox, SBOX_DCR, dcr); +} +#endif + +/* One time DMA Init API */ +void md_mic_dma_init(struct mic_dma_device *dma_dev, uint8_t *mmio_va_base) +{ + int i; +#ifdef _MIC_SCIF_ + dma_dev->mm_sbox = mic_sbox_md_init(); +#else + dma_dev->mm_sbox = mmio_va_base; +#endif + //pr_debug("sbox: va=%p\n", dma_dev.mm_sbox); + + for (i = 0; i < MAX_NUM_DMA_CHAN; i++) { + atomic_set(&(dma_dev->chan_info[i].in_use), CHAN_AVAILABLE); + dma_dev->chan_info[i].cookie = DMA_CHAN_COOKIE; + dma_dev->chan_info[i].dstat_wb_phys = 0; + dma_dev->chan_info[i].dstat_wb_loc = NULL; + } + return; +} + +/* One time DMA Uninit API */ +void md_mic_dma_uninit(struct mic_dma_device *dma_dev) +{ + return; +} + +/** + * md_mic_dma_request_chan + * @owner: DMA channel owner: MIC or Host + * + * Return - The DMA channel handle or NULL if failed + * + * Note: Allocating a Host owned channel is not allowed currently + */ +struct md_mic_dma_chan *md_mic_dma_request_chan(struct mic_dma_device *dma_dev, + enum md_mic_dma_chan_owner owner) +{ + struct md_mic_dma_chan *tmp = NULL; + int i; + + for (i = 0; i < 
MAX_NUM_DMA_CHAN; i++) { + if (CHAN_AVAILABLE == atomic_cmpxchg(&(dma_dev->chan_info[i].in_use), + CHAN_AVAILABLE, CHAN_INUSE)) { + tmp = &dma_dev->chan_info[i]; + tmp->owner = owner; + tmp->ch_num = i; + /* + * Setting endianness by default to MIC_LITTLE_ENDIAN + * in case the AES channel is used for clear transfers + * This is a don't care for clear transfers. + */ + tmp->endianness = MIC_LITTLE_ENDIAN; +#ifdef _MIC_SCIF_ + md_mic_dma_chan_init_attr(dma_dev, tmp); +#endif + break; + } + } + return tmp; +} + +/** + * md_mic_dma_free_chan - Frees up a DMA channel + * @chan: The DMA channel handle + */ +void md_mic_dma_free_chan(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan) +{ + CHECK_CHAN(chan); + atomic_set(&(chan->in_use), CHAN_AVAILABLE); + md_mic_dma_enable_chan(dma_dev, chan->ch_num, false); +} + +/** + * md_mic_dma_enable_chan - Enable/disable the DMA channel + * @chan_num: The DMA channel + * @enable: enable/disable + * + * Must set desc ring and update head pointer only + * after disabling the channel + */ +void md_mic_dma_enable_chan(struct mic_dma_device *dma_dev, + uint32_t chan_num, bool enable) +{ + uint32_t dcr = mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR); + + /* + * There is a separate bit for every channel. + * Look up sboxDcrReg. + */ + if (enable) { + dcr |= 2 << (chan_num << 1); + } else { + dcr &= ~(2 << (chan_num << 1)); + } + mic_sbox_write_mmio(dma_dev->mm_sbox, SBOX_DCR, dcr); +} + +#if 0 +uint32_t md_mic_dma_chan_read_completion_count(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + CHECK_CHAN(chan); + + return (md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DSTAT) & 0xffff); +} + + +/* This function needs to be used only in error case */ +void update_compcount_and_tail(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + chan->completion_count = md_mic_dma_chan_read_completion_count(dma_dev, chan); + chan->cached_tail = md_mic_dma_chan_read_tail(dma_dev, chan); +} +#endif +void md_mic_dma_chan_set_dstat_wb(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan) +{ + uint32_t dstat_wb, dstat_wb_hi; + CHECK_CHAN(chan); + + dstat_wb = (uint32_t)chan->dstat_wb_phys; + dstat_wb_hi = chan->dstat_wb_phys >> 32; + md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DSTATWB_LO, dstat_wb); + md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DSTATWB_HI, dstat_wb_hi); +} + +void md_mic_dma_chan_set_dcherr_msk(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan, uint32_t mask) +{ + CHECK_CHAN(chan); + md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DCHERRMSK, mask); +} +#if 0 +uint32_t md_mic_dma_chan_get_dcherr_msk(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan) +{ + CHECK_CHAN(chan); + return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCHERRMSK); +} + +uint32_t md_mic_dma_chan_get_dcherr(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan) +{ + CHECK_CHAN(chan); + return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCHERR); +} + +void md_mic_dma_chan_set_dcherr(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan, uint32_t value) +{ + CHECK_CHAN(chan); + md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DCHERR, value); + printk("dcherr = %d\n", md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCHERR)); +} +#endif + +/** + * md_mic_dma_chan_set_desc_ring - Configures the DMA channel desc ring + * @chan: The DMA channel handle + * @desc_ring_phys_addr: Physical address of the desc ring base. Needs to be + * physically contiguous and wired down memory. 
+ * @num_desc: Number of descriptors must be a multiple of cache line size. + * Descriptor size should be determined using sizeof(union md_mic_dma_desc). + * The maximum number of descriptors is defined by + * MIC_MAX_NUM_DESC_PER_RING. + */ +void md_mic_dma_chan_set_desc_ring(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan, + phys_addr_t desc_ring_phys_addr, + uint32_t num_desc) +{ + uint32_t chan_num; + uint32_t drar_lo = 0; + uint32_t drar_hi = 0; + + CHECK_CHAN(chan); + chan_num = chan->ch_num; + /* + * TODO: Maybe the 2nd condition should be different considering the + * size of union md_mic_dma_desc? + */ + KASSERT((((num_desc) <= MIC_MAX_NUM_DESC_PER_RING) && + (ALIGN((num_desc - (L1_CACHE_BYTES - 1)), L1_CACHE_BYTES) == num_desc)), + "num_desc > max or not multiple of cache line num 0x%x", num_desc); + + md_mic_dma_enable_chan(dma_dev, chan_num, false); + + drar_hi = size_to_drar_hi_size(num_desc); + + if (MIC_DMA_CHAN_HOST_OWNED == chan->owner) { + drar_hi |= SBOX_DRARHI_SYS_MASK; + drar_hi |= addr_to_drar_hi_smpt_bits(desc_ring_phys_addr); + } + drar_lo = (uint32_t)desc_ring_phys_addr; + drar_hi |= physaddr_to_drarhi_ba(desc_ring_phys_addr); + md_mic_dma_write_mmio(dma_dev, chan_num, REG_DRAR_LO, drar_lo); + md_mic_dma_write_mmio(dma_dev, chan_num, REG_DRAR_HI, drar_hi); + chan->num_desc_in_ring = num_desc; + pr_debug("md_mic_dma_chan_set_desc_ring addr=0x%llx num=%d drar_hi.bits.pageno 0x%x\n", + desc_ring_phys_addr, num_desc, + (uint32_t)(desc_ring_phys_addr >> MIC_SYSTEM_PAGE_SHIFT)); + chan->cached_tail = md_mic_dma_chan_read_tail(dma_dev, chan); + + md_mic_dma_enable_chan(dma_dev, chan_num, true); +} + +uint32_t md_mic_dma_chan_read_head(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + CHECK_CHAN(chan); + + return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DHPR); +} + +uint32_t md_mic_dma_chan_read_tail(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + CHECK_CHAN(chan); + + return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DTPR); +} + +/** + * md_mic_dma_chan_intr_pending - Reads interrupt status to figure out + * if an interrupt is pending. + * @chan: The DMA channel handle. + */ +bool md_mic_dma_chan_intr_pending(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + uint32_t dcar; + CHECK_CHAN(chan); + + dcar = md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCAR); + return (dcar >> 26) & 0x1; +} + +/** + * md_mic_dma_chan_mask_intr - Mask or disable interrupts + * @chan: The DMA channel handle + * + * Masking interrupts will also acknowledge any pending + * interrupts on the channel. + */ +void md_mic_dma_chan_mask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + uint32_t dcar; + uint32_t chan_num; + CHECK_CHAN(chan); + chan_num = chan->ch_num; + + dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR); + + if (MIC_DMA_CHAN_MIC_OWNED == chan->owner) + dcar |= SBOX_DCAR_IM0; + else + dcar |= SBOX_DCAR_IM1; + + md_mic_dma_write_mmio(dma_dev, chan_num, REG_DCAR, dcar); + /* + * This read is completed only after previous write is completed. + * It guarantees that, interrupts has been acknowledged to SBOX DMA + * This read forces previous write to be commited in memory. + * This is the actual fix for HSD# 3497216 based on theoretical + * hypothesis that somehow previous write is not truly completed + * since for writes as long as transactions are accepted by SBOX + * ( not necessarily commited in memory) those write transactions + * reported as complete. 
+ */ + dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR); +} + +/** + * md_mic_dma_chan_unmask_intr - Unmask or enable interrupts + * @chan: The DMA channel handle + */ +void md_mic_dma_chan_unmask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + uint32_t dcar; + uint32_t chan_num; + CHECK_CHAN(chan); + chan_num = chan->ch_num; + + dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR); + + if (MIC_DMA_CHAN_MIC_OWNED == chan->owner) + dcar &= ~SBOX_DCAR_IM0; + else + dcar &= ~SBOX_DCAR_IM1; + + md_mic_dma_write_mmio(dma_dev, chan_num, REG_DCAR, dcar); + /* + * This read is completed only after previous write is completed. + * It guarantees that, interrupts has been acknowledged to SBOX DMA + * This read forces previous write to be commited in memory. + * This is the actual fix for HSD# 3497216 based on theoretical + * hypothesis that somehow previous write is not truly completed + * since for writes as long as transactions are accepted by SBOX + * ( not necessarily commited in memory) those write transactions + * reported as complete. + */ + dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR); +} + +/** + * md_mic_dma_chan_get_desc_ring_phys - Compute the value of the descriptor ring + * base physical address from the descriptor ring attributes register. + * @dma_dev: DMA device. + * @chan: The DMA channel handle + */ +phys_addr_t +md_mic_dma_chan_get_desc_ring_phys(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + phys_addr_t phys, phys_hi; + uint32_t phys_lo, chan_num, drar_hi; + + CHECK_CHAN(chan); + chan_num = chan->ch_num; + phys_lo = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO); + drar_hi = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI); + phys_hi = drar_hi_to_ba_bits(drar_hi); + phys_hi |= drar_hi_to_smpt(drar_hi, chan_num) << 2; + + phys = phys_lo | (phys_hi << 32); + return phys; +} + +/** + * md_mic_dma_chan_get_dstatwb_phys - Compute the value of the DSTAT write back + * physical address. + * @dma_dev: DMA device. + * @chan: The DMA channel handle + */ +phys_addr_t md_mic_dma_chan_get_dstatwb_phys(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan) +{ + uint32_t reg, chan_num; + phys_addr_t phys; + + CHECK_CHAN(chan); + chan_num = chan->ch_num; + reg = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_HI); + phys = reg; + reg = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_LO); + + phys = phys << 32 | reg; + return phys; +} + +/** + * md_mic_dma_prep_nop_desc - Prepares a NOP descriptor. + * @desc: Descriptor to be populated. + * + * This descriptor is used to pad a cacheline if the previous + * descriptor does not end on a cacheline boundary. 
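+ *
+ * Padding sketch (ring, next_write and num_desc stand for the caller's
+ * descriptor ring, write index and ring size; assumes the descriptor size
+ * evenly divides L1_CACHE_BYTES):
+ *
+ *	while ((next_write * sizeof(union md_mic_dma_desc)) % L1_CACHE_BYTES) {
+ *		md_mic_dma_prep_nop_desc(&ring[next_write]);
+ *		next_write = (next_write + 1) % num_desc;
+ *	}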
+ */ +void md_mic_dma_prep_nop_desc(union md_mic_dma_desc *desc) +{ + KASSERT((desc != 0), ("NULL desc")); + + desc->qwords.qw0 = 0; + desc->qwords.qw1 = 0; + desc->desc.nop.type = 0; +} + +/* Only Debug Code Below */ + +/** + * md_mic_dma_print_debug - Print channel debug information + * @chan: The DMA channel handle + * @sbuf: Print to an sbuf if not NULL else prints to console + */ +void md_mic_dma_print_debug(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan) +{ + uint32_t dcr; + uint32_t dcar; + uint32_t dtpr; + uint32_t dhpr; + uint32_t drar_lo; + uint32_t drar_hi; + uint32_t dstat; + uint32_t chan_num = chan->ch_num; + + dcr = mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR); + dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR); + dtpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR); + dhpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR); + drar_lo = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO); + drar_hi = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI); + dstat = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTAT); + pr_debug(PR_PREFIX "Chan_Num 0x%x DCR 0x%x DCAR 0x%x DTPR 0x%x" + "DHPR 0x%x DRAR_HI 0x%x DRAR_LO 0x%x DSTAT 0x%x\n", + chan_num, dcr, dcar, dtpr, dhpr, drar_hi, drar_lo, dstat); + pr_debug(PR_PREFIX "DCR 0x%x\n", dcr); +} diff --git a/dma/mic_sbox_md.c b/dma/mic_sbox_md.c new file mode 100644 index 0000000..98118c2 --- /dev/null +++ b/dma/mic_sbox_md.c @@ -0,0 +1,57 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#include + +#include +#include + +#define PR_PREFIX "SBOX:" + +extern void *mic_sbox_mmio_va; + +void *mic_sbox_md_init(void) +{ + return mic_sbox_mmio_va; +} + +void mic_sbox_md_uninit(void *mic_sbox_mmio_va) +{ + iounmap(mic_sbox_mmio_va); + pr_debug(PR_PREFIX "Uninitialized sbox md\n"); +} + diff --git a/host/Makefile b/host/Makefile new file mode 100644 index 0000000..52e6745 --- /dev/null +++ b/host/Makefile @@ -0,0 +1,47 @@ +# +# Manycore Throughput Linux Driver +# Copyright (c) 2010, Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. +# +# + +KERNELDIR = /lib/modules/$(shell uname -r)/build +KBUILD := $(MAKE) -C $(KERNELDIR) M=$(CURDIR) +EXTRADIR = $(shell readlink -f $(KERNELDIR)) + +ifneq ($(DESTDIR),) +INSTALL_MOD_PATH = $(DESTDIR) +endif + +.PHONY: default modules install modules_install clean + +default: modules +install: modules_install udev + +modules: + +$(KBUILD) $@ + +modules_install: + +$(KBUILD) INSTALL_MOD_PATH=$(DESTDIR) modules_install + mkdir -p $(DESTDIR)$(EXTRADIR)/include + install -m644 include/scif.h $(DESTDIR)$(EXTRADIR)/include + install -m644 Module.symvers $(DESTDIR)$(EXTRADIR)/Module.symvers.mic + +udev: udev-scif.rules + mkdir -p $(DESTDIR)/etc/udev/rules.d + cp $< $(DESTDIR)/etc/udev/rules.d/50-$< + +clean: + +$(KBUILD) clean diff --git a/host/acptboot.c b/host/acptboot.c new file mode 100644 index 0000000..be56f8d --- /dev/null +++ b/host/acptboot.c @@ -0,0 +1,194 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. 
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define ACPT_BACKLOG 120 +#define ACPT_POLL_MS 2000 + +#define ACPT_BOOTED 1 +#define ACPT_BOOT_ACK 2 +#define ACPT_NACK_VERSION 3 +#define ACPT_REQUEST_TIME 4 +#define ACPT_TIME_DATA 5 + +#define ACPT_VERSION 1 + +static acptboot_data_t *acptboot_data; + + +void acptboot_getconn(struct work_struct *work) +{ + mic_ctx_t *node_ctx; + struct scif_portID data; + scif_epd_t conn_epd; + struct timespec tod; + int proto; + int version; + int err; + + if ((err = scif_accept(acptboot_data->listen_epd, &data, &conn_epd, + SCIF_ACCEPT_SYNC))) { + pr_debug("ACPTBOOT: scif_accept_failed %d\n", err); + return; + + //goto requeue_accept; + } + + if (!data.node) { + printk(KERN_ERR "ACPTBOOT: connect received from invalid dev %d\n", + -EINVAL); + goto close_epd; + } + + if ((err = scif_recv(conn_epd, &version, sizeof(version), SCIF_RECV_BLOCK)) != sizeof(version)) { + printk(KERN_ERR "ACPTBOOT: failed to recieve version number err %d\n", err); + goto close_epd; + } + + if ((err = scif_recv(conn_epd, &proto, sizeof(proto), SCIF_RECV_BLOCK)) != sizeof(proto)) { + printk(KERN_ERR "ACPTBOOT: failed to recieve proto id %d\n", err); + goto close_epd; + } + + switch (proto) { + case ACPT_BOOTED: + node_ctx = get_per_dev_ctx(data.node - 1); + mic_setstate(node_ctx, MIC_ONLINE); + node_ctx->boot_count++; + + proto = ACPT_BOOT_ACK; + scif_send(conn_epd, &proto, sizeof(proto), SCIF_SEND_BLOCK); + break; + + case ACPT_REQUEST_TIME: + getnstimeofday(&tod); + proto = ACPT_TIME_DATA; + scif_send(conn_epd, &proto, sizeof(proto), SCIF_SEND_BLOCK); + scif_send(conn_epd, &tod, sizeof(tod), SCIF_SEND_BLOCK); + break; + } + +close_epd: + if ((err = scif_close(conn_epd))) + printk(KERN_ERR "ACPTBOOT: scif_close failed %d\n", err); + +//requeue_accept: + queue_work(acptboot_data->acptbootwq, &acptboot_data->acptbootwork); +} + +void acptboot_exit(void) +{ + int err = 0; + if (acptboot_data) { + if (acptboot_data->listen_epd) + if ((err = scif_close(acptboot_data->listen_epd)) < 0) + pr_debug("scif_close failed %d\n", err); + destroy_workqueue(acptboot_data->acptbootwq); + + kfree(acptboot_data); + } +} + +int +acptboot_init(void) +{ + int err, ret; + + acptboot_data = (acptboot_data_t *)kzalloc(sizeof(*acptboot_data), GFP_KERNEL); + + if (!acptboot_data) { + printk(KERN_ERR "ACPTBOOT: memory allocation failure\n"); + return -ENOMEM; + } + + acptboot_data->listen_epd = scif_open(); + + if (!acptboot_data->listen_epd) { + printk(KERN_ERR "ACPTBOOT: scif_open() failed!\n"); + err = -ENOMEM; + goto error; + } + + err = scif_bind(acptboot_data->listen_epd, MIC_NOTIFY); + if (err < 0) { + pr_debug("ACPTBOOT: scif_bind() failed! %d\n", err); + goto error; + } + + acptboot_data->acptboot_pn = err; + + err = scif_listen(acptboot_data->listen_epd, ACPT_BACKLOG); + if (err < 0) { + pr_debug("scif_listen() failed! 
%d\n", err); + goto error; + + } + + pr_debug("ACPT endpoint listening port %d\n", + acptboot_data->acptboot_pn); + + // Create workqueue + acptboot_data->acptbootwq = __mic_create_singlethread_workqueue( + "ACPTBOOT_WQ"); + + if (!acptboot_data->acptbootwq) { + printk(KERN_ERR "%s %d wq creation failed!\n", __func__, __LINE__); + goto error; + } + + INIT_WORK(&acptboot_data->acptbootwork, acptboot_getconn); + queue_work(acptboot_data->acptbootwq, + &acptboot_data->acptbootwork); + return 0; + +error: + + if (acptboot_data->listen_epd) + if ((ret = scif_close(acptboot_data->listen_epd)) < 0) + pr_debug("ACPTBOOT: scif_close() failed! %d\n", ret); + + kfree(acptboot_data); + + return err; +} + diff --git a/host/ioctl.c b/host/ioctl.c new file mode 100644 index 0000000..f4a8296 --- /dev/null +++ b/host/ioctl.c @@ -0,0 +1,186 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +/* contains code to handle MIC IO control codes */ + +#include "mic_common.h" + +static int do_send_flash_cmd(mic_ctx_t *mic_ctx, struct ctrlioctl_flashcmd *args); +static int get_card_mem(mic_ctx_t *mic_ctx, struct ctrlioctl_cardmemcpy *args); + +/* + DESCRIPTION:: Gets the opcode from the input buffer and call appropriate method + PARAMETERS:: + [in]mic_ctx_t *mic_ctx - pointer to the mic private context + [in]void *in_buffer - input buffer containing opcode + ioctl arguments, + RETURN_VALUE:: 0 if successful, non-zero if failure +*/ +int +adapter_do_ioctl(uint32_t cmd, uint64_t arg) +{ + int status = 0; + mic_ctx_t *mic_ctx = NULL; + + void __user *argp = (void __user *)arg; + switch (cmd) { + + case IOCTL_FLASHCMD: + { + struct ctrlioctl_flashcmd args = {0}; + + if (copy_from_user(&args, argp, sizeof(struct ctrlioctl_flashcmd))) { + return -EFAULT; + } + + if (args.brdnum >= (uint32_t)mic_data.dd_numdevs) { + printk(KERN_ERR "IOCTL error: given board num is invalid\n"); + return -EINVAL; + } + + mic_ctx = get_per_dev_ctx(args.brdnum); + if (!mic_ctx) { + printk(KERN_ERR "IOCTL error: null mic context\n"); + return -ENODEV; + } + + /* Make sure we are running in flash mode */ + if (mic_ctx->mode != MODE_FLASH || mic_ctx->state != MIC_ONLINE) { + printk(KERN_ERR "%s Card is not online in flash mode or online state\n", __func__); + return -EPERM; + } + + if (mic_ctx->bi_family != FAMILY_KNC) { + printk(KERN_ERR "%s IOCTL_FLASHCMD not supported for non KNC family cards\n", __func__); + return -EPERM; + } + + status = do_send_flash_cmd(mic_ctx, &args); + if (status) { + printk(KERN_ERR "IOCTL error: failed to complete IOCTL for bdnum %d\n", args.brdnum); + return status; + } + + if (copy_to_user(argp, &args, sizeof(struct ctrlioctl_flashcmd))) { + return -EFAULT; + } + + break; + } + + case IOCTL_CARDMEMCPY: + { + struct ctrlioctl_cardmemcpy args = {0}; + + if (copy_from_user(&args, argp, sizeof(struct ctrlioctl_cardmemcpy))) { + return -EFAULT; + } + + if (args.brdnum >= (uint32_t)mic_data.dd_numdevs) { + printk(KERN_ERR "IOCTL error: given board num is invalid\n"); + return -EINVAL; + } + mic_ctx = get_per_dev_ctx(args.brdnum); + if (!mic_ctx) { + printk(KERN_ERR "IOCTL error: null mic context\n"); + return -ENODEV; + } + + if(mic_ctx->state != MIC_ONLINE || mic_ctx->mode != MODE_LINUX) { + status = -EPERM; + printk("Error ! 
Card not in linux mode or online state!\n"); + return status; + } + + status = get_card_mem(mic_ctx, &args); + if (status) { + printk(KERN_ERR "IOCTL error: failed to complete IOCTL for bdnum %d\n", args.brdnum); + return status; + } + + ; + break; + } + + default: + printk("Invalid IOCTL\n"); + status = -EINVAL; + break; + } + + return status; +} + +int +do_send_flash_cmd(mic_ctx_t *mic_ctx, struct ctrlioctl_flashcmd *args) +{ + int status = 0; + + if(!capable(CAP_SYS_ADMIN)) { + printk(KERN_ERR "Cannot execute unless sysadmin\n"); + return -EACCES; + } + + pr_debug("%s\n IN:: brdnum = %d, type = %x, data = %p, len = %x\n", + __func__, args->brdnum, args->type, args->data, args->len); + + status = send_flash_cmd(mic_ctx, args->type, args->data, args->len); + + return status; +} + + +int +get_card_mem(mic_ctx_t *mic_ctx, struct ctrlioctl_cardmemcpy *args) +{ + int32_t status = 0; + + if(!capable(CAP_SYS_ADMIN)) { + printk(KERN_ERR "Cannot execute unless sysadmin\n"); + return -EACCES; + } + + if (args->dest == NULL) { + status = EINVAL; + goto exit; + } + pr_debug("%s\n IN:: brdnum = %d, start = %qx, size = %qx, dest = %p\n", + __func__, args->brdnum, args->start, args->size, args->dest); + + status = get_cardside_mem(mic_ctx, args->start, args->size, args->dest); + +exit: + return status; + +} diff --git a/host/linpm.c b/host/linpm.c new file mode 100644 index 0000000..43d2e9a --- /dev/null +++ b/host/linpm.c @@ -0,0 +1,232 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micint.h" +#include "mic/micveth.h" + +/* + * Retrieves the device context for a particular device + */ +mic_ctx_t * +get_device_context(struct pci_dev *dev) { + int i = 0; + mic_ctx_t *mic_ctx = NULL; + for (i = (mic_data.dd_numdevs -1); i >= 0; i--) { + mic_ctx = &mic_data.dd_bi[i]->bi_ctx; + if (mic_ctx!= NULL) { + //TODO: Is bus number enough to uniquely identify a + //pci_dev struct in mic_ctx? + if (mic_ctx->bi_pdev->bus->number == + dev->bus->number) { + + //Bus number matches + break; + } + } + } + return mic_ctx; +} + +/* + * Notifier callback with event specifying the actual power management + * event to have happened.Our events of Interest right now are: + * PM_HIBERNATION_PREPARE and PM_POST_RESTORE + */ +int +micpm_notifier_block(struct notifier_block *nb, unsigned long event, void *dummy) +{ + int i; + mic_ctx_t *mic_ctx; + switch (event) { + case PM_POST_RESTORE: + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + pr_debug("%s Calling MIC resume\n", __func__); + for(i = 0; i < mic_data.dd_numdevs; i++) { + mic_ctx = get_per_dev_ctx(i); + if (mic_ctx && mic_ctx->micpm_ctx.resume.wq) { + queue_work(mic_ctx->micpm_ctx.resume.wq, + &mic_ctx->micpm_ctx.resume.work); + } + } + break; + default: + pr_debug("%s: Unrecognized event %lu\n", __func__, event); + break; + } +return 0; +} + +/* + * Called by the OS when going into suspend. + * Puts our device to D3Cold. + */ +int +micpm_suspend(struct device *pdev) +{ + struct pci_dev *pci_dev = to_pci_dev(pdev); + mic_ctx_t *mic_ctx = get_device_context(pci_dev); + + if (!pci_dev) { + pr_debug("Not initialized, aborting suspend.\n"); + return -ENODEV; + } + + pr_debug("pm_stop_device called for dev: %d:%d:%d\n", pci_dev->bus->number, + PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn)); + pm_stop_device(mic_ctx); + pci_save_state(pci_dev); + pci_disable_device(pci_dev); + if (pci_set_power_state(pci_dev, PCI_D3cold)) + pr_debug("Not able to set to D3Cold state\n"); + pr_debug("Returning from mic_suspend\n"); + return 0; +} + +/* + * Called by the OS when coming out of suspend. + * Puts our device to D0 and starts driver components. + */ +int +micpm_resume(struct device *pdev) +{ + struct pci_dev *pci_dev = to_pci_dev(pdev); + if (!pci_dev) { + pr_debug("Device not initialized. aborting resume"); + return -ENODEV; + } + + pci_set_power_state(pci_dev, PCI_D0); + if (pci_enable_device(pci_dev)) { + pr_debug("Failed to wake-up device.\n"); + return -EIO; + } + pci_restore_state(pci_dev); + pci_set_master(pci_dev); + pr_debug("pm_start_device called for dev: %d:%d:%d\n", pci_dev->bus->number, + PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn)); + return 0; +} + +int micpm_suspend_noirq(struct device *pdev) { + + struct pci_dev *pci_dev = to_pci_dev(pdev); + mic_ctx_t *mic_ctx; + bd_info_t *bd_info; + + if (!pci_dev) { + pr_debug("Device not initialized. aborting suspend"); + return -ENODEV; + } + + mic_ctx = get_device_context(pci_dev); + if(mic_ctx) { + bd_info = mic_ctx->bd_info; + /* MSI interrupts do not work on resume. + * See http://www.digipedia.pl/usenet/thread/18815/2513/ + * for a discussion on this issue. 
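+ * As a workaround, the MSI-X vector registered at probe time is freed
+ * here in the noirq suspend phase and requested again with request_irq()
+ * in micpm_resume_noirq() below.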
+ */ + if (mic_ctx->msie) { + free_irq(bd_info->bi_msix_entries[0].vector, &bd_info->bi_ctx); + } + } + return 0; +} + +int micpm_resume_noirq(struct device *pdev) { + + struct pci_dev *pci_dev = to_pci_dev(pdev); + mic_ctx_t *mic_ctx; + bd_info_t *bd_info; + int err; + + if (!pci_dev) { + pr_debug("Device not initialized. aborting resume"); + return -ENODEV; + } + mic_ctx = get_device_context(pci_dev); + if(mic_ctx) { + bd_info = mic_ctx->bd_info; + + /* MSI interrupts do not work on resume. + * See http://www.digipedia.pl/usenet/thread/18815/2513/ + * for a discussion on this issue. + */ + if (mic_ctx->msie) { + err = request_irq(bd_info->bi_msix_entries[0].vector, + mic_irq_isr, 0, "mic", mic_ctx); + if (err) { + pr_debug("%s: %d Error inititalizing MSI interrupts\n", + __func__, __LINE__); + return 0; + } + } + + } + return 0; +} + diff --git a/host/linpsmi.c b/host/linpsmi.c new file mode 100644 index 0000000..8c2780e --- /dev/null +++ b/host/linpsmi.c @@ -0,0 +1,152 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "micint.h" + +int mic_psmi_open(struct file *filp) +{ + bd_info_t *bd_info = mic_data.dd_bi[0]; + if (!bd_info->bi_ctx.bi_psmi.enabled) + return -EINVAL; + ((filp)->private_data) = &bd_info->bi_ctx; + return 0; +} + +extern int usagemode_param; + +ssize_t mic_psmi_read(struct file * filp, char __user *buf, + size_t count, loff_t *pos) +{ + ssize_t total_bytes = 0; + unsigned int pg_no, pg_off, bytes; + mic_ctx_t *mic_ctx = ((filp)->private_data); + struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi; + loff_t mem_size; + + if (!psmi_ctx->enabled) + return -EINVAL; + if (FAMILY_ABR == mic_ctx->bi_family && + USAGE_MODE_NORMAL != usagemode_param) + mem_size = MIC_APERTURE_SIZE; + else + mem_size = psmi_ctx->dma_mem_size; + if (*pos >= mem_size || count <= 0) + return 0; + if (*pos + count > mem_size) + count = mem_size - *pos; + /* read aperture memory */ + if (USAGE_MODE_NORMAL != usagemode_param) { + if (copy_to_user(buf, + mic_ctx->aper.va + *pos, count)) + return -EFAULT; + goto read_exit; + } + /* read host memory allocated for psmi handler */ + pg_no = *pos / MIC_PSMI_PAGE_SIZE; + pg_off = *pos % MIC_PSMI_PAGE_SIZE; + while (total_bytes < count) { + pci_dma_sync_single_for_cpu(mic_ctx->bi_pdev, + psmi_ctx->dma_tbl[pg_no + 1].pa, + MIC_PSMI_PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + bytes = MIC_PSMI_PAGE_SIZE - pg_off; + if (total_bytes + bytes > count) + bytes = count - total_bytes; + if (copy_to_user(buf, + (void *)psmi_ctx->va_tbl[pg_no].pa + pg_off, bytes)) + return -EFAULT; + total_bytes += bytes; + buf += bytes; + pg_no++; + /* Only the first page needs an offset */ + pg_off = 0; + } +read_exit: + *pos += count; + return count; +} + +static ssize_t show_mem_size(struct device *dev, + struct device_attribute *attr, char *buf) +{ + mic_ctx_t *mic_ctx = dev_get_drvdata(dev); + struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi; + + return snprintf(buf, PAGE_SIZE, "%ld\n", + (unsigned long)psmi_ctx->dma_mem_size); +} +static DEVICE_ATTR(mem_size, S_IRUGO, show_mem_size, NULL); + +static struct attribute *psmi_attributes[] = { + &dev_attr_mem_size.attr, + NULL +}; + +struct attribute_group psmi_attr_group = { + .attrs = psmi_attributes +}; + +#if (defined(RHEL_RELEASE_CODE) && \ + (LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32))) || \ + LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34) +static ssize_t mic_psmi_read_ptes(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t pos, size_t size) +#else +static ssize_t mic_psmi_read_ptes(struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t pos, size_t size) +#endif +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct mic_psmi_ctx *psmi_ctx = + &((mic_ctx_t *)dev_get_drvdata(dev))->bi_psmi; + + if (pos >= psmi_ctx->dma_tbl_size || size <= 0) + return 0; + if (pos + size > psmi_ctx->dma_tbl_size) + size = psmi_ctx->dma_tbl_size - pos; + memcpy(buf, psmi_ctx->dma_tbl, size); + return size; +} + +struct bin_attribute mic_psmi_ptes_attr = { + .attr = { + .name = "psmi_ptes", + .mode = S_IRUSR + }, + .read = mic_psmi_read_ptes +}; + +extern bool mic_psmi_enable; +module_param_named(psmi, mic_psmi_enable, bool, S_IRUSR); +MODULE_PARM_DESC(psmi, "Enable/disable mic psmi"); diff --git a/host/linscif_host.c b/host/linscif_host.c new file mode 100644 index 0000000..233f8ea --- /dev/null +++ b/host/linscif_host.c @@ -0,0 +1,315 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic_common.h" +#include "mic/micscif_smpt.h" +#include "mic/micscif_nodeqp.h" +#include "mic/micscif_intr.h" +#include "mic/micscif_nm.h" +#include "micint.h" + +struct micscif_info ms_info; +struct micscif_dev scif_dev[MAX_BOARD_SUPPORTED + 1]; + +bool mic_watchdog_enable = 1; +bool mic_watchdog_auto_reboot = 1; +bool mic_crash_dump_enabled = 1; + +int +micscif_init(void) +{ + int err; + ms_info.mi_nodeid = 0; // Host is node 0 + ms_info.mi_maxid = 0; // Host is at start the max card ID + ms_info.mi_total = 1; // Host will know about this many MIC cards + ms_info.mi_mask = 1; // first bit in the mask is the host node + + mutex_init (&ms_info.mi_conflock); + spin_lock_init(&ms_info.mi_eplock); + spin_lock_init(&ms_info.mi_connlock); + spin_lock_init(&ms_info.mi_rmalock); + mutex_init (&ms_info.mi_fencelock); + mutex_init (&ms_info.mi_event_cblock); + spin_lock_init(&ms_info.mi_nb_connect_lock); + INIT_LIST_HEAD(&ms_info.mi_uaccept); + INIT_LIST_HEAD(&ms_info.mi_listen); + INIT_LIST_HEAD(&ms_info.mi_zombie); + INIT_LIST_HEAD(&ms_info.mi_connected); + INIT_LIST_HEAD(&ms_info.mi_disconnected); + INIT_LIST_HEAD(&ms_info.mi_rma); + INIT_LIST_HEAD(&ms_info.mi_rma_tc); +#ifdef CONFIG_MMU_NOTIFIER + INIT_LIST_HEAD(&ms_info.mi_mmu_notif_cleanup); +#endif + INIT_LIST_HEAD(&ms_info.mi_fence); + INIT_LIST_HEAD(&ms_info.mi_event_cb); + INIT_LIST_HEAD(&ms_info.mi_nb_connect_list); + ms_info.mi_watchdog_to = DEFAULT_WATCHDOG_TO; +#ifdef MIC_IS_EMULATION + ms_info.mi_watchdog_enabled = 0; + ms_info.mi_watchdog_auto_reboot = 0; +#else + ms_info.mi_watchdog_enabled = mic_watchdog_enable; + ms_info.mi_watchdog_auto_reboot = mic_watchdog_auto_reboot; +#endif +#ifdef RMA_DEBUG + ms_info.rma_unaligned_cpu_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + ms_info.rma_alloc_cnt = 
(atomic_long_t) ATOMIC_LONG_INIT(0); + ms_info.rma_pin_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); +#ifdef CONFIG_MMU_NOTIFIER + ms_info.mmu_notif_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); +#endif +#endif + ms_info.mi_misc_wq = __mic_create_singlethread_workqueue("SCIF_MISC"); + if (!ms_info.mi_misc_wq) { + err = -ENOMEM; + goto wq_error; + } + INIT_WORK(&ms_info.mi_misc_work, micscif_misc_handler); +#ifdef CONFIG_MMU_NOTIFIER + ms_info.mi_mmu_notif_wq = create_singlethread_workqueue("SCIF_MMU"); + if (!ms_info.mi_mmu_notif_wq) { + err = -ENOMEM; + goto wq_error; + } + INIT_WORK(&ms_info.mi_mmu_notif_work, micscif_mmu_notif_handler); +#endif + ms_info.mi_conn_wq = __mic_create_singlethread_workqueue("SCIF_NB_CONN"); + if (!ms_info.mi_conn_wq) { + err = -ENOMEM; + goto wq_error; + } + INIT_WORK(&ms_info.mi_conn_work, micscif_conn_handler); + + //pr_debug("micscif_create(%d) \n", num_bds); + + // Setup information for self aka loopback. + scif_dev[SCIF_HOST_NODE].sd_node = SCIF_HOST_NODE; + micscif_setup_loopback_qp(&scif_dev[SCIF_HOST_NODE]); + scif_dev[SCIF_HOST_NODE].sd_state = SCIFDEV_RUNNING; + scif_dev[SCIF_HOST_NODE].scif_ref_cnt = + (atomic_long_t) ATOMIC_LONG_INIT(0); + scif_dev[SCIF_HOST_NODE].scif_map_ref_cnt = 0; + init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_wq); + init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_mmap_wq); + mutex_init (&scif_dev[SCIF_HOST_NODE].sd_lock); + ms_info.mi_rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT; + ms_info.en_msg_log = 0; + scif_proc_init(); + return 0; +wq_error: + if (ms_info.mi_misc_wq) + destroy_workqueue(ms_info.mi_misc_wq); +#ifdef CONFIG_MMU_NOTIFIER + if (ms_info.mi_mmu_notif_wq) + destroy_workqueue(ms_info.mi_mmu_notif_wq); +#endif + if (ms_info.mi_conn_wq) + destroy_workqueue(ms_info.mi_conn_wq); + return err; +} + +void +micscif_destroy(void) +{ + struct list_head *pos, *unused; + struct scif_callback *temp; +#ifdef CONFIG_MMU_NOTIFIER + destroy_workqueue(ms_info.mi_mmu_notif_wq); +#endif + destroy_workqueue(ms_info.mi_misc_wq); + destroy_workqueue(ms_info.mi_conn_wq); + micscif_destroy_loopback_qp(&scif_dev[SCIF_HOST_NODE]); + scif_proc_cleanup(); + mic_debug_uninit(); + list_for_each_safe(pos, unused, &ms_info.mi_event_cb) { + temp = list_entry(pos, struct scif_callback, list_member); + list_del(pos); + kfree(temp); + } + mutex_destroy(&ms_info.mi_event_cblock); +} + +int +micscif_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell) +{ + struct micscif_dev *dev = &scif_dev[mic_ctx->bi_id + 1]; + + queue_work(dev->sd_intr_wq, &dev->sd_intr_bh); + return 0; +} + +int micscif_setup_host_qp(mic_ctx_t *mic_ctx, struct micscif_dev *scifdev); + +void +micscif_probe(mic_ctx_t *mic_ctx) +{ + struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1]; + + // The host needs to keep track of scif_dev interfaces for all boards in + // the system. Host is node zero for MIC board 0 is SCIF node 1, etc. + // This will need to become more dynamic if hot plug is supported + + scifdev->sd_node = mic_ctx->bi_id + 1; + scifdev->sd_state = SCIFDEV_STOPPED; + scifdev->mm_sbox = mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS; + + /* This workqueue thread will handle all card->host interrupt processing. 
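+ * The doorbell 0 handler registered at the end of this function,
+ * micscif_host_doorbell_intr_handler(), simply queues sd_intr_bh on
+ * this workqueue; all of the actual card->host message processing is
+ * done from the work item.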
*/ + micscif_setup_interrupts(scifdev); + + init_waitqueue_head(&scifdev->sd_mmap_wq); + init_waitqueue_head(&scifdev->sd_wq); + mutex_init (&scifdev->sd_lock); + INIT_LIST_HEAD(&scifdev->sd_p2p); + + init_waitqueue_head(&scifdev->sd_watchdog_wq); + snprintf(scifdev->sd_ln_wqname, sizeof(scifdev->sd_intr_wqname), + "SCIF LOSTNODE %d", scifdev->sd_node); + if (!(scifdev->sd_ln_wq = + __mic_create_singlethread_workqueue(scifdev->sd_ln_wqname))) + printk(KERN_ERR "%s %d wq creation failed\n", __func__, __LINE__); + INIT_DELAYED_WORK(&scifdev->sd_watchdog_work, micscif_watchdog_handler); + /* + * Register function for doorbell 0 which will + * basically kick off the workqueue. + */ + mic_reg_irqhandler(mic_ctx, 0, "SCIF DoorBell 0", + micscif_host_doorbell_intr_handler); +} + +void +micscif_start(mic_ctx_t *mic_ctx) +{ + struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1]; + + scifdev->scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + scifdev->scif_map_ref_cnt = 0; + + scifdev->sd_state = SCIFDEV_INIT; + + + /* Sets up bd_bs and the host side of the queuepair */ + pr_debug("micscif_probe: host setting up qp \n"); + micscif_setup_host_qp(mic_ctx, scifdev); +} + +void micscif_removehost_respose(struct micscif_dev *scifdev, struct nodemsg *msg); + +void +micscif_stop(mic_ctx_t *mic_ctx) +{ + struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1]; + + if (scifdev->sd_state == SCIFDEV_STOPPED || scifdev->sd_state == SCIFDEV_INIT) + return; + + micscif_disconnect_node(scifdev->sd_node, NULL, DISCONN_TYPE_LOST_NODE); +} + +void +micscif_remove(mic_ctx_t *mic_ctx) +{ + struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1]; + struct micscif_qp *qp = &scifdev->qpairs[0]; + + destroy_workqueue(scifdev->sd_intr_wq); + scifdev->sd_intr_wq = 0; + cancel_delayed_work_sync(&scifdev->sd_watchdog_work); + if (scifdev->sd_ln_wq){ + destroy_workqueue(scifdev->sd_ln_wq); + scifdev->sd_ln_wq = 0; + } + mic_unreg_irqhandler(mic_ctx, 0x0, "SCIF DoorBell 0"); + + if (qp) { + mic_ctx_unmap_single(mic_ctx, qp->local_buf, qp->inbound_q.size); + mic_ctx_unmap_single(mic_ctx, qp->local_qp, sizeof(struct micscif_qp)); + kfree((void*)(qp->inbound_q.rb_base)); + } + + if (scifdev->qpairs) { + kfree(scifdev->qpairs); + scifdev->qpairs = NULL; + } +} + +int +scif_get_node_status(int node_id) +{ + struct micscif_dev *scifdev = &scif_dev[node_id]; + + return scifdev->sd_state; +} + +struct scatterlist * +micscif_p2p_mapsg(void *va, int page_size, int page_cnt) +{ + struct scatterlist *sg; + struct page *page; + int i; + + if ((sg = kcalloc(page_cnt, sizeof(struct scatterlist), GFP_KERNEL)) == NULL) { + return NULL; + } + + sg_init_table(sg, page_cnt); + + for (i = 0; i < page_cnt; i++) { +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)) + phys_addr_t phys; + phys = slow_virt_to_phys(va); + + if ((page = pfn_to_page(phys >> PAGE_SHIFT)) == NULL) + goto p2p_sg_err; +#else + if ((page = vmalloc_to_page(va)) == NULL) + goto p2p_sg_err; +#endif + sg_set_page(&sg[i], page, page_size, 0); + va += page_size; + } + + return sg; + +p2p_sg_err: + kfree(sg); + return NULL; +} + +void +micscif_p2p_freesg(struct scatterlist *sg) +{ + kfree(sg); +} diff --git a/host/linsysfs.c b/host/linsysfs.c new file mode 100644 index 0000000..70c261f --- /dev/null +++ b/host/linsysfs.c @@ -0,0 +1,766 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "micint.h" +#include +#include +#include +#include "mic/micveth.h" + +#define SBOX_SCR9_VENDORID(x) ((x) & 0xf) +#define SBOX_SCR9_REVISION(x) (((x) >> 4) & 0xf) +#define SBOX_SCR9_DENSITY(x) (((x) >> 8) & 0x3) +#define SBOX_SCR9_ECC(x) (((x) >> 29) & 0x1) + +bd_info_t * +dev_to_bdi(struct device *dev) +{ + struct list_head *pos, *tmpq; + bd_info_t *bdi = NULL; + list_for_each_safe(pos, tmpq, &mic_data.dd_bdlist) { + bdi = list_entry(pos, bd_info_t, bi_list); + if (bdi->bi_sysfsdev == dev) + break; + } + return bdi; +} + +/* + * sysfs entries in lieu of MMIO ioctl + */ + +struct device_attribute_sbox { + struct device_attribute devattr; + uint32_t offset, mask, shift; +}; + +uint32_t +bd_sbox_read(bd_info_t *bdi, uint32_t offset) +{ + uint32_t reg_value, ret; + ret = micpm_get_reference(&bdi->bi_ctx, true); + if (ret) + return -EAGAIN; + reg_value = SBOX_READ(bdi->bi_ctx.mmio.va, offset); + ret = micpm_put_reference(&bdi->bi_ctx); + if (ret) + return -EAGAIN; + + return reg_value; +} + +#define DEVICE_ATTR_SBOX(_name, _mode, _offset, _mask, _shift) \ +struct device_attribute_sbox sbox_attr_##_name = \ +{ __ATTR(_name, _mode, show_sbox_register, NULL), _offset, _mask, _shift } + +ssize_t +show_sbox_register(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct device_attribute_sbox *attr_sbox = container_of(attr, + struct device_attribute_sbox, devattr); + bd_info_t *bdi = dev_to_bdi(dev); + return snprintf(buf, PAGE_SIZE, "%x\n", + (bd_sbox_read(bdi, attr_sbox->offset) >> attr_sbox->shift) & attr_sbox->mask); +} + +#ifdef CONFIG_ML1OM +static DEVICE_ATTR_SBOX(corevoltage, S_IRUGO, SBOX_COREVOLT, MASK_COREVOLT, SHIFT_COREVOLT); +static DEVICE_ATTR_SBOX(corefrequency, S_IRUGO, SBOX_COREFREQ, MASK_COREFREQ, SHIFT_COREFREQ); +#endif 
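+/*
+ * Each DEVICE_ATTR_SBOX() entry publishes one bit-field of an SBOX register
+ * as a read-only sysfs attribute: show_sbox_register() returns
+ * ((SBOX_READ(mmio, offset) >> shift) & mask), and bd_sbox_read() brackets
+ * the MMIO read with micpm_get_reference()/micpm_put_reference() so the
+ * register is only touched while the card is accessible. From user space
+ * these attributes appear under the per-board device node, e.g.
+ * /sys/class/mic/mic0/memsize (path inferred from the "mic" class and
+ * "mic%d" device names created in host/linux.c).
+ */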
+static DEVICE_ATTR_SBOX(memoryvoltage, S_IRUGO, SBOX_MEMVOLT, MASK_MEMVOLT, SHIFT_MEMVOLT); +static DEVICE_ATTR_SBOX(memoryfrequency, S_IRUGO, SBOX_MEMORYFREQ, MASK_MEMORYFREQ, SHIFT_MEMORYFREQ); +static DEVICE_ATTR_SBOX(memsize, S_IRUGO, SBOX_SCRATCH0, MASK_MEMSIZE, SHIFT_MEMSIZE); +static DEVICE_ATTR_SBOX(flashversion, S_IRUGO, SBOX_SCRATCH7, MASK_FLASHVERSION, SHIFT_FLASHVERSION); + +/* HW Info */ +static DEVICE_ATTR_SBOX(substepping_data, S_IRUGO, SBOX_SCRATCH13, MASK_SUBSTEPPING_DATA, SHIFT_SUBSTEPPING_DATA); +static DEVICE_ATTR_SBOX(stepping_data, S_IRUGO, SBOX_SCRATCH13, MASK_STEPPING_DATA, SHIFT_STEPPING_DATA); +static DEVICE_ATTR_SBOX(model, S_IRUGO, SBOX_SCRATCH13, MASK_MODEL, SHIFT_MODEL); +static DEVICE_ATTR_SBOX(family_data, S_IRUGO, SBOX_SCRATCH13, MASK_FAMILY_DATA, SHIFT_FAMILY_DATA); +static DEVICE_ATTR_SBOX(processor, S_IRUGO, SBOX_SCRATCH13, MASK_PROCESSOR, SHIFT_PROCESSOR); +static DEVICE_ATTR_SBOX(platform, S_IRUGO, SBOX_SCRATCH13, MASK_PLATFORM, SHIFT_PLATFORM); +static DEVICE_ATTR_SBOX(extended_model, S_IRUGO, SBOX_SCRATCH13, MASK_EXTENDED_MODEL, SHIFT_EXTENDED_MODEL); +static DEVICE_ATTR_SBOX(extended_family, S_IRUGO, SBOX_SCRATCH13, MASK_EXTENDED_FAMILY, SHIFT_EXTENDED_FAMILY); +/* copy of fuse_configuration_revision [129:120] */ +static DEVICE_ATTR_SBOX(fuse_config_rev, S_IRUGO, SBOX_SCRATCH7, MASK_FUSE_CONFIG_REV, SHIFT_FUSE_CONFIG_REV); + +static DEVICE_ATTR_SBOX(active_cores, S_IRUGO, SBOX_SCRATCH4, MASK_ACTIVE_CORES, SHIFT_ACTIVE_CORES); +static DEVICE_ATTR_SBOX(fail_safe_offset, S_IRUSR, SBOX_FAIL_SAFE_OFFSET, MASK_FAIL_SAFE, SHIFT_FAIL_SAFE); + +ssize_t show_flash_update(struct device *dev, struct device_attribute *attr, char *buf) +{ + uint32_t value, ret; + bd_info_t *bdi = dev_to_bdi(dev); + ret = micpm_get_reference(&bdi->bi_ctx, true); + if (ret) + return -EAGAIN; + value = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF0X0); + ret = micpm_put_reference(&bdi->bi_ctx); + if (ret) + return -EAGAIN; + + return snprintf(buf, PAGE_SIZE, "%x\n", value); +} + +static ssize_t +set_flash_update(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long value; + int ret; + bd_info_t *bdi = dev_to_bdi(dev); +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,39) + ret = kstrtoul(buf, 0, &value); + if (ret) + return count; +#else + value = simple_strtoul(buf, NULL, 10); +#endif + ret = micpm_get_reference(&bdi->bi_ctx, true); + if (ret) + return -EAGAIN; + DBOX_WRITE((unsigned int)value, bdi->bi_ctx.mmio.va, DBOX_SWF0X0); + ret = micpm_put_reference(&bdi->bi_ctx); + if (ret) + return -EAGAIN; + + return count; + +} +static DEVICE_ATTR(flash_update, S_IRUSR | S_IWUSR, show_flash_update, set_flash_update); + +ssize_t +show_meminfo(struct device *dev, struct device_attribute *attr, char *buf) +{ + uint32_t value; + bd_info_t *bdi = dev_to_bdi(dev); + value = bd_sbox_read(bdi, SBOX_SCRATCH9); + return snprintf(buf, PAGE_SIZE, "vendor:%x,revision:%x" + ",density:%x,ecc_enable:%x", + SBOX_SCR9_VENDORID(value), SBOX_SCR9_REVISION(value), + SBOX_SCR9_DENSITY(value), SBOX_SCR9_ECC(value)); +} +static DEVICE_ATTR(meminfo, S_IRUGO, show_meminfo, NULL); + +ssize_t +show_sku(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + return snprintf(buf, PAGE_SIZE, "%s\n", bdi->bi_ctx.sku_name); +} +static DEVICE_ATTR(sku, S_IRUGO, show_sku, NULL); +/******************************************************************************/ + +static ssize_t +show_version(struct device *dev, struct device_attribute 
*attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", BUILD_VERSION); +} +static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); + +static ssize_t +show_p2p(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", mic_p2p_enable? "enable" : "disable"); +} +static DEVICE_ATTR(peer2peer, S_IRUGO, show_p2p, NULL); + +static struct attribute *host_attributes[] = { + &dev_attr_version.attr, + &dev_attr_peer2peer.attr, + NULL +}; + +struct attribute_group host_attr_group = { + .attrs = host_attributes +}; + +static ssize_t +show_family(struct device *dev, struct device_attribute *attr, char *buf) +{ + static const char KNF[] = "Knights Ferry"; + static const char KNC[] = "x100"; + bd_info_t *bdi = dev_to_bdi(dev); + const char *card = NULL; + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if (mic_ctx->bi_family == FAMILY_ABR) + card = KNF; + else + card = KNC; + + if (card) + return snprintf(buf, PAGE_SIZE, "%s\n", card); + else + return snprintf(buf, PAGE_SIZE, "Unknown\n"); +} +static DEVICE_ATTR(family, S_IRUGO, show_family, NULL); + +static ssize_t +show_stepping(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + char string[3]; + show_stepping_comm(&bdi->bi_ctx,string); + return snprintf(buf, PAGE_SIZE, "%s\n", string); +} +static DEVICE_ATTR(stepping, S_IRUGO, show_stepping, NULL); + +char *micstates[] = {"ready", "booting", "no response", "boot failed", + "online", "shutdown", "lost", "resetting", "reset failed", "invalid"}; +static ssize_t +show_micstate(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + + if (bdi->bi_ctx.state >= MIC_INVALID) + mic_setstate(&bdi->bi_ctx, MIC_INVALID); + return snprintf(buf, PAGE_SIZE, "%s", micstates[bdi->bi_ctx.state]); +} + +static int +match_micstate(const char **buf, const char *string) +{ + size_t len = strlen(string); + if (!strncmp(*buf, string, len)) { + *buf += len; + return true; + } + return false; +} + +static ssize_t +set_micstate(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + const char *default_mm_image = "/usr/share/mpss/boot/rasmm-kernel.from-eeprom.elf"; + + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + int mode; + size_t len; + char *arg, *arg2 = NULL; + int err = 0; + + /* parse the new state */ + if (match_micstate(&buf, "boot:linux:")) { + mode = MODE_LINUX; + } else if (match_micstate(&buf, "boot:elf:")) { + mode = MODE_ELF; + } else if (match_micstate(&buf, "boot:flash:")) { + mode = MODE_FLASH; + } else if (sysfs_streq(buf, "reset")) { + + mutex_lock(&mic_ctx->state_lock); + if (mic_ctx->state == MIC_READY) { + mutex_unlock(&mic_ctx->state_lock); + return -EINVAL; + } + + mutex_unlock(&mic_ctx->state_lock); + adapter_stop_device(mic_ctx, 1, 0); + return count; + } else if (sysfs_streq(buf, "reset:force")) { + int reattempt = !RESET_REATTEMPT; + + mutex_lock(&mic_ctx->state_lock); + if (mic_ctx->state == MIC_READY) + reattempt = RESET_REATTEMPT; + + mutex_unlock(&mic_ctx->state_lock); + adapter_stop_device(mic_ctx, 1, reattempt); + return count; + } else if (sysfs_streq(buf, "shutdown")) { + adapter_shutdown_device(mic_ctx); + return count; + } else { + return -EINVAL; + } + + /* we're booting something; a filename follows the colon */ + len = strlen(buf); + if (buf && buf[0] == '\n') { + len = 0; + } + if (!len && mode == MODE_FLASH) { + buf = default_mm_image; + len = strlen(buf); + } + if (!(arg = 
kmalloc(len + 1, GFP_KERNEL))) + return -ENOMEM; + memcpy(arg, buf, len + 1); + if (arg[len - 1] == '\n') + arg[len - 1] = '\0'; + + /* if booting linux, there may be yet another filename */ + if (mode == MODE_LINUX && (arg2 = strchr(arg, ':'))) + *arg2++ = '\0'; + + /* atomically change the state */ + mutex_lock(&mic_ctx->state_lock); + if (mic_ctx->state == MIC_READY) { + kfree(mic_ctx->image); + mic_ctx->mode = mode; + mic_ctx->image = arg; + mic_ctx->initramfs = arg2; + mic_setstate(mic_ctx, MIC_BOOT); + mutex_unlock(&mic_ctx->state_lock); + printk("mic image: %s\n", mic_ctx->image); + } else { + kfree(arg); + printk(KERN_ERR "Error! Card not in offline/ready state. Cannot change mode\n"); + mutex_unlock(&mic_ctx->state_lock); + return -EIO; + } + + /* actually perform the boot */ + if (mode == MODE_LINUX) { + mic_ctx->card_usage_mode = USAGE_MODE_NORMAL; + err = boot_linux_uos(mic_ctx, mic_ctx->image, mic_ctx->initramfs); + if (!err) + adapter_post_boot_device(mic_ctx); + } else { + err = boot_micdev_app(mic_ctx, mic_ctx->image); + } + + if (!err) + return count; + printk("booting failed %d\n", err); + return err; +} +static DEVICE_ATTR(state, S_IRUGO|S_IWUSR, show_micstate, set_micstate); + +char *micmodes[] = {"N/A", "linux", "elf", "flash"}; + +static ssize_t +show_mode(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + + if (bdi->bi_ctx.mode > MODE_FLASH) + bdi->bi_ctx.mode = MODE_NONE; + return snprintf(buf, PAGE_SIZE, "%s", micmodes[bdi->bi_ctx.mode]); +} +static DEVICE_ATTR(mode, S_IRUGO, show_mode, NULL); + +int scif_get_node_status(int node_id); +static char *scif_status_stings[] = {"not present", "initializing", "online", + "sleeping", "stopping", "stopped"}; +static ssize_t +show_scif_status(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + int scif_status; + + scif_status = scif_get_node_status(bdi->bi_ctx.bi_id + 1); + return snprintf(buf, PAGE_SIZE, "%s\n", scif_status_stings[scif_status]); +} +static DEVICE_ATTR(scif_status, S_IRUGO, show_scif_status, NULL); + +static ssize_t +show_image(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + return snprintf(buf, PAGE_SIZE, "%s", bdi->bi_ctx.image); +} +static DEVICE_ATTR(image, S_IRUGO, show_image, NULL); + +static ssize_t +show_initramfs(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + return snprintf(buf, PAGE_SIZE, "%s", bdi->bi_ctx.initramfs); +} +static DEVICE_ATTR(initramfs, S_IRUGO, show_initramfs, NULL); + +static ssize_t +show_postcode(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + uint32_t postcode; + + if ((micpm_get_reference(mic_ctx, true))) { + PM_DEBUG("get_reference failed. 
Node may be lost\n"); + return -EBUSY; + } + postcode = mic_getpostcode(mic_ctx); + if (postcode == 0xffffffff) { + printk("Invalid Postcode : %c%c\n", postcode & 0xff, (postcode >> 8) & 0xff); + micpm_put_reference(mic_ctx); + return -ENXIO; + } + + if (postcode == 0x0) { + printk("Postcode : %c%c\n", postcode & 0xff, (postcode >> 8) & 0xff); + micpm_put_reference(mic_ctx); + return -EAGAIN; + } + micpm_put_reference(mic_ctx); + return snprintf(buf, PAGE_SIZE, "%c%c", postcode & 0xff, (postcode >> 8) & 0xff); +} +static DEVICE_ATTR(post_code, S_IRUGO, show_postcode, NULL); + +static ssize_t +show_boot_count(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + return snprintf(buf, PAGE_SIZE, "%d", mic_ctx->boot_count); +} +static DEVICE_ATTR(boot_count, S_IRUGO, show_boot_count, NULL); + +static ssize_t +show_crash_count(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + return snprintf(buf, PAGE_SIZE, "%d", mic_ctx->crash_count); +} +static DEVICE_ATTR(crash_count, S_IRUGO, show_crash_count, NULL); + +static ssize_t +show_cmdline(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + char *cmdline = mic_ctx->sysfs_info.cmdline; + + if (cmdline == NULL) { + return snprintf(buf, PAGE_SIZE, "not set\n"); + } else { + return snprintf(buf, PAGE_SIZE, "%s\n", cmdline); + } + return 0; +} + +static ssize_t +set_cmdline(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if (mic_ctx->sysfs_info.cmdline != NULL) + kfree(mic_ctx->sysfs_info.cmdline); + + if ((mic_ctx->sysfs_info.cmdline = kmalloc(count + 100, GFP_ATOMIC)) == NULL) + return -ENOMEM; + strcpy(mic_ctx->sysfs_info.cmdline, buf); + + if (mic_ctx->sysfs_info.cmdline[count - 1] == '\n') + mic_ctx->sysfs_info.cmdline[count - 1] = '\0'; + + return count; +} +static DEVICE_ATTR(cmdline, S_IRUGO|S_IWUSR, show_cmdline, set_cmdline); + +static ssize_t +show_kernel_cmdline(struct device *dev, struct device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + char *cmdline = mic_ctx->sysfs_info.kernel_cmdline; + + if ((mic_ctx->state == MIC_READY) || (cmdline == NULL)) { + return snprintf(buf, PAGE_SIZE, "ready\n"); + } else { + return snprintf(buf, PAGE_SIZE, "%s\n", cmdline); + } +} +static DEVICE_ATTR(kernel_cmdline, S_IRUGO, show_kernel_cmdline, NULL); + +static ssize_t show_pc3_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + return snprintf(buf, PAGE_SIZE, "%d\n", mic_ctx->micpm_ctx.pc3_enabled); +} +static ssize_t +store_pc3_enabled(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + int i, ret; + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if(sscanf(buf, "%d", &i) != 1) { + ret = -EINVAL; + goto exit; + } + + if (i < 0) { + ret = -EINVAL; + goto exit; + } + + ret = micpm_update_pc3(mic_ctx, (i) ? 
true : false); + if (ret) + goto exit; + + pr_debug("pc3_enabled = %d\n", mic_ctx->micpm_ctx.pc3_enabled); + ret = count; +exit: + return ret; +} +static DEVICE_ATTR(pc3_enabled, S_IRUGO | S_IWUSR, show_pc3_enabled, store_pc3_enabled); + +static ssize_t show_pc6_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + return snprintf(buf, PAGE_SIZE, "%d\n", mic_ctx->micpm_ctx.pc6_enabled); +} + +static ssize_t +store_pc6_enabled(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + int i, ret; + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if(sscanf(buf, "%d", &i) != 1) { + ret = -EINVAL; + goto exit; + } + + if (i < 0) { + ret = -EINVAL; + goto exit; + } + + ret = micpm_update_pc6(mic_ctx, (i) ? true : false); + if (ret) + goto exit; + + pr_debug("pc6_enabled = %d\n", mic_ctx->micpm_ctx.pc6_enabled); + ret = count; +exit: + return ret; +} + +static DEVICE_ATTR(pc6_enabled, S_IRUGO | S_IWUSR, show_pc6_enabled, store_pc6_enabled); + +static ssize_t show_pc6_timeout(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + return snprintf(buf, PAGE_SIZE, "%u\n", mic_ctx->micpm_ctx.pc6_timeout); +} +static ssize_t +store_pc6_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + int i, ret; + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if(sscanf(buf, "%d", &i) != 1) { + ret = -EINVAL; + goto exit; + } + + if (i < 0) { + ret = -EINVAL; + goto exit; + } + + if (mic_ctx->micpm_ctx.pc6_timeout != i) { + mic_ctx->micpm_ctx.pc6_timeout = i; + } + pr_debug("pc6 timeout set to %us\n", mic_ctx->micpm_ctx.pc6_timeout); + ret = count; +exit: + return ret; +} +static DEVICE_ATTR(pc6_timeout, S_IRUGO | S_IWUSR, show_pc6_timeout, store_pc6_timeout); + +static ssize_t show_log_buf_addr(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + return snprintf(buf, PAGE_SIZE, "%p\n", mic_ctx->log_buf_addr); +} + +static ssize_t +store_log_buf_addr(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + int ret; + uint64_t addr; + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if (sscanf(buf, "%llx", &addr) != 1) { + ret = -EINVAL; + goto exit; + } + + mic_ctx->log_buf_addr = (void*)addr; + ret = count; +exit: + return ret; +} +static DEVICE_ATTR(log_buf_addr, S_IRUGO | S_IWUSR, show_log_buf_addr, store_log_buf_addr); + +static ssize_t show_log_buf_len(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + return snprintf(buf, PAGE_SIZE, "%p\n", mic_ctx->log_buf_len); +} + +static ssize_t +store_log_buf_len(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + int ret; + uint64_t addr; + bd_info_t *bdi = dev_to_bdi(dev); + mic_ctx_t *mic_ctx = &bdi->bi_ctx; + + if (sscanf(buf, "%llx", &addr) != 1) { + ret = -EINVAL; + goto exit; + } + + mic_ctx->log_buf_len = (int*)addr; + ret = count; +exit: + return ret; +} +static DEVICE_ATTR(log_buf_len, S_IRUGO | S_IWUSR, show_log_buf_len, store_log_buf_len); + +union serialnum { + uint32_t values[3]; + char serial[13]; +}; + +static ssize_t +show_serialnumber(struct device *dev, struct 
device_attribute *attr, char *buf) +{ + bd_info_t *bdi = dev_to_bdi(dev); + union serialnum serial; + uint32_t ret; + + memset(serial.serial, 0, sizeof(serial.serial)); + ret = micpm_get_reference(&bdi->bi_ctx, true); + if (ret) + return -EAGAIN; + serial.values[0] = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF1X0); + serial.values[1] = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF1X1); + serial.values[2] = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF1X2); + ret = micpm_put_reference(&bdi->bi_ctx); + if (ret) + return -EAGAIN; + return snprintf(buf, PAGE_SIZE, "%s", serial.serial); +} +static DEVICE_ATTR(serialnumber, S_IRUGO, show_serialnumber, NULL); + +static ssize_t +show_interface_version(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s", LINUX_INTERFACE_VERSION); +} +static DEVICE_ATTR(interface_version, S_IRUGO, show_interface_version, NULL); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \ + defined(RHEL_RELEASE_CODE) +extern ssize_t show_virtblk_file(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t store_virtblk_file(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); +static DEVICE_ATTR(virtblk_file, S_IRUGO | S_IWUSR, show_virtblk_file, store_virtblk_file); +#endif + +static struct attribute *bd_attributes[] = { + &dev_attr_family.attr, + &dev_attr_stepping.attr, + &dev_attr_state.attr, + &dev_attr_mode.attr, + &dev_attr_image.attr, + &dev_attr_initramfs.attr, + &dev_attr_post_code.attr, + &dev_attr_boot_count.attr, + &dev_attr_crash_count.attr, + &dev_attr_cmdline.attr, + &dev_attr_kernel_cmdline.attr, + &dev_attr_serialnumber.attr, + &dev_attr_scif_status.attr, + &dev_attr_meminfo.attr, + &dev_attr_pc3_enabled.attr, + &dev_attr_pc6_enabled.attr, + &dev_attr_pc6_timeout.attr, + &dev_attr_flash_update.attr, + &dev_attr_log_buf_addr.attr, + &dev_attr_log_buf_len.attr, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \ + defined(RHEL_RELEASE_CODE) + &dev_attr_virtblk_file.attr, +#endif + &dev_attr_sku.attr, + &dev_attr_interface_version.attr, + +#ifdef CONFIG_ML1OM + &sbox_attr_corevoltage.devattr.attr, + &sbox_attr_corefrequency.devattr.attr, +#endif + &sbox_attr_memoryvoltage.devattr.attr, + &sbox_attr_memoryfrequency.devattr.attr, + &sbox_attr_memsize.devattr.attr, + &sbox_attr_flashversion.devattr.attr, + &sbox_attr_substepping_data.devattr.attr, + &sbox_attr_stepping_data.devattr.attr, + &sbox_attr_model.devattr.attr, + &sbox_attr_family_data.devattr.attr, + &sbox_attr_processor.devattr.attr, + &sbox_attr_platform.devattr.attr, + &sbox_attr_extended_model.devattr.attr, + &sbox_attr_extended_family.devattr.attr, + &sbox_attr_fuse_config_rev.devattr.attr, + &sbox_attr_active_cores.devattr.attr, + &sbox_attr_fail_safe_offset.devattr.attr, + NULL +}; + +struct attribute_group bd_attr_group = { + .attrs = bd_attributes +}; diff --git a/host/linux.c b/host/linux.c new file mode 100644 index 0000000..fd0411a --- /dev/null +++ b/host/linux.c @@ -0,0 +1,796 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include + +#include "mic/micscif_kmem_cache.h" +#include "micint.h" +#include "mic_common.h" +#include "mic/io_interface.h" +#include "mic/mic_pm.h" +#include "mic/micveth.h" + +MODULE_LICENSE("GPL"); +MODULE_INFO(build_number, BUILD_NUMBER); +MODULE_INFO(build_bywhom, BUILD_BYWHOM); +MODULE_INFO(build_ondate, BUILD_ONDATE); +MODULE_INFO(build_scmver, BUILD_SCMVER); + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34)) +#include +#endif + +struct kmem_cache *unaligned_cache; +mic_lindata_t mic_lindata; + +module_param_named(ulimit, mic_ulimit_check, bool, 0600); +MODULE_PARM_DESC(ulimit, "SCIF ulimit check"); + +module_param_named(reg_cache, mic_reg_cache_enable, bool, 0600); +MODULE_PARM_DESC(reg_cache, "SCIF registration caching"); + +module_param_named(huge_page, mic_huge_page_enable, bool, 0600); +MODULE_PARM_DESC(huge_page, "SCIF Huge Page Support"); + +extern bool mic_p2p_enable; +module_param_named(p2p, mic_p2p_enable, bool, 0600); +MODULE_PARM_DESC(p2p, "SCIF peer-to-peer"); + +extern bool mic_p2p_proxy_enable; +module_param_named(p2p_proxy, mic_p2p_proxy_enable, bool, 0600); +MODULE_PARM_DESC(p2p_proxy, "SCIF peer-to-peer proxy DMA support"); + +extern bool mic_watchdog_enable; +module_param_named(watchdog, mic_watchdog_enable, bool, 0600); +MODULE_PARM_DESC(watchdog, "SCIF Watchdog"); + +extern bool mic_watchdog_auto_reboot; +module_param_named(watchdog_auto_reboot, mic_watchdog_auto_reboot, bool, 0600); +MODULE_PARM_DESC(watchdog_auto_reboot, "SCIF Watchdog auto reboot"); + +bool mic_msi_enable = 1; +module_param_named(msi, mic_msi_enable, bool, 0600); +MODULE_PARM_DESC(mic_msi_enable, "To enable MSIx in the driver."); + +int mic_pm_qos_cpu_dma_lat = -1; +module_param_named(pm_qos_cpu_dma_lat, mic_pm_qos_cpu_dma_lat, int, 0600); +MODULE_PARM_DESC(mic_pm_qos_cpu_dma_lat, "PM QoS CPU DMA latency in usecs."); + +extern int ramoops_count; +module_param_named(ramoops_count, ramoops_count, int, 0600); +MODULE_PARM_DESC(ramoops_count, "Maximum frame count for the ramoops driver."); + +extern bool mic_crash_dump_enabled; +module_param_named(crash_dump, mic_crash_dump_enabled, bool, 0600); +MODULE_PARM_DESC(mic_crash_dump_enabled, "MIC Crash Dump enabled."); + +#define GET_FILE_SIZE_FROM_INODE(fp) 
i_size_read((fp)->f_path.dentry->d_inode) + +int usagemode_param = 0; + +static int +mic_open(struct inode *inode, struct file *filp) +{ + dev_t dev = inode->i_rdev; + + switch (MINOR(dev)) { + case 0: + return 0; + case 1: + return scif_fdopen(filp); + case 2: + return mic_psmi_open(filp); + } + + return -EINVAL; +} + +static int +mic_release(struct inode *inode, struct file *filp) +{ + dev_t dev = inode->i_rdev; + int rc = 0; + + switch (MINOR(dev)) { + case 0: + if (filp->private_data == filp) { + // Fasync is set + rc = fasync_helper(-1, filp, 0, &mic_data.dd_fasync); + mic_data.dd_fasync = NULL; + } + return rc; + case 1: + return scif_fdclose(filp); + case 2: + // psmi access to device + return 0; + } + + return -EINVAL; +} + +extern ssize_t mic_psmi_read(struct file * filp, char __user *buf, + size_t count, loff_t *pos); +static ssize_t +mic_read(struct file * filp, char __user *buf, + size_t count, loff_t *pos) +{ + dev_t dev = filp->f_path.dentry->d_inode->i_rdev; + if (MINOR(dev) == 2) + return mic_psmi_read(filp, buf, count, pos); + + return -EINVAL; +} + +static long +mic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + dev_t dev; + int status = 0; + + dev = filp->f_path.dentry->d_inode->i_rdev; + if (MINOR(dev) == 1) + return scif_process_ioctl(filp, cmd, arg); + + if (MINOR(dev) == 2) + return -EINVAL; + + status = adapter_do_ioctl(cmd, arg); + return status; +} + +static int +mic_fasync(int fd, struct file *filp, int on) +{ + int rc; + + if ((rc = fasync_helper(fd, filp, on, &mic_data.dd_fasync)) < 0) { + return rc; + } + + if (on) { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)) + rc = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); +#else + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); +#endif + filp->private_data = filp; + } else { + filp->private_data = NULL; + } + + return rc; +} + +int +mic_mmap(struct file *f, struct vm_area_struct *vma) +{ + dev_t dev = f->f_path.dentry->d_inode->i_rdev; + if (MINOR(dev) == 1) + return micscif_mmap(f, vma); + + return -EINVAL; +} + +unsigned int +mic_poll(struct file *f, poll_table *wait) +{ + dev_t dev = f->f_path.dentry->d_inode->i_rdev; + if (MINOR(dev) == 1) + return micscif_poll(f, wait); + + return -EINVAL; +} + +int +mic_flush(struct file *f, fl_owner_t id) +{ + dev_t dev = f->f_path.dentry->d_inode->i_rdev; + if (MINOR(dev) == 1) + return micscif_flush(f, id); + + return -EINVAL; +} + +irqreturn_t +mic_irq_isr(int irq, void *data) +{ + if (((mic_ctx_t *)data)->msie) + adapter_imsr((mic_ctx_t *)data); + else if (adapter_isr((mic_ctx_t *)data) < 0 ){ + return IRQ_NONE; + } + + return IRQ_HANDLED; +} + +extern struct attribute_group bd_attr_group; +extern struct attribute_group host_attr_group; +extern struct attribute_group scif_attr_group; +extern struct attribute_group psmi_attr_group; +extern struct bin_attribute mic_psmi_ptes_attr; + +static int +mic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int brdnum = mic_data.dd_numdevs; + int err = 0; + bd_info_t *bd_info; + mic_ctx_t *mic_ctx; +#ifdef CONFIG_PCI_MSI + int i=0; +#endif + if ((bd_info = (bd_info_t *)kzalloc(sizeof(bd_info_t), GFP_KERNEL)) == NULL) { + printk("MIC: probe failed allocating memory for bd_info\n"); + return -ENOSPC; + } + + mic_ctx = &bd_info->bi_ctx; + mic_ctx->bd_info = bd_info; + mic_ctx->bi_id = brdnum; + mic_ctx->bi_pdev = pdev; + mic_ctx->msie = 0; + mic_data.dd_bi[brdnum] = bd_info; + + if ((err = pci_enable_device(pdev))) { + printk("pci_enable failed board #%d\n", brdnum); + goto 
probe_freebd; + } + + pci_set_master(pdev); + err = pci_reenable_device(pdev); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + printk("mic %d: ERROR DMA not available\n", brdnum); + goto probe_freebd; + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + printk("mic %d: ERROR pci_set_consistent_dma_mask(64) %d\n", brdnum, err); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + printk("mic %d: ERROR pci_set_consistent_dma_mask(32) %d\n", brdnum, err); + goto probe_freebd; + } + } + + // Allocate bar 4 for MMIO and GTT + bd_info->bi_ctx.mmio.pa = pci_resource_start(pdev, DLDR_MMIO_BAR); + bd_info->bi_ctx.mmio.len = pci_resource_len(pdev, DLDR_MMIO_BAR); + if (request_mem_region(bd_info->bi_ctx.mmio.pa, + bd_info->bi_ctx.mmio.len, "mic") == NULL) { + printk("mic %d: failed to reserve mmio space\n", brdnum); + goto probe_freebd; + } + + // Allocate bar 0 for access Aperture + bd_info->bi_ctx.aper.pa = pci_resource_start(pdev, DLDR_APT_BAR); + bd_info->bi_ctx.aper.len = pci_resource_len(pdev, DLDR_APT_BAR); + if (request_mem_region(bd_info->bi_ctx.aper.pa, + bd_info->bi_ctx.aper.len, "mic") == NULL) { + printk("mic %d: failed to reserve aperture space\n", brdnum); + goto probe_relmmio; + } + +#ifdef CONFIG_PCI_MSI + if (mic_msi_enable){ + for (i = 0; i < MIC_NUM_MSIX_ENTRIES; i ++) + bd_info->bi_msix_entries[i].entry = i; + err = pci_enable_msix(mic_ctx->bi_pdev, bd_info->bi_msix_entries, + MIC_NUM_MSIX_ENTRIES); + if (err == 0 ) { + // Only support 1 MSIx for now + err = request_irq(bd_info->bi_msix_entries[0].vector, + mic_irq_isr, 0, "mic", mic_ctx); + if (err != 0) { + printk("MIC: Error in request_irq %d\n", err); + goto probe_relaper; + } + mic_ctx->msie = 1; + } + } +#endif + + // TODO: this needs to be hardened and actually return errors + if ((err = adapter_init_device(mic_ctx)) != 0) { + printk("MIC: Adapter init device failed %d\n", err); + goto probe_relaper; + } + + // Adding sysfs entries + set_sysfs_entries(mic_ctx); + + bd_info->bi_sysfsdev = device_create(mic_lindata.dd_class, &pdev->dev, + mic_lindata.dd_dev + 2 + mic_ctx->bd_info->bi_ctx.bi_id, + NULL, "mic%d", mic_ctx->bd_info->bi_ctx.bi_id); + err = sysfs_create_group(&mic_ctx->bd_info->bi_sysfsdev->kobj, &bd_attr_group); + mic_ctx->sysfs_state = sysfs_get_dirent(mic_ctx->bd_info->bi_sysfsdev->kobj.sd, +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,35) && LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0)) + NULL, +#endif + "state"); + + dev_set_drvdata(mic_ctx->bd_info->bi_sysfsdev, mic_ctx); + + if (!mic_ctx->msie) + if ((err = request_irq(mic_ctx->bi_pdev->irq, mic_irq_isr, + IRQF_SHARED, "mic", mic_ctx)) != 0) { + printk("MIC: Error in request_irq %d\n", err); + goto probe_unmapaper; + } + + adapter_probe(&bd_info->bi_ctx); + + if (mic_ctx->bi_psmi.enabled) { + err = sysfs_create_group(&mic_ctx->bd_info->bi_sysfsdev->kobj, + &psmi_attr_group); + err = device_create_bin_file(mic_ctx->bd_info->bi_sysfsdev, + &mic_psmi_ptes_attr); + } + + adapter_wait_reset(mic_ctx); + + // Adding a board instance so increment the total number of MICs in the system. 
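+ // Done last, after adapter init, sysfs setup and IRQ registration have
+ // succeeded, so a partially probed board is never left on dd_bdlist.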
+ list_add_tail(&bd_info->bi_list, &mic_data.dd_bdlist); + mic_data.dd_numdevs++; + printk("mic_probe %d:%d:%d as board #%d\n", pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), brdnum); + return 0; + +probe_unmapaper: + wait_event(mic_ctx->ioremapwq, mic_ctx->aper.va || mic_ctx->state == MIC_RESETFAIL); + if (mic_ctx->aper.va) + iounmap((void *)bd_info->bi_ctx.aper.va); + iounmap((void *)bd_info->bi_ctx.mmio.va); + +probe_relaper: + release_mem_region(bd_info->bi_ctx.aper.pa, bd_info->bi_ctx.aper.len); + +probe_relmmio: + release_mem_region(bd_info->bi_ctx.mmio.pa, bd_info->bi_ctx.mmio.len); + +probe_freebd: + kfree(bd_info); + return err; +} + +static void +mic_remove(struct pci_dev *pdev) +{ + int32_t brdnum; + bd_info_t *bd_info; + + if (mic_data.dd_numdevs - 1 < 0) + return; + mic_data.dd_numdevs--; + brdnum = mic_data.dd_numdevs; + + /* Make sure boards are shutdown and not available. */ + bd_info = mic_data.dd_bi[brdnum]; + + spin_lock_bh(&bd_info->bi_ctx.sysfs_lock); + sysfs_put(bd_info->bi_ctx.sysfs_state); + bd_info->bi_ctx.sysfs_state = NULL; + spin_unlock_bh(&bd_info->bi_ctx.sysfs_lock); + + if (bd_info->bi_ctx.bi_psmi.enabled) { + device_remove_bin_file(bd_info->bi_sysfsdev, &mic_psmi_ptes_attr); + sysfs_remove_group(&bd_info->bi_sysfsdev->kobj, &psmi_attr_group); + } + sysfs_remove_group(&bd_info->bi_sysfsdev->kobj, &bd_attr_group); + + free_sysfs_entries(&bd_info->bi_ctx); + device_destroy(mic_lindata.dd_class, + mic_lindata.dd_dev + 2 + bd_info->bi_ctx.bi_id); + + adapter_stop_device(&bd_info->bi_ctx, 1, 0); + /* + * Need to wait for reset since accessing the card while GDDR training + * is ongoing by adapter_remove(..) below for example can be fatal. + */ + wait_for_reset(&bd_info->bi_ctx); + + mic_disable_interrupts(&bd_info->bi_ctx); + + if (!bd_info->bi_ctx.msie) { + free_irq(bd_info->bi_ctx.bi_pdev->irq, &bd_info->bi_ctx); +#ifdef CONFIG_PCI_MSI + } else { + free_irq(bd_info->bi_msix_entries[0].vector, &bd_info->bi_ctx); + pci_disable_msix(bd_info->bi_ctx.bi_pdev); +#endif + } + adapter_remove(&bd_info->bi_ctx); + release_mem_region(bd_info->bi_ctx.aper.pa, bd_info->bi_ctx.aper.len); + release_mem_region(bd_info->bi_ctx.mmio.pa, bd_info->bi_ctx.mmio.len); + pci_disable_device(bd_info->bi_ctx.bi_pdev); + kfree(bd_info); +} + +static void +mic_shutdown(struct pci_dev *pdev) { + mic_ctx_t *mic_ctx; + mic_ctx = get_device_context(pdev); + + if(!mic_ctx) + return; + + adapter_stop_device(mic_ctx, !RESET_WAIT , !RESET_REATTEMPT); + return; +} +static const struct file_operations mic_fops = { + .open = mic_open, + .release = mic_release, + .read = mic_read, + .unlocked_ioctl = mic_ioctl, + .fasync = mic_fasync, + .mmap = mic_mmap, + .poll = mic_poll, + .flush = mic_flush, + .owner = THIS_MODULE, +}; + +static const struct dev_pm_ops pci_dev_pm_ops = { + .suspend = micpm_suspend, + .resume = micpm_resume, + .freeze = micpm_suspend, + .restore = micpm_resume, + .suspend_noirq = micpm_suspend_noirq, + .resume_noirq = micpm_resume_noirq, + .freeze_noirq = micpm_suspend_noirq, + .restore_noirq = micpm_resume_noirq, +}; + +static struct notifier_block mic_pm_notifer = { + .notifier_call = micpm_notifier_block, +}; + +static struct pci_device_id mic_pci_tbl[] = { +#ifdef CONFIG_ML1OM + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ABR_2249, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ABR_224a, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, +#endif +#ifdef CONFIG_MK1OM + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2250, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { 
PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2251, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2252, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2253, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2254, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2255, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2256, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2257, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2258, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2259, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225a, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225b, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225c, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225d, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225e, PCI_ANY_ID, PCI_ANY_ID, + 0, 0, 0 }, + +#endif + { 0, } +}; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,31) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) +#define MODE_T umode_t +#else +#define MODE_T mode_t +#endif +static char * +mic_devnode(struct device *dev, MODE_T *mode) +{ + return kasprintf(GFP_KERNEL, "mic/%s", dev_name(dev)); +} +#undef MODE_T +#endif + +static int __init +mic_init(void) +{ + int ret, i; + + adapter_init(); + + unaligned_cache = micscif_kmem_cache_create(); + if (!unaligned_cache) { + ret = -ENOMEM; + goto init_free_ports; + } + + mic_lindata.dd_pcidriver.name = "mic"; + mic_lindata.dd_pcidriver.id_table = mic_pci_tbl; + mic_lindata.dd_pcidriver.probe = mic_probe; + mic_lindata.dd_pcidriver.remove = mic_remove; + mic_lindata.dd_pcidriver.driver.pm = &pci_dev_pm_ops; + mic_lindata.dd_pcidriver.shutdown = mic_shutdown; + + + if ((ret = alloc_chrdev_region(&mic_lindata.dd_dev, + 0, MAX_DLDR_MINORS, "mic") != 0)) { + printk("Error allocating device nodes: %d\n", ret); + goto init_free_ports; + } + + cdev_init(&mic_lindata.dd_cdev, &mic_fops); + mic_lindata.dd_cdev.owner = THIS_MODULE; + mic_lindata.dd_cdev.ops = &mic_fops; + + if ((ret = cdev_add(&mic_lindata.dd_cdev, + mic_lindata.dd_dev, MAX_DLDR_MINORS) != 0)) { + kobject_put(&mic_lindata.dd_cdev.kobj); + goto init_free_region; + } + + mic_lindata.dd_class = class_create(THIS_MODULE, "mic"); + if (IS_ERR(mic_lindata.dd_class)) { + printk("MICDLDR: Error createing mic class\n"); + cdev_del(&mic_lindata.dd_cdev); + ret = PTR_ERR(mic_lindata.dd_class); + goto init_free_region; + } + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,31) + mic_lindata.dd_class->devnode = mic_devnode; +#endif + + mic_lindata.dd_hostdev = device_create(mic_lindata.dd_class, NULL, + mic_lindata.dd_dev, NULL, "ctrl"); + mic_lindata.dd_scifdev = device_create(mic_lindata.dd_class, NULL, + mic_lindata.dd_dev + 1, NULL, "scif"); + ret = sysfs_create_group(&mic_lindata.dd_hostdev->kobj, &host_attr_group); + ret = sysfs_create_group(&mic_lindata.dd_scifdev->kobj, &scif_attr_group); + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,31) + mic_lindata.dd_class->devnode = NULL; +#endif + + if (micveth_init(mic_lindata.dd_hostdev)) + printk(KERN_ERR "%s: micveth_init failed\n", __func__); + + ret = pci_register_driver(&mic_lindata.dd_pcidriver); + if (ret) { + micscif_destroy(); + printk("mic: failed to register pci driver %d\n", ret); + goto 
clean_unregister; + } + + if (!mic_data.dd_numdevs) { + printk("mic: No MIC boards present. SCIF available in loopback mode\n"); + } else { + printk("mic: number of devices detected %d \n", mic_data.dd_numdevs); + } + + for (i = 0; i < mic_data.dd_numdevs; i++) { + mic_ctx_t *mic_ctx = get_per_dev_ctx(i); + wait_event(mic_ctx->ioremapwq, + mic_ctx->aper.va || mic_ctx->state == MIC_RESETFAIL); + destroy_workqueue(mic_ctx->ioremapworkq); + } + + micveth_init_legacy(mic_data.dd_numdevs, mic_lindata.dd_hostdev); + + ret = acptboot_init(); + +#ifdef USE_VCONSOLE + micvcons_create(mic_data.dd_numdevs); +#endif + + /* Initialize Data structures for PM Disconnect */ + ret = micpm_disconn_init(mic_data.dd_numdevs + 1); + if (ret) + printk(KERN_ERR "%s: Failed to initialize PM disconnect" + " data structures. PM may not work as expected." + " ret = %d\n", __func__, ret); + register_pm_notifier(&mic_pm_notifer); +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34)) + ret = pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "mic", mic_pm_qos_cpu_dma_lat); + if (ret) { + printk(KERN_ERR "%s %d mic_pm_qos_cpu_dma_lat %d ret %d\n", + __func__, __LINE__, mic_pm_qos_cpu_dma_lat, ret); + ret = 0; + /* Dont fail driver load due to PM QoS API. Fall through */ + } +#endif + return 0; + +clean_unregister: + device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev + 1); + device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev); + class_destroy(mic_lindata.dd_class); + cdev_del(&mic_lindata.dd_cdev); + unregister_pm_notifier(&mic_pm_notifer); +init_free_region: + unregister_chrdev_region(mic_lindata.dd_dev, MAX_DLDR_MINORS); +init_free_ports: + micpm_uninit(); + return ret; +} + +static void __exit +mic_exit(void) +{ + /* Close endpoints related to reverse registration */ + acptboot_exit(); + +#ifdef USE_VCONSOLE + micvcons_destroy(mic_data.dd_numdevs); +#endif + + pci_unregister_driver(&mic_lindata.dd_pcidriver); + micpm_uninit(); + + /* Uninit data structures for PM disconnect */ + micpm_disconn_uninit(mic_data.dd_numdevs + 1); + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34)) + pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "mic"); +#endif + micscif_kmem_cache_destroy(); + vmcore_exit(); + micveth_exit(); + micscif_destroy(); + ramoops_exit(); + + device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev + 1); + device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev); + class_destroy(mic_lindata.dd_class); + cdev_del(&mic_lindata.dd_cdev); + unregister_chrdev_region(mic_lindata.dd_dev, MAX_DLDR_MINORS); + unregister_pm_notifier(&mic_pm_notifer); + return; +} + +void +set_sysfs_entries(mic_ctx_t *mic_ctx) +{ + memset(&mic_ctx->sysfs_info, 0, sizeof(mic_ctx->sysfs_info)); +} + +void +free_sysfs_entries(mic_ctx_t *mic_ctx) +{ + if (mic_ctx->image != NULL) + kfree(mic_ctx->image); /* mic_ctx->initramfs points into this buffer */ + if (mic_ctx->sysfs_info.cmdline != NULL) + kfree(mic_ctx->sysfs_info.cmdline); + if (mic_ctx->sysfs_info.kernel_cmdline != NULL) + kfree(mic_ctx->sysfs_info.kernel_cmdline); +} + +mic_ctx_t * +get_per_dev_ctx(uint16_t node) +{ + /* TODO: Its important to check the upper bound of the dd_bi array as well. + * Cannot be done currently since not all calling functions to get_per_dev_ctx + * has the dd_numdevs set correctly. (See mic_ctx_map_single call in adapter_init_device + * thats callled even before dd_numdevs is incremented. 
*/ + return &mic_data.dd_bi[node]->bi_ctx; +} + +int +get_num_devs(mic_ctx_t *mic_ctx, uint32_t *num_devs) +{ + if (num_devs == NULL) + return -EINVAL; + if (copy_to_user(num_devs, &mic_data.dd_numdevs, sizeof(uint32_t))) + return -EFAULT; + return 0; +} + +int +mic_get_file_size(const char* fn, uint32_t* file_len) +{ + struct file *filp; + loff_t filp_size; + uint32_t status = 0; + mm_segment_t fs = get_fs(); + + set_fs(get_ds()); + + if (!fn || IS_ERR(filp = filp_open(fn, 0, 0))) { + status = EINVAL; + goto cleanup_fs; + } + + filp_size = GET_FILE_SIZE_FROM_INODE(filp); + if (filp_size <= 0) { + status = EINVAL; + goto cleanup_filp; + } + + *file_len = filp_size; +cleanup_filp: + filp_close(filp, current->files); +cleanup_fs: + set_fs(fs); + return status; +} + +// loads file from hdd into pci physical memory +int +mic_load_file(const char* fn, uint8_t* buffer, uint32_t max_size) +{ + long c; + int status = 0; + struct file *filp; + loff_t filp_size, pos = 0; + + mm_segment_t fs = get_fs(); + set_fs(get_ds()); + + if (!fn || IS_ERR(filp = filp_open(fn, 0, 0))) { + status = EINVAL; + goto cleanup_fs; + } + + filp_size = GET_FILE_SIZE_FROM_INODE(filp); + if (filp_size <= 0) { + goto cleanup_filp; + } + + c = vfs_read(filp, buffer, filp_size, &pos); + if(c != (long)filp_size) { + status = -1; //FIXME + goto cleanup_filp; + } + +cleanup_filp: + filp_close(filp, current->files); +cleanup_fs: + set_fs(fs); + + return status; +} + +module_init(mic_init); +module_exit(mic_exit); diff --git a/host/linvcons.c b/host/linvcons.c new file mode 100644 index 0000000..556a9b5 --- /dev/null +++ b/host/linvcons.c @@ -0,0 +1,687 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "micint.h" + +/* TODO: Improve debug messages */ + +static int micvcons_open(struct tty_struct * tty, struct file * filp); +static void micvcons_close(struct tty_struct * tty, struct file * filp); +static int micvcons_write(struct tty_struct * tty, const unsigned char *buf, + int count); +static int micvcons_write_room(struct tty_struct *tty); +static void micvcons_set_termios(struct tty_struct *tty, struct ktermios * old); +static void micvcons_timeout(unsigned long); +static void micvcons_throttle(struct tty_struct *tty); +static void micvcons_unthrottle(struct tty_struct *tty); +static void micvcons_wakeup_readbuf(struct work_struct *work); +static int micvcons_resume(struct _mic_ctx_t *mic_ctx); + +static struct tty_operations micvcons_tty_ops = { + .open = micvcons_open, + .close = micvcons_close, + .write = micvcons_write, + .write_room = micvcons_write_room, + .set_termios = micvcons_set_termios, + .throttle = micvcons_throttle, + .unthrottle = micvcons_unthrottle, +}; + +static struct tty_driver *micvcons_tty = NULL; +static u16 extra_timeout = 0; +static u8 restart_timer_flag = MICVCONS_TIMER_RESTART; +static struct timer_list vcons_timer; +static struct list_head timer_list_head; +static spinlock_t timer_list_lock; + +int +micvcons_create(int num_bds) +{ + micvcons_port_t *port; + bd_info_t *bd_info; + int bd, ret = 0; + char wq_name[14]; + struct device *dev; + + INIT_LIST_HEAD(&timer_list_head); + + if (micvcons_tty) + goto exit; + + micvcons_tty = alloc_tty_driver(num_bds); + if (!micvcons_tty) { + ret = -ENOMEM; + goto exit; + } + micvcons_tty->owner = THIS_MODULE; + micvcons_tty->driver_name = MICVCONS_DEVICE_NAME; + micvcons_tty->name = MICVCONS_DEVICE_NAME; + micvcons_tty->major = 0; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)) + micvcons_tty->minor_num = num_bds; +#endif + micvcons_tty->minor_start = 0; + micvcons_tty->type = TTY_DRIVER_TYPE_SERIAL; + micvcons_tty->subtype = SERIAL_TYPE_NORMAL; + micvcons_tty->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; + micvcons_tty->init_termios = tty_std_termios; + micvcons_tty->init_termios.c_iflag = IGNCR; + micvcons_tty->init_termios.c_oflag = 0; + micvcons_tty->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; + micvcons_tty->init_termios.c_lflag = 0; + + tty_set_operations(micvcons_tty, &micvcons_tty_ops); + + if ((ret = tty_register_driver(micvcons_tty)) != 0) { + printk("Failed to register vcons tty driver\n"); + put_tty_driver(micvcons_tty); + micvcons_tty = NULL; + goto exit; + } + + for (bd = 0; bd < num_bds; bd++) { + port = &mic_data.dd_ports[bd]; + port->dp_bdinfo = mic_data.dd_bi[bd]; + + spin_lock_init(&port->dp_lock); + mutex_init (&port->dp_mutex); + + bd_info = (bd_info_t *)port->dp_bdinfo; + bd_info->bi_port = port; + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + tty_port_init(&port->port); + dev = tty_port_register_device(&port->port, micvcons_tty, bd, NULL); +#else + dev = tty_register_device(micvcons_tty, bd, NULL); + if (IS_ERR(dev)) { + printk("Failed to register vcons tty device\n"); + micvcons_destroy(bd); + ret = PTR_ERR(dev); + goto exit; + } +#endif + snprintf(wq_name, sizeof(wq_name), "VCONS MIC %d", bd); + port->dp_wq = __mic_create_singlethread_workqueue(wq_name); + if (!port->dp_wq) { + printk(KERN_ERR "%s: create_singlethread_workqueue\n", + __func__); + tty_unregister_device(micvcons_tty, bd); + micvcons_destroy(bd); + ret = -ENOMEM; + goto exit; + } + INIT_WORK(&port->dp_wakeup_read_buf, micvcons_wakeup_readbuf); + } + vcons_timer.function = 
micvcons_timeout; + vcons_timer.data = (unsigned long)(&timer_list_head); + init_timer(&vcons_timer); +exit: + return ret; +} + +void micvcons_destroy(int num_bds) +{ + int bd, ret; + micvcons_port_t *port; + + if (!micvcons_tty) + return; + for (bd = 0; bd < num_bds; bd++) { + port = &mic_data.dd_ports[bd]; + destroy_workqueue(port->dp_wq); + tty_unregister_device(micvcons_tty, bd); + } + ret = tty_unregister_driver(micvcons_tty); + put_tty_driver(micvcons_tty); + micvcons_tty = NULL; + + if (ret) + printk(KERN_ERR "tty unregister_driver failed with code %d\n", ret); +} + +static int +micvcons_open(struct tty_struct * tty, struct file * filp) +{ + micvcons_port_t *port = &mic_data.dd_ports[tty->index]; + int ret = 0; + mic_ctx_t *mic_ctx = get_per_dev_ctx(tty->index); + + tty->driver_data = port; + + mutex_lock(&port->dp_mutex); + spin_lock_bh(&port->dp_lock); + + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) { + if (port->dp_writer) { + ret = -EBUSY; + goto exit_locked; + } + port->dp_writer = filp; + port->dp_bytes = 0; + } + + if ((filp->f_flags & O_ACCMODE) != O_WRONLY) { + if (port->dp_reader) { + ret = -EBUSY; + goto exit_locked; + } + port->dp_reader = filp; + port->dp_canread = 1; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)) + tty->low_latency = 0; +#endif + + if (!port->dp_tty) + port->dp_tty = tty; + if (!port->dp_vcons) + port->dp_vcons = &mic_ctx->bi_vcons; + if (tty->count == 1) { + ret = micvcons_start(mic_ctx); + if (ret != 0) + goto exit_locked; + spin_lock(&timer_list_lock); + list_add_tail_rcu(&port->list_member, &timer_list_head); + if (list_is_singular(&timer_list_head)) { + restart_timer_flag = MICVCONS_TIMER_RESTART; + mod_timer(&vcons_timer, jiffies + + msecs_to_jiffies(MICVCONS_SHORT_TIMEOUT)); + } + spin_unlock(&timer_list_lock); + } + +exit_locked: + spin_unlock_bh(&port->dp_lock); + mutex_unlock(&port->dp_mutex); + return ret; +} + +static inline void +micvcons_del_timer_entry(micvcons_port_t *port) +{ + spin_lock(&timer_list_lock); + list_del_rcu(&port->list_member); + if (list_empty(&timer_list_head)) { + restart_timer_flag = MICVCONS_TIMER_SHUTDOWN; + spin_unlock(&timer_list_lock); + del_timer_sync(&vcons_timer); + } else { + spin_unlock(&timer_list_lock); + } + synchronize_rcu(); +} + +static void +micvcons_close(struct tty_struct * tty, struct file * filp) +{ + micvcons_port_t *port = (micvcons_port_t *)tty->driver_data; + + mutex_lock(&port->dp_mutex); + if (tty->count == 1) { + micvcons_del_timer_entry(port); + flush_workqueue(port->dp_wq); + } + spin_lock_bh(&port->dp_lock); + if (port->dp_reader == filp) + port->dp_reader = 0; + + if (port->dp_writer == filp) + port->dp_writer = 0; + + if (tty->count == 1) + port->dp_tty = 0; + spin_unlock_bh(&port->dp_lock); + mutex_unlock(&port->dp_mutex); +} + +static int +micvcons_write(struct tty_struct * tty, const unsigned char *buf, int count) +{ + micvcons_port_t *port = (micvcons_port_t *)tty->driver_data; + mic_ctx_t *mic_ctx = get_per_dev_ctx(tty->index); + int bytes=0, status; + struct vcons_buf *vcons_host_header; + u8 card_alive = 1; + + spin_lock_bh(&port->dp_lock); + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) { + status = micvcons_resume(mic_ctx); + if (status != 0) { + /* If card can not wakeup, it is dead. 
*/ + card_alive = 0; + goto exit; + } + } + if (vcons_host_header->mic_magic != MIC_VCONS_READY) + goto exit; + bytes = micvcons_port_write(port, buf, count); + if (bytes) { + mic_send_hvc_intr(mic_ctx); + extra_timeout = 0; + } +exit: + spin_unlock_bh(&port->dp_lock); + if (!card_alive) + micvcons_del_timer_entry(port); + return bytes; +} + +static int +micvcons_write_room(struct tty_struct *tty) +{ + micvcons_port_t *port = (micvcons_port_t *)tty->driver_data; + int room; + + spin_lock_bh(&port->dp_lock); + if (port->dp_out) + room = micscif_rb_space(port->dp_out); + else + room = 0; + spin_unlock_bh(&port->dp_lock); + + return room; +} + +static void +micvcons_set_termios(struct tty_struct *tty, struct ktermios * old) +{ +} + +static int +micvcons_readchars(micvcons_port_t *port) +{ + int len, ret, get_count; + int bytes_total = 0; + int bytes_read = 0; + char buf[64]; + + for (;;) { + len = micscif_rb_count(port->dp_in, sizeof(buf)); + if (!len) + break; + get_count = min(len, (int)sizeof(buf)); + ret = micscif_rb_get_next(port->dp_in, buf, get_count); + micscif_rb_update_read_ptr(port->dp_in); + if (port->dp_reader && port->dp_canread) { +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + if ((bytes_read = tty_insert_flip_string( + &port->port, buf, get_count)) != 0) + tty_flip_buffer_push(&port->port); +#else + bytes_read = tty_insert_flip_string(port->dp_tty, + buf, get_count); + tty_flip_buffer_push(port->dp_tty); +#endif + bytes_total += bytes_read; + if (bytes_read != get_count) { + printk(KERN_WARNING "dropping characters: \ + bytes_read %d, get_count %d\n", + bytes_read, get_count); + break; + } + } + } + return bytes_total; +} + +static int +micvcons_initport(micvcons_port_t *port) +{ + struct vcons_buf *vcons_host_header; + struct vcons_mic_header *vcons_mic_header; + char *mic_hdr, *mic_buf, *host_buf; + + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + if (!vcons_host_header) { + printk(KERN_ERR "vcons_host_header NULL\n"); + return -EFAULT; + } + + host_buf = (char *)port->dp_vcons->dc_buf_virt; + if (!host_buf) { + printk(KERN_ERR "host_buf NULL\n"); + return -EFAULT; + } + + if (port->dp_bdinfo->bi_ctx.bi_family == FAMILY_ABR) { + set_pci_aperture(&port->dp_bdinfo->bi_ctx, + (port->dp_bdinfo->bi_ctx.aper.len - PAGE_SIZE) >> PAGE_SHIFT, + vcons_host_header->i_hdr_addr & PAGE_MASK, PAGE_SIZE); + mic_hdr = port->dp_bdinfo->bi_ctx.aper.va + + port->dp_bdinfo->bi_ctx.aper.len - PAGE_SIZE; + mic_buf = mic_hdr + PAGE_SIZE/2; + } else { + mic_hdr = port->dp_bdinfo->bi_ctx.aper.va + vcons_host_header->i_hdr_addr; + mic_buf = port->dp_bdinfo->bi_ctx.aper.va + vcons_host_header->i_buf_addr; + } + + port->dp_in = kmalloc(sizeof(struct micscif_rb), GFP_ATOMIC); + if (port->dp_in) + port->dp_out = kmalloc(sizeof(struct micscif_rb), GFP_ATOMIC); + else + return -ENOMEM; + + if (port->dp_out) { + vcons_mic_header = (struct vcons_mic_header *)mic_hdr; + micscif_rb_init(port->dp_in, + &vcons_mic_header->o_rd, + &vcons_host_header->o_wr, + host_buf, + vcons_host_header->o_size); + micscif_rb_init(port->dp_out, &vcons_host_header->i_rd, + &vcons_mic_header->i_wr, + mic_buf, + vcons_host_header->i_size); + wmb(); + writel(MIC_VCONS_HOST_OPEN, &vcons_mic_header->host_status); + } else { + kfree(port->dp_in); + return -ENOMEM; + } + return 0; +} + +static int +micvcons_readport(micvcons_port_t *port) +{ + int num_chars_read = 0, status; + static uint32_t prev_mic_magic; + struct vcons_buf *vcons_host_header; + + if (!port || !port->dp_vcons) + return 0; + + 
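+	/*
+	 * Everything below runs under dp_lock: the shared vcons header is
+	 * checked for the READY/SLEEPING magic, the ring buffers are set up
+	 * lazily on first use, and reads from a sleeping card are deferred to
+	 * the port workqueue, which wakes the card (see micvcons_wakeup_readbuf)
+	 * before draining the buffer.
+	 */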
spin_lock_bh(&port->dp_lock); + if (!port->dp_tty) { + spin_unlock_bh(&port->dp_lock); + return 0; + } + + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + if ((vcons_host_header->mic_magic != MIC_VCONS_READY) && + (vcons_host_header->mic_magic != MIC_VCONS_SLEEPING)) { + if ((vcons_host_header->mic_magic == MIC_VCONS_RB_VER_ERR) + && (vcons_host_header->mic_magic != prev_mic_magic)) { + printk(KERN_ERR "Card and host ring buffer versions mismatch."); + printk(KERN_ERR "Card version: %d, Host version: %d \n", + vcons_host_header->mic_rb_ver, + vcons_host_header->host_rb_ver); + } + goto exit; + } + if (!port->dp_in) { + status = micvcons_initport(port); + if (status != 0) { + spin_unlock_bh(&port->dp_lock); + return status; + } + } + + if (port->dp_in) { + if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) { + /* + * If the card is sleeping and there is data in the + * buffer, schedule work in a work queue to wake-up + * the card and read from the buffer. + */ + if (micscif_rb_count(port->dp_in, 1)) + queue_work(port->dp_wq, + &port->dp_wakeup_read_buf); + } else { + num_chars_read = micvcons_readchars(port); + tty_wakeup(port->dp_tty); + } + } +exit: + prev_mic_magic = vcons_host_header->mic_magic; + spin_unlock_bh(&port->dp_lock); + return num_chars_read; +} + +static void +micvcons_wakeup_readbuf(struct work_struct *work) +{ + u8 card_alive = 1; + int status; + micvcons_port_t *port; + struct vcons_buf *vcons_host_header; + + port = container_of(work, micvcons_port_t, dp_wakeup_read_buf); + + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + spin_lock_bh(&port->dp_lock); + status = micvcons_resume(get_per_dev_ctx(port->dp_tty->index)); + if (status == 0) { + micvcons_readchars(port); + tty_wakeup(port->dp_tty); + } else { + /* If card can not wakeup, it is dead. */ + card_alive = 0; + } + spin_unlock_bh(&port->dp_lock); + if (!card_alive) + micvcons_del_timer_entry(port); +} + +static void +micvcons_timeout(unsigned long data) +{ + struct list_head *timer_list_ptr = (struct list_head *)data; + micvcons_port_t *port; + u8 console_active = 0; + int num_chars_read = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(port, timer_list_ptr, list_member) { + num_chars_read = micvcons_readport(port); + if (num_chars_read != 0) + console_active = 1; + } + rcu_read_unlock(); + + spin_lock(&timer_list_lock); + if (restart_timer_flag == MICVCONS_TIMER_RESTART) { + extra_timeout = (console_active ? 
0 : + extra_timeout + MICVCONS_SHORT_TIMEOUT); + extra_timeout = min(extra_timeout, (u16)MICVCONS_MAX_TIMEOUT); + mod_timer(&vcons_timer, jiffies + + msecs_to_jiffies(MICVCONS_SHORT_TIMEOUT+extra_timeout)); + } + spin_unlock(&timer_list_lock); +} + +static void +micvcons_throttle(struct tty_struct *tty) +{ + micvcons_port_t *port = (micvcons_port_t *)tty->driver_data; + port->dp_canread = 0; +} + +static void +micvcons_unthrottle(struct tty_struct *tty) +{ + micvcons_port_t *port = (micvcons_port_t *)tty->driver_data; + port->dp_canread = 1; +} + +int micvcons_start(mic_ctx_t *mic_ctx) +{ + struct vcons_buf *vcons_host_header; + int status; + micvcons_port_t *port = mic_ctx->bd_info->bi_port; + + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) { + status = micvcons_resume(mic_ctx); + if (status != 0) + return status; + } + if (vcons_host_header->mic_magic == MIC_VCONS_READY) { + if (!port->dp_in) { + status = micvcons_initport(port); + if (status != 0) + return status; + } + } + return 0; +} + +int micvcons_port_write(struct micvcons_port *port, const unsigned char *buf, + int count) +{ + int ret; + uint32_t bytes = 0; + + if (port->dp_out) { + bytes = min(count, micscif_rb_space(port->dp_out)); + ret = micscif_rb_write(port->dp_out, (void *)buf, bytes); + BUG_ON(ret); + port->dp_bytes += bytes; + micscif_rb_commit(port->dp_out); + } + return bytes; +} + +/** + * micvcons_stop - cleans up before a node is rebooted + * @ mic_ctx: node to clean up + * + * Called before rebooting a node, reads remaining characters + * from the node's vcons output buffer, resets the input/output + * ring buffers so that things work when the node comes up again + */ +void +micvcons_stop(mic_ctx_t *mic_ctx) +{ + micvcons_port_t *port; + struct vcons_buf *vcons_host_header; + + port = mic_ctx->bd_info->bi_port; + micvcons_readport(port); + spin_lock_bh(&port->dp_lock); + if (port->dp_in) { + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + vcons_host_header->mic_magic = 0; + kfree(port->dp_in); + kfree(port->dp_out); + port->dp_in = NULL; + port->dp_out = NULL; + } + spin_unlock_bh(&port->dp_lock); +} + +/** + * micvcons_resume - sets the state of a node's console to ready + * @ mic_ctx: node to clean up + * + * @ return: zero if successful. + * called before resuming a node from PC6. MUST acquire the spinlock + * port->dp_lock with bottom-halves disabled before calling this function. + */ +static int +micvcons_resume(mic_ctx_t *mic_ctx) +{ + int status = 0; + micvcons_port_t *port; + struct vcons_buf *vcons_host_header; + + port = mic_ctx->bd_info->bi_port; + vcons_host_header = mic_ctx->bi_vcons.dc_hdr_virt; + if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) { + do { + vcons_host_header->mic_magic = MIC_VCONS_WAKINGUP; + spin_unlock_bh(&port->dp_lock); + status = micscif_connect_node(mic_get_scifnode_id(mic_ctx), false); + spin_lock_bh(&port->dp_lock); + } while ((status == 0) && + (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING)); + if (status == 0) + vcons_host_header->mic_magic = MIC_VCONS_READY; + } + return status; +} + +/** + * micvcons_pm_disconnect_node - Check if a card can be put to sleep in case + * there is any activity on the virtual console. If yes, it also sets the + * internal state of a node's console to sleeping. + * @ node_bitmask: bits set indicate which cards to check. + * Bit-1 for the first, Bit-2 for the second,... + * Ignore Bit-0 which indicates host. 
+ * @ return: bits set indicating which cards can sleep. + * This is called from PM to check if a card can be put to sleep (PC-6 state). + * This is called when the node is disconnected from the SCIF network + * before putting it into the PC6 state where it should no longer + * receive an PCIe transactions until woken up by the host driver. + */ +int +micvcons_pm_disconnect_node(uint8_t *node_bitmask, enum disconn_type type) +{ + int err = 0; + if ((type == DISCONN_TYPE_POWER_MGMT) && (node_bitmask)) { + int i = 0; + mic_ctx_t *mic_ctx; + micvcons_port_t *port; + struct vcons_buf *vcons_host_header; + + for (i = 0; i <= mic_data.dd_numdevs; i++) { + if (!get_nodemask_bit(node_bitmask, i)) + continue; + + if (!(mic_ctx = get_per_dev_ctx(i - 1))) + continue; + + port = mic_ctx->bd_info->bi_port; + micvcons_readport(port); + /* + * If this function is called when virtual console is + * not active, port->dp_vcons needs to be initialized. + */ + if (!port->dp_vcons) + port->dp_vcons = &mic_ctx->bi_vcons; + + vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt; + spin_lock_bh(&port->dp_lock); + vcons_host_header->mic_magic = MIC_VCONS_SLEEPING; + spin_unlock_bh(&port->dp_lock); + } + } + + return err; +} + diff --git a/host/linvnet.c b/host/linvnet.c new file mode 100644 index 0000000..8082e41 --- /dev/null +++ b/host/linvnet.c @@ -0,0 +1,802 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "micint.h" +#include "mic_common.h" +#include +#include +#include +#include +#include "mic/micveth.h" + +#define PWR_MGMT_NO_POLL_AFTER_LINKS_UP 1 + +/* + In intr/poll modes, mic_smpt_uninit has already been called before + micveth_destroy is called during rmmod. This results in host driver crash. 
The + current workaround is, given the 'legacy' nature of VNET intr/poll modes, to + not call mic_ctx_unmap_single() at rmmod. This workaround will result in some + unmapped memory and a warn_on from micscif_smpt.c. + */ +#define WA_UNMAP_AT_RMMOD 0 + +static void micveth_clientpoll(struct work_struct *work); +static void micveth_poll(struct work_struct *work); +static int micvnet_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell); +static void micvnet_intr_bh_handler(struct work_struct *work); +void micveth_send_intr(micveth_info_t *veth_info); + +micveth_t micveth; + +void dump_skb(struct sk_buff *skb, int xmit); + +static inline +mic_ctx_t *veth_to_ctx(micveth_info_t *veth_info) +{ + return veth_info->mic_ctx; +} + +static int +micveth_set_address(struct net_device *dev, void *p) +{ + struct sockaddr *sa = p; + + if (!is_valid_ether_addr(sa->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN); + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) +static void +micveth_multicast_list(struct net_device *dev) +{ +} +#endif + +static int +micveth_deliver(struct sk_buff *skb, struct net_device *dev, micveth_info_t *veth_info) +{ + veth_ring_t *ring; + ring_queue_t *tx_queue; + ring_desc_t *desc; + ring_packet_t *packet; + int next_tail; + + //dump_skb(skb, 1); + + spin_lock(&veth_info->vi_txlock); + ring = &veth_info->vi_ring.ring; + tx_queue = &ring->r_tx; + + next_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length; + if (next_tail == tx_queue->rq_head) { + // queue_full situation - just drop the packet and let the stack retry + spin_unlock(&veth_info->vi_txlock); + return 1; + } + + desc = &tx_queue->rq_descs[tx_queue->rq_tail]; + packet = &veth_info->vi_tx_desc[tx_queue->rq_tail]; + packet->pd_skb = skb; + packet->pd_phys = mic_ctx_map_single(veth_to_ctx(veth_info), + skb->data, skb->len); + packet->pd_length = skb->len; + desc->rd_phys = packet->pd_phys; + desc->rd_length = skb->len; + desc->rd_valid = 1; + + /* + * Need a write memory barrier between copying the skb data to + * the buffer and updating the tail pointer. NOT an smp_wmb(), + * because this memory barrier needs to be done even if there is + * a single CPU in the system. 
+ */ + wmb(); + tx_queue->rq_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length; + spin_unlock(&veth_info->vi_txlock); + + if (mic_vnet_mode == VNET_MODE_INTR) { + micveth_send_intr(veth_info); + } + + return 0; +} + +static int +micveth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + micveth_info_t *veth_info; + + if (be16_to_cpu(skb->protocol) == ETH_P_IPV6) { + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + veth_info = dev->ml_priv; + + if (veth_info->vi_state != VETH_STATE_LINKUP) { + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + + if (micveth_deliver(skb, dev, veth_info)) { + kfree_skb(skb); + dev->stats.tx_dropped++; + } + + return NETDEV_TX_OK; +} + +static int +micveth_change_mtu(struct net_device *dev, int new_mtu) +{ + dev->mtu = new_mtu; + return 0; +} + +/* Start callback */ +static int +micveth_start_dev(struct net_device *dev) +{ + micveth_info_t *veth_info = dev->ml_priv; + + micveth_start(veth_info->mic_ctx); + return 0; +} + +/* Stop callback */ +static int +micveth_stop_dev(struct net_device *dev) +{ + micveth_info_t *veth_info = dev->ml_priv; + + micveth_stop(veth_info->mic_ctx); + return 0; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28) +static const struct net_device_ops veth_netdev_ops = { + .ndo_open = micveth_start_dev, + .ndo_stop = micveth_stop_dev, + .ndo_start_xmit = micveth_xmit, + .ndo_validate_addr = eth_validate_addr, +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) + .ndo_set_multicast_list = micveth_multicast_list, +#endif + .ndo_set_mac_address = micveth_set_address, + .ndo_change_mtu = micveth_change_mtu, +}; +#endif + +static void +micveth_setup(struct net_device *dev) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28) + dev->hard_start_xmit = micveth_xmit; + dev->set_multicast_list = micveth_multicast_list; + dev->set_mac_address = micveth_set_address; +#endif + ether_setup(dev); + + /* Initialize the device structure. */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28) + dev->netdev_ops = &veth_netdev_ops; +#endif + dev->destructor = free_netdev; + + /* Fill in device structure with ethernet-generic values. 
*/ + dev->mtu = (MICVETH_MAX_PACKET_SIZE); + dev->tx_queue_len = 0; + dev->flags &= ~IFF_MULTICAST; + random_ether_addr(dev->dev_addr); +} + +static int +micveth_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + return 0; +} + +static struct rtnl_link_ops micveth_link_ops __read_mostly = { + .kind = "micveth", + .setup = micveth_setup, + .validate = micveth_validate, +}; + +static int +micveth_probe_int(micveth_info_t *veth_info, mic_ctx_t *mic_ctx) +{ + struct net_device *dev_veth; + ring_queue_t *queue; + ring_desc_t *desc; + ring_packet_t *packet; + int idx; + int err = 0; + + veth_info->vi_pdev = mic_ctx->bi_pdev; + veth_info->vi_sbox = (uint8_t *)((unsigned long)mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS); + veth_info->vi_scratch14 = (uint32_t *)((unsigned long)mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS + SBOX_SCRATCH14); + veth_info->vi_scratch15 = (uint32_t *)((unsigned long)mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS + SBOX_SCRATCH15); + veth_info->mic_ctx = mic_ctx; + mic_ctx->bi_vethinfo = (void *)veth_info; + + spin_lock_init(&veth_info->vi_txlock); + spin_lock_init(&veth_info->vi_rxlock); + + if (mic_vnet_mode == VNET_MODE_POLL) + INIT_DELAYED_WORK(&veth_info->vi_poll, micveth_poll); + + // Set the current sk_buff allocation size + veth_info->vi_skb_mtu = MICVETH_MAX_PACKET_SIZE + 32; + + // Get the physical memory address for the ring descriptors + veth_info->vi_ring.phys = mic_ctx_map_single(veth_to_ctx(veth_info), &veth_info->vi_ring.ring, + sizeof(veth_ring_t)); + veth_info->vi_ring.length = sizeof(veth_ring_t); + + queue = &veth_info->vi_ring.ring.r_tx; + queue->rq_head = 0; + queue->rq_tail = 0; + queue->rq_length = MICVETH_TRANSFER_FIFO_SIZE; + + veth_info->vi_pend = 0; + + packet = &veth_info->vi_tx_desc[0]; + for (idx = 0; idx < queue->rq_length; idx++) { + desc = &queue->rq_descs[idx]; + packet[idx].pd_skb = NULL; + packet[idx].pd_phys = 0; + packet[idx].pd_length = 0; + + desc->rd_phys = 0; + desc->rd_length = 0; + desc->rd_valid = 0; + } + + // This is the recieve end. 
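+	// Every RX slot is populated up front: an skb of vi_skb_mtu bytes is
+	// allocated, DMA-mapped and its descriptor marked valid, so the card
+	// can start filling buffers as soon as the link comes up.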
+ queue = &veth_info->vi_ring.ring.r_rx; + queue->rq_head = 0; + queue->rq_tail = 0; + queue->rq_length = MICVETH_TRANSFER_FIFO_SIZE; + + packet = &veth_info->vi_rx_desc[0]; + for (idx = 0; idx < queue->rq_length; idx++) { + desc = &queue->rq_descs[idx]; + if (!(packet[idx].pd_skb = dev_alloc_skb(veth_info->vi_skb_mtu))) + return -ENOMEM; + packet[idx].pd_phys = mic_ctx_map_single(veth_to_ctx(veth_info), packet[idx].pd_skb->data, + veth_info->vi_skb_mtu); + packet[idx].pd_length = veth_info->vi_skb_mtu; + + desc->rd_phys = packet[idx].pd_phys; + desc->rd_length = packet[idx].pd_length; + desc->rd_valid = 1; + } +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) + if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", micveth_setup)) == NULL) { +#else + if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", NET_NAME_UNKNOWN, micveth_setup)) == NULL) { +#endif + return -ENOMEM; + } + + veth_info->vi_netdev = dev_veth; + dev_veth->ml_priv = veth_info; + dev_veth->rtnl_link_ops = &micveth_link_ops; + + if ((err = register_netdev(dev_veth)) < 0) { + printk("register netdev failed %d\n", err); + free_netdev(dev_veth); + return err; + } + + veth_info->vi_state = VETH_STATE_INITIALIZED; + return 0; +} + +static ssize_t show_veth(struct device *dev, + struct device_attribute *attr, char *buf); +DEVICE_ATTR(veth, S_IRUGO, show_veth, NULL); + +static int +micveth_init_int(int num_bds, struct device *dev) +{ + int bd; + int err = 0; + + micveth.lv_num_interfaces = num_bds; + micveth.lv_num_clients = num_bds; + micveth.lv_active_clients = 0; + micveth.lv_num_links_remaining = num_bds; + + BUG_ON(rtnl_link_register(&micveth_link_ops)); + + // Allocate space for the control of each device in the system. + micveth.lv_info = kmalloc(sizeof(micveth_info_t) * num_bds, GFP_KERNEL); + + // Initialize state mutex. Overloaded use for several fields. + mutex_init(&micveth.lv_state_mutex); + + // Setup of timer for probeing active mic clients. When the total active board + // count is zero the poll is not running. + micveth.lv_pollstate = CLIENT_POLL_STOPPED; + INIT_DELAYED_WORK(&micveth.lv_poll, micveth_clientpoll); + init_waitqueue_head(&micveth.lv_wq); + + // Init each of the existing boards. 
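+	// One micveth_info_t slot (from lv_info above) per board; the return
+	// value of micveth_probe_int() is not checked here, only the sysfs
+	// attribute creation below is reported back to the caller.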
+ for (bd = 0; bd < num_bds; bd++) { + micveth_probe_int(&micveth.lv_info[bd], &mic_data.dd_bi[bd]->bi_ctx); + } + + err = device_create_file(dev, &dev_attr_veth); + return err; +} + +static void +micveth_exit_int(void) +{ + mic_ctx_t *mic_ctx = kmalloc(sizeof(mic_ctx_t), GFP_KERNEL); + micveth_info_t *veth_info; + ring_packet_t *packet; + int bd; + int idx; + + rtnl_link_unregister(&micveth_link_ops); + + for (bd = 0; bd < micveth.lv_num_clients; bd++) { + veth_info = &micveth.lv_info[bd]; + + /* veth_info->mic_ctx == mic_data.dd_bi[bd] is freed in + remove so cannot be used in exit */ + mic_ctx->bi_vethinfo = veth_info; + micveth_stop(mic_ctx); + +#if WA_UNMAP_AT_RMMOD + mic_ctx_unmap_single(veth_to_ctx(veth_info), veth_info->vi_ring.phys, + sizeof(veth_ring_t)); +#endif + + for (idx = 0; idx < veth_info->vi_ring.ring.r_tx.rq_length; idx++) { + packet = &veth_info->vi_tx_desc[idx]; + if (packet->pd_skb != NULL) { +#if WA_UNMAP_AT_RMMOD + mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, + packet->pd_skb->len); +#endif + kfree_skb(packet->pd_skb); + } + } + + for (idx = 0; idx < veth_info->vi_ring.ring.r_rx.rq_length; idx++) { + packet = &veth_info->vi_rx_desc[idx]; +#if WA_UNMAP_AT_RMMOD + mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, packet->pd_skb->len); +#endif + kfree_skb(packet->pd_skb); + } + } + + kfree(mic_ctx); + kfree(micveth.lv_info); +} + +static int +micveth_start_int(mic_ctx_t *mic_ctx) +{ + micveth_info_t *veth_info = &micveth.lv_info[mic_ctx->bi_id]; + + // Eventuall (very soon) most of the descriptor allocation for a board will be done here + if (veth_info->vi_state != VETH_STATE_INITIALIZED) + return 0; + + mutex_lock(&micveth.lv_state_mutex); + + if (micveth.lv_pollstate == CLIENT_POLL_STOPPED) { + schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY)); + micveth.lv_pollstate = CLIENT_POLL_RUNNING; + } + + micveth.lv_active_clients++; + mutex_unlock(&micveth.lv_state_mutex); + + veth_info->vi_pend = 0; + + veth_info->vi_ring.ring.r_tx.rq_head = 0; + veth_info->vi_ring.ring.r_tx.rq_tail = 0; + + veth_info->vi_ring.ring.r_rx.rq_head = 0; + veth_info->vi_ring.ring.r_rx.rq_tail = 0; + veth_info->vi_state = VETH_STATE_LINKDOWN; + + if (mic_vnet_mode == VNET_MODE_INTR) { + snprintf(veth_info->vi_wqname, sizeof(veth_info->vi_wqname), + "VNET INTR %d\n", mic_ctx->bi_id); + veth_info->vi_wq = create_singlethread_workqueue(veth_info->vi_wqname); + INIT_WORK(&veth_info->vi_bh, micvnet_intr_bh_handler); + + // Install interrupt handler on doorbell 3 + mic_reg_irqhandler(mic_ctx, 3, "Host DoorBell 3", + micvnet_host_doorbell_intr_handler); + } + + return 0; +} + +static void +micveth_stop_int(mic_ctx_t *mic_ctx) +{ + micveth_info_t *veth_info = (micveth_info_t *)(mic_ctx->bi_vethinfo); + + if (veth_info->vi_state == VETH_STATE_INITIALIZED) + return; + + mutex_lock(&micveth.lv_state_mutex); + + if (mic_vnet_mode == VNET_MODE_INTR) { + // Remove interrupt handler on doorbell 3 + mic_unreg_irqhandler(mic_ctx, 3, "Host DoorBell 3"); + + destroy_workqueue(veth_info->vi_wq); + } + + micveth.lv_active_clients--; + veth_info->vi_state = VETH_STATE_INITIALIZED; + + if (micveth.lv_active_clients) { + mutex_unlock(&micveth.lv_state_mutex); + return; + } + + micveth.lv_num_links_remaining = micveth.lv_num_clients; + +#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP + micveth.lv_pollstate = CLIENT_POLL_STOPPED; + mutex_unlock(&micveth.lv_state_mutex); +#else + micveth.lv_pollstate = CLIENT_POLL_STOPPING; + mutex_unlock(&micveth.lv_state_mutex); + 
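+	/* Wait until the client-poll work observes CLIENT_POLL_STOPPING and
+	 * transitions the state to CLIENT_POLL_STOPPED before returning. */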
wait_event(micveth.lv_wq, micveth.lv_pollstate == CLIENT_POLL_STOPPED); +#endif +} + +#define NO_SRATCHREGREAD_AFTER_CONNECT 1 +static void +micveth_clientpoll(struct work_struct *work) +{ + micveth_info_t *veth_info; + uint32_t transRingHi; + uint32_t transRingLo; + uint32_t scratch14 = 0; + uint32_t scratch15 = 0; + int bd; + static int enter = 0; + + if (enter == 0) + { + printk("micveth is polling\n"); + enter = 1; + } + + mutex_lock(&micveth.lv_state_mutex); + if (micveth.lv_pollstate == CLIENT_POLL_STOPPING) { + micveth.lv_pollstate = CLIENT_POLL_STOPPED; + mutex_unlock(&micveth.lv_state_mutex); + wake_up(&micveth.lv_wq); + return; + } + + // Check for state changes for each board in the system + for (bd = 0; bd < micveth.lv_num_clients; bd++) { + veth_info = &micveth.lv_info[bd]; + + // Do not poll boards that have not had the interface started. + if (veth_info->vi_state == VETH_STATE_INITIALIZED) { + break; + } + +#ifdef NO_SRATCHREGREAD_AFTER_CONNECT + if(veth_info->vi_state != VETH_STATE_LINKUP) { +#endif + scratch14 = readl(veth_info->vi_scratch14); + scratch15 = readl(veth_info->vi_scratch15); +#ifdef NO_SRATCHREGREAD_AFTER_CONNECT + } +#endif + + if (veth_info->vi_state == VETH_STATE_LINKUP) { + if (scratch14 == MICVETH_LINK_DOWN_MAGIC) { + veth_info->vi_state = VETH_STATE_LINKDOWN; + } + } else if (veth_info->vi_state == VETH_STATE_LINKDOWN) { + if (scratch14 == MICVETH_LINK_UP_MAGIC) { + // Write the transfer ring address. + transRingHi = (uint32_t)(veth_info->vi_ring.phys >> 32); + transRingLo = (uint32_t)(veth_info->vi_ring.phys & 0xffffffff); + + writel(transRingLo, veth_info->vi_scratch14); + writel(transRingHi, veth_info->vi_scratch15); + + veth_info->vi_state = VETH_STATE_LINKUP; + printk("MIC virtual ethernet up for board %d\n", bd); +#ifdef MIC_IS_EMULATION + printk("Card wrote Magic: It must be UP!\n"); +#endif + + if (mic_vnet_mode == VNET_MODE_POLL) { + schedule_delayed_work(&veth_info->vi_poll, + msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY)); + } + + micveth.lv_num_links_remaining--; + } +#ifdef MIC_IS_EMULATION + else if (scratch14) { + printk("---> 0x%x \n", scratch14); + writel(0x0, veth_info->vi_scratch14); + } +#endif + } + } + + mutex_unlock(&micveth.lv_state_mutex); + +#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP + if (micveth.lv_num_links_remaining) +#endif + schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY)); +} + +static int +micvnet_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell) +{ + micveth_info_t *veth_info; + veth_info = &micveth.lv_info[mic_ctx->bi_id]; + queue_work(veth_info->vi_wq, &veth_info->vi_bh); + return 0; +} + +void +micveth_send_intr(micveth_info_t *veth_info) +{ + mic_ctx_t *mic_ctx = veth_info->mic_ctx; + mic_send_vnet_intr(mic_ctx); +} + +void +_micveth_process_descriptors(micveth_info_t *veth_info) +{ + veth_ring_t *ring = &veth_info->vi_ring.ring; + ring_queue_t *rx_queue = &ring->r_rx; + ring_queue_t *tx_queue = &ring->r_tx; + ring_desc_t *desc; + ring_packet_t *packet; + struct sk_buff *skb; + int receive_skb = 0; + int err; + + if (veth_info->vi_state != VETH_STATE_LINKUP) { + return; + } + + spin_lock_bh(&veth_info->vi_rxlock); + + while (rx_queue->rq_head != rx_queue->rq_tail) { + desc = &rx_queue->rq_descs[rx_queue->rq_head]; + + veth_info->vi_netdev->stats.rx_packets++; + veth_info->vi_netdev->stats.rx_bytes += desc->rd_length; + + packet = &veth_info->vi_rx_desc[rx_queue->rq_head]; + + skb = packet->pd_skb; + skb_put(skb, desc->rd_length); + + //dump_skb(skb, 0); + 
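+	/*
+	 * The filled skb is detached from the ring and replaced by a freshly
+	 * allocated, DMA-mapped skb before being passed to netif_receive_skb(),
+	 * so the descriptor slot is never left without a buffer. Note that the
+	 * dev_alloc_skb() return value is not checked here.
+	 */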
mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, veth_info->vi_skb_mtu); + packet->pd_skb = dev_alloc_skb(veth_info->vi_skb_mtu); + packet->pd_phys = mic_ctx_map_single(veth_to_ctx(veth_info), packet->pd_skb->data, + veth_info->vi_skb_mtu); + desc->rd_phys = packet->pd_phys; + desc->rd_length = packet->pd_length; + + skb->dev = veth_info->vi_netdev; + skb->protocol = eth_type_trans(skb, skb->dev); + skb->ip_summed = CHECKSUM_NONE; + + err = netif_receive_skb(skb); + /* + * Need a general memory barrier between copying the data from + * the buffer and updating the head pointer. It's the general + * mb() because we're ordering the read of the data with the write. + */ + mb(); + rx_queue->rq_head = (rx_queue->rq_head + 1) % rx_queue->rq_length; + receive_skb++; + } + + /* Send intr to TX so that pending SKB's can be freed */ + if (receive_skb && mic_vnet_mode == VNET_MODE_INTR) { + micveth_send_intr(veth_info); + } + + spin_unlock_bh(&veth_info->vi_rxlock); + + spin_lock_bh(&veth_info->vi_txlock); + + // Also handle completed tx requests + while (veth_info->vi_pend != tx_queue->rq_head) { + desc = &tx_queue->rq_descs[veth_info->vi_pend]; + packet = &veth_info->vi_tx_desc[veth_info->vi_pend]; + + skb = packet->pd_skb; + packet->pd_skb = NULL; + + mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, skb->len); + packet->pd_phys = 0; + + kfree_skb(skb); + + veth_info->vi_pend = (veth_info->vi_pend + 1) % tx_queue->rq_length; + } + + spin_unlock_bh(&veth_info->vi_txlock); + + if (mic_vnet_mode == VNET_MODE_POLL) { + schedule_delayed_work(&veth_info->vi_poll, msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY)); + } +} + +static void +micvnet_intr_bh_handler(struct work_struct *work) +{ + micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_bh); + _micveth_process_descriptors(veth_info); +} + +static void +micveth_poll(struct work_struct *work) +{ + micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_poll.work); + + _micveth_process_descriptors(veth_info); +} + +static ssize_t +show_veth(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", + micveth.lv_pollstate == CLIENT_POLL_RUNNING ? + "running" : "stopped"); +} + +/* + VNET driver public API. These are simply wrappers which either invoke the old + interrupt/poll mode functions or the new DMA mode functions. These are temporary and + will be phased out with the old interrupt/poll mode so only the DMA mode will be around + eventually. 
+ */ +int __init +micveth_init(struct device *dev) +{ + printk("vnet: mode: %s, buffers: %d\n", + mic_vnet_modes[mic_vnet_mode], vnet_num_buffers); + + if (mic_vnet_mode == VNET_MODE_DMA) + return micvnet_init(dev); + /* Intr/poll modes use micveth_init_legacy */ + return 0; +} + +int __init +micveth_init_legacy(int num_bds, struct device *dev) +{ + if (mic_vnet_mode != VNET_MODE_DMA) + return micveth_init_int(num_bds, dev); + /* DMA mode uses micveth_init */ + return 0; +} + +void +micveth_exit(void) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_exit(); + else + micveth_exit_int(); +} + +int +micveth_probe(mic_ctx_t *mic_ctx) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + return micvnet_probe(mic_ctx); + /* No support for micveth_probe in legacy intr/poll modes */ + return 0; +} + +void +micveth_remove(mic_ctx_t *mic_ctx) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_remove(mic_ctx); + /* No support for micveth_remove in legacy intr/poll modes */ +} + +int +micveth_start(mic_ctx_t *mic_ctx) +{ + micveth_info_t *veth_info = mic_ctx->bi_vethinfo; + int err; + + if (mic_vnet_mode == VNET_MODE_DMA) + err = micvnet_start(mic_ctx); + else + err = micveth_start_int(mic_ctx); + + if (!err) + netif_carrier_on(veth_info->vi_netdev); + + return err; +} + +void +micveth_stop(mic_ctx_t *mic_ctx) +{ + micveth_info_t *veth_info = mic_ctx->bi_vethinfo; + + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_stop(mic_ctx); + else + micveth_stop_int(mic_ctx); + + if (veth_info) + netif_carrier_off(veth_info->vi_netdev); +} diff --git a/host/micpsmi.c b/host/micpsmi.c new file mode 100644 index 0000000..3db1b64 --- /dev/null +++ b/host/micpsmi.c @@ -0,0 +1,184 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "micint.h" + +bool mic_psmi_enable = 0; + +extern struct bin_attribute mic_psmi_ptes_attr; + +static __always_inline void +mic_psmi_free_pte(mic_ctx_t *mic_ctx, int i) +{ + pci_unmap_single(mic_ctx->bi_pdev, + mic_ctx->bi_psmi.dma_tbl[i].pa, MIC_PSMI_PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + free_pages(mic_ctx->bi_psmi.va_tbl[i - 1].pa, MIC_PSMI_PAGE_ORDER); +} + +static int mic_psmi_alloc_buffer(mic_ctx_t *mic_ctx) +{ + int i, j, ret; + void *va; + dma_addr_t dma_hndl; + struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi; + + /* allocate psmi page tables */ + psmi_ctx->nr_dma_pages = + ALIGN(psmi_ctx->dma_mem_size, + MIC_PSMI_PAGE_SIZE) / MIC_PSMI_PAGE_SIZE; + if ((psmi_ctx->va_tbl = + kmalloc(psmi_ctx->nr_dma_pages * + sizeof(struct mic_psmi_pte), GFP_KERNEL)) == NULL) { + printk("mic: psmi va table alloc failed\n"); + return -ENOMEM; + } + psmi_ctx->dma_tbl_size = + (psmi_ctx->nr_dma_pages + 2) * sizeof(struct mic_psmi_pte); + if ((psmi_ctx->dma_tbl = + kmalloc(psmi_ctx->dma_tbl_size, GFP_KERNEL)) == NULL) { + printk("mic: psmi dma table alloc failed\n"); + ret = -ENOMEM; + goto free_va_tbl; + } + psmi_ctx->dma_tbl_hndl = + pci_map_single(mic_ctx->bi_pdev, + psmi_ctx->dma_tbl, psmi_ctx->dma_tbl_size, PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(mic_ctx->bi_pdev, + psmi_ctx->dma_tbl_hndl)) { + printk("mic: psmi dma table mapping failed\n"); + ret = -ENOMEM; + goto free_dma_tbl; + } + + /* allocate psmi pages */ + for (i = 0; i < psmi_ctx->nr_dma_pages; i++) { + if ((va = (void *)__get_free_pages( + GFP_KERNEL | __GFP_HIGHMEM, + MIC_PSMI_PAGE_ORDER)) == NULL) { + printk("mic: psmi page alloc failed: %d\n", i); + ret = -ENOMEM; + goto free_ptes; + } + memset(va, 0, MIC_PSMI_PAGE_SIZE); + dma_hndl = pci_map_single(mic_ctx->bi_pdev, va, + MIC_PSMI_PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(mic_ctx->bi_pdev, dma_hndl)) { + printk("mic: psmi page mapping failed: %d\n", i); + free_pages((unsigned long)va, MIC_PSMI_PAGE_ORDER); + ret = -ENOMEM; + goto free_ptes; + } + psmi_ctx->dma_tbl[i + 1].pa = dma_hndl; + psmi_ctx->va_tbl[i].pa = (uint64_t)va; + } + psmi_ctx->dma_tbl[0].pa = MIC_PSMI_SIGNATURE; + psmi_ctx->dma_tbl[psmi_ctx->nr_dma_pages + 1].pa = MIC_PSMI_SIGNATURE; + printk("mic: psmi #%d, %ld bytes, " + "dma_tbl va=0x%lx hndl=0x%lx\n", mic_ctx->bi_id + 1, + (unsigned long)psmi_ctx->dma_mem_size, + (unsigned long)psmi_ctx->dma_tbl, + (unsigned long)psmi_ctx->dma_tbl_hndl); + return 0; +free_ptes: + for (j = 1; j < i; j++) + mic_psmi_free_pte(mic_ctx, j); + pci_unmap_single(mic_ctx->bi_pdev, + psmi_ctx->dma_tbl_hndl, psmi_ctx->dma_tbl_size, PCI_DMA_BIDIRECTIONAL); +free_dma_tbl: + kfree(psmi_ctx->dma_tbl); + psmi_ctx->dma_tbl = NULL; +free_va_tbl: + kfree(psmi_ctx->va_tbl); + psmi_ctx->va_tbl = NULL; + return ret; +} + +static void mic_psmi_free_buffer(mic_ctx_t *mic_ctx) +{ + struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi; + int i; + + for (i = 1; i <= psmi_ctx->nr_dma_pages; i++) + mic_psmi_free_pte(mic_ctx, i); + pci_unmap_single(mic_ctx->bi_pdev, + psmi_ctx->dma_tbl_hndl, psmi_ctx->dma_tbl_size, PCI_DMA_BIDIRECTIONAL); + kfree(psmi_ctx->dma_tbl); + psmi_ctx->dma_tbl = NULL; + kfree(psmi_ctx->va_tbl); + psmi_ctx->va_tbl = NULL; + printk("mic: psmi freed %ld bytes for board #%d\n", + (unsigned long)psmi_ctx->dma_mem_size, mic_ctx->bi_id + 1); +} + +extern int usagemode_param; + +int mic_psmi_init(mic_ctx_t *mic_ctx) +{ + int ret; + int status = 0; + uint32_t scratch0; + struct mic_psmi_ctx * psmi_ctx = &mic_ctx->bi_psmi; + + psmi_ctx->enabled = 0; + 
/* Only initialize psmi for the first board */ + if (!mic_psmi_enable || mic_ctx->bi_id) + return 0; + if(!(scratch0 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH0))) { + status = wait_for_bootstrap(mic_ctx->mmio.va); + scratch0 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH0); + } + /* Memory size includes 512K reserved for VGA & GTT table */ + psmi_ctx->dma_mem_size = + SCRATCH0_MEM_SIZE_KB(scratch0) * ((1) * 1024) + + MIC_PSMI_PAGE_SIZE; + if (USAGE_MODE_NORMAL == usagemode_param) { + if ((ret = mic_psmi_alloc_buffer(mic_ctx))) + return ret; + mic_psmi_ptes_attr.size = psmi_ctx->dma_tbl_size; + } + psmi_ctx->enabled = 1; + return 0; +} + +void mic_psmi_uninit(mic_ctx_t *mic_ctx) +{ + struct mic_psmi_ctx * psmi_ctx = &mic_ctx->bi_psmi; + + if (!psmi_ctx->enabled) + return; + if (USAGE_MODE_NORMAL == usagemode_param) + mic_psmi_free_buffer(mic_ctx); + psmi_ctx->enabled = 0; +} diff --git a/host/micscif_pm.c b/host/micscif_pm.c new file mode 100644 index 0000000..95e229d --- /dev/null +++ b/host/micscif_pm.c @@ -0,0 +1,1062 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic_common.h" +#include "scif.h" +#include "mic/micscif.h" +#include "mic/mic_pm.h" +#include "mic/micveth.h" + +extern int set_host_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state); +extern int pc6_entry_start(mic_ctx_t *mic_ctx); + +/* Function that decrements the count of number of PM clients connected + * to the host. 
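+ * The matching increment is done in micpm_start() when a new PM
+ * connection is accepted, so a negative count indicates an unbalanced
+ * start/stop sequence and is only reported through PM_DEBUG.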
+ */ +void +micpm_decrement_clients(void) +{ + if(unlikely(atomic_dec_return(&mic_data.dd_pm.connected_clients) < 0)) { + PM_DEBUG("connected_clients is negative (%d)\n", + atomic_read(&mic_data.dd_pm.connected_clients)); + } + return; +} + +static char *pm_message_types[PM_MESSAGE_MAX+1] = {"PM_MESSAGE_PC3READY", + "PM_MESSAGE_OPEN", + "PM_MESSAGE_OPEN_ACK", + "PM_MESSAGE_CLOSE", + "PM_MESSAGE_CLOSE_ACK", + "PM_MESSAGE_TEST", + "PM_MESSAGE_MAX"}; +void +micpm_display_message(mic_ctx_t *mic_ctx, void *header, void *msg, const char* label) { + pm_msg_header *header_ref; + int msg_len; + int i=0; + char *payload; + scif_epd_t epd = mic_ctx->micpm_ctx.pm_epd; + header_ref = (pm_msg_header *)header; + msg_len = header_ref->len; + + if(!epd) + return; + + if(0 <= header_ref->opcode && header_ref->opcode < PM_MESSAGE_MAX) { + if(strcmp(label,"SENT")==0) { + printk("%s: Msg type %s, SrcNode:SrcPort %d:%d, DestNode:DestPort %d:%d", label, + pm_message_types[header_ref->opcode], epd->port.node, epd->port.port, + epd->peer.node, epd->peer.port); + } + else + printk("%s: Msg type %s, DestNode:DestPort %d:%d, SrcNode:SrcPort %d:%d", label, + pm_message_types[header_ref->opcode], epd->port.node, epd->port.port, + epd->peer.node, epd->peer.port); + } + + + if(msg != NULL) { + payload = (char *)msg; + printk(" Payload"); + for(i=0;imicpm_ctx.pm_options.pc6_enabled) { + if (set && !mic_ctx->micpm_ctx.pc6_enabled) { + mic_ctx->micpm_ctx.pc6_enabled = set; + queue_delayed_work(mic_ctx->micpm_ctx.pc6_entry_wq, + &mic_ctx->micpm_ctx.pc6_entry_work, + mic_ctx->micpm_ctx.pc6_timeout*HZ); + } + if (set == false) { + mic_ctx->micpm_ctx.pc6_enabled = set; + micpm_get_reference(mic_ctx, true); + micpm_put_reference(mic_ctx); + } + } else { + if (set) + err = -EINVAL; + else + mic_ctx->micpm_ctx.pc6_enabled = set; + } + return err; +} + +int micpm_update_pc3(mic_ctx_t *mic_ctx, bool set) +{ + int err = 0; + if (mic_ctx->micpm_ctx.pm_options.pc3_enabled) { + if (set) { + mic_ctx->micpm_ctx.pc3_enabled = set; + } else { + mic_ctx->micpm_ctx.pc3_enabled = set; + micpm_get_reference(mic_ctx, true); + micpm_put_reference(mic_ctx); + } + } else { + if (set) + err = -EINVAL; + else + mic_ctx->micpm_ctx.pc3_enabled = set; + } + return err; +} + +/* + * Wraper to scif_send that takes in the buffer to be sent + * as input. + */ +int +mic_pm_send(mic_ctx_t *mic_ctx, void *msg, uint32_t len) +{ + int err; + scif_epd_t epd; + + if(mic_ctx == NULL) { + PM_DEBUG("Mic context not Initialized\n"); + return -EINVAL; + } + + if((msg == NULL) || (len == 0)) { + PM_DEBUG("Invalid Parameters\n"); + return -EINVAL; + } + + epd = mic_ctx->micpm_ctx.pm_epd; + if(epd == NULL) { + PM_DEBUG("Scif Endpoint Undefined\n"); + return -EINVAL; + } + + if ((mic_ctx->micpm_ctx.con_state != PM_CONNECTING) && + (mic_ctx->micpm_ctx.con_state != PM_CONNECTED)) { + PM_DEBUG("Endpoint not in connected state\n"); + return -EINVAL; + } + + err = scif_send(epd, msg, len, PM_SEND_MODE); + /*scif_send returns the number of bytes returned on success */ + if(err <= 0) { + PM_DEBUG("scif_send to node: %d port: %d failed with error %d\n", + epd->peer.node, epd->peer.port, err); + } else { + PM_DEBUG("Bytes sent = %d\n",err); + err = 0; + } + + return err; +} + +/* + * Wrapper to scif_recv. 
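+ * Validates the device context, buffer and connection state, then
+ * calls scif_recv() in PM_RECV_MODE.  A return of 0 bytes is treated
+ * as a lost peer and mapped to -ENXIO; a positive byte count is
+ * collapsed to 0 (success).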
+ */ +int +mic_pm_recv(mic_ctx_t *mic_ctx, void *msg, uint32_t len) +{ + int err; + scif_epd_t epd; + + if(mic_ctx == NULL) { + PM_DEBUG("Mic context not Initialized\n"); + return -EINVAL; + } + + if((msg == NULL) || (len == 0)) { + PM_DEBUG("Invalid Parameters\n"); + return -EINVAL; + } + + epd = mic_ctx->micpm_ctx.pm_epd; + if(epd == NULL) { + PM_DEBUG("Scif Endpoint Undefined\n"); + return -EINVAL; + } + + if ((mic_ctx->micpm_ctx.con_state != PM_CONNECTING) && + (mic_ctx->micpm_ctx.con_state != PM_CONNECTED)) { + PM_DEBUG("Endpoint not in connected state\n"); + return -EINVAL; + } + + err = scif_recv(epd, msg, len, PM_RECV_MODE); + + if(err <= 0) { + pr_debug("scif_recv failed with error %d\n", err); + if(err == 0) { + /*0 bytes were sent */ + err = -ENXIO; + } + } else { + PM_DEBUG("Bytes received = %d\n",err); + err = 0; + } + return err; +} + +/* + * Function to send a Power Management message over scif. Gets the message type + * as input and builds a message header. It then creates a single message buffer + * with this header and body and sends it to the receiving node. + */ +int +mic_pm_send_msg(mic_ctx_t *mic_ctx, PM_MESSAGE type, void *msg, uint32_t len) +{ + pm_msg_header header; + char *send_msg = NULL; + int err = 0; + + header.opcode = type; + header.len = len; + + send_msg = kmalloc(len + sizeof(pm_msg_header), GFP_KERNEL); + if(send_msg == NULL) { + PM_DEBUG("error allocating memory"); + err = -ENOMEM; + return err; + } + memcpy(send_msg , &header, sizeof(pm_msg_header)); + if((len != 0) && (msg != NULL)) { + memcpy((send_msg + sizeof(pm_msg_header)), msg, len); + } + + if(mic_data.dd_pm.enable_pm_logging) { + if((len != 0) && (msg != NULL)) + micpm_display_message(mic_ctx,send_msg,send_msg+sizeof(pm_msg_header),"SENT"); + else + micpm_display_message(mic_ctx,send_msg,NULL,"SENT"); + } + err = mic_pm_send(mic_ctx, send_msg, len + sizeof(pm_msg_header)); + kfree(send_msg); + return err; +} + +/* + * Handler invoked when receiving a PC3 ready message. + */ +int +handle_pc3_ready(mic_ctx_t *mic_ctx) +{ + int err = 0; + PM_ENTRY; + err = pm_pc3_entry(mic_ctx); + PM_EXIT; + return err; +} + +/* + * Handler invoked when receiving the latency response message + */ +int +handle_open_ack(mic_ctx_t *mic_ctx, pm_msg_pm_options *msg) +{ + int err = 0; + PM_ENTRY; + + if ((mic_ctx == NULL) || (msg == NULL)) { + err = EINVAL; + goto inval; + } + + if ((msg->version.major_version != PM_MAJOR_VERSION) || + (msg->version.minor_version != PM_MINOR_VERSION)) { + printk(KERN_ERR "PM Driver version mismatch. " + "Expected version: %d.%d Received version %d.%d\n", + PM_MAJOR_VERSION, PM_MINOR_VERSION, + msg->version.major_version, msg->version.minor_version); + schedule_work(&mic_ctx->micpm_ctx.pm_close); + goto inval; + } + + mic_ctx->micpm_ctx.pm_options.pc3_enabled = msg->pc3_enabled; + mic_ctx->micpm_ctx.pm_options.pc6_enabled = msg->pc6_enabled; + + mic_ctx->micpm_ctx.pc3_enabled = + (mic_ctx->micpm_ctx.pm_options.pc3_enabled)? true : false; + mic_ctx->micpm_ctx.pc6_enabled = + (mic_ctx->micpm_ctx.pm_options.pc6_enabled)? true : false; + + mic_ctx->micpm_ctx.con_state = PM_CONNECTED; + +inval: + PM_EXIT; + return err; +} + +/* + * Message handler invoked by the per device receive workqueue when it receives + * a message from the device. 
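+ * Dispatches on msg_header.opcode: PM_MESSAGE_PC3READY is handed to
+ * handle_pc3_ready(), PM_MESSAGE_OPEN_ACK is length-checked against
+ * sizeof(pm_msg_pm_options) before handle_open_ack(), and any other
+ * opcode is logged and dropped.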
+ */ +int +mic_pm_handle_message(mic_ctx_t *mic_ctx, pm_recv_msg_t *recv_msg) +{ + int res = 0; + + if(mic_ctx == NULL) { + return -EINVAL; + } + + if(recv_msg == NULL) { + PM_DEBUG("Undefined message\n"); + return -EINVAL; + } + + switch(recv_msg->msg_header.opcode) { + case PM_MESSAGE_PC3READY: + res = handle_pc3_ready(mic_ctx); + break; + case PM_MESSAGE_OPEN_ACK: + /*Size of the payload needs to be equal to what the + * host is trying to cast it to + */ + if (sizeof(pm_msg_pm_options) != recv_msg->msg_header.len) { + printk(KERN_ERR "Incompatible PM message. Opcode = %d\n", + recv_msg->msg_header.opcode); + return -EINVAL; + } + res = handle_open_ack(mic_ctx, + ((pm_msg_pm_options *) recv_msg->msg_body)); + break; + default: + printk(KERN_ERR "Unknown PM message. Opcode = %d\n", + recv_msg->msg_header.opcode); + break; + } + return res; +} + +/* + * retrieve_msg: + * + * Retrieve message from the head of list. + * @mic_ctx: The device context + * Returns the retrieved message. + */ +pm_recv_msg_t * +pm_retrieve_msg(mic_ctx_t *mic_ctx) { + + pm_recv_msg_t *recv_msg = NULL; + struct list_head *pos, *tmpq; + bool msg_found = false; + + mutex_lock(&mic_ctx->micpm_ctx.msg_mutex); + if (!list_empty_careful(&mic_ctx->micpm_ctx.msg_list)) + { + list_for_each_safe(pos, tmpq, &mic_ctx->micpm_ctx.msg_list) { + recv_msg = list_entry(pos, pm_recv_msg_t, msg); + /*Do not touch the message if its a test message */ + if (recv_msg->msg_header.opcode != PM_MESSAGE_TEST) { + list_del(&recv_msg->msg); + msg_found = true; + break; + } + } + } + + if (msg_found == false) + recv_msg = NULL; + + mutex_unlock(&mic_ctx->micpm_ctx.msg_mutex); + return recv_msg; +} + +/* + * pm_process_msg_list: + * + * Process the message list of a node and handle each message in the list. + * @mic_ctx[in]: The deive context whose message list is to be processed + * Returns: None + */ +void +pm_process_msg_list(mic_ctx_t *mic_ctx) { + + pm_recv_msg_t *process_msg = NULL; + int ret = 0; + + if(mic_ctx == NULL) { + PM_DEBUG("Cannot get device handle \n"); + return; + } + + while(!list_empty(&mic_ctx->micpm_ctx.msg_list)) { + process_msg = pm_retrieve_msg(mic_ctx); + if(!process_msg) { + PM_DEBUG("No Message to process.\n"); + return; + } + + ret = mic_pm_handle_message(mic_ctx, process_msg); + if(ret) { + PM_DEBUG("Power Management message not processed" + " successfully.\n"); + } + + if(process_msg->msg_body != NULL) { + kfree(process_msg->msg_body); + } + kfree(process_msg); + } +} + +/* + * Retrieves each message from the message list and calls the handler + * for the same. After the handler returns, the message is removed + * from the list and deleted. + */ +static void +mic_pm_msg_handle_work(struct work_struct *msg_handle_work) +{ + pm_wq_t *pm_wq = container_of(msg_handle_work, pm_wq_t, work); + micpm_ctx_t *pm_ctx = container_of(pm_wq, micpm_ctx_t, handle_msg); + mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx); + pm_process_msg_list(mic_ctx); + return; +} + +static void +pc6_entry_work(struct work_struct *work) +{ + int err; + micpm_ctx_t *pm_ctx = + container_of(to_delayed_work(work), + micpm_ctx_t, pc6_entry_work); + mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx); + + err = pc6_entry_start(mic_ctx); + if (err == -EAGAIN) + queue_delayed_work(mic_ctx->micpm_ctx.pc6_entry_wq, + &mic_ctx->micpm_ctx.pc6_entry_work, + mic_ctx->micpm_ctx.pc6_timeout*HZ); + return; +} + +/* + * Called when a device creates a PM connection to Host. 
There can be + * only one PM connection between Host and a device. The function checks + * for an existing connection and rejects this new request if present. + */ +static void +mic_pm_accept_work(struct work_struct *work) +{ + scif_epd_t newepd; + struct scif_portID portID; + int err; + uint16_t i; + mic_ctx_t *mic_ctx; + mic_data_t *mic_data_p = &mic_data; + + PM_DEBUG("Accept thread waiting for new PM connections\n"); + err = scif_accept(mic_data.dd_pm.epd, &portID, &newepd, SCIF_ACCEPT_SYNC); + if (err == -EBUSY || err == -ENODEV) { + PM_DEBUG("scif_accept error %d\n", err); + goto continue_accepting; + } + else if (err < 0) { + PM_DEBUG("scif_accept failed with errno %d\n", err); + goto exit; + + } + PM_DEBUG("Connection request received. \n"); + + mutex_lock(&mic_data.dd_pm.pm_accept_mutex); + + if (newepd->peer.node == SCIF_HOST_NODE) { + /* Reject connection request from HOST itself */ + PM_DEBUG("PM: Peer node cannot be HOST. Peer Node = %d Peer Port = %d", + newepd->peer.node, newepd->peer.port); + scif_close(newepd); + mutex_unlock(&mic_data.dd_pm.pm_accept_mutex); + goto continue_accepting; + } + + /*Only one Power Management connection per node. */ + for (i = 0; i < mic_data_p->dd_numdevs; i++) { + mic_ctx = get_per_dev_ctx(i); + if (mic_ctx != NULL) { + if (mic_ctx->micpm_ctx.pm_epd != NULL) { + if (mic_ctx->micpm_ctx.pm_epd->peer.node == newepd->peer.node) { + PM_DEBUG("There is already Power Management connection" + " established from this node. Rejecting request.\n"); + PM_DEBUG("Peer Node = %d, Peer Port = %d\n", + mic_ctx->micpm_ctx.pm_epd->peer.node, + mic_ctx->micpm_ctx.pm_epd->peer.port); + scif_close(newepd); + mutex_unlock(&mic_data.dd_pm.pm_accept_mutex); + goto continue_accepting; + } + } + } + + } + mutex_unlock(&mic_data.dd_pm.pm_accept_mutex); + mic_ctx = get_per_dev_ctx(newepd->peer.node -1); + mic_ctx->micpm_ctx.pm_epd = newepd; + micpm_start(mic_ctx); + + +continue_accepting: + mutex_lock(&mic_data.dd_pm.pm_accept_mutex); + queue_work(mic_data.dd_pm.accept.wq, + &mic_data.dd_pm.accept.work); + mutex_unlock(&mic_data.dd_pm.pm_accept_mutex); +exit: + return; +} + +/* + * Work item function that waits for incoming PM messages from + * a node. The function adds the message to a per device message + * list that is later processed by the message handler. + */ +static void +mic_pm_recv_work(struct work_struct *recv_work) +{ + int err = 0; + int size = 0; + + pm_wq_t *pm_wq = container_of(recv_work, pm_wq_t, work); + micpm_ctx_t *pm_ctx = container_of(pm_wq, micpm_ctx_t, recv); + mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx); + pm_recv_msg_t *recv_msg = NULL; + + if (mic_ctx == NULL || pm_ctx == NULL) { + PM_DEBUG("Error retrieving driver context \n"); + goto unqueue; + } + + size = sizeof(pm_msg_header); + recv_msg = (void *)kmalloc(sizeof(pm_recv_msg_t), GFP_KERNEL); + + if (recv_msg == NULL) { + PM_DEBUG("Error allocating memory to save receive message.\n"); + goto unqueue; + } + INIT_LIST_HEAD(&recv_msg->msg); + recv_msg->msg_body = NULL; + + /*Get the header */ + err = mic_pm_recv(mic_ctx, &recv_msg->msg_header, size); + if (err < 0) { + PM_DEBUG("Error in scif_recv while waiting for PM header message.\n"); + if (err == -ECONNRESET) { + /*Remote node is not in a connected state. 
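+			 * The peer endpoint has gone away, so queue the
+			 * pm_close work to tear down this connection.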
*/ + schedule_work(&mic_ctx->micpm_ctx.pm_close); + } + goto unqueue; + + } + + if(recv_msg->msg_header.len != 0) { + PM_DEBUG("Retrieving %d bytes of message body\n", recv_msg->msg_header.len); + recv_msg->msg_body = (void *)kmalloc((sizeof(char) * recv_msg->msg_header.len), GFP_KERNEL); + if (recv_msg->msg_body == NULL) { + PM_DEBUG("Error allocating memory to receive PM Message\n"); + goto unqueue; + } + err = mic_pm_recv(mic_ctx, recv_msg->msg_body, recv_msg->msg_header.len); + if (err < 0) { + PM_DEBUG("Error in scif_recv while waiting for PM message body\n"); + if (err == -ECONNRESET) { + /*Remote node is not in a connected state. */ + schedule_work(&mic_ctx->micpm_ctx.pm_close); + } + goto unqueue; + } + } + + if(mic_data.dd_pm.enable_pm_logging) { + micpm_display_message(mic_ctx,&recv_msg->msg_header, + recv_msg->msg_body,"RECV"); + } + + if ((recv_msg->msg_header.opcode != PM_MESSAGE_CLOSE) && + ((recv_msg->msg_header.opcode != PM_MESSAGE_CLOSE_ACK))){ + PM_DEBUG("Adding received message from node %d to list.\n", + mic_ctx->bi_id+1); + mutex_lock(&mic_ctx->micpm_ctx.msg_mutex); + list_add_tail(&recv_msg->msg , &mic_ctx->micpm_ctx.msg_list); + mutex_unlock(&mic_ctx->micpm_ctx.msg_mutex); + + if(likely(recv_msg->msg_header.opcode != PM_MESSAGE_TEST)) { + PM_DEBUG("Queue message handler work for node: %d\n",mic_ctx->bi_id+1); + queue_work(mic_ctx->micpm_ctx.handle_msg.wq, + &mic_ctx->micpm_ctx.handle_msg.work); + } + + queue_work(mic_ctx->micpm_ctx.recv.wq, + &mic_ctx->micpm_ctx.recv.work); + } else { + + if (recv_msg->msg_header.opcode == PM_MESSAGE_CLOSE) { + mic_pm_send_msg(mic_ctx , PM_MESSAGE_CLOSE_ACK, NULL, 0); + mic_ctx->micpm_ctx.con_state = PM_DISCONNECTING; + schedule_work(&mic_ctx->micpm_ctx.pm_close); + } else { + mic_ctx->micpm_ctx.con_state = PM_DISCONNECTING; + wake_up(&mic_ctx->micpm_ctx.disc_wq); + } + goto unqueue; + } + return; +unqueue: + if (recv_msg) { + if (recv_msg->msg_body) + kfree(recv_msg->msg_body); + kfree(recv_msg); + } + return; +} + +/* + * Work item to handle closing of PM end point to a device and all the + * related receive workqueues. + */ +static void +mic_pm_close_work(struct work_struct *work) +{ + micpm_ctx_t *pm_ctx = container_of(work, micpm_ctx_t, pm_close); + mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx); + micpm_stop(mic_ctx); + return; +} + +static void +mic_pm_resume_work(struct work_struct *resume_work) +{ + int err; + pm_wq_t *pm_wq = container_of(resume_work, pm_wq_t, work); + micpm_ctx_t *pm_ctx = container_of(pm_wq, micpm_ctx_t, resume); + mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx); + + if (mic_ctx != NULL) { + err = pm_start_device(mic_ctx); + if (err) { + PM_DEBUG("Failed to start device %d after resume\n", + mic_ctx->bi_id); + } + } else { + PM_DEBUG("Error retrieving node context.\n"); + } +} + +/* Create PM specific workqueues during driver probe. + * + * Receive workqueue will store the received message and kick-off + * a message handler workqueue which will process them. + * + * Resume workqueue handles the task of booting uOS rduring + * OSPM resume/restore phase. 
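+ * Four single-threaded workqueues are created per device: resume,
+ * receive, message-handler and PC6-entry.  The corresponding work
+ * items, including the delayed PC6 entry work and the pm_close work,
+ * are initialized here as well.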
+ */ +int +setup_pm_workqueues(mic_ctx_t *mic_ctx) +{ + int err = 0; + + if(!mic_ctx) { + PM_DEBUG("Failed to retrieve device context\n"); + err = -EINVAL; + goto err; + } + + /* setup resume wq */ + snprintf(mic_ctx->micpm_ctx.resume.wq_name, + sizeof(mic_ctx->micpm_ctx.resume.wq_name), + "PM_RESUME_WQ %d", mic_get_scifnode_id(mic_ctx)); + + if (!(mic_ctx->micpm_ctx.resume.wq + = __mic_create_singlethread_workqueue( + mic_ctx->micpm_ctx.resume.wq_name))) { + err = -ENOMEM; + goto err; + } + + /* Setup Receive wq */ + snprintf(mic_ctx->micpm_ctx.recv.wq_name, + sizeof(mic_ctx->micpm_ctx.recv.wq_name), + "RECV_WORK_Q %d", mic_get_scifnode_id(mic_ctx)); + + if (!(mic_ctx->micpm_ctx.recv.wq + = __mic_create_singlethread_workqueue( + mic_ctx->micpm_ctx.recv.wq_name))) { + err = -ENOMEM; + goto err; + } + + /* Setup Msg handler wq */ + snprintf(mic_ctx->micpm_ctx.handle_msg.wq_name, + sizeof(mic_ctx->micpm_ctx.handle_msg.wq_name), + "MSG_HANDLER_WQ %d", mic_get_scifnode_id(mic_ctx)); + + if (!(mic_ctx->micpm_ctx.handle_msg.wq + = __mic_create_singlethread_workqueue( + mic_ctx->micpm_ctx.handle_msg.wq_name))) { + err = -ENOMEM; + goto err; + } + + /* Setup pc6 entry wq */ + snprintf(mic_ctx->micpm_ctx.pc6_wq_name, + sizeof(mic_ctx->micpm_ctx.pc6_wq_name), + "PC6_WORK_Q %d", mic_get_scifnode_id(mic_ctx)); + + if (!(mic_ctx->micpm_ctx.pc6_entry_wq + = __mic_create_singlethread_workqueue( + mic_ctx->micpm_ctx.pc6_wq_name))) { + err = -ENOMEM; + goto err; + } + INIT_WORK(&mic_ctx->micpm_ctx.recv.work, mic_pm_recv_work); + INIT_WORK(&mic_ctx->micpm_ctx.handle_msg.work, mic_pm_msg_handle_work); + INIT_WORK(&mic_ctx->micpm_ctx.pm_close, mic_pm_close_work); + INIT_WORK(&mic_ctx->micpm_ctx.resume.work, mic_pm_resume_work); + INIT_DELAYED_WORK(&mic_ctx->micpm_ctx.pc6_entry_work, pc6_entry_work); + +err: + return err; +} +/*Power Management Initialization function. Sets up SCIF + * end points and accept threads. + */ +int micpm_init() +{ + scif_epd_t epd; + int con_port; + int err = 0; + + epd = scif_open(); + if (epd == SCIF_OPEN_FAILED || epd == NULL) { + PM_DEBUG("scif_open failed\n"); + return -1; + } + + if ((con_port = scif_bind(epd, SCIF_PM_PORT_0)) < 0) { + PM_DEBUG("scif_bind to port failed with error %d\n", con_port); + err = con_port; + goto exit_close; + } + + /*No real upper limit on number of connections. + Once scif_listen accepts 0 as an acceptable parameter for max + connections(to mean tht there is no upper limit), change this. 
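+ A backlog of 100 is passed to scif_listen() below as a stand-in for
+ an unlimited number of connections.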
*/ + if ((err = scif_listen(epd, 100)) < 0) { + PM_DEBUG("Listen ioctl failed with error %d\n", err); + goto exit_close; + } + mic_data.dd_pm.epd = epd; + + snprintf(mic_data.dd_pm.accept.wq_name, + sizeof(mic_data.dd_pm.accept.wq_name),"PM ACCEPT"); + + mic_data.dd_pm.accept.wq = + __mic_create_singlethread_workqueue(mic_data.dd_pm.accept.wq_name); + if (!mic_data.dd_pm.accept.wq){ + err = -ENOMEM; + PM_DEBUG("create workqueue returned null\n"); + goto exit_close; + } + INIT_WORK(&mic_data.dd_pm.accept.work, mic_pm_accept_work); + mutex_init (&mic_data.dd_pm.pm_accept_mutex); + mutex_init (&mic_data.dd_pm.pm_idle_mutex); + atomic_set(&mic_data.dd_pm.connected_clients, 0); + + /*Add work to the work queue */ + queue_work(mic_data.dd_pm.accept.wq, + &mic_data.dd_pm.accept.work); + mic_data.dd_pm.enable_pm_logging = 0; + atomic_set(&mic_data.dd_pm.wakeup_in_progress, 0); + + micpm_dbg_parent_init(); + + return err; + +exit_close: + scif_close(epd); + return err; +} + +/* + * Close the SCIF acceptor endpoint and uninit a lot of driver level + * data structures including accept threads, + */ +void +micpm_uninit(void) +{ + int err; + scif_epd_t epd = mic_data.dd_pm.epd; + + if(atomic_read(&mic_data.dd_pm.connected_clients) > 0) { + PM_DEBUG("connected_clients is nonzero (%d)\n", + atomic_read(&mic_data.dd_pm.connected_clients)); + } + err = scif_close(epd); + if (err != 0) { + PM_DEBUG("Scif_close failed with error %d\n",err); + } + + if (mic_data.dd_pm.accept.wq != NULL) { + PM_DEBUG("Flushing accept workqueue\n"); + flush_workqueue(mic_data.dd_pm.accept.wq); + destroy_workqueue(mic_data.dd_pm.accept.wq); + mic_data.dd_pm.accept.wq = NULL; + } + + mutex_destroy(&mic_data.dd_pm.pm_accept_mutex); + mutex_destroy(&mic_data.dd_pm.pm_idle_mutex); + + debugfs_remove_recursive(mic_data.dd_pm.pmdbgparent_dir); + +} + +/* + * Open the Per device Power Management context. 
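+ * micpm_probe() seeds the per-device micpm_ctx (endpoint, idle state,
+ * PC3/PC6 flags), creates the PM workqueues, initializes the message
+ * list and PM reference count, and registers the per-device debugfs
+ * entries.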
+ */ +int +micpm_probe(mic_ctx_t * mic_ctx) { + + int err = 0; + + mic_ctx->micpm_ctx.pm_epd = NULL; + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0; + mic_ctx->micpm_ctx.recv.wq = NULL; + mic_ctx->micpm_ctx.handle_msg.wq = NULL; + mic_ctx->micpm_ctx.mic_suspend_state = MIC_RESET; + mic_ctx->micpm_ctx.pc3_enabled = true; + mic_ctx->micpm_ctx.pc6_enabled = true; + mic_ctx->micpm_ctx.pm_options.pc3_enabled = 0; + mic_ctx->micpm_ctx.pm_options.pc6_enabled = 0; + + if ((err = setup_pm_workqueues(mic_ctx))) + goto err; + + mutex_init (&mic_ctx->micpm_ctx.msg_mutex); + INIT_LIST_HEAD(&mic_ctx->micpm_ctx.msg_list); + init_waitqueue_head(&mic_ctx->micpm_ctx.disc_wq); + atomic_set(&mic_ctx->micpm_ctx.pm_ref_cnt, 0); + mic_ctx->micpm_ctx.pc6_timeout = PC6_TIMER; + + /* create debugfs entries*/ + micpm_dbg_init(mic_ctx); + +err: + return err; +} + +int +micpm_remove(mic_ctx_t * mic_ctx) { + + debugfs_remove_recursive(mic_ctx->micpm_ctx.pmdbg_dir); + + if (mic_ctx->micpm_ctx.resume.wq != NULL) { + destroy_workqueue(mic_ctx->micpm_ctx.resume.wq); + mic_ctx->micpm_ctx.resume.wq = NULL; + } + + if(mic_ctx->micpm_ctx.pc6_entry_wq != NULL) { + destroy_workqueue(mic_ctx->micpm_ctx.pc6_entry_wq); + mic_ctx->micpm_ctx.pc6_entry_wq = NULL; + } + + if(mic_ctx->micpm_ctx.recv.wq != NULL) { + destroy_workqueue(mic_ctx->micpm_ctx.recv.wq); + mic_ctx->micpm_ctx.recv.wq = NULL; + } + + if(mic_ctx->micpm_ctx.handle_msg.wq != NULL) { + destroy_workqueue(mic_ctx->micpm_ctx.handle_msg.wq); + mic_ctx->micpm_ctx.handle_msg.wq = NULL; + } + + micpm_nodemask_uninit(mic_ctx); + + mutex_destroy(&mic_ctx->micpm_ctx.msg_mutex); + return 0; +} + +int +micpm_start(mic_ctx_t *mic_ctx) { + + int ref_cnt; + mic_ctx->micpm_ctx.con_state = PM_CONNECTING; + + /* queue receiver */ + queue_work(mic_ctx->micpm_ctx.recv.wq, + &mic_ctx->micpm_ctx.recv.work); + + atomic_inc(&mic_data.dd_pm.connected_clients); + if ((ref_cnt = atomic_read(&mic_ctx->micpm_ctx.pm_ref_cnt))) + printk("Warning: PM ref_cnt is non-zero during start. " + "ref_cnt = %d PM features may not work as expected\n", + ref_cnt); + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0; + set_host_state(mic_ctx, PM_IDLE_STATE_PC0); + return mic_pm_send_msg(mic_ctx , PM_MESSAGE_OPEN, NULL, 0); +} + +/* + * Close the per device Power management context here. + * It does various things such as: closing scif endpoints, + * delete pending work items and wait for those that are + * executing to complete, delete pending messages in the + * message list, delete pending timers and wait for runnig + * timers to complete. The function can block. + */ +int +micpm_stop(mic_ctx_t *mic_ctx) { + + int err = 0; + int node_lost = 0; + if(mic_ctx == NULL) { + PM_DEBUG("Mic context not Initialized\n"); + return -EINVAL; + } + + if ((micpm_get_reference(mic_ctx, true))) { + PM_DEBUG("get_reference failed. 
Node may be lost\n"); + node_lost = 1; + } + + mutex_lock(&mic_data.dd_pm.pm_accept_mutex); + if ((mic_ctx->micpm_ctx.con_state == PM_CONNECTED) && + (mic_ctx->state != MIC_LOST)) { + if (!mic_pm_send_msg(mic_ctx, PM_MESSAGE_CLOSE, NULL, 0)) { + err = wait_event_timeout( + mic_ctx->micpm_ctx.disc_wq, + mic_ctx->micpm_ctx.con_state == PM_DISCONNECTING, + NODE_ALIVE_TIMEOUT); + if (!err) { + PM_DEBUG("Timed out waiting CLOSE ACK" + " from node.\n"); + } + } + } + + if(mic_ctx->micpm_ctx.pm_epd != NULL) { + PM_DEBUG("Power Management: Closing connection to" + " node: %d port:%d\n", mic_ctx->micpm_ctx.pm_epd->peer.node, + mic_ctx->micpm_ctx.pm_epd->peer.port); + err = scif_close(mic_ctx->micpm_ctx.pm_epd); + if(err!= 0) + PM_DEBUG("Scif_close failed with error %d\n",err); + mic_ctx->micpm_ctx.pm_epd = NULL; + micpm_decrement_clients(); + } + mic_ctx->micpm_ctx.con_state = PM_DISCONNECTED; + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0; + flush_workqueue(mic_ctx->micpm_ctx.resume.wq); + flush_workqueue(mic_ctx->micpm_ctx.recv.wq); + flush_workqueue(mic_ctx->micpm_ctx.handle_msg.wq); + cancel_delayed_work_sync(&mic_ctx->micpm_ctx.pc6_entry_work); + + /* Process messages in message queue */ + pm_process_msg_list(mic_ctx); + + if (!node_lost) + micpm_put_reference(mic_ctx); + mutex_unlock(&mic_data.dd_pm.pm_accept_mutex); + return err; +} + +/* + * Function to load the uOS and start all the driver components + * after a resume/restore operation + */ +int +pm_start_device(mic_ctx_t *mic_ctx) +{ + if (!mic_ctx) { + PM_DEBUG("Error retreving driver context\n"); + return 0; + } + + PM_DEBUG("Resume MIC device:%d\n", mic_ctx->bi_id); + /* Make sure the Power reset during Resume/Restore is complete*/ + adapter_wait_reset(mic_ctx); + wait_for_reset(mic_ctx); + + /*Perform software reset */ + adapter_reset(mic_ctx, RESET_WAIT, !RESET_REATTEMPT); + wait_for_reset(mic_ctx); + + /* Boot uOS only if it was online before suspend */ + if (MIC_ONLINE == mic_ctx->micpm_ctx.mic_suspend_state) { + if(adapter_start_device(mic_ctx)) { + PM_DEBUG("booting uos... failed\n"); + } + } + + return 0; +} + +/* + * Function to stop all the driver components and unload the uOS + * during a suspend/hibernate operation + */ +int +pm_stop_device(mic_ctx_t *mic_ctx) +{ + if (!mic_ctx) { + PM_DEBUG("Error retreving driver context\n"); + return 0; + } + + mic_ctx->micpm_ctx.mic_suspend_state = mic_ctx->state; + + PM_DEBUG("Suspend MIC device:#%d\n", mic_ctx->bi_id); + if (MIC_ONLINE == mic_ctx->micpm_ctx.mic_suspend_state) { + adapter_shutdown_device(mic_ctx); + if (!wait_for_shutdown_and_reset(mic_ctx)) { + /* Shutdown failed. Fall back on forced reset */ + adapter_stop_device(mic_ctx, RESET_WAIT, !RESET_REATTEMPT); + wait_for_reset(mic_ctx); + } + } + else { + /* If card is in any state but ONLINE, make sure card stops */ + adapter_stop_device(mic_ctx, RESET_WAIT, !RESET_REATTEMPT); + wait_for_reset(mic_ctx); + } + + mutex_lock(&mic_ctx->state_lock); + mic_ctx->state = MIC_RESET; + mutex_unlock(&mic_ctx->state_lock); + return 0; +} diff --git a/host/pm_ioctl.c b/host/pm_ioctl.c new file mode 100644 index 0000000..139d820 --- /dev/null +++ b/host/pm_ioctl.c @@ -0,0 +1,603 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* contains code to handle MIC IO control codes */ + + +#include "mic_common.h" +#include + +/* helper methods for debugging/unit testing /*/ +static int check_test_msg(mic_ctx_t *mic_ctx, void *buf, uint32_t len); + + + +#define PM_MMIO_REGVALUE_GET(_name, _offset) \ +int get_##_name(void *data, uint64_t *value) \ +{ \ + uint64_t bid; \ + mic_ctx_t *mic_ctx; \ + \ + bid = (uint64_t)data; \ + if (bid >= mic_data.dd_numdevs) { \ + return -EINVAL; \ + } \ + mic_ctx = get_per_dev_ctx(bid); \ + if (!mic_ctx) { \ + printk("DD"); \ + return -EINVAL; \ + } \ + \ + *value = pm_reg_read(mic_ctx, _offset); \ + return 0; \ +} \ +DEFINE_SIMPLE_ATTRIBUTE(fops_##_name, get_##_name, NULL, "%llu"); \ + +static PM_MMIO_REGVALUE_GET(svidctrl, SBOX_SVID_CONTROL); +static PM_MMIO_REGVALUE_GET(pcuctrl, SBOX_PCU_CONTROL); +static PM_MMIO_REGVALUE_GET(hoststate,SBOX_HOST_PMSTATE); +static PM_MMIO_REGVALUE_GET(cardstate, SBOX_UOS_PMSTATE); +static PM_MMIO_REGVALUE_GET(wtimer, SBOX_C3WAKEUP_TIMER); +static PM_MMIO_REGVALUE_GET(gpmctrl, GBOX_PM_CTRL); +static PM_MMIO_REGVALUE_GET(core_volt, SBOX_COREVOLT); +static PM_MMIO_REGVALUE_GET(uos_pcuctrl, SBOX_UOS_PCUCONTROL); + +static int depgraph_j2i_show(struct seq_file *s, void *pos) +{ + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int i, j; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + seq_printf(s,"=================================================================\n"); + seq_printf(s,"%-10s |%-25s\n", "Scif Node" , "dependent nodes"); + seq_printf(s,"=================================================================\n"); + + for ( i = 0; i <= ms_info.mi_maxid; i++) { + seq_printf(s, "%-10d |", i); + for (j = 0; j <= ms_info.mi_maxid; j++) { + switch(ms_info.mi_depmtrx[j][i]) { + case DEP_STATE_DEPENDENT: + { + /* (A) - active dependency on node i */ + seq_printf(s, "%d(A),", j); + break; + } + case DEP_STATE_DISCONNECT_READY: + { + /* (R) - node j has sent PC6 ready message to the 
host + * dependency is not active so node i can go idle + */ + seq_printf(s, "%d(R),", j); + break; + } + case DEP_STATE_DISCONNECTED: + { + /* (D) - node j is in idle state. + * dependency is not active so node i can go idle + */ + seq_printf(s, "%d(D),", j); + break; + } + } + } + seq_printf(s,"\n=================================================================\n"); + } + + return 0; +} + +static int depgraph_j2i_open(struct inode *inode, struct file *file) +{ + return single_open(file, depgraph_j2i_show, inode->i_private); +} + +static struct file_operations depgraph_j2i_file_ops = { + .owner = THIS_MODULE, + .open = depgraph_j2i_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int depgraph_i2j_show(struct seq_file *s, void *pos) +{ + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int i, j; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + seq_printf(s,"=================================================================\n"); + seq_printf(s,"%-10s |%-25s\n", "Scif Node" , "is dependent on Nodes"); + seq_printf(s,"=================================================================\n"); + + for ( i = 0; i <= ms_info.mi_maxid; i++) { + seq_printf(s, "%-10d |", i); + for (j = 0; j <= ms_info.mi_maxid; j++) { + switch(ms_info.mi_depmtrx[i][j]) { + case DEP_STATE_DEPENDENT: + { + /* (A) - active dependency on node j */ + seq_printf(s, "%d(A),", j); + break; + } + case DEP_STATE_DISCONNECT_READY: + { + /* (R) - node j has sent PC6 ready message to the host */ + seq_printf(s, "%d(R),", j); + break; + } + case DEP_STATE_DISCONNECTED: + { + /* (D) - node j is in idle state. + * This should not happen unless node i itself is in idle state + */ + seq_printf(s, "%d(D),", j); + break; + } + } + } + seq_printf(s,"\n=================================================================\n"); + } + + return 0; +} + +static int depgraph_i2j_open(struct inode *inode, struct file *file) +{ + return single_open(file, depgraph_i2j_show, inode->i_private); +} + +static struct file_operations depgraph_i2j_file_ops = { + .owner = THIS_MODULE, + .open = depgraph_i2j_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int connection_info_show(struct seq_file *s, void *pos) { + + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int count = 0; + struct list_head *position, *tmpq; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + seq_printf(s,"=========================================================================\n"); + if(mic_ctx->micpm_ctx.pm_epd != NULL) { + seq_printf(s, "%-35s | %35d\n", "Local Node", mic_ctx->micpm_ctx.pm_epd->port.node); + seq_printf(s, "%-35s | %35d\n", "Local Port", mic_ctx->micpm_ctx.pm_epd->port.port); + seq_printf(s, "%-35s | %35d\n", "Remote Node", mic_ctx->micpm_ctx.pm_epd->peer.node); + seq_printf(s, "%-35s | %35d\n", "Remote Port", mic_ctx->micpm_ctx.pm_epd->peer.port); + seq_printf(s, "%-35s | %35d\n", "Connection state", mic_ctx->micpm_ctx.pm_epd->state); + if(!list_empty(&mic_ctx->micpm_ctx.msg_list)) { + list_for_each_safe(position, tmpq, &mic_ctx->micpm_ctx.msg_list) { + count++; + } + } else { + count = 0; + } + seq_printf(s, "%-35s | %35d\n", "Messages in queue", count); + } else { + seq_printf(s, "%s\n", "No PM connection found"); + } + seq_printf(s,"=========================================================================\n"); + + return 0; +} + +static int connection_info_open(struct inode *inode, struct file *file) 
+{ + return single_open(file, connection_info_show, inode->i_private); +} + +static struct file_operations connection_info_file_ops = { + .owner = THIS_MODULE, + .open = connection_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int active_set_show(struct seq_file *s, void *pos) { + + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int i, j = 0; + uint8_t *nodemask; + uint8_t *temp_buf_ptr; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + nodemask = (uint8_t*) kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL); + if (!nodemask) { + seq_printf(s, "%s\n", "Cannot allocate buffer"); + return 0; + } + + if ((micscif_get_activeset(mic_ctx->bi_id + 1, nodemask))) { + seq_printf(s, "%s\n", "Cannot calculate activation set"); + kfree(nodemask); + return 0; + } + + seq_printf(s, "%s\n", "Nodes in activation set:"); + temp_buf_ptr = nodemask; + for ( i = 0; i < mic_ctx->micpm_ctx.nodemask.len; i++) { + temp_buf_ptr = nodemask + i; + for (j = 0; j < 8; j++) { + if (*temp_buf_ptr & (1ULL << j)) + seq_printf(s, "%d ", j + (i * 8)); + } + } + seq_printf(s, "\n"); + kfree(nodemask); + return 0; +} + +static int active_set_open(struct inode *inode, struct file *file) +{ + return single_open(file, active_set_show, inode->i_private); +} + +static struct file_operations activation_set_file_ops = { + .owner = THIS_MODULE, + .open = active_set_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int deactive_set_show(struct seq_file *s, void *pos) { + + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int i, j; + uint8_t *nodemask; + uint8_t *temp_buf_ptr; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + nodemask = (uint8_t*) kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL); + if (!nodemask) { + seq_printf(s, "%s\n", "Cannot allocate buffer"); + return 0; + } + + if ((micscif_get_deactiveset(mic_ctx->bi_id +1, nodemask, 1))) { + seq_printf(s, "%s\n", "Cannot calculate activation set"); + kfree(nodemask); + return 0; + } + + seq_printf(s, "%s\n", "Nodes in deactivation set:"); + temp_buf_ptr = nodemask; + for ( i = 0; i < mic_ctx->micpm_ctx.nodemask.len; i++) { + temp_buf_ptr = nodemask + i; + for (j = 0; j < 8; j++) { + if (*temp_buf_ptr & (1ULL << j)) + seq_printf(s, "%d ", j + (i * 8)); + } + } + seq_printf(s, "\n"); + kfree(nodemask); + return 0; +} + +static int deactive_set_open(struct inode *inode, struct file *file) +{ + return single_open(file, deactive_set_show, inode->i_private); +} + +static struct file_operations deactivation_set_file_ops = { + .owner = THIS_MODULE, + .open = deactive_set_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int ospm_restart_show(struct seq_file *s, void *pos) { + + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int err; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + err = pm_stop_device(mic_ctx); + if(err) { + seq_printf(s, "%s:%d\n", "Error calling pm_stop_device.", err); + return err; + } + + err = pm_start_device(mic_ctx); + if(err) { + seq_printf(s, "%s:%d\n", "Error calling pm_start_device.", err); + return err; + } + + return 0; +} + +static int ospm_restart_open(struct inode *inode, struct file *file) +{ + return single_open(file, ospm_restart_show, inode->i_private); +} + +static struct file_operations ospm_restart_file_ops = { + .owner = THIS_MODULE, + .open = ospm_restart_open, + .read = 
seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int testmsg_set(void *data, uint64_t value) +{ + uint64_t bid; + mic_ctx_t *mic_ctx; + int err; + + bid = (uint64_t)data; + if (bid >= mic_data.dd_numdevs) { + return -EINVAL; + } + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + if (value == 0) { + return -EINVAL; + } + + err = mic_pm_send_msg(mic_ctx ,PM_MESSAGE_TEST, PM_TEST_MSG_BODY, sizeof(PM_TEST_MSG_BODY)); + return err; +} + +static int testmsg_get(void *data, uint64_t *value) +{ + uint64_t bid; + mic_ctx_t *mic_ctx; + int err; + + bid = (uint64_t)data; + if (bid >= mic_data.dd_numdevs) { + return -EINVAL; + } + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx) { + return -EINVAL; + } + + err = check_test_msg(mic_ctx,PM_TEST_MSG_BODY, sizeof(PM_TEST_MSG_BODY)); + *value = err; + + return err; +} +DEFINE_SIMPLE_ATTRIBUTE(testmsg_fops, testmsg_get, testmsg_set, "%llu"); + +int +micpm_dbg_init(mic_ctx_t *mic_ctx) +{ + /* directory name will be in format micpmXXXXX + * so assuming the name string wont excceed 12 characters */ + const uint32_t DBG_DIRNAME_LENGTH = 12; + char pmdbg_dir_name[DBG_DIRNAME_LENGTH]; + micpm_ctx_t *micpm_ctx = &mic_ctx->micpm_ctx; + struct dentry *mmiodir; + uint64_t id = mic_ctx->bi_id; + + + if(!mic_data.dd_pm.pmdbgparent_dir) { + printk(KERN_ERR "%s: %d Parent debugfs directory does not exist.\n" + "debugfs may not be supported in kernel", __func__, __LINE__); + return -EOPNOTSUPP; + } + + snprintf(pmdbg_dir_name, sizeof(pmdbg_dir_name), "micpm%d", mic_ctx->bi_id); + micpm_ctx->pmdbg_dir = debugfs_create_dir + (pmdbg_dir_name, mic_data.dd_pm.pmdbgparent_dir); + if (!micpm_ctx->pmdbg_dir) { + printk(KERN_ERR "%s: %d Failed in creating debugfs directory\n" + "debugfs may noe be supported in kernel", __func__, __LINE__); + return -EOPNOTSUPP; + } + + /* Create debugfs entry to get/set idle state of the card known by host*/ + debugfs_create_u32("idle_state", S_IRUGO | S_IWUSR, micpm_ctx->pmdbg_dir, &micpm_ctx->idle_state); + + /* + * Create debugfs entry for sending PM_TEST_MESSAGE for testing communication to card + * set value = PM_MESSAGE_TEST to send the message to card + * get value to verfy that message was successfully sent, looped back by card and received.(0 = success) + */ + debugfs_create_file("testmsg", S_IRUGO | S_IWUSR, micpm_ctx->pmdbg_dir, (void*)id, &testmsg_fops); + + /* Create debugfs entry for showing for each node 'i' , all nodes 'j' i is dependent on */ + debugfs_create_file("depgraph_i2j", + S_IRUGO, + micpm_ctx->pmdbg_dir, + (void*)id, + &depgraph_i2j_file_ops); + + /* Create debugfs entry for showing for each node 'i', all nodes 'j' which are dependent on 'i' */ + debugfs_create_file("depgraph_j2i", + S_IRUGO, + micpm_ctx->pmdbg_dir, + (void*)id, + &depgraph_j2i_file_ops); + + /* Create debugfs entry for showing connection info for a node */ + debugfs_create_file("connection_info", + S_IRUGO, + micpm_ctx->pmdbg_dir, + (void*)id, + &connection_info_file_ops); + + /* Create debugfs entry to initiate OSPM restart for a node */ + debugfs_create_file("ospm_restart", + S_IRUGO, + micpm_ctx->pmdbg_dir, + (void*)id, + &ospm_restart_file_ops); + + /* Create debugfs entry to display activation set for a node */ + debugfs_create_file("activation_set", + S_IRUGO, + micpm_ctx->pmdbg_dir, + (void*)id, + &activation_set_file_ops); + + /* Create debugfs entry to display de-activation set for a node */ + debugfs_create_file("deactivation_set", + S_IRUGO, + micpm_ctx->pmdbg_dir, + (void*)id, + 
&deactivation_set_file_ops); + + /* Create debugfs entries for reading power management status/control register value in MMIO region */ + mmiodir = debugfs_create_dir("mmio", micpm_ctx->pmdbg_dir); + if (!mmiodir) { + printk(KERN_ERR "%s: %d Failed in creating debugfs directory\n" + "debugfs may noe be supported in kernel", __func__, __LINE__); + return -EOPNOTSUPP; + } + debugfs_create_file("svidctrl", S_IRUGO, mmiodir,(void*)id, &fops_svidctrl); + debugfs_create_file("pcuctrl", S_IRUGO, mmiodir,(void*)id, &fops_pcuctrl); + debugfs_create_file("hoststate", S_IRUGO, mmiodir,(void*)id, &fops_hoststate); + debugfs_create_file("cardstate", S_IRUGO, mmiodir,(void*)id, &fops_cardstate); + debugfs_create_file("wtimer", S_IRUGO, mmiodir,(void*)id, &fops_wtimer); + debugfs_create_file("gpmctrl", S_IRUGO, mmiodir,(void*)id, &fops_gpmctrl); + debugfs_create_file("core_volt", S_IRUGO, mmiodir,(void*)id, &fops_core_volt); + debugfs_create_file("uos_pcuctrl", S_IRUGO, mmiodir,(void*)id, &fops_uos_pcuctrl); + + return 0; +} + +void micpm_dbg_parent_init(void) { + mic_data.dd_pm.pmdbgparent_dir = debugfs_create_dir("micpm", NULL); + if (!mic_data.dd_pm.pmdbgparent_dir) { + PM_DEBUG("%s: %d Failed in creating debugfs directory\n" + "debugfs may not be supported in kernel", __func__, __LINE__); + } + + debugfs_create_u32("enable_pm_logging", S_IRUGO | S_IWUSR, + mic_data.dd_pm.pmdbgparent_dir, &mic_data.dd_pm.enable_pm_logging); + + return; +} + + +/* + * Test message is looped back to driver and lives in the message list. + * This function retrieves the message and send it to user space which + * can check if its the same message as that was sent. + */ +static int +check_test_msg(mic_ctx_t *mic_ctx, void *buf, uint32_t len) +{ + int err = -EINVAL; + pm_recv_msg_t *recv_msg = NULL; + struct list_head *pos = NULL, *tmpq = NULL; + bool msg_found = false; + + if(len != sizeof(pm_msg_unit_test)) { + pr_debug("Invalid Args: Size of buffer\n"); + return -EINVAL; + } + + mutex_lock(&mic_ctx->micpm_ctx.msg_mutex); + if(!list_empty_careful(&mic_ctx->micpm_ctx.msg_list)) { + list_for_each_safe(pos, tmpq, &mic_ctx->micpm_ctx.msg_list) { + recv_msg = list_entry(pos, pm_recv_msg_t, msg); + /*Do not touch the message if its not a test message */ + if (recv_msg->msg_header.opcode == PM_MESSAGE_TEST) { + list_del(&recv_msg->msg); + msg_found = true; + break; + } + } + } else { + pr_debug("empty message list \n"); + goto no_msg; + } + + if (msg_found == false) { + pr_debug("Test msg not found \n"); + goto no_msg; + } + + if(recv_msg->msg_body == NULL) { + pr_debug("Invalid source buffer\n"); + goto list_free; + } + + err = strncmp((char*)recv_msg->msg_body, (char*)buf, len); + kfree(recv_msg->msg_body); + +list_free: + kfree(recv_msg); + +no_msg: + mutex_unlock(&mic_ctx->micpm_ctx.msg_mutex); + return err; + +} diff --git a/host/pm_pcstate.c b/host/pm_pcstate.c new file mode 100644 index 0000000..00c9ec4 --- /dev/null +++ b/host/pm_pcstate.c @@ -0,0 +1,1107 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic_common.h" +#include "scif.h" +#include "mic/micscif.h" +#include "mic/mic_pm.h" +#include "mic/micveth_dma.h" +#include +#include "linux/virtio_blk.h" +#include "mic/mic_virtio.h" + +//few helper functions +int pm_reg_read(mic_ctx_t *mic_ctx, uint32_t regoffset) { + uint32_t regval = 0; +if (mic_ctx->bi_family == FAMILY_ABR) + regval = DBOX_READ(mic_ctx->mmio.va, regoffset); +else if (mic_ctx->bi_family == FAMILY_KNC) + regval = SBOX_READ(mic_ctx->mmio.va, regoffset); + + return regval; +} + +int pm_reg_write(uint32_t value, mic_ctx_t *mic_ctx, uint32_t regoffset) { + int err = 0; +if (mic_ctx->bi_family == FAMILY_ABR) + DBOX_WRITE(value, mic_ctx->mmio.va, regoffset); +else if (mic_ctx->bi_family == FAMILY_KNC) + SBOX_WRITE(value, mic_ctx->mmio.va, regoffset); + + return err; +} + +int hw_idle(mic_ctx_t *mic_ctx) { + + uint8_t is_ring_active; + sbox_pcu_ctrl_t ctrl_regval = {0}; + uint32_t idle_wait_cnt; + + for(idle_wait_cnt = 0; idle_wait_cnt <= MAX_HW_IDLE_WAIT_COUNT; + idle_wait_cnt++) { + ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL); + is_ring_active = ctrl_regval.bits.mclk_enabled; + if(likely(!is_ring_active)) + return !is_ring_active; + msleep(1); + } + + PM_DEBUG("Timing out waiting for HW to become idle\n"); + return !is_ring_active; +} + +int hw_active(mic_ctx_t *mic_ctx) { + uint8_t is_ring_active; + sbox_pcu_ctrl_t ctrl_regval; + uint32_t idle_wait_cnt; + + for(idle_wait_cnt = 0; idle_wait_cnt <= MAX_HW_IDLE_WAIT_COUNT; + idle_wait_cnt++) { + ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL); + is_ring_active = ctrl_regval.bits.mclk_enabled; + if (likely(is_ring_active)) + return is_ring_active; + msleep(10); + } + + PM_DEBUG("Timing out waiting for HW to become active\n"); + return is_ring_active; + +} + +PM_IDLE_STATE get_card_state(mic_ctx_t *mic_ctx) { + + PM_IDLE_STATE state; + sbox_uos_pm_state_t upmstate_regval = {0}; + upmstate_regval.value = pm_reg_read(mic_ctx, SBOX_UOS_PMSTATE); + state = (PM_IDLE_STATE)(upmstate_regval.bits.uos_pm_state); + return state; + +} + +PM_IDLE_STATE get_host_state(mic_ctx_t *mic_ctx) { + + PM_IDLE_STATE state; + sbox_host_pm_state_t hpmstate_regval = {0}; + hpmstate_regval.value = pm_reg_read(mic_ctx, 
SBOX_HOST_PMSTATE); + state = (PM_IDLE_STATE)(hpmstate_regval.bits.host_pm_state); + return state; + +} + +int set_host_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state) { + + int err = 0; + sbox_host_pm_state_t hpmstate_regval = {0}; + hpmstate_regval.value = pm_reg_read(mic_ctx, SBOX_HOST_PMSTATE); + hpmstate_regval.bits.host_pm_state = 0; + hpmstate_regval.bits.host_pm_state = state; + pm_reg_write(hpmstate_regval.value, mic_ctx, SBOX_HOST_PMSTATE); + return err; +} + +int check_card_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state) { + PM_IDLE_STATE card_state = get_card_state(mic_ctx); + return (state == card_state) ? 1 : 0; +} + +int check_host_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state) { + PM_IDLE_STATE host_state = get_host_state(mic_ctx); + return (state == host_state) ? 1 : 0; +} + +uint32_t svid_cmd_fmt(unsigned int bits) +{ + unsigned int bits_set,bmask; + + bmask = bits; + + for (bits_set = 0; bmask; bits_set++) { + /* Zero the least significant bit that is set */ + bmask &= (bmask - 1); + } + bits <<= 1; /* Make way for the parity bit */ + if (bits_set & 1) { /* odd number of 1s */ + bits |= 1; + } + + return bits; +} + +void set_vid(mic_ctx_t *mic_ctx, sbox_svid_control svidctrl_regval, unsigned int vidcode) { + + uint32_t temp; + uint32_t svid_cmd = 0; + uint32_t svid_dout = 0; + temp = svid_cmd_fmt((KNC_SVID_ADDR << 13) | + (KNC_SETVID_SLOW << 8) | vidcode); + svid_cmd = (KNC_SVID_ADDR << 5) | KNC_SETVID_SLOW; + svidctrl_regval.bits.svid_cmd = 0x0e0; + svidctrl_regval.bits.svid_cmd = svid_cmd; + + svid_dout = temp & 0x1ff; + svidctrl_regval.bits.svid_dout = 0; + svidctrl_regval.bits.svid_dout = svid_dout; + + svidctrl_regval.bits.cmd_start = 0x1; + pm_reg_write(svidctrl_regval.value, mic_ctx, + SBOX_SVID_CONTROL); + + msleep(10); + + return; +} + +int set_vid_knc(mic_ctx_t *mic_ctx, unsigned int vidcode) +{ + uint32_t status = 0; + + sbox_svid_control svidctrl_regval = {0}; + uint32_t svid_idle = 0; + uint32_t svid_error = 0; + int i = 0; + uint32_t wait_cnt = 0; + sbox_core_volt_t core_volt_regval = {0}; + int retry = 0; + + if (mic_ctx->bi_stepping >= KNC_B0_STEP) { + for (retry = 0; retry < SET_VID_RETRY_COUNT; retry++) { + status = 0; + for (i = 0; i < KNC_SETVID_ATTEMPTS; i++) { + svidctrl_regval.value = pm_reg_read(mic_ctx,SBOX_SVID_CONTROL); + svid_idle = svidctrl_regval.bits.svid_idle; + + if (svid_idle) { + set_vid(mic_ctx, svidctrl_regval, vidcode); + svidctrl_regval.value = + pm_reg_read(mic_ctx,SBOX_SVID_CONTROL); + svid_idle = svidctrl_regval.bits.svid_idle; + svid_error = svidctrl_regval.bits.svid_error; + + if (!svid_idle) { + printk(KERN_ERR "%s SVID command failed - Idle not set\n", + __func__); + msleep(10); + continue; + } + + if (svid_error) { + if (SBOX_SVIDCTRL_ACK1ACK0(svidctrl_regval.value) == 0x2) { + printk(KERN_ERR "%s SVID command failed - rx parity error\n", + __func__); + } else { + printk(KERN_ERR "%s SVID command failed - tx parity error\n", + __func__); + } + status = -EINVAL; + goto exit; + } else { + PM_DEBUG("SVID Command Successful - VID set to %d\n",vidcode); + break; + } + } + } + + if (i == KNC_SETVID_ATTEMPTS) { + printk(KERN_ERR "%s Timed out waiting for SVID idle\n", __func__); + status = -EINVAL; + goto exit; + } + + /* Verify that the voltage is set */ + for(wait_cnt = 0; wait_cnt <= 100; wait_cnt++) { + core_volt_regval.value = pm_reg_read(mic_ctx, SBOX_COREVOLT); + if(vidcode == core_volt_regval.bits.vid) { + return status; + } + msleep(10); + PM_DEBUG("Retry: %d Voltage not set yet. 
vidcode = 0x%x Current vid = 0x%x\n", + retry, vidcode, core_volt_regval.bits.vid); + } + + PM_PRINT("Retry: %d Failed to set vid for node %d. vid code = 0x%x Current vid = 0x%x.\n", + retry, mic_get_scifnode_id(mic_ctx), vidcode, core_volt_regval.bits.vid); + status = -ENODEV; + } + } else { + set_vid(mic_ctx, svidctrl_regval, vidcode); + + /* SBOX_COREVOLT does not reflect the correct vid + * value on A0. Just wait here for sometime to + * allow for the vid to be set. + */ + msleep(20); + } + +exit: + return status; +} + +/* @print_nodemaskbuf + * + * @param - buf - the nodemask buffer + * + * prints the nodes in the nodemask. + * + * @returns - none + */ +void print_nodemaskbuf(uint8_t* buf) { + + uint8_t *temp_buf_ptr; + uint32_t i,j; + + temp_buf_ptr = buf; + PM_DEBUG("Nodes in nodemask: "); + for(i = 0; i <= ms_info.mi_maxid; i++) { + temp_buf_ptr = buf + i; + for (j = 0; j < 8; j++) { + if (get_nodemask_bit(temp_buf_ptr, j)) + pr_debug("%d ", j + (i * 8)); + } + } +} + +void restore_pc6_registers(mic_ctx_t *mic_ctx, bool from_dpc3) { + sbox_pcu_ctrl_t ctrl_regval = {0}; + sbox_uos_pcu_ctrl_t uos_ctrl_regval = {0}; + gbox_pm_control pmctrl_reg = {0}; + sbox_core_freq_t core_freq_reg = {0}; + + if (!from_dpc3) { + if(KNC_A_STEP == mic_ctx->bi_stepping) { + ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL); + ctrl_regval.bits.enable_mclk_pl_shutdown = 0; + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + } else { + uos_ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_UOS_PCUCONTROL); + uos_ctrl_regval.bits.enable_mclk_pll_shutdown = 0; + pm_reg_write(uos_ctrl_regval.value, mic_ctx, SBOX_UOS_PCUCONTROL); + } + + + ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL); + ctrl_regval.bits.prevent_auto_c3_exit = 0; + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + } + + pmctrl_reg.value = pm_reg_read(mic_ctx, GBOX_PM_CTRL); + pmctrl_reg.bits.in_pckgc6 = 0; + pm_reg_write(pmctrl_reg.value, mic_ctx, GBOX_PM_CTRL); + + ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL); + ctrl_regval.bits.grpB_pwrgood_mask = 0; + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + + core_freq_reg.value = pm_reg_read(mic_ctx, SBOX_COREFREQ); + core_freq_reg.bits.booted = 1; + pm_reg_write(core_freq_reg.value, mic_ctx, SBOX_COREFREQ); +} + +void program_mclk_shutdown(mic_ctx_t *mic_ctx, bool set) +{ + sbox_uos_pcu_ctrl_t uos_ctrl_regval; + sbox_pcu_ctrl_t ctrl_regval; + + if(KNC_A_STEP == mic_ctx->bi_stepping) { + ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL); + ctrl_regval.bits.enable_mclk_pl_shutdown = (set ? 1: 0); + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + } else { + uos_ctrl_regval.value = pm_reg_read(mic_ctx, + SBOX_UOS_PCUCONTROL); + uos_ctrl_regval.bits.enable_mclk_pll_shutdown = (set ? 1: 0); + pm_reg_write(uos_ctrl_regval.value, + mic_ctx, SBOX_UOS_PCUCONTROL); + } +} + +void program_prevent_C3Exit(mic_ctx_t *mic_ctx, bool set) +{ + sbox_pcu_ctrl_t ctrl_regval; + ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL); + ctrl_regval.bits.prevent_auto_c3_exit = (set ? 1: 0); + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + +} + +int pm_pc3_to_pc6_entry(mic_ctx_t *mic_ctx) +{ + int err; + sbox_pcu_ctrl_t ctrl_regval; + gbox_pm_control pmctrl_reg; + sbox_core_freq_t core_freq_reg; + + if ((get_card_state(mic_ctx)) != PM_IDLE_STATE_PC3) { + PM_DEBUG("Card not ready to go to PC6. 
\n"); + err = -EAGAIN; + goto exit; + } + + if (atomic_cmpxchg(&mic_ctx->gate_interrupt, 0, 1) == 1) { + PM_DEBUG("Cannot gate interrupt handler while it is in use\n"); + err = -EFAULT; + goto exit; + } + + program_prevent_C3Exit(mic_ctx, true); + program_mclk_shutdown(mic_ctx, true); + + /* Wait for uos to become idle. */ + if (!hw_idle(mic_ctx)) { + program_mclk_shutdown(mic_ctx, false); + if (!hw_idle(mic_ctx)) { + program_prevent_C3Exit(mic_ctx, false); + PM_DEBUG("Card not ready to go to PC6. \n"); + err = -EAGAIN; + goto intr_ungate; + } else { + program_mclk_shutdown(mic_ctx, true); + } + } + + pmctrl_reg.value = pm_reg_read(mic_ctx, GBOX_PM_CTRL); + pmctrl_reg.bits.in_pckgc6 = 1; + pm_reg_write(pmctrl_reg.value, mic_ctx, GBOX_PM_CTRL); + + core_freq_reg.value = pm_reg_read(mic_ctx, SBOX_COREFREQ); + core_freq_reg.bits.booted = 0; + pm_reg_write(core_freq_reg.value, mic_ctx, SBOX_COREFREQ); + + udelay(500); + + ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL); + ctrl_regval.bits.grpB_pwrgood_mask = 1; + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + + err = set_vid_knc(mic_ctx, 0); + if (err != 0) { + PM_DEBUG("Aborting PC6 entry...Failed to set VID\n"); + restore_pc6_registers(mic_ctx, true); + goto intr_ungate; + } + + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC6; + set_host_state(mic_ctx, PM_IDLE_STATE_PC6); + + dma_prep_suspend(mic_ctx->dma_handle); + + PM_PRINT("Node %d entered PC6\n", + mic_get_scifnode_id(mic_ctx)); + + return err; + +intr_ungate: + atomic_set(&mic_ctx->gate_interrupt, 0); + tasklet_schedule(&mic_ctx->bi_dpc); +exit: + return err; +} + +/* + * pm_pc6_exit: + * + * Execute pc6 exit for a node. + * mic_ctx: The driver context of the node. + */ +int pm_pc6_exit(mic_ctx_t *mic_ctx) +{ + + int err = 0; + + sbox_host_pm_state_t hpmstate_regval; + sbox_pcu_ctrl_t ctrl_regval; + uint8_t tdp_vid = 0; + uint8_t is_pll_locked; + uint32_t wait_cnt; + int i; + + + if (!check_host_state(mic_ctx, PM_IDLE_STATE_PC6)) { + PM_DEBUG("Wrong Host PM state. 
State = %d\n", + get_host_state(mic_ctx)); + err = -EINVAL; + goto restore_registers; + } + + hpmstate_regval.value = pm_reg_read(mic_ctx, SBOX_HOST_PMSTATE); + tdp_vid = hpmstate_regval.bits.tdp_vid; + PM_DEBUG("TDP_VID value obtained from Host PM Register = %d",tdp_vid); + + PM_DEBUG("Setting voltage to %dV using SVID Control\n",tdp_vid); + err = set_vid_knc(mic_ctx, tdp_vid); + if (err != 0) { + printk(KERN_ERR "%s Failed PC6 entry...error in setting VID\n", + __func__); + goto restore_registers; + } + + ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL); + + program_mclk_shutdown(mic_ctx, false); + program_prevent_C3Exit(mic_ctx, false); + + for(wait_cnt = 0; wait_cnt < 200; wait_cnt++) { + ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL); + is_pll_locked = ctrl_regval.bits.mclk_pll_lock; + if(likely(is_pll_locked)) + break; + msleep(10); + } + + if(wait_cnt >= 200) { + PM_DEBUG("mclk_pll_locked bit is not set.\n"); + err = -EAGAIN; + goto restore_registers; + } + + ctrl_regval.bits.grpB_pwrgood_mask = 0; + pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL); + + if (!hw_active(mic_ctx)) { + PM_DEBUG("Timing out waiting for hw to become active"); + goto restore_registers; + } + + for(wait_cnt = 0; wait_cnt < 200; wait_cnt++) { + if ((get_card_state(mic_ctx)) == PM_IDLE_STATE_PC0) + break; + msleep(10); + } + + if(wait_cnt >= 200) { + PM_DEBUG("PC6 Exit not complete.\n"); + err = -EFAULT; + goto restore_registers; + } + + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0; + + for (i = 0; i <= mic_data.dd_numdevs; i++) { + if (micscif_get_nodedep(mic_get_scifnode_id(mic_ctx), i) == + DEP_STATE_DISCONNECTED) { + micscif_set_nodedep(mic_get_scifnode_id(mic_ctx), i, + DEP_STATE_DEPENDENT); + } + } + + PM_PRINT("Node %d exited PC6\n", + mic_get_scifnode_id(mic_ctx)); + goto exit; + +restore_registers: + restore_pc6_registers(mic_ctx, false); +exit: + atomic_set(&mic_ctx->gate_interrupt, 0); + tasklet_schedule(&mic_ctx->bi_dpc); + return err; +} + +/* + * setup_pm_dependency: + * + * Function sets up the dependency matrix by populating + * the matrix with node depency information. + * + * Returns 0 on success. Appropriate error on failure. + */ +int setup_pm_dependency(void){ + int err = 0; + uint16_t i; + uint16_t j; + mic_ctx_t *mic_ctx; + + for (i = 0; i < mic_data.dd_numdevs; i++) { + mic_ctx = get_per_dev_ctx(i); + if (!mic_ctx) { + PM_DEBUG("Failed to retrieve driver context\n"); + return -EFAULT; + } + if (mic_ctx->micpm_ctx.idle_state == + PM_IDLE_STATE_PC3_READY) { + for (j = 0; j < mic_data.dd_numdevs; j++) { + if (micscif_get_nodedep(mic_get_scifnode_id(mic_ctx),j+1) == + DEP_STATE_DEPENDENT) { + micscif_set_nodedep(mic_get_scifnode_id(mic_ctx),j+1, + DEP_STATE_DISCONNECT_READY); + } + } + } + } + return err; +} + +/* + * teardown_pm_dependency + * + * Function resets dependency matrix by removing all depenendy info + * from it. + * + * Returns 0 on success. Appropriate error on failure. + */ +int teardown_pm_dependency(void) { + int err = 0; + int i; + int j; + + for (i = 0; i < mic_data.dd_numdevs; i++) { + for (j = 0; j < mic_data.dd_numdevs; j++) { + + if (micscif_get_nodedep(i+1,j+1) == DEP_STATE_DISCONNECT_READY) { + micscif_set_nodedep(i+1,j+1, DEP_STATE_DEPENDENT); + } + } + } + return err; +} + +/* + * revert_idle_entry_trasaction: + * + * @node_disc_bitmask: Bitmask of nodes which were involved in the + * transaction + * + * Function Reverts idle state changes made to nodes when an idle + * state trasaction fails. 
+ */ +int revert_idle_entry_trasaction(uint8_t *node_disc_bitmask) { + int err = 0; + mic_ctx_t *node_ctx; + uint32_t node_id = 0; + + for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) { + if (node_id == SCIF_HOST_NODE) + continue; + + if (!get_nodemask_bit(node_disc_bitmask, node_id)) + continue; + + node_ctx = get_per_dev_ctx(node_id - 1); + if (!node_ctx) { + PM_DEBUG("Failed to retrieve node context."); + err = -EINVAL; + goto exit; + } + + if (node_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC3) { + err = pm_pc3_exit(node_ctx); + if (err) { + PM_DEBUG("Wakeup of Node %d failed. Node is lost" + " and is to be disconnected",node_id); + node_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_LOST; + /* Since node is lost, ref_cnt increment(decement) through the + * pm_get(put)_reference interface is prevented by idle_state. + * We still need to ensure the ref_cnt iself is reset + * back to 0 so that pm_get(put)_reference will work after the + * lost node interface recovers the node. */ + atomic_set(&node_ctx->micpm_ctx.pm_ref_cnt, 0); + } + } + } +exit: + return err; +} + +/* pm_node_disconnect + * + * Called during idlestate entry. + * + * Function checks the pm_ref_cnt and returns ACK + * or NACK depending on the pm_ref_cnt value. + */ +int pm_node_disconnect(uint8_t *nodemask) { + + uint32_t node_id; + mic_ctx_t *mic_ctx; + int ret = 0; + int err = 0; + + for (node_id = 0; node_id <= ms_info.mi_maxid; node_id++) { + if (node_id == SCIF_HOST_NODE) + continue; + + if (!get_nodemask_bit(nodemask, node_id)) + continue; + + mic_ctx = get_per_dev_ctx(node_id - 1); + if (!mic_ctx) { + set_nodemask_bit(nodemask, node_id, 0); + return -EAGAIN; + } + + if (mic_ctx->state != MIC_ONLINE) { + set_nodemask_bit(nodemask, node_id, 0); + return -EAGAIN; + } + + ret = atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, + 0, PM_NODE_IDLE); + if (((ret != 0) && (ret != PM_NODE_IDLE)) + || atomic_read(&mic_data.dd_pm.wakeup_in_progress)) { + set_nodemask_bit(nodemask, node_id, 0); + return -EAGAIN; + } + } + + return err; +} + +/* + * pm_pc3_entry: + * + * Execute pc3 entry for a node. + * mic_ctx: The driver context of the node. + */ +int pm_pc3_entry(mic_ctx_t *mic_ctx) +{ + int err = 0; + if (mic_ctx == NULL) { + err = -EINVAL; + goto exit; + } + + if (((!check_host_state(mic_ctx, PM_IDLE_STATE_PC0))) || + (mic_ctx->micpm_ctx.idle_state != PM_IDLE_STATE_PC0)) { + PM_DEBUG("Wrong host state. register state = %d" + " idle state = %d\n", get_host_state(mic_ctx), + mic_ctx->micpm_ctx.idle_state); + goto send_wakeup; + } + + /* cancel pc6 entry work that may be scheduled. We need to + * do this either here or after a pervious pc3 exit */ + cancel_delayed_work_sync(&mic_ctx->micpm_ctx.pc6_entry_work); + + if ((mic_ctx->micpm_ctx.con_state != PM_CONNECTED) || + (!mic_ctx->micpm_ctx.pc3_enabled)) + goto send_wakeup; + + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC3_READY; + err = do_idlestate_entry(mic_ctx); + if (err) + goto exit; + if ((mic_ctx->micpm_ctx.pc6_enabled) && + (KNC_C_STEP <= mic_ctx->bi_stepping) && + (KNC_B1_STEP != mic_ctx->bi_stepping)) { + queue_delayed_work(mic_ctx->micpm_ctx.pc6_entry_wq, + &mic_ctx->micpm_ctx.pc6_entry_work, + mic_ctx->micpm_ctx.pc6_timeout*HZ); + } + + goto exit; + +send_wakeup: + mutex_lock(&mic_data.dd_pm.pm_idle_mutex); + pm_pc3_exit(mic_ctx); + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); +exit: + return err; +} + +/* + * pm_pc3_exit: + * Calling function needs to grab idle_state mutex. + * + * Execute pc3 exit for a node. + * mic_ctx: The driver context of the node. 
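+ * Returns 0 on success, or -EFAULT if the card does not report PC0 within PC3_EXIT_WAIT_COUNT polls.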
+ */ +int pm_pc3_exit(mic_ctx_t *mic_ctx) +{ + int err; + int wait_cnt; + + WARN_ON(!mutex_is_locked(&mic_data.dd_pm.pm_idle_mutex)); + mic_send_pm_intr(mic_ctx); + for (wait_cnt = 0; wait_cnt < PC3_EXIT_WAIT_COUNT; wait_cnt++) { + if (check_card_state(mic_ctx, PM_IDLE_STATE_PC0)) + break; + msleep(1); + } + + + if(wait_cnt >= PC3_EXIT_WAIT_COUNT) { + PM_DEBUG("Syncronization with card failed." + " Node is lost\n"); + err = -EFAULT; + goto exit; + } + + set_host_state(mic_ctx, PM_IDLE_STATE_PC0); + mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0; + PM_DEBUG("Node %d exited PC3\n", mic_get_scifnode_id(mic_ctx)); + + return 0; +exit: + return err; +} + +/* + * do_idlestate_entry: + * + * Function to start the idle state entry transaction for a node. Puts a node + * and all the nodes that are dependent on this node to idle state if + * it is possible. + * + * mic_ctx: The device context of node that needs to be put in idle state + * Returs 0 in success. Appropriate error code on failure + */ +int do_idlestate_entry(mic_ctx_t *mic_ctx) +{ + int err = 0; + uint32_t node_id = 0; + mic_ctx_t *node_ctx; + uint8_t *nodemask_buf; + + if(!mic_ctx) + return -EINVAL; + + mutex_lock(&mic_data.dd_pm.pm_idle_mutex); + + if ((err = setup_pm_dependency())) { + PM_DEBUG("Failed to set up PM specific dependencies"); + goto unlock; + } + + nodemask_buf = (uint8_t *) + kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL); + if(!nodemask_buf) { + PM_DEBUG("Error allocating nodemask buffer\n"); + err = ENOMEM; + goto dep_teardown; + } + + err = micscif_get_deactiveset(mic_get_scifnode_id(mic_ctx), + nodemask_buf, 1); + if (err) { + PM_DEBUG("Node disconnection failed " + "during deactivation set calculation"); + goto free_buf; + } + + print_nodemaskbuf(nodemask_buf); + + if ((err = micscif_disconnect_node(mic_get_scifnode_id(mic_ctx), + nodemask_buf, DISCONN_TYPE_POWER_MGMT))) { + PM_DEBUG("SCIF Node disconnect failed. err: %d", err); + goto free_buf; + } + + if ((err = pm_node_disconnect(nodemask_buf))) { + PM_DEBUG("PM Node disconnect failed. err = %d\n", err); + goto free_buf; + } + + if ((err = micvcons_pm_disconnect_node(nodemask_buf, + DISCONN_TYPE_POWER_MGMT))) { + PM_DEBUG("VCONS Node disconnect failed. err = %d\n", err); + goto free_buf; + } + + for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) { + if (node_id == SCIF_HOST_NODE) + continue; + if (!get_nodemask_bit(nodemask_buf, node_id)) + continue; + node_ctx = get_per_dev_ctx(node_id - 1); + if (!node_ctx) { + PM_DEBUG("Failed to retrieve node context."); + err = -EINVAL; + goto revert; + } + + if (node_ctx->micpm_ctx.idle_state == + PM_IDLE_STATE_PC3_READY) { + set_host_state(node_ctx, PM_IDLE_STATE_PC3); + node_ctx->micpm_ctx.idle_state = + PM_IDLE_STATE_PC3; + PM_DEBUG("Node %d entered PC3\n", + mic_get_scifnode_id(node_ctx)); + } else { + PM_DEBUG("Invalid idle state \n"); + err = -EINVAL; + goto revert; + } + } + +revert: + if (err) + revert_idle_entry_trasaction(nodemask_buf); +free_buf: + kfree(nodemask_buf); +dep_teardown: + teardown_pm_dependency(); +unlock: + if (err && (mic_ctx->micpm_ctx.idle_state != PM_IDLE_STATE_PC0)) + pm_pc3_exit(mic_ctx); + + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + return err; +} + +/** + * is_idlestate_exit_needed: + * + * @node_id[in]: node to wakeup. + * + * Method responsible for checking if idle state exit is required + * In some situation we would like to know whether node is idle or not before + * making decision to bring the node out of idle state. + * For example - Lost node detection. 
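+ * The idle state is sampled under pm_idle_mutex, so it cannot change while the decision is being made.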
+ * returns false if the node is not in IDLE state, returns true otherwise + */ +int +is_idlestate_exit_needed(mic_ctx_t *mic_ctx) +{ + int ret = 0; + mutex_lock(&mic_data.dd_pm.pm_idle_mutex); + + switch (mic_ctx->micpm_ctx.idle_state) + { + case PM_IDLE_STATE_PC0: + case PM_IDLE_STATE_LOST: + break; + case PM_IDLE_STATE_PC3: + case PM_IDLE_STATE_PC3_READY: + case PM_IDLE_STATE_PC6: + { + ret = 1; + break; + } + default: + ret = 1; + } + + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + return ret; +} + +/* do_idlestate_exit: + * + * Initiate idle state exits for nodes specified + * by the bitmask. + * + * mic_ctx: The device context. + * get_ref: Set to true if the entity that wants to wake + * a node up also wantes to get a reference to the node. + * + * Returs 0 on success. Appropriate error on failure. + * + */ +int do_idlestate_exit(mic_ctx_t *mic_ctx, bool get_ref) { + int err = 0; + uint32_t node_id = 0; + mic_ctx_t *node_ctx; + uint8_t *nodemask_buf; + + if(!mic_ctx) + return -EINVAL; + + might_sleep(); + /* If the idle_state_mutex is already obtained by another thread + * try to wakeup the thread which MAY be waiting for REMOVE_NODE + * responses. This way, we give priority to idle state exits than + * idle state entries. + */ + if (!mutex_trylock(&mic_data.dd_pm.pm_idle_mutex)) { + atomic_inc(&mic_data.dd_pm.wakeup_in_progress); + wake_up(&ms_info.mi_disconn_wq); + mutex_lock(&mic_data.dd_pm.pm_idle_mutex); + atomic_dec(&mic_data.dd_pm.wakeup_in_progress); + } + + nodemask_buf = (uint8_t *)kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL); + if(!nodemask_buf) { + PM_DEBUG("Error allocating nodemask buffer\n"); + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + err = ENOMEM; + goto abort_node_wake; + } + + if ((err = micscif_get_activeset(mic_get_scifnode_id(mic_ctx), nodemask_buf))) { + PM_DEBUG("Node connect failed during Activation set calculation for node\n"); + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + err = -EINVAL; + goto free_buf; + } + + print_nodemaskbuf(nodemask_buf); + + for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) { + if (node_id == SCIF_HOST_NODE) + continue; + + if (!get_nodemask_bit(nodemask_buf, node_id)) + continue; + + node_ctx = get_per_dev_ctx(node_id - 1); + if (!node_ctx) { + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + goto free_buf; + } + + switch (node_ctx->micpm_ctx.idle_state) { + case PM_IDLE_STATE_PC3: + case PM_IDLE_STATE_PC3_READY: + if ((err = pm_pc3_exit(node_ctx))) { + PM_DEBUG("Wakeup of Node %d failed." + "Node to be disconnected",node_id); + set_nodemask_bit(nodemask_buf, node_id, 0); + node_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_LOST; + /* Since node is lost, ref_cnt increment(decement) through the + * pm_get(put)_reference interface is prevented by idle_state. + * We still need to ensure the ref_cnt iself is reset + * back to 0 so that pm_get(put)_reference will work after the + * lost node interface recovers the node. */ + atomic_set(&node_ctx->micpm_ctx.pm_ref_cnt, 0); + } else { + if ((mic_ctx == node_ctx) && get_ref) + if (atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, PM_NODE_IDLE, 1) != + PM_NODE_IDLE) + atomic_inc(&mic_ctx->micpm_ctx.pm_ref_cnt); + } + break; + case PM_IDLE_STATE_PC6: + if ((err = pm_pc6_exit(node_ctx))) { + PM_DEBUG("Wakeup of Node %d failed." 
+ "Node to be disconnected",node_id); + set_nodemask_bit(nodemask_buf, node_id, 0); + node_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_LOST; + /* Since node is lost, ref_cnt increment(decement) through the + * pm_get(put)_reference interface is prevented by idle_state. + * We still need to ensure the ref_cnt iself is reset + * back to 0 so that pm_get(put)_reference will work after the + * lost node interface recovers the node. */ + atomic_set(&node_ctx->micpm_ctx.pm_ref_cnt, 0); + } else { + if ((mic_ctx == node_ctx) && get_ref) + if (atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, PM_NODE_IDLE, 1) != + PM_NODE_IDLE) + atomic_inc(&mic_ctx->micpm_ctx.pm_ref_cnt); + } + break; + case PM_IDLE_STATE_PC0: + PM_DEBUG("Node %d is in state %d " + "and already out of package state.\n",node_id, + node_ctx->micpm_ctx.idle_state); + if ((mic_ctx == node_ctx) && get_ref) + if (atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, PM_NODE_IDLE, 1) != + PM_NODE_IDLE) + atomic_inc(&mic_ctx->micpm_ctx.pm_ref_cnt); + break; + default: + PM_DEBUG("Invalid idle state of node %d." + " State = %d \n", node_id, + node_ctx->micpm_ctx.idle_state); + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + err = -ENODEV; + goto free_buf; + } + } + + /* Idle state exit of nodes are complete. + * Set the register state now for those nodes + * that are successfully up. + */ + for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) { + if (node_id == SCIF_HOST_NODE) + continue; + + if (!get_nodemask_bit(nodemask_buf, node_id)) + continue; + + node_ctx = get_per_dev_ctx(node_id - 1); + if (!node_ctx) { + PM_DEBUG("Failed to retrieve node context."); + continue; + } + + + if (node_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC0) + set_host_state(node_ctx, PM_IDLE_STATE_PC0); + } + + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); +free_buf: + kfree(nodemask_buf); +abort_node_wake: + return err; +} + +int pc6_entry_start(mic_ctx_t *mic_ctx) { + + int err = 0; + + if (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC0) { + PM_DEBUG("Node not in PC3\n"); + err = -EFAULT; + goto exit; + } + + mutex_lock(&mic_data.dd_pm.pm_idle_mutex); + + if (mic_ctx->micpm_ctx.idle_state != PM_IDLE_STATE_PC3) { + PM_DEBUG("PC6 transition failed. Node not in PC3\n"); + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + err = -EINVAL; + goto exit; + } + + if ((err = pm_pc3_to_pc6_entry(mic_ctx))) { + PM_DEBUG("PC6 transition from PC3 failed for node %d\n", + mic_get_scifnode_id(mic_ctx)); + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); + goto exit; + } + mutex_unlock(&mic_data.dd_pm.pm_idle_mutex); +exit: + return err; + +} + +/* + * mic_get_scifnode_id: + * + * Function to retrieve node id of a scif node. + * + * mic_ctx: The driver context of the specified node. + * Returns the scif node_id of the specified node. + */ +uint32_t mic_get_scifnode_id(mic_ctx_t *mic_ctx) { + /* NOTE: scif node_id cannot assumed to be a simple increment + * of the bi_id of the driver context. This function is really + * a placeholder for the board_id to node_id conversion that + * we need to do in the host driver. + */ + return (uint32_t)mic_ctx->bi_id + 1; +} diff --git a/host/tools_support.c b/host/tools_support.c new file mode 100644 index 0000000..93922f8 --- /dev/null +++ b/host/tools_support.c @@ -0,0 +1,978 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + *
 + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + *
 + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* code to send escape calls to uOS; meant to test the ring buffer */ + +#include "mic_common.h" +#include "mic/mic_dma_lib.h" +#include "mic/mic_dma_api.h" +#include +#include +
 +// constants defined for flash commands for setting PCI aperture +#define RASMM_DEFAULT_OFFSET 0x4000000 +#define RASMM_FLASH_SIZE 0x200000 +#define MAX_CORE_INDEX 61 +#define SKU_MEM_DIVIDE 4 +#define SKU_LOW_MEM 0 +#define SKU_HIGH_MEM 1 +#define FREQ_2P4 0x630 +#define FREQ_4P5 0x65A +#define FREQ_5P0 0x664 +#define FREQ_5P5 0x66E +#define MASK_MEMFREQ 0xfff +#define SHIFT_MEMFREQ 16 +
 +int +mic_unpin_user_pages(struct page **pages, uint32_t nf_pages) +{ + uint32_t j = 0; + uint32_t status = 0; + if (pages) { + for (j = 0; j < nf_pages; j++) { + if (pages[j]) { + SetPageDirty(pages[j]); + page_cache_release(pages[j]); + } + } + kfree(pages); + } + + return status; +} +
 +int +mic_pin_user_pages (void *data, struct page **pages, uint32_t len, int32_t *nf_pages, int32_t nr_pages) +{ + + int32_t status = 0; + + + if (!(pages)) { + printk("%s Failed to allocate memory for pages\n", __func__); + status = -ENOMEM; + return status; + + } + + // pin the user pages; use semaphores on linux for doing the same + down_read(&current->mm->mmap_sem); + *nf_pages = (int32_t)get_user_pages(current, current->mm, (uint64_t)data, + nr_pages, PROT_WRITE, 1, pages, NULL); + up_read(&current->mm->mmap_sem); + + // compare if the no of final pages is equal to no of requested pages + if ((*nf_pages) < nr_pages) { + printk("%s failed to do _get_user_pages\n", __func__); + status = -EFAULT; + mic_unpin_user_pages(pages, *nf_pages); + return status; + } + + + return status; + +} +
 +int +send_flash_cmd(mic_ctx_t *mic_ctx, MIC_FLASH_CMD_TYPE type, void *data, uint32_t len) +{ + int32_t status = 0; + uint8_t *mmio_va = mic_ctx->mmio.va; + sbox_scratch1_reg_t scratch1reg = {0}; + sbox_scratch2_reg_t scratch2reg = {0}; + uint32_t ret = 0; + void *src; + struct timeval t; + struct flash_stat *statbuf = NULL; + uint64_t
temp; + uint32_t i = 0; + struct version_struct *verbuf = NULL; + int32_t offset = 0; + uint8_t cmddata = 0; + + scratch1reg.bits.status = FLASH_CMD_INVALID; + switch (type) { + case FLASH_CMD_READ: + + /* + * image address = the upper 20 bits of the 32-bit of scracth2 register + * is card side physical address where the flash image resides + * program scratch2 register to notify the image address + */ + scratch2reg.bits.image_addr = RASMM_DEFAULT_OFFSET >> 12; + SBOX_WRITE(scratch2reg.value, mmio_va, SBOX_SCRATCH2); + + /* set command */ + scratch1reg.bits.command = FLASH_CMD_READ; + SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1); + + mic_send_bootstrap_intr(mic_ctx); + break; + + case FLASH_CMD_READ_DATA: + + /* + * flash read_data command : set pci aperture to 128MB + * read the value of scratch2 in a variable + */ + ret = SBOX_READ(mmio_va, SBOX_SCRATCH2); + scratch2reg.value = ret; + + /* + * convert physical to virtual address + * image address = the upper 20 bits of the 32-bit KNC side physical + * address where the flash image resides + */ + offset = scratch2reg.bits.image_addr << 12 ; + if (len == 0) { + status = -EINVAL; + goto exit; + } + + if (len > (mic_ctx->aper.len - offset)) { + status = -EINVAL; + goto exit; + } + src = mic_ctx->aper.va + offset; + + temp = copy_to_user(data, src, len); + if (temp > 0) { + printk("error while copy to user \n"); + status = -EFAULT; + goto exit; + } + break; + + case FLASH_CMD_ABORT: + + scratch1reg.bits.command = FLASH_CMD_ABORT; + SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1); + + mic_send_bootstrap_intr(mic_ctx); + break; + + case FLASH_CMD_VERSION: + + /* + * image address = the upper 20 bits of the 32-bit of scracth2 register + * is card side physical address where the flash image resides + */ + scratch2reg.bits.image_addr = RASMM_DEFAULT_OFFSET >> 12; + SBOX_WRITE(scratch2reg.value, mmio_va, SBOX_SCRATCH2); + + /* + * flash version command : similar to read_data command. + * Instead of get_user_pages(), use kmalloc() as we are allocating + * buffer of lesser size + */ + scratch1reg.bits.command = FLASH_CMD_VERSION; + SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1); + + mic_send_bootstrap_intr(mic_ctx); + + /* poll for completion */ + while(scratch1reg.bits.status != FLASH_CMD_COMPLETED) { + ret = SBOX_READ(mmio_va, SBOX_SCRATCH1); + scratch1reg.value = ret; + msleep(1); + i++; + printk("Looping for status (time = %d ms)\n", i); + if(i > 3000) { + status = -ETIME; + goto exit; + } + + } + + src = mic_ctx->aper.va + RASMM_DEFAULT_OFFSET; + + if (len == 0) { + status = -EINVAL; + goto exit; + } + verbuf = kmalloc(len, GFP_KERNEL); + if (!verbuf) { + status = -ENOMEM; + goto exit; + } + + memcpy(verbuf, src, len); + + printk("header verbuf is : %x\n", verbuf->hdr_ver); + printk("odm verbuf is : %x\n", verbuf->odm_ver); + printk("uptd time bcd is : %llu\n", verbuf->upd_time_bcd); + printk("updated verbuf is : %d\n", *((int*)(&verbuf->upd_ver))); + printk("mfg time bcd is : %llu\n", verbuf->mfg_time_bcd); + printk("mfg verbuf is : %d\n", *((int*)(&verbuf->mfg_ver))); + + temp = copy_to_user(data, verbuf, len); + if(temp > 0) { + printk("error while copy to user \n"); + status = -EFAULT; + if(verbuf) { + kfree(verbuf); + } + goto exit; + } + + if(verbuf) { + kfree(verbuf); + } + + break; + + case FLASH_CMD_WRITE: + + /* flash write command : pin user pages for the data buffer which contains + * the image. + * For the write command, we provide the offset for writing. + * GTT is set to 64MB and offset = 0. 
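+ * The image is copied into the aperture at RASMM_DEFAULT_OFFSET; that offset (in 4KB units) is published through SCRATCH2 and the write command through SCRATCH1 before the bootstrap interrupt is raised.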
+ */ + if (len > (mic_ctx->aper.len - RASMM_DEFAULT_OFFSET)) { + status = -EINVAL; + goto exit; + } + src = mic_ctx->aper.va + RASMM_DEFAULT_OFFSET; + if (len == 0) { + status = -EINVAL; + goto exit; + } + temp = copy_from_user(src, data, len); + if (temp > 0) { + printk("error while copying from user \n"); + status = -EFAULT; + goto exit; + } + + /* image address = the upper 20 bits of the 32-bit KNC side physical + * address where the flash image resides + */ + scratch2reg.bits.image_addr = RASMM_DEFAULT_OFFSET >> 12; + SBOX_WRITE(scratch2reg.value, mmio_va, SBOX_SCRATCH2); + + scratch1reg.bits.command = FLASH_CMD_WRITE; + SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1); + + mic_send_bootstrap_intr(mic_ctx); + ; + + break; + + case RAS_CMD_CORE_DISABLE: + case RAS_CMD_CORE_ENABLE: + if (copy_from_user(&cmddata, data, sizeof(cmddata))) { + status = -EFAULT; + goto exit; + } + scratch1reg.bits.cmd_data = cmddata; + if (cmddata > MAX_CORE_INDEX) { + printk("Parameter given is greater than physical core index\n"); + status = -EINVAL; + goto exit; + } + + case RAS_CMD: + case RAS_CMD_INJECT_REPAIR: + case RAS_CMD_ECC_DISABLE: + case RAS_CMD_ECC_ENABLE: + case RAS_CMD_EXIT: + do_gettimeofday(&t); + SBOX_WRITE(t.tv_sec, mmio_va, SBOX_SCRATCH3); + scratch1reg.bits.command = type; + SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1); + + mic_send_bootstrap_intr(mic_ctx); + + break; + + case FLASH_CMD_STATUS: + + /* status command : mmio read of SCRATCH1 register + * The percentage completion is only updated on the + * Flash Write function as currently implemented. + * The other functions are expected to complete almost instantly + */ + if(len != sizeof(struct flash_stat)) { + status = -EINVAL; + goto exit; + } + if (len == 0) { + status = -EINVAL; + goto exit; + } + statbuf = kmalloc(len, GFP_KERNEL); + if(!statbuf) { + status = -ENOMEM; + goto exit; + } + + temp = SBOX_READ(mmio_va, SBOX_SCRATCH1); + scratch1reg.value = (uint32_t)temp; + + statbuf->status = scratch1reg.bits.status; + statbuf->percent = scratch1reg.bits.percent; + statbuf->smc_status = scratch1reg.bits.smc_status; + statbuf->cmd_data = scratch1reg.bits.cmd_data; + statbuf->mm_debug = scratch1reg.bits.mm_debug; + + temp = copy_to_user(data, statbuf, len); + if(temp > 0) { + printk("Error copying data to user buffer\n"); + status = -EFAULT; + if(statbuf) { + kfree(statbuf); + } + goto exit; + } + + if(statbuf) { + kfree(statbuf); + } + + break; + + default: + printk(KERN_ERR "Unknown command\n"); + status = -EOPNOTSUPP; + break; + + } + + exit : + return status; +} + +int get_cardside_mem(mic_ctx_t *mic_ctx, uint64_t start, uint64_t size, void *dest) +{ + int32_t status = 0; + uint64_t len; + uint64_t dest_pa; + struct dma_channel *ch = NULL; + int flags = 0; + int poll_cookie; + int i, next_page; + int j; + uint64_t num_pages; + uint64_t card_pa; + int32_t nf_pages = 0; + uint64_t nr_pages = 0; + struct page **pages = NULL; + void *pg_virt_add; + unsigned long t = jiffies; + int dma_ret = 0; + card_pa = start; + len = size; + + if (len % PAGE_SIZE) + nr_pages = (len >> PAGE_SHIFT) + 1; + else + nr_pages = len >> PAGE_SHIFT; + + flags |= DO_DMA_POLLING; + num_pages = len / PAGE_SIZE; + next_page = 0; + + pages = kmalloc(nr_pages * sizeof(struct page*), GFP_KERNEL); + if (!pages) + return -ENOMEM; + status = mic_pin_user_pages(dest, pages, (uint32_t)len, &nf_pages, (int32_t)nr_pages); + + if (status) + goto exit; + + /* allocate_dma_channel should fail in 2 cases : 1. if it doesnt get dma channel + * then it times out 2. 
there is no device present + */ + status = micpm_get_reference(mic_ctx, true); + if (status) + goto exit; + + while ((dma_ret = allocate_dma_channel(mic_ctx->dma_handle, &ch)) != 0) { + if (dma_ret == -ENODEV) { + printk("No device present\n"); + status = -ENODEV; + goto put_ref; + } + msleep(1); + if (time_after(jiffies,t + NODE_ALIVE_TIMEOUT)) { + printk("dma channel allocation error\n"); + status = -EBUSY; + goto put_ref; + } + } + + for(j = 0; j < num_pages; j++) { + i = 0; + pg_virt_add = lowmem_page_address(pages[j]); + /* get card side address */ + dest_pa = mic_ctx_map_single(mic_ctx, pg_virt_add, PAGE_SIZE); + + /* do dma and keep polling for completion */ + poll_cookie = do_dma(ch, flags, card_pa + next_page, dest_pa, PAGE_SIZE, NULL); + pr_debug("Poll cookie %d\n", poll_cookie); + if (0 > poll_cookie) { + printk("Error programming the dma descriptor\n"); + status = poll_cookie; + goto put_ref; + } else if (-2 == poll_cookie) { + printk( "Copy was done successfully, check for validity\n"); + } else if(-1 != poll_cookie) { + while (i < 10000 && 1 != poll_dma_completion(poll_cookie, ch)) { + i++; + } + if (i == 10000) { + printk("DMA timed out \n"); + } else { + pr_debug("DMA SUCCESS at %d\n", i); + /* increment by PAGE_SIZE on DMA SUCCESS to transfer next page */ + next_page = next_page + PAGE_SIZE; + } + } + mic_ctx_unmap_single(mic_ctx, (dma_addr_t)dest_pa, PAGE_SIZE); + } + +put_ref: + micpm_put_reference(mic_ctx); +exit: + mic_unpin_user_pages(pages, nf_pages); + if (ch) + free_dma_channel(ch); + return status; +} + +/* SKU functions */ +void +sku_swap_list(struct list_head *in, struct list_head *out) +{ + struct list_head *pos, *tmp; + sku_info_t *node; + list_for_each_safe(pos, tmp, in) { + node = list_entry(pos, sku_info_t, sku); + list_del(pos); + list_add_tail(&node->sku, out); + } +} + +int +sku_create_node(uint32_t fuserev_low, + uint32_t fuserev_high, uint32_t mem_size, + uint32_t mem_freq, char *sku_name, + sku_info_t ** newnode) +{ + sku_info_t *temp; + + temp = kmalloc(sizeof(sku_info_t), GFP_KERNEL); + if (temp == NULL) + return -ENOMEM; + temp->fuserev_low = fuserev_low; + temp->fuserev_high = fuserev_high; + temp->memsize = mem_size; + temp->memfreq = mem_freq; + strncpy(temp->sku_name, sku_name, SKU_NAME_LEN - 1); + temp->sku_name[SKU_NAME_LEN - 1] = '\0'; + *newnode = temp; + return 0; +} + +void +sku_destroy_table() +{ + int i; + sku_info_t *node; + struct list_head *pos, *tmp; + for (i = 0; i < MAX_DEV_IDS; i++) + list_for_each_safe(pos, tmp, &mic_data.sku_table[i]) { + node = list_entry(pos, sku_info_t, sku); + list_del(pos); + kfree(node); + } +} + +int +sku_find(mic_ctx_t *mic_ctx, uint32_t device_id) +{ + int ret = 0; + uint32_t cnt = 0; + sku_info_t *match, *newnode = NULL, *skunode; + struct list_head skulist_memsize_in; + struct list_head skulist_memfreq_in; + struct list_head skulist_out; + uint32_t fuse_rev, memsize, memfreq; + struct list_head *pos, *tmp; + const char *invalid = "INVALID SKU"; + + /* Use the LSB as index to the array of pointers to the SKU table*/ + device_id = device_id & 0xf; + + if (device_id > MAX_DEV_IDS) { + strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + return -EINVAL; + } + + INIT_LIST_HEAD(&skulist_memsize_in); + INIT_LIST_HEAD(&skulist_memfreq_in); + INIT_LIST_HEAD(&skulist_out); + + /* Search by fuse_config_rev */ + fuse_rev = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH7); + fuse_rev = (fuse_rev >> SHIFT_FUSE_CONFIG_REV) & MASK_FUSE_CONFIG_REV; + + 
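+ /* The lookup narrows in passes: first by fuse_config_rev range, then by memory size class (<= 4GB vs > 4GB), then by memory frequency, stopping as soon as exactly one candidate remains. For example, device id 0x225C with fuse_rev 300 falls only in the 256-350 range of its table and resolves to "C0PRQ-7120 P/A/X/D" after the first pass. */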
list_for_each_safe(pos, tmp, &mic_data.sku_table[device_id]) { + match = list_entry(pos, sku_info_t, sku); + if ((match->fuserev_low <= fuse_rev) && (match->fuserev_high >= fuse_rev)) { + cnt++; + ret = sku_create_node(match->fuserev_low, match->fuserev_high, + match->memsize, match->memfreq, match->sku_name, &newnode); + if (ret) { + strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + goto cleanup; + } + list_add_tail(&newnode->sku, &skulist_out); + } + } + /* If only one node is present, the match has been found */ + if (cnt == 1) { + strncpy(mic_ctx->sku_name, newnode->sku_name, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + goto cleanup; + } + + sku_swap_list(&skulist_out, &skulist_memsize_in); + /* Search by memsize */ + memsize = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH0); + memsize = (memsize >> SHIFT_MEMSIZE) & MASK_MEMSIZE; + memsize = memsize >> 20; + if (memsize > SKU_MEM_DIVIDE) + memsize = SKU_HIGH_MEM; + else + memsize = SKU_LOW_MEM; + + cnt = 0; + list_for_each_safe(pos, tmp, &skulist_memsize_in) { + match = list_entry(pos, sku_info_t, sku); + /* Use the MSB for comparison */ + /* Assumption - From the latest documentation, a particular + * combination of device id and fuse_rev can either have memory + * <=4GB (SKU_LOW_MEM) or > 4GB (SKU_HIGH_MEM) + */ + if (memsize == match->memsize) { + cnt++; + ret = sku_create_node(match->fuserev_low, match->fuserev_high, + match->memsize, match->memfreq, match->sku_name, &newnode); + if (ret) { + strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + goto cleanup; + } + list_add_tail(&newnode->sku, &skulist_out); + } + + } + list_for_each_safe(pos, tmp, &skulist_memsize_in) { + skunode = list_entry(pos, sku_info_t, sku); + list_del(pos); + kfree(skunode); + } + if (cnt == 1) { + strncpy(mic_ctx->sku_name, newnode->sku_name, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + goto cleanup; + } + + sku_swap_list(&skulist_out, &skulist_memfreq_in); + /* Search by memfreq */ + memfreq = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH9); + memfreq = (memfreq >> SHIFT_MEMFREQ) & MASK_MEMFREQ; + + cnt = 0; + list_for_each_safe(pos, tmp, &skulist_memfreq_in) { + match = list_entry(pos, sku_info_t, sku); + if (memfreq == match->memfreq) { + cnt++; + ret = sku_create_node(match->fuserev_low, match->fuserev_high, + match->memsize, match->memfreq, match->sku_name, &newnode); + if (ret) { + strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + goto cleanup; + } + list_add_tail(&newnode->sku, &skulist_out); + } + + } + list_for_each_safe(pos, tmp, &skulist_memfreq_in) { + skunode = list_entry(pos, sku_info_t, sku); + list_del(pos); + kfree(skunode); + } + if (cnt == 1) { + strncpy(mic_ctx->sku_name, newnode->sku_name, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + } else { + strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1); + mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0'; + } + + +cleanup: + list_for_each_safe(pos, tmp, &skulist_out) { + skunode = list_entry(pos, sku_info_t, sku); + list_del(pos); + kfree(skunode); + } + + return ret; +} + + +int +sku_build_table(void) +{ + int i = 0; + sku_info_t *newnode = NULL; + + for ( i = 0; i < MAX_DEV_IDS; i++) + INIT_LIST_HEAD(&mic_data.sku_table[i]); + + /*2250*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU1", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + 
if (sku_create_node(0, 1, SKU_HIGH_MEM, FREQ_2P4, "A0PO-SKU1", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5,"ES1-SKU2", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(2, 3, SKU_HIGH_MEM, FREQ_4P5, "ES1-SKU2", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(4, 49, SKU_HIGH_MEM, FREQ_4P5, "ES1B-SKU2", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(50, 100, SKU_HIGH_MEM, FREQ_4P5, "B0PO-SKU2", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(101, 150, SKU_HIGH_MEM, FREQ_5P0, "ES2-P1640", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(153, 154, SKU_HIGH_MEM, FREQ_5P0, "B1PO-5110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(151, 152, SKU_HIGH_MEM, FREQ_5P0, "B1PO-P1640/D1650", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(156, 156, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-5110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(155, 155, SKU_HIGH_MEM, FREQ_5P0, "B1QS-5110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-5110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(158, 250, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-5110P/5120D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P0, "C0-5110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P0, "C0QS-5110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P5, "C0-5120D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P5, "C0QS-5120D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P0, "C0PRQ-5110P/5140P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P5, "C0PRQ-5120D/5140D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[0]); + + /*2251*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU2", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[1]); + + if (sku_create_node(0, 1, SKU_HIGH_MEM, FREQ_2P4, "A0PO-SKU2", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[1]); + + /*2252*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU3", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[2]); + + /*2253*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU4", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[3]); + + if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_2P4, "ES1-SKU5", &newnode)) + return -ENOMEM; + 
list_add_tail(&newnode->sku, &mic_data.sku_table[3]); + + if (sku_create_node(4, 49, SKU_LOW_MEM, FREQ_2P4, "ES1B-SKU5", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[3]); + + if (sku_create_node(50, 100, SKU_LOW_MEM, FREQ_4P5, "B0PO-SKU5", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[3]); + + /*2254*/ + + /*2255*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKUX", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[5]); + + /*2256*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU5", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[6]); + + /*2257*/ + if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKUZ", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[7]); + + /*2258*/ + if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5, "ES1-SKU1", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[8]); + if (sku_create_node(2, 3, SKU_HIGH_MEM, FREQ_4P5, "ES1-SKU1", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[8]); + if (sku_create_node(4, 49, SKU_HIGH_MEM, FREQ_5P5, "ES1B-SKU1", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[8]); + if (sku_create_node(50, 100, SKU_HIGH_MEM, FREQ_5P5, "B0PO-SKU1", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[8]); + + /*2259*/ + if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5, "ES1-SKU3", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[9]); + + if (sku_create_node(2, 3, SKU_HIGH_MEM, FREQ_4P5, "ES1-SKU3", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[9]); + + /*225A*/ + if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5, "ES1-SKU4", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[10]); + + if (sku_create_node(4, 49, SKU_LOW_MEM, FREQ_5P0, "ES1B-SKU4", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[10]); + + if (sku_create_node(50, 100, SKU_LOW_MEM, FREQ_5P0, "B0PO-SKU4", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[10]); + + if (sku_create_node(101, 150, SKU_LOW_MEM, FREQ_5P0, "ES2-SKU4", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[10]); + + /*225B*/ + if (sku_create_node(4, 49, SKU_HIGH_MEM, FREQ_5P5, "ES1B-SKU3cs", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[11]); + + if (sku_create_node(4, 49, SKU_LOW_MEM, FREQ_5P5, "ES1B-SKU3ncs", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[11]); + + if (sku_create_node(50, 100, SKU_HIGH_MEM, FREQ_5P5, "B0PO-SKU3cs", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[11]); + + if (sku_create_node(50, 100, SKU_LOW_MEM, FREQ_5P5, "B0PO-SKU3ncs", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[11]); + + /*225C*/ + if (sku_create_node(101, 150, SKU_HIGH_MEM, FREQ_5P5, "ES2-P/A/X 1750", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(153, 154, SKU_HIGH_MEM, FREQ_5P5, "B1PO-7110 P/A/X", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(155, 155, SKU_HIGH_MEM, FREQ_5P5, "B1QS-7110 P/A/X", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if 
(sku_create_node(151, 152, SKU_HIGH_MEM, FREQ_5P0, "B1PO-P/A 1750", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(156, 156, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-7110 P/A/X", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-7110 P/A/X", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(158, 202, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-7110 P/X", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(203, 250, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-SE10 P/X", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P5, "C0-7120 P/A/X/D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P5, "C0QS-7120 P/A/X/D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P5, "C0PRQ-7120 P/A/X/D", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[12]); + + /*225D*/ + if (sku_create_node(101, 150, SKU_LOW_MEM, FREQ_5P0, "ES2-P1310", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(101, 150, SKU_HIGH_MEM, FREQ_5P0, "ES2-A1330", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(153, 154, SKU_LOW_MEM, FREQ_5P0, "B1PO-3110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(153, 154, SKU_HIGH_MEM, FREQ_5P0, "B1PO-3115A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(157, 157, SKU_LOW_MEM, FREQ_5P0, "B1PRQ-3110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-3115A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(156, 156, SKU_LOW_MEM, FREQ_5P0, "B1PRQ-3110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(156, 156, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-3115A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(155, 155, SKU_HIGH_MEM, FREQ_5P0, "B1QS-3115A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(155, 155, SKU_LOW_MEM, FREQ_5P0, "B1QS-3110P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(158, 250, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-3120P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P0, "C0-3120 P/A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P0, "C0QS-3120 P/A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P0, "C0PRQ-3120/3140 P/A", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[13]); + + /*225E*/ + if 
(sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-31S1P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[14]); + + if (sku_create_node(158, 250, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-31S1P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[14]); + + if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P0, "C0-31S1P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[14]); + + if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P0, "C0QS-31S1P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[14]); + + if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P0, "C0PRQ-31S1P", &newnode)) + return -ENOMEM; + list_add_tail(&newnode->sku, &mic_data.sku_table[14]); + + return 0; // Successed +} diff --git a/host/uos_download.c b/host/uos_download.c new file mode 100644 index 0000000..6a323c7 --- /dev/null +++ b/host/uos_download.c @@ -0,0 +1,1950 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +/* contains code to download uos on MIC card */ + +#include "mic_common.h" +#include +#include "micint.h" +#include +#include +#include "mic/mic_virtio.h" +#include +#include "mic/micveth.h" + + +#define APERTURE_SEGMENT_SIZE ((1) * 1024 * 1024 * 1024ULL) + +#define UOS_RESERVE_SIZE_MIN ((128) * 1024 * 1024) +#define OS_RESERVE_SIZE_MIN ((32) * 1024 * 1024) +#define UOS_RESERVE_SIZE_MAX (((4) * 1024 * 1024 * 1024ULL) - ((4) * 1024)) +#define UOS_RESERVE_PERCENT 50 + +#define UOS_WATCHDOG_TIMEOUT 5000 // default watchdog timeout in milliseconds + +#define PCIE_CLASS_CODE(x) ((x) >> 24 ) + +/* zombie class code as per the HAS is 0xFF + * but on KNC, we found it as 0x03 + */ +#define ZOMBIE_CLASS_CODE 0x03 +#define DISABLE_BAR 0x02 +#define RESET_FAILED_F2 12870 +#define RESET_FAILED_F4 13382 + +void ramoops_remove(mic_ctx_t *mic_ctx); + +static struct proc_dir_entry *ramoops_dir; +struct proc_dir_entry *vmcore_dir; + + +static void adapter_dpc(unsigned long dpc); +extern int mic_vhost_blk_probe(bd_info_t *bd_info); +extern void mic_vhost_blk_remove(bd_info_t *bd_info); + +/* driver wide global common data */ +mic_data_t mic_data; +extern int usagemode_param; +extern bool mic_crash_dump_enabled; +extern bool mic_watchdog_auto_reboot; + +static int64_t etc_comp = 0; + +static uint64_t +etc_read(uint8_t *mmio_va) +{ + uint32_t low; + uint32_t hi1,hi2; + + do { + hi1 = SBOX_READ(mmio_va, SBOX_ELAPSED_TIME_HIGH); + low = SBOX_READ(mmio_va, SBOX_ELAPSED_TIME_LOW); + hi2 = SBOX_READ(mmio_va, SBOX_ELAPSED_TIME_HIGH); + } while(hi1 != hi2); + + return((uint64_t)((((uint64_t)hi1 << 32) | low) >> 5)); +} + +static int64_t +calc_deltaf(mic_ctx_t *mic_ctx) +{ + const int64_t ETC_CLK_FREQ = 15625000; + const uint32_t TIME_DELAY_IN_SEC = 10; + const int64_t etc_cnt1 = ETC_CLK_FREQ * TIME_DELAY_IN_SEC; + int64_t etc_cnt2; + + uint64_t cnt1, cnt2; + int64_t deltaf_in_ppm, deltaf; + + /* + * (etc_freq2 / etc_freq1) = (etc_count2 / etc_count1) + * etc_freq1 = ETC_CLK_FREQ + * => etc_count1 = TIME_DELAY_IN_SEC * ETC_CLK_FREQ + * (etc_freq2 / etc_freq1) = (etc_count2 / etc_count1) + * etc_freq2 = etc_freq1 * (etc_count2 / etc_count1) + * etc_freq2 - etc_freq1 = etc_freq1((etc_count2 / etc_count1) - 1) + * deltaf = etc_freq1(etc_count2 - etc_count1)/etc_count1 + * deltaf_in_ppm = deltaf * 10 ^ 6 / etc_freq1 + * deltaf_in_ppm = ((etc_count2 - etc_count1) * 10 ^ 6) / etc_count1 + */ + /* Need to implement the monotonic/irqsave logic for windows */ + unsigned long flags; + struct timespec ts1, ts2; + int64_t mono_ns; + int i = 0; + do { + local_irq_save(flags); + cnt1 = etc_read(mic_ctx->mmio.va); + getrawmonotonic(&ts1); + local_irq_restore(flags); + mdelay(TIME_DELAY_IN_SEC * 1000); + local_irq_save(flags); + cnt2 = etc_read(mic_ctx->mmio.va); + getrawmonotonic(&ts2); + local_irq_restore(flags); + etc_cnt2 = cnt2 - cnt1; + ts2 = timespec_sub(ts2, ts1); + mono_ns = timespec_to_ns(&ts2); + /* Recalculate etc_cnt2 based on getrawmonotonic */ + etc_cnt2 = (etc_cnt2 * TIME_DELAY_IN_SEC * 1000 * 1000 * 1000) / mono_ns; + deltaf = ( ETC_CLK_FREQ * (etc_cnt2 - etc_cnt1)) / etc_cnt1; + deltaf_in_ppm = (1000 * 1000 * (etc_cnt2 - etc_cnt1)) / etc_cnt1; + i++; + /* + * HSD #4844900 + * On some of the systems deltaf_in_ppm is turning out + * way higher than expected. 
The only reasons I can think of + * are: + * i) mmio traffic cauing variable delays for mmio read + * ii) NMIs affecting this code + */ + } while (i < 10 && (deltaf_in_ppm > 2700 || deltaf_in_ppm < -2700)); + + pr_debug("etc deltaf: %lld\n", deltaf); + /* + * For intel chipsets, Spread Spectrum Clocking (SSC) (in the limit) + * is downspread with a frequency of 30hz and an amplitude of 0.5% + * which translates to 2500ppm. This is also the ppm observed on KNC + CrownPass + * Hence, if ppm > 2500, the code would need to retry to eliminate any chance of error + * Added an error margin of 1ppm (etc mmio reads can take really long time) + */ + if (deltaf_in_ppm > 2700 || deltaf_in_ppm < -2700) { + printk(KERN_ERR "ETC timer compensation(%lldppm) is much higher" + "than expected\n", deltaf_in_ppm); + /* + * HSD #4844900 + * Clamp etc compensation to 2500ppm + */ + if (deltaf_in_ppm > 2700) + deltaf_in_ppm = 2500; + else + deltaf_in_ppm = -2500; + deltaf = (ETC_CLK_FREQ * deltaf_in_ppm) / (1000 * 1000); + } + if (deltaf > 0 && deltaf <= 10) + deltaf = 0; + return deltaf; +} + +void +calculate_etc_compensation(mic_ctx_t *mic_ctx) +{ + if (mic_ctx->bi_family == FAMILY_KNC) { + if (!etc_comp) + etc_comp = calc_deltaf(mic_ctx); + mic_ctx->etc_comp = etc_comp; + } +} + +/* + DESCRIPTION:: waits for bootstrap loader is finished + PARAMETERS:: + [in]void *mmio_va - virtual address to access MMIO registers + RETURN_VALUE:: 0 if successful, non-zero if failure +*/ +int +wait_for_bootstrap(uint8_t *mmio_va) +{ + uint32_t scratch2 = 0; + int count = 0; +#ifdef MIC_IS_EMULATION + int wait_time = 0; +#endif + + // Wait until the boot loader is finished + while (!SCRATCH2_DOWNLOAD_STATUS(scratch2)) { + msleep(100); + if (count == 600) { +#ifndef MIC_IS_EMULATION + printk("Firmware is not responding with ready bit\n"); + return -EIO; +#else + /* We don't want to be polling too often on the emulator, it is SLOW! */ + pr_debug("Wait for bootstrap: %d min(s) \n", wait_time++); + count = 0; +#endif + } + + count++; + scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2); + } + + return 0; +} + +/* + DESCRIPTION::gets adapter memory size. 
calculates size based on scratch register 0 + PARAMETERS:: + [in]void *mmio_va - virtual address to access MMIO registers + [out]uint32_t *adapter_mem_size - adapter memory size + RETURN_VALUE:: none +*/ +void +get_adapter_memsize(uint8_t *mmio_va, uint32_t *adapter_mem_size) +{ + uint32_t memsize = 0; + uint32_t scratch0 = {0}; + + scratch0 = SBOX_READ(mmio_va, SBOX_SCRATCH0); + memsize = SCRATCH0_MEM_SIZE_KB(scratch0) * ((1) * 1024); + + // Adjust the memory size based on the memory usage + switch (SCRATCH0_MEM_USAGE(scratch0)) { + case SCR0_MEM_ALL: + // Do nothing + break; + + case SCR0_MEM_HALF: + memsize /= 2; + break; + + case SCR0_MEM_THIRD: + memsize /= 3; + break; + + case SCR0_MEM_FOURTH: + memsize /= 4; + break; + + default: + // DBG_ASSERT_MSG(false, "Invalid memory usage specified by the bootstrap.\n"); + break; + } + + *adapter_mem_size = memsize; +} + +/* + DESCRIPTION:: gets uos load offset from scratch register 2 + PARAMETERS:: + [in]void *mmio_va - virtual address to access MMIO registers + [out]uint32_t *uos_load_offset - offset at which uos will be loaded + RETURN_VALUE:: none +*/ +void +get_uos_loadoffset(uint8_t *mmio_va, uint32_t *uos_load_offset) +{ + uint32_t scratch2 = 0; + + scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2); + *uos_load_offset = SCRATCH2_DOWNLOAD_ADDR(scratch2); +} + +/* + DESCRIPTION:: gets reserved size for uos + PARAMETERS:: + [out]uint32_t *uos_reserve_size - reserved uos size + RETURN_VALUE:: none +*/ +void +get_uos_reserved_size(uint8_t* mmio_va, uint32_t adapter_memsize, uint32_t *uos_reserve_size) +{ + uint32_t reserve_size = 0; + + // Only calculate if not explicitly specified by the user + reserve_size = (uint32_t)(adapter_memsize * UOS_RESERVE_PERCENT / 100); + + // Make sure there is at least WINDOWS_RESERVE_SIZE_MIN bytes + reserve_size = GET_MIN(reserve_size, adapter_memsize - OS_RESERVE_SIZE_MIN); + + // Keep in mind maximum uos reserve size is uint32_t, so we never overflow + reserve_size = GET_MIN(reserve_size, UOS_RESERVE_SIZE_MAX); + reserve_size = GET_MAX(reserve_size, UOS_RESERVE_SIZE_MIN); + + // Always align uos reserve size to a page + reserve_size = (uint32_t)AlignLow(reserve_size, ((4) * 1024)); + + *uos_reserve_size = reserve_size; +} + +/* + DESCRIPTION:: gets APIC ID from scratch register 2 + PARAMETERS:: + [in]void *mmio_va - virtual address to access MMIO registers + [out]uint32_t *apic_id - apic id + RETURN_VALUE:: none +*/ +void +get_apic_id(uint8_t *mmio_va, uint32_t *apic_id) +{ + uint32_t scratch2 = 0; + + scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2); + *apic_id = SCRATCH2_APIC_ID(scratch2); +} + +/* + DESCRIPTION::program the PCI aperture as a contiguous window. 
(only supports up to 4GB of memory) + PARAMETERS:: + [in]mic_ctx_t *mic_ctx - mic ctx + [in]uint32_t gtt_index - beginning gtt entry index + [in]uint64_t phy_addr - physical address for PCI aperture + [in]uint32_t num_bytes - size of PCI aperture + RETURN_VALUE:: None + */ +void +set_pci_aperture(mic_ctx_t *mic_ctx, uint32_t gtt_index, uint64_t phy_addr, uint32_t num_bytes) +{ + uint32_t num_pages; + uint32_t gtt_entry; + uint32_t i; + + num_pages = ALIGN(num_bytes, PAGE_SIZE) >> PAGE_SHIFT; + + for (i = 0; i < num_pages; i++) { + + gtt_entry = ((uint32_t)(phy_addr >> PAGE_SHIFT) + i) << 1 | 0x1u; + GTT_WRITE(gtt_entry, mic_ctx->mmio.va, (gtt_index + i)*sizeof(gtt_entry)); + } + + // XPU_RACE_CONDITION: + // Writing GttTlbFlushReg DOES NOT flush all write transactions from SBOX to GDDR + // because GttTlbFlushReg is an SBOX register and the transaction terminates in the SBOX. + // An MMIO write must use the MIC ringbus to be serializing. + // Writing the GTT itself DOES serialize: the GTT is in MMIO space, and the write goes to the ringbus. + // The memory barrier makes sure all writes make it to GDDR before the tlbFlush write. + smp_mb(); // FIXME: only needs SFENCE + + // write any value to cause a flush + SBOX_WRITE(1, mic_ctx->mmio.va, SBOX_TLB_FLUSH); +}
+ +/* + DESCRIPTION:: Programs a scratch register that the bootstrap reads to determine + how large the uOS image is. + PARAMETERS:: + [in]void *mmio_va - virtual address to mmio register, + [in]uint32_t uos_size - size of uos image + RETURN_VALUE:: none +*/ +void +set_uos_size(uint8_t *mmio_va, uint32_t uos_size) +{ + uint32_t scratch5; + + scratch5 = uos_size; + // XPU_RACE_CONDITION: write to MMIO space is uncached and flushes WC buffers + SBOX_WRITE(scratch5, mmio_va, SBOX_SCRATCH5); +}
+ +/* + DESCRIPTION:: Programs a scratch register that the uOS reads to determine how + much memory to reserve. + PARAMETERS:: + [in]void *mmio_va - virtual address to mmio register, + [in]uint32_t uos_reserved_size - size of memory to be reserved by uos. + RETURN_VALUE:: none +*/ +void +set_uos_reserved_size(uint8_t *mmio_va, uint32_t uos_reserved_size) +{ + uint32_t scratch3; + + scratch3 = uos_reserved_size; + // XPU_RACE_CONDITION: write to MMIO space is uncached and flushes WC buffers + SBOX_WRITE(scratch3, mmio_va, SBOX_SCRATCH3); +}
+ +/* + DESCRIPTION:: determines the product family from the PCI device ID.
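+ Device IDs 0x2249/0x224a map to FAMILY_ABR, the 0x2250-0x225e range maps to FAMILY_KNC, and any other ID is reported as FAMILY_UNKNOWN (see the switch below).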
+ PARAMETERS:: + [in]uint32_t device_id - device ID, + RETURN_VALUE:: family type +*/ +product_family_t +get_product_family(uint32_t device_id) +{ + product_family_t product_family; + + switch (device_id) { + case PCI_DEVICE_ABR_2249: + case PCI_DEVICE_ABR_224a: + product_family = FAMILY_ABR; + break; + + case PCI_DEVICE_KNC_2250: + case PCI_DEVICE_KNC_2251: + case PCI_DEVICE_KNC_2252: + case PCI_DEVICE_KNC_2253: + case PCI_DEVICE_KNC_2254: + case PCI_DEVICE_KNC_2255: + case PCI_DEVICE_KNC_2256: + case PCI_DEVICE_KNC_2257: + case PCI_DEVICE_KNC_2258: + case PCI_DEVICE_KNC_2259: + case PCI_DEVICE_KNC_225a: + case PCI_DEVICE_KNC_225b: + case PCI_DEVICE_KNC_225c: + case PCI_DEVICE_KNC_225d: + case PCI_DEVICE_KNC_225e: + product_family = FAMILY_KNC; + break; + + default: + pr_debug( "Invalid/Unknown device ID %d\r\n", device_id); + product_family = FAMILY_UNKNOWN; + break; + } + + return product_family; +} + +/* + DESCRIPTION:: loads uos image at given path into gddr + PARAMETERS:: + [in]mic_ctx_t *mic_ctx - mic context + [in]imgname - file path for uos file to be loaded + [out]uos_size - size of uos image + */ +int +load_uos_into_gddr(mic_ctx_t *mic_ctx, char *imgname, uint32_t* uos_size, uint64_t *uos_cmd_offset) +{ + void *aperture_va; + uint8_t *mmio_va; + uint32_t apic_id = 0; + uint32_t uos_load_offset = 0; + uint32_t adapter_memsize = 0; + int status = 0; + + aperture_va = mic_ctx->aper.va; + mmio_va = mic_ctx->mmio.va; + + if (mic_ctx->state != MIC_BOOT) { + printk("Not in booting state\n"); + return -EPERM; + } + + status = mic_get_file_size(imgname, uos_size); + + if (status) { + mic_ctx->state = MIC_BOOTFAIL; + printk("Linux image not found at %s , status returned %d\n", imgname, status); + return status; + } + + get_uos_loadoffset(mmio_va, &uos_load_offset); + // Determine the uOS reserve size after we have the m_pXpu interface + get_adapter_memsize(mmio_va, &adapter_memsize); + + get_apic_id(mmio_va, &apic_id); + // store apic_id in adapter context for later use + mic_ctx->apic_id = apic_id; + + if (mic_ctx->bi_family == FAMILY_ABR){ + // Program the PCI aperture as a contiguous window + // Need an extra page to provide enough buffer space for command line arguments. + set_pci_aperture(mic_ctx, 0, uos_load_offset, *uos_size + PAGE_SIZE); + uos_load_offset = 0; + } + + // transfer uOs image file to gddr + status = mic_load_file(imgname, ((uint8_t*)aperture_va) + uos_load_offset, *uos_size); + + // for the emulator we want to skip "downloading" the file + *uos_cmd_offset = (uint64_t)uos_load_offset + *uos_size; + + // This only applies to KNF bootstrap, it is NOT needed for KNC + if (mic_ctx->bi_family == FAMILY_ABR) { + // clear UOS load offset register after uOS was uploaded + SBOX_WRITE(0, mmio_va, SBOX_SCRATCH2); + SBOX_READ(mmio_va, SBOX_SCRATCH2); + } + + return status; +} + +/* + DESCRIPTION:: loads uos initramfs image at given path into gddr for KNC. 
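+ The initramfs is placed at twice the kernel load offset, and its GDDR offset and size are then patched into the loaded kernel image at offsets 0x218 and 0x21c, which are the ramdisk_image and ramdisk_size fields of the x86 boot protocol setup header.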
+ PARAMETERS:: + [in]mic_ctx_t *mic_ctx - mic context + [in]initramfsname - file path for uos initramfs file to be loaded + */ +int +load_initramfs(mic_ctx_t *mic_ctx, char *initramfsname, uint32_t *initramfs_image, uint32_t *initramfs_size) +{ + uint8_t *aperture_va; + uint8_t *mmio_va; + uint32_t apic_id = 0; + uint32_t uos_load_offset = 0; + uint32_t file_load_offset = 0; + uint32_t adapter_memsize = 0; + uint32_t file_size = 0; + int status = 0; + uint32_t *ramfs_addr_ptr; + + aperture_va = mic_ctx->aper.va; + mmio_va = mic_ctx->mmio.va; + + if (mic_ctx->state != MIC_BOOT) { + printk("Not in booting state\n"); + return -EPERM; + } + + status = mic_get_file_size(initramfsname, &file_size); + + if (status) { + mic_ctx->state = MIC_BOOTFAIL; + printk("Init ram disk image not found at %s , status returned %d\n", initramfsname, status); + return status; + } + + get_uos_loadoffset(mmio_va, &uos_load_offset); + file_load_offset = uos_load_offset << 1; /* Place initramfs higher than kernel; 128MB is ok */ + + *initramfs_size = file_size; + *initramfs_image = file_load_offset; + + // Determine the uOS reserve size after we have the m_pXpu interface + get_adapter_memsize(mmio_va, &adapter_memsize); + get_apic_id(mmio_va, &apic_id); + + // store apic_id in adapter context for later use + mic_ctx->apic_id = apic_id; + + // transfer uOs image file to gddr + status = mic_load_file(initramfsname, aperture_va + file_load_offset, file_size); + + // write the initramfs load address and size to the fields in the kernel header + ramfs_addr_ptr = (uint32_t *)(aperture_va + uos_load_offset + 0x218); + *ramfs_addr_ptr = file_load_offset; + ramfs_addr_ptr = (uint32_t *)(aperture_va + uos_load_offset + 0x21c); + *ramfs_addr_ptr = *initramfs_size; + + return status; +} + +struct tmpqp { + uint64_t ep; + uint64_t magic; +}; + +int +load_command_line(mic_ctx_t *mic_ctx, uint64_t uos_cmd_offset) +{ + void *cmd_line_va = mic_ctx->aper.va + uos_cmd_offset; + uint32_t cmdlen = 0; + char *buf = NULL; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE) + struct board_info *bi = mic_ctx->bd_info; +#endif + +#ifdef USE_VCONSOLE + micvcons_t *vcons = &mic_ctx->bi_vcons; + dma_addr_t vc_hdr_dma_addr = 0; +#endif + + /* + * mic_ctx->boot_mem will also be set in IOCTL to boot the card in restricted memory + * FIXME::This code is added to keep the backward compatibility with IOCTLs + */ + if (mic_ctx->bi_family == FAMILY_KNC) + if (mic_ctx->boot_mem == 0 || mic_ctx->boot_mem > mic_ctx->aper.len >> 20) + mic_ctx->boot_mem = (uint32_t)(mic_ctx->aper.len >> 20); + if (!(buf = kzalloc(MIC_CMDLINE_BUFSIZE, GFP_KERNEL))) { + printk(KERN_ERR "failed to allocate %d bytes for uOS command line\n", + MIC_CMDLINE_BUFSIZE); + return -ENOMEM; + } + + cmdlen = snprintf(buf, MIC_CMDLINE_BUFSIZE, "card=%d vnet=%s scif_id=%d scif_addr=0x%llx", + mic_ctx->bi_id, mic_vnet_modes[mic_vnet_mode], + mic_ctx->bi_id + 1, mic_ctx->bi_scif.si_pa); + + if (mic_vnet_mode == VNET_MODE_DMA) { + struct micvnet_info *vnet_info = mic_ctx->bi_vethinfo; + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " vnet_addr=0x%llx", vnet_info->vi_rp_phys); + } + +#ifdef USE_VCONSOLE + if (vcons->dc_enabled) + vc_hdr_dma_addr = vcons->dc_hdr_dma_addr; + + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " vcons_hdr_addr=0x%llx", vc_hdr_dma_addr); +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE) + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, " 
virtio_addr=0x%llx", + mic_ctx_map_single(mic_ctx, bi->bi_virtio, sizeof(struct vb_shared))); +#endif + + if (mic_ctx->boot_mem) + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " mem=%dM", mic_ctx->boot_mem); + mic_ctx->boot_mem = 0; + + if (mic_ctx->ramoops_size) + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " ramoops_size=%d ramoops_addr=0x%llx", + mic_ctx->ramoops_size, mic_ctx->ramoops_pa[0]); + + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " p2p=%d p2p_proxy=%d", mic_p2p_enable, mic_p2p_proxy_enable); + + if (mic_ctx->bi_family == FAMILY_KNC) + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " etc_comp=%lld", mic_ctx->etc_comp); + + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " reg_cache=%d", mic_reg_cache_enable); + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " ulimit=%d", mic_ulimit_check); + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " huge_page=%d", mic_huge_page_enable); + if (mic_crash_dump_enabled) + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " crashkernel=1M@80M"); + /* + * Limitations in the Intel Jaketown and Ivytown platforms require SCIF + * to proxy P2P DMA read transfers in order to convert them into a P2P DMA + * write for better performance. The SCIF module on MIC needs the + * numa node the MIC is connected to on the host to make decisions + * about whether to proxy P2P DMA reads or not based on whether the two MIC + * devices are connected to the same QPI/socket/numa node or not. + * The assumption here is that a socket/QPI will have a unique + * numa node number. + */ + pr_debug("CPU family = %d, CPU model = %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model); + + if (mic_p2p_proxy_enable && (boot_cpu_data.x86==6) && + (boot_cpu_data.x86_model == 45 || boot_cpu_data.x86_model == 62)) { + int numa_node = dev_to_node(&mic_ctx->bi_pdev->dev); + if (-1 != numa_node) { + if (boot_cpu_data.x86_model == 45) + ms_info.mi_proxy_dma_threshold = SCIF_PROXY_DMA_THRESHOLD_JKT; + if (boot_cpu_data.x86_model == 62) + ms_info.mi_proxy_dma_threshold = SCIF_PROXY_DMA_THRESHOLD_IVT; + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " numa_node=%d", numa_node); + cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " p2p_proxy_thresh=%lld", ms_info.mi_proxy_dma_threshold); + } + } + + if (mic_ctx->sysfs_info.cmdline != NULL) + snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " %s", mic_ctx->sysfs_info.cmdline); + else + snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, + " hostname=mic%d ipaddr=171.31.%d.2 quiet console=ttyS0,115200n8", + mic_ctx->bi_id, mic_ctx->bi_id + 1); + + memcpy_toio(cmd_line_va, buf, strlen(buf) + 1); + + if (mic_ctx->sysfs_info.kernel_cmdline != NULL) + kfree(mic_ctx->sysfs_info.kernel_cmdline); + + if ((mic_ctx->sysfs_info.kernel_cmdline = kmalloc(strlen(buf) + 1, GFP_KERNEL)) != NULL) + strcpy(mic_ctx->sysfs_info.kernel_cmdline, buf); + + kfree(buf); + return 0; +} + +/* + DESCRIPTION:: method responsible for programming scratch register with uos image size + and notifying bootstrap to start booting uos + PARAMETERS:: + [in]mic_ctx_t *mic_ctx - mic context + [in]uint32_t uos_size - size of uos image + */ +int +notify_uosboot(mic_ctx_t *mic_ctx, uint32_t uos_size) +{ + int status = 0; + uint32_t adapter_memsize = 0; + uint32_t uos_reserved_size = 0; + uint8_t* mmio_va = mic_ctx->mmio.va; + + // Program the register with uOS image size for bootstrap + 
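// (the uOS image size goes into SBOX_SCRATCH5 and the uOS reserved-memory size into SBOX_SCRATCH3; see set_uos_size()/set_uos_reserved_size() above) +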
set_uos_size(mmio_va, uos_size); + + get_adapter_memsize(mmio_va, &adapter_memsize); + + // Program the register to inform the uOS of how much space to reserve + get_uos_reserved_size(mmio_va, adapter_memsize, &uos_reserved_size); + set_uos_reserved_size(mmio_va, uos_reserved_size); + + mic_send_bootstrap_intr(mic_ctx); + + return status; +}
+ +/* + DESCRIPTION :: boots the Linux OS on the card + PARAMETERS :: + [in]mic_ctx_t *mic_ctx - mic context + [in]char *imgname - file path for uos image to be loaded on the card + RETURN_VALUE:: 0 if successful, non-zero if failure +*/ +int +boot_linux_uos(mic_ctx_t *mic_ctx, char *imgname, char *initramfsname) +{ + int status = 0; + uint32_t uos_size = 0; + uint64_t uos_cmd_offset = 0; + uint32_t initramfs_image = 0; + uint32_t initramfs_size = 0; + + printk("MIC %d Booting\n", mic_ctx->bi_id); + + if (mic_ctx->state != MIC_BOOT) { + printk(KERN_ERR "MIC %d is not in offline mode\n", mic_ctx->bi_id); + return -EPERM; + } + + // load the uos image at the given path into gddr + if ((status = load_uos_into_gddr(mic_ctx, imgname, &uos_size, &uos_cmd_offset)) != 0) { + printk("Cannot load uos in gddr\n"); + return status; + } + + if (initramfsname && (status = load_initramfs(mic_ctx, initramfsname, &initramfs_image, &initramfs_size)) != 0) { + printk("Cannot load initramfs in gddr\n"); + return status; + } + + status = load_command_line(mic_ctx, uos_cmd_offset); + + // program the scratch register with the uos image size and notify the bootstrap + status = notify_uosboot(mic_ctx, uos_size); + + return status; +}
+ +/* + DESCRIPTION :: boots the maintenance mode handler on the card + PARAMETERS :: + [in]mic_ctx_t *mic_ctx - mic context + [in]char *imgname - file path for uos image to be loaded on the card + RETURN_VALUE:: 0 if successful, non-zero if failure +*/ +int boot_micdev_app(mic_ctx_t *mic_ctx, char *imgname) +{ + int status = 0; + uint32_t uos_size = 0; + uint8_t *mmio_va = 0; + uint64_t uos_cmd_offset = 0; + int32_t temp_scratch2 = 0; + + printk("MIC %d Booting\n", mic_ctx->bi_id); + mmio_va = mic_ctx->mmio.va; + status = load_uos_into_gddr(mic_ctx, imgname, &uos_size, &uos_cmd_offset); + if (status) { + printk("Cannot load uos in gddr\n"); + goto exit; + } + + temp_scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2); + /* clear download bit */ + temp_scratch2 = SCRATCH2_CLEAR_DOWNLOAD_STATUS(temp_scratch2); + SBOX_WRITE(temp_scratch2, mmio_va, SBOX_SCRATCH2); + + // program the scratch register with the uos image size and notify the bootstrap + status = notify_uosboot(mic_ctx, uos_size); + if (status) + goto exit; + status = wait_for_bootstrap(mmio_va); +exit: + if (status) { + mic_setstate(mic_ctx, MIC_BOOTFAIL); + } else { + mic_setstate(mic_ctx, MIC_ONLINE); + mic_ctx->boot_count++; + printk("ELF booted successfully\n"); + } + return status; +}
+ +/* Timer callback that polls for completion of a hardware reset of the device */ +void +reset_timer(unsigned long arg) +{ + mic_ctx_t *mic_ctx = (mic_ctx_t *)arg; + uint32_t scratch2 = 0; + uint32_t postcode = mic_getpostcode(mic_ctx); + + printk("mic%d: Resetting (Post Code %c%c)\n", mic_ctx->bi_id, + postcode & 0xff, (postcode >> 8) & 0xff); + mic_ctx->reset_count++; + + /* Assuming that the bootstrap takes around 90 seconds to reset, + * we fail after 300 seconds, thus allowing 3 attempts to reset + */ + if (mic_ctx->reset_count == RESET_FAIL_TIME || + !postcode || 0xffffffff == postcode || mic_ctx->state == MIC_RESETFAIL) { + mic_ctx->reset_count = 0; + mic_setstate(mic_ctx, MIC_RESETFAIL); + wake_up(&mic_ctx->resetwq); + printk("MIC %d RESETFAIL postcode %c%c %d\n",
mic_ctx->bi_id, + postcode & 0xff, (postcode >> 8) & 0xff, postcode); + return; + } + + /* check for F2 or F4 error codes from bootstrap */ + if ((postcode == RESET_FAILED_F2) || (postcode == RESET_FAILED_F4)) { + if (mic_ctx->resetworkq) { + queue_work(mic_ctx->resetworkq, &mic_ctx->resetwork); + } else { + mic_ctx->reset_count = 0; + mic_setstate(mic_ctx, MIC_RESETFAIL); + wake_up(&mic_ctx->resetwq); + return; + } + } + + /* checking if bootstrap is ready or still resetting */ + scratch2 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH2); + if (SCRATCH2_DOWNLOAD_STATUS(scratch2)) { + mic_ctx->boot_start = 0; + mic_setstate(mic_ctx, MIC_READY); + + if (mic_ctx->msie) + mic_enable_msi_interrupts(mic_ctx); + mic_enable_interrupts(mic_ctx); + mic_smpt_restore(mic_ctx); + micscif_start(mic_ctx); + + wake_up(&mic_ctx->resetwq); + mic_ctx->reset_count = 0; + + return; + } + + mic_ctx->boot_timer.function = reset_timer; + mic_ctx->boot_timer.data = (unsigned long)mic_ctx; + mic_ctx->boot_timer.expires = jiffies + HZ; + + add_timer(&mic_ctx->boot_timer); +} + +void +adapter_wait_reset(mic_ctx_t *mic_ctx) +{ + mic_ctx->boot_timer.function = reset_timer; + mic_ctx->boot_timer.data = (unsigned long)mic_ctx; + mic_ctx->boot_timer.expires = jiffies + HZ; + mic_ctx->boot_start = jiffies; + + add_timer(&mic_ctx->boot_timer); +} + +void +adapter_reset(mic_ctx_t *mic_ctx, int wait_reset, int reattempt) +{ + uint32_t resetReg; + mutex_lock(&mic_ctx->state_lock); + /* TODO: check state for lost node as well once design is done */ + if ((mic_ctx->state == MIC_RESET || mic_ctx->state == MIC_READY) && (reattempt == 0)) { + if (wait_reset == 0) { + mic_setstate(mic_ctx, MIC_INVALID); + del_timer_sync(&mic_ctx->boot_timer); + mutex_unlock(&mic_ctx->state_lock); + return; + } + mutex_unlock(&mic_ctx->state_lock); + return; + } + + mic_setstate(mic_ctx, MIC_RESET); + + mutex_unlock(&mic_ctx->state_lock); + + del_timer_sync(&mic_ctx->boot_timer); + + //Write 0 to uos download status otherwise we might continue booting + //before reset has completed... + SBOX_WRITE(0, mic_ctx->mmio.va, SBOX_SCRATCH2); + + // Virtual network link value should be 0 before reset + SBOX_WRITE(0, mic_ctx->mmio.va, SBOX_SCRATCH14); + + // Data from Doorbell1 about restart/shutdown should be 0 before reset + SBOX_WRITE(0, mic_ctx->mmio.va, SBOX_SDBIC1); + + //This will trigger reset + resetReg = SBOX_READ(mic_ctx->mmio.va, SBOX_RGCR); + resetReg |= 0x1; + SBOX_WRITE(resetReg, mic_ctx->mmio.va, SBOX_RGCR); + + /* At least of KNF it seems we really want to delay at least 1 second */ + /* after touching reset to prevent a lot of problems. */ + msleep(1000); + + if (!wait_reset) { + return; + } + + adapter_wait_reset(mic_ctx); + +} + +void ramoops_flip(mic_ctx_t *mic_ctx); + +int +adapter_shutdown_device(mic_ctx_t *mic_ctx) +{ + ; + + if (micpm_get_reference(mic_ctx, true)) + return 0; + + mutex_lock(&mic_ctx->state_lock); + if (mic_ctx->state == MIC_ONLINE) { + mic_setstate(mic_ctx, MIC_SHUTDOWN); + + /* + * Writing to SBOX RDMASR0 will generate an interrupt + * on the uOS which will initiate orderly shutdown. 
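+ * A PM reference is taken with micpm_get_reference() before raising the doorbell and released afterwards, presumably to keep the card out of low-power states while the shutdown request is delivered.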
+ */ + mic_send_sht_intr(mic_ctx); + } + mutex_unlock(&mic_ctx->state_lock); + + micpm_put_reference(mic_ctx); + return 0; +}
+ +int +adapter_stop_device(mic_ctx_t *mic_ctx, int wait_reset, int reattempt) +{ + micvcons_stop(mic_ctx); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \ + defined(RHEL_RELEASE_CODE) + mic_vhost_blk_stop(mic_ctx->bd_info); +#endif + micveth_stop(mic_ctx); + + micpm_stop(mic_ctx); + micscif_stop(mic_ctx); + vmcore_remove(mic_ctx); + close_dma_device(mic_ctx->bi_id + 1, &mic_ctx->dma_handle); + ramoops_flip(mic_ctx); + + /* Calling adapter_reset after issuing a host shutdown/reboot + * leads to random NMIs. These are not related to any card in + * particular but occur on the PCI bridge. */ + if ((system_state == SYSTEM_POWER_OFF) || + (system_state == SYSTEM_RESTART) || + (system_state == SYSTEM_HALT)) + return 0; + adapter_reset(mic_ctx, wait_reset, reattempt); + + return 0; +}
+ +static void +destroy_reset_workqueue(mic_ctx_t *mic_ctx) +{ + struct workqueue_struct *tempworkq; + tempworkq = mic_ctx->resetworkq; + mic_ctx->resetworkq = NULL; + destroy_workqueue(tempworkq); + del_timer_sync(&mic_ctx->boot_timer); +}
+ +int +adapter_remove(mic_ctx_t *mic_ctx) +{ + +#ifdef USE_VCONSOLE + if (mic_ctx->bi_vcons.dc_hdr_virt) { + mic_ctx_unmap_single(mic_ctx, mic_ctx->bi_vcons.dc_hdr_dma_addr, + sizeof(struct vcons_buf)); + kfree(mic_ctx->bi_vcons.dc_hdr_virt); + mic_ctx->bi_vcons.dc_hdr_virt = NULL; + } + + if (mic_ctx->bi_vcons.dc_buf_virt) { + mic_ctx_unmap_single(mic_ctx, mic_ctx->bi_vcons.dc_dma_addr, + MICVCONS_BUF_SIZE); + free_pages((uint64_t)mic_ctx->bi_vcons.dc_buf_virt, 0); + mic_ctx->bi_vcons.dc_buf_virt = NULL; + } +#endif + + mic_psmi_uninit(mic_ctx); + micpm_remove(mic_ctx); + micscif_remove(mic_ctx); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE) + mic_vhost_blk_remove(mic_ctx->bd_info); +#endif + micveth_remove(mic_ctx); + mic_unreg_irqhandler(mic_ctx, 0x1, "MIC SHUTDOWN DoorBell 1"); + + ramoops_remove(mic_ctx); + vmcore_remove(mic_ctx); + mic_smpt_uninit(mic_ctx); + /* Make sure that no reset timer is running after the workqueue is destroyed */ + destroy_reset_workqueue(mic_ctx); + + if (mic_ctx->mmio.va) { + iounmap((void *)mic_ctx->mmio.va); + mic_ctx->mmio.va = 0; + } + + if (mic_ctx->aper.va) { + iounmap((void *)mic_ctx->aper.va); + mic_ctx->aper.va = 0; + } + + return 0; +}
+ +#define MIC_MAX_BOOT_TIME 180 // Maximum number of seconds to wait for boot to complete + +static void +online_timer(unsigned long arg) +{ + mic_ctx_t *mic_ctx = (mic_ctx_t *)arg; + uint64_t delay = (jiffies - mic_ctx->boot_start) / HZ; + + if (mic_ctx->state == MIC_ONLINE) + return; + + if (delay > MIC_MAX_BOOT_TIME) { + printk("Failed to boot MIC %d. Wait time exceeded %d seconds\n", mic_ctx->bi_id, MIC_MAX_BOOT_TIME); + mic_ctx->state = MIC_BOOTFAIL; + return; + } + + mic_ctx->boot_timer.function = online_timer; + mic_ctx->boot_timer.data = (unsigned long)mic_ctx; + mic_ctx->boot_timer.expires = jiffies + HZ; + add_timer(&mic_ctx->boot_timer); + + if (!(delay % 5)) + printk("Waiting for MIC %d boot %lld\n", mic_ctx->bi_id, delay); +}
+ +static void +boot_timer(unsigned long arg) +{ + mic_ctx_t *mic_ctx = (mic_ctx_t *)arg; + struct micvnet_info *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo; + uint64_t delay = (jiffies - mic_ctx->boot_start) / HZ; + bool timer_restart = false; + + if ((mic_ctx->state != MIC_BOOT) && (mic_ctx->state != MIC_ONLINE)) { + return; + } + + if (delay > MIC_MAX_BOOT_TIME) { + printk("Failed to boot MIC %d. Wait time exceeded %d seconds\n", mic_ctx->bi_id, MIC_MAX_BOOT_TIME); + mic_ctx->state = MIC_BOOTFAIL; + return; + } + + if (!(delay % 5)) + printk("Waiting for MIC %d boot %lld\n", mic_ctx->bi_id, delay); + + if (mic_vnet_mode != VNET_MODE_DMA) + timer_restart = (SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH14) == 0) ? + true : false; + else if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP) + timer_restart = (mic_ctx->state != MIC_ONLINE) ? true : false; + + if (timer_restart) { + mic_ctx->boot_timer.function = boot_timer; + mic_ctx->boot_timer.data = (unsigned long)mic_ctx; + mic_ctx->boot_timer.expires = jiffies + HZ; + + add_timer(&mic_ctx->boot_timer); + return; + } + + mic_ctx->boot_timer.function = online_timer; + mic_ctx->boot_timer.data = (unsigned long)mic_ctx; + mic_ctx->boot_timer.expires = jiffies + HZ; + add_timer(&mic_ctx->boot_timer); + + printk("MIC %d Network link is up\n", mic_ctx->bi_id); + schedule_work(&mic_ctx->boot_ws); +}
+ +void +post_boot_startup(struct work_struct *work) +{ + mic_ctx_t *mic_ctx + = container_of(work, mic_ctx_t, boot_ws); + + if (micpm_get_reference(mic_ctx, true) != 0) + return; + + // We should only enable DMA after the uos is booted + BUG_ON(open_dma_device(mic_ctx->bi_id+1, + mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS, + &mic_ctx->dma_handle)); + if (micveth_start(mic_ctx)) + printk(KERN_ERR "%s: micveth_start failed\n", __FUNCTION__); + micpm_put_reference(mic_ctx); +}
+ +void +attempt_reset(struct work_struct *work) +{ + mic_ctx_t *mic_ctx + = container_of(work, mic_ctx_t, resetwork); + printk("Reattempting reset after F2/F4 failure\n"); + adapter_reset(mic_ctx, RESET_WAIT, RESET_REATTEMPT); +}
+ +static void +ioremap_work(struct work_struct *work) +{ + mic_ctx_t *mic_ctx + = container_of(work, mic_ctx_t, ioremapwork); + mic_ctx->aper.va = ioremap_wc(mic_ctx->aper.pa, mic_ctx->aper.len); + if (mic_ctx->aper.va == NULL) { + printk(KERN_ERR "mic %d: failed to map aperture space\n", mic_ctx->bi_id); + mutex_lock(&mic_ctx->state_lock); + mic_setstate(mic_ctx, MIC_RESETFAIL); + mutex_unlock(&mic_ctx->state_lock); + } + wake_up(&mic_ctx->ioremapwq); +}
+ +int +adapter_post_boot_device(mic_ctx_t *mic_ctx) +{ + mic_ctx->boot_timer.function = boot_timer; + mic_ctx->boot_timer.data = (unsigned long)mic_ctx; + mic_ctx->boot_timer.expires = jiffies + HZ; + mic_ctx->boot_start = jiffies; + + add_timer(&mic_ctx->boot_timer); + return 0; +}
+ +int +mic_shutdown_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell) +{ + struct micscif_dev *dev = &scif_dev[mic_get_scifnode_id(mic_ctx)]; + mic_ctx->sdbic1 = SBOX_READ(mic_ctx->mmio.va, SBOX_SDBIC1); + SBOX_WRITE(0x0, mic_ctx->mmio.va, SBOX_SDBIC1); + if (mic_ctx->sdbic1) + queue_delayed_work(dev->sd_ln_wq, +
&dev->sd_watchdog_work, 0); + return 0; +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +static int +ramoops_proc_show(struct seq_file *m, void *data) +{ + uint64_t id = ((uint64_t)data) & 0xffffffff; + uint64_t entry = ((uint64_t)data) >> 32; + struct list_head *pos, *tmpq; + bd_info_t *bd = NULL; + mic_ctx_t *mic_ctx = NULL; + char *record; + char *end; + int size = 0; + int l = 0; + char *output; + unsigned long flags; + + list_for_each_safe(pos, tmpq, &mic_data.dd_bdlist) { + bd = list_entry(pos, bd_info_t, bi_list); + mic_ctx = &bd->bi_ctx; + if (mic_ctx->bi_id == id) + break; + } + + if (mic_ctx == NULL) + return 0; + + spin_lock_irqsave(&mic_ctx->ramoops_lock, flags); + + record = mic_ctx->ramoops_va[entry]; + if (record == NULL) { + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); + return -EEXIST; + } + + size = mic_ctx->ramoops_size; + end = record + size; + + if ((output = kzalloc(size, GFP_ATOMIC)) == NULL) { + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); + return -ENOMEM; + } + + l += scnprintf(output, size, "%s", record); + + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); + + seq_printf(m, "%s", output); + return 0; +} + +static int +ramoops_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ramoops_proc_show, NULL); +} + +struct file_operations ramoops_proc_fops = { + .open = ramoops_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#else // LINUX VERSION +static int +ramoops_read(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + uint64_t id = ((uint64_t)data) & 0xffffffff; + uint64_t entry = ((uint64_t)data) >> 32; + struct list_head *pos, *tmpq; + bd_info_t *bd = NULL; + mic_ctx_t *mic_ctx = NULL; + char *record; + char *end; + int size = 0; + int l = 0; + int left_to_read; + char *output; + unsigned long flags; + + list_for_each_safe(pos, tmpq, &mic_data.dd_bdlist) { + bd = list_entry(pos, bd_info_t, bi_list); + mic_ctx = &bd->bi_ctx; + if (mic_ctx->bi_id == id) + break; + } + + if (mic_ctx == NULL) + return 0; + + spin_lock_irqsave(&mic_ctx->ramoops_lock, flags); + + record = mic_ctx->ramoops_va[entry]; + if (record == NULL) { + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); + *eof = 1; + return 0; + } + + size = mic_ctx->ramoops_size; + end = record + size; + + if ((output = kzalloc(size, GFP_ATOMIC)) == NULL) { + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); + return -ENOMEM; + } + + l += scnprintf(output, size, "%s", record); + + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); + + left_to_read = l - offset; + if (left_to_read < 0) + left_to_read = 0; + if (left_to_read == 0) + *eof = 1; + + left_to_read = min(len, left_to_read); + memcpy(buf, output + offset, left_to_read); + kfree(output); + *start = buf; + return left_to_read; +} +#endif // LINUX VERSION + +int +set_ramoops_pa(mic_ctx_t *mic_ctx) +{ + if (mic_ctx->ramoops_pa[0] == 0L) { + kfree(mic_ctx->ramoops_va[0]); + mic_ctx->ramoops_size = 0; + mic_ctx->ramoops_va[0] = NULL; + return 1; + } + return 0; +} + +int ramoops_count = 4; + +void +ramoops_probe(mic_ctx_t *mic_ctx) +{ + char name[64]; + + mic_ctx->ramoops_size = ramoops_count * PAGE_SIZE; + if ((mic_ctx->ramoops_va[0] = kzalloc(mic_ctx->ramoops_size, GFP_KERNEL)) != NULL) { + spin_lock_init(&mic_ctx->ramoops_lock); + mic_ctx->ramoops_va[1] = NULL; + + mic_ctx->ramoops_pa[0] = mic_ctx_map_single(mic_ctx, mic_ctx->ramoops_va[0], + mic_ctx->ramoops_size); + if (set_ramoops_pa(mic_ctx)) + return; + +#if 
(LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + snprintf(name, 64, "mic%d", mic_ctx->bi_id); + proc_create_data(name, 0444, ramoops_dir, &ramoops_proc_fops, + (void *)(long)mic_ctx->bi_id); + + snprintf(name, 64, "mic%d_prev", mic_ctx->bi_id); + proc_create_data(name, 0444, ramoops_dir, &ramoops_proc_fops, + (void *)((long)mic_ctx->bi_id | (1L << 32))); +#else // LINUX VERSION + snprintf(name, 64, "mic%d", mic_ctx->bi_id); + if (create_proc_read_entry(name, 0444, ramoops_dir, ramoops_read, + (void *)(long)mic_ctx->bi_id) == NULL) + printk("Failed to intialize /proc/mic_ramoops/%s\n", name); + + snprintf(name, 64, "mic%d_prev", mic_ctx->bi_id); + if (create_proc_read_entry(name, 0444, ramoops_dir, ramoops_read, + (void *)((long)mic_ctx->bi_id | (1L << 32))) == NULL) + printk("Failed to intialize /proc/mic_ramoops/%s\n", name); +#endif //LINUX VERSION + } else { + mic_ctx->ramoops_size = 0; + } +} + +void +ramoops_flip(mic_ctx_t *mic_ctx) +{ + unsigned long flags; + + if (mic_ctx->ramoops_size == 0) + return; + + spin_lock_irqsave(&mic_ctx->ramoops_lock, flags); + if (mic_ctx->ramoops_va[1] != NULL) { + mic_ctx_unmap_single(mic_ctx, mic_ctx->ramoops_pa[1], mic_ctx->ramoops_size); + kfree(mic_ctx->ramoops_va[1]); + } + + mic_ctx->ramoops_pa[1] = mic_ctx->ramoops_pa[0]; + mic_ctx->ramoops_va[1] = mic_ctx->ramoops_va[0]; + if ((mic_ctx->ramoops_va[0] = kzalloc(mic_ctx->ramoops_size, GFP_ATOMIC)) != NULL) { + mic_ctx->ramoops_pa[0] = mic_ctx_map_single(mic_ctx, mic_ctx->ramoops_va[0], + mic_ctx->ramoops_size); + set_ramoops_pa(mic_ctx); + } + spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags); +} + +int +adapter_probe(mic_ctx_t *mic_ctx) +{ + int db; + uint32_t scratch13; + int32_t status = 0; + + // Init the irq information + atomic_set(&mic_ctx->bi_irq.mi_received, 0); + spin_lock_init(&mic_ctx->bi_irq.mi_lock); + tasklet_init(&mic_ctx->bi_dpc, adapter_dpc, (unsigned long)&mic_ctx->bi_dpc); + + for (db = 0; db < MIC_NUM_DB; db++) { + INIT_LIST_HEAD(&mic_ctx->bi_irq.mi_dblist[db]); + } + + if (mic_ctx->msie) + mic_enable_msi_interrupts(mic_ctx); + + scratch13 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH13); + mic_ctx->bi_stepping = SCRATCH13_STEP_ID(scratch13); + mic_ctx->bi_substepping = SCRATCH13_SUB_STEP(scratch13); +#ifdef MIC_IS_EMULATION + mic_ctx->bi_platform = PLATFORM_EMULATOR; +#else + mic_ctx->bi_platform = SCRATCH13_PLATFORM_ID(scratch13); +#endif + + mic_enable_interrupts(mic_ctx); + if (micveth_probe(mic_ctx)) + printk(KERN_ERR "%s: micveth_probe failed\n", __FUNCTION__); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE) + if (mic_vhost_blk_probe(mic_ctx->bd_info)) + printk(KERN_ERR "%s: mic_vhost_blk_probe failed\n", __FUNCTION__); +#endif + micscif_probe(mic_ctx); + if(micpm_probe(mic_ctx)) + printk(KERN_ERR "%s: micpm_probe failed\n", __FUNCTION__); + + mic_reg_irqhandler(mic_ctx, 1, "MIC SHUTDOWN DoorBell 1", + mic_shutdown_host_doorbell_intr_handler); + + ramoops_probe(mic_ctx); + if (status) { + printk("boot_linux_uos failed \n"); + return status; + } + + // We should only enable DMA after uos is booted + //mic_dma_lib_init(mic_ctx->mmio.va+HOST_SBOX_BASE_ADDRESS); + + return status; +} + +int +adapter_start_device(mic_ctx_t *mic_ctx) +{ + int ret; + + mutex_lock(&mic_ctx->state_lock); + if (mic_ctx->state == MIC_READY) { + mic_setstate(mic_ctx, MIC_BOOT); + } else { + mutex_unlock(&mic_ctx->state_lock); + /* TODO: Unknown state handling? 
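A state other than MIC_READY here (for example a card that is still resetting, already booted, or in a failed state) currently just causes the boot request to be rejected with -EINVAL.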
*/ + printk(KERN_ERR "%s %d state %d??\n", + __func__, __LINE__, mic_ctx->state); + ret = -EINVAL; + goto exit; + } + mutex_unlock(&mic_ctx->state_lock); + mic_ctx->mode = MODE_LINUX; + ret = boot_linux_uos(mic_ctx, mic_ctx->image, mic_ctx->initramfs); + if (ret) { + printk(KERN_ERR "boot_linux_uos failed %d\n", ret); + goto exit; + } + + ret = adapter_post_boot_device(mic_ctx); + if (ret) { + printk(KERN_ERR "adapter post boot failed %d\n", ret); + goto exit; + } + + pr_debug("adapter started successfully\n"); +exit: + return ret; +} + +int +adapter_init_device(mic_ctx_t *mic_ctx) +{ +#ifdef USE_VCONSOLE + struct vcons_buf *vcons_buf; +#endif + uint32_t mmio_data_cc; /* mmio data from class code register */ + uint32_t mmio_data_bar; /* mmio data from bar enable register */ + uint32_t device_id; + int err = 0; + + spin_lock_init(&mic_ctx->sysfs_lock); + mic_setstate(mic_ctx, MIC_RESET); + mic_ctx->mode = MODE_NONE; + mic_ctx->reset_count = 0; + mutex_init (&mic_ctx->state_lock); + init_waitqueue_head(&mic_ctx->resetwq); + init_waitqueue_head(&mic_ctx->ioremapwq); + init_timer(&mic_ctx->boot_timer); + if (!(mic_ctx->resetworkq = __mic_create_singlethread_workqueue("RESET WORK"))) + return -ENOMEM; + if (!(mic_ctx->ioremapworkq = __mic_create_singlethread_workqueue("IOREMAP_WORK"))) { + err = -EINVAL; + goto destroy_reset_wq; + } + INIT_WORK(&mic_ctx->ioremapwork, ioremap_work); + INIT_WORK(&mic_ctx->boot_ws, post_boot_startup); + INIT_WORK(&mic_ctx->resetwork, attempt_reset); + atomic_set(&mic_ctx->gate_interrupt, 0); + + device_id = mic_ctx->bi_pdev->device; + mic_ctx->bi_family = get_product_family(device_id); + + if ((mic_ctx->mmio.va = ioremap_nocache(mic_ctx->mmio.pa, + mic_ctx->mmio.len)) == NULL) { + printk("mic %d: failed to map mmio space\n", mic_ctx->bi_id); + err = -ENOMEM; + goto destroy_remap_wq; + } + + if (mic_ctx->aper.pa == 0) { + /* + * Read class code from SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 register + * If the mode is zombie, then + * 1> Aperture is not available + * 2> Register 0x5CD4 is written to 0x00000002 to disable all BARs except MMIO + * 3> Register 0x5808 is written to 0xFF0000XX to set the class ID to a generic PCI device. 
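+ * The checks below compare the values read back over MMIO against ZOMBIE_CLASS_CODE and DISABLE_BAR before putting the board into USAGE_MODE_ZOMBIE.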
+ */ + mmio_data_cc = SBOX_READ(mic_ctx->mmio.va, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8); + mmio_data_cc = PCIE_CLASS_CODE(mmio_data_cc); + mmio_data_bar = SBOX_READ(mic_ctx->mmio.va, SBOX_PCIE_BAR_ENABLE); + + if((mmio_data_cc == ZOMBIE_CLASS_CODE) && (mmio_data_bar == DISABLE_BAR)) { + mic_ctx->card_usage_mode = USAGE_MODE_ZOMBIE; + usagemode_param = USAGE_MODE_ZOMBIE; + } else { + printk("Error: Not in zombie mode and aperture is 0\n"); + err = -EINVAL; + goto adap_init_unmapmmio; + } + } else { + if (mic_ctx->ioremapworkq) { + queue_work(mic_ctx->ioremapworkq, &mic_ctx->ioremapwork); + } else { + if ((mic_ctx->aper.va = ioremap_wc(mic_ctx->aper.pa, mic_ctx->aper.len)) == NULL) { + printk("mic %d: failed to map aperture space\n", mic_ctx->bi_id); + err = -EINVAL; + goto adap_init_unmapmmio; + } + } + } + + mic_debug_init(mic_ctx); + mic_smpt_init(mic_ctx); +#ifdef USE_VCONSOLE + // Allocate memory for PCI serial console + mic_ctx->bi_vcons.dc_buf_virt = (void *)get_zeroed_page(GFP_KERNEL); + mic_ctx->bi_vcons.dc_hdr_virt = kzalloc(sizeof(struct vcons_buf), GFP_KERNEL); + + if ((!mic_ctx->bi_vcons.dc_buf_virt) || (!mic_ctx->bi_vcons.dc_hdr_virt)) { + printk(KERN_ERR "mic %d: failed to allocate memory for vcons buffer\n", + mic_ctx->bi_id); + mic_ctx->bi_vcons.dc_enabled = 0; + if (mic_ctx->bi_vcons.dc_buf_virt) + free_pages((uint64_t)mic_ctx->bi_vcons.dc_buf_virt, 0); + if (mic_ctx->bi_vcons.dc_hdr_virt) + kfree(mic_ctx->bi_vcons.dc_hdr_virt); + } else { + mic_ctx->bi_vcons.dc_hdr_dma_addr = mic_ctx_map_single(mic_ctx, + mic_ctx->bi_vcons.dc_hdr_virt, + sizeof(struct vcons_buf)); + mic_ctx->bi_vcons.dc_dma_addr = mic_ctx_map_single(mic_ctx, + mic_ctx->bi_vcons.dc_buf_virt, + MICVCONS_BUF_SIZE); + if ((!mic_ctx->bi_vcons.dc_dma_addr) || + (!mic_ctx->bi_vcons.dc_hdr_dma_addr)) + mic_ctx->bi_vcons.dc_enabled = 0; + else + mic_ctx->bi_vcons.dc_enabled = 1; + mic_ctx->bi_vcons.dc_size = MICVCONS_BUF_SIZE; + vcons_buf = (struct vcons_buf *)(mic_ctx->bi_vcons.dc_hdr_virt); + vcons_buf->o_buf_dma_addr = mic_ctx->bi_vcons.dc_dma_addr; + vcons_buf->o_size = MICVCONS_BUF_SIZE; + smp_wmb(); + vcons_buf->host_magic = MIC_HOST_VCONS_READY; + vcons_buf->host_rb_ver = micscif_rb_get_version(); + } +#endif // USE_VCONSOLE + mic_ctx->boot_mem = 0; + mic_psmi_init(mic_ctx); + mic_ctx->dma_handle = NULL; + mic_ctx->sdbic1 = 0; + // To avoid hazard on Windows, sku_build_table is done on DriverEntry + sku_build_table(); + device_id = mic_ctx->bi_pdev->device; + sku_find(mic_ctx, device_id); + // To avoid hazard on Windows, sku_destroy_table is done on MicUnload + sku_destroy_table(); + + /* Determine the amount of compensation that needs to be applied to MIC's ETC timer */ + calculate_etc_compensation(mic_ctx); + + return 0; + +adap_init_unmapmmio: + iounmap(mic_ctx->mmio.va); +destroy_remap_wq: + destroy_workqueue(mic_ctx->ioremapworkq); +destroy_reset_wq: + destroy_workqueue(mic_ctx->resetworkq); + return err; +} + +void +mic_enable_interrupts(mic_ctx_t *mic_ctx) +{ + ENABLE_MIC_INTERRUPTS(mic_ctx->mmio.va); +} + +void +mic_disable_interrupts(mic_ctx_t *mic_ctx) +{ + uint32_t sboxSice0reg; + + sboxSice0reg = SBOX_READ(mic_ctx->mmio.va, SBOX_SICE0); + SBOX_WRITE(sboxSice0reg, mic_ctx->mmio.va, SBOX_SICC0); +} + +void +mic_enable_msi_interrupts(mic_ctx_t *mic_ctx) +{ + uint32_t sboxMXARreg; + + // Only support single MSI interrupt for now + sboxMXARreg = SBOX_SICE0_DBR_BITS(0xf) | SBOX_SICE0_DMA_BITS(0xff); + if (mic_ctx->bi_family == FAMILY_KNC) + SBOX_WRITE(sboxMXARreg, mic_ctx->mmio.va, SBOX_MXAR0_K1OM); + 
else + SBOX_WRITE(sboxMXARreg, mic_ctx->mmio.va, SBOX_MXAR0); +} + +int +mic_reg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring, + int (*irqfunc)(mic_ctx_t *mic_ctx, int doorbell)) +{ + mic_irqhandler_t *irqhandle; + unsigned long flags; + + if (doorbell > MIC_IRQ_MAX) { + return EINVAL; + } + + if (!(irqhandle = kmalloc(sizeof(mic_irqhandler_t), GFP_ATOMIC))) + goto memerror1; + + if (!(irqhandle->ih_idstring = kmalloc(strlen(idstring) + 1, GFP_ATOMIC))) + goto memerror2; + + irqhandle->ih_func = irqfunc; + strcpy(irqhandle->ih_idstring, idstring); + + spin_lock_irqsave(&mic_ctx->bi_irq.mi_lock, flags); + list_add_tail(&irqhandle->ih_list, &mic_ctx->bi_irq.mi_dblist[doorbell]); + spin_unlock_irqrestore(&mic_ctx->bi_irq.mi_lock, flags); + return 0; + +memerror2: + kfree(irqhandle); +memerror1: + return -ENOMEM; +} + +int +mic_unreg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring) +{ + mic_irqhandler_t *irqhandle; + struct list_head *pos, *tmpq; + unsigned long flags; + + spin_lock_irqsave(&mic_ctx->bi_irq.mi_lock, flags); + list_for_each_safe(pos, tmpq, &mic_ctx->bi_irq.mi_dblist[doorbell]) { + irqhandle = list_entry(pos, mic_irqhandler_t, ih_list); + if (strcmp(idstring, irqhandle->ih_idstring) == 0) { + list_del(pos); + kfree(irqhandle->ih_idstring); + kfree(irqhandle); + } + } + spin_unlock_irqrestore(&mic_ctx->bi_irq.mi_lock, flags); + + return 0; +} + +static __always_inline +void adapter_process_one_interrupt(mic_ctx_t *mic_ctx, uint32_t events) +{ + mic_irqhandler_t *irqhandle; + struct list_head *pos; + int doorbell; + + atomic_inc(&mic_ctx->bi_irq.mi_received); + + if (SBOX_SICR0_DBR(events)) { + for (doorbell = 0; doorbell < 4; doorbell++) { + if (SBOX_SICR0_DBR(events) & (0x1 << doorbell)) { + spin_lock(&mic_ctx->bi_irq.mi_lock); + list_for_each(pos, &mic_ctx->bi_irq.mi_dblist[doorbell]) { + irqhandle = list_entry(pos, mic_irqhandler_t, ih_list); + irqhandle->ih_func(mic_ctx, doorbell); + } + spin_unlock(&mic_ctx->bi_irq.mi_lock); + } + } + + } + + if (SBOX_SICR0_DMA(events)) + host_dma_interrupt_handler(mic_ctx->dma_handle, events); +} + +int +adapter_isr(mic_ctx_t *mic_ctx) +{ + volatile uint32_t sboxSicr0reg; + if (atomic_cmpxchg(&mic_ctx->gate_interrupt, 0, 1) == 1) + return -1; + + sboxSicr0reg = SBOX_READ(mic_ctx->mmio.va, SBOX_SICR0); + + if (unlikely(!sboxSicr0reg)) { + // Spurious interrupt + atomic_set(&mic_ctx->gate_interrupt, 0); + return -1; + } + + // tell mic that we recived interrupt otherwise it will keep sending them + SBOX_WRITE(sboxSicr0reg, mic_ctx->mmio.va, SBOX_SICR0); + + // This only applies to KNC B0 + if (FAMILY_KNC == mic_ctx->bi_family && + mic_ctx->bi_stepping >= KNC_B0_STEP) + mic_enable_interrupts(mic_ctx); + + atomic_set(&mic_ctx->gate_interrupt, 0); + adapter_process_one_interrupt(mic_ctx, sboxSicr0reg); + return 0; +} + +int +adapter_imsr(mic_ctx_t *mic_ctx) +{ +#if 0 /* TODO: disable interrupt when KNC auto-enable isn't used */ + mic_disable_interrupts(mic_ctx); +#endif + tasklet_schedule(&mic_ctx->bi_dpc); + return 0; +} + +static void adapter_dpc(unsigned long dpc) +{ + mic_ctx_t *mic_ctx = + container_of((struct tasklet_struct *)dpc, mic_ctx_t, bi_dpc); + + volatile uint32_t sboxSicr0reg; + + if (atomic_cmpxchg(&mic_ctx->gate_interrupt, 0, 1) == 1) + return; + + /* Clear pending bit array */ + if (FAMILY_KNC == mic_ctx->bi_family) { + if (KNC_A_STEP == mic_ctx->bi_stepping) + SBOX_WRITE(1, mic_ctx->mmio.va, SBOX_MSIXPBACR_K1OM); + } else + SBOX_WRITE(1, mic_ctx->mmio.va, SBOX_MSIXPBACR); + + sboxSicr0reg = 
SBOX_READ(mic_ctx->mmio.va, SBOX_SICR0); + if (unlikely(!sboxSicr0reg)) { + atomic_set(&mic_ctx->gate_interrupt, 0); + return; + } + + SBOX_WRITE(sboxSicr0reg, mic_ctx->mmio.va, SBOX_SICR0); + + // This only applies to KNC B0 + if (FAMILY_KNC == mic_ctx->bi_family && + mic_ctx->bi_stepping >= KNC_B0_STEP) + mic_enable_interrupts(mic_ctx); + + atomic_set(&mic_ctx->gate_interrupt, 0); + adapter_process_one_interrupt(mic_ctx, sboxSicr0reg); +} + +void ramoops_init(void) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + ramoops_dir = proc_mkdir("mic_ramoops", NULL); +#else + ramoops_dir = create_proc_entry("mic_ramoops", S_IFDIR | S_IRUGO, NULL); +#endif +} + +void ramoops_exit(void) +{ + remove_proc_entry("mic_ramoops", NULL); +} + +void ramoops_remove(mic_ctx_t *mic_ctx) +{ + char name[64]; + int i; + + snprintf(name, 64, "mic%d", mic_ctx->bi_id); + remove_proc_entry(name, ramoops_dir); + + snprintf(name, 64, "mic%d_prev", mic_ctx->bi_id); + remove_proc_entry(name, ramoops_dir); + if (mic_ctx->ramoops_size == 0) + return; + + for (i = 0; i < 2; i++) { + if (mic_ctx->ramoops_va[i] != NULL) { + mic_ctx_unmap_single(mic_ctx, mic_ctx->ramoops_pa[i], + mic_ctx->ramoops_size); + kfree(mic_ctx->ramoops_va[i]); + } + } +} + +void vmcore_init(void) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + vmcore_dir = proc_mkdir("mic_vmcore", NULL); +#else + vmcore_dir = create_proc_entry("mic_vmcore", S_IFDIR | S_IRUGO, NULL); +#endif +} + +void vmcore_exit(void) +{ + if (vmcore_dir) { + remove_proc_entry("mic_vmcore", NULL); + vmcore_dir = NULL; + } +} + +void vmcore_remove(mic_ctx_t *mic_ctx) +{ + char name[64]; + + snprintf(name, 64, "mic%d", mic_ctx->bi_id); + if (mic_ctx->vmcore_dir) { + remove_proc_entry(name, vmcore_dir); + mic_ctx->vmcore_dir = NULL; + } + if (mic_ctx->elfcorebuf) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + mic_ctx->elfcorebuf_sz = 0; + mic_ctx->vmcore_size = 0; + } +} + + +void +adapter_init(void) +{ + // Per driver init ONLY. + mic_dma_init(); + micscif_init(); + micpm_init(); + ramoops_init(); + vmcore_init(); + INIT_LIST_HEAD(&mic_data.dd_bdlist); +} + + +void show_stepping_comm(mic_ctx_t *mic_ctx,char *buf) +{ +#define STEPINGSTRSIZE 3 + char string[STEPINGSTRSIZE]; + switch (mic_ctx->bi_family) { + case FAMILY_ABR: + switch (mic_ctx->bi_stepping) { + case 0: + string[0] = 'A'; + string[1] = mic_ctx->bi_substepping + '0'; + break; + case 2: + string[0] = 'B'; + string[1] = '0'; + break; + case 3: + string[0] = 'B'; + string[1] = '1'; + break; + case 4: + string[0] = 'C'; + string[1] = '0'; + break; + case 5: + string[0] = 'C'; + string[1] = '1'; + break; + case 6: + string[0] = 'D'; + string[1] = '0'; + break; + default: + string[0] = '?'; + string[1] = '?'; + break; + } + break; + case FAMILY_KNC: + switch (mic_ctx->bi_stepping) { + case KNC_A_STEP: + string[0] = 'A'; + string[1] = '0'; + break; + case KNC_B0_STEP: + string[0] = 'B'; + string[1] = '0'; + break; + case KNC_B1_STEP: + string[0] = 'B'; + string[1] = '1'; + break; + case KNC_C_STEP: + string[0] = 'C'; + string[1] = '0'; + break; + default: + string[0] = '?'; + string[1] = '?'; + break; + } + break; + default: + string[0] = '?'; + string[1] = '?'; + break; + } + + string[2] = '\0'; + + strncpy(buf,string,STEPINGSTRSIZE); +} + + diff --git a/host/vhost/mic_blk.c b/host/vhost/mic_blk.c new file mode 100644 index 0000000..9ac2cb8 --- /dev/null +++ b/host/vhost/mic_blk.c @@ -0,0 +1,665 @@ + /* + * Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. 
Tsirkin + * + * This work is licensed under the terms of the GNU GPL, version 2. + + * (C) Badari Pulavarty pbadari@us.ibm.com 2010 with the following comment. + * He posted on http://lwn.net/Articles/382543/ + + * virtio-block server in host kernel. + * Inspired by vhost-net and shamlessly ripped code from it :) + + * For adapting to MIC + * (C) Copyright 2012 Intel Corporation + * Author: Caz Yokoyama + */ +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \ + defined(RHEL_RELEASE_CODE) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef VIRTIO_RING_F_EVENT_IDX /* virtio_ring.h of rhel6.0 does not define */ +#define VIRTIO_RING_F_EVENT_IDX 29 +#endif +#include "mic_common.h" +#include "mic/micveth_dma.h" +#include "vhost.h" +#include "mic/mic_virtio.h" + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1UL << SECTOR_SHIFT) +#define VIRTIO_BLK_QUEUE_SIZE 128 +#define DISK_SEG_MAX (VIRTIO_BLK_QUEUE_SIZE - 2) + +#define VHOST_BLK_VQ_MAX 1 +#define WQNAME_SIZE 16 + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; + struct workqueue_struct *vb_wq; + char vb_wqname[WQNAME_SIZE]; + struct work_struct vb_ws_bh; + struct workqueue_struct *vblk_workqueue; + struct board_info *bd_info; + char *file_name; + struct file *virtblk_file; +}; + +struct vhost_blk_io { + struct list_head list; + struct work_struct work; + struct vhost_blk *blk; + struct file *file; + int head; + uint32_t type; + uint32_t nvecs; + uint64_t sector; + uint64_t len; + struct iovec iov[0]; +}; + +#define mic_addr_in_host(va, pa) ((u8 *)(va) + (u64)(pa)) + +static LIST_HEAD(write_queue); +static LIST_HEAD(read_queue); + +static void +cleanup_vblk_workqueue(struct vhost_blk_io *vbio, struct vhost_virtqueue *vq) +{ + struct list_head single, *head, *node, *tmp; + int need_free; + struct vhost_blk_io *entry; + + if (vbio->head != -1) { + INIT_LIST_HEAD(&single); + list_add(&vbio->list, &single); + head = &single; + need_free = 0; + } else { + head = &vbio->list; + need_free = 1; + } + + mutex_lock(&vq->mutex); + list_for_each_safe(node, tmp, head) { + entry = list_entry(node, struct vhost_blk_io, list); + list_del(node); + kfree(entry); + } + mutex_unlock(&vq->mutex); + + if (need_free) + kfree(vbio); +} + +static void handle_io_work(struct work_struct *work) +{ + struct vhost_blk_io *vbio, *entry; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + struct list_head single, *head, *node, *tmp; + struct iovec *iov; + uint8_t *aper_va; + struct vring *vring; + unsigned int num; + + int need_free, ret = 0; + loff_t pos; + uint8_t status = 0; + + vbio = container_of(work, struct vhost_blk_io, work); + blk = vbio->blk; + vq = &blk->dev.vqs[0]; + pos = vbio->sector << SECTOR_SHIFT; + aper_va = blk->bd_info->bi_ctx.aper.va; + + vring = &((struct mic_virtblk *)blk->bd_info->bi_virtio)->vb_shared.vring; + num = readl(&vring->num); + if (num == 0 || micpm_get_reference(&blk->bd_info->bi_ctx, true)) { + cleanup_vblk_workqueue(vbio, vq); + return; + } + + if (atomic64_read(&vbio->file->f_count) == 0) { /* file is closed */ + ret = -1; + } else if (vbio->type & VIRTIO_BLK_T_FLUSH) { +#ifdef RHEL_RELEASE_CODE +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + ret = vfs_fsync(vbio->file, 1); +#else + ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1); +#endif +#else + ret = vfs_fsync(vbio->file, 1); +#endif + } else if (vbio->type & 
VIRTIO_BLK_T_OUT) { + for (iov = vbio->iov; iov < &vbio->iov[vbio->nvecs]; iov++) { + iov->iov_base = mic_addr_in_host(aper_va, iov->iov_base); + } + ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos); + } else { + for (iov = vbio->iov; iov < &vbio->iov[vbio->nvecs]; iov++) { + iov->iov_base = mic_addr_in_host(aper_va, iov->iov_base); + } + ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos); + } + status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + if (vbio->head != -1) { + INIT_LIST_HEAD(&single); + list_add(&vbio->list, &single); + head = &single; + need_free = 0; + } else { + head = &vbio->list; + need_free = 1; + } + list_for_each_entry(entry, head, list) { + memcpy_toio(mic_addr_in_host(aper_va, entry->iov[entry->nvecs].iov_base), &status, sizeof(status)); + } + mutex_lock(&vq->mutex); + list_for_each_safe(node, tmp, head) { + entry = list_entry(node, struct vhost_blk_io, list); + vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret); + list_del(node); + kfree(entry); + } + mutex_unlock(&vq->mutex); + if (need_free) + kfree(vbio); + micpm_put_reference(&blk->bd_info->bi_ctx); +}
+ +static struct vhost_blk_io *allocate_vbio(int nvecs) +{ + struct vhost_blk_io *vbio; + int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec); + vbio = kmalloc(size, GFP_KERNEL); + if (vbio) { + INIT_WORK(&vbio->work, handle_io_work); + INIT_LIST_HEAD(&vbio->list); + } + return vbio; +}
+ +static void merge_and_handoff_work(struct list_head *queue) +{ + struct vhost_blk_io *vbio, *entry; + int nvecs = 0; + int entries = 0; + + list_for_each_entry(entry, queue, list) { + nvecs += entry->nvecs; + entries++; + } + + if (entries == 1) { + vbio = list_first_entry(queue, struct vhost_blk_io, list); + list_del(&vbio->list); + queue_work(vbio->blk->vblk_workqueue, &vbio->work); + return; + } + + vbio = allocate_vbio(nvecs); + if (!vbio) { + /* Unable to allocate memory - submit IOs individually */ + list_for_each_entry(vbio, queue, list) { + queue_work(vbio->blk->vblk_workqueue, &vbio->work); + } + INIT_LIST_HEAD(queue); + return; + } + + entry = list_first_entry(queue, struct vhost_blk_io, list); + vbio->nvecs = nvecs; + vbio->blk = entry->blk; + vbio->file = entry->file; + vbio->type = entry->type; + vbio->sector = entry->sector; + vbio->head = -1; + vbio->len = 0; + nvecs = 0; + + list_for_each_entry(entry, queue, list) { + memcpy(&vbio->iov[nvecs], entry->iov, entry->nvecs * sizeof(struct iovec)); + nvecs += entry->nvecs; + vbio->len += entry->len; + } + list_replace_init(queue, &vbio->list); + queue_work(vbio->blk->vblk_workqueue, &vbio->work); +}
+ +static void start_io(struct list_head *queue) +{ + struct list_head start; + struct vhost_blk_io *vbio = NULL, *entry; + + if (list_empty(queue)) + return; + + list_for_each_entry(entry, queue, list) { + if (!vbio) { + vbio = entry; + continue; + } + if (vbio->sector + (vbio->len >> SECTOR_SHIFT) == entry->sector) { + vbio = entry; + } else { + INIT_LIST_HEAD(&start); + list_cut_position(&start, queue, &vbio->list); + merge_and_handoff_work(&start); + vbio = entry; + } + } + if (!list_empty(queue)) + merge_and_handoff_work(queue); +}
+ +static uint64_t calculate_len(struct iovec *iov, int nvecs) +{ + uint64_t len = 0; + int i; + + for (i = 0; i < nvecs; i++) + len += iov[i].iov_len; + return len; +} + +static void insert_to_queue(struct vhost_blk_io *vbio, struct list_head *queue) +{ + struct vhost_blk_io *entry; + + /* keep the queue sorted by starting sector */ + list_for_each_entry(entry, queue, list) { + if (entry->sector > vbio->sector) + break; + } + list_add_tail(&vbio->list, &entry->list); +}
+ +static int handoff_io(struct vhost_blk *blk, int head, + uint32_t type, uint64_t sector, + struct iovec *iov, int nvecs) +{ + struct vhost_virtqueue *vq = &blk->dev.vqs[0]; + struct vhost_blk_io
*vbio; + + vbio = allocate_vbio(nvecs+1); + if (!vbio) { + return -ENOMEM; + } + vbio->blk = blk; + vbio->head = head; + vbio->file = vq->private_data; + vbio->type = type; + vbio->sector = sector; + vbio->nvecs = nvecs; + vbio->len = calculate_len(iov, nvecs); + memcpy(vbio->iov, iov, (nvecs + 1) * sizeof(struct iovec)); + + if (vbio->type & VIRTIO_BLK_T_FLUSH) { +#if 0 + /* Sync called - do I need to submit IOs in the queue ? */ + start_io(&read_queue); + start_io(&write_queue); +#endif + queue_work(blk->vblk_workqueue, &vbio->work); + } else if (vbio->type & VIRTIO_BLK_T_OUT) { + insert_to_queue(vbio, &write_queue); + } else { + insert_to_queue(vbio, &read_queue); + } + return 0; +} + +static void handle_blk(struct vhost_blk *blk) +{ + struct vhost_virtqueue *vq = &blk->dev.vqs[0]; + unsigned head, out, in; + struct virtio_blk_outhdr hdr; + int nvecs; + struct board_info *bd_info = blk->bd_info; + struct vring *vring; + + vring = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.vring; + if (vring == 0 || readl(&vring->num) == 0) { + printk("request comes in while card side driver is not loaded yet. Ignore\n"); + return; + } + /* the first time since the card side driver becomes ready */ + if (vq->desc == NULL || readb(&((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.update)) { + vq->num = readl(&vring->num); + vq->desc = (struct vring_desc *)readq(&vring->desc); + vq->avail = (struct vring_avail *)readq(&vring->avail); + vq->used = (struct vring_used *)readq(&vring->used); + vq->last_avail_idx = 0; + vq->avail_idx = 0; + vq->last_used_idx = 0; + vq->signalled_used = 0; + vq->signalled_used_valid = false; + vq->done_idx = 0; + writeb(false, &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.update); + } + + if (micpm_get_reference(&blk->bd_info->bi_ctx, true)) + return; + + mutex_lock(&vq->mutex); + + vhost_disable_notify(&blk->dev, vq); + + for (;;) { + head = vhost_get_vq_desc(&blk->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if ((head == vq->num) || (head == -EFAULT) || (head == -EINVAL)) { + if (unlikely(vhost_enable_notify(&blk->dev, vq))) { + vhost_disable_notify(&blk->dev, vq); + continue; + } + start_io(&read_queue); + start_io(&write_queue); + break; + } + + BUG_ON(vq->iov[0].iov_len != 16); + + memcpy_fromio(&hdr, mic_addr_in_host(bd_info->bi_ctx.aper.va, vq->iov[0].iov_base), + sizeof(hdr)); + + nvecs = out - 1; + if (hdr.type == VIRTIO_BLK_T_IN) + nvecs = in - 1; + + BUG_ON(vq->iov[nvecs+1].iov_len != 1); + if (handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs) < 0) { + vhost_discard_vq_desc(vq, 1); + continue; + } + } + mutex_unlock(&vq->mutex); + micpm_put_reference(&blk->bd_info->bi_ctx); +} + +static void handle_blk_kick(struct work_struct *work) +{ + struct vhost_blk *vblk; + + vblk = container_of(work, struct vhost_blk, vb_ws_bh); + handle_blk(vblk); +} + +#if 0 +static void handle_rq_blk(struct vhost_work *work) +{ + struct vhost_blk *blk; + + blk = container_of(work, struct vhost_blk, poll[0].work); + handle_blk(blk); +} +#endif + +static int +vhost_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell) +{ + struct board_info *bi; + struct vhost_blk *vblk; + + bi = container_of(mic_ctx, struct board_info, bi_ctx); + vblk = ((struct mic_virtblk *)bi->bi_virtio)->vblk; + queue_work(vblk->vb_wq, &vblk->vb_ws_bh); + + return 0; +} + +static long vhost_blk_set_backend(struct vhost_blk *vblk) +{ + struct vhost_virtqueue *vq; + struct board_info *bd_info = vblk->bd_info; + unsigned index = bd_info->bi_ctx.bi_id; + struct 
vb_shared *vb_shared; + int ret = 0; + struct kstat stat; + unsigned int virtio_blk_features = (1U << VIRTIO_BLK_F_SEG_MAX) | + (1U << VIRTIO_BLK_F_BLK_SIZE); + + if (index >= MAX_BOARD_SUPPORTED) { + ret = -ENOBUFS; + goto _exit_; + } + if (vblk->virtblk_file == NULL) { + ret = -EBADF; + goto _exit_; + } + + vq = &vblk->vqs[0]; + mutex_lock(&vq->mutex); + rcu_assign_pointer(vq->private_data, vblk->virtblk_file); + mutex_unlock(&vq->mutex); + + snprintf(vblk->vb_wqname, sizeof(vblk->vb_wqname), + "virtblk wq %d", index); + vblk->vb_wq = __mic_create_singlethread_workqueue(vblk->vb_wqname); + if (vblk->vb_wq == NULL) { + ret = -ENOMEM; + goto _exit_; + } + INIT_WORK(&vblk->vb_ws_bh, handle_blk_kick); + + /* They have to be accessed from "struct vhost_virtqueue *vq" in mic_vhost.c. + They are not used in vhost block. I don't modify vhost.h. */ + vq->log_base = (void __user *)&bd_info->bi_ctx; + vq->log_addr = (u64)bd_info->bi_ctx.aper.va; + + vb_shared = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0)) + virtio_blk_features |= (1U << VIRTIO_BLK_F_FLUSH); +#endif + writel(virtio_blk_features, &vb_shared->host_features); + writel(DISK_SEG_MAX, &vb_shared->blk_config.seg_max); + writel(SECTOR_SIZE, &vb_shared->blk_config.blk_size); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)) + ret = vfs_getattr(&vblk->virtblk_file->f_path, &stat); +#else + ret = vfs_getattr(vblk->virtblk_file->f_path.mnt, + vblk->virtblk_file->f_path.dentry, &stat); +#endif + if (ret < 0) + goto _exit_; + + if (S_ISBLK(stat.mode)) { + writel(i_size_read(I_BDEV(vblk->virtblk_file->f_mapping->host)->bd_inode) / SECTOR_SIZE, + &vb_shared->blk_config.capacity); + } else { + writel(stat.size / SECTOR_SIZE, &vb_shared->blk_config.capacity); + } + + ret = mic_reg_irqhandler(&bd_info->bi_ctx, MIC_IRQ_DB2, "Host DoorBell 2", + vhost_doorbell_intr_handler); + +_exit_: + return ret; +} + +void +mic_vhost_blk_stop(bd_info_t *bd_info) +{ + struct vring *vring = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.vring; + + writel(0, &vring->num); /* reject subsequent request from MIC card */ +} + +extern bd_info_t *dev_to_bdi(struct device *dev); + +ssize_t +show_virtblk_file(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct board_info *bd_info = dev_to_bdi(dev); + struct mic_virtblk *mic_virtblk; + struct vhost_blk *vblk; + + BUG_ON(bd_info == NULL); + mic_virtblk = bd_info->bi_virtio; + BUG_ON(mic_virtblk == NULL); + vblk = mic_virtblk->vblk; + BUG_ON(vblk == NULL); + + if (vblk->file_name != NULL) + return snprintf(buf, PAGE_SIZE, "%s\n", vblk->file_name); + else + return 0; +} + +ssize_t +store_virtblk_file(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret = 0; + struct board_info *bd_info = dev_to_bdi(dev); + struct mic_virtblk *mic_virtblk; + struct vhost_blk *vblk; + struct vhost_virtqueue *vq; + char *p; + struct file *virtblk_file; + + BUG_ON(bd_info == NULL); + mic_virtblk = bd_info->bi_virtio; + BUG_ON(mic_virtblk == NULL); + vblk = mic_virtblk->vblk; + BUG_ON(vblk == NULL); + vq = &vblk->vqs[0]; + BUG_ON(vq == NULL); + + if (buf == NULL) { + ret = -EINVAL; + goto _return_; + } + if (count <= 1) { + ret = -EINVAL; + goto _return_; + } + + p = strchr(buf, '\n'); + if (p != NULL) + *p = '\0'; + + mutex_lock(&vq->mutex); + if (vblk->virtblk_file != NULL) { /* if virtblk file is already assigned */ + printk(KERN_ALERT "you are changing virtblk file: %s -> %s.\n", vblk->file_name, buf); + 
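/* drop the name and the file reference of the previously attached backing store before switching to the new one */ +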
kfree(vblk->file_name); + vblk->file_name = NULL; + filp_close(vblk->virtblk_file, current->files); + vblk->virtblk_file = NULL; + } + + vblk->file_name = kmalloc(count + 1, GFP_KERNEL); + strcpy(vblk->file_name, buf); + virtblk_file = filp_open(vblk->file_name, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(virtblk_file)) { + ret = PTR_ERR(virtblk_file); + mutex_unlock(&vq->mutex); + goto free_file_name; + } + vblk->virtblk_file = virtblk_file; + mutex_unlock(&vq->mutex); + + ret = vhost_blk_set_backend(vblk); + if (ret < 0) + goto close_virtblk_file; + + return count; + + close_virtblk_file: + filp_close(vblk->virtblk_file, current->files); + free_file_name: + kfree(vblk->file_name); + _return_: + return ret; +} + +int mic_vhost_blk_probe(bd_info_t *bd_info) +{ + int ret = 0; + char wq_name[8]; + struct mic_virtblk *mic_virtblk; + struct vhost_blk *vblk; + + mic_virtblk = kzalloc(sizeof(*mic_virtblk), GFP_KERNEL); + if (mic_virtblk == NULL) { + ret = -ENOMEM; + goto err_vblk; + } + bd_info->bi_virtio = mic_virtblk; + + vblk = kzalloc(sizeof *vblk, GFP_KERNEL); + if (vblk == NULL) { + ret = -ENOMEM; + goto free_mic_virtblk; + } + mic_virtblk->vblk = vblk; + vblk->bd_info = bd_info; + + ret = vhost_dev_init(&vblk->dev, vblk->vqs, VHOST_BLK_VQ_MAX); + if (ret < 0) + goto free_vblk; + +#if 0 + vhost_poll_init(vblk->poll, handle_rq_blk, POLLOUT|POLLIN, &vblk->dev); +#endif + + BUG_ON(bd_info->bi_ctx.bi_id >= 1000); + snprintf(wq_name, ARRAY_SIZE(wq_name), "vblk%03d", bd_info->bi_ctx.bi_id); + vblk->vblk_workqueue = __mic_create_singlethread_workqueue(wq_name); + if (vblk->vblk_workqueue == NULL) { + ret = -ENOMEM; + goto free_vblk; + } + + return ret; + + free_vblk: + kfree(vblk); + free_mic_virtblk: + kfree(mic_virtblk); + err_vblk: + return ret; +} + +void mic_vhost_blk_remove(bd_info_t *bd_info) +{ + struct mic_virtblk *mic_virtblk = bd_info->bi_virtio; + struct vhost_blk *vblk = mic_virtblk->vblk; + struct vb_shared *vb_shared = &mic_virtblk->vb_shared; + + if (vblk->virtblk_file != NULL) { + mic_unreg_irqhandler(&bd_info->bi_ctx, MIC_IRQ_DB2, "Host DoorBell 2"); + memset(&vb_shared->blk_config, 0, sizeof(vb_shared->blk_config)); + destroy_workqueue(vblk->vb_wq); + if (vblk->vqs[0].private_data != NULL) + fput(vblk->vqs[0].private_data); + kfree(vblk->file_name); + filp_close(vblk->virtblk_file, current->files); + } + vhost_dev_cleanup(&vblk->dev); + destroy_workqueue(vblk->vblk_workqueue); + kfree(vblk); + kfree(mic_virtblk); +} +#endif diff --git a/host/vhost/mic_vhost.c b/host/vhost/mic_vhost.c new file mode 100644 index 0000000..1aa946b --- /dev/null +++ b/host/vhost/mic_vhost.c @@ -0,0 +1,697 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. Tsirkin + * + * (C) Badari Pulavarty pbadari@us.ibm.com 2010 with the following comment. + * Inspiration, some code, and most witty comments come from + * Documentation/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. + + * For adapting to MIC + * (C) Copyright 2012 Intel Corporation + * Author: Caz Yokoyama + * + * Generic code for virtio server in host kernel. 
+ */ + +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || \ + defined(RHEL_RELEASE_CODE) + +#include +#ifdef RHEL_RELEASE_CODE +#include +#else +#include "./linux/vhost.h" +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#ifndef VIRTIO_RING_F_EVENT_IDX /* virtio_ring.h of rhel6.0 does not define */ +#define VIRTIO_RING_F_EVENT_IDX 29 +#endif +#include "vhost.h" +#include "mic/micveth_dma.h" + +#define mic_addr_in_host(va, pa) ((u8 *)(va) + (u64)(pa)) + +enum { + VHOST_MEMORY_MAX_NREGIONS = 64, + VHOST_MEMORY_F_LOG = 0x1, +}; + +#if 0 +static unsigned vhost_zcopy_mask __read_mostly; +#endif + +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct vhost_poll *poll; + poll = container_of(pt, struct vhost_poll, table); + + poll->wqh = wqh; + add_wait_queue(wqh, &poll->wait); +} + +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); + + if (!((unsigned long)key & poll->mask)) + return 0; + + vhost_poll_queue(poll); + return 0; +} + +static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) +{ + INIT_LIST_HEAD(&work->node); + work->fn = fn; + init_waitqueue_head(&work->done); + work->flushing = 0; + work->queue_seq = work->done_seq = 0; +} + +/* Init poll structure */ +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, + unsigned long mask, struct vhost_dev *dev) +{ + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); + init_poll_funcptr(&poll->table, vhost_poll_func); + poll->mask = mask; + poll->dev = dev; + + vhost_work_init(&poll->work, fn); +} + +#if 0 +/* Start polling a file. We add ourselves to file's wait queue. The caller must + * keep a reference to a file until after vhost_poll_stop is called. */ +void vhost_poll_start(struct vhost_poll *poll, struct file *file) +{ + unsigned long mask; + mask = file->f_op->poll(file, &poll->table); + if (mask) + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); +} +#endif + +/* Stop polling a file. After this function returns, it becomes safe to drop the + * file reference. You must also flush afterwards. */ +void vhost_poll_stop(struct vhost_poll *poll) +{ + remove_wait_queue(poll->wqh, &poll->wait); +} + +static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, + unsigned seq) +{ + int left; + spin_lock_irq(&dev->work_lock); + left = seq - work->done_seq; + spin_unlock_irq(&dev->work_lock); + return left <= 0; +} + +static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +{ + unsigned seq; + int flushing; + + spin_lock_irq(&dev->work_lock); + seq = work->queue_seq; + work->flushing++; + spin_unlock_irq(&dev->work_lock); + wait_event(work->done, vhost_work_seq_done(dev, work, seq)); + spin_lock_irq(&dev->work_lock); + flushing = --work->flushing; + spin_unlock_irq(&dev->work_lock); + BUG_ON(flushing < 0); +} + +/* Flush any work that has been scheduled. When calling this, don't hold any + * locks that are also used by the callback. 
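[Illustrative aside, not part of the patch] vhost_work_flush() above never joins the worker thread: it snapshots queue_seq under the work lock and then sleeps until done_seq has moved past that snapshot. A minimal user-space sketch of the wrap-safe comparison behind vhost_work_seq_done(); seq_done() and the sample values are invented for the example:

#include <stdio.h>

/* Wrap-safe "has the worker caught up to my snapshot?" test. */
static int seq_done(unsigned int done_seq, unsigned int snapshot)
{
        int left = snapshot - done_seq;   /* signed distance survives 32-bit wrap */

        return left <= 0;                 /* worker has reached the snapshot */
}

int main(void)
{
        printf("%d\n", seq_done(5, 5));           /* 1: already done        */
        printf("%d\n", seq_done(4, 5));           /* 0: still pending       */
        printf("%d\n", seq_done(3, 0xfffffffeu)); /* 1: done after wrapping */
        return 0;
}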
*/ +void vhost_poll_flush(struct vhost_poll *poll) +{ + vhost_work_flush(poll->dev, &poll->work); +} + +static inline void vhost_work_queue(struct vhost_dev *dev, + struct vhost_work *work) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->work_lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &dev->work_list); + work->queue_seq++; + wake_up_process(dev->worker); + } + spin_unlock_irqrestore(&dev->work_lock, flags); +} + +void vhost_poll_queue(struct vhost_poll *poll) +{ + vhost_work_queue(poll->dev, &poll->work); +} + +static void vhost_vq_reset(struct vhost_dev *dev, + struct vhost_virtqueue *vq) +{ + vq->num = 1; + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + vq->last_avail_idx = 0; + vq->avail_idx = 0; + vq->last_used_idx = 0; + vq->signalled_used = 0; + vq->signalled_used_valid = false; + vq->used_flags = 0; + vq->log_used = false; + vq->log_addr = -1ull; + vq->vhost_hlen = 0; + vq->sock_hlen = 0; + vq->private_data = NULL; + vq->log_base = NULL; + vq->error_ctx = NULL; + vq->error = NULL; + vq->kick = NULL; + vq->call_ctx = NULL; + vq->call = NULL; + vq->log_ctx = NULL; + vq->upend_idx = 0; + vq->done_idx = 0; + vq->ubufs = NULL; +} + +static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) +{ + kfree(vq->indirect); + vq->indirect = NULL; + kfree(vq->log); + vq->log = NULL; + kfree(vq->heads); + vq->heads = NULL; + kfree(vq->ubuf_info); + vq->ubuf_info = NULL; +} + +#if 0 +void vhost_enable_zcopy(int vq) +{ + vhost_zcopy_mask |= 0x1 << vq; +} +#endif + +static void vhost_dev_free_iovecs(struct vhost_dev *dev) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) + vhost_vq_free_iovecs(&dev->vqs[i]); +} + +long vhost_dev_init(struct vhost_dev *dev, + struct vhost_virtqueue *vqs, int nvqs) +{ + int i; + + dev->vqs = vqs; + dev->nvqs = nvqs; + mutex_init(&dev->mutex); + dev->log_ctx = NULL; + dev->log_file = NULL; + dev->memory = NULL; + dev->mm = NULL; + spin_lock_init(&dev->work_lock); + INIT_LIST_HEAD(&dev->work_list); + dev->worker = NULL; + + for (i = 0; i < dev->nvqs; ++i) { + dev->vqs[i].log = NULL; + dev->vqs[i].indirect = NULL; + dev->vqs[i].heads = NULL; + dev->vqs[i].ubuf_info = NULL; + dev->vqs[i].dev = dev; + mutex_init(&dev->vqs[i].mutex); + vhost_vq_reset(dev, dev->vqs + i); + if (dev->vqs[i].handle_kick) + vhost_poll_init(&dev->vqs[i].poll, + dev->vqs[i].handle_kick, POLLIN, dev); + } + + return 0; +} + +#if 0 +/* Caller should have device mutex */ +long vhost_dev_check_owner(struct vhost_dev *dev) +{ + /* Are you the owner? If not, I don't think you mean to do that */ + return dev->mm == current->mm ? 0 : -EPERM; +} +#endif + +struct vhost_attach_cgroups_struct { + struct vhost_work work; + struct task_struct *owner; + int ret; +}; + +#if 0 +/* Caller should have device mutex */ +long vhost_dev_reset_owner(struct vhost_dev *dev) +{ + struct vhost_memory *memory; + + /* Restore memory to default empty mapping. */ + memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); + if (!memory) + return -ENOMEM; + + vhost_dev_cleanup(dev); + + memory->nregions = 0; + dev->memory = memory; + return 0; +} +#endif + +/* In case of DMA done not in order in lower device driver for some reason. + * upend_idx is used to track end of used idx, done_idx is used to track head + * of used idx. Once lower device DMA done contiguously, we will signal KVM + * guest used idx. 
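[Illustrative aside, not part of the patch] The function that follows scans the outstanding zero-copy buffers from done_idx toward upend_idx and publishes only a contiguous prefix of completed ones, so the guest's used index never skips an in-flight request. A stand-alone model of that catch-up scan; RING_SIZE, the heads[] encoding and signal_used() are assumptions made for this example, not driver code:

#include <stdio.h>

#define RING_SIZE       8
#define DMA_DONE        1
#define DMA_PENDING     0

static int heads[RING_SIZE];    /* completion state of each outstanding buffer */

static int signal_used(int *done_idx, int upend_idx)
{
        int i, published = 0;

        for (i = *done_idx; i != upend_idx; i = (i + 1) % RING_SIZE) {
                if (heads[i] != DMA_DONE)
                        break;   /* first still-pending entry stops the scan */
                published++;     /* here the driver would add_used and signal */
        }
        if (published)
                *done_idx = i;
        return published;
}

int main(void)
{
        int done_idx = 6, upend_idx = 3;   /* five buffers outstanding, wrapping */
        int n;

        heads[6] = heads[7] = heads[0] = DMA_DONE;   /* first three finished   */
        heads[1] = heads[2] = DMA_PENDING;           /* rest still in flight   */

        n = signal_used(&done_idx, upend_idx);
        printf("published %d, done_idx now %d\n", n, done_idx);  /* 3 and 1 */
        return 0;
}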
+ */ +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq) +{ + int i; + int j = 0; + + for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) { + if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) { + vq->heads[i].len = VHOST_DMA_CLEAR_LEN; + vhost_add_used_and_signal(vq->dev, vq, + vq->heads[i].id, 0); + ++j; + } else + break; + } + if (j) + vq->done_idx = i; + return j; +} + +/* Caller should have device mutex */ +void vhost_dev_cleanup(struct vhost_dev *dev) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { + vhost_poll_stop(&dev->vqs[i].poll); + vhost_poll_flush(&dev->vqs[i].poll); + } + BUG_ON(dev->vqs[i].ubufs != NULL); + + /* Signal guest as appropriate. */ + vhost_zerocopy_signal_used(&dev->vqs[i]); + + if (dev->vqs[i].error_ctx) + eventfd_ctx_put(dev->vqs[i].error_ctx); + if (dev->vqs[i].error) + fput(dev->vqs[i].error); + if (dev->vqs[i].kick) + fput(dev->vqs[i].kick); + if (dev->vqs[i].call_ctx) + eventfd_ctx_put(dev->vqs[i].call_ctx); + if (dev->vqs[i].call) + fput(dev->vqs[i].call); + vhost_vq_reset(dev, dev->vqs + i); + } + vhost_dev_free_iovecs(dev); + if (dev->log_ctx) + eventfd_ctx_put(dev->log_ctx); + dev->log_ctx = NULL; + if (dev->log_file) + fput(dev->log_file); + dev->log_file = NULL; + /* No one will access memory at this point */ + kfree(dev->memory); + dev->memory = NULL; + WARN_ON(!list_empty(&dev->work_list)); + if (dev->worker) { + kthread_stop(dev->worker); + dev->worker = NULL; + } + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +} + +#if 0 +/* Caller must have device mutex */ +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) +{ + return 0; +} +#endif + +static int vhost_update_used_flags(struct vhost_virtqueue *vq) +{ + iowrite16(vq->used_flags, mic_addr_in_host(vq->log_addr, &vq->used->flags)); + return 0; +} + +#if 0 +int vhost_init_used(struct vhost_virtqueue *vq) +{ + int r; + if (!vq->private_data) + return 0; + + r = vhost_update_used_flags(vq); + if (r) + return r; + vq->signalled_used_valid = false; + vq->last_used_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->used->idx)); + return 0; +} +#endif + +/* Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, + * or -1U if we're at the end. */ +static unsigned next_desc(struct vring_desc *desc) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc->flags & VRING_DESC_F_NEXT)) + return -1U; + + /* Check they're not leading us off end of descriptors. */ + next = desc->next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + /* We will use the result as an index in an array, so most + * architectures only need a compiler barrier here. */ + read_barrier_depends(); + + return next; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which is + * never a valid descriptor number) if none was found. A negative code is + * returned on error. 
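[Illustrative aside, not part of the patch] The comment above describes how a descriptor chain becomes one iovec; a self-contained user-space sketch of that flattening follows, with struct and flag definitions mirroring the virtio ring layout and a sample chain shaped like the virtio-blk requests handle_blk() sees (16-byte header, data buffer, 1-byte status). chain_to_iov() itself is hypothetical, not the driver's code:

#include <stdio.h>
#include <stdint.h>
#include <sys/uio.h>

#define DESC_F_NEXT     1
#define DESC_F_WRITE    2

struct desc {
        uint64_t addr;
        uint32_t len;
        uint16_t flags;
        uint16_t next;
};

static int chain_to_iov(const struct desc *table, unsigned num, unsigned head,
                        struct iovec *iov, unsigned *out, unsigned *in)
{
        unsigned i = head, found = 0;

        *out = *in = 0;
        for (;;) {
                if (i >= num || ++found > num)
                        return -1;          /* bad index or loop in the chain */
                iov[*out + *in].iov_base = (void *)(uintptr_t)table[i].addr;
                iov[*out + *in].iov_len = table[i].len;
                if (table[i].flags & DESC_F_WRITE)
                        (*in)++;            /* device-writable buffer */
                else if (*in)
                        return -1;          /* out descriptor after an in one */
                else
                        (*out)++;
                if (!(table[i].flags & DESC_F_NEXT))
                        return (int)head;   /* end of chain */
                i = table[i].next;
        }
}

int main(void)
{
        /* request header (out), data buffer (in), status byte (in) */
        struct desc ring[3] = {
                { 0x1000, 16,  DESC_F_NEXT, 1 },
                { 0x2000, 512, DESC_F_NEXT | DESC_F_WRITE, 2 },
                { 0x3000, 1,   DESC_F_WRITE, 0 },
        };
        struct iovec iov[3];
        unsigned out, in;

        chain_to_iov(ring, 3, 0, iov, &out, &in);
        printf("out=%u in=%u\n", out, in);   /* out=1 in=2 */
        return 0;
}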
*/ +int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num) +{ + struct vring_desc desc; + unsigned int i, head, found = 0; + u16 last_avail_idx; + int ret; + + /* Check it isn't doing very strange things with descriptor numbers. */ + last_avail_idx = vq->last_avail_idx; + vq->avail_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->idx)); + + if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { + vq_err(vq, "Guest moved used index from %u to %u", + last_avail_idx, vq->avail_idx); + return -EFAULT; + } + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->avail_idx == last_avail_idx) + return vq->num; + + /* Only get avail ring entries after they have been exposed by guest. */ + smp_rmb(); + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + head = ioread16(mic_addr_in_host(vq->log_addr, + &vq->avail->ring[last_avail_idx % vq->num])); + + /* If their number is silly, that's an error. */ + if (unlikely(head >= vq->num)) { + vq_err(vq, "Guest says index %u > %u is available", + head, vq->num); + return -EINVAL; + } + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + if (unlikely(log)) + *log_num = 0; + + i = head; + do { + unsigned iov_count = *in_num + *out_num; + if (unlikely(i >= vq->num)) { + vq_err(vq, "Desc index is %u > %u, head = %u", + i, vq->num, head); + return -EINVAL; + } + if (unlikely(++found > vq->num)) { + vq_err(vq, "Loop detected: last one at %u " + "vq size %u head %u\n", + i, vq->num, head); + return -EINVAL; + } + memcpy_fromio(&desc, mic_addr_in_host(vq->log_addr, vq->desc + i), sizeof(desc)); + + (iov + iov_count)->iov_base = (void *)desc.addr; + (iov + iov_count)->iov_len = desc.len; + ret = 1; + if (desc.flags & VRING_DESC_F_WRITE) { + /* If this is an input descriptor, + * increment that count. */ + *in_num += ret; + if (unlikely(log)) { + log[*log_num].addr = desc.addr; + log[*log_num].len = desc.len; + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (unlikely(*in_num)) { + vq_err(vq, "Descriptor has out after in: " + "idx %d\n", i); + return -EINVAL; + } + *out_num += ret; + } + } while ((i = next_desc(&desc)) != -1); + + /* On success, increment avail index. */ + vq->last_avail_idx++; + + /* Assume notifications from guest are disabled at this point, + * if they aren't we would need to update avail_event index. */ + BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY)); + return head; +} + +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n) +{ + vq->last_avail_idx -= n; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. */ +int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) +{ + struct vring_used_elem __user *used; + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. */ + used = &vq->used->ring[vq->last_used_idx % vq->num]; + iowrite16(head, mic_addr_in_host(vq->log_addr, &used->id)); + iowrite16(len, mic_addr_in_host(vq->log_addr, &used->len)); + /* Make sure buffer is written before we update index. 
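[Illustrative aside, not part of the patch] The comment above is the whole contract of vhost_add_used(): fill the used-ring slot first, then let the barrier that follows order it before the index update that exposes the slot to the guest. A minimal single-producer analogue using a C11 release store in place of smp_wmb(); QSZ, add_used() and the sample values are invented for the example:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define QSZ 4

struct used_elem { uint32_t id; uint32_t len; };

static struct used_elem ring[QSZ];
static _Atomic uint16_t used_idx;
static uint16_t last_used_idx;

static void add_used(uint32_t id, uint32_t len)
{
        ring[last_used_idx % QSZ] = (struct used_elem){ id, len };
        /* slot contents must be visible before the index that exposes them */
        atomic_store_explicit(&used_idx, ++last_used_idx, memory_order_release);
}

int main(void)
{
        add_used(7, 512);
        printf("idx=%u id=%u\n",
               (unsigned)atomic_load(&used_idx), (unsigned)ring[0].id);  /* idx=1 id=7 */
        return 0;
}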
*/ + smp_wmb(); + ioread16(mic_addr_in_host(vq->log_addr, &used->id)); + iowrite16(vq->last_used_idx + 1, mic_addr_in_host(vq->log_addr, &vq->used->idx)); + + vq->last_used_idx++; + + /* If the driver never bothers to signal in a very long while, + * used index might wrap around. If that happens, invalidate + * signalled_used index we stored. TODO: make sure driver + * signals at least once in 2^16 and remove this. */ + if (unlikely(vq->last_used_idx == vq->signalled_used)) + vq->signalled_used_valid = false; + return 0; +} + +static int __vhost_add_used_n(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + unsigned count) +{ + struct vring_used_elem __user *used; + u16 old, new; + int start; + + start = vq->last_used_idx % vq->num; + used = vq->used->ring + start; + memcpy_toio(mic_addr_in_host(vq->log_addr, used), heads, count * sizeof(*used)); + old = vq->last_used_idx; + new = (vq->last_used_idx += count); + /* If the driver never bothers to signal in a very long while, + * used index might wrap around. If that happens, invalidate + * signalled_used index we stored. TODO: make sure driver + * signals at least once in 2^16 and remove this. */ + if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) + vq->signalled_used_valid = false; + return 0; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. */ +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, + unsigned count) +{ + int start, n, r; + + start = vq->last_used_idx % vq->num; + n = vq->num - start; + if (n < count) { + r = __vhost_add_used_n(vq, heads, n); + if (r < 0) + return r; + heads += n; + count -= n; + } + r = __vhost_add_used_n(vq, heads, count); + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + iowrite16(vq->last_used_idx, mic_addr_in_host(vq->log_addr, &vq->used->idx)); + return r; +} + +static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __u16 old, new; + bool v; + /* Flush out used index updates. This is paired + * with the barrier that the Guest executes when enabling + * interrupts. */ + smp_mb(); + + if (vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + unlikely(vq->avail_idx == vq->last_avail_idx)) + return true; + + if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + __u16 flags; + flags = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->flags)); + return !(flags & VRING_AVAIL_F_NO_INTERRUPT); + } + old = vq->signalled_used; + v = vq->signalled_used_valid; + new = vq->signalled_used = vq->last_used_idx; + vq->signalled_used_valid = true; + + if (unlikely(!v)) + return true; + + return false; +} + +/* This actually signals the guest, using eventfd. */ +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + /* Signal the Guest tell them we used something up. */ + if (vq->log_base && vhost_notify(dev, vq)) + mic_send_virtio_intr((struct _mic_ctx_t *)vq->log_base); +} + +/* And here's the combo meal deal. Supersize me! 
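[Illustrative aside, not part of the patch] The (u16)(new - signalled_used) < (u16)(new - old) test in __vhost_add_used_n() above is easy to misread: it simply asks whether the used index stepped onto or past the last signalled value while advancing from old to new, wrap-around included, in which case the cached signalled_used must be invalidated. A stand-alone check of that predicate; passed() and the sample indices are invented for the example:

#include <stdint.h>
#include <stdio.h>

static int passed(uint16_t old, uint16_t new, uint16_t signalled)
{
        return (uint16_t)(new - signalled) < (uint16_t)(new - old);
}

int main(void)
{
        printf("%d\n", passed(10, 14, 12));        /* 1: 12 lies in (10,14]      */
        printf("%d\n", passed(10, 14, 9));         /* 0: 9 was not crossed       */
        printf("%d\n", passed(65530, 4, 65533));   /* 1: crossed across the wrap */
        return 0;
}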
*/ +void vhost_add_used_and_signal(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + vhost_add_used(vq, head, len); + vhost_signal(dev, vq); +} + +#if 0 +/* multi-buffer version of vhost_add_used_and_signal */ +void vhost_add_used_and_signal_n(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + struct vring_used_elem *heads, unsigned count) +{ + vhost_add_used_n(vq, heads, count); + vhost_signal(dev, vq); +} +#endif + +/* OK, now we need to know about added descriptors. */ +bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + u16 avail_idx; + int r; + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) + return false; + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; + if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + r = vhost_update_used_flags(vq); + if (r) { + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + return false; + } + } + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + smp_mb(); + avail_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->idx)); + + return avail_idx != vq->avail_idx; +} + +/* We don't need to be notified again. */ +void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + int r; + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) + return; + vq->used_flags |= VRING_USED_F_NO_NOTIFY; + if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + r = vhost_update_used_flags(vq); + if (r) + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + } +} +#endif diff --git a/host/vhost/vhost.h b/host/vhost/vhost.h new file mode 100644 index 0000000..9bb1653 --- /dev/null +++ b/host/vhost/vhost.h @@ -0,0 +1,261 @@ +/* + This is the exact copy of linux-2.6.32-220.7.1.el6.x86_64/drivers/vhost/vhost.h + except for this comment. + */ +#ifndef _VHOST_H +#define _VHOST_H + +#include +#ifdef RHEL_RELEASE_CODE +#include +#else +#include "./linux/vhost.h" +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This is for zerocopy, used buffer len is set to 1 when lower device DMA + * done */ +#define VHOST_DMA_DONE_LEN 1 +#define VHOST_DMA_CLEAR_LEN 0 + +struct vhost_device; + +struct vhost_work; +typedef void (*vhost_work_fn_t)(struct vhost_work *work); + +struct vhost_work { + struct list_head node; + vhost_work_fn_t fn; + wait_queue_head_t done; + int flushing; + unsigned queue_seq; + unsigned done_seq; +}; + +/* Poll a file (eventfd or socket) */ +/* Note: there's nothing vhost specific about this structure. 
*/ +struct vhost_poll { + poll_table table; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct vhost_work work; + unsigned long mask; + struct vhost_dev *dev; +}; + +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, + unsigned long mask, struct vhost_dev *dev); +void vhost_poll_start(struct vhost_poll *poll, struct file *file); +void vhost_poll_stop(struct vhost_poll *poll); +void vhost_poll_flush(struct vhost_poll *poll); +void vhost_poll_queue(struct vhost_poll *poll); + +struct vhost_log { + u64 addr; + u64 len; +}; + +struct vhost_virtqueue; + +struct vhost_ubuf_ref { + struct kref kref; + wait_queue_head_t wait; + struct vhost_virtqueue *vq; +}; + +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy); +void vhost_ubuf_put(struct vhost_ubuf_ref *); +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *); + +/* The virtqueue structure describes a queue attached to a device. */ +struct vhost_virtqueue { + struct vhost_dev *dev; + + /* The actual ring of buffers. */ + struct mutex mutex; + unsigned int num; + struct vring_desc __user *desc; + struct vring_avail __user *avail; + struct vring_used __user *used; + struct file *kick; + struct file *call; + struct file *error; + struct eventfd_ctx *call_ctx; + struct eventfd_ctx *error_ctx; + struct eventfd_ctx *log_ctx; + + struct vhost_poll poll; + + /* The routine to call when the Guest pings us, or timeout. */ + vhost_work_fn_t handle_kick; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* Caches available index value from user. */ + u16 avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* Used flags */ + u16 used_flags; + + /* Last used index value we have signalled on */ + u16 signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + + /* Log writes to used structure. */ + bool log_used; + u64 log_addr; + + struct iovec iov[UIO_MAXIOV]; + /* hdr is used to store the virtio header. + * Since each iovec has >= 1 byte length, we never need more than + * header length entries to store the header. */ + struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; + struct iovec *indirect; + size_t vhost_hlen; + size_t sock_hlen; + struct vring_used_elem *heads; + /* We use a kind of RCU to access private pointer. + * All readers access it from worker, which makes it possible to + * flush the vhost_work instead of synchronize_rcu. Therefore readers do + * not need to call rcu_read_lock/rcu_read_unlock: the beginning of + * vhost_work execution acts instead of rcu_read_lock() and the end of + * vhost_work execution acts instead of rcu_read_lock(). + * Writers use virtqueue mutex. */ + void *private_data; + /* Log write descriptors */ + void __user *log_base; + struct vhost_log *log; + /* vhost zerocopy support fields below: */ + /* last used idx for outstanding DMA zerocopy buffers */ + int upend_idx; + /* first used idx for DMA done zerocopy buffers */ + int done_idx; + /* an array of userspace buffers info */ + struct ubuf_info *ubuf_info; + /* Reference counting for outstanding ubufs. + * Protected by vq mutex. Writers must also take device mutex. */ + struct vhost_ubuf_ref *ubufs; +}; + +struct vhost_dev { + /* Readers use RCU to access memory table pointer + * log base pointer and features. 
+ * Writers use mutex below.*/ + struct vhost_memory *memory; + struct mm_struct *mm; + struct mutex mutex; + unsigned acked_features; + struct vhost_virtqueue *vqs; + int nvqs; + struct file *log_file; + struct eventfd_ctx *log_ctx; + spinlock_t work_lock; + struct list_head work_list; + struct task_struct *worker; +}; + +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_check_owner(struct vhost_dev *); +long vhost_dev_reset_owner(struct vhost_dev *); +void vhost_dev_cleanup(struct vhost_dev *); +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); +int vhost_vq_access_ok(struct vhost_virtqueue *vq); +int vhost_log_access_ok(struct vhost_dev *); + +int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, + struct iovec iov[], unsigned int iov_count, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num); +void vhost_discard_vq_desc(struct vhost_virtqueue *, int n); + +int vhost_init_used(struct vhost_virtqueue *); +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); +int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads, + unsigned count); +void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, + unsigned int id, int len); +void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, + struct vring_used_elem *heads, unsigned count); +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); +void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); +bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); + +int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, + unsigned int log_num, u64 len); +void vhost_zerocopy_callback(void *arg); +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq); + +#define vq_err(vq, fmt, ...) 
do { \ + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ + if ((vq)->error_ctx) \ + eventfd_signal((vq)->error_ctx, 1);\ + } while (0) + +#ifndef __rcu_dereference_index_check +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)) +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + rcu_lockdep_assert(c, \ + "suspicious rcu_dereference_index_check()" \ + " usage"); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) +#else +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + RCU_LOCKDEP_WARN(c, \ + "suspicious rcu_dereference_index_check()" \ + " usage"); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) +#endif +#endif + +enum { + VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | + (1ULL << VHOST_F_LOG_ALL) | + (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF), +}; + +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) +{ +#ifdef RHEL_RELEASE_CODE +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + unsigned acked_features = rcu_dereference_index_check(dev->acked_features, rcu_read_lock_held()); +#else + unsigned acked_features = rcu_dereference(dev->acked_features); +#endif +#else +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)) + unsigned acked_features = rcu_dereference_index_check(dev->acked_features, rcu_read_lock_held()); +#else + unsigned acked_features = __rcu_dereference_index_check(dev->acked_features, rcu_read_lock_held()); +#endif +#endif + return acked_features & (1 << bit); +} + +void vhost_enable_zcopy(int vq); + +#endif diff --git a/host/vmcore.c b/host/vmcore.c new file mode 100644 index 0000000..fb5819d --- /dev/null +++ b/host/vmcore.c @@ -0,0 +1,821 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +#include +#endif +#include "mic_common.h" + +extern struct proc_dir_entry *vmcore_dir; + +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = 0x50e9000; + +/** + * mic_copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t mic_copy_oldmem_page(mic_ctx_t *mic_ctx, + unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr, *tmp; + int err; + struct dma_channel *dma_chan; + dma_addr_t mic_dst_phys_addr; + + vaddr = mic_ctx->aper.va + (pfn << PAGE_SHIFT); + + if (!csize) + return 0; + if (csize == PAGE_SIZE && !offset) { + if (!(tmp = (void*)__get_free_pages(GFP_KERNEL, get_order(PAGE_SIZE)))) { + printk(KERN_ERR "%s: tmp buffer allocation failed\n", __func__); + return -ENOMEM; + } + mic_dst_phys_addr = mic_ctx_map_single(mic_ctx, tmp, csize); + if (mic_map_error(mic_dst_phys_addr)) { + printk(KERN_ERR "%s: mic_ctx_map_single failed\n", __func__); + free_pages((unsigned long)tmp, get_order(PAGE_SIZE)); + return -ENOMEM; + } + + if ((allocate_dma_channel(mic_ctx->dma_handle, &dma_chan))) { + printk(KERN_ERR "%s: allocate_dma_channel failed\n", __func__); + mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize); + free_pages((unsigned long)tmp, get_order(PAGE_SIZE)); + return -EBUSY; + } + + err = do_dma(dma_chan, + 0, + pfn << PAGE_SHIFT, + mic_dst_phys_addr, + csize, + NULL); + if (err) { + printk(KERN_ERR "DMA do_dma err %s %d err %d src 0x%lx " + "dst 0x%llx csize 0x%lx\n", + __func__, __LINE__, err, pfn << PAGE_SHIFT, + mic_dst_phys_addr, csize); + free_dma_channel(dma_chan); + mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize); + free_pages((unsigned long)tmp, get_order(PAGE_SIZE)); + return err; + } + free_dma_channel(dma_chan); + err = drain_dma_poll(dma_chan); + if (err) { + printk(KERN_ERR "DMA poll err %s %d err %d src 0x%lx i" + "dst 0x%llx csize 0x%lx\n", + __func__, __LINE__, err, pfn << PAGE_SHIFT, + mic_dst_phys_addr, csize); + mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize); + free_pages((unsigned long)tmp, get_order(PAGE_SIZE)); + return err; + } + if (userbuf) { + if (copy_to_user(buf, tmp, csize)) { + mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize); + free_pages((unsigned long)tmp, get_order(PAGE_SIZE)); + return 
-EFAULT; + } + } else { + memcpy(buf, tmp, csize); + } + smp_mb(); + mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize); + free_pages((unsigned long)tmp, get_order(PAGE_SIZE)); + } else { + if (userbuf) { + if (copy_to_user(buf, vaddr + offset, csize)) + return -EFAULT; + } else + memcpy_fromio(buf, vaddr + offset, csize); + } + return csize; +} + +/* Reads a page from the oldmem device from given offset. */ +static ssize_t read_from_oldmem(mic_ctx_t *mic_ctx, + char *buf, size_t count, + u64 *ppos, int userbuf) +{ + unsigned long pfn, offset; + size_t nr_bytes; + ssize_t read = 0, tmp; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + tmp = mic_copy_oldmem_page(mic_ctx, pfn, buf, nr_bytes, offset, userbuf); + if (tmp < 0) + return tmp; + *ppos += nr_bytes; + count -= nr_bytes; + buf += nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + + return read; +} + +/* Maps vmcore file offset to respective physical address in memroy. */ +static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, + struct vmcore **m_ptr) +{ + struct vmcore *m; + u64 paddr; + + list_for_each_entry(m, vc_list, list) { + u64 start, end; + start = m->offset; + end = m->offset + m->size - 1; + if (offset >= start && offset <= end) { + paddr = m->paddr + offset - start; + *m_ptr = m; + return paddr; + } + } + *m_ptr = NULL; + return 0; +} + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. + */ +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0, tmp; + size_t tsz; + u64 start, nr_bytes; + struct vmcore *curr_m = NULL; + struct inode *inode = file->f_path.dentry->d_inode; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + mic_ctx_t *mic_ctx = PDE_DATA(inode); +#else + struct proc_dir_entry *entry = PDE(inode); + mic_ctx_t *mic_ctx = entry->data; +#endif + + if (buflen == 0 || *fpos >= mic_ctx->vmcore_size) + return 0; + + /* trim buflen to not go beyond EOF */ + if (buflen > mic_ctx->vmcore_size - *fpos) + buflen = mic_ctx->vmcore_size - *fpos; + + /* Read ELF core header */ + if (*fpos < mic_ctx->elfcorebuf_sz) { + tsz = mic_ctx->elfcorebuf_sz - *fpos; + if (buflen < tsz) + tsz = buflen; + if (copy_to_user(buffer, mic_ctx->elfcorebuf + *fpos, tsz)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + start = map_offset_to_paddr(*fpos, &mic_ctx->vmcore_list, &curr_m); + if (!curr_m) + return -EINVAL; + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + + while (buflen) { + tmp = read_from_oldmem(mic_ctx,buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + if (start >= (curr_m->paddr + curr_m->size)) { + if (curr_m->list.next == &mic_ctx->vmcore_list) + return acc; /*EOF*/ + curr_m = list_entry(curr_m->list.next, + struct vmcore, list); + start = curr_m->paddr; + } + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + /* Calculate left bytes in current memory segment. 
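[Illustrative aside, not part of the patch] read_from_oldmem() above serves an arbitrary byte range by splitting it at page boundaries: only the first chunk may start mid-page, every later chunk starts at offset 0 of the next frame. A simplified user-space model of that loop; PAGE_SZ, copy_page() and read_old() are stand-ins invented for the example:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SZ 4096UL

static void copy_page(unsigned long pfn, unsigned long offset, size_t n)
{
        printf("pfn %lu offset %lu bytes %zu\n", pfn, offset, n);
}

static size_t read_old(uint64_t pos, size_t count)
{
        unsigned long pfn = pos / PAGE_SZ;
        unsigned long offset = pos % PAGE_SZ;
        size_t done = 0;

        while (count) {
                size_t n = count > PAGE_SZ - offset ? PAGE_SZ - offset : count;

                copy_page(pfn, offset, n);
                done += n;
                count -= n;
                pfn++;
                offset = 0;     /* only the first chunk starts mid-page */
        }
        return done;
}

int main(void)
{
        /* 6000 bytes starting 100 bytes into page 2: chunks of 3996 and 2004 */
        printf("total %zu\n", read_old(2 * PAGE_SZ + 100, 6000));
        return 0;
}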
*/ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + } + return acc; +} + +static const struct file_operations proc_vmcore_operations = { + .read = read_vmcore, +}; + +static struct vmcore* get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 get_vmcore_size_elf64(char *elfptr) +{ + int i; + u64 size; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +static u64 get_vmcore_size_elf32(char *elfptr) +{ + int i; + u64 size; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +/* Merges all the PT_NOTE headers into one. */ +static int merge_note_headers_elf64(mic_ctx_t *mic_ctx, + char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr, *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(mic_ctx, notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf64_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Merges all the PT_NOTE headers into one. 
*/ +static int merge_note_headers_elf32(mic_ctx_t *mic_ctx, + char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr, *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(mic_ctx, notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf32_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. 
*/ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +static int process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void set_vmcore_list_offsets_elf64(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf64_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +/* Sets offset fields of vmcore elements. */ +static void set_vmcore_list_offsets_elf32(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf32_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static int parse_crash_elf64_headers(mic_ctx_t *mic_ctx) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem(mic_ctx, (char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || +#ifdef CONFIG_CRASH_DUMP +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,36)) + !vmcore_elf64_check_arch(&ehdr) || +#else + !vmcore_elf_check_arch(&ehdr) || +#endif +#else + !elf_check_arch(&ehdr) || +#endif + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + WARN_ON(mic_ctx->elfcorebuf); + /* Read in all elf headers. */ + mic_ctx->elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); + mic_ctx->elfcorebuf = kmalloc(mic_ctx->elfcorebuf_sz, GFP_KERNEL); + if (!mic_ctx->elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(mic_ctx, mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + return rc; + } + + /* Merge all PT_NOTE headers into one. 
*/ + rc = merge_note_headers_elf64(mic_ctx, mic_ctx->elfcorebuf, &mic_ctx->elfcorebuf_sz, &mic_ctx->vmcore_list); + if (rc) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + return rc; + } + rc = process_ptload_program_headers_elf64(mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz, + &mic_ctx->vmcore_list); + if (rc) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + return rc; + } + set_vmcore_list_offsets_elf64(mic_ctx->elfcorebuf, &mic_ctx->vmcore_list); + return 0; +} + +static int parse_crash_elf32_headers(mic_ctx_t *mic_ctx) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem(mic_ctx, (char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + WARN_ON(mic_ctx->elfcorebuf); + /* Read in all elf headers. */ + mic_ctx->elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + mic_ctx->elfcorebuf = kmalloc(mic_ctx->elfcorebuf_sz, GFP_KERNEL); + if (!mic_ctx->elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(mic_ctx, mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(mic_ctx, mic_ctx->elfcorebuf, &mic_ctx->elfcorebuf_sz, &mic_ctx->vmcore_list); + if (rc) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + return rc; + } + rc = process_ptload_program_headers_elf32(mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz, + &mic_ctx->vmcore_list); + if (rc) { + kfree(mic_ctx->elfcorebuf); + mic_ctx->elfcorebuf = NULL; + return rc; + } + set_vmcore_list_offsets_elf32(mic_ctx->elfcorebuf, &mic_ctx->vmcore_list); + return 0; +} + +static int parse_crash_elf_headers(mic_ctx_t *mic_ctx) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = read_from_oldmem(mic_ctx, e_ident, EI_NIDENT, &addr, 0); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + printk(KERN_WARNING "Warning: Core image elf header" + " not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(mic_ctx); + if (rc) + return rc; + + /* Determine vmcore size. */ + mic_ctx->vmcore_size = get_vmcore_size_elf64(mic_ctx->elfcorebuf); + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(mic_ctx); + if (rc) + return rc; + + /* Determine vmcore size. */ + mic_ctx->vmcore_size = get_vmcore_size_elf32(mic_ctx->elfcorebuf); + } else { + printk(KERN_WARNING "Warning: Core image elf header is not" + " sane\n"); + return -EINVAL; + } + return 0; +} + +/* Init function for vmcore module. 
*/ +int vmcore_create(mic_ctx_t *mic_ctx) +{ + int rc = 0; + char name[64]; + if (!vmcore_dir) { + rc = -ENOMEM; + return rc; + } + INIT_LIST_HEAD(&mic_ctx->vmcore_list); + rc = parse_crash_elf_headers(mic_ctx); + if (rc) { + printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + if (mic_ctx->vmcore_dir) { + remove_proc_entry(name, vmcore_dir); + mic_ctx->vmcore_dir = NULL; + } + return rc; + } + snprintf(name, 64, "mic%d", mic_ctx->bi_id); + if (!mic_ctx->vmcore_dir) { + mic_ctx->vmcore_dir = proc_create_data(name, S_IRUSR, + vmcore_dir, &proc_vmcore_operations, mic_ctx); + if (!mic_ctx->vmcore_dir) { + printk(KERN_WARNING "Kdump: proc creation for %s failed\n", name); + rc = -ENOMEM; + return rc; + } + } +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +#else + if (mic_ctx->vmcore_dir) + mic_ctx->vmcore_dir->size = mic_ctx->vmcore_size; +#endif + return 0; +} diff --git a/include/mic/bootparams.h b/include/mic/bootparams.h new file mode 100644 index 0000000..2102362 --- /dev/null +++ b/include/mic/bootparams.h @@ -0,0 +1,170 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIC_BOOT_PARAM_HEADER_VERSION 8 + +#define MIC_OS_BOOTSTATUS_SUCCESS 1 +#define MIC_OS_BOOTSTATUS_BOOT_0 2 // Initial state of uOS boot +#define MIC_OS_BOOTSTATUS_ERROR_VERSION_MISMATCH 3 +#define MIC_OS_BOOTSTATUS_ERROR 4 + +#define MIC_HOST_DEFAULT 6 // Only value accepted so do not change + +#define MIC_ENG_APPLICATION 0 +#define MIC_ENG_PAGING 1 +#define MIC_ENG_VIDEO 2 +#define MIC_ENG_HIGHPRIORITY 3 +#define MIC_ENG_MAX_SUPPORTED_ENGINES 4 + +struct ringbuf_memdesc +{ + uint64_t address; // Location of the ring buffer + uint32_t size; // size of ring buffer + uint32_t reserved; // pad +}; + +struct mic_bootparam +{ + uint64_t bp_version; + + union + { + uint32_t bp_bootstatus; + uint64_t bp_reserved; + }; + + uint64_t bp_vcons_addr; + uint64_t bp_vcons_size; + uint64_t bp_shdata_addr; + uint64_t bp_shdata_size; + struct ringbuf_memdesc bp_ringbuf[MIC_ENG_MAX_SUPPORTED_ENGINES]; + + uint64_t bp_unused0; + uint64_t bp_unused1; + uint64_t bp_unused2; + uint64_t bp_unused3; + uint64_t bp_unused4; + uint64_t bp_unused5; + uint64_t bp_unused6; + uint64_t bp_unused7; + + uint64_t bp_engstate_addr; + + struct ringbuf_memdesc bp_unused8; + + uint64_t bp_unused9; + uint64_t bp_unused10; + uint64_t bp_unused11; + +}; + +struct host_bootparam +{ + uint64_t bp_version; + + union + { + uint64_t bp_host_type; + uint64_t bp_reserved; + }; + + uint64_t bp_vcons_addr; + uint64_t bp_vcons_size; + + uint64_t bp_unused0; + + uint64_t bp_engstate_addr; + + struct ringbuf_memdesc bp_ringbuf[MIC_ENG_MAX_SUPPORTED_ENGINES]; + + uint64_t bp_dmabuf_size[MIC_ENG_MAX_SUPPORTED_ENGINES]; + + uint64_t bp_unused1; + uint64_t bp_unused2; + + uint64_t bp_aper_size; + + uint8_t bp_unused3[36]; + uint64_t bp_unused4; + + struct ringbuf_memdesc bp_unused5; + + uint64_t bp_unused6; + uint64_t bp_unused7; + + uint32_t bp_watchdog_timeout; +}; + +struct enginestate_mic +{ + uint32_t writeOffset __attribute__((aligned(64))); + uint32_t lastCompletedFence __attribute__((aligned(64))); + uint32_t fenceWhenPreempted __attribute__((aligned(64))); + uint32_t preemptOffset __attribute__((aligned(64))); +}; + diff --git a/include/mic/compl_buf_ring.h b/include/mic/compl_buf_ring.h new file mode 100644 index 0000000..4882525 --- /dev/null +++ b/include/mic/compl_buf_ring.h @@ -0,0 +1,220 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. 
This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef COMPL_BUF_RING_H +#define COMPL_BUF_RING_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mic_dma_md.h" +#ifndef _MIC_SCIF_ +#include "micscif.h" +#include "micscif_smpt.h" +#endif +#define MAX_POLL_TAIL_READ_RETRIES 20 + +/* + * Assuming read/write to int is atomic + * This can't be used as generic ring because of update_tail() + * One entry is left in the ring to differentiate between ring being empty and + * full + */ +struct compl_buf_ring { + int head; + int tail; + int size; + uint64_t tail_location; + dma_addr_t tail_phys; +}; + +/* + * FIXME: + * Function calls pci_map_single etc, return type needs to indicate + * an error + */ +static __always_inline void init_ring(struct compl_buf_ring *ring, int size, + int device_num) +{ +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + ring->head = 0; + ring->tail = 0; + ring->size = size; + ring->tail_location = (uint64_t) kmalloc(sizeof(uint64_t), GFP_ATOMIC); + BUG_ON(!ring->tail_location); + *(int*)ring->tail_location = -1; +#ifdef _MIC_SCIF_ + ring->tail_phys = virt_to_phys((void*)ring->tail_location); +#else + micscif_pci_dev(device_num, &pdev); + + ring->tail_phys = mic_map_single(device_num - 1, pdev, (void *)ring->tail_location, + sizeof(uint64_t)); + if (mic_map_error(ring->tail_phys)) + printk(KERN_ERR "mic_map returned error please help\n"); +#endif +} + +static __always_inline void uninit_ring(struct compl_buf_ring *ring, + int device_num) +{ +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + ring->head = 0; + ring->tail = 0; + ring->size = 0; +#ifndef _MIC_SCIF_ + micscif_pci_dev(device_num, &pdev); + mic_unmap_single(device_num - 1, pdev, ring->tail_phys, sizeof(uint64_t)); +#endif + kfree((void *)ring->tail_location); +} + +static __always_inline int incr_rb_index(int cur_index, int ring_size) +{ + return((cur_index + 1) % ring_size); +} + +/* + * Tail location has the index that has been recently processed by dma engine + * But, tail has to point to the index that will be processed next + * So increment the tail + */ +static __always_inline void update_tail(struct compl_buf_ring *ring, int new_tail) +{ + ring->tail = new_tail; +} + +static __always_inline int read_tail(struct compl_buf_ring *ring) +{ + return incr_rb_index(*(volatile int*)ring->tail_location, ring->size); +} + +/* + * This fn. 
assumes no one else is updating head + * Returns - avaliable space + * 0 - if no space is available + */ +static __always_inline bool avail_space_in_ring(struct compl_buf_ring *ring) +{ + int count = 0, max_num_retries = MAX_POLL_TAIL_READ_RETRIES, num_retries = 0; + int head = ring->head, tail = ring->tail; +retry: + if (head > tail) + count = (tail - 0) + (ring->size - head); + else if (tail > head) + count = tail - head; + else + return ring->size - 1; + + if (1 != count) + return count - 1; + + num_retries++; + if (num_retries == max_num_retries) + return 0; + cpu_relax(); + + ring->tail = read_tail(ring); + tail = ring->tail; + + goto retry; +} + +/* + * Used for polling + */ +static __always_inline bool is_entry_processed(struct compl_buf_ring *ring, int index) +{ + int head = ring->head, tail = ring->tail; + if (head < tail) { + if (index >= head && index < tail) + return 1; + } else { + if (index >= head || index < tail) + return 1; + } + return 0; +} + +static __always_inline void incr_head(struct compl_buf_ring *ring) +{ + ring->head = incr_rb_index(ring->head, ring->size); +} + +/* + * This function is not reentrant + * It is expected that the user of this func, will call incr_head() if allocated + * buffer is used + */ +static __always_inline int allocate_buffer(struct compl_buf_ring *ring) +{ + if (avail_space_in_ring(ring)) + return ring->head; + else + return -1; +} +#endif diff --git a/include/mic/io_interface.h b/include/mic/io_interface.h new file mode 100644 index 0000000..755a381 --- /dev/null +++ b/include/mic/io_interface.h @@ -0,0 +1,217 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
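The compl_buf_ring.h helpers above form a small producer-side protocol: reserve a slot with allocate_buffer(), commit it with incr_head(), and let the DMA engine report progress by writing the last-processed index to tail_phys, which read_tail()/update_tail() fold back into the ring. A minimal sketch of that flow, assuming the card-side (_MIC_SCIF_) build; the 128-entry ring size and device number 0 are placeholders, not values from the patch:

static void compl_ring_example(void)
{
        struct compl_buf_ring ring;
        int idx;

        init_ring(&ring, 128, 0);             /* 128-entry ring, device 0     */

        idx = allocate_buffer(&ring);         /* reserve the next free slot   */
        if (idx >= 0) {
                /* program a DMA status-update descriptor that writes 'idx'
                 * to ring.tail_phys once the preceding work has completed */
                incr_head(&ring);             /* commit the slot just used    */
        }

        update_tail(&ring, read_tail(&ring)); /* pick up progress made by HW  */
        if (idx >= 0 && is_entry_processed(&ring, idx))
                pr_debug("entry %d completed\n", idx);

        uninit_ring(&ring, 0);                /* free and unmap the tail word */
}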
+ */ + +/* Contains common definitions for Windows and Linux IO Interface */ + +#ifndef __IO_INTERFACE_H__ +#define __IO_INTERFACE_H__ + +/* + * The host driver exports sysfs entries in + * /sys/class/mic/micX/ + * The "/sys/class/mic/micX/state" entry reflects the state of the + * card as it transitions from hardware reset through booting an image + * + * All the other entries have valid values when the state entry is either + * "ready" or "online" + */ + +/* + * ----------------------------------------- + * IOCTL interface information + * ----------------------------------------- + */ + +#define IOCTL_FLASHCMD _IOWR('c', 5, struct ctrlioctl_flashcmd *) +#define IOCTL_CARDMEMCPY _IOWR('c', 8, struct ctrlioctl_cardmemcpy *) + +typedef enum _product_knc_stepping_t +{ + KNC_A_STEP, + KNC_B0_STEP, + KNC_C_STEP, + KNC_B1_STEP +} product_knc_stepping_t; + +typedef enum { + FLASH_CMD_ABORT, + FLASH_CMD_READ, + FLASH_CMD_WRITE, + FLASH_CMD_VERSION, + RAS_CMD, + RAS_CMD_INJECT_REPAIR, + RAS_CMD_CORE_DISABLE, + RAS_CMD_CORE_ENABLE, + RAS_CMD_ECC_DISABLE = 0xD, + RAS_CMD_ECC_ENABLE = 0xE, + RAS_CMD_EXIT = 0xF, + /* Driver only commands that are not passed to RASMM */ + FLASH_CMD_READ_DATA, + FLASH_CMD_STATUS, +} MIC_FLASH_CMD_TYPE; + +/** + * struct ctrlioctl_flashcmd: + * + * \param brdnum board for which IOCLT is requested + * \param type arguments needed for the uos escape call + * \param data size of escape arguments + * \param len uos escape opecode + * + * This structure is used for IOCTL_FLASHCMD. + * + * This IOCTL can only be issued when /sys/class/mic/mic0/state returns "online" + * after it has been set to "boot:flash" + */ +struct ctrlioctl_flashcmd { + uint32_t brdnum; + MIC_FLASH_CMD_TYPE type; + void *data; + uint32_t len; +}; + + +/* + * IN/OUT structure used by MIC_FLASH_CMD_TYPE FLASH_CMD_VERSION + * This structure is passed in as data in above command + */ +#define MAX_FLASH_VER_STRLEN 16 +struct version_struct { + uint16_t hdr_ver; + uint16_t odm_ver;//revision for ODM change for flash + uint64_t upd_time_bcd; + uint8_t upd_ver[MAX_FLASH_VER_STRLEN]; // 16 bytes for flash version + uint64_t mfg_time_bcd; + uint8_t mfg_ver[MAX_FLASH_VER_STRLEN]; // 16 bytes for flash version +}; + +/* + * status values returned in MIC_FLASH_CMD_TYPE FLASH_CMD_STATUS + */ +typedef enum { + FLASH_IDLE, + FLASH_CMD_IN_PROGRESS, + FLASH_CMD_COMPLETED, + FLASH_CMD_FAILED, + FLASH_CMD_AUTH_FAILED, + FLASH_SMC_CMD_IN_PROGRESS, + FLASH_SMC_CMD_COMPLETE, + FLASH_SMC_CMD_FAILED, + FLASH_SMC_CMD_AUTH_FAILED, + FLASH_CMD_INVALID = 0xF, +} MIC_FLASH_STATUS; + +struct flash_stat { + MIC_FLASH_STATUS status; + uint32_t percent; + uint32_t smc_status; + uint32_t cmd_data; + uint32_t mm_debug; +}; + +typedef enum { + DBOX, + SBOX, +} MMIO_REGISTER_TYPE; + +/** + * struct ctrlioctl_cardmemcpy: + * + * \param brdnum board for which IOCLT is requested + * \param start card side physical address from which the copy will start + * \param size offset of the register from data is to be read + * \param dest user buffer in which data is to be copied + * + * This structure is used for IOCTL_MMIOREAD. 
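A minimal user-space sketch of the IOCTL_FLASHCMD path described above, issuing FLASH_CMD_VERSION once the card state has been set to "boot:flash" and reports "online". The device node path "/dev/mic/ctrl" and board number 0 are assumptions made purely for illustration; only the structures and the ioctl code come from this header:

/* assumes the definitions from io_interface.h are visible */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

int query_flash_version(void)
{
        struct version_struct ver = { 0 };
        struct ctrlioctl_flashcmd cmd = {
                .brdnum = 0,                  /* board mic0                  */
                .type   = FLASH_CMD_VERSION,
                .data   = &ver,
                .len    = sizeof(ver),
        };
        int fd = open("/dev/mic/ctrl", O_RDWR);  /* assumed node name        */

        if (fd < 0)
                return -1;
        if (ioctl(fd, IOCTL_FLASHCMD, &cmd) == 0)
                printf("flash version: %s\n", (char *)ver.upd_ver);
        close(fd);
        return 0;
}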
+ */ +struct ctrlioctl_cardmemcpy { + uint32_t brdnum; + uint64_t start; + uint64_t size; + void *dest; +}; + +/* + * FIXME:: All the typedefines and structures below and their references need + * to be cleaned up from the driver code + *--------------------------------------------------------------------------- + */ + +typedef enum _product_family_t +{ + FAMILY_UNKNOWN = 0, + FAMILY_ABR, + FAMILY_KNC +} product_family_t; + +typedef enum { + USAGE_MODE_NORMAL = 0, + USAGE_MODE_MAINTENANCE, + USAGE_MODE_ZOMBIE, + USAGE_MODE_MEMDIAG, + USAGE_MODE_NORMAL_RESTRICTED, + USAGE_MODE_NOP, + USAGE_MODE_MAX, + +} CARD_USAGE_MODE; + +/* + * SBOX register definitions + * TODO: Remove the bit fields and replace them with bitwise operators + */ +typedef union sbox_scratch1_reg { + uint32_t value; + struct { + uint32_t percent : 7; + uint32_t status : 4; + uint32_t command : 4; + uint32_t smc_status : 4; + uint32_t reserved : 5; + uint32_t cmd_data : 7; + uint32_t mm_debug : 1; + } bits; +} sbox_scratch1_reg_t; + +typedef union sbox_scratch2_reg { + uint32_t value; + struct { + uint32_t bootstrap_ready : 1; + uint32_t bsp_apic_id : 9; + uint32_t reserved : 2; + uint32_t image_addr : 20; + } bits; +} sbox_scratch2_reg_t; + +#endif //!__IO_INTERFACE_H__ diff --git a/include/mic/mic_dma_api.h b/include/mic/mic_dma_api.h new file mode 100644 index 0000000..f9caffa --- /dev/null +++ b/include/mic/mic_dma_api.h @@ -0,0 +1,170 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MIC_DMA_API_H +#define MIC_DMA_API_H + +struct dma_channel; +/* API exported by the DMA library */ + +/* + * Per MIC device (per MIC card) DMA handle. The card opens the handle to its own device. + * The host opens the handle to the DMA devices of one of the cards. 
+ */ +typedef void * mic_dma_handle_t; + +/* DMA Library Init/Uninit Routines */ +int open_dma_device(int device_num, uint8_t *mmio_va_base, mic_dma_handle_t* dma_handle); + +void close_dma_device(int device_num, mic_dma_handle_t *dma_handle); + +/* + * reserve_dma_channel - reserve a given dma channel for exclusive use + * + * @dma_handle - handle to DMA device returned by open_dma_device + * @chan_num - Channel number to be reserved + * @chan - set to point to the dma channel reserved by the call + * + * Returns < 1 on error (errorno) + * Returns 0 on success + */ +int reserve_dma_channel(mic_dma_handle_t dma_handle, int chan_num, struct dma_channel **chan); + +/* + * allocate_dma_channel - dynamically allocate a dma channel (for a short while). Will + * search for, choose, and lock down one channel for use by the calling thread. + * + * @dma_handle - handle to DMA device returned by open_dma_device + * @chan - Returns the dma_channel pointer that was allocated by the call + * + * Returns < 1 on error + * Returns 0 on success + * + * NOTE: This function grabs a lock before exiting -- the calling thread MUST NOT + * sleep, and must call free_dma_channel before returning to user-space or switching + * volantarily to another thread. Similarly, this function cannot be called from + * an interrupt context at this time. + */ +int allocate_dma_channel(mic_dma_handle_t dma_handle, struct dma_channel **chan); + +/* + * request_dma_channel - Request a specific DMA channel. + * + * @dma_handle - handle to DMA device returned by open_dma_device + * @chan - Returns the dma_channel pointer that was requested + * + * Returns: 0 on success and -ERESTARTSYS if the wait was interrupted + * or -EBUSY if the channel was not available. + * + * NOTE: This function grabs a lock before exiting -- the calling thread MUST NOT + * sleep, and must call free_dma_channel before returning to user-space or switching + * volantarily to another thread. Similarly, this function cannot be called from + * an interrupt context at this time. + */ +int request_dma_channel(struct dma_channel *chan); + +/* + * free_dma_channel - after allocating a channel, used to + * free the channel after DMAs are submitted + * + * @chan - pointer to the dma_channel struct that was allocated + * + * Returns 0 on success, < 1 on error (errorno) + * + * NOTE: This function must be called after all do_dma calls are finished, + * but can be called before the DMAs actually complete (as long as the comp_cb() + * handler in do_dma don't refer to the dma_channel struct). If called with a + * dynamically allocated dma_channel, the caller must be the thread that called + * allocate_dma_channel. When operating on a dynamic channel, free unlocks the + * mutex locked in allocate. Statically allocated channels cannot be freed, + * and calling this function with that type of channel will return an error. + */ +int free_dma_channel(struct dma_channel *chan); + +/* + * drain_dma_poll - Drain all outstanding DMA operations for a particular + * DMA channel via polling. + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +int drain_dma_poll(struct dma_channel *chan); + +/* + * drain_dma_intr - Drain all outstanding DMA operations for a particular + * DMA channel via interrupt based blocking wait. + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +int drain_dma_intr(struct dma_channel *chan); + +/* + * drain_dma_global - Drain all outstanding DMA operations for + * all online DMA channel. 
+ * @block - Is it okay to block while operations are drained? + * Return 0 on success and -errno on error. + */ +int drain_dma_global(mic_dma_handle_t dma_handle); + +#ifdef _MIC_SCIF_ +/* + * dma_suspend: DMA tasks before transition to low power state. + * @dma_handle: Handle for a DMA driver context. + */ +void dma_suspend(mic_dma_handle_t dma_handle); + +/* + * dma_resume: DMA tasks after wake up from low power state. + * @dma_handle: Handle for a DMA driver context. + */ +void dma_resume(mic_dma_handle_t dma_handle); +#else +/* + * dma_prep_suspend: DMA tasks required on host before a device can transition + * to a low power state. + * @dma_handle: Handle for a DMA driver context. + */ +void dma_prep_suspend(mic_dma_handle_t dma_handle); +#endif + +static inline void mic_dma_thread_free_chan(struct dma_channel *chan) +{ + free_dma_channel(chan); +} +#ifndef _MIC_SCIF_ +//extern struct mutex lock_dma_dev_init; +void host_dma_interrupt_handler(mic_dma_handle_t dma_handle, uint32_t sboxSicr0Reg); +#endif + +#endif /* MIC_DMA_API_H */ diff --git a/include/mic/mic_dma_lib.h b/include/mic/mic_dma_lib.h new file mode 100644 index 0000000..7b7d30a --- /dev/null +++ b/include/mic/mic_dma_lib.h @@ -0,0 +1,207 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#ifndef MIC_DMA_LIB_H +#define MIC_DMA_LIB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Program SUD for poll ring */ +#define DO_DMA_POLLING (1<<0) +/* Program SUD for interrupt ring */ +#define DO_DMA_INTR (1<<1) + +struct dma_channel; + +struct dma_completion_cb { + void (*dma_completion_func) (uint64_t cookie); + uint64_t cb_cookie; + uint8_t *temp_buf; + uint8_t *temp_buf_to_free; + bool is_cache; + uint64_t dst_offset; + uint64_t tmp_offset; + struct reg_range_t *dst_window; + size_t len; + dma_addr_t temp_phys; + int remote_node; + int header_padding; +}; + +int get_chan_num(struct dma_channel *chan); +/* + * do_dma - main dma function: perform a dma memcpy, len bytes from src to dst + * + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + * @flags - ATOMIC, called from an interrupt context (no blocking) + * @src - src physical address + * @dst - dst physical address + * @len - Length of the dma + * @comp_cb - When the DMA is complete, the struct's function will be called. NOTE! + * comp_cb(cb_cookie) is called from an interrupt context, so the + * function must not sleep or block. + * + * Return < 1 on error + * Return 0 on success and DMA is completed + * Return > 1: DMA has been queued. Return value can be polled on for completion + * (poll cookie). An example (simplified w/ no error handling). + * int cookie = do_dma(...); + * while (poll_dma_completion(cookie) == 0); + * printf("DMA now complete\n"); + */ +int do_dma(struct dma_channel *chan, int flags, + uint64_t src, uint64_t dst, size_t len, + struct dma_completion_cb *comp_cb); +/* + * poll_dma_completion - check if a DMA is complete + * + * @poll_cookie - value returned from do_dma + * + * Returns + * < 0 -> error (e.g., invalid cookie) + * 0 -> DMA pending + * 1 -> DMA completed + * + * Note: This is mostly useful after calling do_dma with a NULL comp_cb parameter, as + * it will allow the caller to wait for DMA completion. + */ +int poll_dma_completion(int poll_cookie, struct dma_channel *chan); + +/* + * do_status_update: Update physical address location with the value provided. + * Ensures all previous DMA descriptors submitted on this DMA + * channel are executed. + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + * @phys - physical address + * @value - Value to be programmed + * + * Return 0 on success and appropriate error value on error. + */ +int do_status_update(struct dma_channel *chan, uint64_t phys, uint64_t value); + +/* + * get_dma_mark: Obtain current value of DMA mark + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + * + * Return mark. 
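Putting the pieces above together, a hedged sketch of the poll-based pattern that the do_dma() comment outlines, with the error handling the simplified example omits. The flags value of 0 and the caller-supplied physical addresses are placeholders; the channel comes from allocate_dma_channel() declared in mic_dma_api.h:

static int copy_with_dma(mic_dma_handle_t dma_handle,
                         uint64_t src_phys, uint64_t dst_phys, size_t len)
{
        struct dma_channel *chan;
        int cookie, err;

        err = allocate_dma_channel(dma_handle, &chan);
        if (err)
                return err;

        /* NULL comp_cb: completion is detected by polling the cookie */
        cookie = do_dma(chan, 0 /* no ATOMIC flag assumed */, src_phys,
                        dst_phys, len, NULL);
        if (cookie > 0)                      /* queued; spin until done      */
                while (poll_dma_completion(cookie, chan) == 0)
                        cpu_relax();

        free_dma_channel(chan);              /* releases the channel lock    */
        return cookie < 0 ? cookie : 0;
}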
+ */ +int get_dma_mark(struct dma_channel *chan); + +/* + * is_current_dma_mark: Check if the dma mark provided is the current DMA mark. + * @chan - DMA channel + * @mark - DMA mark + * + * Return true on success and false on failure. + */ +bool is_current_dma_mark(struct dma_channel *chan, int mark); + +/* + * program_dma_mark: Increment the current value of the DMA mark for a DMA channel + * and program an interrupt status update descriptor which ensures that all DMA + * descriptors programmed until this point in time are completed. + * @chan - DMA channel to use for the transfer. The channel can be allocated + * dynamically by calling allocate_dma_channel, or statically by + * reserve_dma_channel. Using a channel not allocated in this way will + * result in undefined behavior. + * + * Return mark upon success and appropriate negative error value on error. + */ +int program_dma_mark(struct dma_channel *chan); + +/* + * is_dma_mark_wait: Check if the dma mark provided has been processed. + * @chan - DMA channel + * @mark - DMA mark + * + * Return true on success and false on failure. + */ +bool is_dma_mark_processed(struct dma_channel *chan, int mark); + +/* + * dma_mark_wait: Wait for the dma mark to complete. + * @chan - DMA channel + * @mark - DMA mark + * @is_interruptible - Use wait_event_interruptible() or not. + * + * Return 0 on success and appropriate error value on error. + */ +int dma_mark_wait(struct dma_channel *chan, int mark, bool is_interruptible); + +#ifndef _MIC_SCIF_ +void host_dma_lib_interrupt_handler(struct dma_channel *chan); +#endif + +#endif /* MIC_DMA_LIB_H */ diff --git a/include/mic/mic_dma_md.h b/include/mic/mic_dma_md.h new file mode 100644 index 0000000..bc8af28 --- /dev/null +++ b/include/mic/mic_dma_md.h @@ -0,0 +1,462 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MIC_DMA_MD_H +#define MIC_DMA_MD_H + +#include "mic_sbox_md.h" +#include "micsboxdefine.h" + +#define MAX_NUM_DMA_CHAN 8 +/* + * WE ASSUME 0 to __LAST_HOST_CHAN_NUM are owned by host + * Keep this in mind when changing this value + */ +#define __LAST_HOST_CHAN_NUM 3 + +#ifdef _MIC_SCIF_ +static inline int first_dma_chan(void) +{ + return __LAST_HOST_CHAN_NUM + 1; +} + +static inline int last_dma_chan(void) +{ + return MAX_NUM_DMA_CHAN - 1; +} +#else +static inline int first_dma_chan(void) +{ + return 0; +} + +static inline int last_dma_chan(void) +{ + return __LAST_HOST_CHAN_NUM; +} +#endif +enum md_mic_dma_chan_reg { + REG_DCAR = 0, + REG_DHPR, + REG_DTPR, + REG_DAUX_HI, + REG_DAUX_LO, + REG_DRAR_HI, + REG_DRAR_LO, + REG_DITR, + REG_DSTAT, + REG_DSTATWB_LO, + REG_DSTATWB_HI, + REG_DCHERR, + REG_DCHERRMSK, +}; + + +/* Pre-defined L1_CACHE_SHIFT is 6 on RH and 7 on Suse */ +#undef L1_CACHE_SHIFT +#define L1_CACHE_SHIFT 6 +#undef L1_CACHE_BYTES +#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) + +enum dma_chan_flags { + CHAN_AVAILABLE = 2, + CHAN_INUSE = 3 +}; + +/* Maximum DMA transfer size for a single memory copy descriptor */ +#define MIC_MAX_DMA_XFER_SIZE (((1U) * 1024 * 1024) - L1_CACHE_BYTES) + +/* TODO: + * I think it should be 128K - 64 (even 128k - 4 may work). + * SIVA: Check this in the end + */ +/* + * The maximum number of descriptors in the DMA descriptor queue is + * 128K - 1 but since it needs to be a multiple of cache lines it is 128K - 64 + */ +#define MIC_MAX_NUM_DESC_PER_RING ((128 * 1024) - L1_CACHE_BYTES) + +/** + * enum md_mic_dma_chan_owner - Memory copy DMA channels can be Host or MIC owned. + * AES channel can only be MIC owned. + */ +enum md_mic_dma_chan_owner { + MIC_DMA_CHAN_MIC_OWNED = 0, + MIC_DMA_CHAN_HOST_OWNED +}; + +/** + * enum md_mic_dma_aes_endianness - Endianness needs to be provided + * only for the AES channel + */ +enum md_mic_dma_aes_endianness { + /* + * The following two bits are opposite of what is given in + * content protection HAS but this is how it is implemented in RTL. + */ + MIC_BIG_ENDIAN = 0, + MIC_LITTLE_ENDIAN +}; + + +/** + * struct md_mic_dma_chan - Opaque data structure for DMA channel specific fields. + */ +/* + * struct md_mic_dma_chan: DMA channel specific structure + * @in_use - true if the channel is in use and false otherwise + * @owner - host or MIC required for masking/unmasking + * interrupts and enabling channels + * @endianness - required for enabling AES channel + * @cookie - Debug cookie to identify this structure + * @num_desc_in_ring - Number of descriptors in the descriptor + * ring for this channel. 
+ */ +struct md_mic_dma_chan { + int ch_num; + atomic_t in_use; + enum md_mic_dma_chan_owner owner; + enum md_mic_dma_aes_endianness endianness; + int cookie; + uint32_t num_desc_in_ring; + uint32_t cached_tail; + uint32_t completion_count; + void *dstat_wb_loc; + dma_addr_t dstat_wb_phys; + /* Add debug/profiling stats here */ +}; + + +/* + * struct mic_dma_device - MIC DMA Device specific structure + * @chan_info - static array of MIC DMA channel specific structures + * @lock - MTX_DEF lock to synchronize allocation/deallocation of DMA channels + */ +struct mic_dma_device { + struct md_mic_dma_chan chan_info[MAX_NUM_DMA_CHAN]; + void *mm_sbox; +}; + + +/** + * union md_mic_dma_desc - Opaque data structure for DMA descriptor format. + */ +/* TODO: Change bitfields to portable masks */ +union md_mic_dma_desc { + union { + struct { + uint64_t rsvd0; + uint64_t rsvd1:60; + uint64_t type:4; + } nop; + struct { + uint64_t sap:40; + uint64_t index:3; + uint64_t rsvd0:3; + uint64_t length:14; + uint64_t rsvd1:4; + uint64_t dap:40; + uint64_t resd:15; + uint64_t twb:1; + uint64_t intr:1; + uint64_t c:1; + uint64_t co:1; + uint64_t ecy:1; + uint64_t type:4; + } memcopy; + struct { + uint64_t data; + uint64_t dap:40; + uint64_t rsvdr0:19; + uint64_t intr:1; + uint64_t type:4; + } status; + struct { + uint64_t data:32; + uint64_t rsvd0:32; + uint64_t dap:40; + uint64_t rsvd1:20; + uint64_t type:4; + } general; + struct { + uint64_t data; + uint64_t rsvd0:53; + uint64_t cs:1; + uint64_t index:3; + uint64_t h:1; + uint64_t sel:2; + uint64_t type:4; + } keynoncecnt; + struct { + uint64_t skap:40; + uint64_t ski:3; + uint64_t rsvd0:21; + uint64_t rsvd1:51; + uint64_t di:3; + uint64_t rsvd2:6; + uint64_t type:4; + } key; + } desc; + struct { + uint64_t qw0; + uint64_t qw1; + } qwords; +}; + +/* Initialization functions */ +void md_mic_dma_init(struct mic_dma_device *dma_dev, uint8_t *mmio_va_base); +void md_mic_dma_uninit(struct mic_dma_device *dma_dev); +void md_mic_dma_chan_init_attr(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan); +void md_mic_dma_chan_mask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan); +void md_mic_dma_chan_unmask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan); +void md_mic_dma_chan_set_desc_ring(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan, + phys_addr_t desc_ring_phys_addr, + uint32_t num_desc); +void md_mic_dma_enable_chan(struct mic_dma_device *dma_dev, uint32_t chan_num, bool enable); +/* API */ +struct md_mic_dma_chan *md_mic_dma_request_chan(struct mic_dma_device *dma_dev, + enum md_mic_dma_chan_owner owner); +void md_mic_dma_free_chan(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan); + +static uint32_t mic_dma_reg[8][13] = { + {SBOX_DCAR_0, SBOX_DHPR_0, SBOX_DTPR_0, SBOX_DAUX_HI_0, SBOX_DAUX_LO_0, SBOX_DRAR_HI_0, + SBOX_DRAR_LO_0, SBOX_DITR_0, SBOX_DSTAT_0, + SBOX_DSTATWB_LO_0, SBOX_DSTATWB_HI_0, SBOX_DCHERR_0, SBOX_DCHERRMSK_0}, + {SBOX_DCAR_1, SBOX_DHPR_1, SBOX_DTPR_1, SBOX_DAUX_HI_1, SBOX_DAUX_LO_1, SBOX_DRAR_HI_1, + SBOX_DRAR_LO_1, SBOX_DITR_1, SBOX_DSTAT_1, + SBOX_DSTATWB_LO_1, SBOX_DSTATWB_HI_1, SBOX_DCHERR_1, SBOX_DCHERRMSK_1}, + {SBOX_DCAR_2, SBOX_DHPR_2, SBOX_DTPR_2, SBOX_DAUX_HI_2, SBOX_DAUX_LO_2, SBOX_DRAR_HI_2, + SBOX_DRAR_LO_2, SBOX_DITR_2, SBOX_DSTAT_2, + SBOX_DSTATWB_LO_2, SBOX_DSTATWB_HI_2, SBOX_DCHERR_2, SBOX_DCHERRMSK_2}, + {SBOX_DCAR_3, SBOX_DHPR_3, SBOX_DTPR_3, SBOX_DAUX_HI_3, SBOX_DAUX_LO_3, SBOX_DRAR_HI_3, + SBOX_DRAR_LO_3, SBOX_DITR_3, SBOX_DSTAT_3, + 
SBOX_DSTATWB_LO_3, SBOX_DSTATWB_HI_3, SBOX_DCHERR_3, SBOX_DCHERRMSK_3}, + {SBOX_DCAR_4, SBOX_DHPR_4, SBOX_DTPR_4, SBOX_DAUX_HI_4, SBOX_DAUX_LO_4, SBOX_DRAR_HI_4, + SBOX_DRAR_LO_4, SBOX_DITR_4, SBOX_DSTAT_4, + SBOX_DSTATWB_LO_4, SBOX_DSTATWB_HI_4, SBOX_DCHERR_4, SBOX_DCHERRMSK_4}, + {SBOX_DCAR_5, SBOX_DHPR_5, SBOX_DTPR_5, SBOX_DAUX_HI_5, SBOX_DAUX_LO_5, SBOX_DRAR_HI_5, + SBOX_DRAR_LO_5, SBOX_DITR_5, SBOX_DSTAT_5, + SBOX_DSTATWB_LO_5, SBOX_DSTATWB_HI_5, SBOX_DCHERR_5, SBOX_DCHERRMSK_5}, + {SBOX_DCAR_6, SBOX_DHPR_6, SBOX_DTPR_6, SBOX_DAUX_HI_6, SBOX_DAUX_LO_6, SBOX_DRAR_HI_6, + SBOX_DRAR_LO_6, SBOX_DITR_6, SBOX_DSTAT_6, + SBOX_DSTATWB_LO_6, SBOX_DSTATWB_HI_6, SBOX_DCHERR_6, SBOX_DCHERRMSK_6}, + {SBOX_DCAR_7, SBOX_DHPR_7, SBOX_DTPR_7, SBOX_DAUX_HI_7, SBOX_DAUX_LO_7, SBOX_DRAR_HI_7, + SBOX_DRAR_LO_7, SBOX_DITR_7, SBOX_DSTAT_7, + SBOX_DSTATWB_LO_7, SBOX_DSTATWB_HI_7, SBOX_DCHERR_7, SBOX_DCHERRMSK_7} +}; + +static __always_inline uint32_t +md_mic_dma_read_mmio(struct mic_dma_device *dma_dev, + int chan, enum md_mic_dma_chan_reg reg) +{ + return mic_sbox_read_mmio(dma_dev->mm_sbox, mic_dma_reg[chan][reg]); +} + +static __always_inline void +md_mic_dma_write_mmio(struct mic_dma_device *dma_dev, int chan, + enum md_mic_dma_chan_reg reg, uint32_t value) +{ + mic_sbox_write_mmio(dma_dev->mm_sbox, mic_dma_reg[chan][reg], value); +} + +#ifdef DEBUG +#ifndef KASSERT +#define KASSERT(x, y, ...) \ + do { \ + if(!x) \ + printk(y, ##__VA_ARGS__);\ + BUG_ON(!x); \ + } while(0) +#endif +#define CHECK_CHAN(chan) \ + do { \ + KASSERT((chan), "NULL DMA channel\n"); \ + KASSERT((DMA_CHAN_COOKIE == chan->cookie), \ + "Bad DMA channel cookie 0x%x\n", chan->cookie); \ + KASSERT(atomic_read(&(chan->in_use)), "DMA Channel not in use\n"); \ + } while(0) +#else // DEBUG +#ifndef KASSERT +#define KASSERT(x, y, ...) 
\ + do { \ + if(!x) \ + printk(y, ##__VA_ARGS__);\ + BUG_ON(!x); \ + } while(0) +#endif +#define CHECK_CHAN(chan) + +#endif // DEBUG + +struct mic_dma_ctx_t; +void md_mic_dma_chan_set_dstat_wb(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan); + +void md_mic_dma_chan_set_dcherr_msk(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan, uint32_t mask); + +static __always_inline void +md_mic_dma_chan_write_head(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan, uint32_t head) +{ + uint32_t chan_num; + CHECK_CHAN(chan); + chan_num = chan->ch_num; + KASSERT((head < chan->num_desc_in_ring), + "head 0x%x > num_desc_in_ring 0x%x chan_num %d\n", + head, chan->num_desc_in_ring, chan_num); + md_mic_dma_write_mmio(dma_dev, chan_num, REG_DHPR, head); +} + +uint32_t md_mic_dma_chan_read_head(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan); +uint32_t md_mic_dma_chan_read_tail(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan); + +#define TAIL_PTR_READ_RETRIES 500000 +#define HW_CMP_CNT_MASK 0x1ffff +static __always_inline uint32_t +md_avail_desc_ring_space(struct mic_dma_device *dma_dev, bool is_astep, + struct md_mic_dma_chan *chan, uint32_t head, uint32_t required) +{ + uint32_t count = 0, max_num_retries = TAIL_PTR_READ_RETRIES, num_retries = 0; + uint32_t tail = chan->cached_tail; +retry: + if (head > tail) + count = (tail - 0) + (chan->num_desc_in_ring - head); + else if (tail > head) + count = tail - head; + else + return (chan->num_desc_in_ring - 1); + + if (count > required) { + return count - 1; + } else { + if (is_astep) + tail = md_mic_dma_chan_read_tail(dma_dev, chan); + else + tail = HW_CMP_CNT_MASK & md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DSTAT); + } + chan->cached_tail = tail; + num_retries++; + if (num_retries == max_num_retries) + return 0; + cpu_relax(); + goto retry; +} + +bool md_mic_dma_chan_intr_pending(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan); +phys_addr_t md_mic_dma_chan_get_desc_ring_phys(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan); +phys_addr_t md_mic_dma_chan_get_dstatwb_phys(struct mic_dma_device *dma_dev, + struct md_mic_dma_chan *chan); +inline uint32_t md_mic_dma_read_mmio(struct mic_dma_device *dma_dev, + int chan, enum md_mic_dma_chan_reg reg); + +/* Descriptor programming helpers */ +void md_mic_dma_prep_nop_desc(union md_mic_dma_desc *desc); + +/** + * md_mic_dma_memcpy_desc - Prepares a memory copy descriptor + * @src_phys: Source Physical Address must be cache line aligned + * @dst_phys: Destination physical address must be cache line aligned + * @size: Size of the transfer should not be 0 and must be a multiple + * of cache line size + */ +static __always_inline void +md_mic_dma_memcpy_desc(union md_mic_dma_desc *desc, + uint64_t src_phys, + uint64_t dst_phys, + uint64_t size) +{ + KASSERT((desc != 0), ("NULL desc")); + KASSERT((ALIGN(src_phys - (L1_CACHE_BYTES - 1), L1_CACHE_BYTES) == src_phys), + "src not cache line aligned 0x%llx\n", (unsigned long long)src_phys); + KASSERT((ALIGN(dst_phys - (L1_CACHE_BYTES - 1), L1_CACHE_BYTES) == dst_phys), + "dst not cache line aligned 0x%llx\n", (unsigned long long)dst_phys); + KASSERT(((size != 0) && (size <= MIC_MAX_DMA_XFER_SIZE) && + (ALIGN(size - (L1_CACHE_BYTES - 1), L1_CACHE_BYTES) == size)), + "size > MAX_DMA_XFER_SIZE size 0x%llx", (unsigned long long)size); + + desc->qwords.qw0 = 0; + desc->qwords.qw1 = 0; + desc->desc.memcopy.type = 1; + desc->desc.memcopy.sap = src_phys; + desc->desc.memcopy.dap = 
dst_phys; + desc->desc.memcopy.length = (size >> L1_CACHE_SHIFT); +} + +/** + * md_mic_dma_prep_status_desc - Prepares a status descriptor + * @data - Value to be updated by the DMA engine @ dst_phys + * @dst_phys: Destination physical address + * @generate_intr: Interrupt must be generated when the DMA HW + * completes processing this descriptor + */ +static __always_inline void +md_mic_dma_prep_status_desc(union md_mic_dma_desc *desc, uint64_t data, + uint64_t dst_phys, bool generate_intr) +{ + KASSERT((desc != 0), ("NULL desc")); + + desc->qwords.qw0 = 0; + desc->qwords.qw1 = 0; + desc->desc.memcopy.type = 2; + desc->desc.status.data = data; + desc->desc.status.dap = dst_phys; + if (generate_intr) + desc->desc.status.intr = 1; +} + +/** + * md_mic_dma_prep_gp_desc - Prepares a general purpose descriptor + * @data - Value to be updated by the DMA engine @ dst_phys + * @dst_phys: Destination physical address + */ +static __always_inline void +md_mic_dma_prep_gp_desc(union md_mic_dma_desc *desc, uint32_t data, uint64_t dst_phys) +{ + KASSERT((desc != 0), ("NULL desc")); + + desc->qwords.qw0 = 0; + desc->qwords.qw1 = 0; + desc->desc.general.type = 3; + desc->desc.general.data = data; + desc->desc.general.dap = dst_phys; +} +/* Debug functions */ +void md_mic_dma_print_debug(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan); +#endif diff --git a/include/mic/mic_macaddr.h b/include/mic/mic_macaddr.h new file mode 100644 index 0000000..520d735 --- /dev/null +++ b/include/mic/mic_macaddr.h @@ -0,0 +1,104 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
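As an illustration of the descriptor helpers from mic_dma_md.h above, the following hedged sketch fills two consecutive ring slots: a memory-copy descriptor followed by a status descriptor that raises an interrupt once the copy has executed. The ring pointer, addresses and length are placeholders, ring wrap-around is omitted for brevity, and src/dst/len must respect the cache-line alignment the KASSERTs enforce:

static void fill_copy_and_status(union md_mic_dma_desc *ring, uint32_t head,
                                 uint64_t src_phys, uint64_t dst_phys,
                                 uint64_t len, uint64_t done_flag_phys)
{
        md_mic_dma_memcpy_desc(&ring[head], src_phys, dst_phys, len);
        md_mic_dma_prep_status_desc(&ring[head + 1], 1 /* done marker */,
                                    done_flag_phys, true /* interrupt */);
        /* the new head (head + 2) would then be handed to hardware with
         * md_mic_dma_chan_write_head() */
}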
+ */ + +#ifndef __MIC_MACADDR_H__ +#define __MIC_MACADDR_H__ + +#define MAC_RUN_SHIFT 1 +#define MAC_DATE_SHIFT 16 + +/** + * mic_get_mac_from_serial - Create MAC address from serial number string + * \param serial string containing serial number + * \param mac data space to place MAC address + * \param host if true set least significant bit for hosts MAC + * + * mic_get_mac_from_serial() creates a MAC address from a MIC host's serial number. + * + * A MAC address contains 6 bytes of which the first 3 are either assigned by IEEE + * or bit 2 of the first byte is set to indicate locally created. While awaiting + * our assigned values, the first the bytes have been set to 'MIC' with the local + * bit also being set and multicast not. The result is actually seeing "NIC". + * + * The last 3 bytes, or 24 bits are set in the pattern: + * o 8 bits are created by subtracting 1 from the cards year character mulitplied + * by the work week field. By subtracting 1 the year starts at 2012 and there + * is enough room to accout for MIC cards build through 2017 + * o 15 bits are the work week running number from the serail number. This allows + * space for 32k of boards to be build in any one week. + * o 1 bit is used to indicated whether it is the host or card end of the virtual + * network connection. The bit being set is the card MAC address. + * + * Upon successful completion, mic_get_mac_from_serial returns zero. If the serial + * number does not have "KC" (for Knights Corner) as the 3rd and 4th characters + * then the serial number is invalid and a non zero value is returned. + */ + +static int +mic_get_mac_from_serial(char *serial, unsigned char *mac, int host) +{ + unsigned long final; + int y; + int ww; + + if ((serial == NULL) || (serial[2] != 'K') || (serial[3] != 'C')) + return 1; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,39) + y = kstrtoul(&serial[7], 10, &final); // y is to shutup Suse build +#else + final = simple_strtoul(&serial[7], NULL, 10); +#endif + + final = final << MAC_RUN_SHIFT; /* Card side will add one */ + + y = (serial[4] - '1'); /* start year 2012 end year 2016 */ + ww = ((serial[5] - '0') * 10) + (serial[6] - '0'); + + final += (y * ww) << MAC_DATE_SHIFT; + + if (host) /* least bit indicates host MAC */ + final++; + + mac[0] = 0x4c; + mac[1] = 0x79; + mac[2] = 0xba; + mac[3] = (final >> 16) & 0xff; + mac[4] = (final >> 8) & 0xff; + mac[5] = final & 0xff; + return 0; +} + +#endif /* __MIC_MACADDR_H__ */ diff --git a/include/mic/mic_pm.h b/include/mic/mic_pm.h new file mode 100644 index 0000000..12b492c --- /dev/null +++ b/include/mic/mic_pm.h @@ -0,0 +1,442 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. 
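A short sketch of how mic_get_mac_from_serial() above would typically be called, once for each end of the virtual ethernet link. The serial string is a made-up value following the "..KC<year><workweek><run>" layout the function parses and is not a real board serial:

static void mac_from_serial_example(void)
{
        unsigned char host_mac[6], card_mac[6];
        char serial[] = "ADKC31340012345";   /* hypothetical serial string */

        if (mic_get_mac_from_serial(serial, host_mac, 1) ||
            mic_get_mac_from_serial(serial, card_mac, 0)) {
                printk(KERN_ERR "invalid MIC serial number\n");
                return;
        }
        /* host_mac and card_mac now differ only in the lowest bit of byte 5 */
}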
This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* common power management specific header defines for host and card */ + +#include "io_interface.h" + +#if !defined(__MIC_PM_H) +#define __MIC_PM_H + +#define PC6_TIMER 10 + +#define IOCTL_PM_SendIoctl _IOC(_IOC_READ|_IOC_WRITE, 'l', 2, 0) + +#define MAX_HW_IDLE_WAIT_COUNT 100 +#define PC3_EXIT_WAIT_COUNT 1000 +#define PM_SEND_MODE SCIF_SEND_BLOCK +#define PM_RECV_MODE SCIF_RECV_BLOCK +#define SET_VID_RETRY_COUNT 3 + +#define PM_NODE_MAGIC_BIT 31 +#define PM_NODE_IDLE (1 << PM_NODE_MAGIC_BIT) + +#define PM_PRINT(fmt, ...) printk("[ %s : %d ]:"fmt, \ + __func__, __LINE__, ##__VA_ARGS__) + +#define PM_DEBUG(fmt, ...) pr_debug("[ %s : %d ]:"fmt, \ + __func__, __LINE__, ##__VA_ARGS__) + +#define PM_ENTRY PM_DEBUG("==> %s\n", __func__) +#define PM_EXIT PM_DEBUG("<== %s\n", __func__) +#define PM_MAJOR_VERSION 1 +#define PM_MINOR_VERSION 0 + + +typedef enum _PM_MESSAGE { + PM_MESSAGE_PC3READY, + PM_MESSAGE_OPEN, + PM_MESSAGE_OPEN_ACK, + PM_MESSAGE_CLOSE, + PM_MESSAGE_CLOSE_ACK, + PM_MESSAGE_TEST, + PM_MESSAGE_MAX, +} PM_MESSAGE; + +typedef enum _PM_IDLE_STATE { + PM_IDLE_STATE_PC0, + PM_IDLE_STATE_PC3_READY, + PM_IDLE_STATE_PC3, + PM_IDLE_STATE_PC6, + PM_IDLE_STATE_LOST, + PM_IDLE_STATE_MAX, +} PM_IDLE_STATE; + +#ifndef _MIC_SCIF_ +typedef enum { + IOCTL_pm_send, + IOCTL_pm_recv, + IOCTL_pm_send_check, + IOCTL_pm_get_idle_state, + IOCTL_pm_exit_idle_state, + // For emulator testing + IOCTL_pmemu_pc3_entry, + IOCTL_pmemu_pc3_exit, + IOCTL_pmemu_pc6_entry, + IOCTL_pmemu_pc6_exit, + IOCTL_pmemu_dpc3_entry, + IOCTL_pmemu_dpc3_exit, + IOCTL_get_dependency_graph, + IOCTL_get_dependency_set, + IOCTL_pm_toggle_connection, + IOCTL_pm_idlestate_exit, + IOCTL_pm_enable_dpc3_testing, + IOCTL_pm_device_restart, +} PM_IOCTL_TYPE; + +struct pm_ioctl_header { + uint32_t node; + PM_IOCTL_TYPE opcode; + uint64_t arglen; +}; +#define PM_TEST_MSG_BODY "PM Test Message" +#endif + +//Generic PM Header. Has message type and length of message. 
+typedef struct _pm_msg_header { + PM_MESSAGE opcode; + uint32_t len; +} pm_msg_header; + +typedef struct _pm_msg_unit_test +{ + pm_msg_header header; + void * buf; +} pm_msg_unit_test; + +typedef struct _pm_version +{ + uint16_t major_version; + uint16_t minor_version; + +} pm_version; + +typedef struct _pm_msg_pm_options +{ + uint8_t pc3_enabled; + uint8_t pc6_enabled; + pm_version version; +} pm_msg_pm_options; + +#ifndef _MIC_SCIF_ +// PM IOCTLs +struct pm_scif_send { + struct pm_ioctl_header header; + uint32_t length; + void *buf; +}; + +struct pm_scif_recv { + struct pm_ioctl_header header; + uint32_t length; + void *buf; +}; + +struct pm_scif_send_check { + struct pm_ioctl_header header; + uint32_t length; + void *buf; +}; + +typedef struct pm_get_idle_state { + struct pm_ioctl_header header; + PM_IDLE_STATE *idle_state; +} pm_get_idle_state_t; + +typedef struct pm_exit_idle_state { + struct pm_ioctl_header header; + PM_IDLE_STATE idle_state; +}pm_exit_idlestate_t; + +typedef struct dependency_graph { + struct pm_ioctl_header header; + uint32_t** depmtrx; +} dependency_graph_t; + +struct io_dependency_set { + struct pm_ioctl_header header; + int is_active_set; + uint64_t dep_set; +}; + +struct io_enable_dpc3_test { + struct pm_ioctl_header header; + uint32_t enable_test; + uint32_t state; +}; + +typedef struct _pm_status { + uint32_t hoststate_reg; + uint32_t cardstate_reg; + uint32_t c3waketimer_reg; + uint32_t pcucontrol_reg; + uint32_t uos_pcucontrol_reg; + uint32_t corevolt_reg; + uint32_t gpmctrl_reg; + uint32_t idle_state; + uint32_t board_id; +} pm_status_t; + +typedef struct _test_msg_ctrl { + uint32_t action; +} test_msg_ctrl_t; + +typedef struct _connection_info { + int32_t conn_state; + int32_t local_port; + int32_t local_node; + int32_t remote_port; + int32_t remote_node; + int32_t num_messages_queued; +} connection_info_t; + +#endif //_MIC_SCIF_ + +#if defined(CONFIG_MK1OM) + +#define SBOX_SVID_CONTROL 0x00004110 +#define SBOX_PCU_CONTROL 0x00004114 +#define SBOX_HOST_PMSTATE 0x00004118 +#define SBOX_UOS_PMSTATE 0x0000411c +#define SBOX_C3WAKEUP_TIMER 0x00004120 +#define GBOX_PM_CTRL 0x0000413C +#define SBOX_UOS_PCUCONTROL 0x0000412C + +#elif defined(CONFIG_ML1OM) || defined(WINDOWS) + +#define DBOX_SWFOX1 0x00002414 +#define DBOX_SWFOX2 0x00002418 +#define DBOX_SWFOX3 0x0000241C +#define DBOX_SWFOX4 0x00002420 +#define DBOX_SWFOX5 0x00002424 +#define DBOX_SWFOX6 0x00002428 +#define DBOX_SWFOX7 0x0000242C +#define DBOX_SWF0X8 0x00002430 + +#define SBOX_SVID_CONTROL DBOX_SWFOX1 +#define SBOX_PCU_CONTROL DBOX_SWFOX2 +#define SBOX_HOST_PMSTATE DBOX_SWFOX3 +#define SBOX_UOS_PMSTATE DBOX_SWFOX4 +#define SBOX_C3WAKEUP_TIMER DBOX_SWFOX5 +#define GBOX_PM_CTRL DBOX_SWFOX6 +#define SBOX_UOS_PCUCONTROL DBOX_SWFOX7 + +#else +#error Neither CONFIG_ML1OM nor CONFIG_MK1OM defined +#endif + +#define SBOX_SVIDCTRL_SVID_DOUT(x) ((x) & 0x1ff) +#define SBOX_SVIDCTRL_SVID_DOUT_BITS(x) ((x) & 0x1ff) +#define SBOX_SVIDCTRL_SVID_CMD(x) (((x) >> 9) & 0x1ff) +#define SBOX_SVIDCTRL_SVID_CMD_BITS(x) (((x) & 0x1ff) << 9) +#define SBOX_SVIDCTRL_SVID_DIN(x) (((x) >> 18) & 0x3ff) +#define SBOX_SVIDCTRL_SVID_ERROR(x) (((x) >> 29) & 0x1) +#define SBOX_SVIDCTRL_SVID_IDLE(x) (((x) >> 30) & 0x1) +#define SBOX_SVIDCTRL_CMD_START(x) (((x) >> 31) & 0x1) +#define SBOX_SVIDCTRL_CMD_START_BITS(x) (((x) & 0x1) << 31) +// This is not a register field, but we need to check these bits to determine parity error +#define SBOX_SVIDCTRL_ACK1ACK0(x) (((x) >> 27) & 0x11) + +#define SBOX_PCUCTRL_ENABLE_MCLK_SHUTDWN(x) ((x) 
& 0x1) +#define SBOX_PCUCTRL_ENABLE_MCLK_SHUTDWN_BITS(x) ((x) & 0x1) +#define SBOX_PCUCTRL_RING_ACTIVE(x) (((x) >> 2) & 0x1) +#define SBOX_PCUCTRL_RING_ACTIVE_BITS(x) (((x) & 0x1) << 2) +#define SBOX_PCUCTRL_PREVENT_AUTOC3_EXIT(x) (((x) >> 3) & 0x1) +#define SBOX_PCUCTRL_PREVENT_AUTOC3_EXIT_BITS(x) (((x) & 0x1) << 3) +#define SBOX_PCUCTRL_PWRGOOD_MASK(x) (((x) >> 17) & 0x1) +#define SBOX_PCUCTRL_PWRGOOD_MASK_BITS(x) (((x) & 0x1) << 17) +#define SBOX_PCUCTRL_MCLK_PLL_LCK(x) (((x) >> 16) & 0x1) +#define SBOX_THERMAL_STS_ALERT_LOG(x) (((x) >> 3) & 0x1) +#define SBOX_THERMAL_STS_ALERT_LOG_BITS(x) (((x) & 0x1) << 3) + +// used by host to communicate card idle state to uos +#define SBOX_HPMSTATE_STATUS(x) ((x) & 0xff) +#define SBOX_HPMSTATE_STATUS_BITS(x) ((x) & 0xff) +#define SBOX_HPMSTATE_MINVID(x) (((x) >> 8) & 0xff) +#define SBOX_HPMSTATE_TDPVID(x) (((x) >> 16) & 0xff) +// used by uos to communicate card idle state to host +#define SBOX_UPMSTATE_STATUS(x) ((x) & 0xff) +#define SBOX_UPMSTATE_STATUS_BITS(x) ((x) & 0xff) + +#define SBOX_C3WAKEUP_TIME(x) ((x) & 0xffff) +#define SBOX_C3WAKEUP_TIME_BITS(x) ((x) & 0xffff) + +#define IN_PCKGC6_BITS(x) (((x) & 0x1) << 1) +#define KNC_SVID_ADDR 0 +#define KNC_SETVID_FAST 1 +#define KNC_SETVID_SLOW 2 +#define KNC_SETVID_ATTEMPTS 50 + + +typedef union _sbox_pcu_ctrl { + uint32_t value; + struct { + uint32_t enable_mclk_pl_shutdown :1; + uint32_t mclk_enabled :1; + uint32_t ring_active :1; + uint32_t prevent_auto_c3_exit :1; + uint32_t ghost_active :1; + uint32_t tcu_active :1; + uint32_t itp_scllk_gate_disable :1; + uint32_t itp_pkg_c3_disable :1; + uint32_t scratch :1; + uint32_t unallocated_1 :1; + uint32_t sysint_active :1; + uint32_t sclk_grid_off_disable :1; + uint32_t icc_dvo_ssc_cg_enable :1; + uint32_t icc_core_ref_clk_cg_enable :1; + uint32_t icc_gddr_ssc_cg_enable :1; + uint32_t icc_pll_disable :1; + uint32_t mclk_pll_lock :1; + uint32_t grpB_pwrgood_mask :1; + uint32_t unallocated_2 :14; + } bits; + +} sbox_pcu_ctrl_t; + +typedef union _sbox_host_pm_state { + uint32_t value; + struct { + uint32_t host_pm_state :7; + uint32_t abort_not_processed :1; + uint32_t min_vid :8; + uint32_t tdp_vid :8; + uint32_t unallocated :8; + } bits; + +} sbox_host_pm_state_t; + +typedef union _sbox_uos_pm_state { + uint32_t value; + struct { + uint32_t uos_pm_state :8; + uint32_t unallocated :24; + }bits; + +} sbox_uos_pm_state_t; + +typedef union _c3_wakeup_timer { + uint32_t value; + struct { + uint32_t c3_wake_time :16; + uint32_t unallocated_1 :1; + uint32_t c3_wake_timeout :1; + uint32_t unallocated_2 :14; + } bits; + +} c3_wakeup_timer_t; + +typedef union _sbox_svid_control { + uint32_t value; + struct { + uint32_t svid_dout :9; + uint32_t svid_cmd :9; + uint32_t svid_din :11; + uint32_t svid_error :1; + uint32_t svid_idle :1; + uint32_t cmd_start :1; + } bits; + +} sbox_svid_control; + +typedef union _gbox_pm_control { + uint32_t value; + struct { + uint32_t c6_disable :1; + uint32_t in_pckgc6 :1; + uint32_t gbox_inM3 :2; + uint32_t unallocated :28; + } bits; + +} gbox_pm_control; + +typedef union _sbox_thermal_sts_interrupt { + uint32_t value; + struct { + uint32_t mclk_ratio_status :1; + uint32_t mclk_ratio_log :1; + uint32_t alert_status :1; + uint32_t alert_log :1; + uint32_t gpu_hot_status :1; + uint32_t gpu_hot_log :1; + uint32_t pwr_alert_status :1; + uint32_t pwr_alert_log :1; + uint32_t pmu_status :1; + uint32_t pmu_log :1; + uint32_t etc_freeze :1; + uint32_t unallocated :21; + }bits; + +} sbox_thermal_sts_interrupt; + +typedef union 
_sboxUosPcucontrolReg +{ + uint32_t value; + struct + { + uint32_t c3_wakeuptimer_enable :1; + uint32_t enable_mclk_pll_shutdown :1; + uint32_t spi_clk_disable :1; + uint32_t unallocated :29; + } bits; + +} sbox_uos_pcu_ctrl_t; + +typedef union _sboxCorefreqReg +{ + uint32_t value; + struct + { + uint32_t ratio :12; // bit 0-11 Ratio + uint32_t rsvd0 : 3; // bit 12-14 + uint32_t fuseratio : 1; // bit 15 If overclocking is enabled, setting this bit will default the goal ratio to the fuse value. + uint32_t asyncmode : 1; // bit 16 Async Mode Bit 16, Reserved Bits 20:17 used to be ExtClkFreq, + uint32_t rsvd1 : 9; // bit 17-25 + uint32_t ratiostep : 4; // bit 26-29 Power throttle ratio-step + uint32_t jumpratio : 1; // bit 30 Power throttle jump at once + uint32_t booted : 1; // bit 31 Booted: This bit selects between the default MCLK Ratio (600MHz) and the programmable MCLK ratio. 0=default 1=programmable. + } bits; + +} sbox_core_freq_t; + +typedef union _sboxCoreVoltReg +{ + uint32_t value; + struct + { + uint32_t vid :8; + uint32_t unallocated :24; + } bits; + +} sbox_core_volt_t; + +typedef enum _PM_CONNECTION_STATE { + PM_CONNECTING, + PM_CONNECTED, + PM_DISCONNECTING, + PM_DISCONNECTED +} PM_CONNECTION_STATE; + +#endif //__MIC_PM_H diff --git a/include/mic/mic_sbox_md.h b/include/mic/mic_sbox_md.h new file mode 100644 index 0000000..4ad8cf9 --- /dev/null +++ b/include/mic/mic_sbox_md.h @@ -0,0 +1,90 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#ifndef MIC_SBOX_MD_H +#define MIC_SBOX_MD_H +/* + * TODO: SBOX MCA Handling + */ +#ifdef _MIC_SCIF_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // _MIC_SCIF_ + +#ifdef _MIC_SCIF_ +void *mic_sbox_md_init(void); +void mic_sbox_md_uninit(void *mic_sbox_mmio_va); +#endif + +static inline uint32_t mic_sbox_read_mmio(void *mic_sbox_mmio_va, uint32_t offset) +{ + return readl((uint8_t *)mic_sbox_mmio_va + offset); +} + +static inline void mic_sbox_write_mmio(void *mic_sbox_mmio_va, uint32_t offset, uint32_t value) +{ + writel(value, (uint8_t *)mic_sbox_mmio_va + offset); +} +#endif diff --git a/include/mic/mic_virtio.h b/include/mic/mic_virtio.h new file mode 100644 index 0000000..4222e7d --- /dev/null +++ b/include/mic/mic_virtio.h @@ -0,0 +1,70 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + Structures which are passed from host to MIC card through + uOS kernel command line option, virtio_addr. 
+ + (C) Copyright 2012 Intel Corporation + Author: Caz Yokoyama + */ +#ifndef MIC_VIRTIO_H +#define MIC_VIRTIO_H + +struct vb_shared { + uint32_t host_features; + uint32_t client_features; + bool update; + struct vring vring; + struct virtio_blk_config blk_config; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0)) + uint32_t unused; +#endif +} __attribute__((aligned(8))); + +struct mic_virtblk { +#ifdef HOST + struct vb_shared vb_shared; + void *vblk; /* keep vblk in vhost for virtblk */ +#else + struct vb_shared *vb_shared; + void *vdev; /* keep vdev in virtio for virtblk */ +#endif +}; + +uint64_t mic_vhost_pm_disconnect_node(uint64_t node_bitmask, enum disconn_type type); +void mic_vhost_blk_stop(bd_info_t *bd_info); + +#endif // MIC_VIRTIO_H diff --git a/include/mic/micbaseaddressdefine.h b/include/mic/micbaseaddressdefine.h new file mode 100644 index 0000000..15e3991 --- /dev/null +++ b/include/mic/micbaseaddressdefine.h @@ -0,0 +1,111 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +/* "Raw" register offsets & bit specifications for MIC */ +#ifndef _MIC_MICBASEDEFINE_REGISTERS_H_ +#define _MIC_MICBASEDEFINE_REGISTERS_H_ + +#define COMMON_MMIO_BOX_SIZE (1<<16) + +/* CBOX register base defines */ +#define CBOX_BASE 0x0000000000ULL + +/* TXS register base defines */ +#define TXS0_BASE 0x0800780000ULL +#define TXS1_BASE 0x0800770000ULL +#define TXS2_BASE 0x0800760000ULL +#define TXS3_BASE 0x0800750000ULL +#define TXS4_BASE 0x0800740000ULL +#define TXS5_BASE 0x0800730000ULL +#define TXS6_BASE 0x0800720000ULL +#define TXS7_BASE 0x0800710000ULL +#define TXS8_BASE 0x08006E0000ULL + +/* GBOX register base defines */ +#define GBOX0_BASE 0x08007A0000ULL +#define GBOX1_BASE 0x0800790000ULL +#define GBOX2_BASE 0x0800700000ULL +#define GBOX3_BASE 0x08006F0000ULL + +#define GBOX_CHANNEL0_BASE 0x00000000 +#define GBOX_CHANNEL1_BASE 0x00000800 +#define GBOX_CHANNEL2_BASE 0x00001000 + +/* VBOX register base defines */ +#define VBOX_BASE 0x08007B0000ULL + +/* DBOX register base defines */ +#define DBOX_BASE 0x08007C0000ULL + +/* SBOX register base defines */ +#define SBOX_BASE 0x08007D0000ULL + +#define MIC_GTT_BASE 0x0800800000ULL +#define MIC_GTT_TOP 0x080083FFFFULL +#define MIC_GTT_SIZE (MIC_GTT_TOP - MIC_GTT_BASE + 1) + +/* Aperture defines */ +#define MIC_APERTURE_BASE 0x0900000000ULL +#define MIC_APERTURE_TOP 0x090FFFFFFFULL +#define MIC_APERTURE_SIZE (MIC_APERTURE_TOP - MIC_APERTURE_BASE + 1) + +/* SPI flash defines */ +#define MIC_SPI_BOOTLOADER_BASE 0x0FFFFF0000ULL +#define MIC_SPI_BOOTLOADER_TOP 0x0FFFFFFFFFULL +#define MIC_SPI_BOOTLOADER_SIZE (MIC_SPI_BOOTLOADER_TOP - MIC_SPI_BOOTLOADER_BASE + 1) +#define MIC_SPI_2ND_STAGE_BASE 0x0FFFFE0000ULL +#define MIC_SPI_2ND_STAGE_TOP 0x0FFFFEFFFFULL +#define MIC_SPI_2ND_STAGE_SIZE (MIC_SPI_2ND_STAGE_TOP - MIC_SPI_2ND_STAGE_BASE + 1) +#define MIC_SPI_PARAMETER_BASE 0x0FFFFDC000ULL +#define MIC_SPI_PARAMETER_TOP 0x0FFFFDFFFFULL +#define MIC_SPI_PARAMETER_SIZE (MIC_SPI_PARAMETER_TOP - MIC_SPI_PARAMETER_BASE + 1) + +/* remote defines */ +#define MIC_REMOTE_BASE 0x1000000000ULL +#define MIC_REMOTE_TOP 0x7FFFFFFFFFULL +#define MIC_REMOTE_SIZE (MIC_REMOTE_TOP - MIC_REMOTE_BASE + 1) + +/* system defines */ +#define MIC_SYSTEM_BASE 0x8000000000ULL +#define MIC_SYSTEM_TOP 0xFFFFFFFFFFULL +#define MIC_SYSTEM_PAGE_SIZE 0x0400000000ULL +#define MIC_SYSTEM_SIZE (MIC_SYSTEM_TOP - MIC_SYSTEM_BASE + 1) + +#define MIC_PHYSICAL_ADDRESS_BITS 40 +#define MIC_PHYSICAL_ADDRESS_SPACE_SIZE ( 1ULL << MIC_PHYSICAL_ADDRESS_BITS ) + +#define MIC_HOST_MMIO_BASE DBOX_BASE + +#endif diff --git a/include/mic/micdboxdefine.h b/include/mic/micdboxdefine.h new file mode 100644 index 0000000..cba2c7a --- /dev/null +++ b/include/mic/micdboxdefine.h @@ -0,0 +1,48 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. 
Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* "Raw" register offsets & bit specifications for Intel MIC (KNF) */ +#ifndef _MIC_DBOXDEFINE_REGISTERS_H_ +#define _MIC_DBOXDEFINE_REGISTERS_H_ + +#define DBOX_SWF0X0 0x00002410 + + +#define DBOX_SWF1X0 0x00003410 +#define DBOX_SWF1X1 0x00003414 +#define DBOX_SWF1X2 0x00003418 +#define DBOX_SWF1X3 0x0000341C + +#endif diff --git a/include/mic/micpsmi.h b/include/mic/micpsmi.h new file mode 100644 index 0000000..f9c3b90 --- /dev/null +++ b/include/mic/micpsmi.h @@ -0,0 +1,62 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#ifndef _MIC_PSMI_H +#define _MIC_PSMI_H + +struct mic_psmi_pte { + uint64_t pa; +}; + +struct mic_psmi_ctx +{ + unsigned char enabled; + + struct mic_psmi_pte *dma_tbl; + int dma_tbl_size; + dma_addr_t dma_tbl_hndl; + uint64_t dma_mem_size; + int nr_dma_pages; + + struct mic_psmi_pte *va_tbl; +}; + +#define MIC_PSMI_PAGE_ORDER (7) +#define MIC_PSMI_PAGE_SIZE (PAGE_SIZE << MIC_PSMI_PAGE_ORDER) +#define MIC_PSMI_SIGNATURE 0x4B434F52494D5350L + +int mic_psmi_open(struct file *filp); + +#endif /* _MIC_PSMI_H */ diff --git a/include/mic/micsboxdefine.h b/include/mic/micsboxdefine.h new file mode 100644 index 0000000..36b1b30 --- /dev/null +++ b/include/mic/micsboxdefine.h @@ -0,0 +1,255 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +/* "Raw" register offsets & bit specifications for Intel MIC (KNF) */ +#ifndef _MIC_SBOXDEFINE_REGISTERS_H_ +#define _MIC_SBOXDEFINE_REGISTERS_H_ + + +#define SBOX_OC_I2C_ICR 0x00001000 +#define SBOX_THERMAL_STATUS 0x00001018 +#define SBOX_THERMAL_INTERRUPT_ENABLE 0x0000101C +#define SBOX_STATUS_FAN1 0x00001024 +#define SBOX_STATUS_FAN2 0x00001028 +#define SBOX_SPEED_OVERRIDE_FAN 0x0000102C +#define SBOX_BOARD_TEMP1 0x00001030 +#define SBOX_BOARD_TEMP2 0x00001034 +#define SBOX_BOARD_VOLTAGE_SENSE 0x00001038 +#define SBOX_CURRENT_DIE_TEMP0 0x0000103C +#define SBOX_CURRENT_DIE_TEMP1 0x00001040 +#define SBOX_CURRENT_DIE_TEMP2 0x00001044 +#define SBOX_MAX_DIE_TEMP0 0x00001048 +#define SBOX_MAX_DIE_TEMP1 0x0000104C +#define SBOX_MAX_DIE_TEMP2 0x00001050 +#define SBOX_ELAPSED_TIME_LOW 0x00001074 +#define SBOX_ELAPSED_TIME_HIGH 0x00001078 +#define SBOX_FAIL_SAFE_OFFSET 0x00002004 +#define SBOX_CURRENT_CLK_RATIO 0x00003004 +#define SBOX_SMPT00 0x00003100 +#define SBOX_SMPT02 0x00003108 +#define SBOX_RGCR 0x00004010 +#define SBOX_DSTAT 0x00004014 +#define SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 0x00005808 +#define SBOX_PCIE_BAR_ENABLE 0x00005CD4 +#define SBOX_SICR0 0x00009004 +#define SBOX_SICE0 0x0000900C +#define SBOX_SICC0 0x00009010 +#define SBOX_SICR1 0x0000901C +#define SBOX_SICC1 0x00009028 +#ifdef CONFIG_MK1OM +#define SBOX_PMU_PERIOD_SEL 0x00001070 +#define SBOX_THERMAL_STATUS_INTERRUPT 0x0000107C +#define SBOX_THERMAL_STATUS_2 0x00001080 +#define SBOX_THERMAL_TEST_2 0x00001084 +#define SBOX_COREFREQ 0x00004100 +#define SBOX_COREVOLT 0x00004104 +#define SBOX_MEMORYFREQ 0x00004108 +#define SBOX_MEMVOLT 0x0000410C +//add defines used by drivers that are the same as DOORBELL_INTX +#define SBOX_SDBIC0 0x0000CC90 +#define SBOX_SDBIC1 0x0000CC94 +#define SBOX_SDBIC2 0x0000CC98 +#define SBOX_SDBIC3 0x0000CC9C +#else +#define SBOX_SDBIC0 0x00009030 +#define SBOX_SDBIC1 0x00009034 +#define SBOX_SDBIC2 0x00009038 +#define SBOX_SDBIC3 0x0000903C +#define SBOX_COREFREQ 0x00004040 +#define SBOX_COREVOLT 0x00004044 +#define SBOX_MEMORYFREQ 0x00004048 +#define SBOX_MEMVOLT 0x0000404C +#define SBOX_RSC0 0x0000CC10 +#define SBOX_RSC1 0x0000CC14 + +#endif +#define SBOX_MXAR0 0x00009040 +#define SBOX_MXAR0_K1OM 0x00009044 +#define SBOX_MXAR1 0x00009044 +#define SBOX_MXAR2 0x00009048 +#define SBOX_MXAR3 0x0000904C +#define SBOX_MXAR4 0x00009050 +#define SBOX_MXAR5 0x00009054 +#define SBOX_MXAR6 0x00009058 +#define SBOX_MXAR7 0x0000905C +#define SBOX_MXAR8 0x00009060 +#define SBOX_MXAR9 0x00009064 +#define SBOX_MXAR10 0x00009068 +#define SBOX_MXAR11 0x0000906C +#define SBOX_MXAR12 0x00009070 +#define SBOX_MXAR13 0x00009074 +#define SBOX_MXAR14 0x00009078 +#define SBOX_MXAR15 0x0000907C +#define SBOX_MSIXPBACR 0x00009080 +#define SBOX_MSIXPBACR_K1OM 0x00009084 +#define SBOX_DCAR_0 0x0000A000 +#define SBOX_DHPR_0 0x0000A004 +#define SBOX_DTPR_0 0x0000A008 +#define SBOX_DAUX_LO_0 0x0000A00C +#define SBOX_DAUX_HI_0 0x0000A010 +#define SBOX_DRAR_LO_0 0x0000A014 +#define SBOX_DRAR_HI_0 0x0000A018 +#define SBOX_DITR_0 0x0000A01C +#define SBOX_DSTAT_0 0x0000A020 +#define SBOX_DSTATWB_LO_0 0x0000A024 +#define SBOX_DSTATWB_HI_0 0x0000A028 +#define SBOX_DCHERR_0 0x0000A02C +#define SBOX_DCHERRMSK_0 0x0000A030 +#define SBOX_DCAR_1 0x0000A040 +#define SBOX_DHPR_1 0x0000A044 +#define SBOX_DTPR_1 0x0000A048 +#define SBOX_DAUX_LO_1 0x0000A04C +#define SBOX_DAUX_HI_1 0x0000A050 +#define SBOX_DRAR_LO_1 0x0000A054 +#define SBOX_DRAR_HI_1 0x0000A058 +#define SBOX_DITR_1 0x0000A05C +#define SBOX_DSTAT_1 0x0000A060 +#define 
SBOX_DSTATWB_LO_1 0x0000A064 +#define SBOX_DSTATWB_HI_1 0x0000A068 +#define SBOX_DCHERR_1 0x0000A06C +#define SBOX_DCHERRMSK_1 0x0000A070 +#define SBOX_DCAR_2 0x0000A080 +#define SBOX_DHPR_2 0x0000A084 +#define SBOX_DTPR_2 0x0000A088 +#define SBOX_DAUX_LO_2 0x0000A08C +#define SBOX_DAUX_HI_2 0x0000A090 +#define SBOX_DRAR_LO_2 0x0000A094 +#define SBOX_DRAR_HI_2 0x0000A098 +#define SBOX_DITR_2 0x0000A09C +#define SBOX_DSTAT_2 0x0000A0A0 +#define SBOX_DSTATWB_LO_2 0x0000A0A4 +#define SBOX_DSTATWB_HI_2 0x0000A0A8 +#define SBOX_DCHERR_2 0x0000A0AC +#define SBOX_DCHERRMSK_2 0x0000A0B0 +#define SBOX_DCAR_3 0x0000A0C0 +#define SBOX_DHPR_3 0x0000A0C4 +#define SBOX_DTPR_3 0x0000A0C8 +#define SBOX_DAUX_LO_3 0x0000A0CC +#define SBOX_DAUX_HI_3 0x0000A0D0 +#define SBOX_DRAR_LO_3 0x0000A0D4 +#define SBOX_DRAR_HI_3 0x0000A0D8 +#define SBOX_DITR_3 0x0000A0DC +#define SBOX_DSTAT_3 0x0000A0E0 +#define SBOX_DSTATWB_LO_3 0x0000A0E4 +#define SBOX_DSTATWB_HI_3 0x0000A0E8 +#define SBOX_DCHERR_3 0x0000A0EC +#define SBOX_DCHERRMSK_3 0x0000A0F0 +#define SBOX_DCAR_4 0x0000A100 +#define SBOX_DHPR_4 0x0000A104 +#define SBOX_DTPR_4 0x0000A108 +#define SBOX_DAUX_LO_4 0x0000A10C +#define SBOX_DAUX_HI_4 0x0000A110 +#define SBOX_DRAR_LO_4 0x0000A114 +#define SBOX_DRAR_HI_4 0x0000A118 +#define SBOX_DITR_4 0x0000A11C +#define SBOX_DSTAT_4 0x0000A120 +#define SBOX_DSTATWB_LO_4 0x0000A124 +#define SBOX_DSTATWB_HI_4 0x0000A128 +#define SBOX_DCHERR_4 0x0000A12C +#define SBOX_DCHERRMSK_4 0x0000A130 +#define SBOX_DCAR_5 0x0000A140 +#define SBOX_DHPR_5 0x0000A144 +#define SBOX_DTPR_5 0x0000A148 +#define SBOX_DAUX_LO_5 0x0000A14C +#define SBOX_DAUX_HI_5 0x0000A150 +#define SBOX_DRAR_LO_5 0x0000A154 +#define SBOX_DRAR_HI_5 0x0000A158 +#define SBOX_DITR_5 0x0000A15C +#define SBOX_DSTAT_5 0x0000A160 +#define SBOX_DSTATWB_LO_5 0x0000A164 +#define SBOX_DSTATWB_HI_5 0x0000A168 +#define SBOX_DCHERR_5 0x0000A16C +#define SBOX_DCHERRMSK_5 0x0000A170 +#define SBOX_DCAR_6 0x0000A180 +#define SBOX_DHPR_6 0x0000A184 +#define SBOX_DTPR_6 0x0000A188 +#define SBOX_DAUX_LO_6 0x0000A18C +#define SBOX_DAUX_HI_6 0x0000A190 +#define SBOX_DRAR_LO_6 0x0000A194 +#define SBOX_DRAR_HI_6 0x0000A198 +#define SBOX_DITR_6 0x0000A19C +#define SBOX_DSTAT_6 0x0000A1A0 +#define SBOX_DSTATWB_LO_6 0x0000A1A4 +#define SBOX_DSTATWB_HI_6 0x0000A1A8 +#define SBOX_DCHERR_6 0x0000A1AC +#define SBOX_DCHERRMSK_6 0x0000A1B0 +#define SBOX_DCAR_7 0x0000A1C0 +#define SBOX_DHPR_7 0x0000A1C4 +#define SBOX_DTPR_7 0x0000A1C8 +#define SBOX_DAUX_LO_7 0x0000A1CC +#define SBOX_DAUX_HI_7 0x0000A1D0 +#define SBOX_DRAR_LO_7 0x0000A1D4 +#define SBOX_DRAR_HI_7 0x0000A1D8 +#define SBOX_DITR_7 0x0000A1DC +#define SBOX_DSTAT_7 0x0000A1E0 +#define SBOX_DSTATWB_LO_7 0x0000A1E4 +#define SBOX_DSTATWB_HI_7 0x0000A1E8 +#define SBOX_DCHERR_7 0x0000A1EC +#define SBOX_DCHERRMSK_7 0x0000A1F0 +#define SBOX_DCR 0x0000A280 +#define SBOX_APICICR0 0x0000A9D0 +#define SBOX_APICICR1 0x0000A9D8 +#define SBOX_APICICR2 0x0000A9E0 +#define SBOX_APICICR3 0x0000A9E8 +#define SBOX_APICICR4 0x0000A9F0 +#define SBOX_APICICR5 0x0000A9F8 +#define SBOX_APICICR6 0x0000AA00 +#define SBOX_APICICR7 0x0000AA08 +#define SBOX_SCRATCH0 0x0000AB20 +#define SBOX_SCRATCH1 0x0000AB24 +#define SBOX_SCRATCH2 0x0000AB28 +#define SBOX_SCRATCH3 0x0000AB2C +#define SBOX_SCRATCH4 0x0000AB30 +#define SBOX_SCRATCH5 0x0000AB34 +#define SBOX_SCRATCH6 0x0000AB38 +#define SBOX_SCRATCH7 0x0000AB3C +#define SBOX_SCRATCH8 0x0000AB40 +#define SBOX_SCRATCH9 0x0000AB44 +#define SBOX_SCRATCH10 0x0000AB48 +#define SBOX_SCRATCH11 0x0000AB4C +#define 
SBOX_SCRATCH12 0x0000AB50 +#define SBOX_SCRATCH13 0x0000AB54 +#define SBOX_SCRATCH14 0x0000AB58 +#define SBOX_SCRATCH15 0x0000AB5C +#define SBOX_RDMASR0 0x0000B180 +#define SBOX_SBQ_FLUSH 0x0000B1A0 // Pseudo-register, not autogen, must add manually +#define SBOX_TLB_FLUSH 0x0000B1A4 +#define SBOX_GTT_PHY_BASE 0x0000C118 +#define SBOX_EMON_CNT0 0x0000CC28 +#define SBOX_EMON_CNT1 0x0000CC2C +#define SBOX_EMON_CNT2 0x0000CC30 +#define SBOX_EMON_CNT3 0x0000CC34 + +#endif diff --git a/include/mic/micscif.h b/include/mic/micscif.h new file mode 100644 index 0000000..c0b6223 --- /dev/null +++ b/include/mic/micscif.h @@ -0,0 +1,900 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#ifndef MICSCIF_H +#define MICSCIF_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MODULE_SCIF_ +#include +#include +#include +#include +#include +#include +#endif /* MODULE_SCIF */ + +#include +#include "scif.h" +#include "mic/micbaseaddressdefine.h" +#include "mic/micsboxdefine.h" + +/* The test runs in a separate thread context from the bottom + * half that processes messages from the card and setup p2p + * when these run concurrently, p2p messages get lost since they + * may be consumed by the test thread + */ +//#define ENABLE_TEST // Used to enable testing at board connect +#ifdef MIC_IS_EMULATION +#define TEST_LOOP 2 +#else +#define TEST_LOOP 2000 +#endif + +//#define P2P_HACK 0 +#include "scif.h" +#include "scif_ioctl.h" + +#define SCIF_READY_MAGIC_NUM 0x1eedfee0 + +#ifndef SCIF_MAJOR +#define SCIF_MAJOR 0 /* Use dynamic major number by default */ +#endif + +#define SCIF_HOST_NODE 0 // By default the host is always node zero + +#define SCIF_RMA_TEMP_CACHE_LIMIT 0x20000 +/* + * The overhead for proxying a P2P DMA read to convert it to + * a DMA write by sending a SCIF Node QP message has been + * seen to be higher than programming a P2P DMA Read on self + * for transfer sizes less than the PROXY_DMA_THRESHOLD. + * The minimum threshold is different for Jaketown versus + * Ivytown and tuned for best DMA performance. + */ +#define SCIF_PROXY_DMA_THRESHOLD_JKT (32 * 1024ULL) +#define SCIF_PROXY_DMA_THRESHOLD_IVT (1024 * 1024ULL) + +//#define RMA_DEBUG 0 + +/* Pre-defined L1_CACHE_SHIFT is 6 on RH and 7 on Suse */ +#undef L1_CACHE_SHIFT +#define L1_CACHE_SHIFT 6 +#undef L1_CACHE_BYTES +#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) + +#define MI_EPLOCK_HELD (true) +#define MAX_RDMASR 8 + +// Device wide SCIF information +struct micscif_info { + uint32_t mi_nodeid; // Node ID this node is to others. + + struct mutex mi_conflock; // Configuration lock (used in p2p setup) + uint32_t mi_maxid; // Max known board ID + uint32_t mi_total; // Total number of running interfaces + uint32_t mi_nr_zombies; // Keep track of the number of zombie EP. + unsigned long mi_mask; // bit mask of online scif interfaces + uint64_t mi_nr_ioremap; // Keep track of number of ioremap() calls on the host + // to decide when to purge aliases for performance. + spinlock_t mi_eplock; + spinlock_t mi_connlock; + spinlock_t mi_rmalock; // Synchronize access to list of temporary registered + // windows to be destroyed. + struct mutex mi_fencelock; // Synchronize access to list of remote fences requested. + struct mutex mi_event_cblock; + spinlock_t mi_nb_connect_lock; + + struct list_head mi_uaccept; // List of user acceptreq waiting for acceptreg + struct list_head mi_listen; // List of listening end points + struct list_head mi_zombie; // List of zombie end points with pending RMA's. + struct list_head mi_connected; // List of end points in connected state + struct list_head mi_disconnected; // List of end points in disconnected state + struct list_head mi_rma; // List of temporary registered windows to be destroyed. + struct list_head mi_rma_tc; // List of temporary + // registered & cached windows + // to be destroyed. + struct list_head mi_fence; // List of remote fence requests. 
+ struct list_head mi_event_cb; /* List of event handlers registered */ + struct list_head mi_nb_connect_list; +#ifdef CONFIG_MMU_NOTIFIER + struct list_head mi_mmu_notif_cleanup; +#endif + struct notifier_block mi_panic_nb; +#ifndef _MIC_SCIF_ + /* The host needs to keep track of node dependencies in form of graph. + * This will need to be dynamically grown to support hotplug. + */ + uint32_t **mi_depmtrx; + /* + * Wait queue used for blocking while waiting for nodes + * to respond for disconnect message sent from host. + */ + wait_queue_head_t mi_disconn_wq; + /* stus of node remove operation*/ + uint64_t mi_disconnect_status; + atomic_long_t mi_unique_msgid; +#endif + /* + * Watchdog timeout on the host. Timer expiry will result in the host + * treating the remote node as a lost node. Default value is + * DEFAULT_WATCHDOG_TO and can be modified to a value greater than 1 + * second via SCIF sysfs watchdog_to entry. + */ + int mi_watchdog_to; // Watchdog timeout + int mi_watchdog_enabled; // Watchdog timeout enabled + int mi_watchdog_auto_reboot; // Watchdog auto reboot enabled + struct workqueue_struct *mi_misc_wq; // Workqueue for miscellaneous SCIF tasks. + struct work_struct mi_misc_work; +#ifdef CONFIG_MMU_NOTIFIER + struct workqueue_struct *mi_mmu_notif_wq; // Workqueue for MMU notifier cleanup tasks. + struct work_struct mi_mmu_notif_work; +#endif + int nr_gtt_entries; // GTT Debug Counter to detect leaks + uint64_t nr_2mb_pages; // Debug Counter for number of 2mb pages. + uint64_t nr_4k_pages; // Debug Counter for number of 4K pages + uint8_t en_msg_log; + wait_queue_head_t mi_exitwq; + unsigned long mi_rma_tc_limit; + uint64_t mi_proxy_dma_threshold; +#ifdef RMA_DEBUG + atomic_long_t rma_mm_cnt; + atomic_long_t rma_unaligned_cpu_cnt; + atomic_long_t rma_alloc_cnt; + atomic_long_t rma_pin_cnt; +#ifdef CONFIG_MMU_NOTIFIER + atomic_long_t mmu_notif_cnt; +#endif +#endif +#ifdef _MIC_SCIF_ + int mi_intr_rcnt[MAX_RDMASR]; // Ref count to track SCIF Interrupt Handlers +#endif + struct workqueue_struct *mi_conn_wq; + struct work_struct mi_conn_work; +}; + +extern struct micscif_info ms_info; + +#define SCIF_NODE_MAGIC_BIT 63 +/* Magic value used to indicate a remote idle node without grabbing any locks */ +#define SCIF_NODE_IDLE (1ULL << SCIF_NODE_MAGIC_BIT) + +enum scif_state { + SCIFDEV_NOTPRESENT, + SCIFDEV_INIT, + SCIFDEV_RUNNING, + SCIFDEV_SLEEPING, + SCIFDEV_STOPPING, + SCIFDEV_STOPPED +}; + +extern bool mic_p2p_enable; +extern bool mic_p2p_proxy_enable; +extern bool mic_reg_cache_enable; +extern bool mic_ulimit_check; +/* p2p mapping from node id to peer id */ +struct scif_p2p_info { + int ppi_peer_id; + struct scatterlist *ppi_sg[2]; + uint64_t sg_nentries[2]; // no of entries in scatterlists + dma_addr_t ppi_pa[2]; // one for mmio; one for aper + dma_addr_t ppi_mic_addr[2]; // one for mmio; one for aper + uint64_t ppi_len[2]; +#define PPI_MMIO 0 +#define PPI_APER 1 + enum scif_state ppi_disc_state; //Disconnection state of this peer node. 
+ struct list_head ppi_list; +}; + +/* one per remote node */ +struct micscif_dev { + uint16_t sd_node; + enum scif_state sd_state; + volatile void *mm_sbox; + uint64_t sd_base_addr; /* Remote node's base bus addr + * for the local node's aperture + */ +#ifndef _MIC_SCIF_ + struct list_head sd_p2p; /* List of bus addresses for + * other nodes, these are allocated + * by the host driver and are + * valid only on the host node + */ + struct delayed_work sd_watchdog_work; + wait_queue_head_t sd_watchdog_wq; + struct workqueue_struct *sd_ln_wq; + char sd_ln_wqname[16]; +#endif + + int n_qpairs; /* FIXME: + * This is always set to 1, + */ + + struct micscif_qp *qpairs; /* Same FIXME as above + * There is single qp established + * with this remote node + */ + + struct workqueue_struct *sd_intr_wq; /* sd_intr_wq & sd_intr_bh + * together constitute the workqueue + * infrastructure needed to + * run the bottom half handler + * for messages received from + * this node + */ + char sd_intr_wqname[16]; + struct work_struct sd_intr_bh; + unsigned int sd_intr_handle; + uint32_t sd_rdmasr; + struct workqueue_struct *sd_loopb_wq; + char sd_loopb_wqname[16]; + struct work_struct sd_loopb_work; + struct list_head sd_loopb_recv_q; + /* Lock to synchronize remote node state transitions */ + struct mutex sd_lock; + /* + * Global Ref count per SCIF device tracking all SCIF API's which + * might communicate across PCIe. + */ + atomic_long_t scif_ref_cnt; + /* + * Global Ref count per SCIF device tracking scif_mmap()/ + * scif_get_pages(). sd_lock protects scif_map_ref_cnt + * hence it does not need to be an atomic operation. Note that + * scif_mmap()/scif_get_pages() is not in the critical + * perf path. + */ + int scif_map_ref_cnt; + /* + * Wait queue used for blocking while waiting for nodes + * to wake up or to be removed. + */ + wait_queue_head_t sd_wq; + uint64_t sd_wait_status; +#ifdef _MIC_SCIF_ + wait_queue_head_t sd_p2p_wq; + bool sd_proxy_dma_reads; + struct delayed_work sd_p2p_dwork; + int sd_p2p_retry; +#endif + /* + * The NUMA node the peer is attached to on the host. + */ + int sd_numa_node; + /* + * Waitqueue for blocking while waiting for remote memory + * mappings to drop to zero. + */ + wait_queue_head_t sd_mmap_wq; + + /* When a nodeqp message is received, this is set. + * And it is reset by the watchdog time */ + atomic_t sd_node_alive; + int num_active_conn; +#ifdef ENABLE_TEST + struct workqueue_struct *producer; + struct workqueue_struct *consumer; + char producer_name[16]; + char consumer_name[16]; + struct work_struct producer_work; + struct work_struct consumer_work; + int count; + int test_done; +#endif // ENABLE_TEST +}; + +extern struct micscif_dev scif_dev[]; + +#include "mic/micscif_nodeqp.h" +#include "mic/micscif_nm.h" +#include "mic/micscif_smpt.h" +#include "mic/micscif_va_gen.h" +#include "mic/mic_dma_api.h" +#include "mic/mic_dma_lib.h" +#include "mic/micscif_rma.h" +#include "mic/micscif_rma_list.h" + +/* + * data structure used to sync SCIF_GET_NODE_INFO messaging + */ +struct get_node_info { + enum micscif_msg_state state; + wait_queue_head_t wq; +}; + +static inline uint64_t align_low(uint64_t data, uint32_t granularity) +{ + return ALIGN(data - (granularity - 1), granularity); +} + +#define SCIF_MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define SCIF_MAX(a, b) (((a) > (b)) ? 
(a) : (b)) + +enum endptstate { + SCIFEP_CLOSED, // Internal state + SCIFEP_UNBOUND, // External state + SCIFEP_BOUND, // External state + SCIFEP_LISTENING, // External state + SCIFEP_CONNECTED, // External state + SCIFEP_CONNECTING, // Internal state + SCIFEP_MAPPING, // Internal state + SCIFEP_CLOSING, // Internal state + SCIFEP_CLLISTEN, // Internal state + SCIFEP_DISCONNECTED, // Internal state + SCIFEP_ZOMBIE // Internal state +}; + +extern char *scif_ep_states[]; + +// Used for coordinating connection accept sequence. This is the data structure +// for the conlist in the endpoint. +struct conreq { + struct nodemsg msg; + struct list_head list; +}; + +/* Size of the RB for the Node QP */ +#define NODE_QP_SIZE 0x10000 +/* Size of the RB for the Endpoint QP */ +#define ENDPT_QP_SIZE 0x1000 + +struct endpt_qp_info { + /* Qpair for this endpoint */ + struct micscif_qp *qp; + /* + * Physical addr of the QP for Host or + * GTT offset of the QP for MIC. + * Required for unmapping the QP during close. + */ + dma_addr_t qp_offset; + /* + * Payload in a SCIF_CNCT_GNT message containing the + * physical address of the remote_qp. + */ + dma_addr_t cnct_gnt_payload; +}; + +#define SCIFEP_MAGIC 0x5c1f000000005c1f + +struct endpt { + volatile enum endptstate state; + spinlock_t lock; + + struct scif_portID port; + struct scif_portID peer; + + int backlog; + + struct endpt_qp_info qp_info; + struct endpt_rma_info rma_info; + /* + * scifdev used by this endpt to communicate with remote node. + */ + struct micscif_dev *remote_dev; + uint64_t remote_ep; + /* + * Keep track of number of connection requests. + */ + int conreqcnt; + /* + * Cache remote SCIF device state. + */ + enum scif_state sd_state; + /* + * True if the endpoint was created + * via scif_accept(..). + */ + bool accepted_ep; + /* + * Open file information used to match the id passed + * in with the flush routine. + */ + struct files_struct *files; + /* + * Reference count for functions using this endpoint. 
+ */
+ struct kref ref_count;
+ struct list_head conlist;
+ wait_queue_head_t conwq;
+ wait_queue_head_t disconwq;
+ wait_queue_head_t diswq;
+ wait_queue_head_t sendwq;
+ wait_queue_head_t recvwq;
+ struct mutex sendlock;
+ struct mutex recvlock;
+ struct list_head list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+ struct list_head mmu_list;
+#endif
+
+ struct list_head li_accept; /* pending ACCEPTREG */
+ int acceptcnt; /* pending ACCEPTREG cnt */
+ struct list_head liacceptlist; /* link to listen accept */
+ struct list_head miacceptlist; /* link to mi_uaccept */
+ struct endpt *listenep; /* associated listen ep */
+
+ /* Non-blocking connect */
+ struct work_struct conn_work;
+ struct scif_portID conn_port;
+ int conn_err;
+ int conn_async_state;
+ wait_queue_head_t conn_pend_wq;
+ struct list_head conn_list;
+};
+
+static __always_inline void
+micscif_queue_for_cleanup(struct reg_range_t *window, struct list_head *list)
+{
+ struct endpt *ep = (struct endpt *)window->ep;
+ INIT_LIST_HEAD(&window->list_member);
+ window->dma_mark = get_dma_mark(ep->rma_info.dma_chan);
+ spin_lock(&ms_info.mi_rmalock);
+ list_add_tail(&window->list_member, list);
+ spin_unlock(&ms_info.mi_rmalock);
+ queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
+}
+
+static __always_inline void
+__micscif_rma_destroy_tcw_helper(struct reg_range_t *window)
+{
+ list_del(&window->list_member);
+ micscif_queue_for_cleanup(window, &ms_info.mi_rma_tc);
+}
+
+void print_ep_state(struct endpt *ep, char *label);
+
+// Function prototypes needed by Unix/Linux drivers linking to scif
+int scif_fdopen(struct file *f);
+int scif_fdclose(struct file *f);
+int scif_process_ioctl(struct file *f, unsigned int cmd, uint64_t arg);
+int micscif_mmap(struct file *file, struct vm_area_struct *vma);
+int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd);
+void scif_munmap(struct vm_area_struct *vma);
+void scif_proc_init(void);
+void scif_proc_cleanup(void);
+int scif_user_send(scif_epd_t epd, void *msg, int len, int flags);
+int scif_user_recv(scif_epd_t epd, void *msg, int len, int flags);
+int __scif_pin_pages(void *addr, size_t len, int *out_prot,
+ int map_flags, scif_pinned_pages_t *pages);
+scif_epd_t __scif_open(void);
+int __scif_bind(scif_epd_t epd, uint16_t pn);
+int __scif_listen(scif_epd_t epd, int backlog);
+int __scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block);
+int __scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t
+*newepd, int flags);
+int __scif_close(scif_epd_t epd);
+int __scif_send(scif_epd_t epd, void *msg, int len, int flags);
+int __scif_recv(scif_epd_t epd, void *msg, int len, int flags);
+off_t __scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
+int prot_flags, int map_flags);
+int __scif_unregister(scif_epd_t epd, off_t offset, size_t len);
+int __scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
+roffset, int rma_flags);
+int __scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
+roffset, int rma_flags);
+int __scif_fence_mark(scif_epd_t epd, int flags, int *mark);
+int __scif_fence_wait(scif_epd_t epd, int mark);
+int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, off_t roff,
+uint64_t rval, int flags);
+off_t __scif_register_pinned_pages(scif_epd_t epd,
+scif_pinned_pages_t pinned_pages, off_t offset, int map_flags);
+int __scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
+struct scif_range **pages);
+int __scif_put_pages(struct scif_range *pages);
+int __scif_flush(scif_epd_t epd);
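
/*
 * Editorial example (not part of the original Intel sources): a hedged
 * sketch of how a kernel-mode client might drive the __scif_* entry points
 * declared above to reach a connected endpoint and exchange one message.
 * The helper name, the peer node/port values, and the reduced error
 * handling are illustrative assumptions; SCIF_SEND_BLOCK and
 * SCIF_RECV_BLOCK are the blocking flags from scif.h.
 */
static inline int example_scif_kernel_client(uint16_t peer_node, uint16_t peer_port)
{
	struct scif_portID dst = { .node = peer_node, .port = peer_port };
	scif_epd_t epd;
	int msg = 0;
	int err;

	epd = __scif_open();
	if (!epd)
		return -ENOMEM;

	err = __scif_bind(epd, 0);              /* 0: let SCIF assign a local port */
	if (err < 0)
		goto out;
	err = __scif_connect(epd, &dst, false); /* blocking connect */
	if (err < 0)
		goto out;
	err = __scif_send(epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
	if (err < 0)
		goto out;
	err = __scif_recv(epd, &msg, sizeof(msg), SCIF_RECV_BLOCK);
out:
	__scif_close(epd);
	return err;
}

+
+void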
micscif_misc_handler(struct work_struct *work); +void micscif_conn_handler(struct work_struct *work); + +uint16_t rsrv_scif_port(uint16_t port); +uint16_t get_scif_port(void); +void put_scif_port(uint16_t port); + +void micscif_send_exit(void); + +void scif_ref_rel(struct kref *kref_count); + +#ifdef _MODULE_SCIF_ +unsigned int micscif_poll(struct file *f, poll_table *wait); +unsigned int scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd); +unsigned int __scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep); +int micscif_flush(struct file *f, fl_owner_t id); +#endif + +#ifdef _MIC_SCIF_ +void mic_debug_init(void); +void micscif_get_node_info(void); +void scif_poll_qp_state(struct work_struct *work); +#endif +void mic_debug_uninit(void); + +#define serializing_request(x) ((void)*(volatile uint8_t*)(x)) + +// State list helper functions. +// Each of these functions must be called with the end point lock unlocked. If +// the end point is found on the list the end point returned will have its lock +// set and sflags will return the value to be used to do an unlock_irqrestore +// at the end of the calling function. +static inline struct endpt * +micscif_find_listen_ep(uint16_t port, unsigned long *sflags) +{ + struct endpt *ep = NULL; + struct list_head *pos, *tmpq; + unsigned long flags; + + spin_lock_irqsave(&ms_info.mi_eplock, flags); + list_for_each_safe(pos, tmpq, &ms_info.mi_listen) { + ep = list_entry(pos, struct endpt, list); + if (ep->port.port == port) { + *sflags = flags; + spin_lock(&ep->lock); + spin_unlock(&ms_info.mi_eplock); + return ep; + } + } + spin_unlock_irqrestore(&ms_info.mi_eplock, flags); + return (struct endpt *)NULL; +} + +// Must be called with end point locked +static inline struct conreq * +miscscif_get_connection_request(struct endpt *ep, uint64_t payload) +{ + struct conreq *conreq; + struct list_head *pos, *tmpq; + + list_for_each_safe(pos, tmpq, &ep->conlist) { + conreq = list_entry(pos, struct conreq, list); + if (conreq->msg.payload[0] == payload) { + list_del(pos); + ep->conreqcnt--; + return conreq; + } + } + return (struct conreq *)NULL; +} + +// There is no requirement for the callee to have the end point +// locked like other API's above. +static inline void +micscif_remove_zombie_ep(struct endpt *ep) +{ + struct list_head *pos, *tmpq; + unsigned long sflags; + struct endpt *tmpep; + + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_zombie) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + ms_info.mi_nr_zombies--; + } + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); +} + +static inline void +micscif_cleanup_zombie_epd(void) +{ + struct list_head *pos, *tmpq; + unsigned long sflags; + struct endpt *ep; + + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_zombie) { + ep = list_entry(pos, struct endpt, list); + if (micscif_rma_ep_can_uninit(ep)) { + list_del(pos); + ms_info.mi_nr_zombies--; + va_gen_destroy(&ep->rma_info.va_gen); + kfree(ep); + } + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); +} + +#define SCIF_WAKE_UP_SEND (1 << 1) +#define SCIF_WAKE_UP_RECV (1 << 2) + +/** + * scif_wakeup_ep() - Wake up all clients based on the type + * requested i.e. threads blocked in scif_send(..) and/or scif_recv(..). 
+ */
+static inline void
+scif_wakeup_ep(int type)
+{
+ struct endpt *ep;
+ unsigned long sflags;
+ struct list_head *pos, *tmpq;
+
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
+ ep = list_entry(pos, struct endpt, list);
+ if (type & SCIF_WAKE_UP_SEND)
+ wake_up_interruptible(&ep->sendwq);
+ if (type & SCIF_WAKE_UP_RECV)
+ wake_up_interruptible(&ep->recvwq);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+}
+
+/*
+ * is_self_scifdev:
+ * @dev: The remote SCIF Device
+ *
+ * Returns true if the SCIF Device passed is the self aka Loopback SCIF device.
+ */
+static inline int is_self_scifdev(struct micscif_dev *dev)
+{
+ return dev->sd_node == ms_info.mi_nodeid;
+}
+
+/*
+ * is_p2p_scifdev:
+ * @dev: The remote SCIF Device
+ *
+ * Returns true if the SCIF Device is a MIC Peer to Peer SCIF device.
+ */
+static inline bool is_p2p_scifdev(struct micscif_dev *dev)
+{
+#ifdef _MIC_SCIF_
+ return dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(dev);
+#else
+ return false;
+#endif
+}
+
+/*
+ * get_conn_count:
+ * @dev: The remote SCIF Device
+ *
+ * Increments the number of active SCIF connections. Callee is expected
+ * to synchronize calling this API with put_conn_count.
+ */
+static __always_inline void
+get_conn_count(struct micscif_dev *dev)
+{
+ dev->num_active_conn++;
+}
+
+/*
+ * put_conn_count:
+ * @dev: The remote SCIF Device
+ *
+ * Decrements the number of active connections. Callee is expected
+ * to synchronize calling this API with get_conn_count.
+ */
+static __always_inline void
+put_conn_count(struct micscif_dev *dev)
+{
+ dev->num_active_conn--;
+ BUG_ON(dev->num_active_conn < 0);
+}
+
+/*
+ * get_kref_count:
+ * epd: SCIF endpoint
+ *
+ * Increments kmod endpoint reference count. Callee is expected
+ * to synchronize calling this API with put_kref_count.
+ */
+static __always_inline void
+get_kref_count(scif_epd_t epd)
+{
+ kref_get(&(epd->ref_count));
+}
+
+/*
+ * put_kref_count:
+ * epd: SCIF endpoint
+ *
+ * Decrements kmod endpoint reference count. Callee is expected
+ * to synchronize calling this API with get_kref_count.
+ */
+static __always_inline void
+put_kref_count(scif_epd_t epd)
+{
+ kref_put(&(epd->ref_count), scif_ref_rel);
+}
+
+/*
+ * scifdev_alive:
+ * @ep: The SCIF endpoint
+ *
+ * Returns true if the remote SCIF Device is running or sleeping for
+ * this endpoint.
+ */
+static inline int scifdev_alive(struct endpt *ep)
+{
+ return (((SCIFDEV_RUNNING == ep->remote_dev->sd_state) ||
+ (SCIFDEV_SLEEPING == ep->remote_dev->sd_state)) &&
+ SCIFDEV_RUNNING == ep->sd_state);
+}
+
+/*
+ * verify_epd:
+ * ep: SCIF endpoint
+ *
+ * Checks several generic error conditions and returns the
+ * appropriate error.
+ */
+static inline int verify_epd(struct endpt *ep)
+{
+ if (ep->state == SCIFEP_DISCONNECTED)
+ return -ECONNRESET;
+
+ if (ep->state != SCIFEP_CONNECTED)
+ return -ENOTCONN;
+
+ if (!scifdev_alive(ep))
+ return -ENODEV;
+
+ return 0;
+}
+
+/**
+ * scif_invalidate_ep() - Set remote SCIF device state for all connected
+ * and disconnected endpoints for a particular node to SCIFDEV_STOPPED,
+ * change endpoint state to disconnected and wake up all send/recv/con
+ * waitqueues.
+ */ +static inline void +scif_invalidate_ep(int node) +{ + struct endpt *ep; + unsigned long sflags; + struct list_head *pos, *tmpq; + + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) { + ep = list_entry(pos, struct endpt, list); + if (ep->remote_dev->sd_node == node) { + spin_lock(&ep->lock); + ep->sd_state = SCIFDEV_STOPPED; + spin_unlock(&ep->lock); + } + } + list_for_each_safe(pos, tmpq, &ms_info.mi_connected) { + ep = list_entry(pos, struct endpt, list); + if (ep->remote_dev->sd_node == node) { + list_del(pos); + put_conn_count(ep->remote_dev); + spin_lock(&ep->lock); + ep->state = SCIFEP_DISCONNECTED; + list_add_tail(&ep->list, &ms_info.mi_disconnected); + ep->sd_state = SCIFDEV_STOPPED; + wake_up_interruptible(&ep->sendwq); + wake_up_interruptible(&ep->recvwq); + wake_up_interruptible(&ep->conwq); + spin_unlock(&ep->lock); + } + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + flush_workqueue(ms_info.mi_conn_wq); +} + +/* + * Only Debug Functions Below + */ +#define SCIF_CRUMB pr_debug("%s %d\n", __func__, __LINE__) + +static inline void +micscif_display_all_zombie_ep(void) +{ + struct list_head *pos, *tmpq; + unsigned long sflags; + struct endpt *ep; + + pr_debug("Zombie Info Start\n"); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_zombie) { + ep = list_entry(pos, struct endpt, list); + if (!list_empty(&ep->rma_info.reg_list)) + micscif_display_all_windows(&ep->rma_info.reg_list); + if (!list_empty(&ep->rma_info.remote_reg_list)) + micscif_display_all_windows( + &ep->rma_info.remote_reg_list); + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + pr_debug("Zombie Info End\n"); +} + +static inline void dump_ep(scif_epd_t epd, const char *func, int line) +{ + struct endpt *ep = (struct endpt *)epd; + pr_debug("%s %d state %d lock %p port.node 0x%x" + "port.port 0x%x peer.node 0x%x peer.port 0x%x backlog %d qp %p" + "qp_offset 0x%llx cnct_gnt_payload 0x%llx remote_dev %p\n", + func, line, ep->state, &ep->lock, ep->port.node, + ep->port.port, ep->peer.node, ep->peer.port, ep->backlog, + ep->qp_info.qp, ep->qp_info.qp_offset, + ep->qp_info.cnct_gnt_payload, ep->remote_dev); +} + +static inline void dump_qp(volatile struct micscif_qp *qp, const char *func, int line) +{ + pr_debug("%s %d qp %p local_buf 0x%llx" + " local_qp 0x%llx remote_buf 0x%llx remote_qp %p ep 0x%llx\n", + func, line, qp, qp->local_buf, + qp->local_qp, qp->remote_buf, qp->remote_qp, qp->ep); +} + +static inline void dump_rb(struct micscif_rb *rb, const char *func, int line) +{ + pr_debug("%s %d rb %p rb_base %p *read_ptr 0x%x" + " *write_ptr 0x%x size 0x%x" + " cro 0x%x cwo 0x%x ocro 0x%x ocwo 0x%x\n", + func, line, rb, rb->rb_base, *rb->read_ptr, + *rb->write_ptr, rb->size, rb->current_read_offset, + rb->current_write_offset, + rb->old_current_read_offset, + rb->old_current_write_offset); +} + +#endif /* MICSCIF_H */ diff --git a/include/mic/micscif_intr.h b/include/mic/micscif_intr.h new file mode 100644 index 0000000..204d7b5 --- /dev/null +++ b/include/mic/micscif_intr.h @@ -0,0 +1,52 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICSCIF_INTR_H +#define MICSCIF_INTR_H +#define SBOX_SDBIC0_DBSTAT_BIT 0x40000000 +#define SBOX_SDBIC0_DBREQ_BIT 0x80000000 + +/* RDMASR Info */ +#define RDMASR_IRQ_BASE 17 +#define get_rdmasr_irq(m) ((RDMASR_IRQ_BASE) + (m)) +#define get_rdmasr_offset(m) (((m) << 2) + (SBOX_RDMASR0)) + +#ifdef _MIC_SCIF_ +int register_scif_intr_handler(struct micscif_dev *dev); +void deregister_scif_intr_handler(struct micscif_dev *dev); +#endif +int micscif_setup_interrupts(struct micscif_dev *dev); +void micscif_destroy_interrupts(struct micscif_dev *scifdev); +#endif /* MICSCIF_INTR_H */ diff --git a/include/mic/micscif_kmem_cache.h b/include/mic/micscif_kmem_cache.h new file mode 100644 index 0000000..3f40e29 --- /dev/null +++ b/include/mic/micscif_kmem_cache.h @@ -0,0 +1,62 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. 
Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MIC_KMEM_CACHE_H +#define MIC_KMEM_CACHE_H +#define MAX_UNALIGNED_BUF_SIZE (1024 * 1024ULL) +#define KMEM_UNALIGNED_BUF_SIZE (MAX_UNALIGNED_BUF_SIZE + (L1_CACHE_BYTES << 1)) +#include +extern struct kmem_cache *unaligned_cache; + +static inline void micscif_kmem_cache_free(void *buffer) +{ + kmem_cache_free(unaligned_cache, buffer); +} + +static inline void *micscif_kmem_cache_alloc(void) +{ + return kmem_cache_alloc(unaligned_cache, GFP_KERNEL|GFP_ATOMIC); +} + +static inline struct kmem_cache *micscif_kmem_cache_create(void) +{ + return kmem_cache_create("Unaligned_DMA", KMEM_UNALIGNED_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); +} + +static inline void micscif_kmem_cache_destroy(void) +{ + kmem_cache_destroy(unaligned_cache); +} +#endif diff --git a/include/mic/micscif_map.h b/include/mic/micscif_map.h new file mode 100644 index 0000000..ef2f9a5 --- /dev/null +++ b/include/mic/micscif_map.h @@ -0,0 +1,276 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#ifndef MICSCIF_MAP_H +#define MICSCIF_MAP_H + +static __always_inline +void *get_local_va(off_t off, struct reg_range_t *window, size_t len) +{ + struct page **pages = window->pinned_pages->pages; + + uint64_t page_nr = ((off - window->offset) >> PAGE_SHIFT); + + off_t page_off = off & ~PAGE_MASK; + + return (void *)((uint64_t) + (page_address(pages[page_nr])) | page_off); +} + +static __always_inline void +scif_iounmap(void *virt, size_t len, struct micscif_dev *dev) +{ +#ifdef _MIC_SCIF_ + if (!is_self_scifdev(dev)) + iounmap(virt); +#endif +} + +#ifdef _MIC_SCIF_ +/* FIXME: fix the documentation and functions names since these are also + * used in p2p + */ +/* + * Maps the VA passed in local to the aperture and returns the + * corresponding GTT index in offset by reference. + * In the loopback case simply return the physical address. + */ +static __always_inline int +map_virt_into_aperture(phys_addr_t *out_offset, + void *local, + struct micscif_dev *dev, + size_t size) +{ + if (is_self_scifdev(dev)) + *out_offset = virt_to_phys(local); + else { + /* Error unwinding code relies on return value being zero */ + *out_offset = virt_to_phys(local); + if (dev != &scif_dev[0]) + *out_offset = *out_offset + dev->sd_base_addr; + } + + return 0; +} + +/* + * Maps the struct page passed in page to the aperture and returns the + * corresponding GTT index in offset by reference. + * In the loopback case simply return the physical address. + */ +static __always_inline int +map_page_into_aperture(phys_addr_t *out_offset, + struct page *page, + struct micscif_dev *dev) +{ + if (is_self_scifdev(dev)) + *out_offset = page_to_phys(page); + else { + /* Error unwinding code relies on return value being zero */ + *out_offset = page_to_phys(page); + if (dev != &scif_dev[0]) + *out_offset = *out_offset + dev->sd_base_addr; + } + return 0; +} + +/* + * Nothing to do on card side + */ +static __always_inline void +unmap_from_aperture(phys_addr_t local, + struct micscif_dev *dev, + size_t size) +{ +} + +/* + * Maps Host physical address passed in phys to MIC. + * In the loopback case simply return the VA from the PA. + */ +static __always_inline void * +scif_ioremap(phys_addr_t phys, size_t size, struct micscif_dev *dev) +{ + void *out_virt; + + if (is_self_scifdev(dev)) + out_virt = phys_to_virt(phys); + else + out_virt = ioremap_nocache(phys, size); + + return out_virt; +} + +/* + * Get the system physical address from the physical address passed + * by the host. In the case of loopback simply return the physical + * address. + */ +static __always_inline phys_addr_t +get_phys_addr(phys_addr_t phys, struct micscif_dev *dev) +{ + return phys; +} + +#else /* !_MIC_SCIF_ */ +/* + * Maps the VA passed in local to the aperture and returns the + * corresponding physical address in offset. + * In the loopback case simply return the physical address. + */ +static __always_inline int +map_virt_into_aperture(phys_addr_t *out_offset, + void *local, + struct micscif_dev *dev, + size_t size) +{ + int err = 0; + int bid; + struct pci_dev *hwdev; + + if (is_self_scifdev(dev)) + *(out_offset) = virt_to_phys((local)); + else { + + bid = dev->sd_node - 1; + hwdev = get_per_dev_ctx(bid)->bi_pdev; + *out_offset = mic_map_single(bid, hwdev, local, size); + if (mic_map_error(*out_offset)) + err = -ENOMEM; + } + + if (err) + *out_offset = 0; + + return err; +} +/* + * Maps the struct page passed in page to the aperture and returns the + * corresponding physical address in offset. 
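get_local_va() in micscif_map.h above converts a window offset into a kernel virtual address by indexing the pinned-page array and re-attaching the sub-page offset. The stand-alone sketch below (illustrative values, assuming 4 KB pages) shows the same index/offset split with concrete numbers:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12              /* assumption: 4 KB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
    uint64_t window_offset = 0x40000;   /* offset the window was registered at */
    uint64_t off           = 0x42a30;   /* offset an RMA wants to touch */

    uint64_t page_nr  = (off - window_offset) >> PAGE_SHIFT; /* index into pages[] */
    uint64_t page_off = off & ~PAGE_MASK;                    /* offset inside that page */

    /* Prints: page_nr=2 page_off=0xa30 */
    printf("page_nr=%llu page_off=0x%llx\n",
           (unsigned long long)page_nr, (unsigned long long)page_off);
    return 0;
}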
+ * In the loopback case simply return the physical address. + */ +static __always_inline int +map_page_into_aperture(phys_addr_t *out_offset, + struct page *page, + struct micscif_dev *dev) +{ + int err = 0; + int bid; + dma_addr_t mic_addr; + struct pci_dev *hwdev; + + if (is_self_scifdev(dev)) + *out_offset = page_to_phys(page); + else { + + bid = dev->sd_node - 1; + hwdev = get_per_dev_ctx(bid)->bi_pdev; + + *out_offset = pci_map_page(hwdev, page, 0x0, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(hwdev, *out_offset)) { + err = -EINVAL; + } else { + if (!(mic_addr = mic_map(bid, *out_offset, PAGE_SIZE))) { + printk(KERN_ERR "mic_map failed board id %d\ + addr %#016llx size %#016zx\n", + bid, *out_offset, PAGE_SIZE); + pci_unmap_single(hwdev, *out_offset, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + err = -EINVAL; + } else + *out_offset = mic_addr; + } + } + + if (err) + *out_offset = 0; + + return err; +} + +/* + * Unmaps the physical address passed in lo/al from the PCIe aperture. + * Nothing to do in the loopback case. + */ +static __always_inline void +unmap_from_aperture(phys_addr_t local, + struct micscif_dev *dev, + size_t size) +{ + + if (!is_self_scifdev(dev)) + mic_ctx_unmap_single(get_per_dev_ctx(dev->sd_node - 1), + local, size); +} + +/* + * TODO: I'm thinking maybe we should take the apt_phys offset off of this macro + * and have it be outside ... + * Maps the page corresponding to the GTT offset passed in phys. + * In the loopback case simply return the VA from the PA. + */ +static __always_inline void * +scif_ioremap(phys_addr_t phys, size_t size, struct micscif_dev *dev) +{ + void *out_virt; + + if (is_self_scifdev(dev)) + out_virt = phys_to_virt(phys); + else { + out_virt = get_per_dev_ctx(dev->sd_node - 1)->aper.va + phys; + } + return out_virt; +} + +static __always_inline phys_addr_t +get_phys_addr(phys_addr_t phys, struct micscif_dev *dev) +{ + phys_addr_t out_phys; + + if (is_self_scifdev(dev)) + out_phys = phys; + else { + phys_addr_t __apt_base = + (phys_addr_t)get_per_dev_ctx(dev->sd_node - 1)->aper.pa; + out_phys = phys + __apt_base; + } + + return out_phys; +} + +#endif /* !_MIC_SCIF_ */ + +#endif /* MICSCIF_MAP_H */ diff --git a/include/mic/micscif_nm.h b/include/mic/micscif_nm.h new file mode 100644 index 0000000..9f2ff48 --- /dev/null +++ b/include/mic/micscif_nm.h @@ -0,0 +1,234 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. 
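On the host side, the aperture map helpers above return 0 on success and a negative errno on failure, and they zero *out_offset on failure because the error-unwinding code depends on that. A hedged sketch of the expected calling pattern; the function name and buffer are hypothetical:

/* Illustrative host-side sketch only -- not part of the patch. */
static int example_map_buffer(struct micscif_dev *dev, void *buf, size_t len)
{
    phys_addr_t aper_off = 0;
    int err;

    err = map_virt_into_aperture(&aper_off, buf, dev, len);
    if (err)
        return err;         /* aper_off was zeroed by the helper */

    /* ... hand aper_off to the DMA engine / the remote node ... */

    unmap_from_aperture(aper_off, dev, len);
    return 0;
}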
Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICSCIF_NM_H +#define MICSCIF_NM_H + +#include + +#ifdef MIC_IS_EMULATION +#define DEFAULT_WATCHDOG_TO (INT_MAX) +#define NODE_ALIVE_TIMEOUT (INT_MAX) +#define NODE_QP_TIMEOUT (INT_MAX) +#define NODE_ACCEPT_TIMEOUT (INT_MAX) +#define NODEQP_SEND_TO_MSEC (INT_MAX) +#else +#define DEFAULT_WATCHDOG_TO (30) +#define NODE_ALIVE_TIMEOUT (ms_info.mi_watchdog_to * HZ) +#define NODE_QP_TIMEOUT (100) +#define NODE_ACCEPT_TIMEOUT (3 * HZ) +#define NODEQP_SEND_TO_MSEC (3 * 1000) +#endif + +#define SCIF_ENABLE_PM 1 + +#define DESTROY_WQ (true) + +enum disconn_type { + DISCONN_TYPE_POWER_MGMT, + DISCONN_TYPE_LOST_NODE, + DISCONN_TYPE_MAINTENANCE_MODE, +}; + +/* + * Notify the host about a new dependency with the remote SCIF device. + * Dependencies are created during scif_mmap()/scif_get_pages(). + */ +void micscif_create_node_dep(struct micscif_dev *dev, int nr_pages); + +/* + * Notify the host that an existing dependency with the remote SCIF + * device no longer exists. + */ +void micscif_destroy_node_dep(struct micscif_dev *dev, int nr_pages); + +/** + * micscif_inc_node_refcnt: + * + * @dev: Remote SCIF device. + * @count: ref count + * + * Increment the global activity ref count for the remote SCIF device. + * If the remote SCIF device is idle, then notify the host to wake up + * the remote SCIF device and then wait for an ACK. + */ +static __always_inline void +micscif_inc_node_refcnt(struct micscif_dev *dev, long cnt) +{ +#ifdef SCIF_ENABLE_PM + if (unlikely(dev && !atomic_long_add_unless(&dev->scif_ref_cnt, + cnt, SCIF_NODE_IDLE))) { + /* + * This code path would not be entered unless the remote + * SCIF device has actually been put to sleep by the host. + */ + mutex_lock(&dev->sd_lock); + if (SCIFDEV_STOPPED == dev->sd_state || + SCIFDEV_STOPPING == dev->sd_state || + SCIFDEV_INIT == dev->sd_state) + goto bail_out; + if (test_bit(SCIF_NODE_MAGIC_BIT, + &dev->scif_ref_cnt.counter)) { + /* Notify host that the remote node must be woken */ + struct nodemsg notif_msg; + + dev->sd_wait_status = OP_IN_PROGRESS; + notif_msg.uop = SCIF_NODE_WAKE_UP; + notif_msg.src.node = ms_info.mi_nodeid; + notif_msg.dst.node = SCIF_HOST_NODE; + notif_msg.payload[0] = dev->sd_node; + /* No error handling for Host SCIF device */ + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], + ¬if_msg, NULL); + /* + * A timeout is not required since only the cards can + * initiate this message. The Host is expected to be alive. + * If the host has crashed then so will the cards. + */ + wait_event(dev->sd_wq, + dev->sd_wait_status != OP_IN_PROGRESS); + /* + * Aieee! The host could not wake up the remote node. + * Bail out for now. + */ + if (dev->sd_wait_status == OP_COMPLETED) { + dev->sd_state = SCIFDEV_RUNNING; + clear_bit(SCIF_NODE_MAGIC_BIT, + &dev->scif_ref_cnt.counter); + } + } + /* The ref count was not added if the node was idle. 
*/ + atomic_long_add(cnt, &dev->scif_ref_cnt); +bail_out: + mutex_unlock(&dev->sd_lock); + } +#endif +} + +/** + * micscif_dec_node_refcnt: + * + * @dev: Remote SCIF device. + * @nr_pages: number of pages + * + * Decrement the global activity ref count for the remote SCIF device. + * Assert if the ref count drops to negative. + */ +static __always_inline void +micscif_dec_node_refcnt(struct micscif_dev *dev, long cnt) +{ +#ifdef SCIF_ENABLE_PM + if (dev) { + if (unlikely((atomic_long_sub_return(cnt, + &dev->scif_ref_cnt)) < 0)) { + printk(KERN_ERR "%s %d dec dev %p node %d ref %ld " + " caller %p Lost Node?? \n", + __func__, __LINE__, dev, dev->sd_node, + atomic_long_read(&dev->scif_ref_cnt), + __builtin_return_address(0)); + atomic_long_add_unless(&dev->scif_ref_cnt, cnt, + SCIF_NODE_IDLE); + } + } +#endif +} + +/* Handle a SCIF_NODE_REMOVE message */ +uint64_t micscif_handle_remove_node(uint64_t mask, uint64_t flags); +void micscif_cleanup_scifdev(struct micscif_dev *dev, bool destroy_wq); + +void micscif_node_add_callback(int node); + +void set_nodemask_bit(uint8_t* nodemask, uint32_t node_id, int val); +int get_nodemask_bit(uint8_t* nodemask, uint32_t node_id); + +#ifndef _MIC_SCIF_ + +/* definition of stack node used in activation/deactivation set algorithms*/ +struct stack_node { + struct list_head next; + uint32_t node_id; +}; + +enum dependency_state { + DEP_STATE_NOT_DEPENDENT, + DEP_STATE_DEPENDENT, + DEP_STATE_DISCONNECT_READY, + DEP_STATE_DISCONNECTED +}; + + +uint64_t micscif_send_pm_rmnode_msg(int node, uint64_t nodemask_addr, + uint64_t nodemask_size, int orig_node); +uint64_t micscif_send_lost_node_rmnode_msg(int node, int orig_node); + +/* definitions of stack methods used in activation/deactivation set algorithms */ +int init_depgraph_stack(struct list_head *stack_ptr); +int uninit_depgraph_stack(struct list_head *stack_ptr); +int is_stack_empty(struct list_head *stack_ptr); +int stack_push_node(struct list_head *stack_ptr, uint32_t node_id); +int stack_pop_node(struct list_head *stack_ptr, uint32_t *node_id); +int micscif_get_activeset(uint32_t node_id, uint8_t *nodemask); +int micscif_get_minimal_deactiveset(uint32_t node_id, uint8_t *nodemask, uint8_t *visited); +int micscif_get_deactiveset(uint32_t node_id, uint8_t *nodemask, int max_possible); +void micscif_update_p2p_state(uint32_t node_id, uint32_t peer_id, enum scif_state state); + +/* Method responsible for disconnecting node from the scif network */ +int micscif_disconnect_node(uint32_t node_id, uint8_t *nodemask, enum disconn_type type); +int micscif_connect_node(uint32_t node_id, bool get_ref); + +void micscif_set_nodedep(uint32_t src_node, uint32_t dst_node, enum dependency_state state); +enum dependency_state micscif_get_nodedep(uint32_t src_node, uint32_t dst_node); +uint64_t micscif_send_node_alive(int node); +void micscif_watchdog_handler(struct work_struct *work); +int micscif_handle_lostnode(uint32_t nodeid); +#endif /*_MIC_SCIF_*/ + +/* SCIF tasks before transition to low power state */ +int micscif_suspend_handler(struct notifier_block *notif, + unsigned long event, void *ptr); + +/* + * SCIF tasks if a previous low power state transition + * has failed after a suspend call. 
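micscif_inc_node_refcnt() and micscif_dec_node_refcnt() above bracket any activity that needs the remote node awake: the increment can block until the host wakes the peer, and every increment must be balanced by a decrement or the node can never idle again. A minimal sketch of that pairing (illustrative only):

/* Illustrative sketch only -- not part of the patch. */
static void example_touch_remote(struct micscif_dev *dev)
{
    /* Take one activity reference; may block until the host wakes the node. */
    micscif_inc_node_refcnt(dev, 1);

    /* ... issue the node QP message or RMA that needs the peer awake ... */

    /* Balance the reference so the node may be put back to sleep. */
    micscif_dec_node_refcnt(dev, 1);
}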
+ */ +int micscif_fail_suspend_handler(struct notifier_block *notif, + unsigned long event, void *ptr); + +/* SCIF tasks after wake up from low power state */ +int micscif_resume_handler(struct notifier_block *notif, + unsigned long event, void *ptr); + +#endif /* MICSCIF_NM_H */ diff --git a/include/mic/micscif_nodeqp.h b/include/mic/micscif_nodeqp.h new file mode 100644 index 0000000..a69ec93 --- /dev/null +++ b/include/mic/micscif_nodeqp.h @@ -0,0 +1,200 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICSCIF_NODEQP +#define MICSCIF_NODEQP + +#include "micscif_rb.h" + + /* Payload Description */ +#define SCIF_INIT 1 /* Address of node's node First message sent by a node to + * array the host, and host to node + */ +#define SCIF_EXIT 2 /* Last message telling the host the driver is exiting */ +#define SCIF_NODE_ADD 3 /* Tell Online nodes a new node exists */ +#define SCIF_NODE_ADD_ACK 4 /* Confirm to host sequence is finished TODO Needed???
*/ +#define SCIF_CNCT_REQ 5 /* Phys addr of Request connection to a port */ +#define SCIF_CNCT_GNT 6 /* Phys addr of new Grant connection request */ +#define SCIF_CNCT_GNTACK 7 /* Error type Reject a connection request */ +#define SCIF_CNCT_GNTNACK 8 /* Error type Reject a connection request */ +#define SCIF_CNCT_REJ 9 /* Error type Reject a connection request */ +#define SCIF_CNCT_TERM 10 /* Terminate type Terminate a connection request */ +#define SCIF_TERM_ACK 11 /* Terminate type Terminate a connection request */ +#define SCIF_DISCNCT 12 /* Notify peer that connection is being terminated */ +#define SCIF_DISCNT_ACK 13 /* Notify peer that connection is being terminated */ +#define SCIF_REGISTER 14 /* Tell peer about a new registered window */ +#define SCIF_REGISTER_ACK 15 /* Notify peer about unregistration success */ +#define SCIF_REGISTER_NACK 16 /* Notify peer about registration success */ +#define SCIF_UNREGISTER 17 /* Tell peer about unregistering a registered window */ +#define SCIF_UNREGISTER_ACK 18 /* Notify peer about registration failure */ +#define SCIF_UNREGISTER_NACK 19 /* Notify peer about unregistration failure */ +#define SCIF_ALLOC_REQ 20 /* Request a mapped buffer */ +#define SCIF_ALLOC_GNT 21 /* Notify peer about allocation success */ +#define SCIF_ALLOC_REJ 22 /* Notify peer about allocation failure */ +#define SCIF_FREE_PHYS 23 /* Free previously allocated GTT/PCI mappings */ +#define SCIF_FREE_VIRT 24 /* Free previously allocated virtual memory */ +#define SCIF_CLIENT_SENT 25 /* Notify the peer that a data message has been written to the RB */ +#define SCIF_CLIENT_RCVD 26 /* Notify the peer that a data message has been read from the RB */ +#define SCIF_MUNMAP 27 /* Acknowledgment for a SCIF_MMAP request */ +#define SCIF_MARK 28 /* SCIF Remote Fence Mark Request */ +#define SCIF_MARK_ACK 29 /* SCIF Remote Fence Mark Success */ +#define SCIF_MARK_NACK 30 /* SCIF Remote Fence Mark Failure */ +#define SCIF_WAIT 31 /* SCIF Remote Fence Wait Request */ +#define SCIF_WAIT_ACK 32 /* SCIF Remote Fence Wait Success */ +#define SCIF_WAIT_NACK 33 /* SCIF Remote Fence Wait Failure */ +#define SCIF_SIG_LOCAL 34 /* SCIF Remote Fence Local Signal Request */ +#define SCIF_SIG_REMOTE 35 /* SCIF Remote Fence Remote Signal Request */ +#define SCIF_SIG_ACK 36 /* SCIF Remote Fence Remote Signal Success */ +#define SCIF_SIG_NACK 37 /* SCIF Remote Fence Remote Signal Failure */ +#define SCIF_NODE_CREATE_DEP 42 /* Notify the Host that a new dependency is + * being created between two nodes + */ +#define SCIF_NODE_DESTROY_DEP 43 /* Notify the Host that an existing dependency is + * being destroyed between two nodes + */ +#define SCIF_NODE_REMOVE 44 /* Request to deactivate a set of remote SCIF nodes */ +#define SCIF_NODE_REMOVE_ACK 45 /* Response to a SCIF_NODE_REMOVE message */ +#define SCIF_NODE_WAKE_UP 46 /* Notification to the Host to wake up a remote node */ +#define SCIF_NODE_WAKE_UP_ACK 47 /* Response to SCIF_NODE_WAKE_UP message */ +#define SCIF_NODE_WAKE_UP_NACK 48 /* Response to SCIF_NODE_WAKE_UP message. 
Think Lost Node */ +#define SCIF_NODE_ALIVE 49 /* Check if kn* card is alive */ +#define SCIF_NODE_ALIVE_ACK 50 /* ACK the for above message */ +#define SMPT_SET 51 /* Add a smpt entry */ +#define SCIF_PROXY_DMA 56 /* Proxies DMA read requests to peer for performance */ +#define SCIF_PROXY_ORDERED_DMA 57 /* Proxies DMA read requests to peer for performance */ +#define SCIF_NODE_CONNECT 58 /* Setup a p2p connection b/w two nodes */ +#define SCIF_NODE_CONNECT_NACK 59 /* p2p connection is not successful */ +#define SCIF_NODE_ADD_NACK 60 /* SCIF_NODE_ADD failed report to the waiting thread(s) */ +#define SCIF_GET_NODE_INFO 61 /* Get current node mask from the host*/ +#define SCIF_TEST 62 /* Test value Used for test only */ +#define SCIF_MAX_MSG SCIF_TEST + + +/* + * The *only* reason we need 2 uint64_t for payload + * right now is because the SCIF_CNCT_GNT message needs + * to send across both the QP offset and the QP id. + * + * Now we have to increase this to 3 uint64_t because + * the Alloc message requires the remote EP, allocation size + * and the allocation handle. + * + * Increased to 4 uint64_t because SCIF_FENCE requires + * ep, offset, len and the waitqueue pointer to wake up. + */ +struct nodemsg { + struct scif_portID src; + struct scif_portID dst; + uint32_t uop; + uint64_t payload[4]; +} __attribute__ ((packed)); + + +/* + * Generic state used for certain node QP message exchanges + * like Unregister, Alloc etc. + */ +enum micscif_msg_state { + OP_IDLE = 1, + OP_IN_PROGRESS, + OP_COMPLETED, + OP_FAILED +}; + +/* + * Generic structure used for exchanging ALLOC_REQ/GNT messages. + */ +struct allocmsg { + dma_addr_t phys_addr; + void *vaddr; + uint32_t uop; + size_t size; + enum micscif_msg_state state; + wait_queue_head_t allocwq; +}; + +/* Interesting structure -- a little difficult because we can only + * write across the PCIe, so any r/w pointer we need to read is + * local. We only need to read the read pointer on the inbound_q + * and read the write pointer in the outbound_q + */ +struct micscif_qp { + uint64_t ep; + uint64_t magic; + uint64_t blast; +#define SCIFEP_MAGIC 0x5c1f000000005c1f + struct micscif_rb outbound_q; + struct micscif_rb inbound_q; + /* FIXME cache align local_write/read */ + uint32_t local_write; /* For local inbound */ + uint32_t local_read; /* For local outbound */ + volatile struct micscif_qp *remote_qp; + dma_addr_t local_buf; /* Local BS */ + dma_addr_t local_qp; + dma_addr_t remote_buf; /* Remote BS */ + volatile uint32_t qp_state; +#define QP_OFFLINE 0xdead +#define QP_ONLINE 0xc0de + uint16_t scif_version; + spinlock_t qp_send_lock; + spinlock_t qp_recv_lock; +}; + +/* + * An element in the loopback Node QP message list. 
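struct nodemsg above is the fixed-size unit carried over the node queue pairs: uop selects one of the SCIF_* opcodes and payload[] holds up to four 64-bit operands. A hedged sketch of filling one in, modelled on the SCIF_NODE_WAKE_UP request built in micscif_inc_node_refcnt() earlier; the field values are illustrative:

/* Illustrative sketch only -- mirrors the SCIF_NODE_WAKE_UP request built
 * in micscif_inc_node_refcnt() above; values are made up. */
static void example_build_wakeup_msg(struct nodemsg *msg,
                                     uint16_t self_node, uint16_t sleeping_node)
{
    memset(msg, 0, sizeof(*msg));
    msg->uop        = SCIF_NODE_WAKE_UP;  /* opcode 46 in the table above */
    msg->src.node   = self_node;          /* struct scif_portID src */
    msg->dst.node   = SCIF_HOST_NODE;     /* the host brokers wake-ups */
    msg->payload[0] = sleeping_node;      /* which node should be woken */
}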
+ */ +struct loopb_msg { + struct nodemsg msg; + struct list_head list_member; +}; + +struct micscif_qp *micscif_nodeqp_nextmsg(struct micscif_dev *scifdev); +int micscif_nodeqp_send(struct micscif_dev *scifdev, struct nodemsg *msg, struct endpt *ep); +int micscif_nodeqp_intrhandler(struct micscif_dev *scifdev, struct micscif_qp *qp); +int micscif_loopb_msg_handler(struct micscif_dev *scifdev, struct micscif_qp *qp); + +// Card side only functions +int micscif_setup_card_qp(phys_addr_t host_phys, struct micscif_dev *dev); + +int micscif_setuphost_response(struct micscif_dev *scifdev, uint64_t payload); +int micscif_setup_qp_connect(struct micscif_qp *qp, dma_addr_t *qp_offset, int local_size, struct micscif_dev *scifdev); +int micscif_setup_qp_accept(struct micscif_qp *qp, dma_addr_t *qp_offset, dma_addr_t phys, int local_size, struct micscif_dev *scifdev); +int micscif_setup_qp_connect_response(struct micscif_dev *scifdev, struct micscif_qp *qp, uint64_t payload); +int micscif_setup_loopback_qp(struct micscif_dev *scifdev); +int micscif_destroy_loopback_qp(struct micscif_dev *scifdev); +void micscif_teardown_ep(void *endpt); +void micscif_add_epd_to_zombie_list(struct endpt *ep, bool mi_eplock_held); + +#endif /* MICSCIF_NODEQP */ diff --git a/include/mic/micscif_rb.h b/include/mic/micscif_rb.h new file mode 100644 index 0000000..20a5fe7 --- /dev/null +++ b/include/mic/micscif_rb.h @@ -0,0 +1,170 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef _SCIF_RING_BUFFER_DEFINE +#define _SCIF_RING_BUFFER_DEFINE + +/* + * This describes a general purpose, byte based + * ring buffer. It handles multiple readers or + * writers using a lock -- it is lockless between + * producer and consumer (so it can handle being + * used across the PCIe bus). 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * This version is used to ensure component compatibility between the host and + * card driver modules that use the ring buffer functions. This version should + * be incremented whenever there is a change to the ring buffer module that + * affects the functionality of the ring buffer. + */ +#define RING_BUFFER_VERSION 1 + +/* Two of these actually form a single queue -- one on each side of the PCIe + * bus + * + * NOTE! This only works if the queue (pointed to at rb_base) exists in the + * consumer's memory. The code does not do any wbinvd after writing to the + * buffer, which assumes that the memory is not cached on the writers side. + * + * If this structure were to be used across the PCIe bus with the buffer + * living on the other side of the bus, it wouldn't work (would require a + * wbinvd or use the linux dma streaming buffer API) + */ +struct micscif_rb { + volatile void *rb_base; + volatile uint32_t *read_ptr; /* Points to the read offset */ + volatile uint32_t *write_ptr; /* Points to the write offset */ + uint32_t size; + uint32_t current_read_offset; /* cache it to improve performance */ + uint32_t current_write_offset; /* cache it to improve performance */ + uint32_t old_current_read_offset; + uint32_t old_current_write_offset; +}; + +/** + * methods used by both + */ +void micscif_rb_init(struct micscif_rb *rb, volatile uint32_t *read_ptr, + volatile uint32_t *write_ptr, volatile void *rb_base, + const uint32_t size); + +/** + * writer-only methods + */ +/* + * write a new command, then micscif_rb_commit() + */ +int micscif_rb_write(struct micscif_rb *rb, void *msg, uint32_t size); +/* + * After write(), then micscif_rb_commit() + */ +void micscif_rb_commit(struct micscif_rb *rb); +/* + * used on power state change to reset cached pointers + */ +void micscif_rb_reset(struct micscif_rb *rb); + +/* + * Query space available for writing to a RB. + */ +int micscif_rb_space(struct micscif_rb *rb); +/** + * reader-only methods + */ +/* + * uses (updates) the cached read pointer to get the next command, + * so writer doesnt see the command as consumed. + * + * Returns number of bytes read + * + * Size is IN -- the caller passes in a size (the max size that + * the function will read out) + * + * msg is OUT, but the caller is responsible for allocating space to + * read into. The max size this function will read is what is passed + * into by size, so the buffer pointer to by msg MUST be at least size + * bytes long. + */ +int micscif_rb_get_next (struct micscif_rb *rb, void *msg, uint32_t size); + +/* + * updates the control block read pointer, + * which will be visible to the writer so it can re-use the space + */ +void micscif_rb_update_read_ptr(struct micscif_rb *rb); + +/* + * Count the number of empty slots in the RB + */ +uint32_t micscif_rb_count(struct micscif_rb *rb, uint32_t size); + +/** + * Return the ring buffer module version. + */ +uint16_t micscif_rb_get_version(void); +#endif diff --git a/include/mic/micscif_rma.h b/include/mic/micscif_rma.h new file mode 100644 index 0000000..275e086 --- /dev/null +++ b/include/mic/micscif_rma.h @@ -0,0 +1,960 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
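The ring-buffer API declared just above splits into a writer half (micscif_rb_space/write/commit) and a reader half (micscif_rb_get_next/update_read_ptr). A rough sketch of one message crossing a queue pair, assuming the rb was already set up with micscif_rb_init() and ignoring the doorbell/interrupt side:

/* Illustrative sketch only -- error handling reduced to the essentials. */
static int example_send(struct micscif_rb *out_q, struct nodemsg *msg)
{
    int err;

    if (micscif_rb_space(out_q) < (int)sizeof(*msg))
        return -EAGAIN;                   /* no room, retry later */
    err = micscif_rb_write(out_q, msg, sizeof(*msg));
    if (err)
        return err;
    micscif_rb_commit(out_q);             /* publish the new write pointer */
    return 0;
}

static int example_recv(struct micscif_rb *in_q, struct nodemsg *msg)
{
    if (micscif_rb_get_next(in_q, msg, sizeof(*msg)) <= 0)
        return -EAGAIN;
    micscif_rb_update_read_ptr(in_q);     /* let the writer reuse the space */
    return 0;
}

Because the read and write pointers each live in the consumer's memory, producer and consumer never contend on a lock; only multiple writers (or multiple readers) on the same side need the qp_send_lock/qp_recv_lock held around these calls.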
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICSCIF_RMA_H +#define MICSCIF_RMA_H + +#ifdef CONFIG_MMU_NOTIFIER +#include +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#include +#endif +#ifdef CONFIG_HUGETLB_PAGE +#include +#endif +#endif +#include "scif.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mic/micscif_kmem_cache.h" + +struct rma_mmu_notifier { +#ifdef CONFIG_MMU_NOTIFIER + struct mmu_notifier ep_mmu_notifier; +#endif + bool ep_mn_registered; + /* List of temp registration windows for self */ + struct list_head tc_reg_list; + struct mm_struct *mm; + struct endpt *ep; + struct list_head list_member; +}; + +/* Per Endpoint Remote Memory Access Information */ +struct endpt_rma_info { + /* List of registration windows for self */ + struct list_head reg_list; + /* List of registration windows for peer */ + struct list_head remote_reg_list; + /* Offset generator */ + struct va_gen_addr va_gen; + /* + * Synchronizes access to self/remote list and also + * protects the window from being destroyed while + * RMAs are in progress. + */ + struct mutex rma_lock; + /* + * Synchronizes access to temporary cached windows list + * for SCIF Registration Caching. + */ + spinlock_t tc_lock; + /* + * Synchronizes access to the list of MMU notifiers + * registered for this SCIF endpoint. + */ + struct mutex mmn_lock; + /* + * Synchronizes access to the SCIF registered address space + * offset generator. 
+ */ + struct mutex va_lock; + /* + * Keeps track of number of outstanding temporary registered + * windows created by scif_vreadfrom/scif_vwriteto which have + * not been destroyed. tcw refers to the number of temporary + * cached windows and total number of pages pinned. + */ + atomic_t tw_refcount; + atomic_t tw_total_pages; + atomic_t tcw_refcount; + atomic_t tcw_total_pages; + /* + * MMU notifier so that we can destroy the windows when there is + * a change + */ + struct list_head mmn_list; + /* + * Keeps track of number of outstanding remote fence requests + * which have been received by the peer. + */ + int fence_refcount; + /* + * The close routine blocks on this wait queue to ensure that all + * remote fence requests have been serviced. + */ + wait_queue_head_t fence_wq; + /* + * DMA channel used for all DMA transfers for this endpoint. + */ + struct dma_channel *dma_chan; + /* Detect asynchronous list entry deletion */ + int async_list_del; +#ifdef _MIC_SCIF_ + /* Local P2P proxy DMA virtual address for SUD updates by peer */ + void *proxy_dma_va; + /* Local P2P proxy DMA physical address location for SUD updates */ + dma_addr_t proxy_dma_phys; + /* Remote P2P proxy DMA physical address location for SUD updates */ + dma_addr_t proxy_dma_peer_phys; +#endif + /* List of tasks which have remote memory mappings */ + struct list_head task_list; +}; + +/* Information used for tracking remote fence requests */ +struct fence_info { + /* State of this transfer */ + enum micscif_msg_state state; + + /* Fences wait on this queue */ + wait_queue_head_t wq; + + /* Used for storing the DMA mark */ + int dma_mark; +}; + +/* Per remote fence wait request */ +struct remote_fence_info { + /* The SCIF_WAIT message */ + struct nodemsg msg; + + struct list_head list_member; +}; + +/* Self or Peer window */ +enum rma_window_type { + RMA_WINDOW_SELF = 0x1, + RMA_WINDOW_PEER +}; + +/* The number of physical addresses that can be stored in a PAGE. */ +#define NR_PHYS_ADDR_IN_PAGE (PAGE_SIZE >> 3) + +/* + * Store an array of lookup offsets. Each offset in this array maps + * one 4K page containing 512 physical addresses i.e. 2MB. 512 such + * offsets in a 4K page will correspond to 1GB of registered address space. + */ +struct rma_lookup { + /* Array of offsets */ + dma_addr_t *lookup; + /* Offset used to map lookup array */ + dma_addr_t offset; +}; + + +/* + * A set of pinned pages obtained with scif_pin_pages() which could be part + * of multiple registered windows across different end points. + */ +struct scif_pinned_pages { + int64_t nr_pages; + int prot; + int map_flags; + atomic_t ref_count; + uint64_t magic; + /* + * Array of pointers to struct pages populated + * with get_user_pages(..) + */ + struct page **pages; + int *num_pages; + int64_t nr_contig_chunks; + /* Only for Hosts without THP but with Huge TLB FS Like SuSe11 SP1 */ + struct vm_area_struct **vma; +}; + +/* + * Information about a particular task which has remote memory mappings + * created via scif_mmap(..). + */ +struct rma_task_info { + /* + * Stores the pid struct of the grp_leader task structure which + * scif_mmap(..)'d the remote window. 
+ */ + struct pid *pid; + int ref_count; + struct list_head list_member; +}; + +/* Registration Window for Self */ +struct reg_range_t { + int64_t nr_pages; + /* Number of contiguous physical chunks */ + int64_t nr_contig_chunks; + int prot; + int ref_count; + /* Cookie to detect corruption */ + uint64_t magic; + uint64_t offset; + /* va address that this window represents + * Useful for only for temp windows*/ + void *va_for_temp; + /* Used for temporary windows*/ + int dma_mark; + /* + * Pointer to EP. Useful for passing EP around + * with messages to avoid expensive list + * traversals. + */ + uint64_t ep; + + struct list_head list_member; + + enum rma_window_type type; + + /* + * Pointer to peer window. Useful for sending + * messages to peer without requiring an + * extra list traversal + */ + uint64_t peer_window; + + /* Unregistration state */ + enum micscif_msg_state unreg_state; + + /* + * True for temporary windows created via + * scif_vreadfrom/scif_vwriteto. + */ + bool temp; + + bool offset_freed; + + /* Local P2P proxy DMA physical address location for SUD updates */ + dma_addr_t proxy_dma_phys; + + union { + /* Self RAS */ + struct { + /* The set of pinned_pages backing this window */ + struct scif_pinned_pages *pinned_pages; + + /* Handle for sending ALLOC_REQ */ + struct allocmsg alloc_handle; + + /* Wait Queue for an registration (N)ACK */ + wait_queue_head_t regwq; + + /* Registration state */ + enum micscif_msg_state reg_state; + + /* Wait Queue for an unregistration (N)ACK */ + wait_queue_head_t unregwq; + }; + /* Peer RAS specific window elements */ + struct { +#ifdef CONFIG_ML1OM + /* Lookup for physical addresses used for mmap */ + struct rma_lookup phys_addr_lookup; + + /* Lookup for temp physical addresses used for mmap */ + struct rma_lookup temp_phys_addr_lookup; + + /* Mmap state */ + enum micscif_msg_state gttmap_state; + + /* Wait Queue for an unregistration (N)ACK */ + wait_queue_head_t gttmapwq; + + /* Ref count per page */ + int *page_ref_count; +#endif + /* Lookup for physical addresses used for DMA */ + struct rma_lookup dma_addr_lookup; + + /* Number of entries in lookup */ + int nr_lookup; + + /* Offset used to map the window by the peer */ + dma_addr_t mapped_offset; + + /* Ref count for tracking scif_get_pages */ + int get_put_ref_count; + }; + }; +#ifdef CONFIG_ML1OM + /* Array of physical addresses used for creating VtoP mappings */ + /* FIXME: these are phys_addr as seen by the peer node, node at the + * opposite end of the endpt + */ + dma_addr_t *phys_addr; + + /* Temporary array for storing physical addresses for performance */ + dma_addr_t *temp_phys_addr; +#endif + + /* Array of physical addresses used for Host & MIC initiated DMA */ + dma_addr_t *dma_addr; + + /* Array specifying number of pages for each physical address */ + int *num_pages; + struct mm_struct *mm; +} __attribute__ ((packed)); + + +#define RMA_MAGIC(x) BUG_ON(x->magic != SCIFEP_MAGIC) + +/* If this bit is set then the mark is a remote fence mark */ +#define SCIF_REMOTE_FENCE_BIT 30 +/* Magic value used to indicate a remote fence request */ +#define SCIF_REMOTE_FENCE (1ULL << SCIF_REMOTE_FENCE_BIT) + +enum rma_direction { + LOCAL_TO_REMOTE, + REMOTE_TO_LOCAL +}; + +/* Initialize RMA for this EP */ +int micscif_rma_ep_init(struct endpt *ep); + +/* Check if epd can be uninitialized */ +int micscif_rma_ep_can_uninit(struct endpt *ep); + +/* Obtain a new offset. 
Callee must grab RMA lock */ +int micscif_get_window_offset(struct endpt *ep, int flags, + uint64_t offset, size_t len, uint64_t *out_offset); + +/* Free offset. Callee must grab RMA lock */ +void micscif_free_window_offset(struct endpt *ep, + uint64_t offset, size_t len); + +/* Create self registration window */ +struct reg_range_t *micscif_create_window(struct endpt *ep, + int64_t nr_pages, uint64_t offset, bool temp); + +/* Create a set of pinned pages */ +struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot); + +/* Destroy a set of pinned pages */ +int micscif_destroy_pinned_pages(struct scif_pinned_pages *pages); + +/* Destroy self registration window.*/ +int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window); + +int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window); + +/* Map pages of self window to Aperture/PCI */ +int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool temp); + +/* Unregister a self window */ +int micscif_unregister_window(struct reg_range_t *window); + +/* Create remote registration window */ +struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages); + +/* Destroy remote registration window */ +void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window); + +int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window); + +/* Prepare a remote registration window */ +int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window); + +/* Create remote lookup entries for physical addresses */ +int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window); + +/* Destroy remote lookup entries for physical addresses */ +void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window); + +/* Send a SCIF_REGISTER message and wait for an ACK */ +int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window); + +/* Send a SCIF_UNREGISTER message */ +int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window); + +/* RMA copy API */ +int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len, + off_t roffset, int flags, enum rma_direction dir, bool last_chunk); + +/* Sends a remote fence mark request */ +int micscif_send_fence_mark(scif_epd_t epd, int *out_mark); + +/* Sends a remote fence wait request */ +int micscif_send_fence_wait(scif_epd_t epd, int mark); + +/* Sends a remote fence signal request */ +int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval, + off_t loff, uint64_t lval, int flags); + +/* Setup a DMA mark for an endpoint */ +int micscif_fence_mark(scif_epd_t epd); + +void ep_unregister_mmu_notifier(struct endpt *ep); +#ifdef CONFIG_MMU_NOTIFIER +void micscif_mmu_notif_handler(struct work_struct *work); +#endif + +void micscif_rma_destroy_temp_windows(void); +void micscif_rma_destroy_tcw_ep(struct endpt *ep); +void micscif_rma_destroy_tcw_invalid(struct list_head *list); + +void micscif_rma_handle_remote_fences(void); + +/* Reserve a DMA channel for a particular endpoint */ +int micscif_reserve_dma_chan(struct endpt *ep); + +/* Program DMA SUD's after verifying the registered offset */ +int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val, + enum rma_window_type type); + +/* Kill any applications which have valid remote memory mappings */ +void micscif_kill_apps_with_mmaps(int node); + +/* Query if any applications have remote memory mappings */ +bool 
micscif_rma_do_apps_have_mmaps(int node); + +/* Get a reference to the current task which is creating a remote memory mapping */ +int micscif_rma_get_task(struct endpt *ep, int nr_pages); + +/* Release a reference to the current task which is destroying a remote memory mapping */ +void micscif_rma_put_task(struct endpt *ep, int nr_pages); + +/* Cleanup remote registration lists for zombie endpoints */ +void micscif_cleanup_rma_for_zombies(int node); + +#ifdef _MIC_SCIF_ +void micscif_teardown_proxy_dma(struct endpt *ep); +#endif + +static __always_inline +bool is_unaligned(off_t src_offset, off_t dst_offset) +{ + src_offset = src_offset & (L1_CACHE_BYTES - 1); + dst_offset = dst_offset & (L1_CACHE_BYTES - 1); + if (src_offset == dst_offset) + return false; + else + return true; +} + +static __always_inline +int __scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int err; + + pr_debug("SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx" + " offset 0x%lx flags 0x%x\n", + epd, loffset, len, roffset, flags); + + if (is_unaligned(loffset, roffset)) { + while(len > MAX_UNALIGNED_BUF_SIZE) { + err = micscif_rma_copy(epd, loffset, NULL, + MAX_UNALIGNED_BUF_SIZE, + roffset, flags, REMOTE_TO_LOCAL, false); + if (err) + goto readfrom_err; + loffset += MAX_UNALIGNED_BUF_SIZE; + roffset += MAX_UNALIGNED_BUF_SIZE; + len -=MAX_UNALIGNED_BUF_SIZE; + } + } + err = micscif_rma_copy(epd, loffset, NULL, len, + roffset, flags, REMOTE_TO_LOCAL, true); +readfrom_err: + return err; +} + +static __always_inline +int __scif_writeto(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int err; + + pr_debug("SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx" + " roffset 0x%lx flags 0x%x\n", + epd, loffset, len, roffset, flags); + + if (is_unaligned(loffset, roffset)) { + while(len > MAX_UNALIGNED_BUF_SIZE) { + err = micscif_rma_copy(epd, loffset, NULL, + MAX_UNALIGNED_BUF_SIZE, + roffset, flags, LOCAL_TO_REMOTE, false); + if (err) + goto writeto_err; + loffset += MAX_UNALIGNED_BUF_SIZE; + roffset += MAX_UNALIGNED_BUF_SIZE; + len -= MAX_UNALIGNED_BUF_SIZE; + } + } + err = micscif_rma_copy(epd, loffset, NULL, len, + roffset, flags, LOCAL_TO_REMOTE, true); +writeto_err: + return err; +} + +static __always_inline +int __scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags) +{ + int err; + + pr_debug("SCIFAPI vreadfrom: ep %p addr %p len 0x%lx" + " roffset 0x%lx flags 0x%x\n", + epd, addr, len, roffset, flags); + + if (is_unaligned((off_t)addr, roffset)) { + if (len > MAX_UNALIGNED_BUF_SIZE) + flags &= ~SCIF_RMA_USECACHE; + + while(len > MAX_UNALIGNED_BUF_SIZE) { + err = micscif_rma_copy(epd, 0, addr, + MAX_UNALIGNED_BUF_SIZE, + roffset, flags, REMOTE_TO_LOCAL, false); + if (err) + goto vreadfrom_err; + addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE); + roffset += MAX_UNALIGNED_BUF_SIZE; + len -= MAX_UNALIGNED_BUF_SIZE; + } + } + err = micscif_rma_copy(epd, 0, addr, len, + roffset, flags, REMOTE_TO_LOCAL, true); +vreadfrom_err: + return err; +} + +static __always_inline +int __scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags) +{ + int err; + + pr_debug("SCIFAPI vwriteto: ep %p addr %p len 0x%lx" + " roffset 0x%lx flags 0x%x\n", + epd, addr, len, roffset, flags); + + if (is_unaligned((off_t)addr, roffset)) { + if (len > MAX_UNALIGNED_BUF_SIZE) + flags &= ~SCIF_RMA_USECACHE; + + while(len > MAX_UNALIGNED_BUF_SIZE) { + err = micscif_rma_copy(epd, 0, addr, + MAX_UNALIGNED_BUF_SIZE, + roffset, flags, 
LOCAL_TO_REMOTE, false); + if (err) + goto vwriteto_err; + addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE); + roffset += MAX_UNALIGNED_BUF_SIZE; + len -= MAX_UNALIGNED_BUF_SIZE; + } + } + err = micscif_rma_copy(epd, 0, addr, len, + roffset, flags, LOCAL_TO_REMOTE, true); +vwriteto_err: + return err; +} + +void micscif_rma_completion_cb(uint64_t data); + +int micscif_pci_dev(uint16_t node, struct pci_dev **pdev); +#ifndef _MIC_SCIF_ +int micscif_pci_info(uint16_t node, struct scif_pci_info *dev); +#endif + +/* + * nr_pages in a 2MB page is specified via the top 12 bits in the + * physical address. + */ + +/* Check all parenthesis in these macros. See if putting in bottom makes sense? */ +#define RMA_HUGE_NR_PAGE_SHIFT ((52)) +#define RMA_HUGE_NR_PAGE_MASK (((0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT)) +#define RMA_GET_NR_PAGES(addr) ((addr) >> RMA_HUGE_NR_PAGE_SHIFT) +#define RMA_SET_NR_PAGES(addr, nr_pages) ((addr) = (((nr_pages) & 0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT) | ((uint64_t)(addr))) +#define RMA_GET_ADDR(addr) ((addr) & ~(RMA_HUGE_NR_PAGE_MASK)) + +extern bool mic_huge_page_enable; + +#define SCIF_HUGE_PAGE_SHIFT 21 + +/* + * micscif_is_huge_page: + * @page: A physical page. + */ +static __always_inline int +micscif_is_huge_page(struct scif_pinned_pages *pinned_pages, int index) +{ + int huge = 0; + struct page *page = pinned_pages->pages[index]; + + if (compound_order(page) + PAGE_SHIFT == SCIF_HUGE_PAGE_SHIFT) + huge = 1; + if (huge) + ms_info.nr_2mb_pages++; + if (!mic_huge_page_enable) + huge = 0; +#ifdef RMA_DEBUG + WARN_ON(!page_count(page)); + WARN_ON(page_mapcount(page) < 0); +#endif + return huge; +} + +/* + * micscif_detect_large_page: + * @pinned_pages: A set of pinned pages. + */ +static __always_inline int +micscif_detect_large_page(struct scif_pinned_pages *pinned_pages, char *addr) +{ + int i = 0, nr_pages, huge; + char *next_huge, *end; + char *end_addr = addr + (pinned_pages->nr_pages << PAGE_SHIFT); + + while (addr < end_addr) { + huge = micscif_is_huge_page(pinned_pages, i); + if (huge) { + next_huge = (char *)ALIGN( + (unsigned long)(addr + 1), + PMD_SIZE); + end = next_huge > end_addr ? end_addr : next_huge; + nr_pages = (int)((end - addr) >> PAGE_SHIFT); + pinned_pages->num_pages[i] = (int)nr_pages; + addr = end; + i += (int)nr_pages; + + } else { + pinned_pages->num_pages[i] = 1; + i++; + addr += PAGE_SIZE; + ms_info.nr_4k_pages++; + } + pinned_pages->nr_contig_chunks++; + } + return 0; +} + +/** + * micscif_set_nr_pages: + * @ep: end point + * @window: self registration window + * + * Set nr_pages in every entry of physical address/dma address array + * and also remove nr_pages information from physical addresses. + */ +static __always_inline void +micscif_set_nr_pages(struct micscif_dev *dev, struct reg_range_t *window) +{ + int j; +#ifdef CONFIG_ML1OM + int l = 0, k; +#endif + + for (j = 0; j < window->nr_contig_chunks; j++) { + window->num_pages[j] = RMA_GET_NR_PAGES(window->dma_addr[j]); + if (window->num_pages[j]) + window->dma_addr[j] = RMA_GET_ADDR(window->dma_addr[j]); + else + break; +#ifdef CONFIG_ML1OM + for (k = 0; k < window->num_pages[j]; k++) + if (window->temp_phys_addr[j]) + window->phys_addr[l + k] = + RMA_GET_ADDR(window->temp_phys_addr[j]) + (k << PAGE_SHIFT); + l += window->num_pages[j]; +#endif + } +} + +#ifdef CONFIG_ML1OM +/* + * micscif_get_phys_addr: + * Obtain the phys_addr given the window and the offset. + * @window: Registered window. + * @off: Window offset. 
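The RMA_*_NR_PAGES macros above pack the page count of a contiguous chunk into bits 63:52 of a DMA address, as the comment notes. A stand-alone sketch (illustrative values) showing the round trip:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the macros above: the chunk's page count lives in bits 63:52. */
#define RMA_HUGE_NR_PAGE_SHIFT 52
#define RMA_HUGE_NR_PAGE_MASK  (0xFFFULL << RMA_HUGE_NR_PAGE_SHIFT)
#define RMA_GET_NR_PAGES(addr) ((addr) >> RMA_HUGE_NR_PAGE_SHIFT)
#define RMA_SET_NR_PAGES(addr, nr_pages) \
    ((addr) = (((nr_pages) & 0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT) | (uint64_t)(addr))
#define RMA_GET_ADDR(addr)     ((addr) & ~(RMA_HUGE_NR_PAGE_MASK))

int main(void)
{
    uint64_t dma_addr = 0x76543000ULL;   /* illustrative address */

    RMA_SET_NR_PAGES(dma_addr, 512);     /* one 2 MB chunk = 512 x 4 KB pages */
    printf("packed   0x%llx\n", (unsigned long long)dma_addr);
    printf("nr_pages %llu\n",  (unsigned long long)RMA_GET_NR_PAGES(dma_addr));
    printf("addr     0x%llx\n", (unsigned long long)RMA_GET_ADDR(dma_addr));
    return 0;
}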
+ */ +static __always_inline dma_addr_t +micscif_get_phys_addr(struct reg_range_t *window, uint64_t off) +{ + int page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + return window->phys_addr[page_nr] | page_off; +} +#endif + +#define RMA_ERROR_CODE (~(dma_addr_t)0x0) + +/* + * micscif_get_dma_addr: + * Obtain the dma_addr given the window and the offset. + * @window: Registered window. + * @off: Window offset. + * @nr_bytes: Return the number of contiguous bytes till next DMA addr index. + * @index: Return the index of the dma_addr array found. + * @start_off: start offset of index of the dma addr array found. + * The nr_bytes provides the callee an estimate of the maximum possible + * DMA xfer possible while the index/start_off provide faster lookups + * for the next iteration. + */ +static __always_inline dma_addr_t +micscif_get_dma_addr(struct reg_range_t *window, uint64_t off, size_t *nr_bytes, int *index, uint64_t *start_off) +{ + if (window->nr_pages == window->nr_contig_chunks) { + int page_nr = (int)((off - window->offset) >> PAGE_SHIFT); + off_t page_off = off & ~PAGE_MASK; + if (nr_bytes) + *nr_bytes = PAGE_SIZE - page_off; + if (page_nr >= window->nr_pages) { + printk(KERN_ERR "%s dma_addr out of boundary\n", __FUNCTION__); + } + return window->dma_addr[page_nr] | page_off; + } else { + int i = index ? *index : 0; + uint64_t end; + uint64_t start = start_off ? *start_off : window->offset; + for (; i < window->nr_contig_chunks; i++) { + end = start + (window->num_pages[i] << PAGE_SHIFT); + if (off >= start && off < end) { + if (index) + *index = i; + if (start_off) + *start_off = start; + if (nr_bytes) + *nr_bytes = end - off; + return (window->dma_addr[i] + (off - start)); + } + start += (window->num_pages[i] << PAGE_SHIFT); + } + } +#ifdef CONFIG_MK1OM + printk(KERN_ERR "%s %d BUG. Addr not found? window %p off 0x%llx\n", __func__, __LINE__, window, off); + BUG_ON(1); +#endif + return RMA_ERROR_CODE; +} + +/* + * scif_memset: + * @va: kernel virtual address + * @c: The byte used to fill the memory + * @size: Buffer size + * + * Helper API which fills size bytes of memory pointed to by va with the + * constant byte c. This API fills the memory in chunks of 4GB - 1 bytes + * for a single invocation of memset(..) to work around a kernel bug in + * x86_64 @ https://bugzilla.kernel.org/show_bug.cgi?id=27732 + * where memset(..) does not do "ANY" work for size >= 4GB. + * This kernel bug has been fixed upstream in v3.2 via the commit + * titled "x86-64: Fix memset() to support sizes of 4Gb and above" + * but has not been backported to distributions like RHEL 6.3 yet. + */ +static __always_inline void scif_memset(char *va, int c, size_t size) +{ + size_t loop_size; + const size_t four_gb = 4 * 1024 * 1024 * 1024ULL; + + while (size) { + loop_size = min(size, four_gb - 1); + memset(va, c, loop_size); + size -= loop_size; + va += loop_size; + } +} + +/* + * scif_zalloc: + * @size: Size of the allocation request. + * + * Helper API which attempts to allocate zeroed pages via + * __get_free_pages(..) first and then falls back on + * vmalloc(..) if that fails. This is required because + * vmalloc(..) is *slow*. 
+ */ +static __always_inline void *scif_zalloc(size_t size) +{ + void *ret; + size_t align = ALIGN(size, PAGE_SIZE); + + if (!align) + return NULL; + + if (align <= (1 << (MAX_ORDER + PAGE_SHIFT - 1))) + if ((ret = (void*)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(align)))) + goto done; + if (!(ret = vmalloc(align))) + return NULL; + + /* TODO: Use vzalloc once kernel supports it */ + scif_memset(ret, 0, size); +done: +#ifdef RMA_DEBUG + atomic_long_add_return(align, &ms_info.rma_alloc_cnt); +#endif + return ret; +} + +/* + * scif_free: + * @addr: Address to be freed. + * @size: Size of the allocation. + * Helper API which frees memory allocated via scif_zalloc(). + */ +static __always_inline void scif_free(void *addr, size_t size) +{ + size_t align = ALIGN(size, PAGE_SIZE); + + if (unlikely(is_vmalloc_addr(addr))) + vfree(addr); + else { + free_pages((unsigned long)addr, get_order(align)); + } +#ifdef RMA_DEBUG + WARN_ON(atomic_long_sub_return(align, &ms_info.rma_alloc_cnt) < 0); +#endif +} + +static __always_inline void +get_window_ref_count(struct reg_range_t *window, int64_t nr_pages) +{ + window->ref_count += (int)nr_pages; +} + +static __always_inline void +put_window_ref_count(struct reg_range_t *window, int64_t nr_pages) +{ + window->ref_count -= (int)nr_pages; + BUG_ON(window->nr_pages < 0); +} + +static __always_inline void +set_window_ref_count(struct reg_range_t *window, int64_t nr_pages) +{ + window->ref_count = (int)nr_pages; +} + +/* Debug API's */ +void micscif_display_window(struct reg_range_t *window, const char *s, int line); +static inline struct mm_struct *__scif_acquire_mm(void) +{ + if (mic_ulimit_check) { +#ifdef RMA_DEBUG + atomic_long_add_return(1, &ms_info.rma_mm_cnt); +#endif + return get_task_mm(current); + } + return NULL; +} + +static inline void __scif_release_mm(struct mm_struct *mm) +{ + if (mic_ulimit_check && mm) { +#ifdef RMA_DEBUG + WARN_ON(atomic_long_sub_return(1, &ms_info.rma_mm_cnt) < 0); +#endif + mmput(mm); + } +} + +static inline int __scif_dec_pinned_vm_lock(struct mm_struct *mm, + int64_t nr_pages, bool try_lock) +{ + if (mm && nr_pages && mic_ulimit_check) { + if (try_lock) { + if (!down_write_trylock(&mm->mmap_sem)) { + return -1; + } + } else { + down_write(&mm->mmap_sem); + } +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)) + mm->pinned_vm -= nr_pages; +#else + mm->locked_vm -= nr_pages; +#endif + up_write(&mm->mmap_sem); + } + return 0; +} + +static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, + int64_t nr_pages) +{ + if (mm && mic_ulimit_check && nr_pages) { + unsigned long locked, lock_limit; + locked = nr_pages; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)) + locked += mm->pinned_vm; +#else + locked += mm->locked_vm; +#endif + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + pr_debug("locked(%lu) > lock_limit(%lu)\n", + locked, lock_limit); + return -ENOMEM; + } else { +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)) + mm->pinned_vm = locked; +#else + mm->locked_vm = locked; +#endif + } + } + return 0; +} +#endif diff --git a/include/mic/micscif_rma_list.h b/include/mic/micscif_rma_list.h new file mode 100644 index 0000000..c7f25ed --- /dev/null +++ b/include/mic/micscif_rma_list.h @@ -0,0 +1,151 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
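scif_zalloc() and scif_free() above must be used as a pair: the allocation may come from the page allocator or fall back to vmalloc, and the free path checks is_vmalloc_addr() and needs the same size to recompute the page order. A hedged sketch with a hypothetical array allocation:

/* Illustrative kernel-side sketch only -- not part of the patch. */
static dma_addr_t *example_alloc_table(size_t nr_entries)
{
    /* Zeroed allocation; may be backed by pages or by vmalloc. */
    return scif_zalloc(nr_entries * sizeof(dma_addr_t));
}

static void example_free_table(dma_addr_t *table, size_t nr_entries)
{
    /* Must pass the same size so the page-order math matches the allocation. */
    scif_free(table, nr_entries * sizeof(dma_addr_t));
}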
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICSCIF_RMA_LIST_H +#define MICSCIF_RMA_LIST_H + +/* + * RMA Linked List Manipulation API's. + * Callee must Hold RMA lock to call the API's below. + * When and if RMA uses RB trees for log(n) search, + * similar API's should be implemented. + */ + +/* + * Specifies whether an RMA operation can span + * across partial windows, a single window or multiple + * contiguous windows. + * Mmaps can span across partial windows. + * Unregistration can span across complete windows. + * scif_get_pages() can span a single window. + */ +enum range_request { + WINDOW_PARTIAL, + WINDOW_SINGLE, + WINDOW_FULL +}; + +/* Self Registration list RMA Request query */ +struct micscif_rma_req { + struct reg_range_t **out_window; + uint64_t offset; + size_t nr_bytes; + int prot; + enum range_request type; + struct list_head *head; + void *va_for_temp; +}; + +/** + * struct mic_copy_work: + * + * Work for DMA copy thread is provided by allocating and preparing + * struct mic_copy_work and calling mic_enqueue_copy_work. + */ +struct mic_copy_work { + uint64_t src_offset; + + uint64_t dst_offset; + + /* Starting src registered window */ + struct reg_range_t *src_window; + + /* Starting dst registered window */ + struct reg_range_t *dst_window; + + /* Is this transfer a loopback transfer? */ + int loopback; + + size_t len; + /* DMA copy completion callback.
Details in mic_dma_lib.h */ + struct dma_completion_cb *comp_cb; + + struct micscif_dev *remote_dev; + + /* DO_DMA_POLLING or DO_DMA_INTR or none */ + int fence_type; + + bool ordered; + +#ifdef CONFIG_ML1OM + /* GTT map state */ + enum micscif_msg_state gttmap_state; + + /* Wait Queue for a GTT map (N)ACK */ + wait_queue_head_t gttmapwq; + + uint64_t gtt_offset; + + uint64_t gtt_length; + +#endif + bool dma_chan_released; + struct list_head list_member; +}; + +/* Insert */ +void micscif_insert_window(struct reg_range_t *window, struct list_head *head); +void micscif_insert_tcw(struct reg_range_t *window, + struct list_head *head); + +/* Query */ +int micscif_query_window(struct micscif_rma_req *request); +int micscif_query_tcw(struct endpt *ep, struct micscif_rma_req *request); + +/* Called from close to unregister all self windows */ +int micscif_unregister_all_windows(scif_epd_t epd); + +/* Traverse list and munmap */ +void micscif_rma_list_munmap(struct reg_range_t *window, uint64_t offset, int nr_pages); +/* Traverse list and mmap */ +int micscif_rma_list_mmap(struct reg_range_t *start_window, + uint64_t offset, int nr_pages, struct vm_area_struct *vma); +/* Traverse list and unregister */ +int micscif_rma_list_unregister(struct reg_range_t *window, uint64_t offset, int nr_pages); + +/* CPU copy */ +int micscif_rma_list_cpu_copy(struct mic_copy_work *work); + +/* Traverse remote RAS and ensure none of the get_put_ref_counts are +ve */ +int micscif_rma_list_get_pages_check(struct endpt *ep); + +/* Debug API's */ +void micscif_display_all_windows(struct list_head *head); + +int micscif_rma_list_dma_copy_wrapper(struct endpt *epd, struct mic_copy_work *work, struct dma_channel *chan, off_t loffset); + +void micscif_rma_local_cpu_copy(uint64_t offset, struct reg_range_t *window, uint8_t *temp, size_t remaining_len, bool to_temp); + +#endif /* MICSCIF_RMA_LIST_H */ diff --git a/include/mic/micscif_smpt.h b/include/mic/micscif_smpt.h new file mode 100644 index 0000000..7c3c0f9 --- /dev/null +++ b/include/mic/micscif_smpt.h @@ -0,0 +1,120 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. 
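As a hedged illustration of the micscif_rma_list.h query API above: a caller that already holds the RMA lock fills a micscif_rma_req and lets micscif_query_window() walk the registration list. The list head is passed in explicitly here because the endpoint field that holds it is not part of this header; the protection flags are only an example.

static int example_lookup_window(struct list_head *reg_list, uint64_t offset,
				 size_t nr_bytes, struct reg_range_t **out)
{
	struct micscif_rma_req req = {
		.out_window = out,
		.offset     = offset,
		.nr_bytes   = nr_bytes,
		.prot       = SCIF_PROT_READ | SCIF_PROT_WRITE,
		.type       = WINDOW_PARTIAL,	/* request may span partial windows */
		.head       = reg_list,
	};

	/* RMA lock must be held by the caller, per the comment above */
	return micscif_query_window(&req);
}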
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MIC_SMPT_H +#define MIC_SMPT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_BOARD_SUPPORTED 256 + +#define SNOOP_ON (0 << 0) +#define SNOOP_OFF (1 << 0) +#define NUM_SMPT_REGISTERS 32 +#define NUM_SMPT_ENTRIES_IN_USE 32 +#define SMPT_MASK 0x1F +#define MIC_SYSTEM_PAGE_SHIFT 34ULL +#define MIC_SYSTEM_PAGE_MASK ((1ULL << MIC_SYSTEM_PAGE_SHIFT) - 1ULL) + +struct _mic_ctx_t; +struct pci_dev; + +typedef struct mic_smpt { + dma_addr_t dma_addr; + int64_t ref_count; +} mic_smpt_t; + + +/* Sbox Smpt Reg Bits: + * Bits 31:2 Host address + * Bits 1 RSVD + * Bits 0 No snoop + */ +#define BUILD_SMPT(NO_SNOOP, HOST_ADDR) \ + (uint32_t)(((((HOST_ADDR)<< 2) & (~0x03)) | ((NO_SNOOP) & (0x01)))) + +bool is_syspa(dma_addr_t hostmic_pa); + +dma_addr_t mic_map(int bid, dma_addr_t dma_addr, size_t size); +void mic_unmap(int bid, dma_addr_t dma_addr, size_t size); + +dma_addr_t mic_map_single(int bid, struct pci_dev *hwdev, void *p, size_t size); +void mic_unmap_single(int bid, struct pci_dev *hwdev, dma_addr_t mic_addr, + size_t size); + +dma_addr_t mic_ctx_map_single(struct _mic_ctx_t *mic_ctx, void *p, size_t size); +void mic_ctx_unmap_single(struct _mic_ctx_t *mic_ctx, dma_addr_t dma_addr, + size_t size); + +dma_addr_t mic_to_dma_addr(int bid, dma_addr_t mic_addr); +void mic_smpt_set(volatile void *mm_sbox, uint64_t dma_addr, uint64_t index); + +static inline +bool mic_map_error(dma_addr_t mic_addr) +{ + return !mic_addr; +} +#endif // MIC_SMPT_H diff --git a/include/mic/micscif_va_gen.h b/include/mic/micscif_va_gen.h new file mode 100644 index 0000000..b1df13b --- /dev/null +++ b/include/mic/micscif_va_gen.h @@ -0,0 +1,86 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. 
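To make the SMPT layout above concrete: each of the 32 entries maps one 2^34-byte (16 GB) system page of host memory, and BUILD_SMPT() packs the page number into bits 31:2 alongside the no-snoop control in bit 0. The arithmetic below is only a sketch of that packing; real slot selection and ref-counting live in the SMPT code itself.

static uint32_t example_smpt_register(dma_addr_t host_pa, int no_snoop)
{
	/* Which 16 GB system page does this host address fall into? */
	uint64_t sys_page = host_pa >> MIC_SYSTEM_PAGE_SHIFT;

	/* Pack the page number and the snoop control bit, per the
	 * register layout comment above */
	return BUILD_SMPT(no_snoop ? SNOOP_OFF : SNOOP_ON, sys_page);
}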
Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* generate a virtual address for a given size */ + +#ifndef MICSCIF_VA_GEN_H +#define MICSCIF_VA_GEN_H + +#include "micscif_va_node.h" + +/* + * To avoid collisions with user applications trying to use + * MAP_FIXED with scif_register(), the following window address space + * allocation scheme is used. + * + * 1) (0) - (2 ^ 62 - 1)) + * Window Address Space that can be claimed using MAP_FIXED. + * 2) (2 ^ 62) - (2 ^ 63) - 1) + * Window address space used for allocations by the SCIF driver + * when MAP_FIXED is not passed. + */ +#define VA_GEN_MIN 0x4000000000000000 +#define VA_GEN_RANGE 0x3f00000000000000 + +#define INVALID_VA_GEN_ADDRESS 0xff00000000000000 +#define INVALID_VA_PAGE_INDEX 0xff00000000000 + +struct va_gen_addr { + struct va_node_allocator allocator; + uint32_t hole_list; + uint32_t claims_list; + uint64_t base; +}; + +/* + * return a base for the range + * caller trusted to keep track of both base and range + */ +uint64_t va_gen_alloc(struct va_gen_addr *addr, + uint64_t num_bytes, uint32_t align_bytes); + +/* Claim ownership of memory region. Fails if already occupied */ +uint64_t va_gen_claim(struct va_gen_addr *addr, + uint64_t address, uint64_t num_bytes); + +/* release ownership of a base/range */ +void va_gen_free(struct va_gen_addr *addr, + uint64_t address, uint64_t num_bytes); + +int va_gen_init(struct va_gen_addr *addr, uint64_t base, uint64_t range); + +void va_gen_destroy(struct va_gen_addr *addr); + +#endif diff --git a/include/mic/micscif_va_node.h b/include/mic/micscif_va_node.h new file mode 100644 index 0000000..659f62f --- /dev/null +++ b/include/mic/micscif_va_node.h @@ -0,0 +1,115 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. 
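A short sketch of driving the generator above with the constants from the comment; whether the driver seeds it with exactly these values, and whether allocation failure is reported as INVALID_VA_GEN_ADDRESS, are assumptions here.

static int example_va_gen(void)
{
	struct va_gen_addr gen;
	uint64_t off;
	int err;

	/* Manage the non-MAP_FIXED half of the window address space */
	if ((err = va_gen_init(&gen, VA_GEN_MIN, VA_GEN_RANGE)))
		return err;

	/* Page-aligned allocation out of the generator-managed range */
	off = va_gen_alloc(&gen, 2 * PAGE_SIZE, PAGE_SIZE);
	if (off == INVALID_VA_GEN_ADDRESS) {	/* assumed failure value */
		va_gen_destroy(&gen);
		return -ENOMEM;
	}

	va_gen_free(&gen, off, 2 * PAGE_SIZE);
	va_gen_destroy(&gen);
	return 0;
}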
Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* generate a virtual address for a given size */ +#ifndef MICSCIF_VA_NODE_H +#define MICSCIF_VA_NODE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define invalid_va_node_index ((uint32_t)(-1)) + +struct va_node { + uint32_t next; + uint64_t base; + uint64_t range; +}; + +struct va_node_allocator { + /* Emulated variable-size array + * is implemented as a sequence of fixed-sized slabs. + * SlabDirectory keeps the sequence. + * Slab is a contiguous block of nodes -- saves number of allocations + * when allocing a new slab of nodes, alloc this size + */ + uint32_t slab_shift; + uint32_t nodes_in_slab; + uint32_t slab_mask; + struct va_node **pp_slab_directory; + uint32_t num_slabs; + uint32_t num_free_slabs; + uint32_t free_list; +}; + +int va_node_is_valid(uint32_t index); + +/* + * get the node corresponding to a NodePtr + * We are emulating a variable-size array + */ +struct va_node *va_node_get(struct va_node_allocator *node, uint32_t index); + +/* returns an NodePtr to a free node */ +int va_node_alloc(struct va_node_allocator *node, uint32_t *out_alloc); + +/* put a node back into the free pool, by NodePtr */ +void va_node_free(struct va_node_allocator *node, uint32_t index); + +void va_node_init(struct va_node_allocator *node); + +void va_node_destroy(struct va_node_allocator *node); + +#endif diff --git a/include/mic/micvcons.h b/include/mic/micvcons.h new file mode 100644 index 0000000..26e60a5 --- /dev/null +++ b/include/mic/micvcons.h @@ -0,0 +1,164 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. 
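The slab-directory comment above implies a simple split of a node index into (slab, offset-within-slab) via slab_shift and slab_mask. The lookup below is a reading of that structure, not the actual va_node_get() implementation.

static struct va_node *example_node_lookup(struct va_node_allocator *a,
					   uint32_t index)
{
	uint32_t slab = index >> a->slab_shift;	/* which fixed-size slab */
	uint32_t node = index & a->slab_mask;	/* offset inside that slab */

	if (!va_node_is_valid(index) || slab >= a->num_slabs)
		return NULL;

	return &a->pp_slab_directory[slab][node];
}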
Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICVCONS_H +#define MICVCONS_H + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MICVCONS_DEVICE_NAME "ttyMIC" + +#define MICVCONS_BUF_SIZE PAGE_SIZE +#define MICDCONS_MAX_OUTPUT_BYTES 64 +#define MICVCONS_SHORT_TIMEOUT 100 +#define MICVCONS_MAX_TIMEOUT 500 + +#define MIC_VCONS_READY 0xc0de +#define MIC_VCONS_SLEEPING 0xd00d +#define MIC_VCONS_WAKINGUP 0xd12d +#define MIC_HOST_VCONS_READY 0xbad0 +#define MIC_VCONS_HOST_OPEN 0xbad1 +#define MIC_VCONS_RB_VER_ERR 0xbad2 + +#define MICVCONS_TIMER_RESTART 1 +#define MICVCONS_TIMER_SHUTDOWN 0 + +typedef struct micvcons { + int dc_enabled; + void *dc_hdr_virt; + void *dc_buf_virt; + dma_addr_t dc_hdr_dma_addr; + dma_addr_t dc_dma_addr; + uint32_t dc_size; +} micvcons_t; + +typedef struct micvcons_port { + struct board_info *dp_bdinfo; + struct micvcons *dp_vcons; + struct micscif_rb *dp_in; + struct micscif_rb *dp_out; + struct tty_struct *dp_tty; + struct list_head list_member; + /* + * work queue to schedule work that wakes up a sleeping card + * and read the data from the buffer. + */ + struct workqueue_struct *dp_wq; + struct work_struct dp_wakeup_read_buf; + + spinlock_t dp_lock; + struct mutex dp_mutex; + + volatile int dp_bytes; + volatile uint32_t dp_canread; + + volatile struct file *dp_reader; + volatile struct file *dp_writer; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + struct tty_port port; +#endif +} micvcons_port_t; + +/* vcons IPC layout */ +struct vcons_buf +{ + uint32_t host_magic; + uint32_t mic_magic; + + uint16_t host_rb_ver; + uint16_t mic_rb_ver; + + /* mic o/p buffer */ + dma_addr_t o_buf_dma_addr; /* host buf dma addr*/ + uint32_t o_wr; + uint32_t o_size; + + /* mic i/p buffer */ + uint64_t i_hdr_addr; /* mic hdr addr */ + uint64_t i_buf_addr; /* mic buf addr */ + uint32_t i_rd; + uint32_t i_size; +}; + +struct vcons_mic_header +{ + uint32_t o_rd; + uint32_t i_wr; + uint32_t host_status; +}; + +int micvcons_start(struct _mic_ctx_t *mic_ctx); +int micvcons_port_write(struct micvcons_port *port, const unsigned char *buf, + int count); +struct _mic_ctx_t; +void micvcons_stop(struct _mic_ctx_t *mic_ctx); +int micvcons_pm_disconnect_node(uint8_t *node_bitmask, enum disconn_type type); +#endif /* MICVCONS_H */ diff --git a/include/mic/micveth.h b/include/mic/micveth.h new file mode 100644 index 0000000..c4e65a6 --- /dev/null +++ b/include/mic/micveth.h @@ -0,0 +1,145 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICVETH_H +#define MICVETH_H + +#include "micveth_dma.h" + +#include "micint.h" +#include "micveth_common.h" + +#define MICVETH_MAX_PACKET_SIZE (63 * 1024) +#define MICVETH_TRANSFER_FIFO_SIZE 128 + +#define MICVETH_LINK_UP_MAGIC 0x1A77ABEE +#define MICVETH_LINK_DOWN_MAGIC 0x1DEADBEE + +#define MICVETH_POLL_TIMER_DELAY 1 +#define MICVETH_CLIENT_TIMER_DELAY 10 + +typedef struct ring_packet { + struct sk_buff *pd_skb; + uint64_t pd_phys; + uint64_t pd_length; +} ring_packet_t; + +typedef struct ring_desc { + uint64_t rd_phys; + uint64_t rd_length; + uint32_t rd_valid; +} ring_desc_t; + +typedef struct ring_queue { + uint32_t rq_head; + uint32_t rq_tail; + uint32_t rq_length; + ring_desc_t rq_descs[MICVETH_TRANSFER_FIFO_SIZE]; +} ring_queue_t; + +typedef struct ring { + ring_queue_t r_tx; + ring_queue_t r_rx; +} veth_ring_t; + +#define VETH_STATE_INITIALIZED 0 +#define VETH_STATE_LINKUP 1 +#define VETH_STATE_LINKDOWN 2 + + +typedef struct micveth_info { + struct pci_dev *vi_pdev; + struct net_device *vi_netdev; + uint8_t *vi_sbox; + uint8_t *vi_dbox; + uint32_t *vi_scratch14; + uint32_t *vi_scratch15; + mic_ctx_t *mic_ctx; + volatile uint32_t vi_state; + uint32_t vi_skb_mtu; + + struct delayed_work vi_poll; + + struct workqueue_struct *vi_wq; + char vi_wqname[16]; + struct work_struct vi_bh; + struct work_struct vi_txws; + + spinlock_t vi_rxlock; + spinlock_t vi_txlock; + + struct { + veth_ring_t ring; + uint64_t phys; + uint64_t length; + } vi_ring; + + veth_ring_t *ring_ptr; + + ring_packet_t vi_tx_desc[MICVETH_TRANSFER_FIFO_SIZE]; + ring_packet_t vi_rx_desc[MICVETH_TRANSFER_FIFO_SIZE]; + uint32_t vi_pend; +} micveth_info_t; + +enum { + CLIENT_POLL_STOPPED, + CLIENT_POLL_RUNNING, + CLIENT_POLL_STOPPING, +}; + +typedef struct micveth { + int 
lv_num_interfaces; + int lv_num_clients; + int lv_active_clients; + int lv_num_links_remaining; + micveth_info_t *lv_info; + + struct mutex lv_state_mutex; + + uint32_t lv_pollstate; + struct delayed_work lv_poll; + wait_queue_head_t lv_wq; + +} micveth_t; + +int micveth_init(struct device *dev); +int micveth_init_legacy(int num_bds, struct device *dev); +void micveth_exit(void); +int micveth_probe(mic_ctx_t *mic_ctx); +void micveth_remove(mic_ctx_t *mic_ctx); +int micveth_start(mic_ctx_t *mic_ctx); +void micveth_stop(mic_ctx_t *mic_ctx); + +#endif /* MICVETH_H */ diff --git a/include/mic/micveth_common.h b/include/mic/micveth_common.h new file mode 100644 index 0000000..5df0afb --- /dev/null +++ b/include/mic/micveth_common.h @@ -0,0 +1,69 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICVETHCOMMON_H +#define MICVETHCOMMON_H + +#ifndef ETH_HLEN +#define ETH_HLEN 14 +#endif + +typedef enum micvnet_state { + MICVNET_STATE_UNDEFINED, + MICVNET_STATE_UNINITIALIZED, + MICVNET_STATE_LINKUP, + MICVNET_STATE_LINK_DOWN, + MICVNET_STATE_BEGIN_UNINIT, + MICVNET_STATE_TRANSITIONING, +}micvnet_state; + + +/* + * Fancy way of defining an enumeration and the mapping between them and + * the module parameter--they're guaranteed to be in sync this way. + */ +#define VNET_MODES \ + __VNET_MODE(POLL, poll) \ + __VNET_MODE(INTR, intr) \ + __VNET_MODE(DMA, dma) \ + /* end */ +#define __VNET_MODE(u, l) VNET_MODE_##u , +enum { VNET_MODES }; +#undef __VNET_MODE + +extern char *mic_vnet_modes[]; +extern int mic_vnet_mode; + +#endif /* MICVETHCOMMON_H */ diff --git a/include/mic/micveth_dma.h b/include/mic/micveth_dma.h new file mode 100644 index 0000000..d48598f --- /dev/null +++ b/include/mic/micveth_dma.h @@ -0,0 +1,279 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
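The VNET_MODES macro above is an X-macro: each __VNET_MODE(u, l) expansion contributes one upper-case enum constant and one lower-case module-parameter string, so the two lists cannot drift apart. The companion string table is presumably generated the same way in a .c file, roughly:

/* Re-expand VNET_MODES with a string-producing __VNET_MODE to build the
 * table indexed by VNET_MODE_POLL / VNET_MODE_INTR / VNET_MODE_DMA */
#define __VNET_MODE(u, l) #l ,
char *mic_vnet_modes[] = { VNET_MODES };	/* { "poll", "intr", "dma" } */
#undef __VNET_MODE

so that mic_vnet_modes[mic_vnet_mode] names the currently selected mode.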
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICVETH_DMA_H +#define MICVETH_DMA_H + +#include +#include "micint.h" + +#include "mic_common.h" +#include "mic_dma_lib.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + Define this if only DMA mode is supported without legacy POLL/INTR modes + (i.e if only micveth_dma.c is included in the host/card side drivers, i.e + when linvnet.c is excluded from host side driver and micveth.c from card + side driver). This will ensure that other global symbols which are at + present common with legacy modes (in linvnet.c/micveth.c) are all included + in micveth_dma.c. +*/ +#undef STANDALONE_VNET_DMA + +/*******************************************************/ +#define MICVNET_MSG_RB_SIZE 128 +#define DMA_ALIGNMENT L1_CACHE_BYTES +#define VNET_MAX_SKBS 62 + +/* The maximum total number of outstanding messages possible in the current + implementation is 2 * VNET_MAX_SKBS + 1. 
*/ +#if (MICVNET_MSG_RB_SIZE < 2 * VNET_MAX_SKBS + 2) +#error "MICVNET_MSG_RB_SIZE should be at least (2 * VNET_MAX_SKBS + 2)" +#endif + +#if (MICVNET_MSG_RB_SIZE & (MICVNET_MSG_RB_SIZE - 1)) +#error "MICVNET_MSG_RB_SIZE should be power of 2" +#endif + +enum micvnet_msg_id { + MICVNET_MSG_ADD_DMA_BUFFER, + MICVNET_MSG_DMA_COMPLETE, + MICVNET_MSG_LINK_DOWN, + MICVNET_MSG_LINK_UP, +}; + +struct micvnet_msg_add_dma_buffer { + uint64_t buf_phys; + uint64_t buf_size; +}; + +struct micvnet_msg_dma_complete { + uint64_t dst_phys; + uint64_t size; + uint64_t dma_offset; +}; + +#define VNET_DRIVER_VERSION 1 +struct micvnet_msg_link_up { + uint64_t vnet_driver_version; +}; + +union micvnet_msg_body { + struct micvnet_msg_add_dma_buffer micvnet_msg_add_dma_buffer; + struct micvnet_msg_dma_complete micvnet_msg_dma_complete; + struct micvnet_msg_link_up micvnet_msg_link_up; +}; + +struct micvnet_msg { + uint64_t msg_id; + union micvnet_msg_body body; +}; + +struct micvnet_msg_rb { + struct micvnet_msg buf[MICVNET_MSG_RB_SIZE]; + volatile uint32_t head; + volatile uint32_t tail; + uint32_t size; + volatile uint32_t prev_head; + volatile uint32_t prev_tail; +}; + +struct micvnet_msg_ring_pair { + struct micvnet_msg_rb rb_tx; + struct micvnet_msg_rb rb_rx; +}; + +struct micvnet_msg_qp { + struct micvnet_msg_rb *tx; + struct micvnet_msg_rb *rx; +}; + +/*******************************************************/ + +/* Restict micvnet mtu to 63K because ping does not work on RHEL 6.3 with 64K + MTU - HSD [4118026] */ +#define MICVNET_MAX_MTU (63 * 1024) +#define MICVNET_CARD_UP_MAGIC 0x1A77BBEE + +struct rx_node { + struct list_head list; + struct sk_buff *skb; + uint64_t phys; + uint64_t size; +}; + +struct dma_node { + struct list_head list; + uint64_t phys; + uint64_t size; +}; + +struct tx_node { + struct list_head list; + struct sk_buff *skb; +}; + +struct sched_node { + struct list_head list; + struct sk_buff *skb; + unsigned char *skb_data_aligned; + uint64_t dma_src_phys; + uint64_t dma_size; + uint64_t dma_offset; + uint64_t dst_phys; +}; + +struct obj_list { + char *buf; + int size; + size_t obj_size; + volatile uint32_t head; + volatile uint32_t tail; +}; + +struct micvnet_info { + struct pci_dev *vi_pdev; + struct net_device *vi_netdev; + uint8_t *vi_sbox; + uint8_t *vi_dbox; + uint32_t *vi_scratch14; + mic_ctx_t *mic_ctx; + atomic_t vi_state; + + struct workqueue_struct *vi_wq; + char vi_wqname[16]; + struct work_struct vi_ws_bh; + struct work_struct vi_ws_tx; + struct work_struct vi_ws_dmacb; + struct work_struct vi_ws_link_down; + struct work_struct vi_ws_stop; + struct work_struct vi_ws_start; + + spinlock_t vi_rxlock; + spinlock_t vi_txlock; + +#ifdef HOST + struct micvnet_msg_ring_pair vi_rp; +#else + struct micvnet_msg_ring_pair *ring_ptr; +#endif + uint64_t vi_rp_phys; + struct micvnet_msg_qp vi_qp; + + struct obj_list dnode_list; + + struct list_head vi_rx_skb; + struct list_head vi_dma_buf; + struct list_head vi_tx_skb; + struct list_head vi_sched_skb; + + mic_dma_handle_t dma_handle; + struct dma_channel *dma_chan; + struct dma_completion_cb dma_cb; + atomic_t cnt_dma_complete; + + atomic_t cnt_dma_buf_avail; + bool link_down_initiator; + atomic_t cnt_tx_pending; + wait_queue_head_t stop_waitq; +}; + + +struct micvnet { + atomic_t lv_active_clients; + int created; +}; + +int micvnet_init(struct device *dev); +void micvnet_exit(void); +int micvnet_probe(mic_ctx_t *mic_ctx); +void micvnet_remove(mic_ctx_t *mic_ctx); +int micvnet_xmit(struct sk_buff *skb, struct net_device *dev); + +int 
micvnet_start(mic_ctx_t *mic_ctx); +void micvnet_stop(mic_ctx_t *mic_ctx); + +#ifndef HOST +int __init micvnet_module_init(void); +void __exit micvnet_module_exit(void); +#endif + +#ifdef STANDALONE_VNET_DMA +#define micveth_init micvnet_init +#define micveth_exit micvnet_exit +#define micveth_probe micvnet_probe +#define micveth_remove micvnet_remove +#define micveth_start micvnet_start +#define micveth_stop micvnet_stop +#endif + +extern int vnet_num_buffers; +#ifndef HOST +extern ulong vnet_addr; +#endif +#endif // MICVETH_DMA_H diff --git a/include/mic/ringbuffer.h b/include/mic/ringbuffer.h new file mode 100644 index 0000000..5fe81af --- /dev/null +++ b/include/mic/ringbuffer.h @@ -0,0 +1,195 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* +Description: This is a generic ring buffer implementation to be used by +anyone who needs a ring buffer. The ring buffer is maipulated +using Read and Write functions. These functions perform all of +the necessary space checks and only complete the operation if +if the requested number of items can be read or written. A +return value of false indicates that either the ring buffer +contains less then the requested number of items (for Read) or +there isn't enough space left in the ring buffer (for Write). 
+*/ + +#ifndef _MICHOST_RING_BUFFER_DEFINE + +#define _MICHOST_RING_BUFFER_DEFINE + +// +// Requirements: +// Ring base should be already aligned properly +// Ring size should be just multiple of the alignment size +// All packets should be at least multiple of 4 bytes for the purpose of padding +// + +#define RINGBUFFER_ALIGNMENT_SIZE 64 // in byte + +typedef struct _ringbuffer +{ + uint8_t *ringbuff_ptr; + volatile uint32_t *readptr; // Points to the read offset + volatile uint32_t *writeptr; // Points to the write offset + uint32_t ringbuffsize; + uint32_t curr_readoffset; // cache it to improve performance. + uint32_t curr_writeoffset; // cache it to improve performance. + uint32_t old_readoffset; + uint32_t old_writeoffset; +} ringbuffer; + +// Commands common across all ring buffers +typedef enum _rb_cmdopcode +{ + // note: don't use 0, because the ring buffer + // is initialized to a bunch of 0's that aren't really commands. + MIC_RBCT_ERROR = 0x0, // an error has occurred if encountered + MIC_RBCT_NOP, // Used to skip empty space in the ringbuffer. + MIC_RBCT_DMAEXEC, // DMA buffer to transfer/execute + MIC_RBCT_SHUTDOWN, // bus power-down eminent + MIC_RBCT_CREATESTDPROCESS, // Launches an executable on the ramdisk. + MIC_RBCT_CREATENATIVEPROCESS, // Launches a native process. + // NRFIX : not implemented. If native apps are launched by loading shared + // libraries(DLLs) into a standard stub app then this command goes away. + MIC_RBCT_DESTROYPROCESS, // Destroys a process. + MIC_RBCT_VIRTUALALLOC, // Creates a uOS virtual address range + MIC_RBCT_MAPHOSTMEMORY, // Used by implement host kernel mode driver services + MIC_RBCT_UNMAPHOSTMEMORY, // Unmaps host memory + MIC_RBCT_UOSESCAPE, // Used to pass uOS escapes from the host + MIC_RBCT_RESERVED1, // Reserved for future use + MIC_RBCT_RESERVED2, // Reserved for future use + MIC_RBCT_UPLOADSTDAPPLICATION, // Uploads a standard application to the uOS + MIC_RBCT_CREATEUOSRESOURCE, // Creates a DPT page cache + MIC_RBCT_DESTROYUOSRESOURCE, // Destroys a DPT page cache + MIC_RBCT_RESERVE_RING_BANDWIDTH_DBOX_TRAFFIC, // Reserves a ring bandwidth for DBOX traffic + + // Following commands are from MIC->Host (CRBT => CPU ring buffer.) + MIC_CRBT_LOG_INFO, // Host logs information sent by the uOS. + + // Always make these the last ones in the list +#if defined(MIC_DEBUG) || defined(ENABLE_GFXDEBUG) + MIC_RBCT_READPHYSICALMEMORY = 0x8000, // Used by debug tools to read memory on the device + MIC_RBCT_WRITEPHYSICALMEMORY, // Used by debug tools to write memory on the device +#endif // defined(MIC_DEBUG) || defined(ENABLE_GFXDEBUG) + MIC_RBCT_CMD_MAX // No valid OpCodes above this one +}ringbuff_cmdop; + +typedef struct _ringbuff_cmdhdr +{ + ringbuff_cmdop opcode:16; + uint32_t size:16; +}ringbuff_cmdhdr; + +#ifdef __cplusplus +extern "C" { +#endif + +//--------------------------------- +// methods used by both +//--------------------------------- +// initialize cached ring buffer structure +void rb_initialize(ringbuffer* ringbuff, volatile uint32_t* readptr, + volatile uint32_t* writeptr, void *buff, const uint32_t size); + +//--------------------------------- +// writer-only methods +//--------------------------------- +// write a new command. 
Must follow with fence/MMIO, then RingBufferCommit() +int rb_write(ringbuffer* ringbuff, ringbuff_cmdhdr* cmd_header); +// After write(), do an mfence(), an MMIO write to serialize, then Commit() +void rb_commit(ringbuffer* ringbuff); +// used on power state change to reset cached pointers +void rb_reset(ringbuffer* ringbuff); +// used to determine the largest possible command that could be sent next +uint32_t rb_get_available_space(ringbuffer* ringbuff); + +// TODO: It may be more optimal to have "Reserve" function exposed to the client +// instead of requiring it to create a command that will be copied into the ring buffer. + + +//--------------------------------- +// reader-only methods +//--------------------------------- +// uses (updates) the cached read pointer to get the next command, so writer doesn't +// see the command as consumed +ringbuff_cmdhdr* rb_get_next_cmd(ringbuffer* ringbuff); +// updates the control block read pointer, which will be visible to the writer so it +// can re-use the space +void rb_update_readptr(ringbuffer* ringbuff, ringbuff_cmdhdr* cmd_header); +// reader skips all commands, updating its next read offset +void rb_skip_to_offset(ringbuffer* ringbuff, uint32_t new_readptr); + +// uOS used this method to determine if RingBuffer is empty or not before attempting +// to fetch command out of ring buffer If ringbuffer is empty, means uOS would have +// fetched it earlier. +uint32_t rb_empty(ringbuffer* ringbuff); + +// only used by host simulator +void rb_sync(ringbuffer* ringbuff); + +#ifdef __cplusplus +} +#endif + +#ifdef __LINUX_GPL__ +//============================================================================== +// FUNCTION: AlignLow +// +// DESCRIPTION: Returns trunk(in_data / in_granularity) * in_granularity +// +// PARAMETERS: +// in_data - Data to be aligned +// in_granularity - Alignment chunk size - must be a power of 2 +#if defined(__cplusplus) +template +#else // no C++ +#define TData uint64_t +#endif // if C++ + +static inline TData AlignLow(TData in_data, uintptr_t in_granularity) +{ + TData mask = (TData)(in_granularity-1); // 64 -> 0x3f + + // floor to granularity + TData low = in_data & ~mask; + + return low; +} + +#if !defined(__cplusplus) +#undef TData +#endif // if no C++ +#endif // __LINUX_GPL_ + +#endif //_MICHOST_RING_BUFFER_DEFINE diff --git a/include/mic_common.h b/include/mic_common.h new file mode 100644 index 0000000..92554ad --- /dev/null +++ b/include/mic_common.h @@ -0,0 +1,769 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. 
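A hedged sketch of the writer-side sequence the ringbuffer.h comments above spell out: rb_write() the command, fence, perform the serializing MMIO write, then rb_commit() to publish the new write offset to the reader. The doorbell write is left as a placeholder.

static int example_rb_send_nop(ringbuffer *rb)
{
	ringbuff_cmdhdr hdr;

	hdr.opcode = MIC_RBCT_NOP;
	hdr.size   = sizeof(hdr);	/* a bare header is the whole command here */

	if (!rb_write(rb, &hdr))	/* false: not enough space in the ring */
		return -ENOSPC;

	smp_mb();			/* stands in for the mfence mentioned above */
	/* ... serializing MMIO write / doorbell to the card goes here ... */
	rb_commit(rb);			/* make the new write offset visible */
	return 0;
}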
This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#if !defined(__MIC_COMMON_H) +#define __MIC_COMMON_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef USE_VCONSOLE +#include +#endif +#include +#include +#include +#include +#include +#include + +#define GET_MAX(a, b) ( ((a) > (b)) ? (a) : (b) ) +#define GET_MIN(a, b) ( ((a) < (b)) ? (a) : (b) ) + +// System Interrupt Cause Read Register 0 +#define SBOX_SICR0_DBR(x) ((x) & 0xf) +#define SBOX_SICR0_DMA(x) (((x) >> 8) & 0xff) + +// System Interrupt Cause Enable Register 0 +#define SBOX_SICE0_DBR(x) ((x) & 0xf) +#define SBOX_SICE0_DBR_BITS(x) ((x) & 0xf) +#define SBOX_SICE0_DMA(x) (((x) >> 8) & 0xff) +#define SBOX_SICE0_DMA_BITS(x) (((x) & 0xff) << 8) + +// System Interrupt Cause Read Register 1 +#define SBOX_SICR1_SBOXERR(x) ((x) & 0x1) +#define SBOX_SICR1_SPIDONE(x) (((x) >> 4) & 0x1) + +// System Interrupt Cause Set Register 1 +#define SBOX_SICC1_SBOXERR(x) ((x) & 0x1) +#define SBOX_SICC1_SPIDONE(x) (((x) >> 4) & 0x1) + +// Offsets in the MMIO Range for register segments +#define HOST_DBOX_BASE_ADDRESS 0x00000000 +#define HOST_SBOX_BASE_ADDRESS 0x00010000 +#define HOST_GTT_BASE_ADDRESS 0x00040000 + +#define SCRATCH0_MEM_TEST_DISABLE(x) ((x) & 0x1) +#define SCRATCH0_MEM_USAGE(x) (((x) >> 1) & 0x3) +#define SCR0_MEM_ALL 0x0 +#define SCR0_MEM_HALF 0x1 +#define SCR0_MEM_THIRD 0x2 +#define SCR0_MEM_FOURTH 0x3 +#define SCRATCH0_MEM_SIZE_KB(x) ((x) >> 0x3) + +#define SCRATCH2_DOWNLOAD_STATUS(x) ((x) & 0x1) + +#define SCRATCH2_CLEAR_DOWNLOAD_STATUS(x) ((x) & ~0x1) +#define SCRATCH2_APIC_ID(x) (((x) >> 1) & 0x1ff) +#define SCRATCH2_DOWNLOAD_ADDR(x) ((x) & 0xfffff000) + +#define SCRATCH13_SUB_STEP(x) ((x) & 0xf) +#define SCRATCH13_STEP_ID(x) (((x) >> 4) & 0xf) +#define SCRATCH13_PLATFORM_ID(x) (((x) >> 18) & 0x3) + + +#define MEMVOLT_MEMVOLT(x) (((x) >>SHIFT_MEMVOLT) & MASK_MEMVOLT) +#define MEMFREQ_MEMFREQ(x) (((x) >>SHIFT_MEMORYFREQ) & MASK_MEMORYFREQ) +#define FAILSAFEOFFSET_FAILSAFE(x) (((x) >>SHIFT_FAIL_SAFE) & MASK_FAIL_SAFE) + +#define SCRATCH4_ACTIVE_CORES(x) (((x) >>SHIFT_ACTIVE_CORES) & MASK_ACTIVE_CORES) +#define SCRATCH0_MEMSIZE(x) (((x) >>SHIFT_MEMSIZE) & MASK_MEMSIZE) +#define SCRATCH7_FLASHVERSION(x) (((x) >>SHIFT_FLASHVERSION) & MASK_FLASHVERSION) +#define SCRATCH7_FUSECONFIGREV(x) (((x) >>SHIFT_FUSE_CONFIG_REV) & MASK_FUSE_CONFIG_REV) 
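The SICR0/SICE0 helpers above simply slice fields out of the 32-bit interrupt cause and enable registers; given a cause value read from the SBOX, the doorbell and DMA sources separate as sketched below (the register read itself and the logging are schematic).

static void example_decode_sicr0(uint32_t sicr0)
{
	uint32_t doorbells = SBOX_SICR0_DBR(sicr0);	/* bits 3:0  - doorbell sources */
	uint32_t dma_chans = SBOX_SICR0_DMA(sicr0);	/* bits 15:8 - DMA channel sources */

	if (doorbells & 0x1)
		pr_debug("doorbell 0 raised\n");
	if (dma_chans)
		pr_debug("DMA interrupt(s) pending: 0x%x\n", dma_chans);
}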
+#define SCRATCH13_MODEL(x) (((x) >>SHIFT_MODEL) & MASK_MODEL) +#define SCRATCH13_FAMILY_DATA(x) (((x) >>SHIFT_FAMILY_DATA) & MASK_FAMILY_DATA) +#define SCRATCH13_PROCESSOR(x) (((x) >>SHIFT_PROCESSOR) & MASK_PROCESSOR) +#define SCRATCH13_EXTENDED_MODEL(x) (((x) >>SHIFT_EXTENDED_MODEL) & MASK_EXTENDED_MODEL) +#define SCRATCH13_EXTENDED_FAMILY(x) (((x) >>SHIFT_EXTENDED_FAMILY) & MASK_EXTENDED_FAMILY) + + +#define DBOX_READ(mmio, offset) \ + readl((uint32_t*)((uint8_t*)(mmio) + (HOST_DBOX_BASE_ADDRESS + (offset)))) +#define DBOX_WRITE(value, mmio, offset) \ + writel((value), (uint32_t*)((uint8_t*)(mmio) + (HOST_DBOX_BASE_ADDRESS + (offset)))) + +#define SBOX_READ(mmio, offset) \ + readl((uint32_t*)((uint8_t*)(mmio) + (HOST_SBOX_BASE_ADDRESS + (offset)))) +#define SBOX_WRITE(value, mmio, offset) \ + writel((value), (uint32_t*)((uint8_t*)(mmio) + (HOST_SBOX_BASE_ADDRESS + (offset)))) + +#define SET_BUS_DEV_FUNC(bus, device, function, reg_offset) \ + (( bus << 16 ) | ( device << 11 ) | ( function << 8 ) | reg_offset) + +#define GTT_READ(mmio, offset) \ + readl((uint32_t*)((uint8_t*)(mmio) + (HOST_GTT_BASE_ADDRESS + (offset)))) +#define GTT_WRITE(value, mmio, offset) \ + writel((value), (uint32_t*)((uint8_t*)(mmio) + (HOST_GTT_BASE_ADDRESS + (offset)))) + + +#define ENABLE_MIC_INTERRUPTS(mmio) { \ + uint32_t sboxSice0reg = SBOX_READ((mmio), SBOX_SICE0); \ + sboxSice0reg |= SBOX_SICE0_DBR_BITS(0xf) | SBOX_SICE0_DMA_BITS(0xff); \ + SBOX_WRITE(sboxSice0reg, (mmio), SBOX_SICE0); } + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) +#endif + +#define DLDR_APT_BAR 0 +#define DLDR_MMIO_BAR 4 + +#define PCI_VENDOR_INTEL 0x8086 + +#define PCI_DEVICE_ABR_2249 0x2249 +#define PCI_DEVICE_ABR_224a 0x224a + +#define PCI_DEVICE_KNC_2250 0x2250 +#define PCI_DEVICE_KNC_2251 0x2251 +#define PCI_DEVICE_KNC_2252 0x2252 +#define PCI_DEVICE_KNC_2253 0x2253 +#define PCI_DEVICE_KNC_2254 0x2254 +#define PCI_DEVICE_KNC_2255 0x2255 +#define PCI_DEVICE_KNC_2256 0x2256 +#define PCI_DEVICE_KNC_2257 0x2257 +#define PCI_DEVICE_KNC_2258 0x2258 +#define PCI_DEVICE_KNC_2259 0x2259 +#define PCI_DEVICE_KNC_225a 0x225a + +#define PCI_DEVICE_KNC_225b 0x225b +#define PCI_DEVICE_KNC_225c 0x225c +#define PCI_DEVICE_KNC_225d 0x225d +#define PCI_DEVICE_KNC_225e 0x225e + +#define MIC_CMDLINE_BUFSIZE 1024 +#define RESET_FAIL_TIME 300 + +/* Masks for sysfs entries */ +#ifdef CONFIG_ML1OM +#define MASK_COREVOLT 0xff +#define MASK_COREFREQ 0xfff +#endif +#define MASK_MEMVOLT 0xff +#define MASK_MEMORYFREQ 0xff +#define MASK_MEMSIZE 0x1fffffff +#define MASK_FLASHVERSION 0xffff +#define MASK_SUBSTEPPING_DATA 0xf +#define MASK_STEPPING_DATA 0xf +#define MASK_MODEL 0xf +#define MASK_FAMILY_DATA 0xf +#define MASK_PROCESSOR 0x3 +#define MASK_PLATFORM 0x3 +#define MASK_EXTENDED_MODEL 0xf +#define MASK_EXTENDED_FAMILY 0xff +#define MASK_FUSE_CONFIG_REV 0x3ff +#define MASK_ACTIVE_CORES 0x3f +#define MASK_FAIL_SAFE 0xffffffff +#define MASK_FLASH_UPDATE 0xffffffff +/* Shifts for sysfs entries */ +#ifdef CONFIG_ML1OM +#define SHIFT_COREVOLT 0 +#define SHIFT_COREFREQ 0 +#endif +#define SHIFT_MEMVOLT 0 +#define SHIFT_MEMORYFREQ 0 +#define SHIFT_MEMSIZE 3 +#define SHIFT_FLASHVERSION 16 +#define SHIFT_SUBSTEPPING_DATA 0 +#define SHIFT_STEPPING_DATA 4 +#define SHIFT_MODEL 8 +#define SHIFT_FAMILY_DATA 12 +#define SHIFT_PROCESSOR 16 +#define SHIFT_PLATFORM 18 +#define SHIFT_EXTENDED_MODEL 20 +#define SHIFT_EXTENDED_FAMILY 24 +#define SHIFT_FUSE_CONFIG_REV 0 +#define SHIFT_ACTIVE_CORES 10 +#define SHIFT_FAIL_SAFE 0 +#define SHIFT_FLASH_UPDATE 0 
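The SCRATCH13 accessors above pair with the SHIFT_/MASK_ definitions that follow them; decoding a raw scratch value into its CPUID-style fields looks roughly like this (the raw value would come from an SBOX scratch-register read, which is not shown).

static void example_decode_scratch13(uint32_t scratch13)
{
	uint32_t model     = SCRATCH13_MODEL(scratch13);
	uint32_t family    = SCRATCH13_FAMILY_DATA(scratch13);
	uint32_t ext_model = SCRATCH13_EXTENDED_MODEL(scratch13);
	uint32_t ext_fam   = SCRATCH13_EXTENDED_FAMILY(scratch13);
	uint32_t stepping  = SCRATCH13_STEP_ID(scratch13);
	uint32_t substep   = SCRATCH13_SUB_STEP(scratch13);

	pr_debug("family %u.%u model %u.%u stepping %u.%u\n",
		 ext_fam, family, ext_model, model, stepping, substep);
}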
+ +#define SKU_NAME_LEN 20 + +/* Should be updated to reflect the latest interface version in sysfs and wmi property */ +#define LINUX_INTERFACE_VERSION "1.0" +#define WINDOWS_INTERFACE_VERSION "1.0" + +typedef enum mic_modes +{ + MODE_NONE, + MODE_LINUX, + MODE_ELF, + MODE_FLASH +} MIC_MODES; + +typedef enum mic_status +{ + MIC_READY, + MIC_BOOT, + MIC_NORESPONSE, + MIC_BOOTFAIL, + MIC_ONLINE, + MIC_SHUTDOWN, + MIC_LOST, + MIC_RESET, + MIC_RESETFAIL, + MIC_INVALID +} MIC_STATUS; + +typedef enum _product_platform_t +{ + PLATFORM_SILICON = 0, + PLATFORM_EMULATOR = 2, +}product_platform_t; + + +typedef enum _platform_resource_type +{ + PCI_APERTURE, + MMIO, + MAX_RESOURCE_TYPE +}platform_resource_type; + +typedef struct _platform_resource_t +{ + uint8_t* va; // mapped by driver + uint64_t pa; // from PCI config space + uint64_t len;// from PCI config space +}platform_resource_t; + + +typedef struct micscifhost_info { + dma_addr_t si_pa; + struct delayed_work si_bs_check; + uint32_t si_bs_wait_count; +} scifhost_info_t; + +#define MIC_NUM_DB 4 +typedef struct mic_irq { + spinlock_t mi_lock; + struct list_head mi_dblist[MIC_NUM_DB]; // The 4 doorbell interrupts. + atomic_t mi_received; +} mic_irq_t; + +typedef struct sysfs_info { + char *cmdline; + char *kernel_cmdline; +} sysfs_info_t; + +typedef struct pm_recv_msg { + struct list_head msg; + pm_msg_header msg_header; + void * msg_body; +} pm_recv_msg_t; + +typedef struct pm_wq { + struct workqueue_struct *wq; + struct work_struct work; + char wq_name[20]; +} pm_wq_t; + +/* + * Driver wide power management context + * common power management context for all the devices + */ +typedef struct micscif_pm { + scif_epd_t epd; + atomic_t connected_clients; + pm_wq_t accept; + struct mutex pm_accept_mutex; + struct mutex pm_idle_mutex; + struct dentry *pmdbgparent_dir; + uint32_t enable_pm_logging; + atomic_t wakeup_in_progress; + uint8_t *nodemask; + uint32_t nodemask_len; +} micscif_pm_t; + +/* per device power management context */ +typedef struct micpm_ctx +{ + scif_epd_t pm_epd; + PM_IDLE_STATE idle_state; + struct mutex msg_mutex; + struct list_head msg_list; + uint32_t pc6_timeout; + struct work_struct pm_close; + MIC_STATUS mic_suspend_state; + bool pc3_enabled; + bool pc6_enabled; + pm_msg_pm_options pm_options; + atomic_t pm_ref_cnt; + platform_resource_t nodemask; + pm_wq_t recv; + pm_wq_t handle_msg; + pm_wq_t resume; + struct workqueue_struct *pc6_entry_wq; + struct delayed_work pc6_entry_work; + char pc6_wq_name[20]; + struct dentry *pmdbg_dir; + PM_CONNECTION_STATE con_state; + wait_queue_head_t disc_wq; +} micpm_ctx_t; + +typedef struct _mic_ctx_t { + platform_resource_t mmio; + platform_resource_t aper; + uint32_t apic_id; + uint32_t msie; + ringbuffer ringbuff[MIC_ENG_MAX_SUPPORTED_ENGINES]; + uint32_t rb_readoff __attribute__((aligned(64))); + micpm_ctx_t micpm_ctx; + CARD_USAGE_MODE card_usage_mode; + uint64_t adptr_base_pa; + + int32_t bi_id; + mic_irq_t bi_irq; + struct tasklet_struct bi_dpc; + scifhost_info_t bi_scif; +#ifdef USE_VCONSOLE + micvcons_t bi_vcons; +#endif + void *bi_vethinfo; + struct mic_psmi_ctx bi_psmi; + struct pci_dev *bi_pdev; + + MIC_STATUS state; + struct mutex state_lock; + MIC_MODES mode; + wait_queue_head_t resetwq; + char *image; + char *initramfs; + struct timer_list boot_timer; + unsigned long boot_start; + struct work_struct boot_ws; + + struct workqueue_struct *resetworkq; + struct work_struct resetwork; + struct workqueue_struct *ioremapworkq; + struct work_struct ioremapwork; + wait_queue_head_t 
ioremapwq; + uint32_t reset_count; + + atomic_t bi_irq_received; + uint8_t bi_stepping; + uint8_t bi_substepping; + product_platform_t bi_platform; + product_family_t bi_family; + struct board_info *bd_info; + sysfs_info_t sysfs_info; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)) + struct kernfs_node *sysfs_state; +#else + struct sysfs_dirent *sysfs_state; +#endif + spinlock_t sysfs_lock; + mic_dma_handle_t dma_handle; + uint32_t boot_mem; + mic_smpt_t *mic_smpt; + spinlock_t smpt_lock; + uint32_t sdbic1; + int64_t etc_comp; + spinlock_t ramoops_lock; + void *ramoops_va[2]; + int ramoops_size; + dma_addr_t ramoops_pa[2]; + struct proc_dir_entry *ramoops_dir; + struct proc_dir_entry *vmcore_dir; + /* + * List representing chunks of contiguous memory areas and + * their offsets in vmcore file. + */ + struct list_head vmcore_list; + /* Stores the pointer to the buffer containing kernel elf core headers */ + char *elfcorebuf; + size_t elfcorebuf_sz; + /* Total size of vmcore file. */ + uint64_t vmcore_size; + int crash_count; + int boot_count; + void *log_buf_addr; + int *log_buf_len; + char sku_name[SKU_NAME_LEN]; + atomic_t disconn_rescnt; + atomic_t gate_interrupt; + uint16_t numa_node; +} mic_ctx_t; + + +typedef struct mic_irqhander { + int (*ih_func)(mic_ctx_t *mic_ctx, int doorbell); + struct list_head ih_list; + char *ih_idstring; +} mic_irqhandler_t; + +/* SKU related definitions and declarations */ +#define MAX_DEV_IDS 16 +typedef struct sku_info { + uint32_t fuserev_low; + uint32_t fuserev_high; + uint32_t memsize; + uint32_t memfreq; + char sku_name[SKU_NAME_LEN]; + struct list_head sku; +} sku_info_t; + +int sku_create_node(uint32_t fuserev_low, + uint32_t fuserev_high, uint32_t mem_size, + uint32_t mem_freq, char *sku_name, + sku_info_t ** newnode); + +int sku_build_table(void); +void sku_destroy_table(void); +int sku_find(mic_ctx_t *mic_ctx, uint32_t device_id); + +/* End SKU related definitions and declarations */ + +#define MIC_NUM_MSIX_ENTRIES 1 +typedef struct mic_data { + int32_t dd_numdevs; + int32_t dd_inuse; +#ifdef USE_VCONSOLE + micvcons_port_t dd_ports[MAX_BOARD_SUPPORTED]; +#endif + struct board_info *dd_bi[MAX_BOARD_SUPPORTED]; + struct list_head dd_bdlist; + micscif_pm_t dd_pm; + uint64_t sysram; + struct fasync_struct *dd_fasync; + struct list_head sku_table[MAX_DEV_IDS]; +} mic_data_t; + +#include "mic_interrupts.h" +extern mic_data_t mic_data; +extern struct micscif_dev scif_dev[]; + +typedef struct acptboot_data { + scif_epd_t listen_epd; + uint16_t acptboot_pn; + struct workqueue_struct *acptbootwq; + struct work_struct acptbootwork; +}acptboot_data_t; + +void acptboot_exit(void); +int acptboot_init(void); +void adapter_init(void); +int adapter_isr(mic_ctx_t *mic_ctx); +int adapter_imsr(mic_ctx_t *mic_ctx); +int adapter_remove(mic_ctx_t *mic_ctx); +int adapter_do_ioctl(uint32_t cmd, uint64_t arg); +int adapter_stop_device(mic_ctx_t *mic_ctx, int wait_reset, int reattempt); +int adapter_shutdown_device(mic_ctx_t *mic_ctx); +void calculate_etc_compensation(mic_ctx_t *mic_ctx); +int adapter_probe(mic_ctx_t *mic_ctx); +int adapter_post_boot_device(mic_ctx_t *mic_ctx); +int adapter_start_device(mic_ctx_t *mic_ctx); +int adapter_restart_device(mic_ctx_t *mic_ctx); +int adapter_init_device(mic_ctx_t *mic_ctx); +int pm_adapter_do_ioctl(mic_ctx_t *mic_ctx, void *in_buffer); +int adapter_reset_depgraph(mic_ctx_t *mic_ctx); + +/* + * RESET_WAIT : launch the timer thread and wait for reset to complete + * The caller has to add itself to the resetwq by calling 
wait_for_reset + * RESET_REATTEMPT : Reattempt reset after detecting failures in reset + */ +#define RESET_WAIT 1 +#define RESET_REATTEMPT 1 +void adapter_reset(mic_ctx_t *mic_ctx, int wait_reset, int reattempt); + +void adapter_wait_reset(mic_ctx_t *mic_ctx); +void get_adapter_memsize(uint8_t *mmio_va, uint32_t *adapter_mem_size); +int wait_for_bootstrap(uint8_t *mmio_va); +void post_boot_startup(struct work_struct *work); +void attempt_reset(struct work_struct *work); + +int send_uos_escape(mic_ctx_t *mic_ctx, uint32_t uos_op, + uint32_t data_size, void *escape_data); +int boot_linux_uos(mic_ctx_t *mic_ctx, char *imgname, char *initramfsname); + +int boot_micdev_app(mic_ctx_t *mic_ctx, char *imgname); +int allocate_tools_buffer(mic_ctx_t *mic_ctx, uint32_t databuf_size, + uint32_t stsbuf_size, uint64_t *gddr_data_ptr, + uint64_t *gddr_stsbuf_ptr); + +int micpm_init(void); +void micpm_uninit(void); +int micpm_stop(mic_ctx_t *mic_ctx); +int micpm_start(mic_ctx_t *mic_ctx); +int micpm_probe(mic_ctx_t *mic_ctx); +int micpm_remove(mic_ctx_t *mic_ctx); +void micpm_nodemask_uninit(mic_ctx_t* mic_ctx); +int micpm_nodemask_init(uint32_t num_devs, mic_ctx_t* mic_ctx); +int micpm_disconn_init(uint32_t num_nodes); +int micpm_disconn_uninit(uint32_t num_nodes); +int micpm_dbg_init(mic_ctx_t *mic_ctx); +void micpm_dbg_parent_init(void); +int pm_reg_read(mic_ctx_t *mic_ctx, uint32_t regoffset); +int micpm_update_pc6(mic_ctx_t *mic_ctx, bool set); +int micpm_update_pc3(mic_ctx_t *mic_ctx, bool set); +int pm_start_device(mic_ctx_t *mic_ctx); +int pm_stop_device(mic_ctx_t *mic_ctx); +int mic_pm_recv(mic_ctx_t *mic_ctx, void *msg, uint32_t len); +int mic_pm_send_msg(mic_ctx_t *mic_ctx, PM_MESSAGE type, + void *msg, uint32_t len); + +int pm_pc3_entry(mic_ctx_t *mic_ctx); +int pm_pc3_exit(mic_ctx_t *mic_ctx); +int do_idlestate_entry(mic_ctx_t *mic_ctx); +int do_idlestate_exit(mic_ctx_t *mic_ctx, bool get_ref); +int is_idlestate_exit_needed(mic_ctx_t *mic_ctx); +uint32_t mic_get_scifnode_id(mic_ctx_t *mic_ctx); + +mic_ctx_t* get_per_dev_ctx(uint16_t node); +int get_num_devs(mic_ctx_t *mic_ctx, uint32_t *num_devs); + + +void adapter_uninit(void); +void adapter_add(mic_ctx_t *mic_ctx); +void adapter_start(mic_ctx_t *mic_ctx); +int send_flash_cmd(mic_ctx_t *mic_ctx, MIC_FLASH_CMD_TYPE type, void *data, + uint32_t len); +int cmdline_mem(mic_ctx_t *mic_ctx, uint32_t mem); +int get_cardside_mem(mic_ctx_t *mic_ctx, uint64_t start, uint64_t size, void *dest); + +int mic_pin_user_pages (void *data, struct page **pages, uint32_t len, int32_t *nf_pages, int32_t nr_pages); +int mic_unpin_user_pages(struct page **pages, uint32_t nf_pages); +product_family_t get_product_family(uint32_t device_id); +void show_stepping_comm(mic_ctx_t *mic_ctx,char *buf); +void micscif_destroy_p2p(mic_ctx_t *mic_ctx); + +#ifdef HOST +void mic_smpt_init(mic_ctx_t *mic_ctx); +void mic_smpt_restore(mic_ctx_t *mic_ctx); +#endif +void mic_smpt_uninit(mic_ctx_t *mic_ctx); +int mic_dma_init(void); + +#ifndef _MIC_SCIF_ +static __always_inline int micpm_get_reference(mic_ctx_t *mic_ctx, bool force_wakeup) { + int err; + if (!mic_ctx) + return -EINVAL; + + if (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_LOST) + return -ENODEV; + + if (unlikely(!atomic_add_unless(&mic_ctx->micpm_ctx.pm_ref_cnt, + 1, PM_NODE_IDLE))) { + if (!force_wakeup) { + if (is_idlestate_exit_needed(mic_ctx)) { + return -EAGAIN; + } + } + + if ((err = micscif_connect_node(mic_get_scifnode_id(mic_ctx), true)) != 0) + return -ENODEV; + } + return 0; +} +#endif + +static 
__always_inline int micpm_put_reference(mic_ctx_t *mic_ctx) { + int ret; + + if(!mic_ctx) + return -EINVAL; + + if (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_LOST) + return -ENODEV; + + if (unlikely((ret = atomic_sub_return(1, + &mic_ctx->micpm_ctx.pm_ref_cnt)) < 0)) { + printk(KERN_ERR "%s %d Invalid PM ref_cnt %d \n", + __func__, __LINE__, atomic_read(&mic_ctx->micpm_ctx.pm_ref_cnt)); + } + + return 0; + +} + +static __always_inline int +mic_hw_family(int node_id) { + mic_ctx_t *mic_ctx; + + /* For Host Loopback */ + if (!node_id) + return -EINVAL; + + mic_ctx = get_per_dev_ctx(node_id - 1); + return mic_ctx->bi_family; +} + +static __always_inline void +wait_for_reset(mic_ctx_t *mic_ctx) +{ + int ret = 0; + while (!ret) { + ret = wait_event_timeout(mic_ctx->resetwq, + mic_ctx->state != MIC_RESET, RESET_FAIL_TIME * HZ); + } +} + +/* Called only by host PM suspend */ +static __always_inline int +wait_for_shutdown_and_reset(mic_ctx_t *mic_ctx) +{ + int ret; + ret = wait_event_interruptible_timeout(mic_ctx->resetwq, + mic_ctx->state != MIC_RESET && mic_ctx->state != MIC_SHUTDOWN, + RESET_FAIL_TIME * HZ); + return ret; +} + +static __always_inline void +mic_signal_daemon(void) +{ + if (mic_data.dd_fasync != NULL) + kill_fasync(&mic_data.dd_fasync, SIGIO, POLL_IN); +} + +extern char *micstates[]; + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +#define __mic_create_singlethread_workqueue(name) alloc_ordered_workqueue(name, 0) +#else +#define __mic_create_singlethread_workqueue(name) create_singlethread_workqueue(name) +#endif + +static __always_inline void +mic_setstate(mic_ctx_t *mic_ctx, enum mic_status newstate) +{ + printk("mic%d: Transition from state %s to %s\n", mic_ctx->bi_id, + micstates[mic_ctx->state], micstates[newstate]); + mic_ctx->state = newstate; + spin_lock_bh(&mic_ctx->sysfs_lock); + if (mic_ctx->sysfs_state) + sysfs_notify_dirent(mic_ctx->sysfs_state); + spin_unlock_bh(&mic_ctx->sysfs_lock); +} + +#define MICREG_POSTCODE 0x242c + +static __always_inline uint32_t +mic_getpostcode(mic_ctx_t *mic_ctx) +{ + return DBOX_READ(mic_ctx->mmio.va, MICREG_POSTCODE); +} + +static __always_inline int +mic_hw_stepping(int node_id) { + mic_ctx_t *mic_ctx; + + /* For Host Loopback */ + if (!node_id) + return -EINVAL; + + mic_ctx = get_per_dev_ctx(node_id - 1); + return mic_ctx->bi_stepping; +} + +#define MIC_IRQ_DB0 0 +#define MIC_IRQ_DB1 1 +#define MIC_IRQ_DB2 2 +#define MIC_IRQ_DB3 3 +#define MIC_IRQ_MAX MIC_IRQ_DB3 + +int mic_reg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring, + int (*irqfunc)(mic_ctx_t *mic_ctx, int doorbell)); +int mic_unreg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring); +void mic_enable_interrupts(mic_ctx_t *mic_ctx); +void mic_disable_interrupts(mic_ctx_t *mic_ctx); +void mic_enable_msi_interrupts(mic_ctx_t *mic_ctx); + +int micscif_init(void); +void micscif_destroy(void); +void micscif_probe(mic_ctx_t *mic_ctx); +void micscif_remove(mic_ctx_t *mic_ctx); +void micscif_start(mic_ctx_t *mic_ctx); +void micscif_stop(mic_ctx_t *mic_ctx); + +mic_ctx_t *get_device_context(struct pci_dev *dev); +void ramoops_exit(void); +void vmcore_exit(void); +int vmcore_create(mic_ctx_t *mic_ctx); +void vmcore_remove(mic_ctx_t *mic_ctx); + +// loads file into memory +int mic_get_file_size(const char *path, uint32_t *file_length); +int mic_load_file(const char *fn, uint8_t *buffer, uint32_t max_size); +#ifndef _MIC_SCIF_ +void mic_debug_init(mic_ctx_t *mic_ctx); +#endif +void mic_debug_uninit(void); +void +set_pci_aperture(mic_ctx_t *mic_ctx, 
uint32_t gtt_index, uint64_t phy_addr, uint32_t num_bytes); +#ifdef __cplusplus +}; +#endif + +#endif // __MIC_COMMON_H + diff --git a/include/mic_interrupts.h b/include/mic_interrupts.h new file mode 100644 index 0000000..b3c6b60 --- /dev/null +++ b/include/mic_interrupts.h @@ -0,0 +1,118 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "mic_common.h" + +/* vnet/mic_shutdown/hvc/virtio */ +#define VNET_SBOX_INT_IDX 0 +#define MIC_SHT_SBOX_INT_IDX 1 +#define HVC_SBOX_INT_IDX 2 +#define VIRTIO_SBOX_INT_IDX 3 +#define PM_SBOX_INT_IDX 4 + +#define MIC_BSP_INTERRUPT_VECTOR 229 // Host->Card(bootstrap) Interrupt Vector# +/* + * Current usage of MIC interrupts: + * APICICR1 - mic shutdown interrupt + * APCICR0 - rest + * + * Planned Usage: + * SCIF - rdmasrs + * vnet/hvc/virtio - APICICR0 + * mic shutdown interrupt - APICICR1 + */ +static void __mic_send_intr(mic_ctx_t *mic_ctx, int i) +{ + uint32_t apicicr_low; + uint64_t apic_icr_offset = SBOX_APICICR0 + i * 8; + + apicicr_low = SBOX_READ(mic_ctx->mmio.va, apic_icr_offset); + /* for KNC we need to make sure we "hit" the send_icr bit (13) */ + if (mic_ctx->bi_family == FAMILY_KNC) + apicicr_low = (apicicr_low | (1 << 13)); + + /* MIC card only triggers when we write the lower part of the + * address (upper bits) + */ + SBOX_WRITE(apicicr_low, mic_ctx->mmio.va, apic_icr_offset); +} + +static inline void mic_send_vnet_intr(mic_ctx_t *mic_ctx) +{ + __mic_send_intr(mic_ctx, VNET_SBOX_INT_IDX); +} + +static inline void mic_send_hvc_intr(mic_ctx_t *mic_ctx) +{ + __mic_send_intr(mic_ctx, HVC_SBOX_INT_IDX); +} + +static inline void mic_send_scif_intr(mic_ctx_t *mic_ctx) +{ + __mic_send_intr(mic_ctx, 0); +} + +static inline void mic_send_virtio_intr(mic_ctx_t *mic_ctx) +{ + __mic_send_intr(mic_ctx, VIRTIO_SBOX_INT_IDX); +} + +static inline void mic_send_sht_intr(mic_ctx_t *mic_ctx) +{ + __mic_send_intr(mic_ctx, 1); +} + +static inline void mic_send_pm_intr(mic_ctx_t *mic_ctx) +{ + __mic_send_intr(mic_ctx, PM_SBOX_INT_IDX); +} + +static inline void mic_send_bootstrap_intr(mic_ctx_t *mic_ctx) +{ + uint32_t apicicr_low; + uint64_t apic_icr_offset = SBOX_APICICR7; + int vector = MIC_BSP_INTERRUPT_VECTOR; + + if (mic_ctx->bi_family == FAMILY_ABR){ + apicicr_low = vector; + } else { + /* for KNC we need to make sure we "hit" the send_icr bit (13) */ + apicicr_low = (vector | (1 << 13)); + } + + SBOX_WRITE(mic_ctx->apic_id, mic_ctx->mmio.va, apic_icr_offset + 4); + // MIC card only triggers when we write the lower part of the address (upper bits) + SBOX_WRITE(apicicr_low, mic_ctx->mmio.va, apic_icr_offset); +} diff --git a/include/micint.h b/include/micint.h new file mode 100644 index 0000000..bf3f095 --- /dev/null +++ b/include/micint.h @@ -0,0 +1,114 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. 
Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICINT_H +#define MICINT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mic_common.h" +#include + +#define MAX_DLDR_MINORS 68 +typedef struct mic_lindata { + dev_t dd_dev; + struct cdev dd_cdev; + struct device *dd_hostdev; + struct device *dd_scifdev; + struct class *dd_class; + struct pci_driver dd_pcidriver; +}mic_lindata_t; + +typedef struct board_info { + struct device *bi_sysfsdev; +#ifdef CONFIG_PCI_MSI + struct msix_entry bi_msix_entries[MIC_NUM_MSIX_ENTRIES]; +#endif +#ifdef USE_VCONSOLE + micvcons_port_t *bi_port; +#endif + void *bi_virtio; /* for virtio */ + + struct list_head bi_list; + mic_ctx_t bi_ctx; +} bd_info_t; + +extern mic_lindata_t mic_lindata; + +#ifdef USE_VCONSOLE +int micvcons_create(int num_bds); +void micvcons_destroy(int num_bds); +#endif + +int micpm_suspend(struct device *pdev); +int micpm_resume(struct device *pdev); +int micpm_suspend_noirq(struct device *pdev); +int micpm_resume_noirq(struct device *pdev); +int micpm_notifier_block(struct notifier_block *nb, unsigned long event, void *dummy); +irqreturn_t mic_irq_isr(int irq, void *data); + +int mic_psmi_init(mic_ctx_t *mic_ctx); +void mic_psmi_uninit(mic_ctx_t *mic_ctx); + +void set_sysfs_entries(mic_ctx_t *mic_ctx); +void free_sysfs_entries(mic_ctx_t *mic_ctx); +#endif // MICINT_H diff --git a/include/scif.h b/include/scif.h new file mode 100644 index 0000000..934bc82 --- /dev/null +++ b/include/scif.h @@ -0,0 +1,1743 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. 
Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Revised 15:05 11/24/2010 + * Derived from SCIF SAS v0.41 with additional corrections + */ + +#ifndef __SCIF_H__ +#define __SCIF_H__ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCIF_ACCEPT_SYNC 1 +#define SCIF_SEND_BLOCK 1 +#define SCIF_RECV_BLOCK 1 + +/* Start: Deprecated Temporary definition for compatability */ +#define ACCEPT_SYNC SCIF_ACCEPT_SYNC +#define SEND_BLOCK SCIF_SEND_BLOCK +#define RECV_BLOCK SCIF_RECV_BLOCK +/* End: Deprecated Temporary definition for compatability */ + +enum { + SCIF_PROT_READ = (1<<0), + SCIF_PROT_WRITE = (1<<1) +}; + +/* 0x40 is used internally by scif */ +enum { + SCIF_MAP_FIXED = 0x10, + SCIF_MAP_KERNEL = 0x20, +}; + +enum { + SCIF_FENCE_INIT_SELF = (1<<0), + SCIF_FENCE_INIT_PEER = (1<<1) +}; + +enum { + SCIF_FENCE_RAS_SELF = (1<<2), + SCIF_FENCE_RAS_PEER = (1<<3) +}; + +enum { + SCIF_SIGNAL_LOCAL = (1<<4), + SCIF_SIGNAL_REMOTE = (1<<5) +}; + +#define SCIF_RMA_USECPU 1 +#define SCIF_RMA_USECACHE (1<<1) +#define SCIF_RMA_SYNC (1<<2) +#define SCIF_RMA_ORDERED (1<<3) +//! 
@cond (Prevent doxygen from including these) +#define SCIF_POLLIN POLLIN +#define SCIF_POLLOUT POLLOUT +#define SCIF_POLLERR POLLERR +#define SCIF_POLLHUP POLLHUP +#define SCIF_POLLNVAL POLLNVAL + +/* SCIF Reserved Ports */ +/* COI */ +#define SCIF_COI_PORT_0 40 +#define SCIF_COI_PORT_1 41 +#define SCIF_COI_PORT_2 42 +#define SCIF_COI_PORT_3 43 +#define SCIF_COI_PORT_4 44 +#define SCIF_COI_PORT_5 45 +#define SCIF_COI_PORT_6 46 +#define SCIF_COI_PORT_7 47 +#define SCIF_COI_PORT_8 48 +#define SCIF_COI_PORT_9 49 + +/* OFED */ +#define SCIF_OFED_PORT_0 60 +#define SCIF_OFED_PORT_1 61 +#define SCIF_OFED_PORT_2 62 +#define SCIF_OFED_PORT_3 63 +#define SCIF_OFED_PORT_4 64 +#define SCIF_OFED_PORT_5 65 +#define SCIF_OFED_PORT_6 66 +#define SCIF_OFED_PORT_7 67 +#define SCIF_OFED_PORT_8 68 +#define SCIF_OFED_PORT_9 69 + +/* NETDEV */ +#define SCIF_NETDEV_PORT_0 80 +#define SCIF_NETDEV_PORT_1 81 +#define SCIF_NETDEV_PORT_2 82 +#define SCIF_NETDEV_PORT_3 83 +#define SCIF_NETDEV_PORT_4 84 +#define SCIF_NETDEV_PORT_5 85 +#define SCIF_NETDEV_PORT_6 86 +#define SCIF_NETDEV_PORT_7 87 +#define SCIF_NETDEV_PORT_8 88 +#define SCIF_NETDEV_PORT_9 89 + +/* RAS */ +#define SCIF_RAS_PORT_0 100 +#define SCIF_RAS_PORT_1 101 +#define SCIF_RAS_PORT_2 102 +#define SCIF_RAS_PORT_3 103 +#define SCIF_RAS_PORT_4 104 +#define SCIF_RAS_PORT_5 105 +#define SCIF_RAS_PORT_6 106 +#define SCIF_RAS_PORT_7 107 +#define SCIF_RAS_PORT_8 108 +#define SCIF_RAS_PORT_9 109 + +/* Power Management */ +#define SCIF_PM_PORT_0 120 +#define SCIF_PM_PORT_1 121 +#define SCIF_PM_PORT_2 122 +#define SCIF_PM_PORT_3 123 +#define SCIF_PM_PORT_4 124 +#define SCIF_PM_PORT_5 125 +#define SCIF_PM_PORT_6 126 +#define SCIF_PM_PORT_7 127 +#define SCIF_PM_PORT_8 128 +#define SCIF_PM_PORT_9 129 + +/* Board Tools */ +#define SCIF_BT_PORT_0 130 +#define SCIF_BT_PORT_1 131 +#define SCIF_BT_PORT_2 132 +#define SCIF_BT_PORT_3 133 +#define SCIF_BT_PORT_4 134 +#define SCIF_BT_PORT_5 135 +#define SCIF_BT_PORT_6 136 +#define SCIF_BT_PORT_7 137 +#define SCIF_BT_PORT_8 138 +#define SCIF_BT_PORT_9 139 + +/* MIC Boot/Configuration support */ +#define MPSSD_MONRECV 160 +#define MIC_NOTIFY 161 +#define MPSSD_CRED 162 +#define MPSSD_MONSEND 163 +#define MPSSD_MICCTRL 164 +#define MPSSD_RESV5 165 +#define MPSSD_RESV6 166 +#define MPSSD_RESV7 167 +#define MPSSD_RESV8 168 +#define MPSSD_RESV9 169 + +#define SCIF_ADMIN_PORT_END 1024 + +/* MYO */ +#define SCIF_MYO_PORT_0 1025 +#define SCIF_MYO_PORT_1 1026 +#define SCIF_MYO_PORT_2 1027 +#define SCIF_MYO_PORT_3 1028 +#define SCIF_MYO_PORT_4 1029 +#define SCIF_MYO_PORT_5 1030 +#define SCIF_MYO_PORT_6 1031 +#define SCIF_MYO_PORT_7 1032 +#define SCIF_MYO_PORT_8 1033 +#define SCIF_MYO_PORT_9 1034 + +/* SSG Tools */ +#define SCIF_ST_PORT_0 1044 +#define SCIF_ST_PORT_1 1045 +#define SCIF_ST_PORT_2 1046 +#define SCIF_ST_PORT_3 1047 +#define SCIF_ST_PORT_4 1048 +#define SCIF_ST_PORT_5 1049 +#define SCIF_ST_PORT_6 1050 +#define SCIF_ST_PORT_7 1051 +#define SCIF_ST_PORT_8 1052 +#define SCIF_ST_PORT_9 1053 + +/* End of SCIF Reserved Ports */ +#define SCIF_PORT_RSVD 1088 +//! 
@endcond + +typedef struct endpt *scif_epd_t; + +typedef struct scif_pinned_pages *scif_pinned_pages_t; + +struct scif_range { + void *cookie; /* cookie */ + int nr_pages; /* Number of Pages */ + int prot_flags; /* R/W protection */ + /* Arrays phys_addr/va below are virtually contiguous */ + dma_addr_t *phys_addr; /* Array of physical addresses */ + void **va; /* Array of virtual addresses + * and populated only when called + * on the host for a remote SCIF + * connection on MIC. + */ +}; + +struct scif_pollepd { + scif_epd_t epd; /* endpoint descriptor */ + short events; /* requested events */ + short revents; /* returned events */ +}; +enum scif_event_type { + SCIF_NODE_ADDED = 1<<0, + SCIF_NODE_REMOVED = 1<<1 +}; + +union eventd { + uint16_t scif_node_added; + uint16_t scif_node_removed; +}; + +typedef void (*scif_callback_t)(enum scif_event_type event, union eventd +data); + +struct scif_callback { + struct list_head list_member; + scif_callback_t callback_handler; +}; + +#define SCIF_OPEN_FAILED ((scif_epd_t)-1) +#define SCIF_REGISTER_FAILED ((off_t)-1) +#define SCIF_MMAP_FAILED ((void *)-1) + +struct scif_portID { + uint16_t node; /* node on which port resides */ + uint16_t port; /* Local port number */ +}; + +/* Start: Deprecated Temporary definition for compatability */ +#define portID scif_portID +typedef struct portID portID_t; +/* End: Deprecated Temporary definition for compatability */ + +/** + * scif_open - Create an endpoint + * + *\return + * The scif_open() function creates a new endpoint. + * + * Upon successful completion, scif_open() returns an endpoint descriptor to + * be used in subsequent SCIF functions calls to refer to that endpoint; + * otherwise: in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is + * returned and errno is set to indicate the error; in kernel mode a NULL + * scif_epd_t is returned. + * + *\par Errors: + *- ENOMEM + * - Insufficient kernel memory was available. + *- ENXIO + * - Version mismatch between micscif driver and libscif. + */ +scif_epd_t scif_open(void); + +/** + * scif _bind - Bind an endpoint to a port + * \param epd endpoint descriptor + * \param pn port number + * + * scif_bind() binds endpoint epd to port pn, where pn is a port number on the + * local node. If pn is zero, a port number greater than or equal to + * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to + * exactly one local port. Ports less than 1024 when requested can only be bound + * by system (or root) processes or by processes executed by privileged users. + * + *\return + * Upon successful completion, scif_bind() returns the port number to which epd + * is bound; otherwise: in user mode -1 is returned and errno is set to + * indicate the error; in kernel mode the negative of one of the following + * errors is returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - The endpoint or the port are already bound. + *- EISCONN + * - The endpoint is already connected. + *- ENOSPC + * - No port number available for assignment (when pn==0). + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- EACCES + * - The port requested is protected and the user is not the superuser. 
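+ *
+ *\par Example:
+ * (Editorial sketch added for illustration; not part of the original header.
+ * User-mode code, error handling abbreviated; the snippet is assumed to live
+ * inside a function returning int.)
+ *
+ *     scif_epd_t epd = scif_open();
+ *     if (epd == SCIF_OPEN_FAILED)
+ *         return -1;                  /* user mode: errno holds the cause */
+ *     /* pn == 0 asks SCIF to assign a port >= SCIF_PORT_RSVD */
+ *     int pn = scif_bind(epd, 0);
+ *     if (pn < 0) {
+ *         scif_close(epd);
+ *         return -1;
+ *     }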
+*/ +int scif_bind(scif_epd_t epd, uint16_t pn); + +/** + * scif_listen - Listen for connections on an endpoint + * + * \param epd endpoint descriptor + * \param backlog maximum pending connection requests + * + * scif_listen() marks the endpoint epd as a listening endpoint - that is, as + * an endpoint that will be used to accept incoming connection requests. Once + * so marked, the endpoint is said to be in the listening state and may not be + * used as the endpoint of a connection. + * + * The endpoint, epd, must have been bound to a port. + * + * The backlog argument defines the maximum length to which the queue of + * pending connections for epd may grow. If a connection request arrives when + * the queue is full, the client may receive an error with an indication that + * the connection was refused. + * + *\return + * Upon successful completion, scif_listen() returns 0; otherwise: in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - The endpoint is not bound to a port + *- EISCONN + * - The endpoint is already connected or listening + *- ENOTTY + * - epd is not a valid endpoint descriptor +*/ +int scif_listen(scif_epd_t epd, int backlog); + +/** + * scif_connect - Initiate a connection on a port + * \param epd endpoint descriptor + * \param dst global id of port to which to connect + * + * The scif_connect() function requests the connection of endpoint epd to remote + * port dst. If the connection is successful, a peer endpoint, bound to dst, is + * created on node dst.node. On successful return, the connection is complete. + * + * If the endpoint epd has not already been bound to a port, scif_connect() + * will bind it to an unused local port. + * + * A connection is terminated when an endpoint of the connection is closed, + * either explicitly by scif_close(), or when a process that owns one of the + * endpoints of a connection is terminated. + * + *\return + * Upon successful completion, scif_connect() returns the port ID to which the + * endpoint, epd, is bound; otherwise: in user mode -1 is returned and errno is + * set to indicate the error; in kernel mode the negative of one of the + * following errors is returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNREFUSED + * - The destination was not listening for connections or refused the + * connection request. + *- EINTR + * - Interrupted function + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - dst.port is not a valid port ID + *- EISCONN + * - The endpoint is already connected + *- ENOBUFS + * - No buffer space is available + *- ENODEV + * - The destination node does not exist, or + * - The node is lost. + *- ENOSPC + * - No port number available for assignment (when pn==0). + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- EOPNOTSUPP + * - The endpoint is listening and cannot be connected +*/ +int scif_connect(scif_epd_t epd, struct scif_portID *dst); + +/** + * scif_accept - Accept a connection on an endpoint + * \param epd endpoint descriptor + * \param peer global id of port to which connected + * \param newepd new connected endpoint descriptor + * \param flags flags + * + * The scif_accept() call extracts the first connection request on the queue of + * pending connections for the port on which epd is listening. 
scif_accept() + * creates a new endpoint, bound to the same port as epd, and allocates a new + * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new + * endpoint is connected to the endpoint through which the connection was + * requested. epd is unaffected by this call, and remains in the listening + * state. + * + * On successful return, peer holds the global port identifier (node id and + * local port number) of the port which requested the connection. + * + * If the peer endpoint which requested the connection is closed, the endpoint + * returned by scif_accept() is closed. + * + * The number of connections that can (subsequently) be accepted on epd is only + * limited by system resources (memory). + * + * The flags argument is formed by OR'ing together zero or more of the + * following values: + *- SCIF_ACCEPT_SYNC: block until a connection request is presented. If + * SCIF_ACCEPT_SYNC is not in flags, and no pending + * connections are present on the queue, scif_accept()fails + * with an EAGAIN error + * + * On Linux in user mode, the select() and poll() functions can be used to + * determine when there is a connection request. On Microsoft Windows* and on + * Linux in kernel mode, the scif_poll() function may be used for this purpose. + * A readable event will be delivered when a connection is requested. + * + *\return + * Upon successful completion, scif_accept() returns 0; otherwise: in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + *\par Errors: + *- EAGAIN + * - SCIF_ACCEPT_SYNC is not set and no connections are present to be accepted, or + * - SCIF_ACCEPT_SYNC is not set and remote node failed to complete its + * connection request + *- EBADF + * - epd is not a valid endpoint descriptor + *- EINTR + * - Interrupted function + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - epd is not a listening endpoint + * - flags is invalid + * - peer is NULL + * - newepd is NULL + *- ENOBUFS + * - No buffer space is available + *- ENODEV + * - The requesting node is lost. + *- ENOMEM + * - Not enough space + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENOENT + * - Secondary part of epd registeration failed. +*/ +int scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t +*newepd, int flags); + +/** + * scif_close - Close an endpoint + * \param epd endpoint descriptor + * + * scif_close() closes an endpoint and performs necessary teardown of + * facilities associated with that endpoint. + * + * If epd is a listening endpoint then it will no longer accept connection + * requests on the port to which it is bound. Any pending connection requests + * are rejected. + * + * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs + * which are in-process through epd or its peer endpoint will complete before + * scif_close() returns. Registered windows of the local and peer endpoints are + * released as if scif_unregister() was called against each window. + * + * Closing an endpoint does not affect mappings to remote memory. These remain + * until explicitly removed by calling scif_munmap(). + * + * If the peer endpoint's receive queue is not empty at the time that epd is + * closed, then the peer endpoint can be passed as the endpoint parameter to + * scif_recv() until the receive queue is empty. + * + * If epd is bound to a port, then the port is returned to the pool of + * available ports. 
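+ *
+ *\par Example:
+ * (Editorial sketch illustrating the connection setup described for
+ * scif_listen(), scif_connect() and scif_accept() above; not part of the
+ * original header. The node and port numbers are arbitrary assumptions and
+ * error handling is omitted.)
+ *
+ *     /* Listening side */
+ *     scif_epd_t lepd = scif_open();
+ *     scif_bind(lepd, 2000);                       /* example port number */
+ *     scif_listen(lepd, 5);
+ *     struct scif_portID peer;
+ *     scif_epd_t cepd;
+ *     scif_accept(lepd, &peer, &cepd, SCIF_ACCEPT_SYNC);
+ *
+ *     /* Connecting side (typically another node or process) */
+ *     scif_epd_t epd = scif_open();
+ *     struct scif_portID dst = { .node = 1, .port = 2000 };
+ *     scif_connect(epd, &dst);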
+ * + * epd is freed and may no longer be accessed. + * + *\return + * Upon successful completion, scif_close() returns 0; otherwise: in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode the + * negative of one of the following errors is returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- EINVAL + * - epd is not a valid endpoint descriptor + */ +int scif_close(scif_epd_t epd); + +/** + * scif_send - Send a message + * \param epd endpoint descriptor + * \param msg message buffer address + * \param len message length + * \param flags blocking mode flags + * + * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data + * are copied from memory starting at address msg. On successful execution the + * return value of scif_send() is the number of bytes that were sent, and is + * zero if no bytes were sent because len was zero. scif_send() may be called + * only when the endpoint is in a connected state. + * + * If a scif_send() call is non-blocking, then it sends only those bytes which + * can be sent without waiting, up to a maximum of len bytes. + * + * If a scif_send() call is blocking, then it normally returns after sending + * all len bytes. If a blocking call is interrupted or the connection is + * forcibly closed, the call is considered successful if some bytes were sent + * or len is zero, otherwise the call is considered unsuccessful. + * + * On Linux in user mode, the select() and poll() functions can be used to + * determine when the send queue is not full. On Microsoft Windows* and on + * Linux in kernel mode, the scif_poll() function may be used for this purpose. + * + * It is recommended that scif_send()/scif_recv() only be used for short + * control-type message communication between SCIF endpoints. The SCIF RMA + * APIs are expected to provide better performance for transfer sizes of + * 1024 bytes or longer. + * + * The flags argument is formed by ORing together zero or more of the following + * values: + *- SCIF_SEND_BLOCK: block until the entire message is sent. + * + *\return + * Upon successful completion, scif_send() returns the number of bytes sent; + * otherwise: in user mode -1 is returned and errno is set to indicate the + * error; in kernel mode the negative of one of the following errors is + * returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EFAULT + * - An invalid address was specified for a parameter. + *- EINTR + * - epd was closed by scif_close() + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - flags is invalid + * - len is negative + *- ENODEV + * - The remote node is lost. + *- ENOMEM + * - Not enough space + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + */ +int scif_send(scif_epd_t epd, void *msg, int len, int flags); + +/** + * scif_recv - Receive a message + * \param epd endpoint descriptor + * \param msg message buffer address + * \param len message buffer length + * \param flags blocking mode flags + * + * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of + * data are copied to memory starting at address msg. On successful execution + * the return value of scif_recv() is the number of bytes that were received, + * and is zero if no bytes were received because len was zero. scif_recv() may + * be called only when the endpoint is in a connected state. 
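+ *
+ *\par Example:
+ * (Editorial sketch of a blocking message exchange over a connected
+ * endpoint; not part of the original header. The endpoint epd is assumed to
+ * have been connected as in the earlier example, and the snippet is assumed
+ * to live inside a function returning int.)
+ *
+ *     char req[64] = "ping";
+ *     char rsp[64];
+ *     if (scif_send(epd, req, sizeof(req), SCIF_SEND_BLOCK) < 0)
+ *         return -1;
+ *     if (scif_recv(epd, rsp, sizeof(rsp), SCIF_RECV_BLOCK) < 0)
+ *         return -1;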
+ * + * If a scif_recv() call is non-blocking, then it receives only those bytes + * which can be received without waiting, up to a maximum of len bytes. + * + * If a scif_recv() call is blocking, then it normally returns after receiving + * all len bytes. If a blocking call is interrupted or the connection is + * forcibly closed, the call is considered successful if some bytes were + * received or len is zero, otherwise the call is considered unsuccessful; + * subsequent calls to scif_recv() will successfully receive all data sent + * through peer endpoint interruption or the connection was forcibly closed. + * + * On Linux in user mode, the select() and poll() functions can be used to + * determine when data is available to be received. On Microsoft Windows* and + * on Linux in kernel mode, the scif_poll() function may be used for this + * purpose. + * + * It is recommended that scif_send()/scif_recv() only be used for short + * control-type message communication between SCIF endpoints. The SCIF RMA + * APIs are expected to provide better performance for transfer sizes of + * 1024 bytes or longer. + * + * The flags argument is formed by ORing together zero or more of the following + * values: + *- SCIF_RECV_BLOCK: block until the entire message is received. + * + *\return + * Upon successful completion, scif_recv() returns the number of bytes + * received; otherwise: in user mode -1 is returned and errno is set to + * indicate the error; in kernel mode the negative of one of the following + * errors is returned. + * + *\par Errors: + *- EAGAIN + * - The destination node is returning from a low power state. + *- EBADF + * - epd is not a valid endpoint descriptor . + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EFAULT + * - An invalid address was specified for a parameter. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - flags is invalid, or + * - len is negative. + *- ENODEV + * - The remote node is lost. + *- ENOMEM + * - Not enough space. + *- ENOTCONN + * - The endpoint is not connected. + *- ENOTTY + * - epd is not a valid endpoint descriptor + */ +int scif_recv(scif_epd_t epd, void *msg, int len, int flags); + +/** + * scif_register - Mark a memory region for remote access. + * \param epd endpoint descriptor + * \param addr starting virtual address + * \param len length of range + * \param offset offset of window + * \param prot_flags read/write protection flags + * \param map_flags mapping flags + * + * The scif_register() function opens a window, a range of whole pages of the + * registered address space of the endpoint epd, starting at offset po and + * continuing for len bytes. The value of po, further described below, is a + * function of the parameters offset and len, and the value of map_flags. Each + * page of the window represents the physical memory page which backs the + * corresponding page of the range of virtual address pages starting at addr + * and continuing for len bytes. addr and len are constrained to be multiples + * of the page size. addr is interpreted as a user space address. A successful + * scif_register() call returns po as the return value. + * + * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset + * exactly, and offset is constrained to be a multiple of the page size. The + * mapping established by scif_register() will not replace any existing + * registration; an error is returned if any page within the range [offset, + * offset+len-1] intersects an existing window. 
+ * Note: When SCIF_MAP_FIXED is set the current implementation limits + * offset to the range [0..2^62-1] and returns EADDRINUSE if the offset + * requested with SCIF_MAP_FIXED is in the range [2^62..2^63-1]. + * + * When SCIF_MAP_FIXED is not set, the implementation uses offset in an + * implementation-defined manner to arrive at po. The po value so chosen will + * be an area of the registered address space that the implementation deems + * suitable for a mapping of len bytes. An offset value of 0 is interpreted as + * granting the implementation complete freedom in selecting po, subject to + * constraints described below. A non-zero value of offset is taken to be a + * suggestion of an offset near which the mapping should be placed. When the + * implementation selects a value for po, it does not replace any extant + * window. In all cases, po will be a multiple of the page size. + * + * The physical pages which are so represented by a window are available for + * access in calls to scif_mmap(), scif_readfrom(), scif_writeto(), + * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the + * physical pages represented by the window will not be reused by the memory + * subsystem for any other purpose. Note that the same physical page may be + * represented by multiple windows. + * + * Subsequent operations which change the memory pages to which virtual + * addresses are mapped (such as mmap(), munmap(), scif_mmap() and + * scif_munmap()) have no effect on existing windows. + * + * On Linux, if the process will fork(), it is recommended that the registered + * virtual address range be marked with MADV_DONTFORK. Doing so will prevent + * problems due to copy-on-write semantics. + * + * The prot_flags argument is formed by OR'ing together one or more of the + * following values: + *- SCIF_PROT_READ: allow read operations from the window + *- SCIF_PROT_WRITE: allow write operations to the window + * + * The map_flags argument is formed by OR'ing together zero or more of + * the following values: + *- SCIF_MAP_FIXED: interpret offset exactly + * + *\return + * Upon successful completion, scif_register() returns the offset at which the + * mapping was placed (po); otherwise: in user mode SCIF_REGISTER_FAILED (that + * is (off_t *)-1) is returned and errno is set to indicate the error; in + * kernel mode the negative of one of the following errors is returned. + * + *\par Errors: + *- EADDRINUSE + * - SCIF_MAP_FIXED is set in map_flags, and pages in the range [offset, + * offset+len-1] are already registered + *- EAGAIN + * - The mapping could not be performed due to lack of resources + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EFAULT + * - Addresses in the range [addr , addr + len - 1] are invalid + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - map_flags is invalid, or + * - prot_flags is invalid, or + * - SCIF_MAP_FIXED is set in flags, and offset is not a multiple of + * the page size, or + * - addr is not a multiple of the page size, or + * - len is not a multiple of the page size, or is 0, or + * - offset is negative + *- ENODEV + * - The remote node is lost. 
+ *- ENOMEM + * - Not enough space + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + */ +off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, +int prot_flags, int map_flags); + +/** + * scif_unregister - Mark a memory region for remote access. + * \param epd endpoint descriptor + * \param offset start of range to unregister + * \param len length of range to unregister + * + * The scif_unregister() function closes those previously registered windows + * which are entirely within the range [offset,offset+len-1]. It is an error to + * specify a range which intersects only a subrange of a window. + * + * On a successful return, pages within the window may no longer be specified + * in calls to scif_mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(), + * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, however, + * continues to exist until all previous references against it are removed. A + * window is referenced if there is a mapping to it created by scif_mmap(), or if + * scif_get_pages() was called against the window (and the pages have not been + * returned via scif_put_pages()). A window is also referenced while an RMA, in + * which some range of the window is a source or destination, is in progress. + * Finally a window is referenced while some offset in that window was specified + * to scif_fence_signal(), and the RMAs marked by that call to + * scif_fence_signal() have not completed. While a window is in this state, its + * registered address space pages are not available for use in a new registered + * window. + * + * When all such references to the window have been removed, its references to + * all the physical pages which it represents are removed. Similarly, the + * registered address space pages of the window become available for + * registration in a new window. + * + *\return + * Upon successful completion, scif_unregister() returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. In the event of an + * error, no windows are unregistered. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - The range [offset,offset+len-1] intersects a subrange of a window, or + * - offset is negative + *- ENODEV + * -The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENXIO + * - Addresses in the range [offset,offset+len-1] are invalid for the + * registered address space of epd. + */ +int scif_unregister(scif_epd_t epd, off_t offset, size_t len); + + +/** + * scif_readfrom - Copy from a remote address space + * \param epd endpoint descriptor + * \param loffset offset in local registered address space to + * which to copy + * \param len length of range to copy + * \param roffset offset in remote registered address space + * from which to copy + * \param rma_flags transfer mode flags + * + * scif_readfrom() copies len bytes from the remote registered address space of + * the peer of endpoint epd, starting at the offset roffset to the local + * registered address space of epd, starting at the offset loffset. 
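+ *
+ *\par Example:
+ * (Editorial sketch of window registration followed by a synchronous RMA
+ * read; not part of the original header. It assumes a 4 KiB page size, a
+ * connected user-mode endpoint epd, and a remote offset roffset that the
+ * peer is assumed to have communicated for a window it registered.)
+ *
+ *     size_t len = 0x1000;
+ *     void *buf;
+ *     if (posix_memalign(&buf, 0x1000, len))
+ *         return -1;
+ *     off_t loffset = scif_register(epd, buf, len, 0,
+ *                                   SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
+ *     if (loffset == SCIF_REGISTER_FAILED)
+ *         return -1;
+ *     /* Pull len bytes from the peer's window into the local window */
+ *     if (scif_readfrom(epd, loffset, len, roffset, SCIF_RMA_SYNC) < 0)
+ *         return -1;
+ *     scif_unregister(epd, loffset, len);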
+ * + * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+ + * len-1] must be within some registered window or windows of the local and + * remote nodes respectively. A range may intersect multiple registered + * windows, but only if those windows are contiguous in the registered address + * space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two aynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if loffset and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if loffset and roffset are not separated by a + * multiple of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values: + *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA + * engine. + *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag might result in + * the API busy waiting and consuming CPU cycles while the DMA + * transfer is in progress. + *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + *\return + * Upon successful completion, scif_readfrom() returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + *\par Errors + *- EACCESS + * - Attempt to write to a read-only range or read from a write-only range + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - rma_flags is invalid + *- ENODEV + * -The remote node is lost. 
+ *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENXIO + * - The range [loffset,loffset+len-1] is invalid for the registered address + * space of epd, or, + * - The range [roffset,roffset+len-1] is invalid for the registered address + * space of the peer of epd, or + * - loffset or roffset is negative +*/ +int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t +roffset, int rma_flags); + +/** + * scif_writeto - Copy to a remote address space + * \param epd endpoint descriptor + * \param loffset offset in local registered address space + * from which to copy + * \param len length of range to copy + * \param roffset offset in remote registered address space to + * which to copy + * \param rma_flags transfer mode flags + * + * scif_writeto() copies len bytes from the local registered address space of + * epd, starting at the offset loffset to the remote registered address space + * of the peer of endpoint epd, starting at the offset roffset. + * + * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+ + * len-1] must be within some registered window or windows of the local and + * remote nodes respectively. A range may intersect multiple registered + * windows, but only if those windows are contiguous in the registered address + * space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two aynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if loffset and roffset are not cacheline + * aligned but are separated by some multiple of 64. The lowest level of + * performance is likely if loffset and roffset are not separated by a multiple + * of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values: + *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA + * engine. + *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag might result in + * the API busy waiting and consuming CPU cycles while the DMA + * transfer is in progress. 
+ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + *\return + * Upon successful completion, scif_readfrom() returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + *\par Errors: + *- EACCESS + * - Attempt to write to a read-only range or read from a write-only range + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - rma_flags is invalid + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENXIO + * - The range [loffset,loffset+len-1] is invalid for the registered address + * space of epd, or, + * - The range [roffset , roffset + len -1] is invalid for the registered + * address space of the peer of epd, or + * - loffset or roffset is negative + */ +int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t +roffset, int rma_flags); + +/** + * scif_vreadfrom - Copy from a remote address space + * \param epd endpoint descriptor + * \param addr address to which to copy + * \param len length of range to copy + * \param roffset offset in remote registered address space + * from which to copy + * \param rma_flags transfer mode flags + * + * scif_vreadfrom() copies len bytes from the remote registered address + * space of the peer of endpoint epd, starting at the offset roffset, to local + * memory, starting at addr. addr is interpreted as a user space address. + * + * The specified range [roffset,roffset+len-1] must be within some registered + * window or windows of the remote nodes respectively. The range may intersect + * multiple registered windows, but only if those windows are contiguous in the + * registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two aynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back + * the specified local memory range may be remain in a pinned state even after + * the specified transfer completes. This may reduce overhead if some or all of + * the same virtual address range is referenced in a subsequent call of + * scif_vreadfrom() or scif_vwriteto(). + * + * The optimal DMA performance will likely be realized if both + * loffset and roffset are cacheline aligned (are a multiple of 64). 
Lower + * performance will likely be realized if loffset and roffset are not + * cacheline aligned but are separated by some multiple of 64. The lowest level + * of performance is likely if loffset and roffset are not separated by a + * multiple of 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values: + *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA + * engine. + *- SCIF_RMA_USECACHE: enable registration caching + *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag might result in + * the API busy waiting and consuming CPU cycles while the DMA + * transfer is in progress. + *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + *\return + * Upon successful completion, scif_vreadfrom() returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + *\par Errors: + *- EACCESS + * - Attempt to write to a read-only range or read from a write-only range + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EFAULT + * - Addresses in the range [addr,addr+len-1] are invalid + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - rma_flags is invalid + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENXIO + * - Addresses in the range [roffset,roffset+len-1] are invalid for the + * registered address space of epd. + */ +int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t offset, +int rma_flags); + +/** + * scif_vwriteto - Copy to a remote address space + * \param epd endpoint descriptor + * \param addr address from which to copy + * \param len length of range to copy + * \param roffset offset in remote registered address space to + * which to copy + * \param rma_flags transfer mode flags + * + * scif_vwriteto() copies len bytes from the local memory, starting at addr, to + * the remote registered address space of the peer of endpoint epd, starting at + * the offset roffset. addr is interpreted as a user space address. + * + * The specified range [roffset,roffset+len-1] must be within some registered + * window or windows of the remote nodes respectively. The range may intersect + * multiple registered windows, but only if those windows are contiguous in the + * registered address space. + * + * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using + * programmed read/writes. Otherwise the data is copied using DMA. If rma_- + * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the + * transfer is complete. Otherwise, the transfer may be performed asynchron- + * ously. The order in which any two aynchronous RMA operations complete + * is non-deterministic. The synchronization functions, scif_fence_mark()/ + * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to + * the completion of asynchronous RMA operations. + * + * The DMA transfer of individual bytes is not guaranteed to complete in + * address order. 
If rma_flags includes SCIF_RMA_ORDERED, then the last + * cacheline or partial cacheline of the source range will become visible on + * the destination node after all other transferred data in the source + * range has become visible on the destination node. + * + * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back + * the specified local memory range may be remain in a pinned state even after + * the specified transfer completes. This may reduce overhead if some or all of + * the same virtual address range is referenced in a subsequent call of + * scif_vreadfrom() or scif_vwriteto(). + * + * The optimal DMA performance will likely be realized if both + * addr and offset are cacheline aligned (are a multiple of 64). Lower + * performance will likely be realized if addr and offset are not cacheline + * aligned but are separated by some multiple of 64. The lowest level of + * performance is likely if addr and offset are not separated by a multiple of + * 64. + * + * The rma_flags argument is formed by ORing together zero or more of the + * following values: + *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA + * engine. + *- SCIF_RMA_USECACHE: allow registration caching + *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the + * transfer has completed. Passing this flag might result in + * the API busy waiting and consuming CPU cycles while the DMA + * transfer is in progress. + *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of + * the source range becomes visible on the destination node + * after all other transferred data in the source range has + * become visible on the destination + * + *\return + * Upon successful completion, scif_vwriteto () returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + *\par Errors: + *- EACCESS + * - Attempt to write to a read-only range or read from a write-only range + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EFAULT + * - Addresses in the range [addr,addr+len-1] are invalid + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - rma_flags is invalid + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENXIO + * - Addresses in the range [roffset,roffset+len-1] are invalid for the + * registered address space of epd. + */ +int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t offset, +int rma_flags); + +/** + * scif_fence_mark - Mark previously issued RMAs + * \param epd endpoint descriptor + * \param flags control flags + * \param mark marked handle returned as output. + * + * scif_fence_mark() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are + * marked with a value returned at mark. The application may subsequently call + * scif_fence_wait(), passing the value returned at mark, to await completion + * of all RMAs so marked. 
+ * + * The flags argument has exactly one of the following values: + *- SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint + * epd are marked + *- SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer + * of endpoint epd are marked + * + * \return + * Upon successful completion, scif_fence_mark() returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - flags is invalid, or + * - epd is not a valid endpoint descriptor, or + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOMEM + * - Insufficient kernel memory was available. + *- ENOTTY + * - epd is not a valid endpoint descriptor + */ +int scif_fence_mark(scif_epd_t epd, int flags, int *mark); + +/** + * scif_fence_wait - Wait for completion of marked RMAs + * + * \param epd endpoint descriptor + * \param mark mark request + * + * scif_fence_wait() returns after all RMAs marked with mark have completed. + * The value passed in mark must have been obtained in a previous call to + * scif_fence_mark(). + * + *\return + * Upon successful completion, scif_fence_wait() returns 0; otherwise: in user + * mode -1 is returned and errno is set to indicate the error; in kernel mode + * the negative of one of the following errors is returned. + * + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOMEM + * - Insufficient kernel memory was available. + *- ENOTTY + * - epd is not a valid endpoint descriptor + */ +int scif_fence_wait(scif_epd_t epd, int mark); + +/** + * scif_fence_signal - Request a signal on completion of RMAs + * \param loff local offset + * \param lval local value to write to loffset + * \param roff remote offset + * \param rval remote value to write to roffset + * \param flags flags + * + * scif_fence_signal() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or marking the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. + * + * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the + * marked set, lval is written to memory at the address corresponding to offset + * loff in the local registered address space of epd. loff must be within a + * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion + * of the RMAs in the marked set, rval is written to memory at the * address + * corresponding to offset roff in the remote registered address space of epd. + * roff must be within a remote registered window of the peer of epd. Note + * that any specified offset must be DWORD (4 byte / 32 bit) aligned. 
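+ *
+ * As an illustrative sketch only (epd and loff are assumptions; loff must
+ * fall inside a local registered window; error handling elided), a
+ * doorbell-style signal using the flag values described below might be
+ * requested as:
+ * \code
+ *	// ask for the value 1 to be written at local offset loff once all
+ *	// RMAs initiated so far through epd have completed
+ *	err = scif_fence_signal(epd, loff, 1, 0, 0,
+ *				SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL);
+ * \endcode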
+ * + * The flags argument is formed by OR'ing together the following: + *- Exactly one of the following values: + * - SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint + * epd are marked + * - SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer + * of endpoint epd are marked + *- One or more of the following values: + * - SCIF_SIGNAL_LOCAL: On completion of the marked set of RMAs, write lval to + * memory at the address corresponding to offset loff in the local registered + * address space of epd. + * - SCIF_SIGNAL_REMOTE: On completion of the marked set of RMAs, write lval to + * memory at the address corresponding to offset roff in the remote registered + * address space of epd. + * + *\return + * Upon successful completion, scif_fence_signal() returns 0; otherwise: in + * user mode -1 is returned and errno is set to indicate the error; in kernel + * mode the negative of one of the following errors is returned. + *\par Errors: + *- EBADF + * - epd is not a valid endpoint descriptor + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - flags is invalid, or + * - loff or roff are not DWORD aligned + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENOTTY + * - epd is not a valid endpoint descriptor + *- ENXIO + * - loff is invalid for the registered address of epd, or + * - roff is invalid for the registered address space, of the peer of epd + */ +int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, off_t roff, +uint64_t rval, int flags); + +/** + * scif_get_nodeIDs - Return information about online nodes + * \param nodes array in which to return online node IDs + * \param len number of entries in the nodes array + * \param self address to place the node ID of the local node + * + * scif_get_nodeIDs() fills in the nodes array with up to len node IDs of the + * nodes in the SCIF network. If there is not enough space in nodes, as + * indicated by the len parameter, only len node IDs are returned in nodes. The + * return value of scif_get_nodeID() is the total number of nodes currently in + * the SCIF network. By checking the return value against the len parameter, the user may + * determine if enough space for nodes was allocated. + * + * The node ID of the local node is returned at self. + * + *\return + * Upon successful completion, scif_get_nodeIDs() returns the actual number of + * online nodes in the SCIF network including 'self'; otherwise: in user mode + * -1 is returned and errno is set to indicate the error; in kernel mode no + * errors are returned. + * + *\par Errors: + *- EFAULT + * - Bad address + */ +int scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self); + + +/** + * scif_pin_pages - Pin a set of pages + * \param addr Virtual address of range to pin + * \param len Length of range to pin + * \param prot_flags Page protection flags + * \param map_flags Page classification flags + * \param pinned_pages Opaque handle of pinned pages + * + * scif_pin_pages() pins (locks in physical memory) the physical pages which + * back the range of virtual address pages starting at addr and continuing for + * len bytes. addr and len are constrained to be multiples of the page size. A + * successful scif_register() call returns an opaque pointer value at + * pinned_pages which may be used in subsequent calls to + * scif_register_pinned_pages(). 
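+ *
+ * As an illustrative kernel-mode sketch only (epd is an assumed connected
+ * endpoint; error handling elided), pinning and registering four pages with
+ * the prot_flags and map_flags values listed below might look like:
+ * \code
+ *	scif_pinned_pages_t pp;
+ *	void *buf = (void *)__get_free_pages(GFP_KERNEL, 2);	// 4 pages
+ *	off_t win;
+ *	int err;
+ *
+ *	err = scif_pin_pages(buf, 4 * PAGE_SIZE,
+ *			SCIF_PROT_READ | SCIF_PROT_WRITE,
+ *			SCIF_MAP_KERNEL, &pp);
+ *	win = scif_register_pinned_pages(epd, pp, 0, 0);
+ *	// ... RMAs against the window at offset win ...
+ *	err = scif_unregister(epd, win, 4 * PAGE_SIZE);
+ *	err = scif_unpin_pages(pp);
+ * \endcode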
+ * + * The pages will remain pinned as long as there is a reference against the + * scif_pinned_pages_t value returned by scif_pin_pages() and until + * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A + * reference is added to a scif_pinned_pages_t value each time a window is + * created by calling scif_register_pinned_pages() and passing the + * scif_pinned_pages_t value. A reference is removed from a scif_pinned_pages_t value + * each time such a window is deleted. + * + * Subsequent operations which change the memory pages to which virtual + * addresses are mapped (such as mmap(), munmap(), scif_mmap() and + * scif_munmap()) have no effect on the scif_pinned_pages_t value or windows + * created against it. + * + * On Linux, if the process will fork(), it is recommended that the registered + * virtual address range be marked with MADV_DONTFORK. Doing so will prevent + * problems due to copy-on-write semantics. + * + * The prot_flags argument is formed by OR'ing together one or more of the + * following values: + *- SCIF_PROT_READ: allow read operations against the pages + *- SCIF_PROT_WRITE: allow write operations against the pages + * The map_flags argument is formed by OR'ing together zero or more of the + * following values: + *- SCIF_MAP_KERNEL: interpret addr as a kernel space address. By default, addr + * is interpreted as a user space address. + * + *\return + * Upon successful completion, scif_register() returns 0; otherwise the + * negative of one of the following errors is returned. + *\par Errors: + *- EFAULT + * - Addresses in the range [addr,addr+len-1] are invalid + *- EINVAL + * - prot_flags is invalid, + * - map_flags is invalid, or + * - offset is negative + *- ENOMEM + * - Not enough space + */ +int +scif_pin_pages( + void *addr, + size_t len, + int prot_flags, + int map_flags, + scif_pinned_pages_t *pinned_pages); + +/** + * scif_unpin_pages - Unpin a set of pages + * \param pinned_pages Opaque handle of pages to be unpinned + * + * scif_unpin_pages() prevents scif_register_pinned_pages()from registering new + * windows against pinned_pages. The physical pages represented by pinned_pages + * will remain pinned until all windows previously registered against + * pinned_pages are deleted (the window is scif_unregister()'d and all + * references to the window are removed (see scif_unregister()). + * + * pinned_pages must have been obtain from a previous call to scif_pin_pages(). + * After calling scif_unpin_pages(), it is an error to pass pinned_pages to + * scif_register_pinned_pages(). + * + *\return: + * Upon successful completion, scif_unpin_pages() returns 0; otherwise the + * negative of one of the following errors is returned. + * + *\par Errors: + *- EINVAL + * - pinned_pages is not valid + */ +int +scif_unpin_pages( + scif_pinned_pages_t pinned_pages); + +/** + * scif_register_pinned_pages - Mark a memory region for remote access. + * \param epd Endpoint descriptor + * \param pinned_pages Opaque handle of pinned pages + * \param offset Registered address space offset + * \param map_flags Flags which control where pages are mapped + * + * The scif_register_pinned_pages() function opens a window, a range of whole + * pages of the registered address space of the endpoint epd, starting at + * offset po. The value of po, further described below, is a function of the + * parameters offset and pinned_pages, and the value of map_flags. 
Each page of + * the window represents a corresponding physical memory page of the range + * represented by pinned_pages; the length of the window is the same as the + * length of range represented by pinned_pages. A successful scif_register() + * call returns po as the return value. + * + * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset + * exactly, and offset is constrained to be a multiple of the page size. The + * mapping established by scif_register() will not replace any existing + * registration; an error is returned if any page of the new window would + * intersect an existing window. + * + * When SCIF_MAP_FIXED is not set, the implementation uses offset in an + * implementation-defined manner to arrive at po. The po so chosen will be an + * area of the registered address space that the implementation deems suitable + * for a mapping of the required size. An offset value of 0 is interpreted as + * granting the implementation complete freedom in selecting po, subject to + * constraints described below. A non-zero value of offset is taken to be a + * suggestion of an offset near which the mapping should be placed. When the + * implementation selects a value for po, it does not replace any extant + * window. In all cases, po will be a multiple of the page size. + * + * The physical pages which are so represented by a window are available for + * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(), + * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the + * physical pages represented by the window will not be reused by the memory + * subsytem for any other purpose. Note that the same physical page may be + * represented by multiple windows. + * + * Windows created by scif_register_pinned_pages() are unregistered by + * scif_unregister(). + * + * The map_flags argument is formed by OR'ing together zero or more of the + * following values: + *- SCIF_MAP_FIXED: interpret offset exactly + * + *\return + * Upon successful completion, scif_register_pinned_pages() returns the offset + * at which the mapping was placed (po); otherwise the negative of one of the + * following errors is returned. + *\par Errors: + *- EADDRINUSE + * - SCIF_MAP_FIXED is set in map_flags and pages in the new + * window would intersect an existing window + *- EAGAIN + * - The mapping could not be performed due to lack of resources + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - map_flags is invalid, or + * - SCIF_MAP_FIXED is set in map_flags, and offset is not a + * multiple of the page size, or + * - offset is negative + *- ENODEV + * - The remote node is lost. + *- ENOMEM + * - Not enough space + *- ENOTCONN + * - The endpoint is not connected + */ +off_t +scif_register_pinned_pages( + scif_epd_t epd, + scif_pinned_pages_t pinned_pages, + off_t offset, + int map_flags); + +/** + * scif_get_pages - Add references to remote registered pages + * \param epd endpoint descriptor + * \param offset registered address space offset + * \param len length of range of pages + * \param pages returned scif_range structure + * + * scif_get_pages() returns the addresses of the physical pages represented by + * those pages of the registered address space of the peer of epd, starting at + * offset and continuing for len bytes. offset and len are constrained to be + * multiples of the page size. 
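+ *
+ * As an illustrative kernel-mode sketch only (epd and roffset are
+ * assumptions; roffset must fall inside a single remote window; error
+ * handling elided):
+ * \code
+ *	struct scif_range *range;
+ *	int i, err;
+ *
+ *	err = scif_get_pages(epd, roffset, 2 * PAGE_SIZE, &range);
+ *	for (i = 0; !err && i < range->nr_pages; i++)
+ *		pr_debug("page %d at %#llx\n", i,
+ *			 (unsigned long long)range->phys_addr[i]);
+ *	if (!err)
+ *		err = scif_put_pages(range);
+ * \endcode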
+ * + * All of the pages in the specified range [offset,offset+len-1] must be within + * a single window of the registered address space of the peer of epd. + * + * The addresses are returned as a virtually contiguous array pointed to by the + * phys_addr component of the scif_range structure whose address is returned in + * pages. The nr_pages component of scif_range is the length of the array. The + * prot_flags component of scif_range holds the protection flag value passed + * when the pages were registered. + * + * Each physical page whose address is returned by scif_get_pages() remains + * available and will not be released for reuse until the scif_range structure + * is returned in a call to scif_put_pages(). The scif_range structure returned + * by scif_get_pages() must be unmodified. + * + * It is an error to call scif_close() on an endpoint on which a scif_range + * structure of that endpoint has not been returned to scif_put_pages(). + * + *\return + * Upon successful completion, scif_get_pages() returns 0; otherwise the + * negative of one of the following errors is returned. + *\par Errors: + *- ECONNRESET + * - A connection was forcibly closed by a peer. + *- EINVAL + * - epd is not a valid endpoint descriptor, or + * - offset is not a multiple of the page size, or + * - offset is negative, or + * - len is not a multiple of the page size + *- ENODEV + * -The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected + *- ENXIO + * - Addresses in the range [offset,offset+len-1] are invalid + * for the registered address space of the peer epd. + */ +int scif_get_pages( + scif_epd_t epd, + off_t offset, + size_t len, + struct scif_range **pages); + +/** + * scif_put_pages - Remove references from remote registered pages + * \param pages pages to be returned + * + * scif_put_pages() releases a scif_range structure previously obtained by + * calling scif_get_pages(). The physical pages represented by pages may + * be reused when the window which represented those pages is unregistered. + * Therefore, those pages must not be accessed after calling scif_put_pages(). + * + *\return + * Upon successful completion, scif_put_pages() returns 0; otherwise the + * negative of one of the following errors is returned. + *\par Errors: + *- EINVAL + * - pages does not point to a valid scif_range structure, or + * - the scif_range structure pointed to by pages was already returned. + *- ENODEV + * - The remote node is lost. + *- ENOTCONN + * - The endpoint is not connected. + */ +int scif_put_pages( + struct scif_range *pages); + +/** + * scif_poll - Wait for some event on an endpoint + * \param epds Array of endpoint descriptors + * \param nepds Length of epds + * \param timeout Upper limit on time for which scif_poll() will + * block + * + * scif_poll() waits for one of a set of endpoints to become ready to perform + * an I/O operation. scif_poll() exposes a subset of the functionality of the + * POSIX standard poll() function. + * + * The epds argument specifies the endpoint descriptors to be examined and the + * events of interest for each endpoint descriptor. epds is a pointer to an + * array with one member for each open endpoint descriptor of interest. + * + * The number of items in the epds array is specified in nepds. The epd field + * of scif_pollepd is an endpoint descriptor of an open endpoint. The field + * events is a bitmask specifying the events which the application is + * interested in. 
The field revents is an output parameter, filled by the + * kernel with the events that actually occurred. The bits returned in revents + * can include any of those specified in events, or one of the values + * SCIF_POLLERR, SCIF_POLLHUP, or SCIF_POLLNVAL. (These three bits are + * meaningless in the events field, and will be set in the revents field + * whenever the corresponding condition is true.) + * + * If none of the events requested (and no error) has occurred for any of the + * endpoint descriptors, then scif_poll() blocks until one of the events occurs. + * + * The timeout argument specifies an upper limit on the time for which + * scif_poll() will block, in milliseconds. Specifying a negative value in + * timeout means an infinite timeout. + * + * The following bits may be set in events and returned in revents: + *- SCIF_POLLIN: Data may be received without blocking. For a connected + * endpoint, this means that scif_recv() may be called without blocking. For a + * listening endpoint, this means that scif_accept() may be called without + * blocking. + *- SCIF_POLLOUT: Data may be sent without blocking. For a connected endpoint, + * this means that scif_send() may be called without blocking. This bit value + * has no meaning for a listening endpoint and is ignored if specified. + * + * The following bits are only returned in revents, and are ignored if set in + * events: + *- SCIF_POLLERR: An error occurred on the endpoint + *- SCIF_POLLHUP: The connection to the peer endpoint was disconnected + *- SCIF_POLLNVAL: The specified endpoint descriptor is invalid. + * + *\return + * Upon successful completion, scif_poll()returns a non-negative value. A + * positive value indicates the total number of endpoint descriptors that have + * been selected (that is, endpoint descriptors for which the revents member is + * non-zero. A value of 0 indicates that the call timed out and no endpoint + * descriptors have been selected. Otherwise: in user mode -1 is returned and + * errno is set to indicate the error; in kernel mode the negative of one of + * the following errors is returned. + * + *\par Errors: + *- EFAULT + * - The array given as argument was not contained in the calling program's + * address space. + *- EINTR + * - A signal occurred before any requested event. + *- EINVAL + * - The nepds argument is greater than {OPEN_MAX} + *- ENOMEM + * - There was no space to allocate file descriptor tables. +*/ +int +scif_poll( + struct scif_pollepd *epds, + unsigned int nepds, + long timeout); + +/** + * scif_event_register - Register an event handler + * \param handler Event handler to be registered + * + * scif_event_register() registers a routine, handler, to be called when some + * event occurs. The event parameter to handler indicates the type of event + * which has occurred, and the corresponding component of the data parameter to + * handler provides additional data about the event. + * + * The following events are defined: + *- SCIF_NODE_ADDED: A node has been added to the SCIF network. The + * scif_node_added component of the data parameter to handler identifies the + * node. This event is informational. There are no requirements on the event + * handler. + *- SCIF_NODE_REMOVED: A node is being removed from the SCIF network. The + * scif_node_removed component of the data parameter to handler identifies the + * node. 
Upon being called, and before returning, the event handler must + * return, using scif_put_pages(), all structures obtained using + * scif_get_pages() against an endpoint connected to the lost node. It is + * recommended and expected that the handler will also scif_close() all + * endpoints connected to the lost node. + * + *\return + * Upon successful completion scif_event_register() returns 0. + * + *\par Errors: + *- ENOMEM + * - There was no space to allocate file descriptor tables. +*/ + +int +scif_event_register( + scif_callback_t handler); + +/** + * scif_event_unregister - Unregister event handler + * \param handler Event handler to be unregistered + * + * scif_event_unregister() unregisters the handler which was registered + * previously by using scif_event_register(). + * + * WARNING: scif_event_unregister must be called before the module + * (that registered handles) exits for every handler that is registered. + * Failure to do so will result in crash of the scif module. + * + *\return + * Upon successful completion scif_event_unregister() returns 0. + *\par Errors: + *- EINVAL + * -If the event handler was not found/registered. +*/ +int +scif_event_unregister( + scif_callback_t handler); + +/* + * Note: The callee can use pci_resource_start(dev, index) and + * pci_resource_len(dev, index) to obtain the PCI resource starting + * physical address and length for valid non null indexes of the va + * array. MMIO bars will not have IORESOURCE_PREFETCH set in the + * flags obtained from pci_resource_flags(dev, index). va[index] + * will be set to NULL for invalid resources. + */ +struct scif_pci_info { + /* pci_dev pointer associated with a node */ + struct pci_dev *pdev; + /* Ioremapped virtual address base for every valid PCIe resource */ + void __iomem *va[PCI_NUM_RESOURCES]; +}; + +/** + * scif_pci_info - Populate the scif_pci_info structure for a node. + * \param node The node to query + * \param dev The scif_pci_info structure to populate. + * + * scif_pci_info() populates the provided scif_pci_info structure + * associated with a node. The requested node ID cannot be the same as + * the current node. This routine will only return success when called from + * the host. + * + *\return + * Upon successful completion, scif_pci_info() returns 0; otherwise the + * negative of one of the following errors is returned. + * + *\par Errors: + *- EINVAL + * - The requested node is not valid. + * - Called on MIC instead of the host. + *- ENODEV + * - No pci_dev association exists for the node. + */ +int +scif_pci_info( + uint16_t node, + struct scif_pci_info *dev); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __SCIF_H__ */ diff --git a/include/scif_ioctl.h b/include/scif_ioctl.h new file mode 100644 index 0000000..fd72fc4 --- /dev/null +++ b/include/scif_ioctl.h @@ -0,0 +1,225 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * ----------------------------------------- + * SCIF IOCTL interface information + * ----------------------------------------- + */ +#if defined(_WIN32) && !defined(_WIN64) +#define ptr64_t __ptr64 +#else +#define ptr64_t +#endif + +/** + * The purpose of SCIF_VERSION is to check for compatibility between host and + * card SCIF modules and also between SCIF driver and libscif. This version + * should be incremented whenever a change is made to SCIF that affects the + * interface between SCIF driver and libscif or between the card and host SCIF + * driver components. + */ +#define SCIF_VERSION 1 + +/** + * struct scifioctl_connect: + * + * \param self used to read back the assigned portID + * \param peer destination node and port to connect to + * + * This structure is used for CONNECT IOCTL. + */ +struct scifioctl_connect { + struct scif_portID self; + struct scif_portID peer; +}; + + +/** + * struct scifioctl_accept: + * + * \param flags flags + * \param peer global id of peer endpoint + * \param newepd new connected endpoint descriptor + * + * This structure is used for SCIF_ACCEPTREQ IOCTL. + */ +struct scifioctl_accept { + int flags; + struct scif_portID peer; + void * ptr64_t endpt; +}; + +/** + * struct scifioctl_msg: + * + * \param msg message buffer address + * \param len message length + * \param flags flags + * \param out_len Number of bytes sent/received. + * + * This structure is used for SCIF_SEND/SCIF_RECV IOCTL. + */ +struct scifioctl_msg { + void * ptr64_t msg; + int len; + int flags; + int out_len; +}; + +/** + * struct scifioctl_reg: + * + * \param addr starting virtual address + * \param len length of range + * \param offset offset of window + * \param prot read/write protection + * \param flags flags + * \param out_len offset returned. + * + * This structure is used for SCIF_REG IOCTL. + */ +struct scifioctl_reg { + void * ptr64_t addr; + uint64_t len; + off_t offset; + int prot; + int flags; + off_t out_offset; +}; + +/** + * struct scifioctl_unreg: + * + * \param offset start of range to unregister + * \param len length of range to unregister + * + * This structure is used for SCIF_UNREG IOCTL. 
+ */ +struct scifioctl_unreg { + off_t offset; + uint64_t len; +}; + +/** + * struct scifioctl_copy: + * + * \param loffset offset in local registered address space to/from +which to copy + * \param len length of range to copy + * \param roffset offset in remote registered address space to/from +which to copy + * \param addr user virtual address to/from which to copy + * \param flags flags + * + * This structure is used for SCIF_READFROM, SCIF_WRITETO, SCIF_VREADFROM +and + * SCIF_VREADFROM IOCTL's. + */ +struct scifioctl_copy { + off_t loffset; + uint64_t len; + off_t roffset; + uint8_t * ptr64_t addr; + int flags; +}; + +/** + * struct scifioctl_fence_mark: + * + * \param flags flags + * \param mark Fence handle returned by reference. + * + * This structure is used from SCIF_FENCE_MARK IOCTL. + */ +struct scifioctl_fence_mark { + int flags; + int *mark; +}; + +/** + * struct scifioctl_fence_signal: + * + * \param loff local offset + * \param lval local value to write to loffset + * \param roff remote offset + * \param rval remote value to write to roffset + * \param flags flags + * + * This structure is used for SCIF_FENCE_SIGNAL IOCTL. + */ +struct scifioctl_fence_signal { + off_t loff; + uint64_t lval; + off_t roff; + uint64_t rval; + int flags; +}; + +/** + * struct scifioctl_nodeIDs: + * + * \param nodes pointer to an array of nodeIDs + * \param len length of array + * \param self ID of the current node + * + * This structure is used for the SCIF_GET_NODEIDS ioctl + */ +struct scifioctl_nodeIDs { + uint16_t * ptr64_t nodes; + int len; + uint16_t * ptr64_t self; +}; + + +#define SCIF_BIND _IOWR('s', 1, int *) +#define SCIF_LISTEN _IOW('s', 2, int) +#define SCIF_CONNECT _IOWR('s', 3, struct scifioctl_connect *) +#define SCIF_ACCEPTREQ _IOWR('s', 4, struct scifioctl_accept *) +#define SCIF_ACCEPTREG _IOWR('s', 5, void *) +#define SCIF_SEND _IOWR('s', 6, struct scifioctl_msg *) +#define SCIF_RECV _IOWR('s', 7, struct scifioctl_msg *) +#define SCIF_REG _IOWR('s', 8, struct scifioctl_reg *) +#define SCIF_UNREG _IOWR('s', 9, struct scifioctl_unreg *) +#define SCIF_READFROM _IOWR('s', 10, struct scifioctl_copy *) +#define SCIF_WRITETO _IOWR('s', 11, struct scifioctl_copy *) +#define SCIF_VREADFROM _IOWR('s', 12, struct scifioctl_copy *) +#define SCIF_VWRITETO _IOWR('s', 13, struct scifioctl_copy *) +#define SCIF_GET_NODEIDS _IOWR('s', 14, struct scifioctl_nodeIDs *) +#define SCIF_FENCE_MARK _IOWR('s', 15, struct scifioctl_fence_mark *) +#define SCIF_FENCE_WAIT _IOWR('s', 16, int) +#define SCIF_FENCE_SIGNAL _IOWR('s', 17, struct scifioctl_fence_signal *) + +#define SCIF_GET_VERSION _IO('s', 23) diff --git a/mic.conf b/mic.conf new file mode 100644 index 0000000..a661522 --- /dev/null +++ b/mic.conf @@ -0,0 +1,32 @@ +# Options for the Intel Many Integrated Core Co-processor card driver +# +# p2p enables the use of the SCIF interface peer to peer communication +# 1 to enable or 0 to disable +# +# p2p_proxy enables the use of SCIF P2P Proxy DMA which converts DMA +# reads into DMA writes for performance on certain Intel platforms. +# 1 to enable or 0 to disable +# +# reg_cache enables SCIF Registration Caching +# 1 to enable or 0 to disable +# +# huge_page enables SCIF Huge Page Support +# 1 to enable or 0 to disable +# +# watchdog enables the SCIF watchdog for Lost Node detection. +# 1 to enable or 0 to disable +# +# watchdog_auto_reboot configures the behavior of the MIC host driver +# upon detection of a lost node. This option is a nop if watchdog=0. 
+# 1 Allow the host driver to reboot the node back to "online" state +# 0 Allow the host driver to reset the node back to "ready" state. +# It will be upto the user to reboot the node or not. +# +# crash_dump enables uOS Kernel Crash Dump Captures +# 1 to enable or 0 to disable +# +# ulimit enables ulimit checks on max locked memory for scif_register +# 1 to enable or 0 to disable +# +options mic reg_cache=1 huge_page=1 watchdog=1 watchdog_auto_reboot=1 crash_dump=1 p2p=1 p2p_proxy=1 ulimit=0 +options mic_host reg_cache=1 huge_page=1 watchdog=1 watchdog_auto_reboot=1 crash_dump=1 p2p=1 p2p_proxy=1 ulimit=0 diff --git a/mic.modules b/mic.modules new file mode 100755 index 0000000..a95f8b0 --- /dev/null +++ b/mic.modules @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ ! -d /sys/class/mic ]; then + exec /sbin/modprobe mic >/dev/null 2>&1 +fi diff --git a/micscif/Kbuild b/micscif/Kbuild new file mode 100644 index 0000000..1171632 --- /dev/null +++ b/micscif/Kbuild @@ -0,0 +1,21 @@ +obj-m := ringbuffer.o +obj-m += micscif.o + +ringbuffer-objs := micscif_rb.o + +micscif-objs := micscif_main.o +micscif-objs += micscif_sysfs.o +micscif-objs += micscif_smpt.o +micscif-objs += micscif_intr.o +micscif-objs += micscif_api.o +micscif-objs += micscif_fd.o +micscif-objs += micscif_nodeqp.o +micscif-objs += micscif_va_node.o +micscif-objs += micscif_va_gen.o +micscif-objs += micscif_rma.o +micscif-objs += micscif_rma_list.o +micscif-objs += micscif_rma_dma.o +micscif-objs += micscif_debug.o +micscif-objs += micscif_ports.o +micscif-objs += micscif_select.o +micscif-objs += micscif_nm.o diff --git a/micscif/micscif_api.c b/micscif/micscif_api.c new file mode 100644 index 0000000..e13e59d --- /dev/null +++ b/micscif/micscif_api.c @@ -0,0 +1,3464 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "scif.h" +#include "mic/micscif.h" +#ifndef _MIC_SCIF_ +#include "mic_common.h" +#endif +#include "mic/micscif_map.h" + +#define SCIF_MAP_ULIMIT 0x40 + +bool mic_ulimit_check = 0; + +char *scif_ep_states[] = { + "Closed", + "Unbound", + "Bound", + "Listening", + "Connected", + "Connecting", + "Mapping", + "Closing", + "Close Listening", + "Disconnected", + "Zombie"}; + +enum conn_async_state { + ASYNC_CONN_IDLE = 1, /* ep setup for async connect */ + ASYNC_CONN_INPROGRESS, /* async connect in progress */ + ASYNC_CONN_FLUSH_WORK /* async work flush in progress */ +}; + +/** + * scif_open() - Create a SCIF end point + * + * Create a SCIF end point and set the state to UNBOUND. This function + * returns the address of the end point data structure. + */ +scif_epd_t +__scif_open(void) +{ + struct endpt *ep; + + might_sleep(); + if ((ep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL)) == NULL) { + printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point descriptor\n"); + goto err_ep_alloc; + } + + if ((ep->qp_info.qp = (struct micscif_qp *) + kzalloc(sizeof(struct micscif_qp), GFP_KERNEL)) == NULL) { + printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point queue pointer\n"); + goto err_qp_alloc; + } + + spin_lock_init(&ep->lock); + mutex_init (&ep->sendlock); + mutex_init (&ep->recvlock); + + if (micscif_rma_ep_init(ep) < 0) { + printk(KERN_ERR "SCIFAPI _open: RMA EP Init failed\n"); + goto err_rma_init; + } + + ep->state = SCIFEP_UNBOUND; + pr_debug("SCIFAPI open: ep %p success\n", ep); + return (scif_epd_t)ep; + +err_rma_init: + kfree(ep->qp_info.qp); +err_qp_alloc: + kfree(ep); +err_ep_alloc: + return NULL; +} + +scif_epd_t +scif_open(void) +{ + struct endpt *ep; + ep = (struct endpt *)__scif_open(); + if (ep) + kref_init(&(ep->ref_count)); + return (scif_epd_t)ep; +} +EXPORT_SYMBOL(scif_open); + +/** + * scif_close() - Terminate a SCIF end point + * @epd: The end point address returned from scif_open() + * + * The function terminates a scif connection. It must ensure all traffic on + * the connection is finished before removing it. + * + * On Connection with memory mapped this become more difficult. Once normal + * DMA and message traffic has ended the end point must be placed in a zombie + * state and wait for the other side to also release it's memory references. 
+ */ +int +__scif_close(scif_epd_t epd) +{ + struct endpt *ep = (struct endpt *)epd; + struct endpt *tmpep; + struct list_head *pos, *tmpq; + unsigned long sflags; + enum endptstate oldstate; + int err; + bool flush_conn; + + pr_debug("SCIFAPI close: ep %p %s\n", ep, scif_ep_states[ep->state]); + + might_sleep(); + + spin_lock(&ep->lock); + flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS); + spin_unlock(&ep->lock); + + if (flush_conn) + flush_workqueue(ms_info.mi_conn_wq); + + micscif_inc_node_refcnt(ep->remote_dev, 1); + + spin_lock_irqsave(&ep->lock, sflags); + oldstate = ep->state; + + ep->state = SCIFEP_CLOSING; + + switch (oldstate) { + case SCIFEP_ZOMBIE: + BUG_ON(SCIFEP_ZOMBIE == oldstate); + case SCIFEP_CLOSED: + case SCIFEP_DISCONNECTED: + spin_unlock_irqrestore(&ep->lock, sflags); + micscif_unregister_all_windows(epd); + // Remove from the disconnected list + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + break; + } + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + break; + case SCIFEP_UNBOUND: + case SCIFEP_BOUND: + case SCIFEP_CONNECTING: + spin_unlock_irqrestore(&ep->lock, sflags); + break; + case SCIFEP_MAPPING: + case SCIFEP_CONNECTED: + case SCIFEP_CLOSING: + { + struct nodemsg msg; + struct endpt *fep = NULL; + struct endpt *tmpep; + unsigned long ts = jiffies; + struct list_head *pos, *tmpq; + + // Very short time before mapping completes and state becomes connected + // and does a standard teardown. + ts = jiffies; + while (ep->state == SCIFEP_MAPPING) { + cpu_relax(); + if (time_after((unsigned long)jiffies,ts + NODE_ALIVE_TIMEOUT)) { + printk(KERN_ERR "%s %d ep->state %d\n", __func__, __LINE__, ep->state); + ep->state = SCIFEP_BOUND; + break; + } + } + + init_waitqueue_head(&ep->disconwq); // Wait for connection queue + spin_unlock_irqrestore(&ep->lock, sflags); + + micscif_unregister_all_windows(epd); + + // Remove from the connected list + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_connected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + put_conn_count(ep->remote_dev); + fep = tmpep; + spin_lock(&ep->lock); + break; + } + } + + if (fep == NULL) { + // The other side has completed the disconnect before + // the end point can be removed from the list. Therefore + // the ep lock is not locked, traverse the disconnected list + // to find the endpoint, release the conn lock and + // proceed to teardown the end point below. + list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + break; + } + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + break; + } + + spin_unlock(&ms_info.mi_connlock); + + // Now we are free to close out the connection + msg.uop = SCIF_DISCNCT; + msg.src = ep->port; + msg.dst = ep->peer; + msg.payload[0] = (uint64_t)ep; + msg.payload[1] = ep->remote_ep; + + err = micscif_nodeqp_send(ep->remote_dev, &msg, ep); + spin_unlock_irqrestore(&ep->lock, sflags); + + if (!err) + /* Now wait for the remote node to respond */ + wait_event_timeout(ep->disconwq, + (ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT); + /* + * Grab and release the ep lock to synchronize with the + * thread waking us up. 
If we dont grab this lock, then + * the ep might be freed before the wakeup completes + * resulting in potential memory corruption. + */ + spin_lock_irqsave(&ep->lock, sflags); + spin_unlock_irqrestore(&ep->lock, sflags); + break; + } + case SCIFEP_LISTENING: + case SCIFEP_CLLISTEN: + { + struct conreq *conreq; + struct nodemsg msg; + struct endpt *aep; + + spin_unlock_irqrestore(&ep->lock, sflags); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + + // remove from listen list + list_for_each_safe(pos, tmpq, &ms_info.mi_listen) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + } + } + // Remove any dangling accepts + while (ep->acceptcnt) { + aep = list_first_entry(&ep->li_accept, struct endpt, liacceptlist); + BUG_ON(!aep); + list_del(&aep->liacceptlist); + if (aep->port.port && !aep->accepted_ep) + put_scif_port(aep->port.port); + list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) { + tmpep = list_entry(pos, struct endpt, miacceptlist); + if (tmpep == aep) { + list_del(pos); + break; + } + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_connected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == aep) { + list_del(pos); + put_conn_count(aep->remote_dev); + break; + } + } + list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == aep) { + list_del(pos); + break; + } + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + micscif_teardown_ep(aep); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + micscif_add_epd_to_zombie_list(aep, MI_EPLOCK_HELD); + ep->acceptcnt--; + } + + spin_lock(&ep->lock); + spin_unlock(&ms_info.mi_eplock); + + // Remove and reject any pending connection requests. + while (ep->conreqcnt) { + conreq = list_first_entry(&ep->conlist, struct conreq, list); + list_del(&conreq->list); + + msg.uop = SCIF_CNCT_REJ; + msg.dst.node = conreq->msg.src.node; + msg.dst.port = conreq->msg.src.port; + msg.payload[0] = conreq->msg.payload[0]; + msg.payload[1] = conreq->msg.payload[1]; + /* + * No Error Handling on purpose for micscif_nodeqp_send(). + * If the remote node is lost we still want free the connection + * requests on the self node. 
+ */ + micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, ep); + + ep->conreqcnt--; + kfree(conreq); + } + + // If a kSCIF accept is waiting wake it up + wake_up_interruptible(&ep->conwq); + spin_unlock_irqrestore(&ep->lock, sflags); + break; + } + } + if (ep->port.port && !ep->accepted_ep) + put_scif_port(ep->port.port); + micscif_dec_node_refcnt(ep->remote_dev, 1); + micscif_teardown_ep(ep); + micscif_add_epd_to_zombie_list(ep, !MI_EPLOCK_HELD); + return 0; +} + +void +scif_ref_rel(struct kref *kref_count) +{ + struct endpt *epd; + epd = container_of(kref_count, struct endpt, ref_count); + __scif_close((scif_epd_t)epd); +} + +int +scif_close(scif_epd_t epd) +{ + __scif_flush(epd); + put_kref_count(epd); + return 0; +} +EXPORT_SYMBOL(scif_close); + +/** + * scif_flush() - Flush the endpoint + * @epd: The end point address returned from scif_open() + * + */ +int +__scif_flush(scif_epd_t epd) +{ + struct endpt *ep = (struct endpt *)epd; + struct endpt *tmpep; + struct list_head *pos, *tmpq; + unsigned long sflags; + int err; + + might_sleep(); + + micscif_inc_node_refcnt(ep->remote_dev, 1); + + spin_lock_irqsave(&ep->lock, sflags); + + switch (ep->state) { + case SCIFEP_CONNECTED: + { + struct nodemsg msg; + struct endpt *fep = NULL; + + init_waitqueue_head(&ep->disconwq); // Wait for connection queue + WARN_ON(ep->files); // files should never be set while connected + spin_unlock_irqrestore(&ep->lock, sflags); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + + list_for_each_safe(pos, tmpq, &ms_info.mi_connected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + put_conn_count(ep->remote_dev); + fep = tmpep; + spin_lock(&ep->lock); + break; + } + } + + if (fep == NULL) { + // The other side has completed the disconnect before + // the end point can be removed from the list. Therefore + // the ep lock is not locked, traverse the disconnected list + // to find the endpoint, release the conn lock. + list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) { + tmpep = list_entry(pos, struct endpt, list); + if (tmpep == ep) { + list_del(pos); + break; + } + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + break; + } + + spin_unlock(&ms_info.mi_connlock); + + msg.uop = SCIF_DISCNCT; + msg.src = ep->port; + msg.dst = ep->peer; + msg.payload[0] = (uint64_t)ep; + msg.payload[1] = ep->remote_ep; + + err = micscif_nodeqp_send(ep->remote_dev, &msg, ep); + + spin_unlock_irqrestore(&ep->lock, sflags); + if (!err) + /* Now wait for the remote node to respond */ + wait_event_timeout(ep->disconwq, + (ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + spin_lock(&ep->lock); + list_add_tail(&ep->list, &ms_info.mi_disconnected); + ep->state = SCIFEP_DISCONNECTED; + spin_unlock(&ep->lock); + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + // Wake up threads blocked in send and recv + wake_up_interruptible(&ep->sendwq); + wake_up_interruptible(&ep->recvwq); + break; + } + case SCIFEP_LISTENING: + { + ep->state = SCIFEP_CLLISTEN; + + // If an accept is waiting wake it up + wake_up_interruptible(&ep->conwq); + spin_unlock_irqrestore(&ep->lock, sflags); + break; + } + default: + spin_unlock_irqrestore(&ep->lock, sflags); + break; + } + micscif_dec_node_refcnt(ep->remote_dev, 1); + return 0; +} + +/** + * scif_bind() - Bind a SCIF end point to a port ID. 
+ * @epd: The end point address returned from scif_open()
+ * @pn: Port ID (number) to bind to
+ *
+ * Set the port ID associated with the end point and place it in the bound state.
+ * If a port ID of zero is requested, a non-zero port ID is allocated for it.
+ *
+ * Upon successful completion the port ID (number) will be returned.
+ *
+ * If the end point is not in the unbound state, -EISCONN is returned.
+ *
+ * If port ID zero is specified and allocation of a port ID fails -ENOSPC
+ * will be returned.
+ */
+int
+__scif_bind(scif_epd_t epd, uint16_t pn)
+{
+	struct endpt *ep = (struct endpt *)epd;
+	unsigned long sflags;
+	int ret = 0;
+	int tmp;
+
+	pr_debug("SCIFAPI bind: ep %p %s requested port number %d\n",
+		ep, scif_ep_states[ep->state], pn);
+
+	might_sleep();
+
+	if (pn) {
+		/*
+		 * Modeled on http://www.ietf.org/rfc/rfc1700.txt?number=1700
+		 * SCIF ports below SCIF_ADMIN_PORT_END can only be bound by
+		 * system (or root) processes or by processes executed by
+		 * privileged users.
+		 */
+		if ( pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN)) {
+			ret = -EACCES;
+			goto scif_bind_admin_exit;
+		}
+	}
+
+	spin_lock_irqsave(&ep->lock, sflags);
+	if (ep->state == SCIFEP_BOUND) {
+		ret = -EINVAL;
+		goto scif_bind_exit;
+	} else if (ep->state != SCIFEP_UNBOUND) {
+		ret = -EISCONN;
+		goto scif_bind_exit;
+	}
+
+	if (pn) {
+		if ((tmp = rsrv_scif_port(pn)) != pn) {
+			ret = -EINVAL;
+			goto scif_bind_exit;
+		}
+	} else {
+		pn = get_scif_port();
+		if (!pn) {
+			ret = -ENOSPC;
+			goto scif_bind_exit;
+		}
+	}
+
+	ep->state = SCIFEP_BOUND;
+	ep->port.node = ms_info.mi_nodeid;
+	ep->port.port = pn;
+	ep->conn_async_state = ASYNC_CONN_IDLE;
+	ret = pn;
+	pr_debug("SCIFAPI bind: bound to port number %d\n", pn);
+
+scif_bind_exit:
+	spin_unlock_irqrestore(&ep->lock, sflags);
+scif_bind_admin_exit:
+	return ret;
+}
+
+int
+scif_bind(scif_epd_t epd, uint16_t pn)
+{
+	int ret;
+	get_kref_count(epd);
+	ret = __scif_bind(epd, pn);
+	put_kref_count(epd);
+	return ret;
+}
+EXPORT_SYMBOL(scif_bind);
+
+/**
+ * scif_listen() - Place the end point in the listening state
+ * @epd: The end point address returned from scif_open()
+ * @backlog: Maximum number of pending connection requests.
+ *
+ * The end point is placed in the listening state ready to accept connection
+ * requests. The backlog parameter is saved to indicate the maximum number of
+ * pending connection requests from remote nodes to queue. The end point is
+ * placed on a list of listening end points to allow a connection request to
+ * find it.
+ *
+ * Upon successful completion a zero is returned.
+ *
+ * If the end point is not in the bound state -EINVAL or -EISCONN is returned.
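+ *
+ * As an illustrative kernel-mode sketch only (port 2000 and backlog 5 are
+ * arbitrary values; error handling elided), a listening endpoint is
+ * typically set up as:
+ * \code
+ *	scif_epd_t lep = scif_open();
+ *	scif_epd_t nep;
+ *	struct scif_portID peer;
+ *	int ret, err;
+ *
+ *	ret = scif_bind(lep, 2000);	// returns the bound port number
+ *	err = scif_listen(lep, 5);
+ *	err = scif_accept(lep, &peer, &nep, SCIF_ACCEPT_SYNC);
+ *	// ... scif_send()/scif_recv() on nep ...
+ *	err = scif_close(nep);
+ *	err = scif_close(lep);
+ * \endcode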
+ * + */ +int +__scif_listen(scif_epd_t epd, int backlog) +{ + struct endpt *ep = (struct endpt *)epd; + unsigned long sflags; + + pr_debug("SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]); + + might_sleep(); + spin_lock_irqsave(&ep->lock, sflags); + switch (ep->state) { + case SCIFEP_ZOMBIE: + BUG_ON(SCIFEP_ZOMBIE == ep->state); + case SCIFEP_CLOSED: + case SCIFEP_CLOSING: + case SCIFEP_CLLISTEN: + case SCIFEP_UNBOUND: + case SCIFEP_DISCONNECTED: + spin_unlock_irqrestore(&ep->lock, sflags); + return -EINVAL; + case SCIFEP_LISTENING: + case SCIFEP_CONNECTED: + case SCIFEP_CONNECTING: + case SCIFEP_MAPPING: + spin_unlock_irqrestore(&ep->lock, sflags); + return -EISCONN; + case SCIFEP_BOUND: + break; + } + + ep->state = SCIFEP_LISTENING; + ep->backlog = backlog; + + ep->conreqcnt = 0; + ep->acceptcnt = 0; + INIT_LIST_HEAD(&ep->conlist); // List of connection requests + init_waitqueue_head(&ep->conwq); // Wait for connection queue + INIT_LIST_HEAD(&ep->li_accept); // User ep list for ACCEPTREG calls + spin_unlock_irqrestore(&ep->lock, sflags); + + // Listen status is complete so delete the qp information not needed + // on a listen before placing on the list of listening ep's + micscif_teardown_ep((void *)ep); + ep->qp_info.qp = NULL; + + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_add_tail(&ep->list, &ms_info.mi_listen); + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + return 0; +} + +int +scif_listen(scif_epd_t epd, int backlog) +{ + int ret; + get_kref_count(epd); + ret = __scif_listen(epd, backlog); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_listen); + +#ifdef _MIC_SCIF_ +/* + * scif_p2p_connect: + * @node: destination node id + * + * Try to setup a p2p connection between the current + * node and the desitination node. We need host to + * setup the initial p2p connections. So we send + * this message to the host which acts like proxy + * in setting up p2p connection. 
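+ *
+ * Summarizing the flow implemented below (as an aid to the reader): a
+ * SCIF_NODE_CONNECT message naming the destination node is sent to the
+ * host node queue pair, after which this node waits on sd_p2p_wq, bounded
+ * by NODE_ALIVE_TIMEOUT, for the host to move the peer either to
+ * SCIFDEV_RUNNING (P2P link usable) or to SCIFDEV_NOTPRESENT.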
+ */ +static int scif_p2p_connect(int node) +{ + struct micscif_dev *remote_dev = &scif_dev[node]; + struct nodemsg msg; + int err; + + pr_debug("%s:%d SCIF_NODE_CONNECT to host\n", __func__, __LINE__); + micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1); + + msg.dst.node = SCIF_HOST_NODE; + msg.payload[0] = node; + msg.uop = SCIF_NODE_CONNECT; + + if ((err = micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], + &msg, NULL))) { + printk(KERN_ERR "%s:%d error while sending SCIF_NODE_CONNECT to" + " node %d\n", __func__, __LINE__, node); + micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1); + goto error; + } + + wait_event_interruptible_timeout(remote_dev->sd_p2p_wq, + (remote_dev->sd_state == SCIFDEV_RUNNING) || + (remote_dev->sd_state == SCIFDEV_NOTPRESENT), NODE_ALIVE_TIMEOUT); + + pr_debug("%s:%d SCIF_NODE_CONNECT state:%d\n", __func__, __LINE__, + remote_dev->sd_state); + micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1); +error: + return err; +} +#endif + +static int scif_conn_func(struct endpt *ep) +{ + int err = 0; + struct nodemsg msg; + unsigned long sflags; + int term_sent = 0; + + if ((err = micscif_reserve_dma_chan(ep))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + ep->state = SCIFEP_BOUND; + goto connect_error_simple; + } + // Initiate the first part of the endpoint QP setup + err = micscif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset, + ENDPT_QP_SIZE, ep->remote_dev); + if (err) { + printk(KERN_ERR "%s err %d qp_offset 0x%llx\n", + __func__, err, ep->qp_info.qp_offset); + ep->state = SCIFEP_BOUND; + goto connect_error_simple; + } + + micscif_inc_node_refcnt(ep->remote_dev, 1); + + // Format connect message and send it + msg.src = ep->port; + msg.dst = ep->conn_port; + msg.uop = SCIF_CNCT_REQ; + msg.payload[0] = (uint64_t)ep; + msg.payload[1] = ep->qp_info.qp_offset; + if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + goto connect_error_simple; + } + // Wait for request to be processed. 
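+	/*
+	 * The interruptible wait below can end three ways: the peer acts on
+	 * the request (state leaves SCIFEP_CONNECTING), the wait times out,
+	 * or a signal arrives. In the last two cases a SCIF_CNCT_TERM is
+	 * sent once to cancel the request, a grant that raced in while
+	 * terminating is answered with SCIF_CNCT_GNTNACK, and the endpoint
+	 * is returned to the SCIFEP_BOUND state.
+	 */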
+ while ((err = wait_event_interruptible_timeout(ep->conwq, + (ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT)) <= 0) { + if (!err) + err = -ENODEV; + + pr_debug("SCIFAPI connect: ep %p ^C detected\n", ep); + // interrupted out of the wait + if (!term_sent++) { + int bak_err = err; + msg.uop = SCIF_CNCT_TERM; + if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) { +retry: + err = wait_event_timeout(ep->diswq, + (ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + } + if (ep->state == SCIFEP_MAPPING) { + micscif_setup_qp_connect_response(ep->remote_dev, + ep->qp_info.qp, ep->qp_info.cnct_gnt_payload); + // Send grant nack + msg.uop = SCIF_CNCT_GNTNACK; + msg.payload[0] = ep->remote_ep; + /* No error handling for Notification messages */ + micscif_nodeqp_send(ep->remote_dev, &msg, ep); + } + // Ensure after that even after a timeout the state of the end point is bound + ep->state = SCIFEP_BOUND; + if (bak_err) + err = bak_err; + break; + } + } + + if (err > 0) + err = 0; + + if (term_sent || err) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + goto connect_error_simple; + } + + if (ep->state == SCIFEP_MAPPING) { + err = micscif_setup_qp_connect_response(ep->remote_dev, + ep->qp_info.qp, ep->qp_info.cnct_gnt_payload); + + // If the resource to map the queue are not available then we need + // to tell the other side to terminate the accept + if (err) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + + // Send grant nack + msg.uop = SCIF_CNCT_GNTNACK; + msg.payload[0] = ep->remote_ep; + /* No error handling for Notification messages */ + micscif_nodeqp_send(ep->remote_dev, &msg, ep); + + ep->state = SCIFEP_BOUND; + micscif_dec_node_refcnt(ep->remote_dev, 1); + goto connect_error_simple; + } + + // Send a grant ack to inform the accept we are done mapping its resources. 
+		msg.uop = SCIF_CNCT_GNTACK;
+		msg.payload[0] = ep->remote_ep;
+		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
+			ep->state = SCIFEP_CONNECTED;
+			spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+			list_add_tail(&ep->list, &ms_info.mi_connected);
+			get_conn_count(ep->remote_dev);
+			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+			pr_debug("SCIFAPI connect: ep %p connected\n", ep);
+		} else
+			ep->state = SCIFEP_BOUND;
+		micscif_dec_node_refcnt(ep->remote_dev, 1);
+		goto connect_error_simple;
+
+	} else if (ep->state == SCIFEP_BOUND) {
+		pr_debug("SCIFAPI connect: ep %p connection refused\n", ep);
+		err = -ECONNREFUSED;
+		micscif_dec_node_refcnt(ep->remote_dev, 1);
+		goto connect_error_simple;
+
+	} else {
+		pr_debug("SCIFAPI connect: ep %p connection interrupted\n", ep);
+		err = -EINTR;
+		micscif_dec_node_refcnt(ep->remote_dev, 1);
+		goto connect_error_simple;
+	}
+	micscif_dec_node_refcnt(ep->remote_dev, 1);
+connect_error_simple:
+	return err;
+}
+
+/*
+ * micscif_conn_handler:
+ *
+ * Workqueue handler for servicing non-blocking SCIF connect
+ *
+ */
+void micscif_conn_handler(struct work_struct *work)
+{
+	struct endpt *ep;
+
+	do {
+		ep = NULL;
+		spin_lock(&ms_info.mi_nb_connect_lock);
+		if (!list_empty(&ms_info.mi_nb_connect_list)) {
+			ep = list_first_entry(&ms_info.mi_nb_connect_list,
+					struct endpt, conn_list);
+			list_del(&ep->conn_list);
+		}
+		spin_unlock(&ms_info.mi_nb_connect_lock);
+		if (ep) {
+			ep->conn_err = scif_conn_func(ep);
+			wake_up_interruptible(&ep->conn_pend_wq);
+		}
+	} while (ep);
+}
+
+/**
+ * scif_connect() - Request a connection to a remote node
+ * @epd: The end point address returned from scif_open()
+ * @dst: Remote node address information
+ *
+ * The function requests a scif connection to the remote node
+ * identified by the dst parameter. "dst" contains the remote node and
+ * port ids.
+ *
+ * Upon successful completion zero will be returned.
+ *
+ * If the end point is not in the bound state -EINVAL will be returned.
+ *
+ * If resource allocation fails during the connection sequence, -ENOMEM
+ * will be returned.
+ *
+ * If the remote side is not responding to connection requests the caller may
+ * terminate this function with a signal. If so, -EINTR will be returned.
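+ *
+ * As an illustrative kernel-mode sketch only (node 1 and port 2000 are
+ * arbitrary values; error handling elided), a connecting endpoint is
+ * typically set up as:
+ * \code
+ *	scif_epd_t epd = scif_open();
+ *	struct scif_portID dst = { .node = 1, .port = 2000 };
+ *	int ret, err;
+ *
+ *	ret = scif_bind(epd, 0);	// let SCIF pick a local port
+ *	err = scif_connect(epd, &dst);
+ *	// ... scif_send()/scif_recv() on epd ...
+ *	err = scif_close(epd);
+ * \endcode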
+ */ +int +__scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block) +{ + struct endpt *ep = (struct endpt *)epd; + unsigned long sflags; + int err = 0; +#ifdef _MIC_SCIF_ + struct micscif_dev *remote_dev; +#endif + + pr_debug("SCIFAPI connect: ep %p %s\n", ep, + scif_ep_states[ep->state]); + + if (dst->node > MAX_BOARD_SUPPORTED) + return -ENODEV; + + might_sleep(); + +#ifdef _MIC_SCIF_ + remote_dev = &scif_dev[dst->node]; + if ((SCIFDEV_INIT == remote_dev->sd_state || + SCIFDEV_STOPPED == remote_dev->sd_state) && mic_p2p_enable) + if ((err = scif_p2p_connect(dst->node))) + return err; +#endif + + if (SCIFDEV_RUNNING != scif_dev[dst->node].sd_state && + SCIFDEV_SLEEPING != scif_dev[dst->node].sd_state) + return -ENODEV; + + spin_lock_irqsave(&ep->lock, sflags); + switch (ep->state) { + case SCIFEP_ZOMBIE: + BUG_ON(SCIFEP_ZOMBIE == ep->state); + + case SCIFEP_CLOSED: + case SCIFEP_CLOSING: + err = -EINVAL; + break; + + case SCIFEP_DISCONNECTED: + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + ep->conn_async_state = ASYNC_CONN_FLUSH_WORK; + else + err = -EINVAL; + break; + + case SCIFEP_LISTENING: + case SCIFEP_CLLISTEN: + err = -EOPNOTSUPP; + break; + + case SCIFEP_CONNECTING: + case SCIFEP_MAPPING: + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + err = -EINPROGRESS; + else + err = -EISCONN; + break; + + case SCIFEP_CONNECTED: + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + ep->conn_async_state = ASYNC_CONN_FLUSH_WORK; + else + err = -EISCONN; + break; + + case SCIFEP_UNBOUND: + if ((ep->port.port = get_scif_port()) == 0) + err = -ENOSPC; + else { + ep->port.node = ms_info.mi_nodeid; + ep->conn_async_state = ASYNC_CONN_IDLE; + } + /* Fall through */ + case SCIFEP_BOUND: + /* + * If a non-blocking connect has been already initiated (conn_async_state + * is either ASYNC_CONN_INPROGRESS or ASYNC_CONN_FLUSH_WORK), the end point + * could end up in SCIF_BOUND due an error in the connection + * process (e.g., connnection refused) + * If conn_async_state is ASYNC_CONN_INPROGRESS - transition to + * ASYNC_CONN_FLUSH_WORK so that the error status can be collected. + * If the state is already ASYNC_CONN_FLUSH_WORK - then set the error + * to EINPROGRESS since some other thread is waiting to collect error status. 
+ */ + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + ep->conn_async_state = ASYNC_CONN_FLUSH_WORK; + else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) + err = -EINPROGRESS; + else { + ep->conn_port = *dst; + init_waitqueue_head(&ep->sendwq); + init_waitqueue_head(&ep->recvwq); + init_waitqueue_head(&ep->conwq); + init_waitqueue_head(&ep->diswq); + ep->conn_async_state = 0; + + if (unlikely(non_block)) + ep->conn_async_state = ASYNC_CONN_INPROGRESS; + } + break; + } + + if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) + goto connect_simple_unlock1; + + ep->state = SCIFEP_CONNECTING; + ep->remote_dev = &scif_dev[dst->node]; + ep->sd_state = SCIFDEV_RUNNING; + ep->qp_info.qp->magic = SCIFEP_MAGIC; + ep->qp_info.qp->ep = (uint64_t)ep; + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + init_waitqueue_head(&ep->conn_pend_wq); + spin_lock(&ms_info.mi_nb_connect_lock); + list_add_tail(&ep->conn_list, + &ms_info.mi_nb_connect_list); + spin_unlock(&ms_info.mi_nb_connect_lock); + err = -EINPROGRESS; + queue_work(ms_info.mi_conn_wq, &ms_info.mi_conn_work); + } +connect_simple_unlock1: + spin_unlock_irqrestore(&ep->lock, sflags); + + if (err) + return err; + else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) { + flush_workqueue(ms_info.mi_conn_wq); + err = ep->conn_err; + spin_lock_irqsave(&ep->lock, sflags); + ep->conn_async_state = ASYNC_CONN_IDLE; + spin_unlock_irqrestore(&ep->lock, sflags); + } else { + err = scif_conn_func(ep); + } + return err; +} + +int +scif_connect(scif_epd_t epd, struct scif_portID *dst) +{ + int ret; + get_kref_count(epd); + ret = __scif_connect(epd, dst, false); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_connect); + +/** + * scif_accept() - Accept a connection request from the remote node + * @epd: The end point address returned from scif_open() + * @peer: Filled in with pear node and port information + * @newepd: New end point created for connection + * @flags: Indicates sychronous or asynchronous mode + * + * The function accepts a connection request from the remote node. Successful + * complete is indicate by a new end point being created and passed back + * to the caller for future reference. + * + * Upon successful complete a zero will be returned and the peer information + * will be filled in. + * + * If the end point is not in the listening state -EINVAL will be returned. + * + * If during the connection sequence resource allocation fails the -ENOMEM + * will be returned. + * + * If the function is called asynchronously and not connection request are + * pending it will return -EAGAIN. + * + * If the remote side is not sending any connection requests the caller may + * terminate this funciton with a signal. If so a -EINTR will be returned. 
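+ *
+ * Illustrative usage on the listening side (editor's sketch, not part of the
+ * original commit; the port number and backlog are arbitrary and error
+ * handling is abbreviated):
+ *
+ *     struct scif_portID peer;
+ *     scif_epd_t lep = scif_open(), nep;
+ *
+ *     scif_bind(lep, 2000);
+ *     scif_listen(lep, 10);
+ *     if (scif_accept(lep, &peer, &nep, SCIF_ACCEPT_SYNC) == 0)
+ *             pr_debug("accepted node %d port %d\n", peer.node, peer.port);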
+ */ +int +__scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags) +{ + struct endpt *lep = (struct endpt *)epd; + struct endpt *cep; + struct conreq *conreq; + struct nodemsg msg; + unsigned long sflags; + int err; + + pr_debug("SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]); + + // Error if flags other than SCIF_ACCEPT_SYNC are set + if (flags & ~SCIF_ACCEPT_SYNC) { + pr_debug("SCIFAPI accept: ep %p invalid flags %x\n", lep, flags & ~SCIF_ACCEPT_SYNC); + return -EINVAL; + } + + if (!peer || !newepd) { + pr_debug("SCIFAPI accept: ep %p peer %p or newepd %p NULL\n", + lep, peer, newepd); + return -EINVAL; + } + + might_sleep(); + spin_lock_irqsave(&lep->lock, sflags); + if (lep->state != SCIFEP_LISTENING) { + pr_debug("SCIFAPI accept: ep %p not listending\n", lep); + spin_unlock_irqrestore(&lep->lock, sflags); + return -EINVAL; + } + + if (!lep->conreqcnt && !(flags & SCIF_ACCEPT_SYNC)) { + // No connection request present and we do not want to wait + pr_debug("SCIFAPI accept: ep %p async request with nothing pending\n", lep); + spin_unlock_irqrestore(&lep->lock, sflags); + return -EAGAIN; + } + +retry_connection: + spin_unlock_irqrestore(&lep->lock, sflags); + lep->files = current ? current->files : NULL; + if ((err = wait_event_interruptible(lep->conwq, + (lep->conreqcnt || (lep->state != SCIFEP_LISTENING)))) != 0) { + // wait was interrupted + pr_debug("SCIFAPI accept: ep %p ^C detected\n", lep); + return err; // -ERESTARTSYS + } + + if (lep->state != SCIFEP_LISTENING) { + return -EINTR; + } + + spin_lock_irqsave(&lep->lock, sflags); + + if (!lep->conreqcnt) { + goto retry_connection; + } + + // Get the first connect request off the list + conreq = list_first_entry(&lep->conlist, struct conreq, list); + list_del(&conreq->list); + lep->conreqcnt--; + spin_unlock_irqrestore(&lep->lock, sflags); + + // Fill in the peer information + peer->node = conreq->msg.src.node; + peer->port = conreq->msg.src.port; + + // Create the connection endpoint + cep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL); + if (!cep) { + pr_debug("SCIFAPI accept: ep %p new end point allocation failed\n", lep); + err = -ENOMEM; + goto scif_accept_error_epalloc; + } + spin_lock_init(&cep->lock); + mutex_init (&cep->sendlock); + mutex_init (&cep->recvlock); + cep->state = SCIFEP_CONNECTING; + cep->remote_dev = &scif_dev[peer->node]; + cep->remote_ep = conreq->msg.payload[0]; + cep->sd_state = SCIFDEV_RUNNING; + + if (!scifdev_alive(cep)) { + err = -ENODEV; + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto scif_accept_error_qpalloc; + } + + if (micscif_rma_ep_init(cep) < 0) { + pr_debug("SCIFAPI accept: ep %p new %p RMA EP init failed\n", lep, cep); + err = -ENOMEM; + goto scif_accept_error_qpalloc; + } + + if ((err = micscif_reserve_dma_chan(cep))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto scif_accept_error_qpalloc; + } + + cep->qp_info.qp = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL); + if (!cep->qp_info.qp) { + printk(KERN_ERR "Port Qp Allocation Failed\n"); + err = -ENOMEM; + goto scif_accept_error_qpalloc; + } + + cep->qp_info.qp->magic = SCIFEP_MAGIC; + cep->qp_info.qp->ep = (uint64_t)cep; + micscif_inc_node_refcnt(cep->remote_dev, 1); + err = micscif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset, + conreq->msg.payload[1], ENDPT_QP_SIZE, cep->remote_dev); + if (err) { + pr_debug("SCIFAPI accept: ep %p new %p micscif_setup_qp_accept %d qp_offset 0x%llx\n", + lep, cep, err, 
cep->qp_info.qp_offset); + micscif_dec_node_refcnt(cep->remote_dev, 1); + goto scif_accept_error_map; + } + + cep->port.node = lep->port.node; + cep->port.port = lep->port.port; + cep->peer.node = peer->node; + cep->peer.port = peer->port; + cep->accepted_ep = true; + init_waitqueue_head(&cep->sendwq); // Wait for data to be consumed + init_waitqueue_head(&cep->recvwq); // Wait for data to be produced + init_waitqueue_head(&cep->conwq); // Wait for connection request + + // Return the grant message + msg.uop = SCIF_CNCT_GNT; + msg.src = cep->port; + msg.payload[0] = cep->remote_ep; + msg.payload[1] = cep->qp_info.qp_offset; + msg.payload[2] = (uint64_t)cep; + + err = micscif_nodeqp_send(cep->remote_dev, &msg, cep); + + micscif_dec_node_refcnt(cep->remote_dev, 1); + if (err) + goto scif_accept_error_map; +retry: + err = wait_event_timeout(cep->conwq, + (cep->state != SCIFEP_CONNECTING), NODE_ACCEPT_TIMEOUT); + if (!err && scifdev_alive(cep)) + goto retry; + + if (!err) { + err = -ENODEV; + goto scif_accept_error_map; + } + + if (err > 0) + err = 0; + + kfree(conreq); + + spin_lock_irqsave(&cep->lock, sflags); + + if (cep->state == SCIFEP_CONNECTED) { + // Connect sequence complete return new endpoint information + *newepd = (scif_epd_t)cep; + spin_unlock_irqrestore(&cep->lock, sflags); + pr_debug("SCIFAPI accept: ep %p new %p returning new epnd point\n", lep, cep); + return 0; + } + + if (cep->state == SCIFEP_CLOSING) { + // Remote failed to allocate resources and NAKed the grant. + // There is at this point nothing referencing the new end point. + spin_unlock_irqrestore(&cep->lock, sflags); + micscif_teardown_ep((void *)cep); + kfree(cep); + + // If call with sync flag then go back and wait. + if (flags & SCIF_ACCEPT_SYNC) { + spin_lock_irqsave(&lep->lock, sflags); + goto retry_connection; + } + + pr_debug("SCIFAPI accept: ep %p new %p remote failed to allocate resources\n", lep, cep); + return -EAGAIN; + } + + // While connect was in progress the other side closed and sent a disconnect + // so set the end point status to closed but return anyway. This will allow + // the caller to drain anything the other side may have put in the message queue. + *newepd = (scif_epd_t)cep; + spin_unlock_irqrestore(&cep->lock, sflags); + return 0; + + // Error allocating or mapping resources +scif_accept_error_map: + kfree(cep->qp_info.qp); + +scif_accept_error_qpalloc: + kfree(cep); + +scif_accept_error_epalloc: + micscif_inc_node_refcnt(&scif_dev[conreq->msg.src.node], 1); + // New reject the connection request due to lack of resources + msg.uop = SCIF_CNCT_REJ; + msg.dst.node = conreq->msg.src.node; + msg.dst.port = conreq->msg.src.port; + msg.payload[0] = conreq->msg.payload[0]; + msg.payload[1] = conreq->msg.payload[1]; + /* No error handling for Notification messages */ + micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, NULL); + micscif_dec_node_refcnt(&scif_dev[conreq->msg.src.node], 1); + + kfree(conreq); + return err; +} + +int +scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags) +{ + int ret; + get_kref_count(epd); + ret = __scif_accept(epd, peer, newepd, flags); + if (ret == 0) { + kref_init(&((*newepd)->ref_count)); + } + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_accept); + +/* + * scif_msg_param_check: + * @epd: The end point address returned from scif_open() + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * + * Validate parameters for messaging APIs scif_send(..)/scif_recv(..). 
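+ *
+ * Editor's note (not part of the original commit): the check accepts a flags
+ * value of 0 or SCIF_RECV_BLOCK and rejects a negative len with -EINVAL. The
+ * same helper is reused on the send path, which appears to rely on
+ * SCIF_SEND_BLOCK and SCIF_RECV_BLOCK being defined to the same value in
+ * scif.h.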
+ */ +static inline int +scif_msg_param_check(scif_epd_t epd, int len, int flags) +{ + int ret = -EINVAL; + + if (len < 0) + goto err_ret; + + if (flags && (!(flags & SCIF_RECV_BLOCK))) + goto err_ret; + + ret = 0; + +err_ret: + return ret; +} + +#define SCIF_BLAST (1 << 1) /* Use bit 1 of flags field */ + +#ifdef SCIF_BLAST +/* + * Added a temporary implementation of the exception path. + * The cost to the normal path is 1 local variable (set once and + * tested once) plus 2 tests for the 'blast' flag. + * This only apply to the card side kernel API. + */ +#ifndef _MIC_SCIF_ +#undef SCIF_BLAST +#endif +#endif + +/** + * _scif_send() - Send data to connection queue + * @epd: The end point address returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * + * This function sends a packet of data to the queue * created by the + * connection establishment sequence. It returns when the packet has + * been completely sent. + * + * Successful completion returns the number of bytes sent. + * + * If the end point is not in the connect state returns -ENOTCONN; + * + * This function may be interrupted by a signal and will return -EINTR. + */ +int +_scif_send(scif_epd_t epd, void *msg, int len, int flags) +{ + struct endpt *ep = (struct endpt *)epd; + struct nodemsg notif_msg; + unsigned long sflags; + size_t curr_xfer_len = 0; + size_t sent_len = 0; + size_t write_count; + int ret; +#ifdef SCIF_BLAST + int tl; +#endif + + if (flags & SCIF_SEND_BLOCK) + might_sleep(); + +#ifdef SCIF_BLAST + if (flags & SCIF_BLAST) { + /* + * Do a decent try to acquire lock (~100 uSec) + */ + for (ret = tl = 0; ret < 100 && !tl; ret++) { + tl = spin_trylock_irqsave(&ep->lock, sflags); + cpu_relax(); + } + } else { + tl = 1; + spin_lock_irqsave(&ep->lock, sflags); + } +#else + spin_lock_irqsave(&ep->lock, sflags); +#endif + + while (sent_len != len) { + if (ep->state == SCIFEP_DISCONNECTED) { + ret = (int)(sent_len ? sent_len : -ECONNRESET); + goto unlock_dec_return; + } + if (ep->state != SCIFEP_CONNECTED) { + ret = (int)(sent_len ? sent_len : -ENOTCONN); + goto unlock_dec_return; + } + if (!scifdev_alive(ep)) { + ret = (int) (sent_len ? sent_len : -ENODEV); + goto unlock_dec_return; + } + write_count = micscif_rb_space(&ep->qp_info.qp->outbound_q); + if (write_count) { + /* + * Best effort to send as much data as there + * is space in the RB particularly important for the + * Non Blocking case. + */ + curr_xfer_len = min(len - sent_len, write_count); + ret = micscif_rb_write(&ep->qp_info.qp->outbound_q, msg, + (uint32_t)curr_xfer_len); + if (ret < 0) { + ret = -EFAULT; + goto unlock_dec_return; + } + if (ret) { + spin_unlock_irqrestore(&ep->lock, sflags); + /* + * If there is space in the RB and we have the + * EP lock held then writing to the RB should + * succeed. Releasing spin lock before asserting + * to avoid deadlocking the system. + */ + BUG_ON(ret); + } + /* + * Success. Update write pointer. + */ + micscif_rb_commit(&ep->qp_info.qp->outbound_q); +#ifdef SCIF_BLAST + if (flags & SCIF_BLAST) { + /* + * Bypass-path; set flag int the host side node_qp + * and ring the doorbell. Host will wake-up all + * listeners, such that the message will be seen. + * Need micscif_send_host_intr() to be non-static. + */ + extern int micscif_send_host_intr(struct micscif_dev *, uint32_t); + ep->remote_dev->qpairs->remote_qp->blast = 1; + smp_wmb(); /* Sufficient or need sfence? 
*/ + micscif_send_host_intr(ep->remote_dev, 0); + } else { + /* + * Normal path: send notification on the + * node_qp ring buffer and ring the doorbell. + */ + notif_msg.src = ep->port; + notif_msg.uop = SCIF_CLIENT_SENT; + notif_msg.payload[0] = ep->remote_ep; + if ((ret = micscif_nodeqp_send(ep->remote_dev, ¬if_msg, ep))) { + ret = sent_len ? sent_len : ret; + goto unlock_dec_return; + } + } +#else + /* + * Send a notification to the peer about the + * produced data message. + */ + notif_msg.src = ep->port; + notif_msg.uop = SCIF_CLIENT_SENT; + notif_msg.payload[0] = ep->remote_ep; + if ((ret = micscif_nodeqp_send(ep->remote_dev, ¬if_msg, ep))) { + ret = (int)(sent_len ? sent_len : ret); + goto unlock_dec_return; + } +#endif + sent_len += curr_xfer_len; + msg = (char *)msg + curr_xfer_len; + continue; + } + curr_xfer_len = min(len - sent_len, (size_t)(ENDPT_QP_SIZE - 1)); + /* + * Not enough space in the RB. Return in the Non Blocking case. + */ + if (!(flags & SCIF_SEND_BLOCK)) { + ret = (int)sent_len; + goto unlock_dec_return; + } +#ifdef SCIF_BLAST + /* + * Flags SCIF_BLAST and SCIF_SEND_BLOCK are mutually + * exclusive, so if we get here we know that SCIF_BLAST + * was not set and thus we _do_ have the spinlock. + * No need to check variable tl here + */ +#endif + spin_unlock_irqrestore(&ep->lock, sflags); + /* + * Wait for a message now in the Blocking case. + */ + if ((ret = wait_event_interruptible(ep->sendwq, + (SCIFEP_CONNECTED != ep->state) || + (micscif_rb_space(&ep->qp_info.qp->outbound_q) + >= curr_xfer_len) || (!scifdev_alive(ep))))) { + ret = (int) (sent_len ? sent_len : ret); + goto dec_return; + } + spin_lock_irqsave(&ep->lock, sflags); + } + ret = len; +unlock_dec_return: +#ifdef SCIF_BLAST + if (tl) +#endif + spin_unlock_irqrestore(&ep->lock, sflags); +dec_return: + return ret; +} + +/** + * _scif_recv() - Recieve data from connection queue + * @epd: The end point address returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * @touser: package send to user buffer or kernel + * + * This function requests to receive a packet of data from the queue + * created by the connection establishment sequence. It reads the amount + * of data requested before returning. + * + * This function differs from the scif_send() by also returning data if the + * end point is in the disconnected state and data is present. + * + * Successful completion returns the number of bytes read. + * + * If the end point is not in the connect state or in the disconnected state + * with data prosent it returns -ENOTCONN; + * + * This function may be interrupted by a signal and will return -EINTR. + */ +int +_scif_recv(scif_epd_t epd, void *msg, int len, int flags) +{ + int read_size; + struct endpt *ep = (struct endpt *)epd; + unsigned long sflags; + struct nodemsg notif_msg; + size_t curr_recv_len = 0; + size_t remaining_len = len; + size_t read_count; + int ret; + + if (flags & SCIF_RECV_BLOCK) + might_sleep(); + + micscif_inc_node_refcnt(ep->remote_dev, 1); + spin_lock_irqsave(&ep->lock, sflags); + while (remaining_len) { + if (ep->state != SCIFEP_CONNECTED && + ep->state != SCIFEP_DISCONNECTED) { + ret = (int) (len - remaining_len) ? 
+ (int) (len - remaining_len) : -ENOTCONN; + goto unlock_dec_return; + } + read_count = micscif_rb_count(&ep->qp_info.qp->inbound_q, + (int) remaining_len); + if (read_count) { + /* + * Best effort to recv as much data as there + * are bytes to read in the RB particularly + * important for the Non Blocking case. + */ + curr_recv_len = min(remaining_len, read_count); + read_size = micscif_rb_get_next( + &ep->qp_info.qp->inbound_q, + msg, (int) curr_recv_len); + if (read_size < 0){ + /* only could happen when copy to USER buffer + */ + ret = -EFAULT; + goto unlock_dec_return; + } + if (read_size != curr_recv_len) { + spin_unlock_irqrestore(&ep->lock, sflags); + /* + * If there are bytes to be read from the RB and + * we have the EP lock held then reading from + * RB should succeed. Releasing spin lock before + * asserting to avoid deadlocking the system. + */ + BUG_ON(read_size != curr_recv_len); + } + if (ep->state == SCIFEP_CONNECTED) { + /* + * Update the read pointer only if the endpoint is + * still connected else the read pointer might no + * longer exist since the peer has freed resources! + */ + micscif_rb_update_read_ptr(&ep->qp_info.qp->inbound_q); + /* + * Send a notification to the peer about the + * consumed data message only if the EP is in + * SCIFEP_CONNECTED state. + */ + notif_msg.src = ep->port; + notif_msg.uop = SCIF_CLIENT_RCVD; + notif_msg.payload[0] = ep->remote_ep; + if ((ret = micscif_nodeqp_send(ep->remote_dev, ¬if_msg, ep))) { + ret = (len - (int)remaining_len) ? + (len - (int)remaining_len) : ret; + goto unlock_dec_return; + } + } + remaining_len -= curr_recv_len; + msg = (char *)msg + curr_recv_len; + continue; + } + curr_recv_len = min(remaining_len, (size_t)(ENDPT_QP_SIZE - 1)); + /* + * Bail out now if the EP is in SCIFEP_DISCONNECTED state else + * we will keep looping forever. + */ + if (ep->state == SCIFEP_DISCONNECTED) { + ret = (len - (int)remaining_len) ? + (len - (int)remaining_len) : -ECONNRESET; + goto unlock_dec_return; + } + /* + * Return in the Non Blocking case if there is no data + * to read in this iteration. + */ + if (!(flags & SCIF_RECV_BLOCK)) { + ret = len - (int)remaining_len; + goto unlock_dec_return; + } + spin_unlock_irqrestore(&ep->lock, sflags); + micscif_dec_node_refcnt(ep->remote_dev, 1); + /* + * Wait for a message now in the Blocking case. + * or until other side disconnects. + */ + if ((ret = wait_event_interruptible(ep->recvwq, + (SCIFEP_CONNECTED != ep->state) || + (micscif_rb_count(&ep->qp_info.qp->inbound_q, + curr_recv_len) >= curr_recv_len) || (!scifdev_alive(ep))))) { + ret = (len - remaining_len) ? + (len - (int)remaining_len) : ret; + goto dec_return; + } + micscif_inc_node_refcnt(ep->remote_dev, 1); + spin_lock_irqsave(&ep->lock, sflags); + } + ret = len; +unlock_dec_return: + spin_unlock_irqrestore(&ep->lock, sflags); + micscif_dec_node_refcnt(ep->remote_dev, 1); +dec_return: + return ret; +} + + +/** + * scif_user_send() - Send data to connection queue + * @epd: The end point address returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * + * This function is called from the driver IOCTL entry point + * only and is a wrapper for _scif_send(). 
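+ *
+ * Editor's note (not part of the original commit): user data is staged
+ * through a kmalloc'ed bounce buffer of at most
+ * 1 << (MAX_ORDER + PAGE_SHIFT - 1) bytes per chunk; with the common values
+ * MAX_ORDER = 11 and PAGE_SHIFT = 12 that is 1 << 22 bytes, so larger
+ * transfers are copied and sent in 4 MiB pieces.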
+ */ +int +scif_user_send(scif_epd_t epd, void *msg, int len, int flags) +{ + struct endpt *ep = (struct endpt *)epd; + int err = 0; + int sent_len = 0; + char *tmp; + int loop_len; + int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));; + pr_debug("SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]); + + if (!len) + return 0; + + if ((err = scif_msg_param_check(epd, len, flags))) + goto send_err; + + if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) { + err = -ENOMEM; + goto send_err; + } + err = 0; + micscif_inc_node_refcnt(ep->remote_dev, 1); + /* + * Grabbing the lock before breaking up the transfer in + * multiple chunks is required to ensure that messages do + * not get fragmented and reordered. + */ + mutex_lock(&ep->sendlock); + + while (sent_len != len) { + msg = (void *)((char *)msg + err); + loop_len = len - sent_len; + loop_len = min(chunk_len, loop_len); + if (copy_from_user(tmp, msg, loop_len)) { + err = -EFAULT; + goto send_free_err; + } + err = _scif_send(epd, (void *)tmp, loop_len, flags); + if (err < 0) { + goto send_free_err; + } + sent_len += err; + if (err !=loop_len) { + goto send_free_err; + } + } +send_free_err: + mutex_unlock(&ep->sendlock); + micscif_dec_node_refcnt(ep->remote_dev, 1); + kfree(tmp); +send_err: + return err < 0 ? err : sent_len; +} + +/** + * scif_user_recv() - Recieve data from connection queue + * @epd: The end point address returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * + * This function is called from the driver IOCTL entry point + * only and is a wrapper for _scif_recv(). + */ +int +scif_user_recv(scif_epd_t epd, void *msg, int len, int flags) +{ + struct endpt *ep = (struct endpt *)epd; + int err = 0; + int recv_len = 0; + char *tmp; + int loop_len; + int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));; + pr_debug("SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]); + + if (!len) + return 0; + + if ((err = scif_msg_param_check(epd, len, flags))) + goto recv_err; + + if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) { + err = -ENOMEM; + goto recv_err; + } + err = 0; + /* + * Grabbing the lock before breaking up the transfer in + * multiple chunks is required to ensure that messages do + * not get fragmented and reordered. + */ + mutex_lock(&ep->recvlock); + + while (recv_len != len) { + msg = (void *)((char *)msg + err); + loop_len = len - recv_len; + loop_len = min(chunk_len, loop_len); + if ((err = _scif_recv(epd, tmp, loop_len, flags)) < 0) + goto recv_free_err; + if (copy_to_user(msg, tmp, err)) { + err = -EFAULT; + goto recv_free_err; + } + recv_len += err; + if (err !=loop_len) { + goto recv_free_err; + } + } +recv_free_err: + mutex_unlock(&ep->recvlock); + kfree(tmp); +recv_err: + return err < 0 ? err : recv_len; +} + +#ifdef SCIF_BLAST +/* + * Added a temporary implementation of the exception path. + * The cost to the normal path testing of 2 flag bits instead + * of just one and a change to condition for node-wakeup. + */ +#endif + +/** + * scif_send() - Send data to connection queue + * @epd: The end point address returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * + * This function is called from the kernel mode only and is + * a wrapper for _scif_send(). 
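+ *
+ * Illustrative usage (editor's sketch, not part of the original commit;
+ * assumes epd is an already connected endpoint and error handling is
+ * abbreviated):
+ *
+ *     char buf[64] = "hello";
+ *     int n;
+ *
+ *     n = scif_send(epd, buf, sizeof(buf), SCIF_SEND_BLOCK);
+ *     if (n == sizeof(buf))
+ *             n = scif_recv(epd, buf, sizeof(buf), SCIF_RECV_BLOCK);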
+ */ +int +__scif_send(scif_epd_t epd, void *msg, int len, int flags) +{ + struct endpt *ep = (struct endpt *)epd; + int ret; + + pr_debug("SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]); + if (!len) + return 0; + +#ifdef SCIF_BLAST + /* + * KAA: this is same code as scif_msg_param_check(), + * but since that routine is shared with scif_recv + * I thought is safer to replicate code here. + */ + if (len < 0) + return -EINVAL; + + if (flags && !(flags & (SCIF_SEND_BLOCK | SCIF_BLAST))) + return -EINVAL; + + if ((flags & (SCIF_SEND_BLOCK | SCIF_BLAST)) == + (SCIF_SEND_BLOCK | SCIF_BLAST)) + return -EINVAL; +#else + if ((ret = scif_msg_param_check(epd, len, flags))) + return ret; +#endif + /* + * Cannot block while waiting for node to wake up + * if non blocking messaging mode is requested. Return + * ENODEV if the remote node is idle. + */ + if (!(flags & SCIF_SEND_BLOCK) && ep->remote_dev && + SCIF_NODE_IDLE == atomic_long_read( + &ep->remote_dev->scif_ref_cnt)) + return -ENODEV; + + micscif_inc_node_refcnt(ep->remote_dev, 1); + + /* + * Grab the mutex lock in the blocking case only + * to ensure messages do not get fragmented/reordered. + * The non blocking mode is protected using spin locks + * in _scif_send(). + */ + if (flags & SCIF_SEND_BLOCK) + mutex_lock(&ep->sendlock); + + ret = _scif_send(epd, msg, len, flags); + + if (flags & SCIF_SEND_BLOCK) + mutex_unlock(&ep->sendlock); + + micscif_dec_node_refcnt(ep->remote_dev, 1); + return ret; +} + +int +scif_send(scif_epd_t epd, void *msg, int len, int flags) +{ + int ret; + get_kref_count(epd); + ret = __scif_send(epd, msg, len, flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_send); + +/** + * scif_recv() - Recieve data from connection queue + * @epd: The end point address returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: Syncronous or asynchronous access + * + * This function is called from the kernel mode only and is + * a wrapper for _scif_recv(). + */ +int +__scif_recv(scif_epd_t epd, void *msg, int len, int flags) +{ + struct endpt *ep = (struct endpt *)epd; + int ret; + + pr_debug("SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]); + + if (!len) + return 0; + + if ((ret = scif_msg_param_check(epd, len, flags))) + return ret; + + /* + * Cannot block while waiting for node to wake up + * if non blocking messaging mode is requested. Return + * ENODEV if the remote node is idle. + */ + if (!flags && ep->remote_dev && + SCIF_NODE_IDLE == atomic_long_read( + &ep->remote_dev->scif_ref_cnt)) + return -ENODEV; + + /* + * Grab the mutex lock in the blocking case only + * to ensure messages do not get fragmented/reordered. + * The non blocking mode is protected using spin locks + * in _scif_send(). + */ + if (flags & SCIF_RECV_BLOCK) + mutex_lock(&ep->recvlock); + + ret = _scif_recv(epd, msg, len, flags); + + if (flags & SCIF_RECV_BLOCK) + mutex_unlock(&ep->recvlock); + + return ret; +} + +int +scif_recv(scif_epd_t epd, void *msg, int len, int flags) +{ + int ret; + get_kref_count(epd); + ret = __scif_recv(epd, msg, len, flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_recv); + +/** + * __scif_pin_pages - __scif_pin_pages() pins the physical pages which back + * the range of virtual address pages starting at addr and continuing for + * len bytes. addr and len are constrained to be multiples of the page size. 
+ * A successful scif_register() call returns an opaque pointer value + * which may be used in subsequent calls to scif_register_pinned_pages(). + * + * Return Values + * Upon successful completion, __scif_pin_pages() returns a + * scif_pinned_pages_t value else an apt error is returned as documented + * in scif.h. Protections of the set of pinned pages are also returned by + * reference via out_prot. + */ +int +__scif_pin_pages(void *addr, size_t len, int *out_prot, + int map_flags, scif_pinned_pages_t *pages) +{ + struct scif_pinned_pages *pinned_pages; + int nr_pages, err = 0, i; + bool vmalloc_addr = false; + bool try_upgrade = false; + int prot = *out_prot; + int ulimit = 0; + struct mm_struct *mm = NULL; + + /* Unsupported flags */ + if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT)) + return -EINVAL; + ulimit = !!(map_flags & SCIF_MAP_ULIMIT); + + /* Unsupported protection requested */ + if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) + return -EINVAL; + + /* addr/len must be page aligned. len should be non zero */ + if ((!len) || + (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) || + (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len)) + return -EINVAL; + + might_sleep(); + + nr_pages = (int)(len >> PAGE_SHIFT); + + /* Allocate a set of pinned pages */ + if (!(pinned_pages = micscif_create_pinned_pages(nr_pages, prot))) + return -ENOMEM; + + if (unlikely(map_flags & SCIF_MAP_KERNEL)) { + if (is_vmalloc_addr(addr)) + vmalloc_addr = true; + + for (i = 0; i < nr_pages; i++) { + if (unlikely(vmalloc_addr)) + pinned_pages->pages[i] = + vmalloc_to_page((char *)addr + (i * PAGE_SIZE) ); + else + pinned_pages->pages[i] = + virt_to_page((char *)addr + (i * PAGE_SIZE) ); + pinned_pages->num_pages[i] = 1; + pinned_pages->nr_contig_chunks++; + } + pinned_pages->nr_pages = nr_pages; + pinned_pages->map_flags = SCIF_MAP_KERNEL; + } else { + if (prot == SCIF_PROT_READ) + try_upgrade = true; + prot |= SCIF_PROT_WRITE; +retry: + mm = current->mm; + down_write(&mm->mmap_sem); + if (ulimit) { + err = __scif_check_inc_pinned_vm(mm, nr_pages); + if (err) { + up_write(&mm->mmap_sem); + pinned_pages->nr_pages = 0; + goto error_unmap; + } + } + + pinned_pages->nr_pages = get_user_pages( + current, + mm, + (uint64_t)addr, + nr_pages, + !!(prot & SCIF_PROT_WRITE), + 0, + pinned_pages->pages, + pinned_pages->vma); + up_write(&mm->mmap_sem); + if (nr_pages == pinned_pages->nr_pages) { +#ifdef RMA_DEBUG + atomic_long_add_return(nr_pages, &ms_info.rma_pin_cnt); +#endif + micscif_detect_large_page(pinned_pages, addr); + } else { + if (try_upgrade) { + if (ulimit) + __scif_dec_pinned_vm_lock(mm, nr_pages, 0); +#ifdef RMA_DEBUG + WARN_ON(atomic_long_sub_return(1, + &ms_info.rma_mm_cnt) < 0); +#endif + /* Roll back any pinned pages */ + for (i = 0; i < pinned_pages->nr_pages; i++) { + if (pinned_pages->pages[i]) + page_cache_release(pinned_pages->pages[i]); + } + prot &= ~SCIF_PROT_WRITE; + try_upgrade = false; + goto retry; + } + } + pinned_pages->map_flags = 0; + } + + if (pinned_pages->nr_pages < nr_pages) { + err = -EFAULT; + pinned_pages->nr_pages = nr_pages; + goto dec_pinned; + } + + *out_prot = prot; + atomic_set(&pinned_pages->ref_count, nr_pages); + *pages = pinned_pages; + return err; +dec_pinned: + if (ulimit) + __scif_dec_pinned_vm_lock(mm, nr_pages, 0); + /* Something went wrong! 
Rollback */ +error_unmap: + pinned_pages->nr_pages = nr_pages; + micscif_destroy_pinned_pages(pinned_pages); + *pages = NULL; + pr_debug("%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len); + return err; + +} + +/** + * scif_pin_pages - scif_pin_pages() pins the physical pages which back + * the range of virtual address pages starting at addr and continuing for + * len bytes. addr and len are constrained to be multiples of the page size. + * A successful scif_register() call returns an opaque pointer value + * which may be used in subsequent calls to scif_register_pinned_pages(). + * + * Return Values + * Upon successful completion, scif_register() returns a + * scif_pinned_pages_t value else an apt error is returned as documented + * in scif.h + */ +int +scif_pin_pages(void *addr, size_t len, int prot, + int map_flags, scif_pinned_pages_t *pages) +{ + return __scif_pin_pages(addr, len, &prot, map_flags, pages); +} +EXPORT_SYMBOL(scif_pin_pages); + +/** + * scif_unpin_pages: Unpin a set of pages + * + * Return Values: + * Upon successful completion, scif_unpin_pages() returns 0; + * else an apt error is returned as documented in scif.h + */ +int +scif_unpin_pages(scif_pinned_pages_t pinned_pages) +{ + int err = 0, ret; + + if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic) + return -EINVAL; + + ret = atomic_sub_return((int32_t)pinned_pages->nr_pages, + &pinned_pages->ref_count); + BUG_ON(ret < 0); + + /* + * Destroy the window if the ref count for this set of pinned + * pages has dropped to zero. If it is positive then there is + * a valid registered window which is backed by these pages and + * it will be destroyed once all such windows are unregistered. + */ + if (!ret) + err = micscif_destroy_pinned_pages(pinned_pages); + + return err; +} +EXPORT_SYMBOL(scif_unpin_pages); + +/** + * scif_register_pinned_pages: Mark a memory region for remote access. + * + * The scif_register_pinned_pages() function opens a window, a range + * of whole pages of the registered address space of the endpoint epd, + * starting at offset po. The value of po, further described below, is + * a function of the parameters offset and pinned_pages, and the value + * of map_flags. Each page of the window represents a corresponding + * physical memory page of pinned_pages; the length of the window is + * the same as the length of pinned_pages. A successful scif_register() + * call returns po as the return value. + * + * Return Values + * Upon successful completion, scif_register_pinned_pages() returns + * the offset at which the mapping was placed (po); + * else an apt error is returned as documented in scif.h + */ +off_t +__scif_register_pinned_pages(scif_epd_t epd, + scif_pinned_pages_t pinned_pages, off_t offset, int map_flags) +{ + struct endpt *ep = (struct endpt *)epd; + uint64_t computed_offset; + struct reg_range_t *window; + int err; + size_t len; + +#ifdef DEBUG + /* Bad EP */ + if (!ep || !pinned_pages || pinned_pages->magic != SCIFEP_MAGIC) + return -EINVAL; +#endif + /* Unsupported flags */ + if (map_flags & ~SCIF_MAP_FIXED) + return -EINVAL; + + len = pinned_pages->nr_pages << PAGE_SHIFT; + + /* + * Offset is not page aligned/negative or offset+len + * wraps around with SCIF_MAP_FIXED. 
+ */ + if ((map_flags & SCIF_MAP_FIXED) && + ((align_low(offset, PAGE_SIZE) != offset) || + (offset < 0) || + (offset + (off_t)len < offset))) + return -EINVAL; + + might_sleep(); + + if ((err = verify_epd(ep))) + return err; + + /* Compute the offset for this registration */ + if ((err = micscif_get_window_offset(ep, map_flags, offset, + len, &computed_offset))) + return err; + + /* Allocate and prepare self registration window */ + if (!(window = micscif_create_window(ep, pinned_pages->nr_pages, + computed_offset, false))) { + micscif_free_window_offset(ep, computed_offset, len); + return -ENOMEM; + } + + window->pinned_pages = pinned_pages; + window->nr_pages = pinned_pages->nr_pages; + window->nr_contig_chunks = pinned_pages->nr_contig_chunks; + window->prot = pinned_pages->prot; + + /* + * This set of pinned pages now belongs to this window as well. + * Assert if the ref count is zero since it is an error to + * pass pinned_pages to scif_register_pinned_pages() after + * calling scif_unpin_pages(). + */ + if (!atomic_add_unless(&pinned_pages->ref_count, + (int32_t)pinned_pages->nr_pages, 0)) + BUG_ON(1); + + micscif_inc_node_refcnt(ep->remote_dev, 1); + + if ((err = micscif_send_alloc_request(ep, window))) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Prepare the remote registration window */ + if ((err = micscif_prep_remote_window(ep, window))) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + micscif_set_nr_pages(ep->remote_dev, window); + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Tell the peer about the new window */ + if ((err = micscif_send_scif_register(ep, window))) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + micscif_dec_node_refcnt(ep->remote_dev, 1); + + /* No further failures expected. Insert new window */ + mutex_lock(&ep->rma_info.rma_lock); + set_window_ref_count(window, pinned_pages->nr_pages); + micscif_insert_window(window, &ep->rma_info.reg_list); + mutex_unlock(&ep->rma_info.rma_lock); + + return computed_offset; +error_unmap: + micscif_destroy_window(ep, window); + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + return err; +} + +off_t +scif_register_pinned_pages(scif_epd_t epd, + scif_pinned_pages_t pinned_pages, off_t offset, int map_flags) +{ + off_t ret; + get_kref_count(epd); + ret = __scif_register_pinned_pages(epd, pinned_pages, offset, map_flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_register_pinned_pages); + +/** + * scif_get_pages - Add references to remote registered pages + * + * scif_get_pages() returns the addresses of the physical pages represented + * by those pages of the registered address space of the peer of epd, starting + * at offset offset and continuing for len bytes. offset and len are constrained + * to be multiples of the page size. + * + * Return Values + * Upon successful completion, scif_get_pages() returns 0; + * else an apt error is returned as documented in scif.h. 
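+ *
+ * Illustrative usage (editor's sketch, not part of the original commit;
+ * assumes the peer has registered a window of at least one page at offset 0
+ * and error handling is abbreviated):
+ *
+ *     struct scif_range *range;
+ *
+ *     if (!scif_get_pages(epd, 0, PAGE_SIZE, &range)) {
+ *             pr_debug("first page pa %#llx prot 0x%x\n",
+ *                     (unsigned long long)range->phys_addr[0],
+ *                     range->prot_flags);
+ *             scif_put_pages(range);
+ *     }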
+ */ +int +__scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages) +{ + struct endpt *ep = (struct endpt *)epd; + struct micscif_rma_req req; + struct reg_range_t *window = NULL; + int nr_pages, err, i; + + pr_debug("SCIFAPI get_pinned_pages: ep %p %s offset 0x%lx len 0x%lx\n", + ep, scif_ep_states[ep->state], offset, len); + + if ((err = verify_epd(ep))) + return err; + + if ((!len) || + (offset < 0) || + (offset + len < offset) || + (align_low((uint64_t)offset, PAGE_SIZE) != (uint64_t)offset) || + (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len)) + return -EINVAL; + + nr_pages = len >> PAGE_SHIFT; + + req.out_window = &window; + req.offset = offset; + req.prot = 0; + req.nr_bytes = len; + req.type = WINDOW_SINGLE; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + if ((err = micscif_query_window(&req))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + RMA_MAGIC(window); + + /* Allocate scif_range */ + if (!(*pages = kzalloc(sizeof(struct scif_range), GFP_KERNEL))) { + err = -ENOMEM; + goto error; + } + + /* Allocate phys addr array */ + if (!((*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)))) { + err = -ENOMEM; + goto error; + } + +#ifndef _MIC_SCIF_ + /* Allocate virtual address array */ + if (!((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)))) { + err = -ENOMEM; + goto error; + } +#endif + /* Populate the values */ + (*pages)->cookie = window; + (*pages)->nr_pages = nr_pages; + (*pages)->prot_flags = window->prot; + + for (i = 0; i < nr_pages; i++) { + (*pages)->phys_addr[i] = +#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + is_self_scifdev(ep->remote_dev) ? + micscif_get_dma_addr(window, offset + (i * PAGE_SIZE), + NULL, NULL, NULL) : window->phys_addr[i]; +#else + get_phys_addr(micscif_get_dma_addr(window, offset + (i * PAGE_SIZE), + NULL, NULL, NULL), ep->remote_dev); +#endif +#ifndef _MIC_SCIF_ + if (!is_self_scifdev(ep->remote_dev)) + (*pages)->va[i] = + get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.va + + (*pages)->phys_addr[i] - + get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.pa; +#endif + } + + window->get_put_ref_count += nr_pages; + get_window_ref_count(window, nr_pages); +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (err) { + if (*pages) { + if ((*pages)->phys_addr) + scif_free((*pages)->phys_addr, nr_pages * sizeof(dma_addr_t)); +#ifndef _MIC_SCIF_ + if ((*pages)->va) + scif_free((*pages)->va, nr_pages * sizeof(void *)); +#endif + kfree(*pages); + *pages = NULL; + } + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + } else { + micscif_create_node_dep(ep->remote_dev, nr_pages); + } + return err; +} + +int +scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages) +{ + int ret; + get_kref_count(epd); + ret = __scif_get_pages(epd, offset, len, pages); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_get_pages); + +/** + * scif_put_pages - Remove references from remote registered pages + * + * scif_put_pages() returns a scif_range structure previously obtained by + * calling scif_get_pages(). When control returns, the physical pages may + * become available for reuse if and when the window which represented + * those pages is unregistered. Therefore, those pages must never be accessed. + * + * Return Values + * Upon success, zero is returned. + * else an apt error is returned as documented in scif.h. 
+ */ +int +__scif_put_pages(struct scif_range *pages) +{ + struct endpt *ep; + struct reg_range_t *window; + struct nodemsg msg; + + if (!pages || !pages->cookie) + return -EINVAL; + + window = pages->cookie; + + if (!window || window->magic != SCIFEP_MAGIC || + !window->get_put_ref_count) + return -EINVAL; + + ep = (struct endpt *)window->ep; + + /* + * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the + * callee should be allowed to release references to the pages, + * else the endpoint was not connected in the first place, + * hence the ENOTCONN. + */ + if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED) + return -ENOTCONN; + + /* + * TODO: Re-enable this check once ref counts for kernel mode APIs + * have been implemented and node remove call backs are called before + * the node is removed. This check results in kernel mode APIs not + * being able to release pages correctly since node remove callbacks + * are called after the node is removed currently. + * if (!scifdev_alive(ep)) + * return -ENODEV; + */ + + micscif_inc_node_refcnt(ep->remote_dev, 1); + mutex_lock(&ep->rma_info.rma_lock); + + /* Decrement the ref counts and check for errors */ + window->get_put_ref_count -= pages->nr_pages; + BUG_ON(window->get_put_ref_count < 0); + put_window_ref_count(window, pages->nr_pages); + + /* Initiate window destruction if ref count is zero */ + if (!window->ref_count) { + drain_dma_intr(ep->rma_info.dma_chan); + /* Inform the peer about this window being destroyed. */ + msg.uop = SCIF_MUNMAP; + msg.src = ep->port; + msg.payload[0] = window->peer_window; + /* No error handling for notification messages */ + micscif_nodeqp_send(ep->remote_dev, &msg, ep); + list_del(&window->list_member); + /* Destroy this window from the peer's registered AS */ + micscif_destroy_remote_window(ep, window); + } + mutex_unlock(&ep->rma_info.rma_lock); + + micscif_dec_node_refcnt(ep->remote_dev, 1); + micscif_destroy_node_dep(ep->remote_dev, pages->nr_pages); + scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t)); +#ifndef _MIC_SCIF_ + scif_free(pages->va, pages->nr_pages * sizeof(void*)); +#endif + kfree(pages); + return 0; +} + +int +scif_put_pages(struct scif_range *pages) +{ + int ret; + struct reg_range_t *window = pages->cookie; + struct endpt *ep = (struct endpt *)window->ep; + if (atomic_read(&(&(ep->ref_count))->refcount) > 0) { + kref_get(&(ep->ref_count)); + } else { + WARN_ON(1); + } + ret = __scif_put_pages(pages); + if (atomic_read(&(&(ep->ref_count))->refcount) > 0) { + kref_put(&(ep->ref_count), scif_ref_rel); + } else { + //WARN_ON(1); + } + return ret; +} +EXPORT_SYMBOL(scif_put_pages); + +int scif_event_register(scif_callback_t handler) +{ + /* Add to the list of event handlers */ + struct scif_callback *cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return -ENOMEM; + mutex_lock(&ms_info.mi_event_cblock); + cb->callback_handler = handler; + list_add_tail(&cb->list_member, &ms_info.mi_event_cb); + mutex_unlock(&ms_info.mi_event_cblock); + return 0; +} +EXPORT_SYMBOL(scif_event_register); + +int scif_event_unregister(scif_callback_t handler) +{ + struct list_head *pos, *unused; + struct scif_callback *temp; + int err = -EINVAL; + + mutex_lock(&ms_info.mi_event_cblock); + list_for_each_safe(pos, unused, &ms_info.mi_event_cb) { + temp = list_entry(pos, struct scif_callback, list_member); + if (temp->callback_handler == handler) { + err = 0; + list_del(pos); + kfree(temp); + break; + } + } + + mutex_unlock(&ms_info.mi_event_cblock); + return err; +} 
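+
+/*
+ * Editor's note (not part of the original commit): a kernel client would
+ * typically pair the two calls above as
+ *
+ *     scif_event_register(my_handler);
+ *     ...
+ *     scif_event_unregister(my_handler);
+ *
+ * where my_handler is a hypothetical callback matching the scif_callback_t
+ * prototype declared in scif.h.
+ */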
+EXPORT_SYMBOL(scif_event_unregister); + +/** + * scif_register - Mark a memory region for remote access. + * @epd: endpoint descriptor + * @addr: starting virtual address + * @len: length of range + * @offset: offset of window + * @prot: read/write protection + * @map_flags: flags + * + * Return Values + * Upon successful completion, scif_register() returns the offset + * at which the mapping was placed else an apt error is returned + * as documented in scif.h. + */ +off_t +__scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, + int prot, int map_flags) +{ + scif_pinned_pages_t pinned_pages; + off_t err; + struct endpt *ep = (struct endpt *)epd; + uint64_t computed_offset; + struct reg_range_t *window; + struct mm_struct *mm = NULL; + + pr_debug("SCIFAPI register: ep %p %s addr %p len 0x%lx" + " offset 0x%lx prot 0x%x map_flags 0x%x\n", + epd, scif_ep_states[epd->state], addr, len, offset, prot, map_flags); + + /* Unsupported flags */ + if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL)) + return -EINVAL; + + /* Unsupported protection requested */ + if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) + return -EINVAL; + + /* addr/len must be page aligned. len should be non zero */ + if ((!len) || + (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) || + (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len)) + return -EINVAL; + + /* + * Offset is not page aligned/negative or offset+len + * wraps around with SCIF_MAP_FIXED. + */ + if ((map_flags & SCIF_MAP_FIXED) && + ((align_low(offset, PAGE_SIZE) != offset) || + (offset < 0) || + (offset + (off_t)len < offset))) + return -EINVAL; + + + might_sleep(); + +#ifdef DEBUG + /* Bad EP */ + if (!ep) + return -EINVAL; +#endif + + if ((err = verify_epd(ep))) + return err; + + /* Compute the offset for this registration */ + if ((err = micscif_get_window_offset(ep, map_flags, offset, + len, &computed_offset))) + return err; + + /* Allocate and prepare self registration window */ + if (!(window = micscif_create_window(ep, len >> PAGE_SHIFT, + computed_offset, false))) { + micscif_free_window_offset(ep, computed_offset, len); + return -ENOMEM; + } + + micscif_inc_node_refcnt(ep->remote_dev, 1); + + window->nr_pages = len >> PAGE_SHIFT; + + if ((err = micscif_send_alloc_request(ep, window))) { + micscif_destroy_incomplete_window(ep, window); + micscif_dec_node_refcnt(ep->remote_dev, 1); + return err; + } + + if (!(map_flags & SCIF_MAP_KERNEL)) { + mm = __scif_acquire_mm(); + map_flags |= SCIF_MAP_ULIMIT; + } + /* Pin down the pages */ + if ((err = scif_pin_pages(addr, len, prot, + map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT), + &pinned_pages))) { + micscif_destroy_incomplete_window(ep, window); + micscif_dec_node_refcnt(ep->remote_dev, 1); + __scif_release_mm(mm); + goto error; + } + + window->pinned_pages = pinned_pages; + window->nr_contig_chunks = pinned_pages->nr_contig_chunks; + window->prot = pinned_pages->prot; + window->mm = mm; + + /* Prepare the remote registration window */ + if ((err = micscif_prep_remote_window(ep, window))) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + micscif_set_nr_pages(ep->remote_dev, window); + printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Tell the peer about the new window */ + if ((err = micscif_send_scif_register(ep, window))) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err); + goto error_unmap; + } + + micscif_dec_node_refcnt(ep->remote_dev, 1); + + /* No further failures expected. 
Insert new window */ + mutex_lock(&ep->rma_info.rma_lock); + set_window_ref_count(window, pinned_pages->nr_pages); + micscif_insert_window(window, &ep->rma_info.reg_list); + mutex_unlock(&ep->rma_info.rma_lock); + + pr_debug("SCIFAPI register: ep %p %s addr %p" + " len 0x%lx computed_offset 0x%llx\n", + epd, scif_ep_states[epd->state], addr, len, computed_offset); + return computed_offset; +error_unmap: + micscif_destroy_window(ep, window); +error: + printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err); + return err; +} + +off_t +scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, + int prot, int map_flags) +{ + off_t ret; + get_kref_count(epd); + ret = __scif_register(epd, addr, len, offset, prot, map_flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_register); + +/** + * scif_unregister - Release a memory region registered for remote access. + * @epd: endpoint descriptor + * @offset: start of range to unregister + * @len: length of range to unregister + * + * Return Values + * Upon successful completion, scif_unegister() returns zero + * else an apt error is returned as documented in scif.h. + */ +int +__scif_unregister(scif_epd_t epd, off_t offset, size_t len) +{ + struct endpt *ep = (struct endpt *)epd; + struct reg_range_t *window = NULL; + struct micscif_rma_req req; + int nr_pages, err; + + pr_debug("SCIFAPI unregister: ep %p %s offset 0x%lx len 0x%lx\n", + ep, scif_ep_states[ep->state], offset, len); + + /* len must be page aligned. len should be non zero */ + if ((!len) || + (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len)) + return -EINVAL; + + /* Offset is not page aligned or offset+len wraps around */ + if ((align_low(offset, PAGE_SIZE) != offset) || + (offset + (off_t)len < offset)) + return -EINVAL; + + if ((err = verify_epd(ep))) + return err; + + might_sleep(); + nr_pages = (int)(len >> PAGE_SHIFT); + + req.out_window = &window; + req.offset = offset; + req.prot = 0; + req.nr_bytes = len; + req.type = WINDOW_FULL; + req.head = &ep->rma_info.reg_list; + + micscif_inc_node_refcnt(ep->remote_dev, 1); + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? 
*/ + if ((err = micscif_query_window(&req))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + /* Unregister all the windows in this range */ + if ((err = micscif_rma_list_unregister(window, offset, nr_pages))) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); +error: + mutex_unlock(&ep->rma_info.rma_lock); + micscif_dec_node_refcnt(ep->remote_dev, 1); + return err; +} + +int +scif_unregister(scif_epd_t epd, off_t offset, size_t len) +{ + int ret; + get_kref_count(epd); + ret = __scif_unregister(epd, offset, len); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_unregister); + +unsigned int scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd) +{ + unsigned int ret; + get_kref_count(epd); + ret = __scif_pollfd(f, wait, (struct endpt *)epd); + put_kref_count(epd); + return ret; +} + +unsigned int __scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep) +{ + unsigned int mask = 0; + unsigned long sflags; + + pr_debug("SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]); + + micscif_inc_node_refcnt(ep->remote_dev, 1); + spin_lock_irqsave(&ep->lock, sflags); + + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) { +#else + if (!wait || wait->key & SCIF_POLLOUT) { +#endif + poll_wait(f, &ep->conn_pend_wq, wait); + if (ep->state == SCIFEP_CONNECTED || + ep->state == SCIFEP_DISCONNECTED || + ep->conn_err) { + mask |= SCIF_POLLOUT; + } + goto return_scif_poll; + } + } + + /* Is it OK to use wait->key?? */ + if (ep->state == SCIFEP_LISTENING) { +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + if (!wait || poll_requested_events(wait) & SCIF_POLLIN) { +#else + if (!wait || wait->key & SCIF_POLLIN) { +#endif + spin_unlock_irqrestore(&ep->lock, sflags); + poll_wait(f, &ep->conwq, wait); + spin_lock_irqsave(&ep->lock, sflags); + if (ep->conreqcnt) + mask |= SCIF_POLLIN; + } else { + mask |= SCIF_POLLERR; + } + goto return_scif_poll; + } + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + if (!wait || poll_requested_events(wait) & SCIF_POLLIN) { +#else + if (!wait || wait->key & SCIF_POLLIN) { +#endif + if (ep->state != SCIFEP_CONNECTED && + ep->state != SCIFEP_LISTENING && + ep->state != SCIFEP_DISCONNECTED) { + mask |= SCIF_POLLERR; + goto return_scif_poll; + } + + spin_unlock_irqrestore(&ep->lock, sflags); + poll_wait(f, &ep->recvwq, wait); + spin_lock_irqsave(&ep->lock, sflags); + if (micscif_rb_count(&ep->qp_info.qp->inbound_q, 1)) + mask |= SCIF_POLLIN; + } + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) { +#else + if (!wait || wait->key & SCIF_POLLOUT) { +#endif + if (ep->state != SCIFEP_CONNECTED && + ep->state != SCIFEP_LISTENING) { + mask |= SCIF_POLLERR; + goto return_scif_poll; + } + + spin_unlock_irqrestore(&ep->lock, sflags); + poll_wait(f, &ep->sendwq, wait); + spin_lock_irqsave(&ep->lock, sflags); + if (micscif_rb_space(&ep->qp_info.qp->outbound_q)) + mask |= SCIF_POLLOUT; + } + +return_scif_poll: + /* If the endpoint is in the diconnected state then return hangup instead of error */ + if (ep->state == SCIFEP_DISCONNECTED) { + mask &= ~SCIF_POLLERR; + mask |= SCIF_POLLHUP; + } + + spin_unlock_irqrestore(&ep->lock, sflags); + micscif_dec_node_refcnt(ep->remote_dev, 1); + return mask; +} + +/* + * The private data field of each VMA used to mmap a remote window + * points to an instance of struct vma_pvt + */ +struct vma_pvt { 
+ struct endpt *ep; /* End point for remote window */ + uint64_t offset; /* offset within remote window */ + bool valid_offset; /* offset is valid only if the original + * mmap request was for a single page + * else the offset within the vma is + * the correct offset + */ + struct kref ref; +}; + +static void vma_pvt_release(struct kref *ref) +{ + struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref); + kfree(vmapvt); +} + +/** + * scif_vma_open - VMA open driver callback + * @vma: VMM memory area. + * The open method is called by the kernel to allow the subsystem implementing + * the VMA to initialize the area. This method is invoked any time a new + * reference to the VMA is made (when a process forks, for example). + * The one exception happens when the VMA is first created by mmap; + * in this case, the driver's mmap method is called instead. + * This function is also invoked when an existing VMA is split by the kernel + * due to a call to munmap on a subset of the VMA resulting in two VMAs. + * The kernel invokes this function only on one of the two VMAs. + * + * Return Values: None. + */ +static void scif_vma_open(struct vm_area_struct *vma) +{ + struct vma_pvt *vmapvt = ((vma)->vm_private_data); + pr_debug("SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n", + ((vma)->vm_start), ((vma)->vm_end)); + kref_get(&vmapvt->ref); +} + +/** + * scif_munmap - VMA close driver callback. + * @vma: VMM memory area. + * When an area is destroyed, the kernel calls its close operation. + * Note that there's no usage count associated with VMA's; the area + * is opened and closed exactly once by each process that uses it. + * + * Return Values: None. + */ +void scif_munmap(struct vm_area_struct *vma) +{ + struct endpt *ep; + struct vma_pvt *vmapvt = ((vma)->vm_private_data); + int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT ); + uint64_t offset; + struct micscif_rma_req req; + struct reg_range_t *window = NULL; + int err; + + might_sleep(); + pr_debug("SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n", + ((vma)->vm_start), ((vma)->vm_end)); + /* used to be a BUG_ON(), prefer keeping the kernel alive */ + if (!vmapvt) { + WARN_ON(1); + printk(KERN_ERR "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n", + ((vma)->vm_start), ((vma)->vm_end)); + return; + } + + ep = vmapvt->ep; + offset = vmapvt->valid_offset ? vmapvt->offset : + ((vma)->vm_pgoff) << PAGE_SHIFT; + pr_debug("SCIFAPI munmap: ep %p %s nr_pages 0x%x offset 0x%llx\n", + ep, scif_ep_states[ep->state], nr_pages, offset); + + req.out_window = &window; + req.offset = offset; + req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start); + req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE); + req.type = WINDOW_PARTIAL; + req.head = &ep->rma_info.remote_reg_list; + + micscif_inc_node_refcnt(ep->remote_dev, 1); + mutex_lock(&ep->rma_info.rma_lock); + + if ((err = micscif_query_window(&req))) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + else + micscif_rma_list_munmap(window, offset, nr_pages); + + mutex_unlock(&ep->rma_info.rma_lock); + micscif_dec_node_refcnt(ep->remote_dev, 1); + + micscif_destroy_node_dep(ep->remote_dev, nr_pages); + + /* + * The kernel probably zeroes these out but we still want + * to clean up our own mess just in case. 
+ */ + vma->vm_ops = NULL; + ((vma)->vm_private_data) = NULL; + kref_put(&vmapvt->ref, vma_pvt_release); + micscif_rma_put_task(ep, nr_pages); +} + +static const struct vm_operations_struct micscif_vm_ops = { + .open = scif_vma_open, + .close = scif_munmap, +}; + +/** + * scif_mmap - Map pages in virtual address space to a remote window. + * @vma: VMM memory area. + * @epd: endpoint descriptor + * + * Return Values + * Upon successful completion, scif_mmap() returns zero + * else an apt error is returned as documented in scif.h. + */ +int +scif_mmap(struct vm_area_struct *vma, scif_epd_t epd) +{ + struct micscif_rma_req req; + struct reg_range_t *window = NULL; + struct endpt *ep = (struct endpt *)epd; + uint64_t start_offset = ((vma)->vm_pgoff) << PAGE_SHIFT; + int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT); + int err; + struct vma_pvt *vmapvt; + + pr_debug("SCIFAPI mmap: ep %p %s start_offset 0x%llx nr_pages 0x%x\n", + ep, scif_ep_states[ep->state], start_offset, nr_pages); + + if ((err = verify_epd(ep))) + return err; + + might_sleep(); + + if ((err = micscif_rma_get_task(ep, nr_pages))) + return err; + + if (!(vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL))) { + micscif_rma_put_task(ep, nr_pages); + return -ENOMEM; + } + + vmapvt->ep = ep; + kref_init(&vmapvt->ref); + + micscif_create_node_dep(ep->remote_dev, nr_pages); + + req.out_window = &window; + req.offset = start_offset; + req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start); + req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE); + req.type = WINDOW_PARTIAL; + req.head = &ep->rma_info.remote_reg_list; + + micscif_inc_node_refcnt(ep->remote_dev, 1); + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + if ((err = micscif_query_window(&req))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + RMA_MAGIC(window); + + /* Default prot for loopback */ + if (!is_self_scifdev(ep->remote_dev)) { +#ifdef _MIC_SCIF_ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); +#else + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); +#endif + } + + /* + * VM_DONTCOPY - Do not copy this vma on fork + * VM_DONTEXPAND - Cannot expand with mremap() + * VM_RESERVED - Count as reserved_vm like IO + * VM_PFNMAP - Page-ranges managed without "struct page" + * VM_IO - Memory mapped I/O or similar + * + * We do not want to copy this VMA automatically on a fork(), + * expand this VMA due to mremap() or swap out these pages since + * the VMA is actually backed by physical pages in the remote + * node's physical memory and not via a struct page. + */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP | VM_PFNMAP; +#else + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP; +#endif + + if (!is_self_scifdev(ep->remote_dev)) + ((vma)->vm_flags) |= VM_IO; + + /* Map this range of windows */ + if ((err = micscif_rma_list_mmap(window, + start_offset, nr_pages, vma))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + /* Set up the driver call back */ + vma->vm_ops = &micscif_vm_ops; + ((vma)->vm_private_data) = vmapvt; + /* + * For 1 page sized VMAs the kernel (remap_pfn_range) replaces the + * offset in the VMA with the pfn, so in that case save off the + * original offset, since the page sized VMA can't be split into + * smaller VMAs the offset is not going to change. 
+ */ + if (nr_pages == 1) { + vmapvt->offset = start_offset; + vmapvt->valid_offset = true; + } + err = 0; +error: + mutex_unlock(&ep->rma_info.rma_lock); + micscif_dec_node_refcnt(ep->remote_dev, 1); + if (err) { + micscif_destroy_node_dep(ep->remote_dev, nr_pages); + kfree(vmapvt); + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + micscif_rma_put_task(ep, nr_pages); + } + return err; +} + +/** + * scif_readfrom() - Read SCIF offset data from remote connection + * @epd: endpoint descriptor + * @loffset: offset in local registered address space to which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space from which to copy + * @flags: flags + * + * Return Values + * Upon successful completion, scif_readfrom() returns zero + * else an apt error is returned as documented in scif.h. + */ +int +scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int ret; + get_kref_count(epd); + ret = __scif_readfrom(epd, loffset, len, roffset, flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_readfrom); + +/** + * scif_writeto() - Send SCIF offset data to remote connection + * @epd: endpoint descriptor + * @loffset: offset in local registered address space from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to which to copy + * @flags: flags + * + * Return Values + * Upon successful completion, scif_writeto() returns zero + * else an apt error is returned as documented in scif.h. + * + */ +int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int ret; + get_kref_count(epd); + ret = __scif_writeto(epd, loffset, len, roffset, flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_writeto); + +#define HOST_LOOPB_MAGIC_MARK 0xdead + +/** + * scif_fence_mark: + * @epd: endpoint descriptor + * @flags: control flags + * @mark: marked handle returned as output. + * + * scif_fence_mark() returns after marking the current set of all uncompleted + * RMAs initiated through the endpoint epd or marking the current set of all + * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are + * marked with a value returned in mark. The application may subsequently + * await completion of all RMAs so marked. + * + * Return Values + * Upon successful completion, scif_fence_mark() returns 0; + * else an apt error is returned as documented in scif.h. + */ +int __scif_fence_mark(scif_epd_t epd, int flags, int *mark) +{ + struct endpt *ep = (struct endpt *)epd; + int err = 0; + + pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x\n", + ep, scif_ep_states[ep->state], flags, *mark); + + if ((err = verify_epd(ep))) + return err; + + /* Invalid flags? */ + if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* At least one of init self or peer RMA should be set */ + if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) + return -EINVAL; + + /* Exactly one of init self or peer RMA should be set but not both */ + if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) + return -EINVAL; + +#ifndef _MIC_SCIF_ + /* + * Host Loopback does not need to use DMA. + * Return a valid mark to be symmetric. 
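+ * The mark handed back is HOST_LOOPB_MAGIC_MARK, the only value __scif_fence_wait() accepts for the loopback device.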
+ */ + if (is_self_scifdev(ep->remote_dev)) { + *mark = HOST_LOOPB_MAGIC_MARK; + return 0; + } +#endif + + if (flags & SCIF_FENCE_INIT_SELF) { + if ((*mark = micscif_fence_mark(epd)) < 0) + err = *mark; + } else { + micscif_inc_node_refcnt(ep->remote_dev, 1); + err = micscif_send_fence_mark(ep, mark); + micscif_dec_node_refcnt(ep->remote_dev, 1); + } + if (err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + + pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x err %d\n", + ep, scif_ep_states[ep->state], flags, *mark, err); + return err; +} + +int scif_fence_mark(scif_epd_t epd, int flags, int *mark) +{ + int ret; + get_kref_count(epd); + ret = __scif_fence_mark(epd, flags, mark); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_fence_mark); + +/** + * scif_fence_wait: + * @epd: endpoint descriptor + * @mark: mark request. + * + * scif_fence_wait() returns after all RMAs marked with mark have completed. + * + * Return Values + * Upon successful completion, scif_fence_wait() returns 0; + * else an apt error is returned as documented in scif.h. + */ +int __scif_fence_wait(scif_epd_t epd, int mark) +{ + struct endpt *ep = (struct endpt *)epd; + int err = 0; + + pr_debug("SCIFAPI fence_wait: ep %p %s mark 0x%x\n", + ep, scif_ep_states[ep->state], mark); + + if ((err = verify_epd(ep))) + return err; + +#ifndef _MIC_SCIF_ + /* + * Host Loopback does not need to use DMA. + * The only valid mark provided is 0 so simply + * return success if the mark is valid. + */ + if (is_self_scifdev(ep->remote_dev)) { + if (HOST_LOOPB_MAGIC_MARK == mark) + return 0; + else + return -EINVAL; + } +#endif + if (mark & SCIF_REMOTE_FENCE) { + micscif_inc_node_refcnt(ep->remote_dev, 1); + err = micscif_send_fence_wait(epd, mark); + micscif_dec_node_refcnt(ep->remote_dev, 1); + } else { + err = dma_mark_wait(epd->rma_info.dma_chan, mark, true); + if (!err && atomic_read(&ep->rma_info.tw_refcount)) + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); + } + + if (err < 0) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + return err; +} + +int scif_fence_wait(scif_epd_t epd, int mark) +{ + int ret; + get_kref_count(epd); + ret = __scif_fence_wait(epd, mark); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_fence_wait); + +/* + * scif_fence_signal: + * @loff: local offset + * @lval: local value to write to loffset + * @roff: remote offset + * @rval: remote value to write to roffset + * @flags: flags + * + * scif_fence_signal() returns after marking the current set of all + * uncompleted RMAs initiated through the endpoint epd or marking + * the current set of all uncompleted RMAs initiated through the peer + * of endpoint epd. + * + * Return Values + * Upon successful completion, scif_fence_signal() returns 0; + * else an apt error is returned as documented in scif.h. + */ +int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, + off_t roff, uint64_t rval, int flags) +{ + struct endpt *ep = (struct endpt *)epd; + int err = 0; + + pr_debug("SCIFAPI fence_signal: ep %p %s loff 0x%lx lval 0x%llx " + "roff 0x%lx rval 0x%llx flags 0x%x\n", + ep, scif_ep_states[ep->state], loff, lval, roff, rval, flags); + + if ((err = verify_epd(ep))) + return err; + + /* Invalid flags? 
*/ + if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER | + SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)) + return -EINVAL; + + /* At least one of init self or peer RMA should be set */ + if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) + return -EINVAL; + + /* Exactly one of init self or peer RMA should be set but not both */ + if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */ + if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))) + return -EINVAL; + + /* Only Dword offsets allowed */ + if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(uint32_t) - 1))) + return -EINVAL; + + /* Only Dword aligned offsets allowed */ + if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(uint32_t) - 1))) + return -EINVAL; + + if (flags & SCIF_FENCE_INIT_PEER) { + micscif_inc_node_refcnt(ep->remote_dev, 1); + err = micscif_send_fence_signal(epd, roff, + rval, loff, lval, flags); + micscif_dec_node_refcnt(ep->remote_dev, 1); + } else { + /* Local Signal in Local RAS */ + if (flags & SCIF_SIGNAL_LOCAL) + if ((err = micscif_prog_signal(epd, loff, + lval, RMA_WINDOW_SELF))) + goto error_ret; + + /* Signal in Remote RAS */ + if (flags & SCIF_SIGNAL_REMOTE) { + micscif_inc_node_refcnt(ep->remote_dev, 1); + err = micscif_prog_signal(epd, roff, + rval, RMA_WINDOW_PEER); + micscif_dec_node_refcnt(ep->remote_dev, 1); + } + } +error_ret: + if (err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + else if (atomic_read(&ep->rma_info.tw_refcount)) + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); + return err; +} + +int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, + off_t roff, uint64_t rval, int flags) +{ + int ret; + get_kref_count(epd); + ret = __scif_fence_signal(epd, loff, lval, roff, rval, flags); + put_kref_count(epd); + return ret; +} +EXPORT_SYMBOL(scif_fence_signal); + +/** + * scif_get_nodeIDs - Return information about online nodes + * @nodes: array space reserved for returning online node IDs + * @len: number of entries on the nodes array + * @self: address to place the node ID of this system + * + * Return Values + * scif_get_nodeIDs() returns the total number of scif nodes + * (including host) in the system + */ +int +scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self) +{ + int online = 0; + int offset = 0; + int node; +#ifdef _MIC_SCIF_ + micscif_get_node_info(); +#endif + + *self = ms_info.mi_nodeid; + mutex_lock(&ms_info.mi_conflock); + len = SCIF_MIN(len, (int32_t)ms_info.mi_total); + for (node = 0; node <=(int32_t)ms_info.mi_maxid; node++) { + if (ms_info.mi_mask & (1UL << node)) { + online++; + if (offset < len) + nodes[offset++] = node; + } + } + pr_debug("SCIFAPI get_nodeIDs total %d online %d filled in %d nodes\n", + ms_info.mi_total, online, len); + mutex_unlock(&ms_info.mi_conflock); + + return online; +} + +EXPORT_SYMBOL(scif_get_nodeIDs); + +/** + * micscif_pci_dev: + * @node: node ID + * + * Return the pci_dev associated with a node. + */ +int micscif_pci_dev(uint16_t node, struct pci_dev **pdev) +{ +#ifdef _MIC_SCIF_ + /* This *is* a PCI device, therefore no pdev to return. */ + return -ENODEV; +#else + mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1); + *pdev = mic_ctx->bi_pdev; + return 0; +#endif +} + +#ifndef _MIC_SCIF_ +/** + * micscif_pci_info: + * @node: node ID + * + * Populate the pci device info pointer associated with a node. 
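+ * Host-side helper: fills dev->pdev and maps each PCI BAR to the card's aperture or MMIO virtual address; prefetchable BARs other than the aperture are reported as NULL.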
+ */ +int micscif_pci_info(uint16_t node, struct scif_pci_info *dev) +{ + int i; + mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1); + struct pci_dev *pdev; + + if (!mic_ctx) + return -ENODEV; + + dev->pdev = pdev = mic_ctx->bi_pdev; + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + if (!pci_resource_start(pdev, i)) { + dev->va[i] = NULL; + continue; + } + if (pci_resource_flags(pdev, i) & IORESOURCE_PREFETCH) { + /* TODO: Change comparison check for KNL. */ + if (pci_resource_start(pdev, i) == mic_ctx->aper.pa) + dev->va[i] = mic_ctx->aper.va; + else + dev->va[i] = NULL; + } else { + dev->va[i] = mic_ctx->mmio.va; + } + } + return 0; +} +#endif + +/** + * scif_pci_info - Populate the pci device info pointer associated with a node + * @node: the node to query + * @scif_pdev: The scif_pci_info structure to populate. + * + * scif_pci_info() populates the provided scif_pci_info structure + * associated with a node. The requested node ID cannot be the same as + * the current node. This routine may only return success when called from + * the host. + * + * Return Values + * Upon successful completion, scif_pci_info() returns 0; otherwise the + * an appropriate error is returned as documented in scif.h. + */ +int scif_pci_info(uint16_t node, struct scif_pci_info *dev) +{ +#ifdef _MIC_SCIF_ + return -EINVAL; +#else + if (node > ms_info.mi_maxid) + return -EINVAL; + + if ((scif_dev[node].sd_state == SCIFDEV_NOTPRESENT) || + is_self_scifdev(&scif_dev[node])) + return -ENODEV; + + return micscif_pci_info(node, dev); +#endif +} +EXPORT_SYMBOL(scif_pci_info); + +/* + * DEBUG helper functions + */ +void +print_ep_state(struct endpt *ep, char *label) +{ + if (ep) + printk("%s: EP %p state %s\n", + label, ep, scif_ep_states[ep->state]); + else + printk("%s: EP %p\n state ?\n", label, ep); +} + diff --git a/micscif/micscif_debug.c b/micscif/micscif_debug.c new file mode 100644 index 0000000..7f26f5a --- /dev/null +++ b/micscif/micscif_debug.c @@ -0,0 +1,1005 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic/micscif.h" +#ifndef _MIC_SCIF_ +#include "mic_common.h" +#endif +#include "scif.h" +#include +#include + +#include + +static char *window_type[] = { + "NONE", + "SELF", + "PEER"}; + +static char *scifdev_state[] = { + "SCIFDEV_NOTPRESENT", + "SCIFDEV_INIT", + "SCIFDEV_RUNNING", + "SCIFDEV_SLEEPING", + "SCIFDEV_STOPPING", + "SCIFDEV_STOPPED"}; + +static struct proc_dir_entry *scif_proc; +static struct dentry *mic_debug = NULL; + +#define DEBUG_LEN 10 + +static int +scif_ep_show(struct seq_file *m, void *data) +{ + struct endpt *ep; + struct list_head *pos; + unsigned long sflags; + + seq_printf(m, "EP Address State Port Peer Remote Ep Address\n"); + seq_printf(m, "=================================================================\n"); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each(pos, &ms_info.mi_listen) { + ep = list_entry(pos, struct endpt, list); + seq_printf(m, "%p %s %6d\n", + ep, scif_ep_states[ep->state], ep->port.port); + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(pos, &ms_info.mi_connected) { + ep = list_entry(pos, struct endpt, list); + seq_printf(m, "%p %s %6d %2d:%-6d %p\n", + ep, scif_ep_states[ep->state], ep->port.port, ep->peer.node, + ep->peer.port, (void *)ep->remote_ep); + } + list_for_each(pos, &ms_info.mi_disconnected) { + ep = list_entry(pos, struct endpt, list); + seq_printf(m, "%p %s %6d %2d:%-6d %p\n", + ep, scif_ep_states[ep->state], ep->port.port, ep->peer.node, + ep->peer.port, (void *)ep->remote_ep); + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + + seq_printf(m, "EP Address State Port Peer Remote Ep Address reg_list " + "remote_reg_list mmn_list tw_refcount tcw_refcount mi_rma mi_rma_tc " + "task_list mic_mmu_notif_cleanup\n"); + seq_printf(m, "=================================================================\n"); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each(pos, &ms_info.mi_zombie) { + ep = list_entry(pos, struct endpt, list); + seq_printf(m, "%p %s %6d %2d:%-6d %p %d %d %d %d %d %d %d %d %d\n", + ep, scif_ep_states[ep->state], ep->port.port, ep->peer.node, + ep->peer.port, (void *)ep->remote_ep, + list_empty(&ep->rma_info.reg_list), + list_empty(&ep->rma_info.remote_reg_list), + list_empty(&ep->rma_info.mmn_list), + atomic_read(&ep->rma_info.tw_refcount), + atomic_read(&ep->rma_info.tcw_refcount), + list_empty(&ms_info.mi_rma), + list_empty(&ms_info.mi_rma_tc), + list_empty(&ep->rma_info.task_list), +#ifdef CONFIG_MMU_NOTIFIER + list_empty(&ms_info.mi_mmu_notif_cleanup) +#else + -1 +#endif + ); + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + + return 0; +} + +static int +scif_ep_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_ep_show, NULL); +} + +struct file_operations scif_ep_fops = { + .open = scif_ep_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + +static int +scif_rma_window_show(struct seq_file *m, void *data) +{ + struct endpt *ep; + struct list_head *pos, *item, *tmp; + unsigned long sflags; + struct reg_range_t *window; + + seq_printf(m, "SCIF Connected EP RMA Window Info\n"); + seq_printf(m, 
"=================================================================\n"); + seq_printf(m, "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n", + "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State"); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(pos, &ms_info.mi_connected) { + ep = list_entry(pos, struct endpt, list); + if (mutex_trylock(&ep->rma_info.rma_lock)) { + list_for_each_safe(item, tmp, &ep->rma_info.reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + seq_printf(m, + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + seq_printf(m, + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + mutex_unlock(&ep->rma_info.rma_lock); + } else + seq_printf(m, + "Try Again, some other thread has the RMA lock for ep %p\n", + ep); + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + + seq_printf(m, "=================================================================\n"); + seq_printf(m, "SCIF Zombie EP RMA Window Info\n"); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each(pos, &ms_info.mi_zombie) { + ep = list_entry(pos, struct endpt, list); + if (mutex_trylock(&ep->rma_info.rma_lock)) { + list_for_each_safe(item, tmp, &ep->rma_info.reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + seq_printf(m, + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + seq_printf(m, + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + mutex_unlock(&ep->rma_info.rma_lock); + } else + seq_printf(m, + "Try Again, some other thread has the RMA lock for ep %p\n", + ep); + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + seq_printf(m, "=================================================================\n"); + seq_printf(m, "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n", + "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State"); + spin_lock(&ms_info.mi_rmalock); + list_for_each_safe(item, tmp, &ms_info.mi_rma) { + window = list_entry(item, + struct reg_range_t, list_member); + ep = (struct endpt *)window->ep; + seq_printf(m, "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + spin_unlock(&ms_info.mi_rmalock); + + return 0; +} + +static int +scif_rma_window_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_rma_window_show, NULL); +} + +struct file_operations scif_rma_window_fops = { + .open = scif_rma_window_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int +scif_rma_xfer_show(struct seq_file *m, void *data) +{ + struct endpt *ep; + struct list_head *pos; + unsigned long sflags; + + seq_printf(m, "SCIF RMA Debug\n"); + 
seq_printf(m, "=================================================================\n"); + seq_printf(m, "%-16s\t %-16s %-16s %-16s\n", + "Endpoint", "Fence Ref Count", "Temp Window Ref Count", "DMA CHANNEL"); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(pos, &ms_info.mi_connected) { + ep = list_entry(pos, struct endpt, list); + seq_printf(m, "%-16p\t%-16d %-16d %-16d\n", + ep, ep->rma_info.fence_refcount, + atomic_read(&ep->rma_info.tw_refcount), + ep->rma_info.dma_chan ? get_chan_num(ep->rma_info.dma_chan): -1); + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + return 0; +} + +static int +scif_rma_xfer_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_rma_xfer_show, NULL); +} + +struct file_operations scif_rma_xfer_fops = { + .open = scif_rma_xfer_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int +scif_dev_show(struct seq_file *m, void *data) +{ + int node; + + seq_printf(m, "Total Nodes %d Self Node Id %d Maxid %d\n", + ms_info.mi_total, ms_info.mi_nodeid, ms_info.mi_maxid); + + seq_printf(m, "%-16s\t%-16s %-16s\t%-16s\t%-8s\t%-8s\t%-8s\n", + "node_id", "state", "scif_ref_cnt", "scif_map_ref_cnt", + "wait_status", "conn count", "numa_node"); + + for (node = 0; node <= ms_info.mi_maxid; node++) + seq_printf(m, "%-16d\t%-16s\t0x%-16lx\t%-16d\t%-16lld\t%-16d\t%-16d\n", + scif_dev[node].sd_node, scifdev_state[scif_dev[node].sd_state], + atomic_long_read(&scif_dev[node].scif_ref_cnt), + scif_dev[node].scif_map_ref_cnt, + scif_dev[node].sd_wait_status, + scif_dev[node].num_active_conn, + scif_dev[node].sd_numa_node); + + return 0; +} + +static int +scif_dev_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_dev_show, NULL); +} + +struct file_operations scif_dev_fops = { + .open = scif_dev_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int +scif_debug_show(struct seq_file *m, void *data) +{ + seq_printf(m, "Num gtt_entries %d\n", ms_info.nr_gtt_entries); + /* + * Tracking the number of zombies for debug. + * Need to make sure they are not being left behind forever. + */ + seq_printf(m, "Num Zombie Endpoints %d\n", ms_info.mi_nr_zombies); + seq_printf(m, "Watchdog timeout %d\n", ms_info.mi_watchdog_to); + seq_printf(m, "Watchdog enabled %d\n", ms_info.mi_watchdog_enabled); + seq_printf(m, "Watchdog auto reboot %d\n", ms_info.mi_watchdog_auto_reboot); + seq_printf(m, "Huge Pages Enabled %d Detected 2mb %lld 4k %lld\n", + mic_huge_page_enable, ms_info.nr_2mb_pages, ms_info.nr_4k_pages); +#ifdef RMA_DEBUG + seq_printf(m, "rma_alloc_cnt %ld rma_pin_cnt %ld mmu_notif %ld rma_unaligned_cpu_cnt %ld\n", + atomic_long_read(&ms_info.rma_alloc_cnt), + atomic_long_read(&ms_info.rma_pin_cnt), + atomic_long_read(&ms_info.mmu_notif_cnt), + atomic_long_read(&ms_info.rma_unaligned_cpu_cnt)); +#endif + seq_printf(m, "List empty? 
mi_uaccept %d mi_listen %d mi_zombie %d " + "mi_connected %d mi_disconnected %d\n", + list_empty(&ms_info.mi_uaccept), + list_empty(&ms_info.mi_listen), + list_empty(&ms_info.mi_zombie), + list_empty(&ms_info.mi_connected), + list_empty(&ms_info.mi_disconnected)); + + return 0; +} + +static int +scif_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_debug_show, NULL); +} + +struct file_operations scif_debug_fops = { + .open = scif_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int +scif_suspend_show(struct seq_file *m, void *data) +{ + int node; + uint64_t ret; + seq_printf(m, "Removing Nodes mask 0x7\n"); + + for (node = 1; node < ms_info.mi_total; node++) { + ret = micscif_disconnect_node(node, 0 , 1); + seq_printf(m, "Node %d requested disconnect. ret = %lld\n", + node, ret); + } + + return 0; +} + +static int +scif_suspend_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_suspend_show, NULL); +} + +struct file_operations scif_suspend_fops = { + .open = scif_suspend_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int +scif_cache_limit_show(struct seq_file *m, void *data) +{ + seq_printf(m, "reg_cache_limit = 0x%lx\n", ms_info.mi_rma_tc_limit); + return 0; +} + +static int +scif_cache_limit_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_cache_limit_show, NULL); +} + +struct file_operations scif_cache_limit_fops = { + .open = scif_cache_limit_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#else // LINUX VERSION 3.10 + +static int +scif_rma_window_read(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + struct endpt *ep; + struct list_head *pos, *item, *tmp; + unsigned long sflags; + int l = 0; + struct reg_range_t *window; + + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "SCIF Connected EP RMA Window Info\n"); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "=================================================================\n"); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n", + "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State"); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(pos, &ms_info.mi_connected) { + ep = list_entry(pos, struct endpt, list); + if (mutex_trylock(&ep->rma_info.rma_lock)) { + list_for_each_safe(item, tmp, &ep->rma_info.reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + mutex_unlock(&ep->rma_info.rma_lock); + } else + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "Try Again, some other thread has the RMA lock for ep %p\n", + ep); + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + + l += snprintf(buf + l, len - l > 0 ? 
len - l : 0 , + "=================================================================\n"); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "SCIF Zombie EP RMA Window Info\n"); + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each(pos, &ms_info.mi_zombie) { + ep = list_entry(pos, struct endpt, list); + if (mutex_trylock(&ep->rma_info.rma_lock)) { + list_for_each_safe(item, tmp, &ep->rma_info.reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(item, struct reg_range_t, list_member); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + mutex_unlock(&ep->rma_info.rma_lock); + } else + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "Try Again, some other thread has the RMA lock for ep %p\n", + ep); + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "=================================================================\n"); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n", + "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State"); + spin_lock(&ms_info.mi_rmalock); + list_for_each_safe(item, tmp, &ms_info.mi_rma) { + window = list_entry(item, + struct reg_range_t, list_member); + ep = (struct endpt *)window->ep; + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n", + ep, window_type[window->type], window->offset, + window->nr_pages, window->prot, window->ref_count, + window->unreg_state); + } + spin_unlock(&ms_info.mi_rmalock); + + *eof = 1; + return l; +} + +static int +scif_rma_xfer_read(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + struct endpt *ep; + struct list_head *pos; + unsigned long sflags; + int l = 0; + + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , "SCIF RMA Debug\n"); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "=================================================================\n"); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , "%-16s\t %-16s %-16s %-16s\n", + "Endpoint", "Fence Ref Count", "Temp Window Ref Count", "DMA CHANNEL"); + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(pos, &ms_info.mi_connected) { + ep = list_entry(pos, struct endpt, list); + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , "%-16p\t%-16d %-16d %-16d\n", + ep, ep->rma_info.fence_refcount, + atomic_read(&ep->rma_info.tw_refcount), + ep->rma_info.dma_chan ? get_chan_num(ep->rma_info.dma_chan): -1); + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + + *eof = 1; + return l; +} + +/* Place Holder for generic SCIF debug information */ +static int +scif_debug_read(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Num gtt_entries %d\n", ms_info.nr_gtt_entries); + /* + * Tracking the number of zombies for debug. + * Need to make sure they are not being left behind forever. + */ + l += snprintf(buf + l, len - l > 0 ? 
len - l : 0, + "Num Zombie Endpoints %d\n", ms_info.mi_nr_zombies); + + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Watchdog timeout %d\n", ms_info.mi_watchdog_to); + + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Watchdog enabled %d\n", ms_info.mi_watchdog_enabled); + + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Watchdog auto reboot %d\n", ms_info.mi_watchdog_auto_reboot); + + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Huge Pages Enabled %d Detected 2mb %lld 4k %lld\n", + mic_huge_page_enable, ms_info.nr_2mb_pages, ms_info.nr_4k_pages); +#ifdef RMA_DEBUG + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "mm ref cnt %ld rma_alloc_cnt %ld rma_pin_cnt %ld mmu_notif %ld rma_unaligned_cpu_cnt %ld\n", + atomic_long_read(&ms_info.rma_mm_cnt), + atomic_long_read(&ms_info.rma_alloc_cnt), + atomic_long_read(&ms_info.rma_pin_cnt), + atomic_long_read(&ms_info.mmu_notif_cnt), + atomic_long_read(&ms_info.rma_unaligned_cpu_cnt)); +#endif + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "List empty? mi_uaccept %d mi_listen %d mi_zombie %d " + "mi_connected %d mi_disconnected %d\n", + list_empty(&ms_info.mi_uaccept), + list_empty(&ms_info.mi_listen), + list_empty(&ms_info.mi_zombie), + list_empty(&ms_info.mi_connected), + list_empty(&ms_info.mi_disconnected)); + + *eof = 1; + return l; +} + +static int +scif_dev_info(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + int node; + +#ifdef _MIC_SCIF_ + micscif_get_node_info(); + + mutex_lock(&ms_info.mi_conflock); +#endif + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Total Nodes %d Self Node Id %d Maxid %d\n", + ms_info.mi_total, ms_info.mi_nodeid, ms_info.mi_maxid); + + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "%-16s\t%-16s %-16s\t%-16s\t%-8s\t%-8s\t%-8s\n", + "node_id", "state", "scif_ref_cnt", "scif_map_ref_cnt", + "wait_status", "conn count", "numa_node"); + + for (node = 0; node <= ms_info.mi_maxid; node++) + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "%-16d\t%-16s\t0x%-16lx\t%-16d\t%-16lld\t%-16d\t%-16d\n", + scif_dev[node].sd_node, scifdev_state[scif_dev[node].sd_state], + atomic_long_read(&scif_dev[node].scif_ref_cnt), + scif_dev[node].scif_map_ref_cnt, + scif_dev[node].sd_wait_status, + scif_dev[node].num_active_conn, + scif_dev[node].sd_numa_node); +#ifdef _MIC_SCIF_ + mutex_unlock(&ms_info.mi_conflock); +#endif + + *eof = 1; + return l; +} + +static int +scif_suspend(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + +#ifdef _MIC_SCIF_ + micscif_suspend_handler(NULL, 0, NULL); +#else + { + int node; + uint64_t ret; + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Removing Nodes mask 0x7\n"); + for (node = 1; node < ms_info.mi_total; node++) { + ret = micscif_disconnect_node(node, 0 , 1); + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Node %d requested disconnect. ret = %lld\n", + node, ret); + } + } +#endif + + *eof = 1; + return l; +} + +#ifdef _MIC_SCIF_ +static int +scif_crash(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "%s %d Crash the Card to test Lost Nodes\n", __func__, __LINE__); + panic("Test Lost Node! Crash the card intentionally\n"); + *eof = 1; + return l; +} + +static int +scif_bugon(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + l += snprintf(buf + l, len - l > 0 ? 
len - l : 0, + "%s %d Bug on the Card to test Lost Nodes\n", __func__, __LINE__); + BUG_ON(1); + *eof = 1; + return l; +} +#endif + +static int +scif_fail_suspend(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + +#ifdef _MIC_SCIF_ + micscif_fail_suspend_handler(NULL, 0, NULL); + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Failing Suspend\n"); +#endif + + *eof = 1; + return l; +} + +static int +scif_resume(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + +#ifdef _MIC_SCIF_ + micscif_resume_handler(NULL, 0, NULL); + l += snprintf(buf + l, len - l > 0 ? len - l : 0, + "Resuming/Waking up node\n"); +#endif + + *eof = 1; + return l; +} + +static int +scif_get_reg_cache_limit(char *buf, char **start, off_t offset, int len, int *eof, void *data) +{ + int l = 0; + + l += snprintf(buf + l, len - l > 0 ? len - l : 0 , + "reg_cache_limit = 0x%lx\n", ms_info.mi_rma_tc_limit); + *eof = 1; + return l; +} + +static int +scif_set_reg_cache_limit(struct file *file, const char __user *buffer, + unsigned long len, void *unused) +{ + unsigned long data = 0; + char *p; + if (!(p = kzalloc(len, GFP_KERNEL))) + return -ENOMEM; + if (copy_from_user(p, buffer, len)) + return -EFAULT; + data = simple_strtoul(p, NULL, 0); + ms_info.mi_rma_tc_limit = data; + return len; +} +#endif + +#ifdef _MIC_SCIF_ +static int smpt_seq_show(struct seq_file *s, void *pos) +{ + volatile uint8_t *mm_sbox = scif_dev[SCIF_HOST_NODE].mm_sbox; + uint32_t smpt_reg_offset = SBOX_SMPT00; + uint32_t smpt_reg_val; + int i; + + seq_printf(s, + "=================================================================\n"); + seq_printf(s,"%-11s| %-15s %-14s %-5s \n", + "SMPT entry", "SMPT reg value", "DMA addr", "SNOOP"); + seq_printf(s, + "=================================================================\n"); + + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) { + smpt_reg_val = readl(mm_sbox + smpt_reg_offset); + seq_printf(s,"%-11d| %-#15x %-#14llx %-5s \n", + i, smpt_reg_val, ((uint64_t)smpt_reg_val >> 2ULL) << MIC_SYSTEM_PAGE_SHIFT, + (smpt_reg_val & 0x1) ? 
"OFF" : "ON"); + smpt_reg_offset += 4; + } + + seq_printf(s, + "=================================================================\n"); + return 0; +} + +#else +static int smpt_seq_show(struct seq_file *s, void *pos) +{ + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + int i; + unsigned long flags; + + mic_ctx = get_per_dev_ctx(bid); + seq_printf(s, + "=================================================================\n"); + seq_printf(s,"Board %-2d |%-10s| %-14s %-10s \n", + (int)bid + 1, "SMPT entry", "DMA addr", "Reference Count"); + seq_printf(s, + "=================================================================\n"); + + if (mic_ctx && mic_ctx->mic_smpt) { + spin_lock_irqsave(&mic_ctx->smpt_lock, flags); + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) { + seq_printf(s,"%9s|%-10d| %-#14llx %-10lld \n", + " ", i, mic_ctx->mic_smpt[i].dma_addr, mic_ctx->mic_smpt[i].ref_count); + } + spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags); + } + + seq_printf(s, + "================================================================X\n"); + return 0; +} +#endif + +static int smpt_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, smpt_seq_show, inode->i_private); +} + +static int smpt_debug_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} + +static struct file_operations smpt_file_ops = { + .owner = THIS_MODULE, + .open = smpt_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = smpt_debug_release +}; + +#ifndef _MIC_SCIF_ +static int log_buf_seq_show(struct seq_file *s, void *pos) +{ + uint64_t bid = (uint64_t)s->private; + mic_ctx_t *mic_ctx; + void *log_buf_len_va, *log_buf_va; + struct micscif_dev *dev; + + mic_ctx = get_per_dev_ctx(bid); + if (!mic_ctx || !mic_ctx->log_buf_addr || !mic_ctx->log_buf_len) + goto done; + + if (mic_ctx->bi_family == FAMILY_ABR) { + seq_printf(s, "log buffer display not supported for KNF\n"); + goto done; + } + + dev = &scif_dev[mic_get_scifnode_id(mic_ctx)]; + log_buf_len_va = virt_to_phys(mic_ctx->log_buf_len) + mic_ctx->aper.va; + log_buf_va = virt_to_phys(mic_ctx->log_buf_addr) + mic_ctx->aper.va; + + mutex_lock(&mic_ctx->state_lock); + switch (mic_ctx->state) { + case MIC_BOOT: + case MIC_BOOTFAIL: + case MIC_ONLINE: + case MIC_SHUTDOWN: + case MIC_LOST: + micscif_inc_node_refcnt(dev, 1); + seq_write(s, log_buf_va, *(int*)log_buf_len_va); + micscif_dec_node_refcnt(dev, 1); + break; + case MIC_NORESPONSE: + case MIC_READY: + /* Cannot access GDDR while reset is ongoing */ + case MIC_RESET: + case MIC_RESETFAIL: + case MIC_INVALID: + default: + break; + } + mutex_unlock(&mic_ctx->state_lock); +done: + return 0; +} + +static int log_buf_open(struct inode *inode, struct file *file) +{ + return single_open(file, log_buf_seq_show, inode->i_private); +} + +static int log_buf_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} + +static struct file_operations log_buf_ops = { + .owner = THIS_MODULE, + .open = log_buf_open, + .read = seq_read, + .llseek = seq_lseek, + .release = log_buf_release +}; +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +void +scif_proc_init(void) +{ + if ((scif_proc = proc_mkdir("scif", NULL)) != NULL) { + proc_create_data("ep", 0444, scif_proc, &scif_ep_fops, NULL); + proc_create_data("rma_window", 0444, scif_proc, &scif_rma_window_fops, NULL); + proc_create_data("rma_xfer", 0444, scif_proc, &scif_rma_xfer_fops, NULL); + proc_create_data("scif_dev", 0444, scif_proc, &scif_dev_fops, NULL); + 
proc_create_data("debug", 0444, scif_proc, &scif_debug_fops, NULL); + proc_create_data("suspend", 0444, scif_proc, &scif_suspend_fops, NULL); + proc_create("reg_cache_limit", S_IFREG | S_IRUGO | S_IWUGO, scif_proc, + &scif_cache_limit_fops); + } +} +#else +void +scif_proc_init(void) +{ + struct proc_dir_entry *reg_cache_limit_entry; + struct proc_dir_entry *ep_entry; + + if ((scif_proc = create_proc_entry("scif", S_IFDIR | S_IRUGO, NULL)) != NULL) { + create_proc_read_entry("rma_window", 0444, scif_proc, scif_rma_window_read, NULL); + create_proc_read_entry("rma_xfer", 0444, scif_proc, scif_rma_xfer_read, NULL); + create_proc_read_entry("scif_dev", 0444, scif_proc, scif_dev_info, NULL); + create_proc_read_entry("debug", 0444, scif_proc, scif_debug_read, NULL); + create_proc_read_entry("suspend", 0444, scif_proc, scif_suspend, NULL); + create_proc_read_entry("fail_suspend", 0444, scif_proc, scif_fail_suspend, NULL); + create_proc_read_entry("resume", 0444, scif_proc, scif_resume, NULL); +#ifdef _MIC_SCIF_ + create_proc_read_entry("crash", 0444, scif_proc, scif_crash, NULL); + create_proc_read_entry("bugon", 0444, scif_proc, scif_bugon, NULL); +#endif + if ((reg_cache_limit_entry = create_proc_entry("reg_cache_limit", S_IFREG | S_IRUGO | S_IWUGO, scif_proc))) { + reg_cache_limit_entry->write_proc = scif_set_reg_cache_limit; + reg_cache_limit_entry->read_proc = scif_get_reg_cache_limit; + reg_cache_limit_entry->data = NULL; + } + if ((ep_entry = create_proc_entry("ep", S_IFREG | S_IRUGO | S_IWUGO, scif_proc))) { + ep_entry->proc_fops = &scif_ep_fops; + } + + + } +} +#endif // LINUX VERSION + +#ifdef _MIC_SCIF_ +void +mic_debug_init(void) +{ + if ((mic_debug = debugfs_create_dir("mic_debug", NULL))) { + debugfs_create_file("smpt", 0444, mic_debug, NULL, &smpt_file_ops); + debugfs_create_u8("enable_msg_logging", 0666, mic_debug, &(ms_info.en_msg_log)); + } +} +#else +void +mic_debug_init(mic_ctx_t *mic_ctx) +{ + char name[DEBUG_LEN]; + uint64_t id = mic_ctx->bi_id; + struct dentry *child; + + if (!mic_debug) + mic_debug = debugfs_create_dir("mic_debug", NULL); + + if (mic_debug) { + snprintf(name, DEBUG_LEN, "mic%d", (int)id); + if ((child = debugfs_create_dir(name, mic_debug))) { + debugfs_create_file("smpt", 0444, child, (void*)id, &smpt_file_ops); + debugfs_create_file("log_buf", 0444, child, (void*)id, &log_buf_ops); + } + debugfs_create_u8("enable_msg_logging", 0666, mic_debug, &(ms_info.en_msg_log)); + } +} +#endif + +void +mic_debug_uninit(void) +{ + debugfs_remove_recursive(mic_debug); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +void +scif_proc_cleanup(void) +{ + if (scif_proc) + remove_proc_subtree("scif", NULL); +} +#else +void +scif_proc_cleanup(void) +{ + if (scif_proc) { + remove_proc_entry("reg_cache_limit", scif_proc); + remove_proc_entry("ep", scif_proc); + remove_proc_entry("rma_window", scif_proc); + remove_proc_entry("rma_xfer", scif_proc); + remove_proc_entry("scif_dev", scif_proc); + remove_proc_entry("debug", scif_proc); + remove_proc_entry("suspend", scif_proc); + remove_proc_entry("fail_suspend", scif_proc); + remove_proc_entry("resume", scif_proc); +#ifdef _MIC_SCIF_ + remove_proc_entry("crash", scif_proc); + remove_proc_entry("bugon", scif_proc); +#endif + remove_proc_entry("scif", NULL); + scif_proc = NULL; + } +} +#endif + +#ifdef _MIC_SCIF_ +extern int micscif_max_msg_id; + +/* + * Test entry point for error injection + */ +int +micscif_error_inject(int scenario) +{ + switch (scenario) { + case 1: + micscif_max_msg_id = 0; + break; + default: + 
pr_debug("Illegal error injection scenario %d\n", scenario); + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(micscif_error_inject); +#endif // _MIC_SCIF_ diff --git a/micscif/micscif_fd.c b/micscif/micscif_fd.c new file mode 100644 index 0000000..9a4eb9c --- /dev/null +++ b/micscif/micscif_fd.c @@ -0,0 +1,528 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic/micscif.h" + +struct mic_priv { + scif_epd_t epd; +}; + + +int +scif_fdopen(struct file *f) +{ + struct mic_priv *priv = (struct mic_priv *) + kmalloc(sizeof(struct mic_priv), GFP_KERNEL); + /* + * Not a valid errno as defined in scif.h but should be? + */ + if (!priv) + return -ENOMEM; + + /* SCIF device */ + if (!(priv->epd = __scif_open())) { + kfree(priv); + return -ENOMEM; + } + + ((f)->private_data) = priv; + return 0; +} + +int +scif_fdclose(struct file *f) +{ + struct mic_priv *priv = ((f)->private_data); + int err = 0; + + /* Only actually request of tear down of end point if file reference + * count is greater than 1. This accounts for the fork() issue. 
+ */ + if (atomic64_read(&f->f_count) == 0) { + err = __scif_close(priv->epd); + kfree(priv); + } + return err; +} + +int +micscif_mmap(struct file *f, struct vm_area_struct *vma) +{ + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + return scif_mmap(vma, priv->epd); +} + +unsigned int +micscif_poll(struct file *f, poll_table *wait) +{ + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + return __scif_pollfd(f, wait, (struct endpt *)priv->epd); +} + +int +micscif_flush(struct file *f, fl_owner_t id) +{ + struct mic_priv *priv; + dev_t dev; + struct endpt *ep; + + priv = (struct mic_priv *)f->private_data; + dev = f->f_path.dentry->d_inode->i_rdev; + if (MINOR(dev) != 1) // SCIF MINOR + return 0; + + ep = priv->epd; + + /* Handles fork issue, making suer an endpoint only closes when the original + * thread that created it tries to close it, or when there are no more + * references to it. + */ + if (ep->files == id) + __scif_flush(ep); + + return 0; +} + + +static __always_inline void +scif_err_debug(int err, const char *str) +{ + /* + * ENOTCONN is a common uninteresting error which is + * flooding debug messages to the console unnecessarily. + */ + if (err < 0 && err != -ENOTCONN) + pr_debug("%s err %d\n", str, err); +} + + + +int +scif_process_ioctl(struct file *f, unsigned int cmd, uint64_t arg) +{ + struct mic_priv *priv = ((f)->private_data); + void __user *argp = (void __user *)arg; + int err = 0; + struct scifioctl_msg request; + bool non_block = false; + + non_block = !!(f->f_flags & O_NONBLOCK); + + switch (cmd) { + case SCIF_BIND: + { + int pn; + + if (copy_from_user(&pn, argp, sizeof(pn))) { + return -EFAULT; + } + + if ((pn = __scif_bind(priv->epd, pn)) < 0) { + return pn; + } + + if (copy_to_user(argp, &pn, sizeof(pn))) { + return -EFAULT; + } + + return 0; + } + case SCIF_LISTEN: + return __scif_listen(priv->epd, arg); + case SCIF_CONNECT: + { + struct scifioctl_connect req; + struct endpt *ep = (struct endpt *)priv->epd; + + if (copy_from_user(&req, argp, sizeof(struct scifioctl_connect))) { + return -EFAULT; + } + + if ((err = __scif_connect(priv->epd, &req.peer, non_block)) < 0) { + return err; + } + + req.self.node = ep->port.node; + req.self.port = ep->port.port; + + if (copy_to_user(argp, &req, sizeof(struct scifioctl_connect))) { + return -EFAULT; + } + + + return 0; + } + // Accept is done in two halves. Thes request ioctl does the basic functility of accepting + // the request and returning the information about it including the internal ID of the + // end point. The register is done with the internID on a new file desciptor opened by the + // requesting process. + case SCIF_ACCEPTREQ: + { + struct scifioctl_accept request; + unsigned long sflags; + scif_epd_t *ep = (scif_epd_t *)&request.endpt; + + if (copy_from_user(&request, argp, sizeof(struct scifioctl_accept))) { + return -EFAULT; + } + + if ((err = __scif_accept(priv->epd, &request.peer, ep, request.flags)) < 0) { + return err; + } + + if (copy_to_user(argp, &request, sizeof(struct scifioctl_accept))) { + scif_close(*ep); + return -EFAULT; + } + + // Add to the list of user mode eps where the second half of the accept + // is not yet completed. 
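+ // The second half, SCIF_ACCEPTREG, later removes the endpoint from these lists and attaches it to the new file descriptor's private data.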
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_add_tail(&((*ep)->miacceptlist), &ms_info.mi_uaccept); + list_add_tail(&((*ep)->liacceptlist), &priv->epd->li_accept); + (*ep)->listenep = priv->epd; + priv->epd->acceptcnt++; + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + + return 0; + } + case SCIF_ACCEPTREG: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct endpt *newep; + struct endpt *lisep; + struct endpt *ep; + struct endpt *fep = NULL; + struct endpt *tmpep; + struct list_head *pos, *tmpq; + unsigned long sflags; + + // Finally replace the pointer to the accepted endpoint + if (copy_from_user(&newep, argp, sizeof(void *))) + return -EFAULT; + + // Remove form the user accept queue + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) { + tmpep = list_entry(pos, struct endpt, miacceptlist); + if (tmpep == newep) { + list_del(pos); + fep = tmpep; + break; + } + } + + if (fep == NULL) { + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + return -ENOENT; + } + + lisep = newep->listenep; + list_for_each_safe(pos, tmpq, &lisep->li_accept) { + tmpep = list_entry(pos, struct endpt, liacceptlist); + if (tmpep == newep) { + list_del(pos); + lisep->acceptcnt--; + break; + } + } + + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + + // Free the resources automatically created from the open. + micscif_teardown_ep(priv->epd); + micscif_add_epd_to_zombie_list(priv->epd, !MI_EPLOCK_HELD); + priv->epd = newep; + ep = (struct endpt *)priv->epd; + ep = ep; + return 0; + } + case SCIF_SEND: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + + if (copy_from_user(&request, argp, + sizeof(struct scifioctl_msg))) { + err = -EFAULT; + goto send_err; + } + + if ((err = scif_user_send(priv->epd, request.msg, + request.len, request.flags)) < 0) + goto send_err; + + if (copy_to_user(&((struct scifioctl_msg*)argp)->out_len, + &err, sizeof(err))) { + err = -EFAULT; + goto send_err; + } + err = 0; +send_err: + scif_err_debug(err, "scif_send"); + return err; + } + case SCIF_RECV: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + + if (copy_from_user(&request, argp, + sizeof(struct scifioctl_msg))) { + err = -EFAULT; + goto recv_err; + } + + if ((err = scif_user_recv(priv->epd, request.msg, + request.len, request.flags)) < 0) + goto recv_err; + + if (copy_to_user(&((struct scifioctl_msg*)argp)->out_len, + &err, sizeof(err))) { + err = -EFAULT; + goto recv_err; + } + err = 0; +recv_err: + scif_err_debug(err, "scif_recv"); + return err; + } + case SCIF_REG: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_reg reg; + off_t ret; + + if (copy_from_user(®, argp, sizeof(reg))) { + err = -EFAULT; + goto reg_err; + } + if (reg.flags & SCIF_MAP_KERNEL) { + err = -EINVAL; + goto reg_err; + } + if ((ret = __scif_register(priv->epd, reg.addr, reg.len, + reg.offset, reg.prot, reg.flags)) < 0) { + err = (int)ret; + goto reg_err; + } + + if (copy_to_user(&((struct scifioctl_reg*)argp)->out_offset, + &ret, sizeof(reg.out_offset))) { + err = -EFAULT; + goto reg_err; + } + err = 0; +reg_err: + scif_err_debug(err, "scif_register"); + return err; + } + case SCIF_UNREG: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_unreg unreg; + + if (copy_from_user(&unreg, argp, sizeof(unreg))) { + err = -EFAULT; + goto unreg_err; + } + err = __scif_unregister(priv->epd, unreg.offset, unreg.len); +unreg_err: + scif_err_debug(err, 
"scif_unregister"); + return err; + } + case SCIF_READFROM: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto readfrom_err; + } + err = __scif_readfrom(priv->epd, + copy.loffset, + copy.len, + copy.roffset, + copy.flags); +readfrom_err: + scif_err_debug(err, "scif_readfrom"); + return err; + } + case SCIF_WRITETO: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto writeto_err; + } + err = __scif_writeto(priv->epd, + copy.loffset, + copy.len, + copy.roffset, + copy.flags); +writeto_err: + scif_err_debug(err, "scif_writeto"); + return err; + } + case SCIF_VREADFROM: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto vreadfrom_err; + } + err = __scif_vreadfrom(priv->epd, + copy.addr, + copy.len, + copy.roffset, + copy.flags); +vreadfrom_err: + scif_err_debug(err, "scif_vreadfrom"); + return err; + } + case SCIF_VWRITETO: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto vwriteto_err; + } + err = __scif_vwriteto(priv->epd, + copy.addr, + copy.len, + copy.roffset, + copy.flags); +vwriteto_err: + scif_err_debug(err, "scif_vwriteto"); + return err; + } + case SCIF_GET_NODEIDS: + { + struct scifioctl_nodeIDs nodeIDs; + int entries; + uint16_t *nodes; + uint16_t self; + + if (copy_from_user(&nodeIDs, argp, sizeof(nodeIDs))) { + err = -EFAULT; + goto getnodes_err2; + } + + entries = SCIF_MIN(MAX_BOARD_SUPPORTED, nodeIDs.len); + + nodes = kmalloc(sizeof(uint16_t) * entries, GFP_KERNEL); + if ( (entries != 0) && (!nodes) ){ + err = -ENOMEM; + goto getnodes_err2; + } + nodeIDs.len = scif_get_nodeIDs(nodes, entries, &self); + + if (copy_to_user(nodeIDs.nodes, + nodes, sizeof(uint16_t) * entries)) { + err = -EFAULT; + goto getnodes_err1; + } + + if (copy_to_user(nodeIDs.self, + &self, sizeof(uint16_t))) { + err = -EFAULT; + goto getnodes_err1; + } + + if (copy_to_user(argp, &nodeIDs, sizeof(nodeIDs))) { + err = -EFAULT; + goto getnodes_err1; + } +getnodes_err1: + kfree(nodes); +getnodes_err2: + return err; + } + case SCIF_FENCE_MARK: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_fence_mark mark; + int tmp_mark = 0; + + if (copy_from_user(&mark, argp, sizeof(mark))) { + err = -EFAULT; + goto fence_mark_err; + } + if ((err = __scif_fence_mark(priv->epd, + mark.flags, &tmp_mark))) + goto fence_mark_err; + if (copy_to_user(mark.mark, &tmp_mark, sizeof(tmp_mark))) { + err = -EFAULT; + goto fence_mark_err; + } +fence_mark_err: + scif_err_debug(err, "scif_fence_mark"); + return err; + } + case SCIF_FENCE_WAIT: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + err = __scif_fence_wait(priv->epd, arg); + scif_err_debug(err, "scif_fence_wait"); + return err; + } + case SCIF_FENCE_SIGNAL: + { + struct mic_priv *priv = (struct mic_priv *)((f)->private_data); + struct scifioctl_fence_signal signal; + + if (copy_from_user(&signal, argp, sizeof(signal))) { + err = -EFAULT; + goto fence_signal_err; + } + + err = __scif_fence_signal(priv->epd, signal.loff, + signal.lval, signal.roff, signal.rval, signal.flags); +fence_signal_err: + scif_err_debug(err, "scif_fence_signal"); + 
return err; + } + case SCIF_GET_VERSION: + { + return SCIF_VERSION; + } + } + return -EINVAL; +} diff --git a/micscif/micscif_intr.c b/micscif/micscif_intr.c new file mode 100644 index 0000000..10268e0 --- /dev/null +++ b/micscif/micscif_intr.c @@ -0,0 +1,159 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "mic/micscif.h" +#include "mic/micscif_intr.h" +#include "mic/micscif_nodeqp.h" +#include "mic_common.h" + +/* Runs in the context of sd_intr_wq */ +static void micscif_intr_bh_handler(struct work_struct *work) +{ + struct micscif_dev *scifdev = + container_of(work, struct micscif_dev, sd_intr_bh); + + /* figure out which qp we got a recv on */ + struct micscif_qp *qp = micscif_nodeqp_nextmsg(scifdev); + if (qp != NULL) { + if (is_self_scifdev(scifdev)) + micscif_loopb_msg_handler(scifdev, qp); + else + micscif_nodeqp_intrhandler(scifdev, qp); + } +} + +int micscif_setup_interrupts(struct micscif_dev *scifdev) +{ + if (!scifdev->sd_intr_wq) { + snprintf(scifdev->sd_intr_wqname, sizeof(scifdev->sd_intr_wqname), + "SCIF INTR %d", scifdev->sd_node); + + /* FIXME: Fix windows */ + if (!(scifdev->sd_intr_wq = + __mic_create_singlethread_workqueue(scifdev->sd_intr_wqname))) + return -ENOMEM; + + INIT_WORK(&scifdev->sd_intr_bh, micscif_intr_bh_handler); + } + return 0; +} + +void micscif_destroy_interrupts(struct micscif_dev *scifdev) +{ + destroy_workqueue(scifdev->sd_intr_wq); +} + +#ifdef _MIC_SCIF_ +irqreturn_t micscif_intr_handler(int irq, void *dev_id) +{ + struct micscif_dev *dev = (struct micscif_dev *)dev_id; + queue_work(dev->sd_intr_wq, &dev->sd_intr_bh); + return IRQ_HANDLED; +} + +/* + * register_scif_intr_handler() - Registers SCIF interrupt handler with + * appropriate IRQ + * @dev: per node dev structure to store the intr handle + * + * IRQ 17 - 24 Corresponds to RDMASR registers RDMASR0 - RRDMASR7. + * RDMASR registers are chosen based on the lowest ref count. + * There are 8 RDMASRS for the host and the nodes. So When the number of + * nodes added to the current node's p2p network increases beyond + * 7, it starts sharing the interrupt. 
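As an illustration of the slot selection implemented below: if the eight RDMASR slots currently hold reference counts {2, 1, 1, 1, 1, 1, 1, 1}, the scan settles on handle 1 (the first slot with the minimum count), increments that count, and attaches this node's handler to get_rdmasr_irq(1) with IRQF_SHARED, so a ninth peer shares the least-loaded interrupt rather than failing to register.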
+ */ +int +register_scif_intr_handler(struct micscif_dev *dev) +{ + unsigned int handle = 0; + unsigned int i; + int ret; + + mutex_lock(&ms_info.mi_conflock); + + /* Find the first lowest ref count */ + for (i = 0; i < MAX_RDMASR; i++) + if (ms_info.mi_intr_rcnt[handle] > + ms_info.mi_intr_rcnt[i]) + handle = i; + + if ((ret = request_irq(get_rdmasr_irq(handle), micscif_intr_handler, + IRQF_SHARED, dev->sd_intr_wqname, dev))) { + printk(KERN_ERR "Cannot request irq number %d, ret = %d\n" + , get_rdmasr_irq(handle), ret); + goto error; + } + + ms_info.mi_intr_rcnt[handle]++; + dev->sd_intr_handle = handle; + + printk("Registered interrupt handler for node %d, for IRQ = %d," + "handle = %d\n", dev->sd_node, get_rdmasr_irq(handle), handle); + +error: + mutex_unlock(&ms_info.mi_conflock); + return ret; +} + +/* + * deregister_scif_intr_handler() - Deregisters SCIF interrupt + * handler from appropriate IRQ + * @dev: per node dev structure to retrieve the intr handle + * + */ +void +deregister_scif_intr_handler(struct micscif_dev *dev) +{ + unsigned int handle = dev->sd_intr_handle; + + if (handle >= MAX_RDMASR) + return; + + mutex_lock(&ms_info.mi_conflock); + ms_info.mi_intr_rcnt[handle]--; + + if (ms_info.mi_intr_rcnt[handle] < 0) { + printk("scif intr deregister negative ref count" + " for node %d, handle = %d, IRQ = %d\n", dev->sd_node, + handle, get_rdmasr_irq(handle)); + WARN_ON(1); + } + + mutex_unlock(&ms_info.mi_conflock); + free_irq(get_rdmasr_irq(handle), dev); + printk("Deregistered interrupt handler for node %d, for IRQ = %d," + "handle = %d\n", dev->sd_node, get_rdmasr_irq(handle), handle); +} +#endif /* _MIC_SCIF_ */ diff --git a/micscif/micscif_main.c b/micscif/micscif_main.c new file mode 100644 index 0000000..45d5bf4 --- /dev/null +++ b/micscif/micscif_main.c @@ -0,0 +1,606 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34)) +#include +#endif + +#include +#include +#include +#include +//#include +#include +#include +#include +/* Include this for suspend/resume notifications from pm driver */ +#include + +#ifdef CONFIG_MK1OM +#define MICPM_DEVEVENT_SUSPEND 1 +#define MICPM_DEVEVENT_RESUME 2 +#define MICPM_DEVEVENT_FAIL_SUSPEND 3 +extern void micpm_device_register(struct notifier_block *n); +extern void micpm_device_unregister(struct notifier_block *n); +#endif + +int scif_id = 0; +module_param(scif_id, int, 0400); +MODULE_PARM_DESC(scif_id, "Set scif driver node ID"); + +ulong scif_addr = 0; +module_param(scif_addr, ulong, 0400); +MODULE_PARM_DESC(scif_addr, "Set scif driver host address"); + +struct kmem_cache *unaligned_cache; + +struct mic_info { + dev_t m_dev; + struct cdev m_cdev; + struct class * m_class; + struct device * m_scifdev; +} micinfo; + +int micscif_major = SCIF_MAJOR; +int micscif_minor = 0; + +struct micscif_info ms_info; + +// MAX MIC cards + 1 for the Host +struct micscif_dev scif_dev[MAX_BOARD_SUPPORTED + 1]; + +extern mic_dma_handle_t mic_dma_handle; + +static int mic_pm_qos_cpu_dma_lat = -1; +static int mic_host_numa_node = -1; +static unsigned long mic_p2p_proxy_thresh = -1; + +#ifdef CONFIG_MK1OM +static int micscif_devevent_handler(struct notifier_block *nb, + unsigned long event, + void *msg) +{ + if (event == MICPM_DEVEVENT_SUSPEND) + return micscif_suspend_handler(nb, event, msg); + else if (event == MICPM_DEVEVENT_RESUME) + return micscif_resume_handler(nb, event, msg); + else if (event == MICPM_DEVEVENT_FAIL_SUSPEND) + return micscif_fail_suspend_handler(nb, event, msg); + return 0; +} + +static struct notifier_block mic_deviceevent = { + .notifier_call = micscif_devevent_handler, +}; +#endif + +static int micscif_open(struct inode *in, struct file *f) +{ + dev_t dev = in->i_rdev; + + switch (MINOR(dev)) { + case 0: + /* base mic device access for testing */ + return 0; + case 1: + return scif_fdopen(f); + } + + return -EINVAL; +} + +static int micscif_ioctl(struct inode *in, struct file *f, + unsigned int cmd, unsigned long arg) +{ + dev_t dev = in->i_rdev; + + if (MINOR(dev) == 1) { + /* SCIF device */ + return scif_process_ioctl(f, cmd, arg); + } + return -EINVAL; +} + +static long micscif_unlocked_ioctl(struct file *f, + unsigned int cmd, unsigned long arg) +{ + return (long) micscif_ioctl(f->f_path.dentry->d_inode, f, cmd, arg); +} + +static int micscif_release(struct inode *in, struct file *f) +{ + dev_t dev = in->i_rdev; + + switch (MINOR(dev)) { + case 0: + /* base mic device access for testing */ + return 0; + case 1: + return scif_fdclose(f); + } + + return -EINVAL; +} + +/* TODO: Need to flush the queue, grab some lock, and probably + * notify the remote node we're going down ... 
right now, we're + * just freeing things, which is probably a bad idea :-) + */ +static int micscif_uninit_qp(struct micscif_dev *scifdev) +{ + int i; + /* first, iounmap/unmap/free any memory we mapped */ + for (i = 0; i < scifdev->n_qpairs; i++) { + iounmap(scifdev->qpairs[i].remote_qp); + iounmap(scifdev->qpairs[i].outbound_q.rb_base); + kfree((void *)scifdev->qpairs[i].inbound_q.rb_base); + } + kfree(scifdev->qpairs); + scifdev->n_qpairs = 0; + + return 0; +} + +static int micscif_reboot(struct notifier_block *notifier, unsigned long unused1, void *unused2); + +static struct notifier_block micscif_reboot_notifier = { + .notifier_call = micscif_reboot, + .priority = 0, +}; + +extern struct attribute_group scif_attr_group; + +void micscif_destroy_base(void) +{ +#ifdef CONFIG_MMU_NOTIFIER + destroy_workqueue(ms_info.mi_mmu_notif_wq); +#endif + destroy_workqueue(ms_info.mi_misc_wq); + destroy_workqueue(ms_info.mi_conn_wq); + + sysfs_remove_group(&micinfo.m_scifdev->kobj, &scif_attr_group); + device_destroy(micinfo.m_class, micinfo.m_dev + 1); + device_destroy(micinfo.m_class, micinfo.m_dev); + class_destroy(micinfo.m_class); + cdev_del(&(micinfo.m_cdev)); + unregister_chrdev_region(micinfo.m_dev, 2); +} + +static void _micscif_exit(void) +{ + struct list_head *pos, *unused; + struct scif_callback *temp; + struct micscif_dev *dev; + int i; + + pr_debug("Goodbye SCIF!\n"); + /* Cleanup P2P Node Qp/ Interrupt Handlers */ + for (i = SCIF_HOST_NODE + 1; i <= MAX_BOARD_SUPPORTED; i++) { + dev = &scif_dev[i]; + + if (is_self_scifdev(dev)) + continue; + + micscif_cleanup_scifdev(dev, DESTROY_WQ); + } + + list_for_each_safe(pos, unused, &ms_info.mi_event_cb) { + temp = list_entry(pos, struct scif_callback, list_member); + list_del(pos); + kfree(temp); + } + mutex_destroy(&ms_info.mi_event_cblock); + +#ifdef CONFIG_MK1OM + micpm_device_unregister(&mic_deviceevent); +#endif + + scif_dev[ms_info.mi_nodeid].sd_state = SCIFDEV_STOPPING; + scif_dev[SCIF_HOST_NODE].sd_state = SCIFDEV_STOPPING; + + /* The EXIT message is the last message from MIC to the Host */ + micscif_send_exit(); + + /* + * Deliberate infinite wait for a host response during driver + * unload since the host must inform other SCIF nodes about + * this node going away and then only send a response back + * to this node to avoid this nodes host shutdown handler racing + * with disconnection from the SCIF network. There is a timeout + * on the host for sending a response back so a response will + * be sent else the host has crashed. 
+ */ + wait_event(ms_info.mi_exitwq, + scif_dev[ms_info.mi_nodeid].sd_state == SCIFDEV_STOPPED); + scif_proc_cleanup(); + mic_debug_uninit(); + micscif_kmem_cache_destroy(); + + micscif_destroy_base(); + + /* Disable interrupts */ + deregister_scif_intr_handler(&scif_dev[SCIF_HOST_NODE]); + destroy_workqueue(scif_dev[SCIF_HOST_NODE].sd_intr_wq); + micscif_destroy_loopback_qp(&scif_dev[ms_info.mi_nodeid]); + + /* Close DMA device */ + close_dma_device(0, &mic_dma_handle); + + micscif_uninit_qp(&scif_dev[SCIF_HOST_NODE]); + iounmap(scif_dev[SCIF_HOST_NODE].mm_sbox); +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34)) + pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "micscif"); +#endif +} + +static void micscif_exit(void) +{ + unregister_reboot_notifier(&micscif_reboot_notifier); + _micscif_exit(); +} + +static int micscif_reboot(struct notifier_block *notifier, unsigned long unused1, void *unused2) +{ + _micscif_exit(); + return NOTIFY_OK; +} + +struct file_operations micscif_ops = { + .owner = THIS_MODULE, + .unlocked_ioctl = micscif_unlocked_ioctl, + .mmap = micscif_mmap, + .poll = micscif_poll, + .flush = micscif_flush, + .open = micscif_open, + .release = micscif_release, +}; + +static char * scif_devnode(struct device *dev, mode_t *mode) +{ + return kasprintf(GFP_KERNEL, "mic/%s", dev_name(dev)); +} + +// Setup the base informaiton for the driver. No interface specific code. +static int micscif_setup_base(void) +{ + long int result; + + if (micscif_major) { + micinfo.m_dev = MKDEV(micscif_major, micscif_minor); + result = register_chrdev_region(micinfo.m_dev, 2, "micscif"); + } else { + result = alloc_chrdev_region(&micinfo.m_dev, micscif_minor, 2, "micscif"); + micscif_major = MAJOR(micinfo.m_dev); + } + + if (result >= 0) { + cdev_init(&(micinfo.m_cdev), &micscif_ops); + micinfo.m_cdev.owner = THIS_MODULE; + if ((result = cdev_add(&(micinfo.m_cdev), micinfo.m_dev, 2))) + goto unreg_chrdev; + } else { + goto unreg_chrdev; + } + + micinfo.m_class = class_create(THIS_MODULE, "micscif"); + if (IS_ERR(micinfo.m_class)) { + result = PTR_ERR(micinfo.m_class); + goto del_m_dev; + } + + micinfo.m_class->devnode = scif_devnode; + if (IS_ERR((int *)(result = + (long int)device_create(micinfo.m_class, NULL, micinfo.m_dev, NULL, "mic")))) { + result = PTR_ERR((int *)result); + goto class_destroy; + } + if (IS_ERR(micinfo.m_scifdev = + device_create(micinfo.m_class, NULL, micinfo.m_dev + 1, NULL, "scif"))) { + result = PTR_ERR(micinfo.m_scifdev); + goto device_destroy; + } + if ((result = sysfs_create_group(&micinfo.m_scifdev->kobj, &scif_attr_group))) + goto device_destroy1; + + spin_lock_init(&ms_info.mi_eplock); + spin_lock_init(&ms_info.mi_connlock); + spin_lock_init(&ms_info.mi_rmalock); + mutex_init(&ms_info.mi_fencelock); + spin_lock_init(&ms_info.mi_nb_connect_lock); + INIT_LIST_HEAD(&ms_info.mi_uaccept); + INIT_LIST_HEAD(&ms_info.mi_listen); + INIT_LIST_HEAD(&ms_info.mi_zombie); + INIT_LIST_HEAD(&ms_info.mi_connected); + INIT_LIST_HEAD(&ms_info.mi_disconnected); + INIT_LIST_HEAD(&ms_info.mi_rma); + INIT_LIST_HEAD(&ms_info.mi_rma_tc); + INIT_LIST_HEAD(&ms_info.mi_nb_connect_list); + +#ifdef CONFIG_MMU_NOTIFIER + INIT_LIST_HEAD(&ms_info.mi_mmu_notif_cleanup); +#endif + INIT_LIST_HEAD(&ms_info.mi_fence); + if (!(ms_info.mi_misc_wq = create_singlethread_workqueue("SCIF_MISC"))) { + result = -ENOMEM; + goto remove_group; + } + INIT_WORK(&ms_info.mi_misc_work, micscif_misc_handler); + if (!(ms_info.mi_conn_wq = create_singlethread_workqueue("SCIF_NB_CONN"))) { + result = -ENOMEM; + goto 
destroy_misc_wq; + } + INIT_WORK(&ms_info.mi_conn_work, micscif_conn_handler); +#ifdef CONFIG_MMU_NOTIFIER + if (!(ms_info.mi_mmu_notif_wq = create_singlethread_workqueue("SCIF_MMU"))) { + result = -ENOMEM; + goto destroy_conn_wq; + } + INIT_WORK(&ms_info.mi_mmu_notif_work, micscif_mmu_notif_handler); +#endif + ms_info.mi_watchdog_to = DEFAULT_WATCHDOG_TO; +#ifdef MIC_IS_EMULATION + ms_info.mi_watchdog_enabled = 0; +#else + ms_info.mi_watchdog_enabled = 1; +#endif + ms_info.mi_rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT; + ms_info.mi_proxy_dma_threshold = mic_p2p_proxy_thresh; + ms_info.en_msg_log = 0; + return result; +#ifdef CONFIG_MMU_NOTIFIER +destroy_conn_wq: + destroy_workqueue(ms_info.mi_conn_wq); +#endif +destroy_misc_wq: + destroy_workqueue(ms_info.mi_misc_wq); +remove_group: + sysfs_remove_group(&micinfo.m_scifdev->kobj, &scif_attr_group); +device_destroy1: + device_destroy(micinfo.m_class, micinfo.m_dev + 1); +device_destroy: + device_destroy(micinfo.m_class, micinfo.m_dev); +class_destroy: + class_destroy(micinfo.m_class); +del_m_dev: + cdev_del(&(micinfo.m_cdev)); +unreg_chrdev: + unregister_chrdev_region(micinfo.m_dev, 2); +//error: + return result; +} + +#define SBOX_MMIO_LENGTH 0x10000 + +static int micscif_init(void) +{ + int result = 0; + int i; + phys_addr_t host_queue_phys; + phys_addr_t gtt_phys_base; + + pr_debug("HELLO SCIF!\n"); + +#if defined(CONFIG_ML1OM) + pr_debug("micscif_init(): Hello KNF!\n"); +#elif defined(CONFIG_MK1OM) + pr_debug("micscif_init(): Hello KNC!\n"); +#endif + + if (!scif_id || !scif_addr) { + printk(KERN_ERR "%s %d scif_id 0x%x scif_addr 0x%lx" + "not provided as module parameter. Fail module load", + __func__, __LINE__, scif_id, scif_addr); + return -EINVAL; + } + + for (i = 1; i <= MAX_BOARD_SUPPORTED; i++) { + scif_dev[i].sd_state = SCIFDEV_INIT; + scif_dev[i].sd_node = i; + scif_dev[i].sd_numa_node = -1; + mutex_init (&scif_dev[i].sd_lock); + init_waitqueue_head(&scif_dev[i].sd_mmap_wq); + init_waitqueue_head(&scif_dev[i].sd_wq); + init_waitqueue_head(&scif_dev[i].sd_p2p_wq); + INIT_DELAYED_WORK(&scif_dev[i].sd_p2p_dwork, + scif_poll_qp_state); + scif_dev[i].sd_p2p_retry = 0; + } + + // Setup the host node access information + // Initially only talks to the host => node 0 + scif_dev[SCIF_HOST_NODE].sd_node = SCIF_HOST_NODE; + scif_dev[SCIF_HOST_NODE].sd_state = SCIFDEV_RUNNING; + if (!(scif_dev[SCIF_HOST_NODE].mm_sbox = + ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH))) { + result = -ENOMEM; + goto error; + } + scif_dev[SCIF_HOST_NODE].scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + scif_dev[SCIF_HOST_NODE].scif_map_ref_cnt = 0; + init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_wq); + init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_mmap_wq); + mutex_init(&scif_dev[SCIF_HOST_NODE].sd_lock); + gtt_phys_base = readl(scif_dev[SCIF_HOST_NODE].mm_sbox + SBOX_GTT_PHY_BASE); + gtt_phys_base *= ((4) * 1024); + pr_debug("GTT PHY BASE in GDDR 0x%llx\n", gtt_phys_base); + pr_debug("micscif_init(): gtt_phy_base x%llx\n", gtt_phys_base); + + /* Get handle to DMA device */ + if ((result = open_dma_device(0, 0, &mic_dma_handle))) + goto unmap_sbox; + + ms_info.mi_nodeid = scif_id; + ms_info.mi_maxid = scif_id; + ms_info.mi_total = 2; // Host plus this card + +#ifdef RMA_DEBUG + ms_info.rma_unaligned_cpu_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + ms_info.rma_alloc_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + ms_info.rma_pin_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); +#ifdef CONFIG_MMU_NOTIFIER + ms_info.mmu_notif_cnt = (atomic_long_t) 
ATOMIC_LONG_INIT(0); +#endif +#endif + + pr_debug("micscif_init(): setup_card_qp \n"); + host_queue_phys = scif_addr; + mutex_init(&ms_info.mi_event_cblock); + mutex_init(&ms_info.mi_conflock); + INIT_LIST_HEAD(&ms_info.mi_event_cb); + + pr_debug("micscif_init(): setup_interrupts \n"); + /* + * Set up the workqueue thread for interrupt handling + */ + if ((result = micscif_setup_interrupts(&scif_dev[SCIF_HOST_NODE]))) + goto close_dma; + + pr_debug("micscif_init(): host_intr_handler \n"); + if ((result = micscif_setup_card_qp(host_queue_phys, &scif_dev[SCIF_HOST_NODE]))) { + if (result == -ENXIO) + goto uninit_qp; + else + goto destroy_intr_wq; + } + /* need to do this last -- as soon as the dev is setup, userspace + * can try to use the device + */ + pr_debug("micscif_init(): setup_base \n"); + if ((result = micscif_setup_base())) + goto uninit_qp; + /* + * Register the interrupt + */ + if ((result = register_scif_intr_handler(&scif_dev[SCIF_HOST_NODE]))) + goto destroy_base; + + // Setup information for self aka loopback. + scif_dev[ms_info.mi_nodeid].sd_node = ms_info.mi_nodeid; + scif_dev[ms_info.mi_nodeid].sd_numa_node = mic_host_numa_node; + scif_dev[ms_info.mi_nodeid].mm_sbox = scif_dev[SCIF_HOST_NODE].mm_sbox; + scif_dev[ms_info.mi_nodeid].scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + scif_dev[ms_info.mi_nodeid].scif_map_ref_cnt = 0; + init_waitqueue_head(&scif_dev[ms_info.mi_nodeid].sd_wq); + init_waitqueue_head(&scif_dev[ms_info.mi_nodeid].sd_mmap_wq); + mutex_init(&scif_dev[ms_info.mi_nodeid].sd_lock); + if ((result = micscif_setup_loopback_qp(&scif_dev[ms_info.mi_nodeid]))) + goto dereg_intr_handle; + scif_dev[ms_info.mi_nodeid].sd_state = SCIFDEV_RUNNING; + + unaligned_cache = micscif_kmem_cache_create(); + if (!unaligned_cache) { + result = -ENOMEM; + goto destroy_loopb; + } + scif_proc_init(); + mic_debug_init(); + + pr_debug("micscif_init(): Setup successful: 0x%llx \n", host_queue_phys); + +#ifdef CONFIG_MK1OM + micpm_device_register(&mic_deviceevent); +#endif + if ((result = register_reboot_notifier(&micscif_reboot_notifier))) + goto cache_destroy; + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34)) + result = pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "micscif", mic_pm_qos_cpu_dma_lat); + if (result) { + printk("%s %d mic_pm_qos_cpu_dma_lat %d result %d\n", + __func__, __LINE__, mic_pm_qos_cpu_dma_lat, result); + result = 0; + /* Dont fail driver load due to PM QoS API. 
Fall through */ + } +#endif + + return result; +cache_destroy: +#ifdef CONFIG_MK1OM + micpm_device_unregister(&mic_deviceevent); +#endif + micscif_kmem_cache_destroy(); +destroy_loopb: + micscif_destroy_loopback_qp(&scif_dev[ms_info.mi_nodeid]); +dereg_intr_handle: + deregister_scif_intr_handler(&scif_dev[SCIF_HOST_NODE]); +destroy_base: + pr_debug("Unable to finish scif setup for some reason: %d\n", result); + micscif_destroy_base(); +uninit_qp: + micscif_uninit_qp(&scif_dev[SCIF_HOST_NODE]); +destroy_intr_wq: + micscif_destroy_interrupts(&scif_dev[SCIF_HOST_NODE]); +close_dma: + close_dma_device(0, &mic_dma_handle); +unmap_sbox: + iounmap(scif_dev[SCIF_HOST_NODE].mm_sbox); +error: + return result; +} + +module_init(micscif_init); +module_exit(micscif_exit); + +module_param_named(huge_page, mic_huge_page_enable, bool, 0600); +MODULE_PARM_DESC(huge_page, "SCIF Huge Page Support"); + +module_param_named(ulimit, mic_ulimit_check, bool, 0600); +MODULE_PARM_DESC(ulimit, "SCIF ulimit check"); + +module_param_named(reg_cache, mic_reg_cache_enable, bool, 0600); +MODULE_PARM_DESC(reg_cache, "SCIF registration caching"); +module_param_named(p2p, mic_p2p_enable, bool, 0600); +MODULE_PARM_DESC(p2p, "SCIF peer-to-peer"); + +module_param_named(p2p_proxy, mic_p2p_proxy_enable, bool, 0600); +MODULE_PARM_DESC(p2p_proxy, "SCIF peer-to-peer proxy DMA support"); + +module_param_named(pm_qos_cpu_dma_lat, mic_pm_qos_cpu_dma_lat, int, 0600); +MODULE_PARM_DESC(pm_qos_cpu_dma_lat, "PM QoS CPU DMA latency in usecs."); + +module_param_named(numa_node, mic_host_numa_node, int, 0600); +MODULE_PARM_DESC(numa_node, "Host Numa node to which MIC is attached"); + +module_param_named(p2p_proxy_thresh, mic_p2p_proxy_thresh, ulong, 0600); +MODULE_PARM_DESC(p2p_proxy_thresh, "Transfer size after which Proxy DMA helps DMA perf"); + +MODULE_LICENSE("GPL"); +MODULE_INFO(build_number, BUILD_NUMBER); +MODULE_INFO(build_bywhom, BUILD_BYWHOM); +MODULE_INFO(build_ondate, BUILD_ONDATE); +MODULE_INFO(build_scmver, BUILD_SCMVER); diff --git a/micscif/micscif_nm.c b/micscif/micscif_nm.c new file mode 100644 index 0000000..7d34942 --- /dev/null +++ b/micscif/micscif_nm.c @@ -0,0 +1,1740 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty.
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* SCIF Node Management */ + +#include "mic/micscif.h" +#ifndef _MIC_SCIF_ +#include "mic_common.h" + +#endif +#include "mic/micscif_map.h" +#include "mic/micscif_intr.h" +#ifdef _MIC_SCIF_ +extern mic_dma_handle_t mic_dma_handle; +#else +extern bool mic_crash_dump_enabled; +#endif + + +/** + * micscif_create_node_dep: + * + * @dev: Remote SCIF device. + * @nr_pages: number of pages + * + * Increment the map SCIF device ref count and notify the host if this is the + * first dependency being created between the two nodes. + */ +void +micscif_create_node_dep(struct micscif_dev *dev, int nr_pages) +{ +#ifdef SCIF_ENABLE_PM + struct nodemsg notif_msg; + + if (dev) { + mutex_lock(&dev->sd_lock); + if (!dev->scif_map_ref_cnt) { + /* Notify Host if this is the first dependency being created */ + notif_msg.uop = SCIF_NODE_CREATE_DEP; + notif_msg.src.node = ms_info.mi_nodeid; + notif_msg.payload[0] = dev->sd_node; + /* No error handling for Host SCIF device */ + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], &notif_msg, NULL); + } + dev->scif_map_ref_cnt += nr_pages; + mutex_unlock(&dev->sd_lock); + } +#endif +} + +/** + * micscif_destroy_node_dep: + * + * @dev: Remote SCIF device. + * @nr_pages: number of pages + * + * Decrement the map SCIF device ref count and notify the host if a dependency + * no longer exists between two nodes. + */ +void +micscif_destroy_node_dep(struct micscif_dev *dev, int nr_pages) +{ +#ifdef SCIF_ENABLE_PM + struct nodemsg notif_msg; + + if (dev) { + mutex_lock(&dev->sd_lock); + dev->scif_map_ref_cnt -= nr_pages; + if (!dev->scif_map_ref_cnt) { + /* Notify Host if all dependencies have been destroyed */ + notif_msg.uop = SCIF_NODE_DESTROY_DEP; + notif_msg.src.node = ms_info.mi_nodeid; + notif_msg.payload[0] = dev->sd_node; + /* No error handling for Host SCIF device */ + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], &notif_msg, NULL); + } + mutex_unlock(&dev->sd_lock); + } +#endif +} + +/** + * micscif_callback: + * + * @node: node id of the node added/removed. + * @event_type: SCIF_NODE_ADDED if a new node is added + * SCIF_NODE_REMOVED if a node is removed + * + * Calls the callback function whenever a node is added/removed + */ +static void micscif_callback(uint16_t node, enum scif_event_type event_type) +{ + struct list_head *pos; + struct scif_callback *temp; + union eventd event; + + switch (event_type) { + case SCIF_NODE_ADDED: + event.scif_node_added = node; + break; + case SCIF_NODE_REMOVED: + event.scif_node_removed = node; + break; + default: + return; + } + + mutex_lock(&ms_info.mi_event_cblock); + list_for_each(pos, &ms_info.mi_event_cb) { + temp = list_entry(pos, struct scif_callback, list_member); + temp->callback_handler(event_type, event); + } + mutex_unlock(&ms_info.mi_event_cblock); +} + +/** + * micscif_node_remove_callback: + * + * @node: node id of the node removed.
+ * + * Calls the callback function whenever a new node is removed + */ +static void micscif_node_remove_callback(int node) +{ + micscif_callback((uint16_t)node, SCIF_NODE_REMOVED); +} + +/** + * micscif_node_add_callback: + * + * @node: node id of the node added. + * + * Calls the callback function whenever a new node is added + */ +void micscif_node_add_callback(int node) +{ + micscif_callback((uint16_t)node, SCIF_NODE_ADDED); +} + +void micscif_cleanup_qp(struct micscif_dev *dev) +{ + struct micscif_qp *qp; + + qp = &dev->qpairs[0]; + + if (!qp) + return; + + scif_iounmap((void*)qp->remote_qp, sizeof(struct micscif_qp), dev); + scif_iounmap((void*)dev->qpairs[0].outbound_q.rb_base, sizeof(struct micscif_qp), dev); + qp->remote_qp = NULL; + dev->qpairs[0].local_write = 0; + dev->qpairs[0].inbound_q.current_write_offset = 0; + dev->qpairs[0].inbound_q.current_read_offset = 0; +#ifdef _MIC_SCIF_ + kfree((void*)(qp->inbound_q.rb_base)); + kfree(dev->qpairs); + qp = NULL; +#endif +} + +/* + * micscif_cleanup_scifdev + * + * @dev: Remote SCIF device. + * Uninitialize SCIF data structures for remote SCIF device. + */ +void micscif_cleanup_scifdev(struct micscif_dev *dev, bool destroy_wq) +{ + int64_t ret; +#ifndef _MIC_SCIF_ + mic_ctx_t *mic_ctx; +#endif + if (SCIFDEV_NOTPRESENT == dev->sd_state) { +#ifdef _MIC_SCIF_ + /* + * If there are any stale qp allocated due to + * p2p connection failures then cleanup now + */ + micscif_cleanup_qp(dev); +#endif + return; + } + + dev->sd_wait_status = OP_FAILED; + wake_up(&dev->sd_wq); + +#ifdef _MIC_SCIF_ + /* + * Need to protect destruction of the workqueue since this code + * can be called from two contexts: + * a) Remove Node Handling. + * b) SCIF driver unload + */ + mutex_lock(&dev->sd_lock); + if ((SCIFDEV_RUNNING != dev->sd_state) && (SCIFDEV_SLEEPING != dev->sd_state)) + goto unlock; + dev->sd_state = SCIFDEV_STOPPED; + wake_up(&dev->sd_p2p_wq); + mutex_unlock(&dev->sd_lock); + deregister_scif_intr_handler(dev); + if (destroy_wq && dev->sd_intr_wq) { + destroy_workqueue(dev->sd_intr_wq); + dev->sd_intr_wq = NULL; + } +#endif + + mutex_lock(&dev->sd_lock); +#ifndef _MIC_SCIF_ + if ((SCIFDEV_RUNNING != dev->sd_state) && (SCIFDEV_SLEEPING != dev->sd_state)) + goto unlock; + dev->sd_state = SCIFDEV_STOPPED; +#endif + /* + * Change the state of the remote SCIF device + * to idle as soon as the activity counter is + * zero. The node state and ref count is + * maintained within a single atomic_long_t. + * No timeout for this tight loop since we expect + * the node to complete the API it is currently + * executing following which the scif_ref_count + * will drop to zero. + */ + do { + ret = atomic_long_cmpxchg( + &dev->scif_ref_cnt, 0, SCIF_NODE_IDLE); + cpu_relax(); + } while (ret && ret != SCIF_NODE_IDLE); + + mutex_unlock(&dev->sd_lock); + /* Cleanup temporary registered windows */ + flush_workqueue(ms_info.mi_misc_wq); + mutex_lock(&dev->sd_lock); + +#ifdef _MIC_SCIF_ + drain_dma_global(mic_dma_handle); +#else + mic_ctx = get_per_dev_ctx(dev->sd_node - 1); + drain_dma_global(mic_ctx->dma_handle); + micscif_destroy_p2p(mic_ctx); +#endif + scif_invalidate_ep(dev->sd_node); + micscif_kill_apps_with_mmaps(dev->sd_node); + + micscif_cleanup_qp(dev); + mutex_unlock(&dev->sd_lock); +#ifndef _MIC_SCIF_ + mutex_lock(&ms_info.mi_conflock); + ms_info.mi_mask &= ~(0x1 << dev->sd_node); + ms_info.mi_total--; + mutex_unlock(&ms_info.mi_conflock); +#endif + + /* Wait for all applications to unmap remote memory mappings. 
*/ + wait_event(dev->sd_mmap_wq, + !micscif_rma_do_apps_have_mmaps(dev->sd_node)); + micscif_cleanup_rma_for_zombies(dev->sd_node); + micscif_node_remove_callback(dev->sd_node); + return; +unlock: + mutex_unlock(&dev->sd_lock); +} + +/* + * micscif_remove_node: + * + * @mask: bitmask of nodes in the deactivation set. + * @flags: Type of deactivation set i.e. Power Management, + * RAS, Maintenance Mode etc. + * @block: Can block. + * + * Attempt to deactivate a set of remote SCIF devices nodes passed in mask. + * If the SCIF activity ref count is positive for a remote node then + * the approporiate bit in the input bitmask is reset and the resultant + * bitmask is returned. + */ +uint64_t micscif_handle_remove_node(uint64_t mask, uint64_t payload) +{ + int64_t ret; + int err = 0; + uint32_t i; + struct micscif_dev *dev; + uint64_t flags = 0; + flags = payload & 0x00000000FFFFFFFF; + + switch(flags) { + case DISCONN_TYPE_POWER_MGMT: + { + uint8_t *nodemask_buf = NULL; + int size = payload >> 32; + +#ifndef _MIC_SCIF_ + nodemask_buf = mic_data.dd_pm.nodemask; +#else + nodemask_buf = scif_ioremap(mask, size, &scif_dev[SCIF_HOST_NODE]); +#endif + if (!nodemask_buf) { + err = EAGAIN; + break; + } + + for (i = 0; i <= ms_info.mi_maxid; i++) { + dev = &scif_dev[i]; + if (!get_nodemask_bit(nodemask_buf , i)) + continue; + /* + * Try for the SCIF device lock. Bail out if + * it is already grabbed since some other + * thread is already working on some other + * node state transition for this remote SCIF device. + */ + if (mutex_trylock(&dev->sd_lock)) { + + if (SCIFDEV_RUNNING != dev->sd_state) { + mutex_unlock(&dev->sd_lock); + continue; + } + /* + * Change the state of the remote SCIF device + * to idle only if the activity counter is + * already zero. The node state and ref count + * is maintained within a single atomic_long_t. + */ + ret = atomic_long_cmpxchg( + &dev->scif_ref_cnt, 0, SCIF_NODE_IDLE); + + if (!ret || ret == SCIF_NODE_IDLE) { + if (!ret) { +#ifdef _MIC_SCIF_ + drain_dma_global(mic_dma_handle); +#else + mic_ctx_t *mic_ctx = get_per_dev_ctx(dev->sd_node - 1); + drain_dma_global(mic_ctx->dma_handle); +#endif + } + /* + * Turn off the remote SCIF device. + * Any communication to this SCIF + * after this point will require a + * wake up message to the host. + */ + dev->sd_state = SCIFDEV_SLEEPING; + err = 0; + } + else { + /* + * Cannot put the remote SCIF device + * to sleep. + */ + err = EAGAIN; + mutex_unlock(&dev->sd_lock); + break; + } + mutex_unlock(&dev->sd_lock); + } else { + err = EAGAIN; + break; + } + } + +#ifndef _MIC_SCIF_ + scif_iounmap(nodemask_buf, size, &scif_dev[SCIF_HOST_NODE]); +#endif + + break; + } + case DISCONN_TYPE_LOST_NODE: + { + /* In the case of lost node, first paramater + * is the node id and not a mask. + */ + dev = &scif_dev[mask]; + micscif_cleanup_scifdev(dev, !DESTROY_WQ); + break; + } + default: + { + /* Unknown remove node flags */ + BUG_ON(1); + } + } + + return err; +} + +/** + * set_nodemask_bit: + * + * @node_id[in]: node id to be set in the mask + * + * Set bit in the nodemask. each bit represents node. 
Set the bit to add the node to the + * activation/de-activation set + */ +//void +//set_nodemask_bit(uint64_t *nodemask, uint32_t node_id) +void +set_nodemask_bit(uint8_t* nodemask, uint32_t node_id, int val) +{ + int index = 0; + uint8_t *temp_mask; + + index = (int) node_id / 8; + temp_mask = nodemask + index; + node_id = node_id - (index * 8); + if (val) + *temp_mask |= (1ULL << node_id); + else + *temp_mask &= ~(1ULL << node_id); +} + +/** + * get_nodemask_bit: + * + * @node_id[in]: node id to be checked in the mask + * + * Check if a bit in the nodemask corresponding to a + * node id is set. + * + * return 1 if the bit is set. 0 if the bit is cleared. + */ +int +get_nodemask_bit(uint8_t* nodemask, uint32_t node_id) { + int index = 0; + uint8_t *temp_mask; + + index = (int) node_id / 8; + temp_mask = nodemask + index; + node_id = node_id - (index * 8); + return *temp_mask & (1ULL << node_id); + +} +/** +* nodemask_isvalid - Check if a nodemask is valid after +* calculating the de-activation set. +* +* @nodemask[in]: The nodemask to be checked. +* +* Returns true if valid. +*/ +bool nodemask_isvalid(uint8_t* nodemask) { + uint32_t i; + for (i = 0; i <= ms_info.mi_maxid; i++) { + if (get_nodemask_bit(nodemask, i)) + return true; + } + + return false; +} + +#ifndef _MIC_SCIF_ +/* + * micscif_send_pm_rmnode_msg: + * + * @node: Destination node for the deactivation request. + * @nodemask_addr: Address of the nodemask describing the deactivation set. + * @nodemask_size: Size of the nodemask in bytes. + * @orig_node: The node which triggered this remove node message. + * + * Sends a deactivation request to the valid nodes not included in the + * deactivation set from the Host and waits for a response. + * Returns the response mask received from the node. + */ +uint64_t micscif_send_pm_rmnode_msg(int node, uint64_t nodemask_addr, + uint64_t nodemask_size, int orig_node) { + + uint64_t ret; + struct nodemsg notif_msg; + struct micscif_dev *dev = &scif_dev[node]; + + /* + * Send remove node msg only to running nodes. + * An idle node need not know about another _lost_ node + * until it wakes up. When it does, it will request the + * host to wake up the _lost_ node to which the host will + * respond with a NACK + */ + + if (SCIFDEV_RUNNING != dev->sd_state) + return -ENODEV; + + notif_msg.uop = SCIF_NODE_REMOVE; + notif_msg.src.node = ms_info.mi_nodeid; + notif_msg.dst.node = node; + notif_msg.payload[0] = nodemask_addr; + notif_msg.payload[1] = DISCONN_TYPE_POWER_MGMT; + notif_msg.payload[1] |= (nodemask_size << 32); + notif_msg.payload[2] = atomic_long_read(&ms_info.mi_unique_msgid); + notif_msg.payload[3] = orig_node; + /* Send the request to remove a set of nodes */ + pr_debug("Send PM rmnode msg for node %d to node %d\n", orig_node, node); + ret = micscif_nodeqp_send(dev, &notif_msg, NULL); + + return ret; +} + +uint64_t micscif_send_lost_node_rmnode_msg(int node, int orig_node) { + uint64_t ret; + struct nodemsg notif_msg; + struct micscif_dev *dev = &scif_dev[node]; + + /* + * Send remove node msg only to running nodes. + * An idle node need not know about another _lost_ node + * until it wakes up.
When it does, it will request the + * host to wake up the _lost_ node to which the host will + * respond with a NACK + */ + if (SCIFDEV_RUNNING != dev->sd_state) + return -ENODEV; + + micscif_inc_node_refcnt(dev, 1); + notif_msg.uop = SCIF_NODE_REMOVE; + notif_msg.src.node = ms_info.mi_nodeid; + notif_msg.dst.node = node; + notif_msg.payload[0] = orig_node; + notif_msg.payload[1] = DISCONN_TYPE_LOST_NODE; + notif_msg.payload[3] = orig_node; + /* Send the request to remove a set of nodes */ + ret = micscif_nodeqp_send(dev, &notif_msg, NULL); + micscif_dec_node_refcnt(dev, 1); + + return ret; +} + +/* + * micpm_nodemask_uninit: + * @mic_ctx - per-card context of the node to uninitialize + * + * Deallocate memory for per-card nodemask buffer +*/ +void +micpm_nodemask_uninit(mic_ctx_t* mic_ctx) +{ + if (mic_ctx && mic_ctx->micpm_ctx.nodemask.va) { + mic_ctx_unmap_single(mic_ctx, mic_ctx->micpm_ctx.nodemask.pa, + mic_ctx->micpm_ctx.nodemask.len); + kfree(mic_ctx->micpm_ctx.nodemask.va); + } +} + +/* + * micpm_nodemask_init: + * @num_devs - number of SCIF nodes including the host + * @mic_ctx - per-card context of the node to initialize + * + * Allocate memory for per-card nodemask buffer +*/ +int +micpm_nodemask_init(uint32_t num_devs, mic_ctx_t* mic_ctx) +{ + if (!mic_ctx) + return 0; + + mic_ctx->micpm_ctx.nodemask.len = ((int) (num_devs / 8) + + ((num_devs % 8) ? 1 : 0)); + mic_ctx->micpm_ctx.nodemask.va = (uint8_t *) + kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL); + + if (!mic_ctx->micpm_ctx.nodemask.va) { + PM_DEBUG("Error allocating nodemask buffer\n"); + return -ENOMEM; + } + + mic_ctx->micpm_ctx.nodemask.pa = mic_ctx_map_single(mic_ctx, + mic_ctx->micpm_ctx.nodemask.va, + mic_ctx->micpm_ctx.nodemask.len); + + if(mic_map_error(mic_ctx->micpm_ctx.nodemask.pa)) { + PM_PRINT("Error Mapping nodemask buffer\n"); + kfree(mic_ctx->micpm_ctx.nodemask.va); + } + return 0; +} + +/** + * micpm_disconn_uninit: + * @num_devs - number of SCIF nodes including the host + * Note - cannot use ms_info.mi_total (total number of SCIF nodes) as it is updated after the driver load is complete + * + * Reset/re-initialize data structures needed for PM disconnection. This is necessary every time the board is reset. + * Since the host (node 0) represents one of the nodes in the network, it is necessary to clear the host's dependency on the given node + */ +int +micpm_disconn_uninit(uint32_t num_devs) +{ + uint32_t i; + uint32_t status = 0; + + /* + * ms_info.mi_total is updated after the driver load is complete + * switching back to static allocation of max nodes + */ + + if (ms_info.mi_depmtrx) { + + for (i = 0; i < (int)num_devs; i++) { + if (ms_info.mi_depmtrx[i]) { + kfree(ms_info.mi_depmtrx[i]); + } + } + kfree(ms_info.mi_depmtrx); + } + + if (mic_data.dd_pm.nodemask) + kfree(mic_data.dd_pm.nodemask); + + return status; +} + +/** + * micpm_disconn_init: + * @num_devs - number of SCIF nodes including the host + * Note - cannot use ms_info.mi_total (total number of SCIF nodes) as it is updated after the driver load is complete + * + * Allocate memory for dependency graph. Initialize dependencies for the node. + * The memory allocated is based on the number of devices present during driver load.
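As a worked example of the nodemask sizing used by micpm_nodemask_init() above: with num_devs = 9 (the host plus eight cards) the buffer length works out to 9/8 + 1 = 2 bytes, and node 8 then occupies bit 0 of the second byte, which is exactly the index = node_id / 8, bit = node_id % 8 split that set_nodemask_bit() and get_nodemask_bit() earlier in this file apply.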
+ */ +int +micpm_disconn_init(uint32_t num_devs) +{ + uint32_t i; + uint32_t status = 0; + mic_ctx_t *mic_ctx; + + if (ms_info.mi_depmtrx) + return status; + + ms_info.mi_depmtrx = (uint32_t**)kzalloc(sizeof(uint32_t*) * num_devs, GFP_KERNEL); + if (!ms_info.mi_depmtrx) { + pr_debug("dependency graph initialization failed\n"); + status = -ENOMEM; + goto exit; + } + + for (i = 0; i < (int)num_devs; i++) { + ms_info.mi_depmtrx[i] = (uint32_t*)kzalloc(sizeof(uint32_t) * num_devs, GFP_KERNEL); + if (!ms_info.mi_depmtrx[i]) { + micpm_disconn_uninit(num_devs); + pr_debug("dependency graph initialization failed\n"); + status = -ENOMEM; + goto exit; + } + } + init_waitqueue_head(&ms_info.mi_disconn_wq); + atomic_long_set(&ms_info.mi_unique_msgid, 0); + + //In Windows, this code is executed during micpm_probe + for(i = 0; i < (num_devs - 1); i++) { + mic_ctx = get_per_dev_ctx(i); + status = micpm_nodemask_init(num_devs, mic_ctx); + if (status) + goto exit; + } + + /* Set up a nodemask buffer for Host scif node in a common pm_ctx */ + mic_data.dd_pm.nodemask_len = ((int) (num_devs / 8) + + ((num_devs % 8) ? 1 : 0)); + mic_data.dd_pm.nodemask = (uint8_t *) + kzalloc(mic_data.dd_pm.nodemask_len, GFP_KERNEL); + + if (!mic_data.dd_pm.nodemask) { + PM_DEBUG("Error allocating nodemask buffer\n"); + status = -ENOMEM; + goto exit; + } + +exit: + return status; +} + +/** + * micscif_set_nodedep: + * + * @src_node: node which is creating dependency. + * @dst_node: node on which dependency is being created + * + * sets the given value in dependency graph for src_node -> dst_node + */ +void +micscif_set_nodedep(uint32_t src_node, uint32_t dst_node, enum dependency_state state) +{ + /* We dont need to lock dependency graph while updating + * as every node will modify its own row + */ + if (ms_info.mi_depmtrx) + ms_info.mi_depmtrx[src_node][dst_node] = state; +} + +/** + * micscif_get_nodedep: + * + * @src_node: node which has/has not created dependency. + * @dst_node: node on which dependency was/was not created + * + * gets the current value in dependency graph for src_node -> dst_node + */ +enum dependency_state +micscif_get_nodedep(uint32_t src_node, uint32_t dst_node) +{ + enum dependency_state state = DEP_STATE_NOT_DEPENDENT; + if (ms_info.mi_depmtrx) + state = ms_info.mi_depmtrx[src_node][dst_node]; + return state; +} + +/** + * init_depgraph_stack: + * + * @stack_ptr: list head. + * + * Initialize linked list to be used as stack + */ +int +init_depgraph_stack(struct list_head *stack_ptr) +{ + int status = 0; + + if (!stack_ptr) { + pr_debug("%s argument stack_ptr is invalid\n", __func__); + status = -EINVAL; + goto exit; + } + /* Initialize stack */ + INIT_LIST_HEAD(stack_ptr); + +exit: + return status; +} + +/** + * uninit_depgraph_stack: + * + * @stack_ptr: list head for linked list(stack). + * + * Empty stack(linked list). Pop all the nodes left in the stack. + */ +int +uninit_depgraph_stack(struct list_head *stack_ptr) +{ + int status = 0; + uint32_t node_id; + if (!stack_ptr) { + pr_debug("%s argument stack_ptr is invalid\n", __func__); + status = -EINVAL; + goto exit; + } + + /* pop all the nodes left in the stack */ + while (!is_stack_empty(stack_ptr)) { + status = stack_pop_node(stack_ptr, &node_id); + if (status) { + pr_debug("%s error while cleaning up depgraph stack\n", __func__); + status = -EINVAL; + goto exit; + } + } + +exit: + return status; +} + +/** + * is_stack_empty: + * + * @stack_ptr: list head for linked list(stack). + * + * returns true if the stack is empty. 
+ */ +int +is_stack_empty(struct list_head *stack_ptr) +{ + if(list_empty(stack_ptr)) { + return 1; + } + return 0; +} + +/** + * stack_push_node: + * + * @stack_ptr[in]: list head for linked list(stack). + * @node_id[in]: node id to be pushed + * + * Push node in to the stack i.e. create node and add it at the start of linked list + */ +int +stack_push_node(struct list_head *stack_ptr, uint32_t node_id) +{ + int status = 0; + struct stack_node *datanode = NULL; + + datanode = kmalloc(sizeof(struct stack_node), GFP_KERNEL); + if (!datanode) { + pr_debug("%s error allocating memory to stack node.\n", __func__); + status = -ENOMEM; + goto exit; + } + + datanode->node_id = node_id; + list_add(&datanode->next, stack_ptr); +exit: + return status; +} + +/** + * stack_pop_node: + * + * @stack_ptr[in]: list head for linked list(stack). + * @node_id[out]: pointer to the node id to be popped + * + * Pop node from the stack i.e. delete first entry of linked list and return its data. + */ +int +stack_pop_node(struct list_head *stack_ptr, uint32_t *node_id) +{ + int status = 0; + struct stack_node *datanode = NULL; + + if(is_stack_empty(stack_ptr)) { + pr_debug("%s stack found empty when tried to pop\n", __func__); + status = -EFAULT; + goto exit; + } + + datanode = list_first_entry(stack_ptr, struct stack_node, next); + if (!datanode) { + pr_debug("%s Unable to pop from stack\n", __func__); + status = -EFAULT; + goto exit; + } + *node_id = datanode->node_id; + + list_del(&datanode->next); + if (datanode) { + kfree(datanode); + } + +exit: + return status; +} + +/** + * micscif_get_activeset: + * + * @node_id[in]: source node id. + * @nodemask[out]: bitmask of nodes present in activation set + * + * Algorithm to find out activation set for the given source node. Activation set is used to re-connect node into + * the scif network. + */ +int +micscif_get_activeset(uint32_t node_id, uint8_t *nodemask) +{ + int status = 0; + uint32_t i = 0; + struct list_head stack; + uint8_t visited[128] = {0}; // 128 is max number of nodes. 
+ uint32_t num_nodes = ms_info.mi_maxid + 1; + mic_ctx_t *mic_ctx; + + if (!ms_info.mi_depmtrx) { + status = -EINVAL; + goto exit; + } + + status = init_depgraph_stack(&stack); + if (status) { + pr_debug("%s failed to initilize depgraph stack\n", __func__); + goto exit; + } + + status = stack_push_node(&stack, node_id); + if (status) { + pr_debug("%s error while running activation set algorithm\n", __func__); + goto exit; + } + + /* mark node visited to avoid repetition of the algorithm for the same node */ + visited[node_id] = 1; + + while (!is_stack_empty(&stack)) { + status = stack_pop_node(&stack, &node_id); + if (status) { + pr_debug("%s error while running activation set algorithm\n", __func__); + goto exit; + } + + /* include node_id in the activation set*/ + set_nodemask_bit(nodemask, node_id, 1); + + for (i = 0; i < num_nodes; i++) { + /* check if node has dependency on any node 'i' which is also disconnected at this time*/ + if ((!visited[i]) && (ms_info.mi_depmtrx[node_id][i] == DEP_STATE_DISCONNECTED)) { + visited[i] = 1; + if (i == 0) + continue; + mic_ctx = get_per_dev_ctx(i - 1); + if ((mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC3) || + (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC6)) { + status = stack_push_node(&stack, i); + if (status) { + pr_debug("%s error while running activation set algorithm\n", __func__); + goto exit; + } + } + } + } + } /* end of while (!is_stack_empty(&stack)) */ +exit: + uninit_depgraph_stack(&stack); + return status; +} + +/** + * micscif_get_minimal_deactiveset: + * + * @node_id[in]: source node id. + * @nodemask[out]: bitmask of nodes present in de-activation set + * @visited[in/out]: information of which nodes are already visited in de-activation set algorithm + * + * Algorithm to find out minimum/must de-activation set for the given source node. This method is part of and used by + * micscif_get_deactiveset. 
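To make the micscif_get_activeset() traversal above concrete: suppose node 2 is waking up and mi_depmtrx[2][3] is DEP_STATE_DISCONNECTED because nodes 2 and 3 idled while a dependency existed between them. The loop marks node 2 visited, pops it, sets bit 2 in the nodemask, and, provided card 3 is sitting in PC3 or PC6, pushes node 3 so that bit 3 is set on the next iteration; both nodes are therefore reconnected together, while the host (i == 0) is marked visited but never pushed.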
+ */ +int micscif_get_minimal_deactiveset(uint32_t node_id, uint8_t *nodemask, uint8_t *visited) +{ + int status = 0; + uint32_t i = 0; + struct list_head stack; + uint32_t num_nodes = ms_info.mi_maxid + 1; + + if (!ms_info.mi_depmtrx) { + status = -EINVAL; + goto exit; + } + + status = init_depgraph_stack(&stack); + if (!visited) { + pr_debug("%s invalid parameter visited", __func__); + status = -EINVAL; + goto exit_pop; + } + + if (status) { + pr_debug("%s failed to initilize depgraph stack\n", __func__); + goto exit_pop; + } + + status = stack_push_node(&stack, node_id); + if (status) { + pr_debug("%s error while running de-activation set algorithm\n", __func__); + goto exit_pop; + } + + /* mark node visited to avoid repetition of the algorithm for the same node */ + visited[node_id] = 1; + + while (!is_stack_empty(&stack)) { + + status = stack_pop_node(&stack, &node_id); + if (status) { + pr_debug("%s error while running de-activation set algorithm\n", __func__); + goto exit_pop; + } + + /* include node_id in the activation set*/ + set_nodemask_bit(nodemask, node_id, 1); + + for (i = 0; i < num_nodes; i++) { + if (!visited[i]) { + if (ms_info.mi_depmtrx[i][node_id] == DEP_STATE_DEPENDENT) { + /* The algorithm terminates, if we find any dependent node active */ + status = -EOPNOTSUPP; + goto exit_pop; + } else if(ms_info.mi_depmtrx[i][node_id] == DEP_STATE_DISCONNECT_READY) { + /* node is dependent but ready to get disconnected */ + visited[i] = 1; + status = stack_push_node(&stack, i); + if (status) { + pr_debug("%s error while running de-activation set algorithm\n", __func__); + goto exit_pop; + } + } + } + } + }/*end of while(!is_stack_empty(&stack))*/ + +exit_pop: + while (!is_stack_empty(&stack)) { + status = stack_pop_node(&stack, &node_id); + if (status) { + pr_debug("%s error while running activation set algorithm\n", __func__); + break; + } + if (visited) + visited[node_id] = 0; + } +exit: + return status; +} + +/** + * micscif_get_deactiveset: + * + * @node_id[in]: source node id. + * @nodemask[out]: bitmask of nodes present in de-activation set + * @max_disconn: flag to restrict de-activation set algoritthm to minimum/must set. + * True value indicates maximum de-activation set + * + * Algorithm to find out de-activation set for the given source node. De-activation set is used to disconnect node into + * the scif network. The algorithm can find out maximum possible de-activation set(required in situations like + * power management)if the max_possible flag is set. 
+ */ +int +micscif_get_deactiveset(uint32_t node_id, uint8_t *nodemask, int max_disconn) +{ + int status = 0; + uint32_t i = 0; + struct list_head stack; + uint8_t *visited = NULL; + uint8_t cont_next_step = 0; + uint32_t num_nodes = ms_info.mi_maxid + 1; + mic_ctx_t *mic_ctx; + + if (!ms_info.mi_depmtrx) { + status = -EINVAL; + goto exit; + } + + status = init_depgraph_stack(&stack); + if (status) { + pr_debug("%s failed to initilize depgraph stack\n", __func__); + goto exit; + } + + visited = kzalloc(sizeof(uint8_t) * num_nodes, GFP_KERNEL); + if (!visited) { + pr_debug("%s failed to allocated memory for visited array", __func__); + status = -ENOMEM; + goto exit; + } + + status = stack_push_node(&stack, node_id); + if (status) { + pr_debug("%s error while running de-activation set algorithm\n", __func__); + goto exit; + } + + while (!is_stack_empty(&stack)) { + + status = stack_pop_node(&stack, &node_id); + if (status) { + pr_debug("%s error while running de-activation set algorithm\n", __func__); + goto exit; + } + + /* check if we want to find out maximum possible de-activation set */ + if (max_disconn) { + cont_next_step = 1; + } + + if (!visited[node_id]) { + status = micscif_get_minimal_deactiveset(node_id, nodemask, visited); + if (status) { + if (status == -EOPNOTSUPP) { + pr_debug("%s No deactivation set found for node %d", __func__, node_id); + cont_next_step = 0; + } + else { + pr_debug("%s Failed to calculate deactivation set", __func__); + goto exit; + } + } + + } /* end for if (!visited[node_id]) */ + + if (cont_next_step) { + for (i = 0; i < num_nodes; i++) { + /* check if we can put more nodes 'i' in de-activation set if this node(dependent node) + * is de-activating + */ + if ((!visited[i]) && + (ms_info.mi_depmtrx[node_id][i] == DEP_STATE_DISCONNECT_READY)) { + if (i == 0) + continue; + mic_ctx = get_per_dev_ctx(i - 1); + if (mic_ctx->micpm_ctx.idle_state == + PM_IDLE_STATE_PC3_READY) { + /* This node might be able to get into deactivation set */ + status = stack_push_node(&stack, i); + if (status) { + pr_debug("%s error while running de-activation set algorithm\n", __func__); + goto exit; + } + } + } + } + } + } /* end for while (!is_stack_empty(&stack)) */ + + if (!nodemask_isvalid(nodemask)) { + pr_debug("%s No deactivation set found for node %d", + __func__, node_id); + status = -EOPNOTSUPP; + } +exit: + if (visited) { + kfree(visited); + } + uninit_depgraph_stack(&stack); + return status; +} + +/* micscif_update_p2p_state: + * + * Update the p2p_disc_state of peer node peer_id in the p2p list of node node_id. + * + * @node_id: The node id whose p2p list needs to be updated. + * @peer_id: The node id in the p2p list of the node_id that will get updated. + * @scif_state: The state to be updated to. + * + */ +void micscif_update_p2p_state(uint32_t node_id, uint32_t peer_id, enum scif_state state) { + + struct micscif_dev *dev; + struct list_head *pos, *tmp; + struct scif_p2p_info *p2p; + + dev = &scif_dev[node_id]; + if (!list_empty(&dev->sd_p2p)) { + list_for_each_safe(pos, tmp, &dev->sd_p2p) { + p2p = list_entry(pos, struct scif_p2p_info, + ppi_list); + if(p2p->ppi_peer_id == peer_id) { + p2p->ppi_disc_state = state; + break; + } + } + } +} + +/* micscif_p2p_node_exists: Check if a node exists in the + * list of nodes that have been sent an rmnode message. + * + * node_list: The list that contains the nodes that has been + * sent the rmnode message for this transaction. + * node_id: the node to be searched for. 
+ * + * returns: true of the node exists.False otherwise + */ +bool micscif_rmnode_msg_sent(struct list_head *node_list, uint32_t node_id) { + + struct list_head *pos1, *tmp1; + struct stack_node *added_node; + + if (!list_empty(node_list)) { + list_for_each_safe(pos1, tmp1, node_list) { + added_node = list_entry(pos1, struct stack_node, next); + if(added_node->node_id == node_id) + return true; + } + } + return false; +} + +/** + * micscif_execute_disconnecte: Perform PM disconnection of a node + * with its neighboring nodes. + * + * node_id: The node to be disconnected. + * nodemask: Mask containing the list of nodes (including node_id) + * to be disconnected. + * node_list: List of nodes that received the disconnection message. + */ +int micscif_execute_disconnect(uint32_t node_id, + uint8_t *nodemask, + struct list_head *node_list) +{ + uint32_t status = 0; + int ret; + uint64_t msg_cnt = 0; + uint32_t i = 0; + int pending_wakeups = 0; + mic_ctx_t *send_rmnode_ctx; + uint32_t node; + mic_ctx_t *mic_ctx = get_per_dev_ctx(node_id - 1); + struct scif_p2p_info *p2p; + struct list_head *pos, *tmp; + struct micscif_dev *dev; + + + /* Always send rmnode msg to SCIF_HOST_NODE */ + memcpy(mic_data.dd_pm.nodemask, nodemask, + mic_data.dd_pm.nodemask_len); + ret = (int) micscif_send_pm_rmnode_msg(SCIF_HOST_NODE, 0, mic_data.dd_pm.nodemask_len, + node_id); + /* Add this node to msg list. */ + if(!ret) { + msg_cnt++; + stack_push_node(node_list, SCIF_HOST_NODE); + } + + if((ret == 0)||(ret == -ENODEV)) { + status = 0; + } + + /* For each node in the nodemask, traverse its p2p list + * and send rmnode_msg to those nodes 1) That are not also + * in the node mask and 2) That have not been already sent + * rmnode messages in this transaction and 3) That have + * their disconnection state as RUNNING. + */ + for (i = 0; i <= ms_info.mi_maxid; i++) { + /* verify if the node is present in deactivation set */ + if (!get_nodemask_bit(nodemask, i)) + continue; + + /* Get to the p2p list of this node */ + dev = &scif_dev[i]; + list_for_each_safe(pos, tmp, &dev->sd_p2p) { + p2p = list_entry(pos, struct scif_p2p_info, + ppi_list); + + if (get_nodemask_bit(nodemask, p2p->ppi_peer_id)) + continue; + if (p2p->ppi_disc_state == SCIFDEV_SLEEPING) + continue; + + if(micscif_rmnode_msg_sent(node_list, p2p->ppi_peer_id)) + continue; + send_rmnode_ctx = get_per_dev_ctx(p2p->ppi_peer_id - 1); + if (!send_rmnode_ctx->micpm_ctx.nodemask.va) { + status = -EINVAL; + goto list_cleanup; + } + + memcpy(send_rmnode_ctx->micpm_ctx.nodemask.va, nodemask, + send_rmnode_ctx->micpm_ctx.nodemask.len); + ret = (int) micscif_send_pm_rmnode_msg(p2p->ppi_peer_id, + send_rmnode_ctx->micpm_ctx.nodemask.pa, + send_rmnode_ctx->micpm_ctx.nodemask.len,node_id); + + /* Add this node to msg list. */ + if(!ret) { + msg_cnt++; + stack_push_node(node_list, p2p->ppi_peer_id); + } + + if((ret == 0)||(ret == -ENODEV)) { + status = 0; + } + } + } + + ret = wait_event_timeout(ms_info.mi_disconn_wq, + (atomic_read(&mic_ctx->disconn_rescnt) == msg_cnt) || + (pending_wakeups = atomic_read(&mic_data.dd_pm.wakeup_in_progress)), + NODE_ALIVE_TIMEOUT); + if ((!ret) || (atomic_read(&mic_ctx->disconn_rescnt) != msg_cnt) + || (ms_info.mi_disconnect_status == OP_FAILED)) { + pr_debug("SCIF disconnect failed. 
" + "remove_node messages sent: = %llu " + "remove_node acks received: %d " + "Pending wakeups: %d ret = %d\n", msg_cnt, + atomic_read(&mic_ctx->disconn_rescnt), + pending_wakeups, ret); + + status = -EAGAIN; + goto list_cleanup; + } + return status; + +list_cleanup: + while (!is_stack_empty(node_list)) + stack_pop_node(node_list, &node); + return status; +} + +/** + * micscif_node_disconnect: + * + * @node_id[in]: source node id. + * @nodemask[out]: bitmask of nodes that have to be disconnected together. + * it represents node_id + * @disconn_type[in]: flag to identify disconnection type. (for example - power mgmt, lost node, maintenance mode etc) + * + * Method responsible for disconnecting node from the scif network. considers dependencies with other node. + * finds out deactivation set. Sends node queue pair messages to all the scif nodes outside deactivation set + * returns error if node can not be disconnected from the network. + */ +int micscif_disconnect_node(uint32_t node_id, uint8_t *nodemask, enum disconn_type type) +{ + uint32_t status = 0; + int ret; + uint64_t msg_cnt = 0; + uint32_t i = 0; + mic_ctx_t *mic_ctx = 0; + struct list_head node_list; + uint32_t node; + + if (!node_id) + return -EINVAL; + + mic_ctx = get_per_dev_ctx(node_id - 1); + + if (!mic_ctx) + return -EINVAL; + + switch(type) { + case DISCONN_TYPE_POWER_MGMT: + { + if (!nodemask) + return -EINVAL; + + atomic_long_add(1, &ms_info.mi_unique_msgid); + atomic_set(&mic_ctx->disconn_rescnt, 0); + ms_info.mi_disconnect_status = OP_IN_PROGRESS; + INIT_LIST_HEAD(&node_list); + + status = micscif_execute_disconnect(node_id, + nodemask, &node_list); + if (status) + return status; + + /* Reset unique msg_id */ + atomic_long_set(&ms_info.mi_unique_msgid, 0); + + while (!is_stack_empty(&node_list)) { + status = stack_pop_node(&node_list, &node); + if (status) + break; + + for (i = 0; i <= ms_info.mi_maxid; i++) { + if (!get_nodemask_bit(nodemask, i)) + continue; + micscif_update_p2p_state(i, node, SCIFDEV_SLEEPING); + } + } + break; + } + case DISCONN_TYPE_LOST_NODE: + { + atomic_long_add(1, &ms_info.mi_unique_msgid); + atomic_set(&mic_ctx->disconn_rescnt, 0); + + for (i = 0; ((i <= ms_info.mi_maxid) && (i != node_id)); i++) { + ret = (int)micscif_send_lost_node_rmnode_msg(i, node_id); + if(!ret) + msg_cnt++; + if((ret == 0)||(ret == -ENODEV)) { + status = 0; + } + } + + ret = wait_event_timeout(ms_info.mi_disconn_wq, + (atomic_read(&mic_ctx->disconn_rescnt) == msg_cnt), + NODE_ALIVE_TIMEOUT); + break; + } + default: + status = -EINVAL; + } + + return status; +} + +/** + * micscif_node_connect: + * + * @node_id[in]: node to wakeup. + * @bool get_ref[in]: Also get node reference after wakeup by incrementing the PM reference count + * + * Method responsible for connecting node into the scif network. considers dependencies with other node. + * finds out activation set. connects all the depenendent nodes in the activation set + * returns error if node can not be connected from the network. 
+ */ +int +micscif_connect_node(uint32_t node_id, bool get_ref) +{ + return do_idlestate_exit(get_per_dev_ctx(node_id - 1), get_ref); +} + +uint64_t micscif_send_node_alive(int node) +{ + struct nodemsg alive_msg; + struct micscif_dev *dev = &scif_dev[node]; + int err; + + alive_msg.uop = SCIF_NODE_ALIVE; + alive_msg.src.node = ms_info.mi_nodeid; + alive_msg.dst.node = node; + pr_debug("node alive msg sent to node %d\n", node); + micscif_inc_node_refcnt(dev, 1); + err = micscif_nodeqp_send(dev, &alive_msg, NULL); + micscif_dec_node_refcnt(dev, 1); + return err; +} + +int micscif_handle_lostnode(uint32_t node_id) +{ + mic_ctx_t *mic_ctx; + uint32_t status = -EOPNOTSUPP; +#ifdef MM_HANDLER_ENABLE + uint8_t *mmio_va; + sbox_scratch1_reg_t scratch1reg = {0}; +#endif + + printk("%s %d node %d\n", __func__, __LINE__, node_id); + mic_ctx = get_per_dev_ctx(node_id - 1); + + if (mic_ctx->state != MIC_ONLINE && mic_ctx->state != MIC_SHUTDOWN) + return 0; + + if (mic_crash_dump_enabled) { + if (!(status = vmcore_create(mic_ctx))) + printk("%s %d node %d ready for crash dump!\n", + __func__, __LINE__, node_id); + else + printk(KERN_ERR "%s %d node %d crash dump failed status %d\n", + __func__, __LINE__, node_id, status); + } + + mic_ctx->crash_count++; + mutex_lock(&mic_ctx->state_lock); + if (mic_ctx->state == MIC_ONLINE || + mic_ctx->state == MIC_SHUTDOWN) + mic_setstate(mic_ctx, MIC_LOST); + mutex_unlock(&mic_ctx->state_lock); + + /* mpssd will handle core dump and reset/auto reboot */ + if (mic_crash_dump_enabled && !status) + return status; + + printk("%s %d stopping node %d to recover lost node!\n", + __func__, __LINE__, node_id); + status = adapter_stop_device(mic_ctx, 1, !RESET_REATTEMPT); + wait_for_reset(mic_ctx); + + if (!ms_info.mi_watchdog_auto_reboot) { + printk("%s %d cannot boot node %d to recover lost node since auto_reboot is off\n", + __func__, __LINE__, node_id); + return status; + } + +/* Disabling MM handler invocation till it is ready to handle errors + * till then we just reboot the card + */ +#ifdef MM_HANDLER_ENABLE + mmio_va = mic_ctx->mmio.va; + scratch1reg.bits.status = FLASH_CMD_INVALID; + + if(mic_ctx->bi_family == FAMILY_ABR) { + printk("Node %d lost. Cannot recover in KNF\n", node_id); + status = adapter_start_device(mic_ctx); + return status; + } + + printk("Booting maintenance mode handler\n"); + status = set_card_usage_mode(mic_ctx, USAGE_MODE_MAINTENANCE, NULL, 0); + if(status) { + printk("Unable to boot maintenance mode\n"); + return status; + } + + status = send_flash_cmd(mic_ctx, RAS_CMD, NULL, 0); + if(status) { + printk("Unable to recover node\n"); + return status; + } + while(scratch1reg.bits.status != FLASH_CMD_COMPLETED) { + ret = SBOX_READ(mmio_va, SBOX_SCRATCH1); + scratch1reg.value = ret; + msleep(1); + i++; + printk("Looping for status (time = %d ms)\n", i); + if(i > NODE_ALIVE_TIMEOUT) { + status = -ETIME; + printk("Unable to recover node. 
Status bit is : %d\n", + scratch1reg.bits.status); + return status; + } + + } +#endif + printk("%s %d booting node %d to recover lost node!\n", + __func__, __LINE__, node_id); + status = adapter_start_device(mic_ctx); + return status; +} + +void micscif_watchdog_handler(struct work_struct *work) +{ + struct micscif_dev *dev = + container_of(to_delayed_work(work), + struct micscif_dev, sd_watchdog_work); + struct _mic_ctx_t *mic_ctx; + int i = dev->sd_node, err, ret; + + mic_ctx = get_per_dev_ctx(i - 1); + + switch (mic_ctx->sdbic1) { + case SYSTEM_HALT: + case SYSTEM_POWER_OFF: + { + adapter_stop_device(mic_ctx, 1, !RESET_REATTEMPT); + wait_for_reset(mic_ctx); + mic_ctx->sdbic1 = 0; + break; + } + case SYSTEM_RESTART: + { + mic_setstate(mic_ctx, MIC_LOST); + mic_ctx->sdbic1 = 0; + break; + } + case SYSTEM_BOOTING: + case SYSTEM_RUNNING: +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)) + case SYSTEM_SUSPEND_DISK: +#endif + break; + case 0xdead: + if (mic_crash_dump_enabled) + micscif_handle_lostnode(i); + mic_ctx->sdbic1 = 0; + break; + default: + break; + } + + switch (mic_ctx->state) { + case MIC_ONLINE: + break; + case MIC_BOOT: + goto restart_timer; + case MIC_SHUTDOWN: + case MIC_LOST: + case MIC_READY: + case MIC_NORESPONSE: + case MIC_BOOTFAIL: + case MIC_RESET: + case MIC_RESETFAIL: + case MIC_INVALID: + return; + } + + if (!ms_info.mi_watchdog_enabled) + return; + + err = micpm_get_reference(mic_ctx, false); + if (err == -EAGAIN) { + goto restart_timer; + } else if (err == -ENODEV) { + micscif_handle_lostnode(i); + goto restart_timer; + } + + if (1 != atomic_cmpxchg(&dev->sd_node_alive, 1, 0)) { + + err = (int)(micscif_send_node_alive(i)); + + if (err) { + micpm_put_reference(mic_ctx); + goto restart_timer; + } + + ret = wait_event_timeout(dev->sd_watchdog_wq, + (atomic_cmpxchg(&dev->sd_node_alive, 1, 0) == 1), + NODE_ALIVE_TIMEOUT); + if (!ret || err) + micscif_handle_lostnode(i); + } + micpm_put_reference(mic_ctx); + +restart_timer: + if (dev->sd_ln_wq) + queue_delayed_work(dev->sd_ln_wq, + &dev->sd_watchdog_work, NODE_ALIVE_TIMEOUT); +} +#else + +long micscif_suspend(uint8_t* nodemask) { + long ret = 0; + int i; + struct micscif_dev *dev; + + for (i = 0; i <= ms_info.mi_maxid; i++) { + if (get_nodemask_bit(nodemask , i)) { + dev = &scif_dev[i]; + if (SCIFDEV_RUNNING != dev->sd_state) + continue; + + ret = atomic_long_cmpxchg( + &dev->scif_ref_cnt, 0, SCIF_NODE_IDLE); + if (!ret || ret == SCIF_NODE_IDLE) { + dev->sd_state = SCIFDEV_SLEEPING; + ret = 0; + } + else { + set_nodemask_bit(nodemask, i, 0); + ret = EAGAIN; + } + } + } + return ret; +} +/* + * scif_suspend_handler - SCIF tasks before transition to low power state. + */ +int micscif_suspend_handler(struct notifier_block *this, + unsigned long event, void *ptr) +{ + int ret = 0; +#ifdef SCIF_ENABLE_PM + int node = 0; + int size; + uint8_t *nodemask_buf; + + size = ((int) ((ms_info.mi_maxid + 1) / 8) + + (((ms_info.mi_maxid + 1) % 8) ? 1 : 0)); + nodemask_buf = (uint8_t*)kzalloc(size, GFP_ATOMIC); + if(!nodemask_buf) + return -ENOMEM; + + for (node = 0; node <= ms_info.mi_maxid; node++) { + if ((node != SCIF_HOST_NODE) && (node != ms_info.mi_nodeid)) + set_nodemask_bit(nodemask_buf, node, 1); + } + + if (micscif_suspend(nodemask_buf)){ + ret = -EBUSY; + goto clean_up; + } + + dma_suspend(mic_dma_handle); +clean_up: + kfree(nodemask_buf); +#endif + return ret; +} + +/* + * micscif_resume_handler - SCIF tasks after wake up from low power state. 
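+ *
+ * The (notifier_block, event, ptr) signature is that of a standard kernel
+ * notifier callback; how the callback is registered is not shown in this
+ * file, so the following is only an assumed sketch of the wiring:
+ *
+ *	static struct notifier_block micscif_pm_notifier = {
+ *		.notifier_call = micscif_resume_handler,
+ *	};
+ *	register_pm_notifier(&micscif_pm_notifier);
+ *
+ * On resume the card side kicks its misc work queue and DMA is resumed via
+ * dma_resume(mic_dma_handle).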
+ */ +int micscif_resume_handler(struct notifier_block *this, + unsigned long event, void *ptr) +{ +#ifdef SCIF_ENABLE_PM +#ifdef _MIC_SCIF_ + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); +#endif + dma_resume(mic_dma_handle); +#endif + return 0; +} + +/* + * scif_fail_suspend_handler - SCIF tasks if a previous scif_suspend call has + * failed since a low power state transition could not be completed. + */ +int micscif_fail_suspend_handler(struct notifier_block *this, + unsigned long event, void *ptr) +{ +/* Stub out function since it is an optimization that isn't working properly */ +#if 0 +#ifdef SCIF_ENABLE_PM + int node = 0; + long ret; + struct micscif_dev *dev; + + for (node = 0; node <= ms_info.mi_maxid; node++) { + dev = &scif_dev[node]; + ret = atomic_long_cmpxchg(&dev->scif_ref_cnt, SCIF_NODE_IDLE, 0); + if (ret != SCIF_NODE_IDLE) + continue; + if (SCIFDEV_SLEEPING == dev->sd_state) + dev->sd_state = SCIFDEV_RUNNING; + } +#endif +#endif + return 0; +} + +void micscif_get_node_info(void) +{ + struct nodemsg msg; + struct get_node_info node_info; + + init_waitqueue_head(&node_info.wq); + node_info.state = OP_IN_PROGRESS; + micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1); + msg.uop = SCIF_GET_NODE_INFO; + msg.src.node = ms_info.mi_nodeid; + msg.dst.node = SCIF_HOST_NODE; + msg.payload[3] = (uint64_t)&node_info; + + if ((micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], &msg, NULL))) + goto done; + + wait_event(node_info.wq, node_info.state != OP_IN_PROGRESS); +done: + micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1); + /* Synchronize with the thread waking us up */ + mutex_lock(&ms_info.mi_conflock); + mutex_unlock(&ms_info.mi_conflock); + ; +} +#endif /* _MIC_SCIF_ */ diff --git a/micscif/micscif_nodeqp.c b/micscif/micscif_nodeqp.c new file mode 100644 index 0000000..7dc5350 --- /dev/null +++ b/micscif/micscif_nodeqp.c @@ -0,0 +1,2902 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic/micscif.h" +#include "mic/micscif_smpt.h" +#include "mic/micscif_nodeqp.h" +#include "mic/micscif_intr.h" +#include "mic/micscif_nm.h" +#include "mic_common.h" +#include "mic/micscif_map.h" + +#define SBOX_MMIO_LENGTH 0x10000 +/* FIXME: HW spefic, define someplace else */ +/* SBOX Offset in MMIO space */ +#define SBOX_OFFSET 0x10000 + +#ifdef ENABLE_TEST +static void micscif_qp_testboth(struct micscif_dev *scifdev); +#endif + +bool mic_p2p_enable = 1; +bool mic_p2p_proxy_enable = 1; + +void micscif_teardown_ep(void *endpt) +{ + struct endpt *ep = (struct endpt *)endpt; + struct micscif_qp *qp = ep->qp_info.qp; + if (qp) { + if (qp->outbound_q.rb_base) + scif_iounmap((void *)qp->outbound_q.rb_base, + qp->outbound_q.size, ep->remote_dev); + if (qp->remote_qp) + scif_iounmap((void *)qp->remote_qp, + sizeof(struct micscif_qp), ep->remote_dev); + if (qp->local_buf) { + unmap_from_aperture( + qp->local_buf, + ep->remote_dev, ENDPT_QP_SIZE); + } + if (qp->local_qp) { + unmap_from_aperture(qp->local_qp, ep->remote_dev, + sizeof(struct micscif_qp)); + } + if (qp->inbound_q.rb_base) + kfree((void *)qp->inbound_q.rb_base); + kfree(qp); +#ifdef _MIC_SCIF_ + micscif_teardown_proxy_dma(endpt); +#endif + WARN_ON(!list_empty(&ep->rma_info.task_list)); + } +} + +/* + * Enqueue the endpoint to the zombie list for cleanup. + * The endpoint should not be accessed once this API returns. + */ +void micscif_add_epd_to_zombie_list(struct endpt *ep, bool mi_eplock_held) +{ + unsigned long sflags = 0; + + /* + * It is an error to call scif_close() on an endpoint on which a + * scif_range structure of that endpoint has not been returned + * after a call to scif_get_pages() via scif_put_pages(). 
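+ * In other words, every successful scif_get_pages() on the endpoint must have
+ * been balanced by scif_put_pages() before close; the BUG_ON() below enforces
+ * this for endpoints that are already closing, closed or disconnected.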
+ */ + if (SCIFEP_CLOSING == ep->state || + SCIFEP_CLOSED == ep->state || + SCIFEP_DISCONNECTED == ep->state) + BUG_ON(micscif_rma_list_get_pages_check(ep)); + + if (list_empty(&ep->rma_info.task_list) && ep->remote_dev) + wake_up(&ep->remote_dev->sd_mmap_wq); + if (!mi_eplock_held) + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + spin_lock(&ep->lock); + ep->state = SCIFEP_ZOMBIE; + spin_unlock(&ep->lock); + list_add_tail(&ep->list, &ms_info.mi_zombie); + ms_info.mi_nr_zombies++; + if (!mi_eplock_held) + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); +} + +/* Initializes "local" data structures for the QP + * + * Allocates the QP ring buffer (rb), initializes the "in bound" queue + * For the host generate bus addresses for QP rb & qp, in the card's case + * map these into the pci aperture + */ +int micscif_setup_qp_connect(struct micscif_qp *qp, dma_addr_t *qp_offset, + int local_size, struct micscif_dev *scifdev) +{ + void *local_q = NULL; + int err = 0; + volatile uint32_t tmp_rd; + + spin_lock_init(&qp->qp_send_lock); + spin_lock_init(&qp->qp_recv_lock); + + if (!qp->inbound_q.rb_base) { + /* we need to allocate the local buffer for the incoming queue */ + local_q = kzalloc(local_size, GFP_ATOMIC); + if (!local_q) { + printk(KERN_ERR "Ring Buffer Allocation Failed\n"); + err = -ENOMEM; + return err; + } + /* to setup the inbound_q, the buffer lives locally (local_q), + * the read pointer is remote (in remote_qp's local_read) + * the write pointer is local (in local_write) + */ + tmp_rd = 0; + micscif_rb_init(&qp->inbound_q, + &tmp_rd, /* No read ptr right now ... */ + &(scifdev->qpairs[0].local_write), + (volatile void *) local_q, + local_size); + qp->inbound_q.read_ptr = NULL; /* it is unsafe to use the ring buffer until this changes! */ + } + + if (!qp->local_buf) { + err = map_virt_into_aperture(&qp->local_buf, local_q, scifdev, local_size); + if (err) { + printk(KERN_ERR "%s %d error %d\n", + __func__, __LINE__, err); + return err; + } + } + + if (!qp->local_qp) { + err = map_virt_into_aperture(qp_offset, qp, scifdev, sizeof(struct micscif_qp)); + if (err) { + printk(KERN_ERR "%s %d error %d\n", + __func__, __LINE__, err); + return err; + } + qp->local_qp = *qp_offset; + } else { + *qp_offset = qp->local_qp; + } + return err; +} + +/* When the other side has already done it's allocation, this is called */ +/* TODO: Replace reads that go across the bus somehow ... 
*/ +int micscif_setup_qp_accept(struct micscif_qp *qp, dma_addr_t *qp_offset, dma_addr_t phys, int local_size, struct micscif_dev *scifdev) +{ + void *local_q; + volatile void *remote_q; + struct micscif_qp *remote_qp; + int remote_size; + int err = 0; + + spin_lock_init(&qp->qp_send_lock); + spin_lock_init(&qp->qp_recv_lock); + /* Start by figuring out where we need to point */ + remote_qp = scif_ioremap(phys, sizeof(struct micscif_qp), scifdev); + qp->remote_qp = remote_qp; + qp->remote_buf = remote_qp->local_buf; + /* To setup the outbound_q, the buffer lives in remote memory (at scifdev->bs->buf phys), + * the read pointer is local (in local's local_read) + * the write pointer is remote (In remote_qp's local_write) + */ + remote_size = qp->remote_qp->inbound_q.size; /* TODO: Remove this read for p2p */ + remote_q = scif_ioremap(qp->remote_buf, remote_size, scifdev); + + BUG_ON(qp->remote_qp->magic != SCIFEP_MAGIC); + + qp->remote_qp->local_write = 0; + micscif_rb_init(&(qp->outbound_q), + &(qp->local_read), /*read ptr*/ + &(qp->remote_qp->local_write), /*write ptr*/ + remote_q, /*rb_base*/ + remote_size); + /* to setup the inbound_q, the buffer lives locally (local_q), + * the read pointer is remote (in remote_qp's local_read) + * the write pointer is local (in local_write) + */ + local_q = kzalloc(local_size, GFP_KERNEL); + if (!local_q) { + printk(KERN_ERR "Ring Buffer Allocation Failed\n"); + err = -ENOMEM; + return err; + } + + qp->remote_qp->local_read = 0; + micscif_rb_init(&(qp->inbound_q), + &(qp->remote_qp->local_read), + &(qp->local_write), + local_q, + local_size); + err = map_virt_into_aperture(&qp->local_buf, local_q, scifdev, local_size); + if (err) { + printk(KERN_ERR "%s %d error %d\n", + __func__, __LINE__, err); + return err; + } + err = map_virt_into_aperture(qp_offset, qp, scifdev, sizeof(struct micscif_qp)); + if (err) { + printk(KERN_ERR "%s %d error %d\n", + __func__, __LINE__, err); + return err; + } + qp->local_qp = *qp_offset; + return err; +} + +int micscif_setup_qp_connect_response(struct micscif_dev *scifdev, struct micscif_qp *qp, uint64_t payload) +{ + int err = 0; + void *r_buf; + int remote_size; + phys_addr_t tmp_phys; + + qp->remote_qp = scif_ioremap(payload, sizeof(struct micscif_qp), scifdev); + + if (!qp->remote_qp) { + err = -ENOMEM; + goto error; + } + + if (qp->remote_qp->magic != SCIFEP_MAGIC) { + printk(KERN_ERR "SCIFEP_MAGIC doesnot match between node %d " + "(self) and %d (remote)\n", scif_dev[ms_info.mi_nodeid].sd_node, + scifdev->sd_node); + WARN_ON(1); + err = -ENODEV; + goto error; + } + + tmp_phys = readq(&(qp->remote_qp->local_buf)); + remote_size = readl(&qp->remote_qp->inbound_q.size); + r_buf = scif_ioremap(tmp_phys, remote_size, scifdev); + +#if 0 + pr_debug("payload = 0x%llx remote_qp = 0x%p tmp_phys=0x%llx \ + remote_size=%d r_buf=%p\n", payload, qp->remote_qp, + tmp_phys, remote_size, r_buf); +#endif + + micscif_rb_init(&(qp->outbound_q), + &(qp->local_read), + &(qp->remote_qp->local_write), + r_buf, + remote_size); + /* resetup the inbound_q now that we know where the inbound_read really is */ + micscif_rb_init(&(qp->inbound_q), + &(qp->remote_qp->local_read), + &(qp->local_write), + qp->inbound_q.rb_base, + qp->inbound_q.size); +error: + return err; +} + +#ifdef _MIC_SCIF_ +extern int micscif_send_host_intr(struct micscif_dev *, uint32_t); + +int micscif_send_host_intr(struct micscif_dev *dev, uint32_t doorbell) +{ + uint32_t db_reg; + + if (doorbell > 3) + return -EINVAL; + + db_reg = readl(dev->mm_sbox + + (SBOX_SDBIC0 + (4 
* doorbell))) | SBOX_SDBIC0_DBREQ_BIT; + writel(db_reg, dev->mm_sbox + (SBOX_SDBIC0 + (4 * doorbell))); + return 0; +} +#endif + +/* + * Interrupts remote mic + */ +static void +micscif_send_mic_intr(struct micscif_dev *dev) +{ + /* Writes to RDMASR triggers the interrupt */ + writel(0, (uint8_t *)dev->mm_sbox + dev->sd_rdmasr); +} + +/* scifdev - remote scif device + * also needs the local scif device so that we can decide which RMASR + * to target on the remote mic + */ +static __always_inline void +scif_send_msg_intr(struct micscif_dev *scifdev) +{ +#ifdef _MIC_SCIF_ + if (scifdev == &scif_dev[0]) + micscif_send_host_intr(scifdev, 0); + else +#endif + micscif_send_mic_intr(scifdev); +} + +#ifdef _MIC_SCIF_ +int micscif_setup_card_qp(phys_addr_t host_phys, struct micscif_dev *scifdev) +{ + int local_size; + dma_addr_t qp_offset; + int err = 0; + struct nodemsg tmp_msg; + uint16_t host_scif_ver; + + pr_debug("Got 0x%llx from the host\n", host_phys); + + local_size = NODE_QP_SIZE; + + /* FIXME: n_qpairs is always 1 OK to get rid of it ? */ + scifdev->n_qpairs = 1; + scifdev->qpairs = kzalloc(sizeof(struct micscif_qp), GFP_KERNEL); + if (!scifdev->qpairs) { + printk(KERN_ERR "Node QP Allocation failed\n"); + err = -ENOMEM; + return err; + } + + scifdev->qpairs->magic = SCIFEP_MAGIC; + pr_debug("micscif_card(): called qp_accept\n"); + err = micscif_setup_qp_accept(&scifdev->qpairs[0], &qp_offset, host_phys, local_size, scifdev); + + if (!err) { + host_scif_ver = readw(&(&scifdev->qpairs[0])->remote_qp->scif_version); + if (host_scif_ver != SCIF_VERSION) { + printk(KERN_ERR "Card and host SCIF versions do not match. \n"); + printk(KERN_ERR "Card version: %u, Host version: %u \n", + SCIF_VERSION, host_scif_ver); + err = -ENXIO; + goto error_exit; + } + /* now that everything is setup and mapped, we're ready to tell the + * host where our queue's location + */ + tmp_msg.uop = SCIF_INIT; + tmp_msg.payload[0] = qp_offset; + tmp_msg.payload[1] = get_rdmasr_offset(scifdev->sd_intr_handle); + tmp_msg.dst.node = 0; /* host */ + + pr_debug("micscif_setup_card_qp: micscif_setup_qp_accept, INIT message\n"); + err = micscif_nodeqp_send(scifdev, &tmp_msg, NULL); + } +error_exit: + if (err) + printk(KERN_ERR "%s %d error %d\n", + __func__, __LINE__, err); + return err; +} + + +void micscif_send_exit(void) +{ + struct nodemsg msg; + struct micscif_dev *scifdev = &scif_dev[SCIF_HOST_NODE]; + + init_waitqueue_head(&ms_info.mi_exitwq); + + msg.uop = SCIF_EXIT; + msg.src.node = ms_info.mi_nodeid; + msg.dst.node = scifdev->sd_node; + /* No error handling for Host SCIF device */ + micscif_nodeqp_send(scifdev, &msg, NULL); +} + +#else /* !_MIC_SCIF_ */ +static uint32_t tmp_r_ptr; +int micscif_setup_host_qp(mic_ctx_t *mic_ctx, struct micscif_dev *scifdev) +{ + int err = 0; + int local_size; + + /* Bail out if the node QP is already setup */ + if (scifdev->qpairs) + return err; + + local_size = NODE_QP_SIZE; + + /* for now, assume that we only have one queue-pair -- with the host */ + scifdev->n_qpairs = 1; + scifdev->qpairs = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_ATOMIC); + if (!scifdev->qpairs) { + printk(KERN_ERR "Node QP Allocation failed\n"); + err = -ENOMEM; + return err; + } + + scifdev->qpairs->magic = SCIFEP_MAGIC; + scifdev->qpairs->scif_version = SCIF_VERSION; + err = micscif_setup_qp_connect(&scifdev->qpairs[0], &(mic_ctx->bi_scif.si_pa), local_size, scifdev); + /* fake the read pointer setup so we can use the inbound q */ + scifdev->qpairs[0].inbound_q.read_ptr = &tmp_r_ptr; + + /* 
We're as setup as we can be ... the inbound_q is setup, w/o + * a usable outbound q. When we get a message, the read_ptr will + * be updated, so we know there's something here. When that happens, + * we finish the setup (just point the write pointer to the real + * write pointer that lives on the card), and pull the message off + * the card. + * Tell the card where we are. + */ + printk("My Phys addrs: 0x%llx and scif_addr 0x%llx\n", scifdev->qpairs[0].local_buf, + mic_ctx->bi_scif.si_pa); + + if (err) printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); + return err; +} + + +/* FIXME: add to header */ +struct scatterlist * micscif_p2p_mapsg(void *va, int page_size, int page_cnt); +void micscif_p2p_freesg(struct scatterlist *); +mic_ctx_t* get_per_dev_ctx(uint16_t node); + +/* Init p2p mappings required to access peerdev from scifdev */ +static struct scif_p2p_info * +init_p2p_info(struct micscif_dev *scifdev, struct micscif_dev *peerdev) +{ + struct _mic_ctx_t *mic_ctx_peer; + struct _mic_ctx_t *mic_ctx; + struct scif_p2p_info *p2p; + int num_mmio_pages; + int num_aper_pages; + + mic_ctx = get_per_dev_ctx(scifdev->sd_node - 1); + mic_ctx_peer = get_per_dev_ctx(peerdev->sd_node - 1); + + num_mmio_pages = (int) (mic_ctx_peer->mmio.len >> PAGE_SHIFT); + num_aper_pages = (int) (mic_ctx_peer->aper.len >> PAGE_SHIFT); + + // First map the peer board addresses into the new board + p2p = kzalloc(sizeof(struct scif_p2p_info), GFP_KERNEL); + + if (p2p){ + int sg_page_shift = get_order(min(mic_ctx_peer->aper.len,(uint64_t)(1 << 30))); + /* FIXME: check return codes below */ + p2p->ppi_sg[PPI_MMIO] = micscif_p2p_mapsg(mic_ctx_peer->mmio.va, PAGE_SIZE, + num_mmio_pages); + p2p->sg_nentries[PPI_MMIO] = num_mmio_pages; + p2p->ppi_sg[PPI_APER] = micscif_p2p_mapsg(mic_ctx_peer->aper.va, 1 << sg_page_shift, + num_aper_pages >> (sg_page_shift - PAGE_SHIFT)); + p2p->sg_nentries[PPI_APER] = num_aper_pages >> (sg_page_shift - PAGE_SHIFT); + + pci_map_sg(mic_ctx->bi_pdev, p2p->ppi_sg[PPI_MMIO], num_mmio_pages, PCI_DMA_BIDIRECTIONAL); + pci_map_sg(mic_ctx->bi_pdev, p2p->ppi_sg[PPI_APER], + num_aper_pages >> (sg_page_shift - PAGE_SHIFT), PCI_DMA_BIDIRECTIONAL); + + p2p->ppi_pa[PPI_MMIO] = sg_dma_address(p2p->ppi_sg[PPI_MMIO]); + p2p->ppi_pa[PPI_APER] = sg_dma_address(p2p->ppi_sg[PPI_APER]); + p2p->ppi_len[PPI_MMIO] = num_mmio_pages; + p2p->ppi_len[PPI_APER] = num_aper_pages; + p2p->ppi_disc_state = SCIFDEV_RUNNING; + p2p->ppi_peer_id = peerdev->sd_node; + + } + return (p2p); +} + + +int micscif_setuphost_response(struct micscif_dev *scifdev, uint64_t payload) +{ + int read_size; + struct nodemsg msg; + int err = 0; + + pr_debug("micscif_setuphost_response: scif node %d\n", scifdev->sd_node); + err = micscif_setup_qp_connect_response(scifdev, &scifdev->qpairs[0], payload); + if (err) { + printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); + return err; + } + /* re-recieve the bootstrap message after re-init call */ + pr_debug("micscif_host(): reading INIT message after re-init call\n"); + read_size = micscif_rb_get_next(&(scifdev->qpairs[0].inbound_q), &msg, + sizeof(struct nodemsg)); + micscif_rb_update_read_ptr(&(scifdev->qpairs[0].inbound_q)); + + scifdev->sd_rdmasr = (uint32_t)msg.payload[1]; + + /* for testing, send a message back to the card */ + msg.uop = SCIF_INIT; + msg.payload[0] = 0xdeadbeef; + msg.dst.node = scifdev->sd_node; /* card */ + if ((err = micscif_nodeqp_send(scifdev, &msg, NULL))) { + printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); + return err; + } + +#ifdef 
ENABLE_TEST + /* Launch the micscif_rb test */ + pr_debug("micscif_host(): starting TEST\n"); + micscif_qp_testboth(scifdev); +#endif + + /* + * micscif_nodeqp_intrhandler(..) increments the ref_count before calling + * this API hence clamp the scif_ref_cnt to 1. This is required to + * handle the SCIF module load/unload case on MIC. The SCIF_EXIT message + * keeps the ref_cnt clamped to SCIF_NODE_IDLE during module unload. + * Setting the ref_cnt to 1 during SCIF_INIT ensures that the ref_cnt + * returns back to 0 once SCIF module load completes. + */ +#ifdef SCIF_ENABLE_PM + scifdev->scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(1); +#endif + mutex_lock(&ms_info.mi_conflock); + ms_info.mi_mask |= 0x1 << scifdev->sd_node; + ms_info.mi_maxid = SCIF_MAX(scifdev->sd_node, ms_info.mi_maxid); + ms_info.mi_total++; + scifdev->sd_state = SCIFDEV_RUNNING; + mutex_unlock(&ms_info.mi_conflock); + + micscif_node_add_callback(scifdev->sd_node); + return err; +} + +void +micscif_removehost_respose(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + mic_ctx_t *mic_ctx = get_per_dev_ctx(scifdev->sd_node -1); + int err; + + if (scifdev->sd_state != SCIFDEV_RUNNING) + return; + + micscif_stop(mic_ctx); + + if ((err = micscif_nodeqp_send(scifdev, msg, NULL))) + printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); + + scifdev->sd_state = SCIFDEV_INIT; +} +#endif + +/* TODO: Fix the non-symmetric use of micscif_dev on the host and the card. Right + * now, the card's data structures are shaping up such that there is a single + * micscif_dev structure with multiple qp's. The host ends up with multiple + * micscif_devs (one per card). We should unify the way this will work. + */ +static struct micscif_qp *micscif_nodeqp_find(struct micscif_dev *scifdev, uint8_t node) +{ + struct micscif_qp *qp = NULL; +#ifdef _MIC_SCIF_ + /* This is also a HACK. Even though the code is identical with the host right + * now, I broke it into two parts because they will likely not be identical + * moving forward + */ + qp = scifdev->qpairs; +#else + /* HORRIBLE HACK! Since we only have one card, and one scifdev, we + * can just grab the scifdev->qp to find the qp. 
We don't actually have to + * do any kind of looking for it + */ + qp = scifdev->qpairs; +#endif /* !_MIC_SCIF_ */ + return qp; +} + +static char *scifdev_state[] = {"SCIFDEV_NOTPRESENT", + "SCIFDEV_INIT", + "SCIFDEV_RUNNING", + "SCIFDEV_SLEEPING", + "SCIFDEV_STOPPING", + "SCIFDEV_STOPPED"}; + +static char *message_types[] = {"BAD", + "INIT", + "EXIT", + "SCIF_NODE_ADD", + "SCIF_NODE_ADD_ACK", + "CNCT_REQ", + "CNCT_GNT", + "CNCT_GNTACK", + "CNCT_GNTNACK", + "CNCT_REJ", + "CNCT_TERM", + "TERM_ACK", + "DISCNCT", + "DISCNT_ACK", + "REGISTER", + "REGISTER_ACK", + "REGISTER_NACK", + "UNREGISTER", + "UNREGISTER_ACK", + "UNREGISTER_NACK", + "ALLOC_REQ", + "ALLOC_GNT", + "ALLOC_REJ", + "FREE_PHYS", + "FREE_VIRT", + "CLIENT_SENT", + "CLIENT_RCVD", + "MUNMAP", + "MARK", + "MARK_ACK", + "MARK_NACK", + "WAIT", + "WAIT_ACK", + "WAIT_NACK", + "SIGNAL_LOCAL", + "SIGNAL_REMOTE", + "SIG_ACK", + "SIG_NACK", + "MAP_GTT", + "MAP_GTT_ACK", + "MAP_GTT_NACK", + "UNMAP_GTT", + "CREATE_NODE_DEP", + "DESTROY_NODE_DEP", + "REMOVE_NODE", + "REMOVE_NODE_ACK", + "WAKE_UP_NODE", + "WAKE_UP_NODE_ACK", + "WAKE_UP_NODE_NACK", + "SCIF_NODE_ALIVE", + "SCIF_NODE_ALIVE_ACK", + "SCIF_SMPT", + "SCIF_GTT_DMA_MAP", + "SCIF_GTT_DMA_ACK", + "SCIF_GTT_DMA_NACK", + "SCIF_GTT_DMA_UNMAP", + "SCIF_PROXY_DMA", + "SCIF_PROXY_ORDERED_DMA", + "SCIF_NODE_CONNECT", + "SCIF_NODE_CONNECT_NACK", + "SCIF_NODE_ADD_NACK", + "SCIF_GET_NODE_INFO", + "TEST"}; + +static void +micscif_display_message(struct micscif_dev *scifdev, struct nodemsg *msg, + const char *label) +{ + if (!ms_info.en_msg_log) + return; + if (msg->uop > SCIF_MAX_MSG) { + pr_debug("%s: unknown msg type %d\n", label, msg->uop); + return; + } + if (msg->uop == SCIF_TEST) + return; + + printk("%s: %s msg type %s, src %d:%d, dest %d:%d " + "payload 0x%llx:0x%llx:0x%llx:0x%llx\n", + label, scifdev_state[scifdev->sd_state], + message_types[msg->uop], msg->src.node, msg->src.port, + msg->dst.node, msg->dst.port, msg->payload[0], msg->payload[1], + msg->payload[2], msg->payload[3]); +} + +/** + * micscif_nodeqp_send - Send a message on the Node Qp. + * @scifdev: Scif Device. + * @msg: The message to be sent. + * + * This function will block till a message is not sent to the destination + * scif device. + */ +int micscif_nodeqp_send(struct micscif_dev *scifdev, + struct nodemsg *msg, struct endpt *ep) +{ + struct micscif_qp *qp; + int err = -ENOMEM, loop_cnt = 0; + + if (oops_in_progress || + (SCIF_INIT != msg->uop && + SCIF_EXIT != msg->uop && + SCIFDEV_RUNNING != scifdev->sd_state && + SCIFDEV_SLEEPING != scifdev->sd_state) || + (ep && SCIFDEV_STOPPED == ep->sd_state)) { + err = -ENODEV; + goto error; + } + + micscif_display_message(scifdev, msg, "Sent"); + + qp = micscif_nodeqp_find(scifdev, (uint8_t)msg->dst.node); + if (!qp) { + err = -EINVAL; + goto error; + } + spin_lock(&qp->qp_send_lock); + + while ((err = micscif_rb_write(&qp->outbound_q, + msg, sizeof(struct nodemsg)))) { + cpu_relax(); + mdelay(1); + if (loop_cnt++ > (NODEQP_SEND_TO_MSEC)) { + err = -ENODEV; + break; + } + } + if (!err) + micscif_rb_commit(&qp->outbound_q); + spin_unlock(&qp->qp_send_lock); + if (!err) { + if (is_self_scifdev(scifdev)) + /* + * For loopback we need to emulate an interrupt by queueing + * work for the queue handling real Node Qp interrupts. 
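+ * A message destined for a real remote node is signalled through
+ * scif_send_msg_intr() instead, which rings a host doorbell (SDBIC0) or
+ * writes the peer's RDMASR register, as set up earlier in this file.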
+ */ + + queue_work(scifdev->sd_intr_wq, &scifdev->sd_intr_bh); + else + scif_send_msg_intr(scifdev); + } +error: + if (err) + pr_debug("%s %d error %d uop %d\n", + __func__, __LINE__, err, msg->uop); + return err; +} + +/* TODO: Make this actually figure out where the interrupt came from. For host, it can + * be a little easier (one "vector" per board). For the cards, we'll have to do some + * scanning, methinks + */ +struct micscif_qp *micscif_nodeqp_nextmsg(struct micscif_dev *scifdev) +{ + return &scifdev->qpairs[0]; +} + +/* + * micscif_misc_handler: + * + * Work queue handler for servicing miscellaneous SCIF tasks. + * Examples include: + * 1) Remote fence requests. + * 2) Destruction of temporary registered windows + * created during scif_vreadfrom()/scif_vwriteto(). + * 3) Cleanup of zombie endpoints. + */ +void micscif_misc_handler(struct work_struct *work) +{ + micscif_rma_handle_remote_fences(); + micscif_rma_destroy_temp_windows(); +#ifdef _MIC_SCIF_ + vm_unmap_aliases(); +#endif + micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc); + micscif_cleanup_zombie_epd(); +} + +/** + * scif_init_resp() - Respond to SCIF_INIT interrupt message + * @scifdev: Other node device to respond to + * @msg: Interrupt message + * + * Loading the driver on the MIC card sends an INIT message to the host + * with the PCI bus memory information it needs. This function receives + * that message, finishes its intialization and echoes it back to the card. + * + * When the card receives the message this function starts a connection test. + */ +static __always_inline void +scif_init_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ +#ifdef _MIC_SCIF_ + if (msg->payload[0] != 0xdeadbeef) + printk(KERN_ERR "Bad payload 0x%llx\n", msg->payload[0]); +#ifdef ENABLE_TEST + else + micscif_qp_testboth(scifdev); +#endif +#else + pr_debug("scifhost(): sending response to INIT\n"); + micscif_setuphost_response(scifdev, msg->payload[0]); + atomic_set(&scifdev->sd_node_alive, 0); + if (scifdev->sd_ln_wq) + queue_delayed_work(scifdev->sd_ln_wq, + &scifdev->sd_watchdog_work, NODE_ALIVE_TIMEOUT); +#endif +} + +/** + * scif_exit_resp() - Respond to SCIF_EXIT interrupt message + * @scifdev: Other node device to respond to + * @msg: Interrupt message + * + * Loading the driver on the MIC card sends an INIT message to the host + * with the PCI bus memory information it needs. This function receives + * that message, finishes its intialization and echoes it back to the card. + * + * When the card receives the message this function starts a connection test. + */ +static __always_inline void +scif_exit_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ +#ifdef _MIC_SCIF_ + printk("card: scif node %d exiting\n", ms_info.mi_nodeid); + scif_dev[ms_info.mi_nodeid].sd_state = SCIFDEV_STOPPED; + wake_up(&ms_info.mi_exitwq); +#else + printk("host: scif node %d exiting\n", msg->src.node); + /* The interrupt handler that received the message would have + * bumped up the ref_cnt by 1. micscif_removehost_response + * calls micscif_cleanup_scifdev which loops forever for the ref_cnt + * to drop to 0 thereby leading to a soft lockup. To prevent + * that, decrement the ref_cnt here. + */ + micscif_dec_node_refcnt(scifdev, 1); + micscif_removehost_respose(scifdev, msg); + /* increment the ref_cnt here. The interrupt handler will now + * decrement it, leaving the ref_cnt to 0 if everything + * works as expected. 
Note that its not absolutely necessary + * to do this execpt to make sure ref_cnt is 0 and to catch + * errors that may happen if ref_cnt drops to a negative value. + */ + micscif_inc_node_refcnt(scifdev, 1); + +#endif +} + +/** + * scif_nodeadd_resp() - Respond to SCIF_NODE_ADD interrupt message + * @scifdev: Other node device to respond to + * @msg: Interrupt message + * + * When the host driver has finished initializing a MIC node queue pair it + * marks the board as online. It then looks for all currently online MIC + * cards and send a SCIF_NODE_ADD message to identify the ID of the new card for + * peer to peer initialization + * + * The local node allocates its incoming queue and sends its address in the + * SCIF_NODE_ADD_ACK message back to the host, the host "reflects" this message + * to the new node + */ +static __always_inline void +scif_nodeadd_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ +#ifdef _MIC_SCIF_ + struct micscif_dev *newdev; + dma_addr_t qp_offset; + int qp_connect; + + pr_debug("Scifdev %d:%d received NODE_ADD msg for node %d\n", + scifdev->sd_node, msg->dst.node, msg->src.node); + pr_debug("Remote address for this node's aperture %llx\n", + msg->payload[0]); + printk("Remote node's sbox %llx\n", msg->payload[1]); + + newdev = &scif_dev[msg->src.node]; + newdev->sd_node = msg->src.node; + + if (micscif_setup_interrupts(newdev)) { + printk(KERN_ERR "failed to setup interrupts for %d\n", msg->src.node); + goto interrupt_setup_error; + } + + newdev->mm_sbox = ioremap_nocache(msg->payload[1] + SBOX_OFFSET, SBOX_MMIO_LENGTH); + + if (!newdev->mm_sbox) { + printk(KERN_ERR "failed to map mmio for %d\n", msg->src.node); + goto mmio_map_error; + } + + if (!(newdev->qpairs = kzalloc(sizeof(struct micscif_qp), GFP_KERNEL))) { + printk(KERN_ERR "failed to allocate qpair for %d\n", msg->src.node); + goto qp_alloc_error; + } + + /* Set the base address of the remote node's memory since it gets + * added to qp_offset + */ + newdev->sd_base_addr = msg->payload[0]; + + if ((qp_connect = micscif_setup_qp_connect(newdev->qpairs, &qp_offset, + NODE_QP_SIZE, newdev))) { + printk(KERN_ERR "failed to setup qp_connect %d\n", qp_connect); + goto qp_connect_error; + } + + if (register_scif_intr_handler(newdev)) + goto qp_connect_error; + + newdev->scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0); + micscif_node_add_callback(msg->src.node); + newdev->qpairs->magic = SCIFEP_MAGIC; + newdev->qpairs->qp_state = QP_OFFLINE; + wmb(); + + msg->uop = SCIF_NODE_ADD_ACK; + msg->dst.node = msg->src.node; + msg->src.node = ms_info.mi_nodeid; + msg->payload[0] = qp_offset; + msg->payload[2] = get_rdmasr_offset(newdev->sd_intr_handle); + msg->payload[3] = scif_dev[ms_info.mi_nodeid].sd_numa_node; + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL); + return; + +qp_connect_error: + kfree(newdev->qpairs); + newdev->qpairs = NULL; +qp_alloc_error: + iounmap(newdev->mm_sbox); + newdev->mm_sbox = NULL; +mmio_map_error: +interrupt_setup_error: + printk(KERN_ERR "node add failed for node %d\n", msg->src.node); + /* + * Update self with NODE ADD failure and send + * nack to update the peer. 
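+ * The SCIF_NODE_ADD_NACK is routed through the host (SCIF_HOST_NODE), just
+ * like the SCIF_NODE_ADD_ACK on the success path above, presumably so the
+ * host can forward it and the peer can abandon its half of the P2P queue
+ * pair setup.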
+ */ + mutex_lock(&newdev->sd_lock); + newdev->sd_state = SCIFDEV_NOTPRESENT; + mutex_unlock(&newdev->sd_lock); + wake_up_interruptible(&newdev->sd_p2p_wq); + msg->uop = SCIF_NODE_ADD_NACK; + msg->dst.node = msg->src.node; + msg->src.node = ms_info.mi_nodeid; + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL); +#endif +} + +#ifdef _MIC_SCIF_ +static inline void scif_p2pdev_uninit(struct micscif_dev *peerdev) +{ + deregister_scif_intr_handler(peerdev); + iounmap(peerdev->mm_sbox); + mutex_lock(&peerdev->sd_lock); + peerdev->sd_state = SCIFDEV_NOTPRESENT; + mutex_unlock(&peerdev->sd_lock); +} + +void scif_poll_qp_state(struct work_struct *work) +{ +#define NODE_QP_RETRY 100 + struct micscif_dev *peerdev = container_of(work, struct micscif_dev, + sd_p2p_dwork.work); + struct micscif_qp *qp = &peerdev->qpairs[0]; + + if (SCIFDEV_RUNNING != peerdev->sd_state) + return; + if (qp->qp_state == QP_OFFLINE) { + if (peerdev->sd_p2p_retry++ == NODE_QP_RETRY) { + printk(KERN_ERR "Warning: QP check timeout with " + "state %d\n", qp->qp_state); + goto timeout; + } + schedule_delayed_work(&peerdev->sd_p2p_dwork, + msecs_to_jiffies(NODE_QP_TIMEOUT)); + return; + } + wake_up(&peerdev->sd_p2p_wq); + return; +timeout: + printk(KERN_ERR "%s %d remote node %d offline, state = 0x%x\n", + __func__, __LINE__, peerdev->sd_node, qp->qp_state); + micscif_inc_node_refcnt(peerdev, 1); + qp->remote_qp->qp_state = QP_OFFLINE; + micscif_dec_node_refcnt(peerdev, 1); + scif_p2pdev_uninit(peerdev); + wake_up(&peerdev->sd_p2p_wq); +} +#endif + +/** + * scif_nodeaddack_resp() - Respond to SCIF_NODE_ADD_ACK interrupt message + * @scifdev: Other node device to respond to + * @msg: Interrupt message + * + * After a MIC node receives the SCIF_LINK_ADD_ACK message it send this + * message to the host to confirm the sequeuce is finished. + * + */ +static __always_inline void +scif_nodeaddack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ +#ifdef _MIC_SCIF_ + struct micscif_dev *peerdev; + struct micscif_qp *qp; +#else + struct micscif_dev *dst_dev = &scif_dev[msg->dst.node]; +#endif + pr_debug("Scifdev %d received SCIF_NODE_ADD_ACK msg for src %d dst %d\n", + scifdev->sd_node, msg->src.node, msg->dst.node); + pr_debug("payload %llx %llx %llx %llx\n", msg->payload[0], msg->payload[1], + msg->payload[2], msg->payload[3]); +#ifndef _MIC_SCIF_ + + /* the lock serializes with micscif_setuphost_response + * The host is forwarding the NODE_ADD_ACK message from src to dst + * we need to make sure that the dst has already received a NODE_ADD + * for src and setup its end of the qp to dst + */ + mutex_lock(&ms_info.mi_conflock); + msg->payload[1] = ms_info.mi_maxid; + mutex_unlock(&ms_info.mi_conflock); + micscif_inc_node_refcnt(dst_dev, 1); + micscif_nodeqp_send(dst_dev, msg, NULL); + micscif_dec_node_refcnt(dst_dev, 1); +#else + peerdev = &scif_dev[msg->src.node]; + peerdev->sd_node = msg->src.node; + + if (peerdev->sd_state == SCIFDEV_NOTPRESENT) + return; + + qp = &peerdev->qpairs[0]; + + if ((micscif_setup_qp_connect_response(peerdev, &peerdev->qpairs[0], + msg->payload[0]))) + goto local_error; + + mutex_lock(&peerdev->sd_lock); + peerdev->sd_numa_node = msg->payload[3]; + /* + * Proxy the DMA only for P2P reads with transfer size + * greater than proxy DMA threshold. Proxying reads to convert + * them into writes is only required for host jaketown platforms + * when the two MIC devices are connected to the same + * QPI/IOH/numa node. 
The host will not pass the numa node + * information for non Intel Jaketown platforms and it will + * be -1 in that case. + */ + peerdev->sd_proxy_dma_reads = + mic_p2p_proxy_enable && + scif_dev[ms_info.mi_nodeid].sd_numa_node != -1 && + (peerdev->sd_numa_node == + scif_dev[ms_info.mi_nodeid].sd_numa_node); + peerdev->sd_state = SCIFDEV_RUNNING; + mutex_unlock(&peerdev->sd_lock); + + mutex_lock(&ms_info.mi_conflock); + ms_info.mi_maxid = msg->payload[1]; + peerdev->sd_rdmasr = msg->payload[2]; + mutex_unlock(&ms_info.mi_conflock); + + /* accessing the peer qp. Make sure the peer is awake*/ + micscif_inc_node_refcnt(peerdev, 1); + qp->remote_qp->qp_state = QP_ONLINE; + micscif_dec_node_refcnt(peerdev, 1); + schedule_delayed_work(&peerdev->sd_p2p_dwork, + msecs_to_jiffies(NODE_QP_TIMEOUT)); + return; +local_error: + scif_p2pdev_uninit(peerdev); + wake_up(&peerdev->sd_p2p_wq); +#endif +} + +/** + * scif_cnctreq_resp() - Respond to SCIF_CNCT_REQ interrupt message + * @msg: Interrupt message + * + * This message is initiated by the remote node to request a connection + * to the local node. This function looks for an end point in the + * listen state on the requested port id. + * + * If it finds a listening port it places the connect request on the + * listening end points queue and wakes up any pending accept calls. + * + * If it does not find a listening end point it sends a connection + * reject message to the remote node. + */ +static __always_inline void +scif_cnctreq_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = NULL; + struct conreq *conreq; + unsigned long sflags; + + if ((conreq = (struct conreq *)kmalloc(sizeof(struct conreq), GFP_KERNEL)) == NULL) { + // Lack of resources so reject the request. + goto conreq_sendrej; + } + + if ((ep = micscif_find_listen_ep(msg->dst.port, &sflags)) == NULL) { + // Send reject due to no listening ports + goto conreq_sendrej_free; + } + + if (ep->backlog <= ep->conreqcnt) { + // Send reject due to too many pending requests + spin_unlock_irqrestore(&ep->lock, sflags); + goto conreq_sendrej_free; + } + + conreq->msg = *msg; + list_add_tail(&conreq->list, &ep->conlist); + ep->conreqcnt++; + spin_unlock_irqrestore(&ep->lock, sflags); + + wake_up_interruptible(&ep->conwq); + return; + +conreq_sendrej_free: + kfree(conreq); +conreq_sendrej: + msg->uop = SCIF_CNCT_REJ; + micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL); +} + +/** + * scif_cnctgnt_resp() - Respond to SCIF_CNCT_GNT interrupt message + * @msg: Interrupt message + * + * An accept() on the remote node has occured and sent this message + * to indicate success. Place the end point in the MAPPING state and + * save the remote nodes memory information. Then wake up the connect + * request so it can finish. + */ +static __always_inline void +scif_cnctgnt_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + unsigned long sflags; + struct endpt *ep = (struct endpt *)msg->payload[0]; + + spin_lock_irqsave(&ep->lock, sflags); + if (SCIFEP_CONNECTING == ep->state) { + ep->peer.node = msg->src.node; + ep->peer.port = msg->src.port; + ep->qp_info.cnct_gnt_payload = msg->payload[1]; + ep->remote_ep = msg->payload[2]; + ep->state = SCIFEP_MAPPING; + + wake_up_interruptible(&ep->conwq); + wake_up(&ep->diswq); + } + spin_unlock_irqrestore(&ep->lock, sflags); +} + +/** + * scif_cnctgntack_resp() - Respond to SCIF_CNCT_GNTACK interrupt message + * @msg: Interrupt message + * + * The remote connection request has finished mapping the local memmory. 
+ * Place the connection in the connected state and wake up the pending + * accept() call. + */ +static __always_inline void +scif_cnctgntack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + unsigned long sflags; + struct endpt *ep = (struct endpt *)msg->payload[0]; + + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + spin_lock(&ep->lock); + // New ep is now connected with all resouces set. + ep->state = SCIFEP_CONNECTED; + list_add_tail(&ep->list, &ms_info.mi_connected); + get_conn_count(scifdev); + wake_up(&ep->conwq); + spin_unlock(&ep->lock); + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); +} + +/** + * scif_cnctgntnack_resp() - Respond to SCIF_CNCT_GNTNACK interrupt message + * @msg: Interrupt message + * + * The remote connection request failed to map the local memory it was sent. + * Place the end point in the CLOSING state to indicate it and wake up + * the pending accept(); + */ +static __always_inline void +scif_cnctgntnack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + unsigned long sflags; + + spin_lock_irqsave(&ep->lock, sflags); + ep->state = SCIFEP_CLOSING; + wake_up(&ep->conwq); + spin_unlock_irqrestore(&ep->lock, sflags); +} + +/** + * scif_cnctrej_resp() - Respond to SCIF_CNCT_REJ interrupt message + * @msg: Interrupt message + * + * The remote end has rejected the connection request. Set the end + * point back to the bound state and wake up the pending connect(). + */ +static __always_inline void +scif_cnctrej_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + unsigned long sflags; + + spin_lock_irqsave(&ep->lock, sflags); + if (SCIFEP_CONNECTING == ep->state) { + ep->state = SCIFEP_BOUND; + wake_up_interruptible(&ep->conwq); + } + spin_unlock_irqrestore(&ep->lock, sflags); +} + +/** + * scif_cnctterm_resp() - Respond to SCIF_CNCT_TERM interrupt message + * @msg: Interrupt message + * + * The remote connect() has waited to long for an accept() to occur and + * is removing the connection request. + * + * If the connection request is not found then it is currently being + * processed and a NACK is sent to indicate to the remote connect() to + * wait for connection to complete. + * + * Otherwise the request is removed and an ACK is returned to indicate + * success. + */ +static __always_inline void +scif_cnctterm_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + unsigned long sflags; + struct endpt *ep = NULL; + struct conreq *conreq = NULL; + + ep = micscif_find_listen_ep(msg->dst.port, &sflags); + + if (ep != NULL) { + conreq = miscscif_get_connection_request(ep, msg->payload[0]); + spin_unlock_irqrestore(&ep->lock, sflags); + } + + if (conreq != NULL) { + kfree(conreq); + msg->uop = SCIF_TERM_ACK; + micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL); + } +} + +/** + * scif_termack_resp() - Respond to SCIF_TERM_ACK interrupt message + * @msg: Interrupt message + * + * Connection termination has been confirmed so set the end point + * to bound and allow the connection request to error out. 
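+ * (This is the tail of the sequence described for scif_cnctterm_resp() above:
+ * connect() gave up waiting for an accept(), sent SCIF_CNCT_TERM, and the
+ * peer has now acknowledged the termination with SCIF_TERM_ACK.)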
+ */ +static __always_inline void +scif_termack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + unsigned long sflags; + + spin_lock_irqsave(&ep->lock, sflags); + if (ep->state != SCIFEP_BOUND) { + ep->state = SCIFEP_BOUND; + wake_up(&ep->diswq); + } + spin_unlock_irqrestore(&ep->lock, sflags); +} + +/** + * scif_discnct_resp() - Respond to SCIF_DISCNCT interrupt message + * @msg: Interrupt message + * + * The remote node has indicated close() has been called on its end + * point. Remove the local end point from the connected list, set its + * state to disconnected and ensure accesses to the remote node are + * shutdown. + * + * When all accesses to the remote end have completed then send a + * DISCNT_ACK to indicate it can remove its resources and complete + * the close routine. + */ +static __always_inline void +scif_discnct_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + unsigned long sflags; + struct endpt *ep = NULL; + struct endpt *tmpep; + struct list_head *pos, *tmpq; + + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each_safe(pos, tmpq, &ms_info.mi_connected) { + tmpep = list_entry(pos, struct endpt, list); + if (((uint64_t)tmpep == msg->payload[1]) && ((uint64_t)tmpep->remote_ep == msg->payload[0])) { + list_del(pos); + put_conn_count(scifdev); + ep = tmpep; + spin_lock(&ep->lock); + break; + } + } + + // If the terminated end is not found then this side started closing + // before the other side sent the disconnect. If so the ep will no + // longer be on the connected list. Reguardless the other side + // needs to be acked to let it know close is complete. + if (ep == NULL) { + // Need to unlock conn lock and restore irq state + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + goto discnct_resp_ack; + } + + ep->state = SCIFEP_DISCONNECTED; + list_add_tail(&ep->list, &ms_info.mi_disconnected); + + // TODO Cause associated resources to be freed. + // First step: wake up threads blocked in send and recv + wake_up_interruptible(&ep->sendwq); + wake_up_interruptible(&ep->recvwq); + wake_up_interruptible(&ep->conwq); + spin_unlock(&ep->lock); + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + +discnct_resp_ack: + msg->uop = SCIF_DISCNT_ACK; + micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL); +} + +/** + * scif_discnctack_resp() - Respond to SCIF_DISCNT_ACK interrupt message + * @msg: Interrupt message + * + * Remote side has indicated it has not more references to local resources + */ +static __always_inline void +scif_discntack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + unsigned long sflags; + + spin_lock_irqsave(&ep->lock, sflags); + ep->state = SCIFEP_DISCONNECTED; + wake_up(&ep->disconwq); + spin_unlock_irqrestore(&ep->lock, sflags); +} + +/** + * scif_clientsend_resp() - Respond to SCIF_CLIENT_SEND interrupt message + * @msg: Interrupt message + * + * Remote side is confirming send or recieve interrupt handling is complete. + */ +static __always_inline void +scif_clientsend_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + + if (SCIFEP_CONNECTED == ep->state) { + wake_up_interruptible(&ep->recvwq); + } +} + +/** + * scif_clientrcvd_resp() - Respond to SCIF_CLIENT_RCVD interrupt message + * @msg: Interrupt message + * + * Remote side is confirming send or recieve interrupt handling is complete. 
+ */ +static __always_inline void +scif_clientrcvd_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + + if (SCIFEP_CONNECTED == ep->state) { + wake_up_interruptible(&ep->sendwq); + } +} + +/** + * scif_alloc_req: Respond to SCIF_ALLOC_REQ interrupt message + * @msg: Interrupt message + * + * Remote side is requesting a memory allocation. + */ +static __always_inline void +scif_alloc_req(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + int err, opcode = (int)msg->payload[3]; + struct reg_range_t *window = 0; + size_t nr_pages = msg->payload[1]; + struct endpt *ep = (struct endpt *)msg->payload[0]; + + might_sleep(); + + if (SCIFEP_CONNECTED != ep->state) { + err = -ENOTCONN; + goto error; + } + + switch (opcode) { + case SCIF_REGISTER: + if (!(window = micscif_create_remote_window(ep, + (int)nr_pages))) { + err = -ENOMEM; + goto error; + } + break; + default: + /* Unexpected allocation request */ + printk(KERN_ERR "Unexpected allocation request opcode 0x%x ep = 0x%p " + " scifdev->sd_state 0x%x scifdev->sd_node 0x%x\n", + opcode, ep, scifdev->sd_state, scifdev->sd_node); + err = -EINVAL; + goto error; + }; + + /* The peer's allocation request is granted */ + msg->uop = SCIF_ALLOC_GNT; + msg->payload[0] = (uint64_t)window; + msg->payload[1] = window->mapped_offset; + if ((err = micscif_nodeqp_send(ep->remote_dev, msg, ep))) + micscif_destroy_remote_window(ep, window); + return; +error: + /* The peer's allocation request is rejected */ + printk(KERN_ERR "%s %d error %d alloc_ptr %p nr_pages 0x%lx\n", + __func__, __LINE__, err, window, nr_pages); + msg->uop = SCIF_ALLOC_REJ; + micscif_nodeqp_send(ep->remote_dev, msg, ep); +} + +/** + * scif_alloc_gnt_rej: Respond to SCIF_ALLOC_GNT/REJ interrupt message + * @msg: Interrupt message + * + * Remote side responded to a memory allocation. + */ +static __always_inline void +scif_alloc_gnt_rej(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct allocmsg *handle = (struct allocmsg *)msg->payload[2]; + switch (handle->uop) { + case SCIF_REGISTER: + { + handle->vaddr = (void *)msg->payload[0]; + handle->phys_addr = msg->payload[1]; + if (msg->uop == SCIF_ALLOC_GNT) + handle->state = OP_COMPLETED; + else + handle->state = OP_FAILED; + wake_up(&handle->allocwq); + break; + } + default: + { + printk(KERN_ERR "Bug Unknown alloc uop 0x%x\n", handle->uop); + } + } +} + +/** + * scif_free_phys: Respond to SCIF_FREE_PHYS interrupt message + * @msg: Interrupt message + * + * Remote side is done accessing earlier memory allocation. + * Remove GTT/PCI mappings created earlier. + */ +static __always_inline void +scif_free_phys(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + return; +} + +/** + * scif_free_phys: Respond to SCIF_FREE_VIRT interrupt message + * @msg: Interrupt message + * + * Free up memory kmalloc'd earlier. + */ +static __always_inline void +scif_free_virt(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + int opcode = (int)msg->payload[3]; + struct reg_range_t *window = + (struct reg_range_t *)msg->payload[1]; + + switch (opcode) { + case SCIF_REGISTER: + micscif_destroy_remote_window(ep, window); + break; + default: + /* Unexpected allocation request */ + BUG_ON(opcode != SCIF_REGISTER); + }; +} + +/** + * scif_recv_register: Respond to SCIF_REGISTER interrupt message + * @msg: Interrupt message + * + * Update remote window list with a new registered window. 
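+ *
+ * The handling below can be summarised as (informal sketch):
+ *
+ *	peer                          local node
+ *	SCIF_REGISTER  ------------>  endpoint connected?
+ *	                              yes: reply SCIF_REGISTER_ACK and insert
+ *	                                   the window into remote_reg_list
+ *	                              no:  reply SCIF_REGISTER_NACK and destroy
+ *	                                   the remote window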
+ */ +static __always_inline void +scif_recv_register(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + unsigned long sflags; + struct endpt *ep = (struct endpt *)msg->payload[0]; + struct reg_range_t *window = + (struct reg_range_t *)msg->payload[1]; + + might_sleep(); + RMA_MAGIC(window); + mutex_lock(&ep->rma_info.rma_lock); + /* FIXME: + * ep_lock lock needed ? rma_lock is already held + */ + spin_lock_irqsave(&ep->lock, sflags); + if (SCIFEP_CONNECTED == ep->state) { + msg->uop = SCIF_REGISTER_ACK; + micscif_nodeqp_send(ep->remote_dev, msg, ep); + micscif_set_nr_pages(ep->remote_dev, window); + /* No further failures expected. Insert new window */ + micscif_insert_window(window, + &ep->rma_info.remote_reg_list); + } else { + msg->uop = SCIF_REGISTER_NACK; + micscif_nodeqp_send(ep->remote_dev, msg, ep); + } + spin_unlock_irqrestore(&ep->lock, sflags); + mutex_unlock(&ep->rma_info.rma_lock); + /* + * We could not insert the window but we need to + * destroy the window. + */ + if (SCIF_REGISTER_NACK == msg->uop) + micscif_destroy_remote_window(ep, window); + else { +#ifdef _MIC_SCIF_ + micscif_destroy_remote_lookup(ep, window); +#endif + } +} + +/** + * scif_recv_unregister: Respond to SCIF_UNREGISTER interrupt message + * @msg: Interrupt message + * + * Remove window from remote registration list; + */ +static __always_inline void +scif_recv_unregister(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct micscif_rma_req req; + struct reg_range_t *window = NULL; + struct reg_range_t *recv_window = + (struct reg_range_t *)msg->payload[0]; + struct endpt *ep; + int del_window = 0; + + might_sleep(); + RMA_MAGIC(recv_window); + ep = (struct endpt *)recv_window->ep; + req.out_window = &window; + req.offset = recv_window->offset; + req.prot = 0; + req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; + req.type = WINDOW_FULL; + req.head = &ep->rma_info.remote_reg_list; + msg->payload[0] = ep->remote_ep; + + mutex_lock(&ep->rma_info.rma_lock); + /* + * Does a valid window exist? + */ + if (micscif_query_window(&req)) { + printk(KERN_ERR "%s %d -ENXIO\n", __func__, __LINE__); + msg->uop = SCIF_UNREGISTER_ACK; + goto error; + } + if (window) { + RMA_MAGIC(window); + if (window->ref_count) + put_window_ref_count(window, window->nr_pages); + window->unreg_state = OP_COMPLETED; + if (!window->ref_count) { + msg->uop = SCIF_UNREGISTER_ACK; + atomic_inc(&ep->rma_info.tw_refcount); + atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages); + ep->rma_info.async_list_del = 1; + list_del(&window->list_member); + window->offset = INVALID_VA_GEN_ADDRESS; + del_window = 1; + } else + /* NACK! There are valid references to this window */ + msg->uop = SCIF_UNREGISTER_NACK; + } else { + /* The window did not make its way to the list at all. ACK */ + msg->uop = SCIF_UNREGISTER_ACK; + micscif_destroy_remote_window(ep, recv_window); + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (del_window) + drain_dma_intr(ep->rma_info.dma_chan); + micscif_nodeqp_send(ep->remote_dev, msg, ep); + if (del_window) + micscif_queue_for_cleanup(window, &ms_info.mi_rma); + return; +} + +/** + * scif_recv_register_ack: Respond to SCIF_REGISTER_ACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to complete registration. 
+ */ +static __always_inline void +scif_recv_register_ack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct reg_range_t *window = + (struct reg_range_t *)msg->payload[2]; + RMA_MAGIC(window); + window->reg_state = OP_COMPLETED; + wake_up(&window->regwq); +} + +/** + * scif_recv_register_nack: Respond to SCIF_REGISTER_NACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to inform it that registration + * cannot be completed. + */ +static __always_inline void +scif_recv_register_nack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct reg_range_t *window = + (struct reg_range_t *)msg->payload[2]; + RMA_MAGIC(window); + window->reg_state = OP_FAILED; + wake_up(&window->regwq); +} +/** + * scif_recv_unregister_ack: Respond to SCIF_UNREGISTER_ACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to complete unregistration. + */ +static __always_inline void +scif_recv_unregister_ack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct reg_range_t *window = + (struct reg_range_t *)msg->payload[1]; + RMA_MAGIC(window); + window->unreg_state = OP_COMPLETED; + wake_up(&window->unregwq); +} + +/** + * scif_recv_unregister_nack: Respond to SCIF_UNREGISTER_NACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to inform it that unregistration + * cannot be completed immediately. + */ +static __always_inline void +scif_recv_unregister_nack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct reg_range_t *window = + (struct reg_range_t *)msg->payload[1]; + RMA_MAGIC(window); + window->unreg_state = OP_FAILED; + wake_up(&window->unregwq); +} + +static __always_inline void +scif_recv_munmap(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct micscif_rma_req req; + struct reg_range_t *window = NULL; + struct reg_range_t *recv_window = + (struct reg_range_t *)msg->payload[0]; + struct endpt *ep; + int del_window = 0; + + might_sleep(); + RMA_MAGIC(recv_window); + ep = (struct endpt *)recv_window->ep; + req.out_window = &window; + req.offset = recv_window->offset; + req.prot = recv_window->prot; + req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; + req.type = WINDOW_FULL; + req.head = &ep->rma_info.reg_list; + msg->payload[0] = ep->remote_ep; + + mutex_lock(&ep->rma_info.rma_lock); + /* + * Does a valid window exist? + */ + if (micscif_query_window(&req)) { + printk(KERN_ERR "%s %d -ENXIO\n", __func__, __LINE__); + msg->uop = SCIF_UNREGISTER_ACK; + goto error; + } + + RMA_MAGIC(window); + + if (window->ref_count) + put_window_ref_count(window, window->nr_pages); + + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages); + ep->rma_info.async_list_del = 1; + list_del(&window->list_member); + micscif_free_window_offset(ep, window->offset, + window->nr_pages << PAGE_SHIFT); + window->offset_freed = true; + del_window = 1; + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (del_window) + micscif_queue_for_cleanup(window, &ms_info.mi_rma); +} + +/** + * scif_recv_mark: Handle SCIF_MARK request + * @msg: Interrupt message + * + * The peer has requested a mark. 
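[Editor's note, not part of the patch: scif_recv_register_ack/nack and scif_recv_unregister_ack/nack all follow the same completion idiom: record OP_COMPLETED or OP_FAILED in the window, then wake the wait queue on which the registering thread is presumably sleeping (the waiting side is not shown in this file). Below is a rough userspace equivalent of that idiom using a pthread condition variable in place of the kernel wait queue; every name is made up.]

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

enum op_state { OP_IN_PROGRESS, OP_COMPLETED, OP_FAILED };

struct reg_completion {
    pthread_mutex_t lock;
    pthread_cond_t  wq;      /* stands in for window->regwq */
    enum op_state   state;   /* stands in for window->reg_state */
};

/* ACK/NACK handler side: record the outcome, then wake the waiter. */
static void complete_registration(struct reg_completion *c, enum op_state outcome)
{
    pthread_mutex_lock(&c->lock);
    c->state = outcome;
    pthread_cond_broadcast(&c->wq);
    pthread_mutex_unlock(&c->lock);
}

/* Registering side: sleep until the peer's ACK or NACK arrives. */
static enum op_state wait_for_registration(struct reg_completion *c)
{
    enum op_state s;
    pthread_mutex_lock(&c->lock);
    while (c->state == OP_IN_PROGRESS)
        pthread_cond_wait(&c->wq, &c->lock);
    s = c->state;
    pthread_mutex_unlock(&c->lock);
    return s;
}

static void *fake_peer(void *arg)
{
    sleep(1);                                   /* pretend the ACK takes a while */
    complete_registration(arg, OP_COMPLETED);
    return NULL;
}

int main(void)
{
    struct reg_completion c = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, OP_IN_PROGRESS
    };
    pthread_t t;

    pthread_create(&t, NULL, fake_peer, &c);
    printf("registration %s\n",
           wait_for_registration(&c) == OP_COMPLETED ? "completed" : "failed");
    pthread_join(t, NULL);
    return 0;   /* build with -pthread */
}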
+ */ +static __always_inline void +scif_recv_mark(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + int mark; + + if (SCIFEP_CONNECTED != ep->state) { + msg->payload[0] = ep->remote_ep; + msg->uop = SCIF_MARK_NACK; + micscif_nodeqp_send(ep->remote_dev, msg, ep); + return; + } + + if ((mark = micscif_fence_mark(ep)) < 0) + msg->uop = SCIF_MARK_NACK; + else + msg->uop = SCIF_MARK_ACK; + msg->payload[0] = ep->remote_ep; + msg->payload[2] = mark; + micscif_nodeqp_send(ep->remote_dev, msg, ep); +} + +/** + * scif_recv_mark_resp: Handle SCIF_MARK_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a SCIF_MARK message. + */ +static __always_inline void +scif_recv_mark_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + struct fence_info *fence_req = (struct fence_info *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + if (SCIF_MARK_ACK == msg->uop) { + fence_req->state = OP_COMPLETED; + fence_req->dma_mark = (int)msg->payload[2]; + } else + fence_req->state = OP_FAILED; + wake_up(&fence_req->wq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_wait: Handle SCIF_WAIT request + * @msg: Interrupt message + * + * The peer has requested waiting on a fence. + */ +static __always_inline void +scif_recv_wait(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + struct remote_fence_info *fence; + + /* + * Allocate structure for remote fence information and + * send a NACK if the allocation failed. The peer will + * return ENOMEM upon receiving a NACK. + */ + if (!(fence = (struct remote_fence_info *)kmalloc( + sizeof(struct remote_fence_info), GFP_KERNEL))) { + msg->payload[0] = ep->remote_ep; + msg->uop = SCIF_WAIT_NACK; + micscif_nodeqp_send(ep->remote_dev, msg, ep); + return; + } + + /* Prepare the fence request */ + memcpy(&fence->msg, msg, sizeof(struct nodemsg)); + INIT_LIST_HEAD(&fence->list_member); + + /* Insert to the global remote fence request list */ + mutex_lock(&ms_info.mi_fencelock); + ep->rma_info.fence_refcount++; + list_add_tail(&fence->list_member, &ms_info.mi_fence); + mutex_unlock(&ms_info.mi_fencelock); + + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); +} + +/** + * scif_recv_wait_resp: Handle SCIF_WAIT_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a SCIF_WAIT message. + */ +static __always_inline void +scif_recv_wait_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + struct fence_info *fence_req = (struct fence_info *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + if (SCIF_WAIT_ACK == msg->uop) + fence_req->state = OP_COMPLETED; + else + fence_req->state = OP_FAILED; + wake_up(&fence_req->wq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_local_signal: Handle SCIF_SIG_LOCAL request + * @msg: Interrupt message + * + * The peer has requested a signal on a local offset. 
+ */ +static __always_inline void +scif_recv_signal_local(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + int err = 0; + struct endpt *ep = (struct endpt *)msg->payload[0]; + + err = micscif_prog_signal(ep, + msg->payload[1], + msg->payload[2], + RMA_WINDOW_SELF); + if (err) + msg->uop = SCIF_SIG_NACK; + else + msg->uop = SCIF_SIG_ACK; + msg->payload[0] = ep->remote_ep; + if ((err = micscif_nodeqp_send(ep->remote_dev, msg, ep))) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); +} + +/** + * scif_recv_signal_remote: Handle SCIF_SIGNAL_REMOTE request + * @msg: Interrupt message + * + * The peer has requested a signal on a remote offset. + */ +static __always_inline void +scif_recv_signal_remote(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + int err = 0; + struct endpt *ep = (struct endpt *)msg->payload[0]; + + err = micscif_prog_signal(ep, + msg->payload[1], + msg->payload[2], + RMA_WINDOW_PEER); + if (err) + msg->uop = SCIF_SIG_NACK; + else + msg->uop = SCIF_SIG_ACK; + msg->payload[0] = ep->remote_ep; + if ((err = micscif_nodeqp_send(ep->remote_dev, msg, ep))) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); +} + +/** + * scif_recv_signal_remote: Handle SCIF_SIG_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a signal request. + */ +static __always_inline void +scif_recv_signal_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + struct fence_info *fence_req = (struct fence_info *)msg->payload[3]; + + mutex_lock(&ep->rma_info.rma_lock); + if (SCIF_SIG_ACK == msg->uop) + fence_req->state = OP_COMPLETED; + else + fence_req->state = OP_FAILED; + wake_up(&fence_req->wq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/* + * scif_node_wake_up_ack: Handle SCIF_NODE_WAKE_UP_ACK message + * @msg: Interrupt message + * + * Response for a SCIF_NODE_WAKE_UP message. + */ +static __always_inline void +scif_node_wake_up_ack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + scif_dev[msg->payload[0]].sd_wait_status = OP_COMPLETED; + wake_up(&scif_dev[msg->payload[0]].sd_wq); +} + +/* + * scif_node_wake_up_nack: Handle SCIF_NODE_WAKE_UP_NACK message + * @msg: Interrupt message + * + * Response for a SCIF_NODE_WAKE_UP message. + */ +static __always_inline void +scif_node_wake_up_nack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + scif_dev[msg->payload[0]].sd_wait_status = OP_FAILED; + wake_up(&scif_dev[msg->payload[0]].sd_wq); +} + +/* + * scif_node_remove: Handle SCIF_NODE_REMOVE message + * @msg: Interrupt message + * + * Handle node removal. + */ +static __always_inline void +scif_node_remove(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + msg->payload[0] = micscif_handle_remove_node(msg->payload[0], msg->payload[1]); + msg->uop = SCIF_NODE_REMOVE_ACK; + msg->src.node = ms_info.mi_nodeid; + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL); +} + +#ifndef _MIC_SCIF_ +/* + * scif_node_remove_ack: Handle SCIF_NODE_REMOVE_ACK message + * @msg: Interrupt message + * + * The peer has acked a SCIF_NODE_REMOVE message. 
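[Editor's note, not part of the patch: the signal handlers above illustrate a pattern used throughout this file: the received nodemsg is recycled as the reply. The handler flips msg->uop to the ACK or NACK opcode and overwrites payload[0], which arrived holding the local endpoint pointer, with ep->remote_ep so the peer can find its own endpoint, then sends the same buffer back. A shape-only sketch with invented types:]

#include <stdint.h>

enum uop_model { SIG_LOCAL, SIG_ACK, SIG_NACK };

struct nodemsg_model { enum uop_model uop; uint64_t payload[4]; };
struct endpt_model   { uint64_t remote_ep; };

/* Turn the received request into its reply in place: flip the opcode and
 * point payload[0] back at the peer's endpoint before sending the same
 * message buffer back out on the node queue pair. */
void make_reply(struct nodemsg_model *msg, struct endpt_model *ep, int err)
{
    msg->uop = err ? SIG_NACK : SIG_ACK;
    msg->payload[0] = ep->remote_ep;
}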
+ */ +static __always_inline void +scif_node_remove_ack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + bool ack_is_current = true; + int orig_node = (int)msg->payload[3]; + + if ((msg->payload[1] << 32) == DISCONN_TYPE_POWER_MGMT) { + if (msg->payload[2] != atomic_long_read(&ms_info.mi_unique_msgid)) + ack_is_current = false; + } + + if (ack_is_current) { + mic_ctx_t *mic_ctx = get_per_dev_ctx(orig_node - 1); + if (!mic_ctx) { + printk(KERN_ERR "%s %d mic_ctx %p orig_node %d\n", + __func__, __LINE__, mic_ctx, orig_node); + return; + } + + if (msg->payload[0]) { + pr_debug("%s failed to get remove ack from node id %d", __func__, msg->src.node); + ms_info.mi_disconnect_status = OP_FAILED; + } + + atomic_inc(&mic_ctx->disconn_rescnt); + wake_up(&ms_info.mi_disconn_wq); + } +} + +/* + * scif_node_create_ack: Handle SCIF_NODE_CREATE_DEP message + * @msg: Interrupt message + * + * Notification about a new SCIF dependency between two nodes. + */ +static __always_inline void +scif_node_create_dep(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + uint32_t src_node = msg->src.node; + uint32_t dst_node = (uint32_t)msg->payload[0]; + /* + * Host driver updates dependency graph. + * src_node created dependency on dst_node + * src_node -> dst_node + */ + micscif_set_nodedep(src_node, dst_node, DEP_STATE_DEPENDENT); +} + +/* + * scif_node_destroy_ack: Handle SCIF_NODE_DESTROY_DEP message + * @msg: Interrupt message + * + * Notification about tearing down an existing SCIF dependency + * between two nodes. + */ +static __always_inline void +scif_node_destroy_dep(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + uint32_t src_node = msg->src.node; + uint32_t dst_node = (uint32_t)msg->payload[0]; + /* + * Host driver updates dependency graph. + * src_node removed dependency on dst_node + */ + micscif_set_nodedep(src_node, dst_node, DEP_STATE_NOT_DEPENDENT); +} + +/* + * scif_node_wake_up: Handle SCIF_NODE_WAKE_UP message + * @msg: Interrupt message + * + * The host has received a request to wake up a remote node. + */ +static __always_inline void +scif_node_wake_up(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + /* + * Host Driver now needs to wake up the remote node + * available in msg->payload[0]. 
+ */ + uint32_t ret = 0; + ret = micscif_connect_node((uint32_t)msg->payload[0], false); + + if(!ret) { + msg->uop = SCIF_NODE_WAKE_UP_ACK; + micscif_update_p2p_state((uint32_t)msg->payload[0], + msg->src.node, SCIFDEV_RUNNING); + } else { + msg->uop = SCIF_NODE_WAKE_UP_NACK; + } + micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL); +} +#endif + +#ifdef _MIC_SCIF_ +static __always_inline void +scif_node_alive_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + msg->uop = SCIF_NODE_ALIVE_ACK; + msg->src.node = ms_info.mi_nodeid; + msg->dst.node = SCIF_HOST_NODE; + micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL); + pr_debug("node alive ack sent from node %d oops_in_progress %d\n", + ms_info.mi_nodeid, oops_in_progress); +} +#else +static __always_inline void +scif_node_alive_ack(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + pr_debug("node alive ack received from node %d\n", msg->src.node); + atomic_set(&scif_dev[msg->src.node].sd_node_alive, 1); + wake_up(&scifdev->sd_watchdog_wq); +} +#endif + + +#ifdef _MIC_SCIF_ +static __always_inline void +_scif_proxy_dma(struct micscif_dev *scifdev, struct nodemsg *msg, int flags) +{ + struct endpt *ep = (struct endpt *)msg->payload[0]; + off_t loffset = msg->payload[1]; + off_t roffset = msg->payload[2]; + size_t len = msg->payload[3]; + struct dma_channel *chan = ep->rma_info.dma_chan; + struct endpt_rma_info *rma = &ep->rma_info; + int err = __scif_writeto(ep, loffset, len, roffset, flags); + + if (!err && rma->proxy_dma_peer_phys && + !request_dma_channel(chan)) { + do_status_update(chan, rma->proxy_dma_peer_phys, OP_COMPLETED); + free_dma_channel(chan); + } + if (!rma->proxy_dma_peer_phys) + /* The proxy DMA physical address should have been set up? */ + WARN_ON(1); +} + +/** + * scif_proxy_dma: Handle SCIF_PROXY_DMA request. + * @msg: Interrupt message + * + * The peer has requested a Proxy DMA. + */ +static __always_inline void +scif_proxy_dma(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + _scif_proxy_dma(scifdev, msg, 0x0); +} + +/** + * scif_proxy_ordered_dma: Handle SCIF_PROXY_ORDERED_DMA request. + * @msg: Interrupt message + * + * The peer has requested an ordered Proxy DMA. + */ +static __always_inline void +scif_proxy_ordered_dma(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + _scif_proxy_dma(scifdev, msg, SCIF_RMA_ORDERED); +} +#endif + +#ifndef _MIC_SCIF_ +/** + * scif_node_connect: Respond to SCIF_NODE_CONNECT interrupt message + * @msg: Interrupt message + * + * Connect the src and dst node by setting up the p2p connection + * between them. Host here acts like a proxy. 
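[Editor's note, not part of the patch: the SCIF_NODE_ALIVE / SCIF_NODE_ALIVE_ACK pair above is a heartbeat: the card answers the ping, and the host-side ACK handler sets sd_node_alive and wakes the watchdog. A minimal single-threaded model of one watchdog round is sketched below; in the driver the wait is a sleep on sd_watchdog_wq with a timeout, not the busy poll used here, and all names are invented.]

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct node_model { volatile bool alive; };   /* alive stands in for sd_node_alive */

/* Host side: the ALIVE_ACK handler records that the node answered. */
static void alive_ack_handler(struct node_model *n) { n->alive = true; }

/* Watchdog: after sending the ping, wait for the flag or give up at the deadline. */
static bool wait_for_ack(struct node_model *n, double timeout_sec)
{
    time_t start = time(NULL);
    while (!n->alive) {
        if (difftime(time(NULL), start) > timeout_sec)
            return false;
    }
    return true;
}

int main(void)
{
    struct node_model card = { false };
    alive_ack_handler(&card);               /* pretend the ACK already arrived */
    printf("node %s\n", wait_for_ack(&card, 1.0) ? "alive" : "lost");
    return 0;
}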
+ */ +static __always_inline void +scif_node_connect_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct micscif_dev *dev_j = scifdev; + struct micscif_dev *dev_i = NULL; + struct scif_p2p_info *p2p_ij = NULL; /* bus addr for j from i */ + struct scif_p2p_info *p2p_ji = NULL; /* bus addr for i from j */ + struct scif_p2p_info *p2p; + struct list_head *pos, *tmp; + uint32_t bid = (uint32_t)msg->payload[0]; + int err; + uint64_t tmppayload; + + pr_debug("%s:%d SCIF_NODE_CONNECT from %d connecting to %d \n", + __func__, __LINE__, scifdev->sd_node, bid); + + mutex_lock(&ms_info.mi_conflock); + if (bid < 1 || bid > ms_info.mi_maxid) { + printk(KERN_ERR "%s %d unknown bid %d\n", __func__, __LINE__, bid); + goto nack; + } + + dev_i = &scif_dev[bid]; + mutex_unlock(&ms_info.mi_conflock); + micscif_inc_node_refcnt(dev_i, 1); + mutex_lock(&ms_info.mi_conflock); + + if (dev_i->sd_state != SCIFDEV_RUNNING) + goto ref_nack; + + /* + * If the p2p connection is already setup or in the process of setting up + * then just ignore this request. The requested node will get informed + * by SCIF_NODE_ADD_ACK or SCIF_NODE_ADD_NACK + */ + if (!list_empty(&dev_i->sd_p2p)) { + list_for_each_safe(pos, tmp, &dev_i->sd_p2p) { + p2p = list_entry(pos, struct scif_p2p_info, + ppi_list); + if (p2p->ppi_peer_id == dev_j->sd_node) { + mutex_unlock(&ms_info.mi_conflock); + micscif_dec_node_refcnt(dev_i, 1); + return; + } + } + } + + p2p_ij = init_p2p_info(dev_i, dev_j); + p2p_ji = init_p2p_info(dev_j, dev_i); + + list_add_tail(&p2p_ij->ppi_list, &dev_i->sd_p2p); + list_add_tail(&p2p_ji->ppi_list, &dev_j->sd_p2p); + + /* Send a SCIF_NODE_ADD to dev_i, pass it its bus address + * as seen from dev_j + */ + msg->uop = SCIF_NODE_ADD; + msg->src.node = dev_j->sd_node; + msg->dst.node = dev_i->sd_node; + + p2p_ji->ppi_mic_addr[PPI_APER] = mic_map(msg->src.node - 1, + p2p_ji->ppi_pa[PPI_APER], + p2p_ji->ppi_len[PPI_APER] << PAGE_SHIFT); + msg->payload[0] = p2p_ji->ppi_mic_addr[PPI_APER]; + + /* addresses for node j */ + p2p_ij->ppi_mic_addr[PPI_MMIO] = mic_map(msg->dst.node - 1, + p2p_ij->ppi_pa[PPI_MMIO], + p2p_ij->ppi_len[PPI_MMIO] << PAGE_SHIFT); + msg->payload[1] = p2p_ij->ppi_mic_addr[PPI_MMIO]; + + p2p_ij->ppi_mic_addr[PPI_APER] = mic_map(msg->dst.node - 1, + p2p_ij->ppi_pa[PPI_APER], + p2p_ij->ppi_len[PPI_APER] << PAGE_SHIFT); + msg->payload[2] = p2p_ij->ppi_mic_addr[PPI_APER]; + + msg->payload[3] = p2p_ij->ppi_len[PPI_APER] << PAGE_SHIFT; + + if ((err = micscif_nodeqp_send(dev_i, msg, NULL))) { + printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); + goto ref_nack; + } + + /* Same as above but to dev_j */ + msg->uop = SCIF_NODE_ADD; + msg->src.node = dev_i->sd_node; + msg->dst.node = dev_j->sd_node; + + tmppayload = msg->payload[0]; + msg->payload[0] = msg->payload[2]; + msg->payload[2] = tmppayload; + + p2p_ji->ppi_mic_addr[PPI_MMIO] = mic_map(msg->dst.node - 1, p2p_ji->ppi_pa[PPI_MMIO], + p2p_ji->ppi_len[PPI_MMIO] << PAGE_SHIFT); + msg->payload[1] = p2p_ji->ppi_mic_addr[PPI_MMIO]; + msg->payload[3] = p2p_ji->ppi_len[PPI_APER] << PAGE_SHIFT; + + if ((err = micscif_nodeqp_send(dev_j, msg, NULL))) { + printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); + goto ref_nack; + } + + mutex_unlock(&ms_info.mi_conflock); + micscif_dec_node_refcnt(dev_i, 1); + return; +ref_nack: + micscif_dec_node_refcnt(dev_i, 1); +nack: + mutex_unlock(&ms_info.mi_conflock); + msg->uop = SCIF_NODE_CONNECT_NACK; + msg->dst.node = dev_j->sd_node; + msg->payload[0] = bid; + if ((err = micscif_nodeqp_send(dev_j, msg, 
NULL))) + printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err); +} +#endif /* SCIF */ + +#ifdef _MIC_SCIF_ +/** + * scif_node_connect_nack_resp: Respond to SCIF_NODE_CONNECT_NACK interrupt message + * @msg: Interrupt message + * + * Tell the node that initiated SCIF_NODE_CONNECT earlier has failed. + */ +static __always_inline void +scif_node_connect_nack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + struct micscif_dev *peerdev; + unsigned int bid = msg->payload[0]; + + if (bid > MAX_BOARD_SUPPORTED) { + printk(KERN_ERR "recieved a nack for invalid bid %d\n", bid); + WARN_ON(1); + return; + } + + peerdev = &scif_dev[bid]; + mutex_lock(&peerdev->sd_lock); + peerdev->sd_state = SCIFDEV_NOTPRESENT; + mutex_unlock(&peerdev->sd_lock); + wake_up(&peerdev->sd_p2p_wq); +} +#endif + +/** + * scif_node_add_nack_resp: Respond to SCIF_NODE_ADD_NACK interrupt message + * @msg: Interrupt message + * + * SCIF_NODE_ADD failed, so inform the waiting wq. + */ +static __always_inline void +scif_node_add_nack_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ +#ifndef _MIC_SCIF_ + struct micscif_dev *dst_dev = &scif_dev[msg->dst.node]; + pr_debug("SCIF_NODE_ADD_NACK recieved from %d \n", scifdev->sd_node); + micscif_inc_node_refcnt(dst_dev, 1); + micscif_nodeqp_send(dst_dev, msg, NULL); + micscif_dec_node_refcnt(dst_dev, 1); +#else + struct micscif_dev *peerdev; + + peerdev = &scif_dev[msg->src.node]; + + if (peerdev->sd_state == SCIFDEV_NOTPRESENT) + return; + + mutex_lock(&peerdev->sd_lock); + peerdev->sd_state = SCIFDEV_NOTPRESENT; + mutex_unlock(&peerdev->sd_lock); + wake_up(&peerdev->sd_p2p_wq); +#endif +} + +/** + * scif_get_node_info_resp: Respond to SCIF_GET_NODE_INFO interrupt message + * @msg: Interrupt message + * + * Retrieve node info i.e maxid, total and node mask from the host. + */ +static __always_inline void +scif_get_node_info_resp(struct micscif_dev *scifdev, struct nodemsg *msg) +{ +#ifdef _MIC_SCIF_ + struct get_node_info *node_info = (struct get_node_info *)msg->payload[3]; + + mutex_lock(&ms_info.mi_conflock); + ms_info.mi_mask = msg->payload[0]; + ms_info.mi_maxid = msg->payload[1]; + ms_info.mi_total = msg->payload[2]; + + node_info->state = OP_COMPLETED; + wake_up(&node_info->wq); + mutex_unlock(&ms_info.mi_conflock); +#else + swap(msg->dst.node, msg->src.node); + mutex_lock(&ms_info.mi_conflock); + msg->payload[0] = ms_info.mi_mask; + msg->payload[1] = ms_info.mi_maxid; + msg->payload[2] = ms_info.mi_total; + mutex_unlock(&ms_info.mi_conflock); + + if (micscif_nodeqp_send(scifdev, msg, NULL)) + printk(KERN_ERR "%s %d error \n", __func__, __LINE__); +#endif +} + +#ifdef ENABLE_TEST +static void +scif_test(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + if (msg->payload[0] != scifdev->count) { + printk(KERN_ERR "Con fail: payload == %llx\n", msg->payload[0]); + scifdev->test_done = -1; + } else if (scifdev->count == TEST_LOOP) { + pr_debug("Test success state %d!\n", scifdev->sd_state); + scifdev->test_done = 1; + } + + if (scifdev->test_done != 0) { + while (scifdev->test_done != 2) { + cpu_relax(); + schedule(); + } + + destroy_workqueue(scifdev->producer); + destroy_workqueue(scifdev->consumer); + pr_debug("Destroyed workqueue state %d!\n", scifdev->sd_state); + } + scifdev->count++; +} +#endif /* ENABLE_TEST */ + +static void +scif_msg_unknown(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + /* Bogus Node Qp Message? 
*/ + printk(KERN_ERR "Unknown message 0x%xn scifdev->sd_state 0x%x " + "scifdev->sd_node 0x%x\n", + msg->uop, scifdev->sd_state, scifdev->sd_node); + BUG_ON(1); +} + +#ifdef _MIC_SCIF_ +static void +smpt_set(struct micscif_dev *scifdev, struct nodemsg *msg) +{ + printk("msd recvd : smpt add\n"); + printk("dma_addr = 0x%llX, entry = 0x%llX\n", msg->payload[0], msg->payload[1]); + mic_smpt_set(scif_dev->mm_sbox, msg->payload[0], msg->payload[1]); +} +#endif + +void (*scif_intr_func[SCIF_MAX_MSG + 1])(struct micscif_dev *, struct nodemsg *msg) = { + scif_msg_unknown, // Error + scif_init_resp, // SCIF_INIT + scif_exit_resp, // SCIF_EXIT + scif_nodeadd_resp, // SCIF_NODE_ADD + scif_nodeaddack_resp, // SCIF_NODE_ADD_ACK + scif_cnctreq_resp, // SCIF_CNCT_REQ + scif_cnctgnt_resp, // SCIF_CNCT_GNT + scif_cnctgntack_resp, // SCIF_CNCT_GNTACK + scif_cnctgntnack_resp, // SCIF_CNCT_GNTNACK + scif_cnctrej_resp, // SCIF_CNCT_REJ + scif_cnctterm_resp, // SCIF_CNCT_TERM 10 + scif_termack_resp, // SCIF_TERM_ACK + scif_discnct_resp, // SCIF_DISCNCT + scif_discntack_resp, // SCIF_DISCNT_ACK + scif_recv_register, // SCIF_REGISTER + scif_recv_register_ack, // SCIF_REGISTER_ACK + scif_recv_register_nack, // SCIF_REGISTER_NACK + scif_recv_unregister, // SCIF_UNREGISTER + scif_recv_unregister_ack, // SCIF_UNREGISTER_ACK + scif_recv_unregister_nack, // SCIF_UNREGISTER_NACK + scif_alloc_req, // SCIF_ALLOC_REQ 20 + scif_alloc_gnt_rej, // SCIF_ALLOC_GNT + scif_alloc_gnt_rej, // SCIF_ALLOC_REJ + scif_free_phys, // SCIF_FREE_PHYS + scif_free_virt, // SCIF_FREE_VIRT + scif_clientsend_resp, // SCIF_CLIENT_SENT + scif_clientrcvd_resp, // SCIF_CLIENT_RCVD + scif_recv_munmap, // SCIF_MUNMAP + scif_recv_mark, // SCIF_MARK + scif_recv_mark_resp, // SCIF_MARK_ACK + scif_recv_mark_resp, // SCIF_MARK_NACK 30 + scif_recv_wait, // SCIF_WAIT + scif_recv_wait_resp, // SCIF_WAIT_ACK + scif_recv_wait_resp, // SCIF_WAIT_NACK + scif_recv_signal_local, // SCIF_SIG_LOCAL + scif_recv_signal_remote, // SCIF_SIG_REMOTE + scif_recv_signal_resp, // SCIF_SIG_ACK + scif_recv_signal_resp, // SCIF_SIG_NACK +#ifdef _MIC_SCIF_ + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, // SCIF_NODE_CREATE_DEP Not on card + scif_msg_unknown, // SCIF_NODE_DESTROY_DEP Not on card +#else + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, + scif_node_create_dep, // SCIF_NODE_CREATE_DEP + scif_node_destroy_dep, // SCIF_NODE_DESTROY_DEP +#endif + scif_node_remove, // SCIF_NODE_REMOVE +#ifdef _MIC_SCIF_ + scif_msg_unknown, // SCIF_NODE_REMOVE_ACK Not on card + scif_msg_unknown, // SCIF_NODE_WAKE_UP Not on card +#else + scif_node_remove_ack, // SCIF_NODE_REMOVE_ACK + scif_node_wake_up, // SCIF_NODE_WAKE_UP +#endif + scif_node_wake_up_ack, // SCIF_NODE_WAKE_UP_ACK + scif_node_wake_up_nack, // SCIF_NODE_WAKE_UP_NACK +#ifdef _MIC_SCIF_ + scif_node_alive_resp, // SCIF_NODE_ALIVE + scif_msg_unknown, // SCIF_NODE_ALIVE_ACK not on card + smpt_set, // SMPT_SET +#else + scif_msg_unknown, // SCIF_NODE_ALIVE not on Host + scif_node_alive_ack, // SCIF_NODE_ALIVE_ACK + scif_msg_unknown, // SCIF_NODE_ALIVE not on Host +#endif + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, + scif_msg_unknown, +#ifdef _MIC_SCIF_ + scif_proxy_dma, // SCIF_PROXY_DMA only for MIC + scif_proxy_ordered_dma, // SCIF_PROXY_ORDERED_DMA only for MIC +#else + scif_msg_unknown, + scif_msg_unknown, +#endif +#ifdef _MIC_SCIF_ + scif_msg_unknown, + scif_node_connect_nack_resp, //SCIF_NODE_CONNECT_NACK +#else + 
scif_node_connect_resp, //SCIF_NODE_CONNECT + scif_msg_unknown, +#endif + scif_node_add_nack_resp, //SCIF_NODE_ADD_NACK + scif_get_node_info_resp, //SCIF_GET_NODE_INFO +#ifdef ENABLE_TEST + scif_test // SCIF_TEST +#else + scif_msg_unknown +#endif +}; + +/** + * scif_nodeqp_msg_hander() - Common handler for node messages + * @scifdev: Remote device to respond to + * @qp: Remote memory pointer + * @msg: The message to be handled. + * + * This routine calls the appriate routine to handle a Node Qp message receipt. + */ +int micscif_max_msg_id = SCIF_MAX_MSG; + +static void +micscif_nodeqp_msg_handler(struct micscif_dev *scifdev, struct micscif_qp *qp, struct nodemsg *msg) +{ + micscif_display_message(scifdev, msg, "Rcvd"); + + if (msg->uop > (uint32_t)micscif_max_msg_id) { + /* Bogus Node Qp Message? */ + printk(KERN_ERR "Unknown message 0x%xn scifdev->sd_state 0x%x " + "scifdev->sd_node 0x%x\n", + msg->uop, scifdev->sd_state, scifdev->sd_node); + BUG_ON(1); + } + + scif_intr_func[msg->uop](scifdev, msg); +} + +/** + * scif_nodeqp_intrhander() - Interrupt handler for node messages + * @scifdev: Remote device to respond to + * @qp: Remote memory pointer + * + * This routine is triggered by the interrupt mechanism. It reads + * messages from the node queue RB and calls the Node QP Message handling + * routine. + */ +int +micscif_nodeqp_intrhandler(struct micscif_dev *scifdev, struct micscif_qp *qp) +{ + struct nodemsg msg; + int read_size; + + do { +#ifndef _MIC_SCIF_ + if (qp->blast) { + scif_wakeup_ep(SCIF_WAKE_UP_RECV); + qp->blast = 0; + } +#endif + if (SCIFDEV_STOPPED == scifdev->sd_state) + return 0; + read_size = micscif_rb_get_next(&qp->inbound_q, &msg, + sizeof(msg)); + /* Stop handling messages if an oops is in progress */ + if (read_size != sizeof(msg) || oops_in_progress) + break; +#ifndef _MIC_SCIF_ + atomic_set(&scifdev->sd_node_alive, 1); +#endif + + micscif_inc_node_refcnt(scifdev, 1); + micscif_nodeqp_msg_handler(scifdev, qp, &msg); + /* + * The reference count is reset to SCIF_NODE_IDLE + * during scif device cleanup so decrementing the + * reference count further is not required. + */ + if (SCIFDEV_INIT == scifdev->sd_state) + return 0; + if (SCIFDEV_STOPPED == scifdev->sd_state) { + micscif_dec_node_refcnt(scifdev, 1); + return 0; + } + micscif_rb_update_read_ptr(&qp->inbound_q); + micscif_dec_node_refcnt(scifdev, 1); + } while (read_size == sizeof(msg)); +#ifdef _MIC_SCIF_ + /* + * Keep polling the Node QP RB in case there are active SCIF + * P2P connections to provide better Node QP responsiveness + * in anticipation of P2P Proxy DMA requests for performance. + */ + if (scifdev->sd_proxy_dma_reads && + scifdev->num_active_conn && + SCIFDEV_STOPPED != scifdev->sd_state) { + queue_work(scifdev->sd_intr_wq, &scifdev->sd_intr_bh); + schedule(); + } +#endif + return read_size; +} + +/** + * micscif_loopb_wq_handler - Loopback Workqueue Handler. + * @work: loop back work + * + * This work queue routine is invoked by the loopback work queue handler. + * It grabs the recv lock, dequeues any available messages from the head + * of the loopback message list, calls the node QP message handler, + * waits for it to return, then frees up this message and dequeues more + * elements of the list if available. 
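[Editor's note, not part of the patch: scif_intr_func[] above is a plain uop-indexed function-pointer table: slot 0 and every opcode not valid on the current side point at scif_msg_unknown, and micscif_nodeqp_msg_handler bounds-checks msg->uop against micscif_max_msg_id before indexing. A compact userspace model of that dispatch shape, with made-up opcodes and handlers:]

#include <stdint.h>
#include <stdio.h>

struct msg_model { uint32_t uop; uint64_t payload[4]; };

static void handle_unknown(struct msg_model *m) { printf("unknown uop %u\n", m->uop); }
static void handle_ping(struct msg_model *m)
{
    printf("ping, payload %llu\n", (unsigned long long)m->payload[0]);
}

#define MAX_UOP 2
static void (*dispatch[MAX_UOP + 1])(struct msg_model *) = {
    handle_unknown,     /* 0: never a valid opcode */
    handle_ping,        /* 1 */
    handle_unknown,     /* 2: reserved on this side */
};

static void msg_handler(struct msg_model *m)
{
    if (m->uop > MAX_UOP) {       /* bogus message: never index past the table */
        handle_unknown(m);
        return;
    }
    dispatch[m->uop](m);
}

int main(void)
{
    struct msg_model m = { 1, { 42, 0, 0, 0 } };
    msg_handler(&m);
    m.uop = 99;
    msg_handler(&m);
    return 0;
}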
+ */ +static void micscif_loopb_wq_handler(struct work_struct *work) +{ + struct micscif_dev *scifdev = + container_of(work, struct micscif_dev, sd_loopb_work); + struct micscif_qp *qp = micscif_nodeqp_nextmsg(scifdev); + struct loopb_msg *msg; + + do { + msg = NULL; + spin_lock(&qp->qp_recv_lock); + if (!list_empty(&scifdev->sd_loopb_recv_q)) { + msg = list_first_entry(&scifdev->sd_loopb_recv_q, + struct loopb_msg, list_member); + list_del(&msg->list_member); + } + spin_unlock(&qp->qp_recv_lock); + + if (msg) { + micscif_nodeqp_msg_handler(scifdev, qp, &msg->msg); + kfree(msg); + } + } while (msg); +} + +/** + * micscif_loopb_msg_handler() - Workqueue handler for loopback messages. + * @scifdev: SCIF device + * @qp: Queue pair. + * + * This work queue routine is triggered when a loopback message is received. + * + * We need special handling for receiving Node Qp messages on a loopback SCIF + * device via two workqueues for receiving messages. + * + * The reason we need the extra workqueue which is not required with *normal* + * non-loopback SCIF devices is the potential classic deadlock described below: + * + * Thread A tries to send a message on a loopback SCIF devide and blocks since + * there is no space in the RB while it has the qp_send_lock held or another + * lock called lock X for example. + * + * Thread B: The Loopback Node QP message receive workqueue receives the message + * and tries to send a message (eg an ACK) to the loopback SCIF device. It tries + * to grab the send lock again or lock X and deadlocks with Thread A. The RB + * cannot be drained any further due to this classic deadlock. + * + * In order to avoid deadlocks as mentioned above we have an extra level of + * indirection achieved by having two workqueues. + * 1) The first workqueue whose handler is micscif_loopb_msg_handler reads + * messages from the Node QP RB, adds them to a list and queues work for the + * second workqueue. + * + * 2) The second workqueue whose handler is micscif_loopb_wq_handler dequeues + * messages from the list, handles them, frees up the memory and dequeues + * more elements from the list if possible. + */ +int +micscif_loopb_msg_handler(struct micscif_dev *scifdev, struct micscif_qp *qp) +{ + int read_size; + struct loopb_msg *msg; + + do { + if (!(msg = kmalloc(sizeof(struct loopb_msg), GFP_KERNEL))) { + printk(KERN_ERR "%s %d ENOMEM\n", __func__, __LINE__); + return -ENOMEM; + } + + read_size = micscif_rb_get_next(&qp->inbound_q, &msg->msg, + sizeof(struct nodemsg)); + + if (read_size != sizeof(struct nodemsg)) { + kfree(msg); + micscif_rb_update_read_ptr(&qp->inbound_q); + break; + } + + spin_lock(&qp->qp_recv_lock); + list_add_tail(&msg->list_member, &scifdev->sd_loopb_recv_q); + spin_unlock(&qp->qp_recv_lock); + queue_work(scifdev->sd_loopb_wq, &scifdev->sd_loopb_work); + micscif_rb_update_read_ptr(&qp->inbound_q); + } while (read_size == sizeof(struct nodemsg)); + return read_size; +} + +/** + * micscif_setup_loopback_qp - One time setup work for Loopback Node Qp. + * @scifdev: SCIF device + * + * Sets up the required loopback workqueues, queue pairs, ring buffers + * and also tests out the Queue Pairs. 
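[Editor's note, not part of the patch: the two-workqueue scheme described above boils down to splitting receive into two stages so that message handling, which may itself need the send path, never runs while the ring is being drained. Stage 1 only copies messages off the ring into a list; stage 2 pops the list and handles each message. A minimal userspace model of that split, with a plain singly linked queue and invented names (the real code protects the list with qp->qp_recv_lock and runs the stages on separate workqueues):]

#include <stdlib.h>
#include <stdio.h>

struct msg_model { int uop; };

struct loopb_item {
    struct msg_model msg;
    struct loopb_item *next;
};

static struct loopb_item *recv_q_head, *recv_q_tail;

/* Stage 1: drain the ring buffer into the list; no message handling here. */
static void stage1_enqueue(const struct msg_model *m)
{
    struct loopb_item *it = malloc(sizeof(*it));
    if (!it)
        return;
    it->msg = *m;
    it->next = NULL;
    if (recv_q_tail)
        recv_q_tail->next = it;
    else
        recv_q_head = it;
    recv_q_tail = it;
}

/* Stage 2: handle queued messages; handlers are free to send replies. */
static void stage2_drain(void (*handler)(struct msg_model *))
{
    while (recv_q_head) {
        struct loopb_item *it = recv_q_head;
        recv_q_head = it->next;
        if (!recv_q_head)
            recv_q_tail = NULL;
        handler(&it->msg);
        free(it);
    }
}

static void print_handler(struct msg_model *m) { printf("handled uop %d\n", m->uop); }

int main(void)
{
    struct msg_model m = { 7 };
    stage1_enqueue(&m);            /* role of micscif_loopb_msg_handler */
    stage2_drain(print_handler);   /* role of micscif_loopb_wq_handler */
    return 0;
}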
+ */ +int micscif_setup_loopback_qp(struct micscif_dev *scifdev) +{ + int err = 0; + void *local_q; + struct micscif_qp *qp; + + /* Set up the work queues */ + if ((err = micscif_setup_interrupts(scifdev))) + goto error; + + INIT_LIST_HEAD(&scifdev->sd_loopb_recv_q); + snprintf(scifdev->sd_loopb_wqname, sizeof(scifdev->sd_loopb_wqname), + "SCIF LOOPB %d", scifdev->sd_node); + if (!(scifdev->sd_loopb_wq = + __mic_create_singlethread_workqueue(scifdev->sd_loopb_wqname))){ + err = -ENOMEM; + goto destroy_intr_wq; + } + INIT_WORK(&scifdev->sd_loopb_work, micscif_loopb_wq_handler); + /* Allocate Self Qpair */ + scifdev->n_qpairs = 1; + scifdev->qpairs = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL); + if (!scifdev->qpairs) { + printk(KERN_ERR "Node QP Allocation failed\n"); + err = -ENOMEM; + goto destroy_loopb_wq; + } + + qp = scifdev->qpairs; + qp->magic = SCIFEP_MAGIC; + spin_lock_init(&qp->qp_send_lock); + spin_lock_init(&qp->qp_recv_lock); + init_waitqueue_head(&scifdev->sd_mmap_wq); + + local_q = kzalloc(NODE_QP_SIZE, GFP_KERNEL); + if (!local_q) { + printk(KERN_ERR "Ring Buffer Allocation Failed\n"); + err = -ENOMEM; + goto free_qpairs; + } + + /* + * For loopback the inbound_q and outbound_q are essentially the same + * since the Node sends a message on the loopback interface to the + * outbound_q which is then received on the inbound_q. + */ + micscif_rb_init(&qp->outbound_q, + &(scifdev->qpairs[0].local_read), + &(scifdev->qpairs[0].local_write), + local_q, + NODE_QP_SIZE); + + micscif_rb_init(&(qp->inbound_q), + &(scifdev->qpairs[0].local_read), + &(scifdev->qpairs[0].local_write), + local_q, + NODE_QP_SIZE); + + /* Launch the micscif_rb test */ +#ifdef ENABLE_TEST + micscif_qp_testboth(scifdev); +#endif + return err; +free_qpairs: + kfree(scifdev->qpairs); +destroy_loopb_wq: + destroy_workqueue(scifdev->sd_loopb_wq); +destroy_intr_wq: + destroy_workqueue(scifdev->sd_intr_wq); +error: + return err; +} + +/** + * micscif_destroy_loopback_qp - One time uninit work for Loopback Node Qp + * @scifdev: SCIF device + * + * Detroys the workqueues and frees up the Ring Buffer and Queue Pair memory. 
+ */ +int micscif_destroy_loopback_qp(struct micscif_dev *scifdev) +{ + micscif_destroy_interrupts(scifdev); + destroy_workqueue(scifdev->sd_loopb_wq); + kfree((void *)scifdev->qpairs->outbound_q.rb_base); + kfree(scifdev->qpairs); + return 0; +} + +#ifndef _MIC_SCIF_ +void micscif_destroy_p2p(mic_ctx_t *mic_ctx) +{ + mic_ctx_t * mic_ctx_peer; + struct micscif_dev *mic_scif_dev; + struct micscif_dev *peer_dev; + struct scif_p2p_info *p2p; + struct list_head *pos, *tmp; + uint32_t bd; + + if (!mic_p2p_enable) + return; + + + /* FIXME: implement node deletion */ + mic_scif_dev = &scif_dev[mic_get_scifnode_id(mic_ctx)]; + + /* Free P2P mappings in the given node for all its peer nodes */ + list_for_each_safe(pos, tmp, &mic_scif_dev->sd_p2p) { + p2p = list_entry(pos, struct scif_p2p_info, + ppi_list); + + mic_unmap(mic_ctx->bi_id, p2p->ppi_mic_addr[PPI_MMIO], + p2p->ppi_len[PPI_MMIO] << PAGE_SHIFT); + mic_unmap(mic_ctx->bi_id, p2p->ppi_mic_addr[PPI_APER], + p2p->ppi_len[PPI_APER] << PAGE_SHIFT); + pci_unmap_sg(mic_ctx->bi_pdev, + p2p->ppi_sg[PPI_MMIO], p2p->sg_nentries[PPI_MMIO], PCI_DMA_BIDIRECTIONAL); + micscif_p2p_freesg(p2p->ppi_sg[PPI_MMIO]); + pci_unmap_sg(mic_ctx->bi_pdev, + p2p->ppi_sg[PPI_APER], p2p->sg_nentries[PPI_APER], PCI_DMA_BIDIRECTIONAL); + micscif_p2p_freesg(p2p->ppi_sg[PPI_APER]); + list_del(pos); + kfree(p2p); + } + + /* Free P2P mapping created in the peer nodes for the given node */ + for (bd = SCIF_HOST_NODE + 1; bd <= ms_info.mi_maxid; bd++) { + peer_dev = &scif_dev[bd]; + + list_for_each_safe(pos, tmp, &peer_dev->sd_p2p) { + p2p = list_entry(pos, struct scif_p2p_info, + ppi_list); + if (p2p->ppi_peer_id == mic_get_scifnode_id(mic_ctx)) { + + mic_ctx_peer = get_per_dev_ctx(peer_dev->sd_node - 1); + mic_unmap(mic_ctx_peer->bi_id, p2p->ppi_mic_addr[PPI_MMIO], + p2p->ppi_len[PPI_MMIO] << PAGE_SHIFT); + mic_unmap(mic_ctx_peer->bi_id, p2p->ppi_mic_addr[PPI_APER], + p2p->ppi_len[PPI_APER] << PAGE_SHIFT); + pci_unmap_sg(mic_ctx_peer->bi_pdev, + p2p->ppi_sg[PPI_MMIO], p2p->sg_nentries[PPI_MMIO], PCI_DMA_BIDIRECTIONAL); + micscif_p2p_freesg(p2p->ppi_sg[PPI_MMIO]); + pci_unmap_sg(mic_ctx_peer->bi_pdev, p2p->ppi_sg[PPI_APER], + p2p->sg_nentries[PPI_APER], PCI_DMA_BIDIRECTIONAL); + micscif_p2p_freesg(p2p->ppi_sg[PPI_APER]); + list_del(pos); + kfree(p2p); + } + } + } +} +#endif + +/** + * ONLY TEST CODE BELOW + */ +#ifdef ENABLE_TEST +#include +#include +#include "mic/micscif_nodeqp.h" + +static void micscif_rb_trigger_consumer(struct work_struct *work) +{ + struct micscif_dev *scifdev = container_of(work, struct micscif_dev, consumer_work); + + while (scifdev->test_done == 0) { + cpu_relax(); + schedule(); + } + if (scifdev->test_done != 1) + printk(KERN_ERR "Consumer failed!\n"); + else + pr_debug("Test finished: Success\n"); + scifdev->test_done = 2; +} + +/** + * micscif_rb_trigger_producer + * This is the producer thread to create messages and update the + * RB write offset accordingly. + */ +static void micscif_rb_trigger_producer(struct work_struct *work) +{ + struct nodemsg msg; + int count = 0; + struct micscif_dev *scifdev = container_of(work, struct micscif_dev, producer_work); + + msg.dst.node = scifdev->sd_node; + msg.uop = SCIF_TEST; + + while (count <= TEST_LOOP) { + msg.payload[0] = count++; + micscif_nodeqp_send(scifdev, &msg, NULL); + /* pr_debug(("Prod payload %llu\n", msg.payload[0]); */ + } +} + +/* this is called from the host and the card at the same time on a queue pair. 
+ * Each sets up a producer and a consumer and spins on the queue pair until done + */ +static void micscif_qp_testboth(struct micscif_dev *scifdev) +{ + scifdev->count = 0; + scifdev->test_done = 0; + snprintf(scifdev->producer_name, sizeof(scifdev->producer_name), + "PRODUCER %d", scifdev->sd_node); + snprintf(scifdev->consumer_name, sizeof(scifdev->consumer_name), + "CONSUMER %d", scifdev->sd_node); + scifdev->producer = + __mic_create_singlethread_workqueue(scifdev->producer_name); + scifdev->consumer = + __mic_create_singlethread_workqueue(scifdev->consumer_name); + + INIT_WORK(&scifdev->producer_work, micscif_rb_trigger_producer); + INIT_WORK(&scifdev->consumer_work, micscif_rb_trigger_consumer); + + queue_work(scifdev->producer, &scifdev->producer_work); + queue_work(scifdev->consumer, &scifdev->consumer_work); +} +#endif diff --git a/micscif/micscif_ports.c b/micscif/micscif_ports.c new file mode 100644 index 0000000..2a59410 --- /dev/null +++ b/micscif/micscif_ports.c @@ -0,0 +1,376 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Port reservation mechnism. + * Since this goes with SCIF it must be available for any OS + * and should not consume IP ports. Therefore, roll our own. + * This is not required to be high performance, so a simple bit + * array should do just fine. + * + * API specification (loosely): + * + * uint16_t port + * Port number is a 16 bit unsigned integer + * + * uint16_t rsrv_scif_port(uint16_t) + * reserve specified port # + * returns port #, or 0 if port unavailable. 
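[Editor's note, not part of the patch: the port reservation scheme introduced above keeps one bit per port, with a set bit meaning the port is free; rsrv_scif_port() clears one specific bit, get_scif_port() finds and clears the first free bit above the reserved range, and put_scif_port() sets it again. The sketch below is a much smaller userspace model of those semantics with made-up pool sizes; it scans linearly from the reserved boundary instead of tracking a first_free index the way the driver does.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PORT_COUNT 128
#define PORT_RSVD  32              /* get_port() never hands these out */
#define BITS       64

static uint64_t bits[PORT_COUNT / BITS];

static void ports_init(void)
{
    memset(bits, 0xff, sizeof(bits));   /* set bit == port is free */
}

/* Reserve one specific port (1..PORT_COUNT-1); returns 0 if unavailable. */
static uint16_t rsrv_port(uint16_t port)
{
    uint64_t msk = 1ULL << (port % BITS);
    if (!port || port >= PORT_COUNT || !(bits[port / BITS] & msk))
        return 0;
    bits[port / BITS] &= ~msk;
    return port;
}

/* Grab any non-reserved port; returns 0 if the pool is empty. */
static uint16_t get_port(void)
{
    for (uint16_t p = PORT_RSVD; p < PORT_COUNT; p++)
        if (rsrv_port(p))
            return p;
    return 0;
}

static void put_port(uint16_t port)
{
    if (port && port < PORT_COUNT)
        bits[port / BITS] |= 1ULL << (port % BITS);
}

int main(void)
{
    ports_init();
    uint16_t a = rsrv_port(5), b = get_port();
    printf("reserved %u, dynamic %u\n", a, b);
    put_port(b);
    return 0;
}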
+ * + * uint16_t get_scif_port(void) + * reserve any available port # + * returns port #, or 0 if no ports available + * + * void put_scif_port(uint16_t) + * release port # + * + * Reserved ports comes from the lower end of the allocatable range, + * and is reserved only in the sense that get_scif_port() won't use + * them and there is only a predefined count of them available. + */ + +#include + +/* + * Manifests + * Port counts must be an integer multiple of 64 + */ + +#define SCIF_PORT_BASE 0x0000 /* Start port (port reserved if 0) */ +#define SCIF_PORT_COUNT 0x10000 /* Ports available */ + +#if SCIF_PORT_RSVD > (SCIF_PORT_COUNT/2) +#error "No more than half of scif ports can be reserved !!" +#endif +#if (SCIF_PORT_BASE + SCIF_PORT_COUNT) > (2 << 16) +#error "Scif ports cannot exceed 16 bit !!" +#endif + +#include +#include +static spinlock_t port_lock = __SPIN_LOCK_UNLOCKED(port_lock); + +/* + * Data structures + * init_array Flag for initialize (mark as init_code?) + * port_bits 1 bit representing each possible port. + * first_free Index into port_bits for free area + * port_lock Lock for exclusive access + * port_rsvd Total of successful "get/resv" calls. + * port_free Total of successful "free" calls. + * port_err Total of unsuccessfull calls. + */ + +#define BITS_PR_PORT (8 * sizeof(uint64_t)) +#define PORTS_ARRAY_SIZE ((SCIF_PORT_COUNT + (BITS_PR_PORT - 1)) / BITS_PR_PORT) + + +static int init_array = 1; +static uint16_t first_free; +static uint64_t port_bits[PORTS_ARRAY_SIZE]; +static uint64_t port_rsvd; +static uint64_t port_free; +static uint64_t port_err; + + +/* + * Bitfield handlers. + * + * Need 3 bit-fiddlers to operate on individual bits within + * one 64 bit word in memory (always passing a pointer). + * Individual bits are enumerated from 1, allowing for use + * of value 0 to indicate an error condition. + * + * 1) __scif_ffsclr() returns index of first set bit in the + * 64 bit word and clears it. A return value 0 means there + * were no set bits in the word. + * + * 2) __scif_clrbit() clears a specified bit in the 64 bit word + * The bit index is returned if bit was previously set and a + * value 0 is returned if it was previously clear. + * + * 3) __scif_setbit() sets a specified bit in the 64 bit word. + * + * Two versions, one should work for you. + */ + +#if 1 && (defined(__GNUC__) || defined(ICC)) +/* + * Use GNU style inline assembly for bit operations. + * + * Gcc complains about uninitialized use of variables + * big_bit in ffsclr and avl in clrbit. Generated code + * is correct, just haven't figured out the correct + * contraints yet. + * + * gcc -O2: + * __scif_ffsclr: 40 bytes + * __scif_clrbit: 34 bytes + * __scif_setbit: 17 bytes + */ + +static int +__scif_ffsclr(uint64_t *word) +{ + uint64_t big_bit = 0; + uint64_t field = *word; + + asm volatile ( + "bsfq %1,%0\n\t" + "jnz 1f\n\t" + "movq $-1,%0\n" + "jmp 2f\n\t" + "1:\n\t" + "btrq %2,%1\n\t" + "2:" + : "=r" (big_bit), "=r" (field) + : "0" (big_bit), "1" (field) + ); + + if (big_bit == -1) + return 0; + + *word = field; + return big_bit + 1; +} + +static int +__scif_clrbit(uint64_t *word, uint16_t bit) +{ + uint64_t field = *word; + uint64_t big_bit = bit; + int avl = 0; + + big_bit--; + asm volatile ( + "xorl %2,%2\n\t" + "btrq %3,%1\n\t" + "rcll $1,%2\n\t" + : "=Ir" (big_bit), "=r" (field), "=r" (avl) + : "0" (big_bit), "1" (field), "2" (avl) + ); + + *word = field; + return avl ? 
bit : 0; +} + +static void +__scif_setbit(uint64_t *word, uint16_t bit) +{ + uint64_t field = *word; + uint64_t big_bit = bit; + + big_bit--; + asm volatile ( + "btsq %2,%1" + : "=r" (field) + : "0" (field), "Jr" (big_bit) + ); + + *word = field; +} +#else +/* + * C inliners for bit operations. + * + * gcc -O2: + * __scif_ffsclr: 50 bytes + * __scif_clrbit: 45 bytes + * __scif_setbit: 18 bytes + * + * WARNING: + * 1) ffsll() may be glibc specific + * 2) kernel ffs() use cmovz instruction that may not + * work in uOS kernel (see arch/x86/include/asm/bitops.h) + * + */ + + +static int +__scif_ffsclr(uint64_t *word) +{ + int bit; +/* + * ffsll() Find 1st bit in 64 bit word + */ + + bit = ffsll(*word); + if (bit) + *word &= ~(1LL << (bit - 1)); + + return bit; +} + +static int +__scif_clrbit(uint64_t *word, uint16_t bit) +{ + uint64_t msk = (1LL << (bit - 1)); + + if (*word & msk) { + *word &= ~msk; + return bit; + } + return 0; +} + +static void +__scif_setbit(uint64_t *word, uint16_t bit) +{ + *word |= (1LL << (bit - 1)); +} +#endif + + +static void +init_scif_array(void) +{ + spin_lock(&port_lock); + if (init_array) { + int i; + for (i = 0; i < PORTS_ARRAY_SIZE; i++) + port_bits[i] = ~0; + first_free = SCIF_PORT_RSVD / BITS_PR_PORT; + if (!SCIF_PORT_BASE) + port_bits[0] ^= 1; + port_rsvd = 0; + port_free = 0; + port_err = 0; + init_array = 0; + } + spin_unlock(&port_lock); + pr_debug("SCIF port array init:\n" + " %d ports available starting at %d, %d reserved\n" + " Array consists of %ld %ld-bit wide integers\n", + SCIF_PORT_BASE ? SCIF_PORT_COUNT : SCIF_PORT_COUNT - 1, + SCIF_PORT_BASE ? SCIF_PORT_BASE : 1, SCIF_PORT_RSVD, + PORTS_ARRAY_SIZE, BITS_PR_PORT); +} + + +/* + * Reserve a specified port for SCIF + * TBD: doxyfy this header + */ +uint16_t +rsrv_scif_port(uint16_t port) +{ + uint16_t port_ix; + + if (!port) { + pr_debug("rsrv_scif_port: invalid port %d\n", port); + port_err++; + return 0; + } + + if (init_array) + init_scif_array(); + + port -= SCIF_PORT_BASE; + port_ix = port / BITS_PR_PORT; + + spin_lock(&port_lock); + port = __scif_clrbit(port_bits + port_ix, 1 + (port % BITS_PR_PORT)); + if (port) { + port = port - 1 + BITS_PR_PORT * port_ix + SCIF_PORT_BASE; + port_rsvd++; + } else { + port_err++; + } + spin_unlock(&port_lock); + + return port; +} + + +/* + * Get and reserve any port # for SCIF + * TBD: doxyfy this header + */ +uint16_t +get_scif_port(void) +{ + uint16_t port; + + if (init_array) + init_scif_array(); + + spin_lock(&port_lock); + if (first_free >= PORTS_ARRAY_SIZE) { /* Pool is empty */ + port = 0; + port_err++; + goto out; + } + port = __scif_ffsclr(port_bits + first_free); + if (port) { + port = port - 1 + BITS_PR_PORT * first_free + SCIF_PORT_BASE; + while ((first_free < PORTS_ARRAY_SIZE) && !port_bits[first_free]) + first_free++; + port_rsvd++; + } else + port_err++; +out: + spin_unlock(&port_lock); + return port; +} + + +/* + * Release a reserved port # for SCIF + * For now, just ignore release on unreserved port + * TBD: doxyfy this header + */ + +void +put_scif_port(uint16_t port) +{ + uint16_t port_ix; + + if (!port) { + pr_debug("put_scif_port: invalid port %d\n", port); + port_err++; + return; + } + + port -= SCIF_PORT_BASE; + port_ix = port / BITS_PR_PORT; + + spin_lock(&port_lock); + __scif_setbit(port_bits + port_ix, 1 + (port % BITS_PR_PORT)); + if (port >= SCIF_PORT_RSVD && port_ix < first_free) + first_free = port_ix; + port_free++; + spin_unlock(&port_lock); +} + diff --git a/micscif/micscif_rb.c b/micscif/micscif_rb.c new file mode 100644 
index 0000000..3fdbf8f --- /dev/null +++ b/micscif/micscif_rb.c @@ -0,0 +1,372 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "mic/micscif.h" +#include "mic/micscif_rb.h" + +#include +#include +#define count_in_ring(head, tail, size) CIRC_CNT(head, tail, size) +#define space_in_ring(head, tail, size) CIRC_SPACE(head, tail, size) + +MODULE_LICENSE("GPL"); + +static void *micscif_rb_get(struct micscif_rb *rb, uint32_t size); + +/** + * micscif_rb_init - To Initialize the RingBuffer + * @rb: The RingBuffer context + * @read_ptr: A pointer to the memory location containing + * the updated read pointer + * @write_ptr: A pointer to the memory location containing + * the updated write pointer + * @rb_base: The pointer to the ring buffer + * @size: The size of the ring buffer + */ +void micscif_rb_init(struct micscif_rb *rb, + volatile uint32_t *read_ptr, + volatile uint32_t *write_ptr, + volatile void *rb_base, + const uint32_t size) +{ + /* Size must be a power of two -- all logic assoicated with + * incrementing the read and write pointers relies on the size + * being a power of 2 + */ + BUG_ON((size & (size-1)) != 0); + rb->rb_base = rb_base; + rb->size = size; + rb->read_ptr = read_ptr; + rb->write_ptr = write_ptr; + rb->current_read_offset = *read_ptr; + rb->current_write_offset = *write_ptr; +} +EXPORT_SYMBOL(micscif_rb_init); + +/** + * micscif_rb_reset - To reset the RingBuffer + * @rb - The RingBuffer context + */ +void micscif_rb_reset(struct micscif_rb *rb) +{ + /* + * XPU_RACE_CONDITION: write followed by read + * MFENCE after write + * Read should take care of SBOX sync + * Ponters are volatile (see RingBuffer declaration) + */ + *rb->read_ptr = 0x0; + *rb->write_ptr = 0x0; + smp_mb(); + rb->current_write_offset = *rb->write_ptr; + rb->current_read_offset = *rb->read_ptr; +} +EXPORT_SYMBOL(micscif_rb_reset); + +/* Copies a message to the ring buffer -- handles the wrap around case */ +static int memcpy_torb(struct micscif_rb *rb, void *header, + void *msg, uint32_t size) +{ + /* Need to call two copies if it wraps around */ + uint32_t size1, size2; + if ((char*)header + size >= (char*)rb->rb_base + rb->size) { + size1 = (uint32_t) ( ((char*)rb->rb_base + rb->size) - (char*)header); + size2 = size - size1; + memcpy_toio(header, msg, size1); + memcpy_toio(rb->rb_base, (char*)msg+size1, size2); + } else { + memcpy_toio(header, msg, size); + } + return 0; +} + +/* Copies a message from the ring buffer -- handles the wrap around case */ +static int memcpy_fromrb(struct micscif_rb *rb, void *header, + void *msg, uint32_t size) +{ + /* Need to call two copies if it wraps around */ + uint32_t size1, size2; + if ((char*)header + size >= (char*)rb->rb_base + rb->size) { + size1 = (uint32_t) ( ((char*)rb->rb_base + rb->size) - (char*)header ); + size2 = size - size1; + memcpy_fromio(msg, header, size1); + memcpy_fromio((char*)msg+size1, rb->rb_base, size2); + } else { + memcpy_fromio(msg, header, size); + } + return 0; +} + +/** + * micscif_rb_space - + * Query space available for writing to the given RB. + * + * @rb - The RingBuffer context + * + * Returns: size available for writing to RB in bytes. 
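[Editor's note, not part of the patch: micscif_rb relies on two ideas visible in memcpy_torb/memcpy_fromrb and the CIRC_CNT/CIRC_SPACE macros: the buffer size is a power of two so offsets advance with a mask, and a copy that runs past the end is split into two memcpy calls that wrap to the base. The self-contained userspace model below shows just those two mechanics for a single producer and consumer in one address space; it leaves out the shared read/write pointers, memcpy_toio/fromio, and all ordering concerns.]

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define RB_SIZE 64                      /* must be a power of two */

struct rb_model {
    uint8_t  base[RB_SIZE];
    uint32_t read_off, write_off;
};

static uint32_t rb_space(const struct rb_model *rb)   /* like CIRC_SPACE */
{
    return (rb->read_off - rb->write_off - 1) & (RB_SIZE - 1);
}

static uint32_t rb_count(const struct rb_model *rb)   /* like CIRC_CNT */
{
    return (rb->write_off - rb->read_off) & (RB_SIZE - 1);
}

/* Copy, possibly wrapping past the end, then advance the offset with a mask. */
static int rb_write(struct rb_model *rb, const void *msg, uint32_t size)
{
    uint32_t first;
    if (rb_space(rb) < size)
        return -1;
    first = RB_SIZE - rb->write_off;
    if (first > size)
        first = size;
    memcpy(rb->base + rb->write_off, msg, first);
    memcpy(rb->base, (const uint8_t *)msg + first, size - first);
    rb->write_off = (rb->write_off + size) & (RB_SIZE - 1);
    return 0;
}

static int rb_read(struct rb_model *rb, void *msg, uint32_t size)
{
    uint32_t first;
    if (rb_count(rb) < size)
        return -1;
    first = RB_SIZE - rb->read_off;
    if (first > size)
        first = size;
    memcpy(msg, rb->base + rb->read_off, first);
    memcpy((uint8_t *)msg + first, rb->base, size - first);
    rb->read_off = (rb->read_off + size) & (RB_SIZE - 1);
    return 0;
}

int main(void)
{
    struct rb_model rb = { {0}, 0, 0 };
    char out[16] = "";
    rb_write(&rb, "hello", 6);
    rb_read(&rb, out, 6);
    printf("%s\n", out);
    return 0;
}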
+ */ +int micscif_rb_space(struct micscif_rb *rb) +{ + rb->old_current_read_offset = rb->current_read_offset; + + rb->current_read_offset = *rb->read_ptr; + return space_in_ring(rb->current_write_offset, + rb->current_read_offset, rb->size); +} +EXPORT_SYMBOL(micscif_rb_space); + +/** + * micscif_rb_write - Write one package to the given ring buffer + * @rb - The RingBuffer context + * @msg - The package to be put in the ring buffer + * @size - the size (in bytes) you want to copy + * + * This API does not block if there isn't enough space in the RB. + */ +int micscif_rb_write(struct micscif_rb *rb, + void *msg, + uint32_t size) +{ + void *header; + int ret = 0; + if ((uint32_t)micscif_rb_space(rb) < size) + return -ENOMEM; + header = (char*)rb->rb_base + rb->current_write_offset; + ret = memcpy_torb(rb, header, msg, size); + if (!ret) { + /* + * XPU_RACE_CONDITION: Don't do anything here! + * Wait until micscif_rb_commit() + * Update the local ring buffer data, not the shared data until commit. + */ + rb->old_current_write_offset = rb->current_write_offset; + rb->current_write_offset = (rb->current_write_offset + size) & (rb->size - 1); + } + return ret; +} +EXPORT_SYMBOL(micscif_rb_write); + +/* + * micscif_rb_get_next + * Read from ring buffer. + * @rb - The RingBuffer context + * @msg - buffer to hold the message. Must be at least size bytes long + * @size - Size to be read out passed in, actual bytes read + * is returned. + * RETURN: + * Returns the number of bytes possible to read -- if retVal != size, then + * the read does not occur. + */ +int micscif_rb_get_next (struct micscif_rb *rb, void *msg, uint32_t size) +{ + void *header = NULL; + int read_size = 0; + /* + * warning: RingBufferGet() looks at the shared write pointer + */ + header = micscif_rb_get(rb, size); + if (header) { + uint32_t next_cmd_offset = + (rb->current_read_offset + size) & (rb->size - 1); + read_size = size; + rb->old_current_read_offset = rb->current_read_offset; + rb->current_read_offset = next_cmd_offset; + if (memcpy_fromrb(rb, header, msg, size)) // add check here + return -EFAULT; + } + return read_size; +} +EXPORT_SYMBOL(micscif_rb_get_next); + +/** + * micscif_rb_update_read_ptr + * @rb - The RingBuffer context + */ +void micscif_rb_update_read_ptr(struct micscif_rb *rb) +{ + uint32_t old_offset; + uint32_t new_offset; + smp_mb(); + old_offset = rb->old_current_read_offset; + new_offset = rb->current_read_offset; + + /* + * XPU_RACE_CONDITION: + * pReadPointer is ready to move + * Moving read pointer transfers ownership to MIC + * What if MICCPU starts writing to buffer before all + * writes were flushed? + * Need to flush out all pending writes before pointer update + */ + smp_mb(); + +#ifdef CONFIG_ML1OM + serializing_request((volatile uint8_t*) rb->rb_base+old_offset); +#endif + + *rb->read_ptr = new_offset; +#ifdef CONFIG_ML1OM + /* + * Readback since KNF doesn't guarantee that PCI ordering is maintained. + * Need a memory barrier on the host before the readback so the readback + * doesn't load from the write combining buffer but will go across to the + * PCI bus that will then flush the posted write to the device. + */ + smp_mb(); + serializing_request(rb->read_ptr); +#endif +#if defined(CONFIG_MK1OM) && defined(_MIC_SCIF_) + /* + * KNC Si HSD 3853952: For the case where a Core is performing an EXT_WR + * followed by a Doorbell Write, the Core must perform two EXT_WR to the + * same address with the same data before it does the Doorbell Write. 
+ * This way, if ordering is violate for the Interrupt Message, it will + * fall just behind the first Posted associated with the first EXT_WR. + */ + *rb->read_ptr = new_offset; +#endif + smp_mb(); +} +EXPORT_SYMBOL(micscif_rb_update_read_ptr); + +/** + * micscif_rb_count + * @rb - The RingBuffer context + * RETURN: number of empty slots in the RB + */ +uint32_t micscif_rb_count(struct micscif_rb *rb, uint32_t size) +{ + if (count_in_ring(rb->current_write_offset, + rb->current_read_offset, + rb->size) < size) { + /* + * Update from the HW write pointer if empty + */ + rb->old_current_write_offset = rb->current_write_offset; + rb->current_write_offset = *rb->write_ptr; + } + return count_in_ring(rb->current_write_offset, + rb->current_read_offset, + rb->size); +} +EXPORT_SYMBOL(micscif_rb_count); + +/** + * micscif_rb_commit + * To submit the buffer to let the uOS to fetch it + * @rb - The RingBuffer context + */ +void micscif_rb_commit(struct micscif_rb *rb) +{ + /* + * XPU_RACE_CONDITION: + * Writing to ringbuffer memory before updating the pointer + * can be out-of-order and write combined. + * This is the point where we start to care about + * consistency of the data. + * There are two race conditions below: + * (1) Ring buffer pointer moves before all data is flushed: + * if uOS is late taking the interrupt for the previous transaction, + * it may take the new write pointer immediately + * and start accessing data in the ringbuffer. + * Ring buffer data must be consistent before we update the write + * pointer. We read back the address at oldCurrentWriteOffset + * -- this is the location in memory written during the last + * ring buffer operation; keep in mind that ring buffers and ring buffer + * pointers can be in different kinds of memory (host vs MIC, + * depending on currently active workaround flags. + * (2) If uOS takes interrupt while write pointer value is still + * in-flight may result in uOS reading old value, message being lost, + * and the deadlock. Must put another memory barrier after readback -- + * revents read-passing-read from later read + */ + smp_mb(); +#ifdef CONFIG_ML1OM + /* + * Also makes sure the following read is not reordered + */ + serializing_request((char*)rb->rb_base + rb->current_write_offset); +#endif + *rb->write_ptr = rb->current_write_offset; +#ifdef CONFIG_ML1OM + /* + * Readback since KNF doesn't guarantee that PCI ordering is maintained. + * Need a memory barrier on the host before the readback so the readback + * doesn't load from the write combining buffer but will go across to the + * PCI bus that will then flush the posted write to the device. + */ + smp_mb(); + serializing_request(rb->write_ptr); +#endif +#if defined(CONFIG_MK1OM) && defined(_MIC_SCIF_) + /* + * KNC Si HSD 3853952: For the case where a Core is performing an EXT_WR + * followed by a Doorbell Write, the Core must perform two EXT_WR to the + * same address with the same data before it does the Doorbell Write. + * This way, if ordering is violate for the Interrupt Message, it will + * fall just behind the first Posted associated with the first EXT_WR. 
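[Editor's note, not part of the patch: stripped of the PCI readbacks, the KNF serializing_request() calls, and the KNC doorbell workaround, the ordering rule micscif_rb_commit and micscif_rb_update_read_ptr enforce is the usual publish/consume pairing: make every payload store visible before publishing the new offset, and read the offset before touching the payload. The fragment below expresses that rule with C11 release/acquire atomics instead of smp_mb(); it ignores wrap-around (shown earlier) and does not model the PCI-specific barriers, and all names are invented.]

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

#define RB_SIZE 64

struct shared_rb {
    uint8_t buf[RB_SIZE];
    _Atomic uint32_t write_off;   /* published by the producer */
    _Atomic uint32_t read_off;    /* published by the consumer */
};

static void producer_commit(struct shared_rb *rb, const void *msg,
                            uint32_t size, uint32_t local_write)
{
    memcpy(rb->buf + local_write, msg, size);            /* payload first */
    atomic_store_explicit(&rb->write_off,
                          (local_write + size) & (RB_SIZE - 1),
                          memory_order_release);         /* then publish */
}

static uint32_t consumer_peek(struct shared_rb *rb)
{
    /* Acquire pairs with the release above: seeing the new offset guarantees
     * the payload bytes behind it are visible too. */
    return atomic_load_explicit(&rb->write_off, memory_order_acquire);
}

int main(void)
{
    static struct shared_rb rb;
    producer_commit(&rb, "ping", 5, 0);
    return (int)consumer_peek(&rb);   /* 5 */
}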
+ */ + *rb->write_ptr = rb->current_write_offset; +#endif + smp_mb(); +} +EXPORT_SYMBOL(micscif_rb_commit); + +/** + * micscif_rb_get + * To get next packet from the ring buffer + * @rb - The RingBuffer context + * RETURN: + * NULL if no packet in the ring buffer + * Otherwise The pointer of the next packet + */ +static void *micscif_rb_get(struct micscif_rb *rb, uint32_t size) +{ + void *header = NULL; + + if (micscif_rb_count(rb, size) >= size) + header = (char*)rb->rb_base + rb->current_read_offset; + return header; +} + +/** + * micscif_rb_get_version + * Return the ring buffer module version + */ +uint16_t micscif_rb_get_version(void) +{ + return RING_BUFFER_VERSION; +} +EXPORT_SYMBOL(micscif_rb_get_version); diff --git a/micscif/micscif_rma.c b/micscif/micscif_rma.c new file mode 100644 index 0000000..9c6de2e --- /dev/null +++ b/micscif/micscif_rma.c @@ -0,0 +1,2633 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include "mic/micscif.h" +#include "mic/micscif_smpt.h" +#include "mic/micscif_kmem_cache.h" +#include "mic/micscif_rma_list.h" +#ifndef _MIC_SCIF_ +#include "mic_common.h" +#endif +#include "mic/mic_dma_api.h" +#include "mic/micscif_map.h" + +bool mic_reg_cache_enable = 0; + +bool mic_huge_page_enable = 1; + +#ifdef _MIC_SCIF_ +mic_dma_handle_t mic_dma_handle; +#endif +static inline +void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn, + struct endpt *ep, bool inrange, + uint64_t start, uint64_t len); +#ifdef CONFIG_MMU_NOTIFIER +static void scif_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm); +static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); +static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end); +static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end); +static const struct mmu_notifier_ops scif_mmu_notifier_ops = { + .release = scif_mmu_notifier_release, + .clear_flush_young = NULL, + .change_pte = NULL,/*TODO*/ + .invalidate_page = scif_mmu_notifier_invalidate_page, + .invalidate_range_start = scif_mmu_notifier_invalidate_range_start, + .invalidate_range_end = scif_mmu_notifier_invalidate_range_end}; + +static void scif_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct endpt *ep; + struct rma_mmu_notifier *mmn; + mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier); + ep = mmn->ep; + micscif_rma_destroy_tcw(mmn, ep, false, 0, 0); + pr_debug("%s\n", __func__); + return; +} + +static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct endpt *ep; + struct rma_mmu_notifier *mmn; + mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier); + ep = mmn->ep; + micscif_rma_destroy_tcw(mmn, ep, true, address, PAGE_SIZE); + pr_debug("%s address 0x%lx\n", __func__, address); + return; +} + +static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct endpt *ep; + struct rma_mmu_notifier *mmn; + mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier); + ep = mmn->ep; + micscif_rma_destroy_tcw(mmn, ep, true, (uint64_t)start, (uint64_t)(end - start)); + pr_debug("%s start=%lx, end=%lx\n", __func__, start, end); + return; +} + +static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + /* Nothing to do here, everything needed was done in invalidate_range_start */ + pr_debug("%s\n", __func__); + return; +} +#endif + +#ifdef CONFIG_MMU_NOTIFIER +void ep_unregister_mmu_notifier(struct endpt *ep) +{ + struct endpt_rma_info *rma = &ep->rma_info; + struct rma_mmu_notifier *mmn = NULL; + struct list_head *item, *tmp; + mutex_lock(&ep->rma_info.mmn_lock); + list_for_each_safe(item, tmp, &rma->mmn_list) { + mmn = list_entry(item, + struct rma_mmu_notifier, list_member); + mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm); +#ifdef RMA_DEBUG + BUG_ON(atomic_long_sub_return(1, &ms_info.mmu_notif_cnt) < 0); +#endif + list_del(item); + kfree(mmn); + } + mutex_unlock(&ep->rma_info.mmn_lock); +} + +static void init_mmu_notifier(struct rma_mmu_notifier *mmn, struct mm_struct *mm, struct endpt *ep) +{ + mmn->ep = ep; + 
mmn->mm = mm; + mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops; + INIT_LIST_HEAD(&mmn->list_member); + INIT_LIST_HEAD(&mmn->tc_reg_list); +} + +static struct rma_mmu_notifier *find_mmu_notifier(struct mm_struct *mm, struct endpt_rma_info *rma) +{ + struct rma_mmu_notifier *mmn; + struct list_head *item; + list_for_each(item, &rma->mmn_list) { + mmn = list_entry(item, + struct rma_mmu_notifier, list_member); + if (mmn->mm == mm) + return mmn; + } + return NULL; +} +#endif + +/** + * micscif_rma_ep_init: + * @ep: end point + * + * Initialize RMA per EP data structures. + */ +int micscif_rma_ep_init(struct endpt *ep) +{ + int ret; + struct endpt_rma_info *rma = &ep->rma_info; + + mutex_init (&rma->rma_lock); + if ((ret = va_gen_init(&rma->va_gen, + VA_GEN_MIN, VA_GEN_RANGE)) < 0) + goto init_err; + spin_lock_init(&rma->tc_lock); + mutex_init (&rma->mmn_lock); + mutex_init (&rma->va_lock); + INIT_LIST_HEAD(&rma->reg_list); + INIT_LIST_HEAD(&rma->remote_reg_list); + atomic_set(&rma->tw_refcount, 0); + atomic_set(&rma->tw_total_pages, 0); + atomic_set(&rma->tcw_refcount, 0); + atomic_set(&rma->tcw_total_pages, 0); + init_waitqueue_head(&rma->fence_wq); + rma->fence_refcount = 0; + rma->async_list_del = 0; + rma->dma_chan = NULL; + INIT_LIST_HEAD(&rma->mmn_list); + INIT_LIST_HEAD(&rma->task_list); +init_err: + return ret; +} + +/** + * micscif_rma_ep_can_uninit: + * @ep: end point + * + * Returns 1 if an endpoint can be uninitialized and 0 otherwise. + */ +int micscif_rma_ep_can_uninit(struct endpt *ep) +{ + int ret = 0; + + /* Destroy RMA Info only if both lists are empty */ + if (list_empty(&ep->rma_info.reg_list) && + list_empty(&ep->rma_info.remote_reg_list) && +#ifdef CONFIG_MMU_NOTIFIER + list_empty(&ep->rma_info.mmn_list) && +#endif + !atomic_read(&ep->rma_info.tw_refcount) && + !atomic_read(&ep->rma_info.tcw_refcount)) + ret = 1; + return ret; +} + +#ifdef _MIC_SCIF_ +/** + * __micscif_setup_proxy_dma: + * @ep: SCIF endpoint descriptor. + * + * Sets up data structures for P2P Proxy DMAs. + */ +static int __micscif_setup_proxy_dma(struct endpt *ep) +{ + struct endpt_rma_info *rma = &ep->rma_info; + int err = 0; + uint64_t *tmp = NULL; + + mutex_lock(&rma->rma_lock); + if (is_p2p_scifdev(ep->remote_dev) && !rma->proxy_dma_va) { + if (!(tmp = scif_zalloc(PAGE_SIZE))) { + err = -ENOMEM; + goto error; + } + if ((err = map_virt_into_aperture(&rma->proxy_dma_phys, + tmp, + ep->remote_dev, PAGE_SIZE))) { + scif_free(tmp, PAGE_SIZE); + goto error; + } + *tmp = OP_IDLE; + rma->proxy_dma_va = tmp; + } +error: + mutex_unlock(&rma->rma_lock); + return err; +} + +static __always_inline int micscif_setup_proxy_dma(struct endpt *ep) +{ + if (ep->rma_info.proxy_dma_va) + return 0; + + return __micscif_setup_proxy_dma(ep); +} + +/** + * micscif_teardown_proxy_dma: + * @ep: SCIF endpoint descriptor. + * + * Tears down data structures setup for P2P Proxy DMAs. + */ +void micscif_teardown_proxy_dma(struct endpt *ep) +{ + struct endpt_rma_info *rma = &ep->rma_info; + mutex_lock(&rma->rma_lock); + if (rma->proxy_dma_va) { + unmap_from_aperture(rma->proxy_dma_phys, ep->remote_dev, PAGE_SIZE); + scif_free(rma->proxy_dma_va, PAGE_SIZE); + rma->proxy_dma_va = NULL; + } + mutex_unlock(&rma->rma_lock); +} + +/** + * micscif_proxy_dma: + * @ep: SCIF endpoint descriptor. + * @copy_work: DMA copy work information. 
+ * + * This API does the following: + * 1) Sends the peer a SCIF Node QP message with the information + * required to program a proxy DMA to covert a P2P Read to a Write + * which will initiate a DMA transfer from the peer card to self. + * The reason for this special code path is KNF and KNC P2P read + * performance being much lower than P2P write performance on Crown + * Pass platforms. + * 2) Poll for an update of the known proxy dma VA to OP_COMPLETED + * via a SUD by the peer. + */ +static int micscif_proxy_dma(scif_epd_t epd, struct mic_copy_work *work) +{ + struct endpt *ep = (struct endpt *)epd; + struct nodemsg msg; + unsigned long ts = jiffies; + struct endpt_rma_info *rma = &ep->rma_info; + int err; + volatile uint64_t *proxy_dma_va = rma->proxy_dma_va; + + mutex_lock(&ep->rma_info.rma_lock); + /* + * Bail out if there is a Proxy DMA already in progress + * for this endpoint. The callee will fallback on self + * DMAs upon an error. + */ + if (*proxy_dma_va != OP_IDLE) { + mutex_unlock(&ep->rma_info.rma_lock); + err = -EBUSY; + goto error; + } + *proxy_dma_va = OP_IN_PROGRESS; + mutex_unlock(&ep->rma_info.rma_lock); + + msg.src = ep->port; + msg.uop = work->ordered ? SCIF_PROXY_ORDERED_DMA : SCIF_PROXY_DMA; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = work->src_offset; + msg.payload[2] = work->dst_offset; + msg.payload[3] = work->len; + + if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + goto error_init_va; + + while (*proxy_dma_va != OP_COMPLETED) { + schedule(); + if (time_after(jiffies, + ts + NODE_ALIVE_TIMEOUT)) { + err = -EBUSY; + goto error_init_va; + } + } + err = 0; +error_init_va: + *proxy_dma_va = OP_IDLE; +error: + return err; +} +#endif + +/** + * micscif_create_pinned_pages: + * @nr_pages: number of pages in window + * @prot: read/write protection + * + * Allocate and prepare a set of pinned pages. + */ +struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot) +{ + struct scif_pinned_pages *pinned_pages; + + might_sleep(); + if (!(pinned_pages = scif_zalloc(sizeof(*pinned_pages)))) + goto error; + + if (!(pinned_pages->pages = scif_zalloc(nr_pages * + sizeof(*(pinned_pages->pages))))) + goto error_free_pinned_pages; + + if (!(pinned_pages->num_pages = scif_zalloc(nr_pages * + sizeof(*(pinned_pages->num_pages))))) + goto error_free_pages; + +#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_) + if (!(pinned_pages->vma = scif_zalloc(nr_pages * + sizeof(*(pinned_pages->vma))))) + goto error_free_num_pages; +#endif + + pinned_pages->prot = prot; + pinned_pages->magic = SCIFEP_MAGIC; + pinned_pages->nr_contig_chunks = 0; + return pinned_pages; + +#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_) +error_free_num_pages: + scif_free(pinned_pages->num_pages, + pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages))); +#endif +error_free_pages: + scif_free(pinned_pages->pages, + pinned_pages->nr_pages * sizeof(*(pinned_pages->pages))); +error_free_pinned_pages: + scif_free(pinned_pages, sizeof(*pinned_pages)); +error: + return NULL; +} + +/** + * micscif_destroy_pinned_pages: + * @pinned_pages: A set of pinned pages. + * + * Deallocate resources for pinned pages. 
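+ *
+ * Marks writeable user pages dirty, drops the page references taken at
+ * pin time and frees the bookkeeping arrays. Within this file it is
+ * reached from micscif_destroy_window() once the pinned-page reference
+ * count drops to zero.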
+ */ +int micscif_destroy_pinned_pages(struct scif_pinned_pages *pinned_pages) +{ + int j; + int writeable = pinned_pages->prot & SCIF_PROT_WRITE; + int kernel = SCIF_MAP_KERNEL & pinned_pages->map_flags; + + for (j = 0; j < pinned_pages->nr_pages; j++) { + if (pinned_pages->pages[j]) { + if (!kernel) { + if (writeable) + SetPageDirty(pinned_pages->pages[j]); +#ifdef RMA_DEBUG + BUG_ON(!page_count(pinned_pages->pages[j])); + BUG_ON(atomic_long_sub_return(1, &ms_info.rma_pin_cnt) < 0); +#endif + page_cache_release(pinned_pages->pages[j]); + } + } + } + +#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_) + scif_free(pinned_pages->vma, + pinned_pages->nr_pages * sizeof(*(pinned_pages->vma))); +#endif + scif_free(pinned_pages->pages, + pinned_pages->nr_pages * sizeof(*(pinned_pages->pages))); + scif_free(pinned_pages->num_pages, + pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages))); + scif_free(pinned_pages, sizeof(*pinned_pages)); + return 0; +} + +/* + * micscif_create_window: + * @ep: end point + * @pinned_pages: Set of pinned pages which wil back this window. + * @offset: offset hint + * + * Allocate and prepare a self registration window. + */ +struct reg_range_t *micscif_create_window(struct endpt *ep, + int64_t nr_pages, uint64_t offset, bool temp) +{ + struct reg_range_t *window; + + might_sleep(); + if (!(window = scif_zalloc(sizeof(struct reg_range_t)))) + goto error; + +#ifdef CONFIG_ML1OM + if (!temp) { + if (!(window->phys_addr = scif_zalloc(nr_pages * + sizeof(*(window->phys_addr))))) + goto error_free_window; + + if (!(window->temp_phys_addr = scif_zalloc(nr_pages * + sizeof(*(window->temp_phys_addr))))) + goto error_free_window; + } +#endif + + if (!(window->dma_addr = scif_zalloc(nr_pages * + sizeof(*(window->dma_addr))))) + goto error_free_window; + + if (!(window->num_pages = scif_zalloc(nr_pages * + sizeof(*(window->num_pages))))) + goto error_free_window; + + window->offset = offset; + window->ep = (uint64_t)ep; + window->magic = SCIFEP_MAGIC; + window->reg_state = OP_IDLE; + init_waitqueue_head(&window->regwq); + window->unreg_state = OP_IDLE; + init_waitqueue_head(&window->unregwq); + INIT_LIST_HEAD(&window->list_member); + window->type = RMA_WINDOW_SELF; + window->temp = temp; +#ifdef _MIC_SCIF_ + micscif_setup_proxy_dma(ep); +#endif + return window; + +error_free_window: + if (window->dma_addr) + scif_free(window->dma_addr, nr_pages * sizeof(*(window->dma_addr))); +#ifdef CONFIG_ML1OM + if (window->temp_phys_addr) + scif_free(window->temp_phys_addr, nr_pages * sizeof(*(window->temp_phys_addr))); + if (window->phys_addr) + scif_free(window->phys_addr, nr_pages * sizeof(*(window->phys_addr))); +#endif + scif_free(window, sizeof(*window)); +error: + return NULL; +} + +/** + * micscif_destroy_incomplete_window: + * @ep: end point + * @window: registration window + * + * Deallocate resources for self window. 
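+ *
+ * Intended for teardown paths where an allocation request may have gone
+ * out but registration never completed: wait for the outstanding
+ * SCIF_ALLOC_REQ to settle, send SCIF_FREE_VIRT so the peer can release
+ * its backing allocation if it had completed, then free only the local
+ * window metadata -- the pinned pages, if any, remain the caller's
+ * responsibility.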
+ */ +int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window) +{ + int err; + int64_t nr_pages = window->nr_pages; + struct allocmsg *alloc = &window->alloc_handle; + struct nodemsg msg; + + RMA_MAGIC(window); +retry: + err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + + if (OP_COMPLETED == alloc->state) { + msg.uop = SCIF_FREE_VIRT; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (uint64_t)window->alloc_handle.vaddr; + msg.payload[2] = (uint64_t)window; + msg.payload[3] = SCIF_REGISTER; + micscif_nodeqp_send(ep->remote_dev, &msg, ep); + } + + micscif_free_window_offset(ep, window->offset, + window->nr_pages << PAGE_SHIFT); + if (window->dma_addr) + scif_free(window->dma_addr, nr_pages * + sizeof(*(window->dma_addr))); + if (window->num_pages) + scif_free(window->num_pages, nr_pages * + sizeof(*(window->num_pages))); +#ifdef CONFIG_ML1OM + if (window->phys_addr) + scif_free(window->phys_addr, window->nr_pages * + sizeof(*(window->phys_addr))); + if (window->temp_phys_addr) + scif_free(window->temp_phys_addr, nr_pages * + sizeof(*(window->temp_phys_addr))); +#endif + scif_free(window, sizeof(*window)); + return 0; +} + +/** + * micscif_destroy_window: + * @ep: end point + * @window: registration window + * + * Deallocate resources for self window. + */ +int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window) +{ + int j; + struct scif_pinned_pages *pinned_pages = window->pinned_pages; + int64_t nr_pages = window->nr_pages; + + might_sleep(); + RMA_MAGIC(window); + if (!window->temp && window->mm) { + __scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0); + __scif_release_mm(window->mm); + window->mm = NULL; + } + + if (!window->offset_freed) + micscif_free_window_offset(ep, window->offset, + window->nr_pages << PAGE_SHIFT); + for (j = 0; j < window->nr_contig_chunks; j++) { + if (window->dma_addr[j]) { + unmap_from_aperture( + window->dma_addr[j], + ep->remote_dev, + window->num_pages[j] << PAGE_SHIFT); + } + } + + /* + * Decrement references for this set of pinned pages from + * this window. + */ + j = atomic_sub_return((int32_t)pinned_pages->nr_pages, + &pinned_pages->ref_count); + BUG_ON(j < 0); + /* + * If the ref count for pinned_pages is zero then someone + * has already called scif_unpin_pages() for it and we should + * destroy the page cache. + */ + if (!j) + micscif_destroy_pinned_pages(window->pinned_pages); + if (window->dma_addr) + scif_free(window->dma_addr, nr_pages * + sizeof(*(window->dma_addr))); + if (window->num_pages) + scif_free(window->num_pages, nr_pages * + sizeof(*(window->num_pages))); +#ifdef CONFIG_ML1OM + if (window->phys_addr) + scif_free(window->phys_addr, window->nr_pages * + sizeof(*(window->phys_addr))); + if (window->temp_phys_addr) + scif_free(window->temp_phys_addr, nr_pages * + sizeof(*(window->temp_phys_addr))); +#endif + window->magic = 0; + scif_free(window, sizeof(*window)); + return 0; +} + +/** + * micscif_create_remote_lookup: + * @ep: end point + * @window: remote window + * + * Allocate and prepare lookup entries for the remote + * end to copy over the physical addresses. + * Returns 0 on success and appropriate errno on failure. 
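+ *
+ * The lookup is sized in 2MB steps: nr_lookup = ALIGN(nr_pages * PAGE_SIZE,
+ * 2MB) >> 21, and each lookup slot maps one page worth of address entries
+ * (NR_PHYS_ADDR_IN_PAGE of them). Worked example, assuming 4KB pages and
+ * 8-byte dma_addr_t entries: a 4MB window has nr_pages = 1024, giving
+ * nr_lookup = 2, with each of the two mapped pages carrying 512 addresses.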
+ */ +int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window) +{ + int i, j, err = 0; + int64_t nr_pages = window->nr_pages; + bool vmalloc_dma_phys; +#ifdef CONFIG_ML1OM + bool vmalloc_temp_phys = false; + bool vmalloc_phys = false; +#endif + might_sleep(); + + /* Map window */ + err = map_virt_into_aperture(&window->mapped_offset, + window, ep->remote_dev, sizeof(*window)); + if (err) + goto error_window; + + /* Compute the number of lookup entries. 21 == 2MB Shift */ + window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE, + ((2) * 1024 * 1024)) >> 21; + + if (!(window->dma_addr_lookup.lookup = + scif_zalloc(window->nr_lookup * + sizeof(*(window->dma_addr_lookup.lookup))))) + goto error_window; + + /* Map DMA physical addess lookup array */ + err = map_virt_into_aperture(&window->dma_addr_lookup.offset, + window->dma_addr_lookup.lookup, ep->remote_dev, + window->nr_lookup * + sizeof(*window->dma_addr_lookup.lookup)); + if (err) + goto error_window; + + vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]); + +#ifdef CONFIG_ML1OM + if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) { + if (!(window->temp_phys_addr_lookup.lookup = + scif_zalloc(window->nr_lookup * + sizeof(*(window->temp_phys_addr_lookup.lookup))))) + goto error_window; + + /* Map physical addess lookup array */ + err = map_virt_into_aperture(&window->temp_phys_addr_lookup.offset, + window->temp_phys_addr_lookup.lookup, ep->remote_dev, + window->nr_lookup * + sizeof(*window->temp_phys_addr_lookup.lookup)); + if (err) + goto error_window; + + if (!(window->phys_addr_lookup.lookup = + scif_zalloc(window->nr_lookup * + sizeof(*(window->phys_addr_lookup.lookup))))) + goto error_window; + + /* Map physical addess lookup array */ + err = map_virt_into_aperture(&window->phys_addr_lookup.offset, + window->phys_addr_lookup.lookup, ep->remote_dev, + window->nr_lookup * + sizeof(*window->phys_addr_lookup.lookup)); + if (err) + goto error_window; + + vmalloc_phys = is_vmalloc_addr(&window->phys_addr[0]); + vmalloc_temp_phys = is_vmalloc_addr(&window->temp_phys_addr[0]); + } +#endif + + /* Now map each of the pages containing physical addresses */ + for (i = 0, j = 0; i < nr_pages; i += NR_PHYS_ADDR_IN_PAGE, j++) { +#ifdef CONFIG_ML1OM + if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) { + err = map_page_into_aperture( + &window->temp_phys_addr_lookup.lookup[j], + vmalloc_temp_phys ? + vmalloc_to_page(&window->temp_phys_addr[i]) : + virt_to_page(&window->temp_phys_addr[i]), + ep->remote_dev); + if (err) + goto error_window; + + err = map_page_into_aperture( + &window->phys_addr_lookup.lookup[j], + vmalloc_phys ? + vmalloc_to_page(&window->phys_addr[i]) : + virt_to_page(&window->phys_addr[i]), + ep->remote_dev); + if (err) + goto error_window; + } +#endif + err = map_page_into_aperture( + &window->dma_addr_lookup.lookup[j], + vmalloc_dma_phys ? + vmalloc_to_page(&window->dma_addr[i]) : + virt_to_page(&window->dma_addr[i]), + ep->remote_dev); + if (err) + goto error_window; + } + return 0; +error_window: + return err; +} + +/** + * micscif_destroy_remote_lookup: + * @ep: end point + * @window: remote window + * + * Destroy lookup entries used for the remote + * end to copy over the physical addresses. 
+ */ +void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window) +{ + int i, j; + + RMA_MAGIC(window); + if (window->nr_lookup) { + for (i = 0, j = 0; i < window->nr_pages; + i += NR_PHYS_ADDR_IN_PAGE, j++) { + if (window->dma_addr_lookup.lookup && + window->dma_addr_lookup.lookup[j]) { + unmap_from_aperture( + window->dma_addr_lookup.lookup[j], + ep->remote_dev, PAGE_SIZE); + } + } + if (window->dma_addr_lookup.offset) { + unmap_from_aperture( + window->dma_addr_lookup.offset, + ep->remote_dev, window->nr_lookup * + sizeof(*window->dma_addr_lookup.lookup)); + } + if (window->dma_addr_lookup.lookup) + scif_free(window->dma_addr_lookup.lookup, window->nr_lookup * + sizeof(*(window->dma_addr_lookup.lookup))); + if (window->mapped_offset) { + unmap_from_aperture(window->mapped_offset, + ep->remote_dev, sizeof(*window)); + } + window->nr_lookup = 0; + } +} + +/** + * micscif_create_remote_window: + * @ep: end point + * @nr_pages: number of pages in window + * + * Allocate and prepare a remote registration window. + */ +struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages) +{ + struct reg_range_t *window; + + might_sleep(); + if (!(window = scif_zalloc(sizeof(struct reg_range_t)))) + goto error_ret; + + window->magic = SCIFEP_MAGIC; + window->nr_pages = nr_pages; + +#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + if (!(window->page_ref_count = scif_zalloc(nr_pages * + sizeof(*(window->page_ref_count))))) + goto error_window; +#endif + + if (!(window->dma_addr = scif_zalloc(nr_pages * + sizeof(*(window->dma_addr))))) + goto error_window; + + if (!(window->num_pages = scif_zalloc(nr_pages * + sizeof(*(window->num_pages))))) + goto error_window; + +#ifdef CONFIG_ML1OM + if (!(window->phys_addr = scif_zalloc(nr_pages * + sizeof(*(window->phys_addr))))) + goto error_window; + + if (!(window->temp_phys_addr = scif_zalloc(nr_pages * + sizeof(*(window->temp_phys_addr))))) + goto error_window; +#endif + + if (micscif_create_remote_lookup(ep, window)) + goto error_window; + + window->ep = (uint64_t)ep; + window->type = RMA_WINDOW_PEER; + set_window_ref_count(window, nr_pages); + window->get_put_ref_count = 0; + window->unreg_state = OP_IDLE; +#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + window->gttmap_state = OP_IDLE; + init_waitqueue_head(&window->gttmapwq); +#endif +#ifdef _MIC_SCIF_ + micscif_setup_proxy_dma(ep); + window->proxy_dma_phys = ep->rma_info.proxy_dma_phys; +#endif + return window; +error_window: + micscif_destroy_remote_window(ep, window); +error_ret: + return NULL; +} + +/** + * micscif_destroy_remote_window: + * @ep: end point + * @window: remote registration window + * + * Deallocate resources for remote window. 
+ */ +void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window) +{ + RMA_MAGIC(window); + micscif_destroy_remote_lookup(ep, window); + if (window->dma_addr) + scif_free(window->dma_addr, window->nr_pages * + sizeof(*(window->dma_addr))); + if (window->num_pages) + scif_free(window->num_pages, window->nr_pages * + sizeof(*(window->num_pages))); +#ifdef CONFIG_ML1OM + if (window->phys_addr) + scif_free(window->phys_addr, window->nr_pages * + sizeof(*(window->phys_addr))); + if (window->temp_phys_addr) + scif_free(window->temp_phys_addr, window->nr_pages * + sizeof(*(window->temp_phys_addr))); +#endif + +#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + if (window->page_ref_count) + scif_free(window->page_ref_count, window->nr_pages * + sizeof(*(window->page_ref_count))); +#endif + window->magic = 0; + scif_free(window, sizeof(*window)); +} + +/** + * micscif_map_window_pages: + * @ep: end point + * @window: self registration window + * @tmp_wnd: is a temporary window? + * + * Map pages of a window into the aperture/PCI. + * Also compute physical addresses required for DMA. + */ +int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool tmp_wnd) +{ + int j, i, err = 0, nr_pages; + scif_pinned_pages_t pinned_pages; + + might_sleep(); + RMA_MAGIC(window); + + pinned_pages = window->pinned_pages; + for (j = 0, i = 0; j < window->nr_contig_chunks; j++, i += nr_pages) { + nr_pages = pinned_pages->num_pages[i]; +#ifdef _MIC_SCIF_ +#ifdef CONFIG_ML1OM + /* phys_addr[] holds addresses as seen from the remote node + * these addressed are then copied into the remote card's + * window structure + * when the remote node is the host and the card is knf + * these addresses are only created at the point of mapping + * the card physical address into gtt (for the KNC the + * the gtt code path returns the local address) + * when the remote node is loopback - the address remains + * the same + * when the remote node is a kn* - the base address of the local + * card as seen from the remote node is added in + */ + if (!tmp_wnd) { + if(ep->remote_dev != &scif_dev[SCIF_HOST_NODE]) { + if ((err = map_virt_into_aperture( + &window->temp_phys_addr[j], + phys_to_virt(page_to_phys(pinned_pages->pages[i])), + ep->remote_dev, + nr_pages << PAGE_SHIFT))) { + int k,l; + + for (l = k = 0; k < i; l++) { + nr_pages = pinned_pages->num_pages[k]; + window->temp_phys_addr[l] + &= ~RMA_HUGE_NR_PAGE_MASK; + unmap_from_aperture( + window->temp_phys_addr[l], + ep->remote_dev, + nr_pages << PAGE_SHIFT); + k += nr_pages; + window->temp_phys_addr[l] = 0; + } + return err; + } + if (!tmp_wnd) + RMA_SET_NR_PAGES(window->temp_phys_addr[j], nr_pages); + } + } +#endif + window->dma_addr[j] = + page_to_phys(pinned_pages->pages[i]); + if (!tmp_wnd) + RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages); +#else + err = map_virt_into_aperture(&window->dma_addr[j], + phys_to_virt(page_to_phys(pinned_pages->pages[i])), + ep->remote_dev, nr_pages << PAGE_SHIFT); + if (err) + return err; + if (!tmp_wnd) + RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages); +#endif + window->num_pages[j] = nr_pages; + } + return err; +} + + +/** + * micscif_unregister_window: + * @window: self registration window + * + * Send an unregistration request and wait for a response. 
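+ *
+ * unreg_state acts as a small state machine: from OP_IDLE a SCIF_UNREGISTER
+ * message is sent and the state moves to OP_IN_PROGRESS; OP_IN_PROGRESS
+ * waits on unregwq, retrying the timeout while the peer is alive;
+ * OP_FAILED is promoted to OP_COMPLETED once the peer is known to be dead;
+ * and on OP_COMPLETED the remaining window references are dropped and, at
+ * zero, the window is queued for cleanup.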
+ */ +int micscif_unregister_window(struct reg_range_t *window) +{ + int err = 0; + struct endpt *ep = (struct endpt *)window->ep; + bool send_msg = false; + + might_sleep(); + BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock)); + + switch (window->unreg_state) { + case OP_IDLE: + { + window->unreg_state = OP_IN_PROGRESS; + send_msg = true; + /* fall through */ + } + case OP_IN_PROGRESS: + { + get_window_ref_count(window, 1); + mutex_unlock(&ep->rma_info.rma_lock); + if (send_msg && (err = micscif_send_scif_unregister(ep, window))) { + window->unreg_state = OP_COMPLETED; + goto done; + } +retry: + err = wait_event_timeout(window->unregwq, + window->unreg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) { + err = -ENODEV; + window->unreg_state = OP_COMPLETED; + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + } + if (err > 0) + err = 0; +done: + mutex_lock(&ep->rma_info.rma_lock); + put_window_ref_count(window, 1); + break; + } + case OP_FAILED: + { + if (!scifdev_alive(ep)) { + err = -ENODEV; + window->unreg_state = OP_COMPLETED; + } + break; + } + case OP_COMPLETED: + break; + default: + /* Invalid opcode? */ + BUG_ON(1); + } + + if (OP_COMPLETED == window->unreg_state && + window->ref_count) + put_window_ref_count(window, window->nr_pages); + + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages); + list_del(&window->list_member); + micscif_free_window_offset(ep, window->offset, + window->nr_pages << PAGE_SHIFT); + window->offset_freed = true; + mutex_unlock(&ep->rma_info.rma_lock); + if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL)) + && scifdev_alive(ep)) { + drain_dma_intr(ep->rma_info.dma_chan); + } else { + if (!__scif_dec_pinned_vm_lock(window->mm, + window->nr_pages, 1)) { + __scif_release_mm(window->mm); + window->mm = NULL; + } + } + micscif_queue_for_cleanup(window, &ms_info.mi_rma); + mutex_lock(&ep->rma_info.rma_lock); + } + return err; +} + +/** + * micscif_send_alloc_request: + * @ep: end point + * @window: self registration window + * + * Send a remote window allocation request + */ +int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window) +{ + struct nodemsg msg; + struct allocmsg *alloc = &window->alloc_handle; + + /* Set up the Alloc Handle */ + alloc->uop = SCIF_REGISTER; + alloc->state = OP_IN_PROGRESS; + init_waitqueue_head(&alloc->allocwq); + + /* Send out an allocation request */ + msg.uop = SCIF_ALLOC_REQ; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->nr_pages; + msg.payload[2] = (uint64_t)&window->alloc_handle; + msg.payload[3] = SCIF_REGISTER; + return micscif_nodeqp_send(ep->remote_dev, &msg, ep); +} + +/** + * micscif_prep_remote_window: + * @ep: end point + * @window: self registration window + * + * Send a remote window allocation request, wait for an allocation response, + * prepare the remote window and notify the peer to unmap it once done. 
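+ *
+ * Rough sequence implemented below: map the self window's pages, wait on
+ * allocwq for the peer's reply to the earlier SCIF_ALLOC_REQ, ioremap the
+ * peer's window descriptor and its lookup pages, copy the DMA (and, for
+ * KNF P2P, the temporary physical) address arrays across in chunks of
+ * NR_PHYS_ADDR_IN_PAGE entries, then fill in the peer window's offset,
+ * protection and chunk count before unmapping everything again.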
+ */ +int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window) +{ + struct nodemsg msg; + struct reg_range_t *remote_window; + struct allocmsg *alloc = &window->alloc_handle; + dma_addr_t *dma_phys_lookup, *tmp; + int i = 0, j = 0; + int nr_contig_chunks, loop_nr_contig_chunks, remaining_nr_contig_chunks, nr_lookup; +#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + dma_addr_t *phys_lookup = 0; +#endif + int err, map_err; + + nr_contig_chunks = remaining_nr_contig_chunks = (int)window->nr_contig_chunks; + + if ((map_err = micscif_map_window_pages(ep, window, false))) { + printk(KERN_ERR "%s %d map_err %d\n", __func__, __LINE__, map_err); + } +retry: + /* Now wait for the response */ + err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + + if (!err) + err = -ENODEV; + + if (err > 0) + err = 0; + else + return err; + + /* Bail out. The remote end rejected this request */ + if (OP_FAILED == alloc->state) + return -ENOMEM; + + if (map_err) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, map_err); + msg.uop = SCIF_FREE_VIRT; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (uint64_t)window->alloc_handle.vaddr; + msg.payload[2] = (uint64_t)window; + msg.payload[3] = SCIF_REGISTER; + if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + err = -ENOTCONN; + else + err = map_err; + return err; + } + + + remote_window = scif_ioremap(alloc->phys_addr, + sizeof(*window), ep->remote_dev); + + RMA_MAGIC(remote_window); + + /* Compute the number of lookup entries. 21 == 2MB Shift */ + nr_lookup = ALIGN(nr_contig_chunks * PAGE_SIZE, ((2) * 1024 * 1024)) >> 21; +#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + if (is_p2p_scifdev(ep->remote_dev)) + phys_lookup = scif_ioremap(remote_window->temp_phys_addr_lookup.offset, + nr_lookup * + sizeof(*remote_window->temp_phys_addr_lookup.lookup), + ep->remote_dev); +#endif + + dma_phys_lookup = scif_ioremap(remote_window->dma_addr_lookup.offset, + nr_lookup * + sizeof(*remote_window->dma_addr_lookup.lookup), + ep->remote_dev); + + while (remaining_nr_contig_chunks) { + loop_nr_contig_chunks = min(remaining_nr_contig_chunks, (int)NR_PHYS_ADDR_IN_PAGE); + /* #1/2 - Copy physical addresses over to the remote side */ + +#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + /* If the remote dev is self or is any node except the host + * its OK to copy the bus address to the remote window + * in the case of the host (for KNF only) the bus address + * is generated at the time of mmap(..) into card memory + * and does not exist at this time + */ + /* Note: + * the phys_addr[] holds MIC address for remote cards + * -> GTT offset for the host (KNF) + * -> local address for the host (KNC) + * -> local address for loopback + * this is done in map_window_pages(..) 
except for GTT + * offset for KNF + */ + if (is_p2p_scifdev(ep->remote_dev)) { + tmp = scif_ioremap(phys_lookup[j], + loop_nr_contig_chunks * sizeof(*window->temp_phys_addr), + ep->remote_dev); + memcpy_toio(tmp, &window->temp_phys_addr[i], + loop_nr_contig_chunks * sizeof(*window->temp_phys_addr)); + serializing_request(tmp); + smp_mb(); + scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev); + } +#endif + /* #2/2 - Copy DMA addresses (addresses that are fed into the DMA engine) + * We transfer bus addresses which are then converted into a MIC physical + * address on the remote side if it is a MIC, if the remote node is a host + * we transfer the MIC physical address + */ + tmp = scif_ioremap( + dma_phys_lookup[j], + loop_nr_contig_chunks * sizeof(*window->dma_addr), + ep->remote_dev); +#ifdef _MIC_SCIF_ + if (is_p2p_scifdev(ep->remote_dev)) { + /* knf: + * send the address as mapped through the GTT (the remote node's + * base address for this node is already added in) + * knc: + * add remote node's base address for this node to convert it + * into a MIC address + */ + int m; + dma_addr_t dma_addr; + for (m = 0; m < loop_nr_contig_chunks; m++) { +#ifdef CONFIG_ML1OM + dma_addr = window->temp_phys_addr[i + m]; +#else + dma_addr = window->dma_addr[i + m] + + ep->remote_dev->sd_base_addr; +#endif + writeq(dma_addr, &tmp[m]); + } + } else + /* Host node or loopback - transfer DMA addresses as is, this is + * the same as a MIC physical address (we use the dma_addr + * and not the phys_addr array since the phys_addr is only setup + * if there is a mmap() request from the host) + */ + memcpy_toio(tmp, &window->dma_addr[i], + loop_nr_contig_chunks * sizeof(*window->dma_addr)); +#else + /* Transfer the physical address array - this is the MIC address + * as seen by the card + */ + memcpy_toio(tmp, &window->dma_addr[i], + loop_nr_contig_chunks * sizeof(*window->dma_addr)); +#endif + remaining_nr_contig_chunks -= loop_nr_contig_chunks; + i += loop_nr_contig_chunks; + j++; + serializing_request(tmp); + smp_mb(); + scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev); + } + + /* Prepare the remote window for the peer */ + remote_window->peer_window = (uint64_t)window; + remote_window->offset = window->offset; + remote_window->prot = window->prot; + remote_window->nr_contig_chunks = nr_contig_chunks; +#ifdef _MIC_SCIF_ + if (!ep->rma_info.proxy_dma_peer_phys) + ep->rma_info.proxy_dma_peer_phys = remote_window->proxy_dma_phys; +#endif +#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + if (is_p2p_scifdev(ep->remote_dev)) + scif_iounmap(phys_lookup, + nr_lookup * + sizeof(*remote_window->temp_phys_addr_lookup.lookup), + ep->remote_dev); +#endif + scif_iounmap(dma_phys_lookup, + nr_lookup * + sizeof(*remote_window->dma_addr_lookup.lookup), + ep->remote_dev); + scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev); + window->peer_window = (uint64_t)alloc->vaddr; + return err; +} + +/** + * micscif_send_scif_register: + * @ep: end point + * @window: self registration window + * + * Send a SCIF_REGISTER message if EP is connected and wait for a + * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT + * message so that the peer can free its remote window allocated earlier. 
+ */ +int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window) +{ + int err = 0; + struct nodemsg msg; + + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (uint64_t)window->alloc_handle.vaddr; + msg.payload[2] = (uint64_t)window; + if (SCIFEP_CONNECTED == ep->state) { + msg.uop = SCIF_REGISTER; + window->reg_state = OP_IN_PROGRESS; + if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) { + micscif_set_nr_pages(ep->remote_dev, window); +retry: + err = wait_event_timeout(window->regwq, + window->reg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (OP_FAILED == window->reg_state) + err = -ENOTCONN; + } else { + micscif_set_nr_pages(ep->remote_dev, window); + } + } else { + msg.uop = SCIF_FREE_VIRT; + msg.payload[3] = SCIF_REGISTER; + if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + err = -ENOTCONN; + micscif_set_nr_pages(ep->remote_dev, window); + } + return err; +} + +/** + * micscif_send_scif_unregister: + * @ep: end point + * @window: self registration window + * + * Send a SCIF_UNREGISTER message. + */ +int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window) +{ + struct nodemsg msg; + + RMA_MAGIC(window); + msg.uop = SCIF_UNREGISTER; + msg.src = ep->port; + msg.payload[0] = (uint64_t)window->alloc_handle.vaddr; + msg.payload[1] = (uint64_t)window; + return micscif_nodeqp_send(ep->remote_dev, &msg, ep); +} + +/** + * micscif_get_window_offset: + * @epd: end point descriptor + * @flags: flags + * @offset: offset hint + * @len: length of range + * @out_offset: computed offset returned by reference. + * + * Compute/Claim a new offset for this EP. The callee is supposed to grab + * the RMA mutex before calling this API. + */ +int micscif_get_window_offset(struct endpt *ep, int flags, + uint64_t offset, size_t len, uint64_t *out_offset) +{ + uint64_t computed_offset; + int err = 0; + + might_sleep(); + mutex_lock(&ep->rma_info.va_lock); + if (flags & SCIF_MAP_FIXED) { + computed_offset = va_gen_claim(&ep->rma_info.va_gen, + (uint64_t)offset, len); + if (INVALID_VA_GEN_ADDRESS == computed_offset) + err = -EADDRINUSE; + } else { + computed_offset = va_gen_alloc(&ep->rma_info.va_gen, + len, PAGE_SIZE); + if (INVALID_VA_GEN_ADDRESS == computed_offset) + err = -ENOMEM; + } + *out_offset = computed_offset; + mutex_unlock(&ep->rma_info.va_lock); + return err; +} + +/** + * micscif_free_window_offset: + * @offset: offset hint + * @len: length of range + * + * Free offset for this EP. The callee is supposed to grab + * the RMA mutex before calling this API. + */ +void micscif_free_window_offset(struct endpt *ep, + uint64_t offset, size_t len) +{ + mutex_lock(&ep->rma_info.va_lock); + va_gen_free(&ep->rma_info.va_gen, offset, len); + mutex_unlock(&ep->rma_info.va_lock); +} + +/** + * scif_register_temp: + * @epd: End Point Descriptor. + * @addr: virtual address to/from which to copy + * @len: length of range to copy + * @out_offset: computed offset returned by reference. + * @out_window: allocated registered window returned by reference. + * + * Create a temporary registered window. The peer will not know about this + * window. This API is used for scif_vreadfrom()/scif_vwriteto() API's. 
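+ *
+ * The registration itself is page granular; the sub-page part of @addr is
+ * folded back into the returned offset. Illustrative arithmetic (values
+ * made up): for addr = 0x7f2a10003234 and len = 0x100, aligned_len is one
+ * page, the window is created at a page aligned out_offset, and the caller
+ * gets back out_offset | 0x234.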
+ */ +static int +micscif_register_temp(scif_epd_t epd, void *addr, size_t len, int prot, + off_t *out_offset, struct reg_range_t **out_window) +{ + struct endpt *ep = (struct endpt *)epd; + int err; + scif_pinned_pages_t pinned_pages; + size_t aligned_len; + + aligned_len = ALIGN(len, PAGE_SIZE); + + if ((err = __scif_pin_pages((void *)((uint64_t)addr & + PAGE_MASK), + aligned_len, &prot, 0, &pinned_pages))) + return err; + + pinned_pages->prot = prot; + + /* Compute the offset for this registration */ + if ((err = micscif_get_window_offset(ep, 0, 0, + aligned_len, (uint64_t *)out_offset))) + goto error_unpin; + + /* Allocate and prepare self registration window */ + if (!(*out_window = micscif_create_window(ep, aligned_len >> PAGE_SHIFT, + *out_offset, true))) { + micscif_free_window_offset(ep, *out_offset, aligned_len); + err = -ENOMEM; + goto error_unpin; + } + + (*out_window)->pinned_pages = pinned_pages; + (*out_window)->nr_pages = pinned_pages->nr_pages; + (*out_window)->nr_contig_chunks = pinned_pages->nr_contig_chunks; + (*out_window)->prot = pinned_pages->prot; + + (*out_window)->va_for_temp = (void*)((uint64_t)addr & PAGE_MASK); + if ((err = micscif_map_window_pages(ep, *out_window, true))) { + /* Something went wrong! Rollback */ + micscif_destroy_window(ep, *out_window); + *out_window = NULL; + } else + *out_offset |= ((uint64_t)addr & ~PAGE_MASK); + + return err; +error_unpin: + if (err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + scif_unpin_pages(pinned_pages); + return err; +} + +/** + * micscif_rma_completion_cb: + * @data: RMA cookie + * + * RMA interrupt completion callback. + */ +void micscif_rma_completion_cb(uint64_t data) +{ + struct dma_completion_cb *comp_cb = (struct dma_completion_cb *)data; +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + + /* Free DMA Completion CB. */ + if (comp_cb && comp_cb->temp_buf) { + if (comp_cb->dst_window) { + micscif_rma_local_cpu_copy(comp_cb->dst_offset, + comp_cb->dst_window, comp_cb->temp_buf + comp_cb->header_padding, + comp_cb->len, false); + } +#ifndef _MIC_SCIF_ + micscif_pci_dev(comp_cb->remote_node, &pdev); + mic_ctx_unmap_single(get_per_dev_ctx(comp_cb->remote_node - 1), + comp_cb->temp_phys, KMEM_UNALIGNED_BUF_SIZE); +#endif + if (comp_cb->is_cache) + micscif_kmem_cache_free(comp_cb->temp_buf_to_free); + else + kfree(comp_cb->temp_buf_to_free); + } + kfree(comp_cb); +} + +static void __micscif_rma_destroy_tcw_ep(struct endpt *ep); +static +bool micscif_rma_tc_can_cache(struct endpt *ep, size_t cur_bytes) +{ + if ((cur_bytes >> PAGE_SHIFT) > ms_info.mi_rma_tc_limit) + return false; + if ((atomic_read(&ep->rma_info.tcw_total_pages) + + (cur_bytes >> PAGE_SHIFT)) > + ms_info.mi_rma_tc_limit) { + printk(KERN_ALERT "%s %d total=%d, current=%zu reached max\n", + __func__, __LINE__, + atomic_read(&ep->rma_info.tcw_total_pages), + (1 + (cur_bytes >> PAGE_SHIFT))); + micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc); + __micscif_rma_destroy_tcw_ep(ep); + } + return true; +} + +/** + * micscif_rma_copy: + * @epd: end point descriptor. + * @loffset: offset in local registered address space to/from which to copy + * @addr: user virtual address to/from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to/from which to copy + * @flags: flags + * @dir: LOCAL->REMOTE or vice versa. + * + * Validate parameters, check if src/dst registered ranges requested for copy + * are valid and initiate either CPU or DMA copy. 
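+ *
+ * Illustrative call (a sketch, not lifted from the actual callers): a
+ * synchronous read of @len bytes from a remote registered range into a
+ * local registered range would look roughly like
+ *
+ *	err = micscif_rma_copy(epd, loffset, NULL, len, roffset,
+ *			SCIF_RMA_SYNC, REMOTE_TO_LOCAL, true);
+ *
+ * whereas passing a non-NULL @addr (the scif_vreadfrom()/scif_vwriteto()
+ * case) makes the routine pin and register a temporary local window
+ * around @addr first.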
+ */ +int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len, + off_t roffset, int flags, enum rma_direction dir, bool last_chunk) +{ + struct endpt *ep = (struct endpt *)epd; + struct micscif_rma_req remote_req; + struct micscif_rma_req req; + struct reg_range_t *window = NULL; + struct reg_range_t *remote_window = NULL; + struct mic_copy_work copy_work; + bool loopback; + int err = 0; + struct dma_channel *chan; + struct rma_mmu_notifier *mmn = NULL; + bool insert_window = false; + bool cache = false; + + if ((err = verify_epd(ep))) + return err; + + if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | SCIF_RMA_SYNC | SCIF_RMA_ORDERED))) + return -EINVAL; + + if (!len) + return -EINVAL; + loopback = is_self_scifdev(ep->remote_dev) ? true : false; + copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? DO_DMA_POLLING : 0; + copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk); + +#ifdef CONFIG_MMU_NOTIFIER + if (!mic_reg_cache_enable) + flags &= ~SCIF_RMA_USECACHE; +#else + flags &= ~SCIF_RMA_USECACHE; +#endif +#ifndef _MIC_SCIF_ +#ifdef CONFIG_ML1OM + /* Use DMA Copies even if CPU copy is requested on KNF MIC from Host */ + if (flags & SCIF_RMA_USECPU) { + flags &= ~SCIF_RMA_USECPU; + if (last_chunk) + copy_work.fence_type = DO_DMA_POLLING; + } +#endif + /* Use CPU for Host<->Host Copies */ + if (loopback) { + flags |= SCIF_RMA_USECPU; + copy_work.fence_type = 0x0; + } +#endif + + cache = flags & SCIF_RMA_USECACHE; + + /* Trying to wrap around */ + if ((loffset && (loffset + (off_t)len < loffset)) || + (roffset + (off_t)len < roffset)) + return -EINVAL; + + remote_req.out_window = &remote_window; + remote_req.offset = roffset; + remote_req.nr_bytes = len; + /* + * If transfer is from local to remote then the remote window + * must be writeable and vice versa. + */ + remote_req.prot = LOCAL_TO_REMOTE == dir ? VM_WRITE : VM_READ; + remote_req.type = WINDOW_PARTIAL; + remote_req.head = &ep->rma_info.remote_reg_list; + +#ifdef CONFIG_MMU_NOTIFIER + if (addr && cache) { + mutex_lock(&ep->rma_info.mmn_lock); + mmn = find_mmu_notifier(current->mm, &ep->rma_info); + if (!mmn) { + mmn = kzalloc(sizeof(*mmn), GFP_KERNEL); + if (!mmn) { + mutex_unlock(&ep->rma_info.mmn_lock); + return -ENOMEM; + } + init_mmu_notifier(mmn, current->mm, ep); + if (mmu_notifier_register(&mmn->ep_mmu_notifier, current->mm)) { + mutex_unlock(&ep->rma_info.mmn_lock); + kfree(mmn); + return -EBUSY; + } +#ifdef RMA_DEBUG + atomic_long_add_return(1, &ms_info.mmu_notif_cnt); +#endif + list_add(&mmn->list_member, &ep->rma_info.mmn_list); + } + mutex_unlock(&ep->rma_info.mmn_lock); + } +#endif + + micscif_inc_node_refcnt(ep->remote_dev, 1); +#ifdef _MIC_SCIF_ + if (!(flags & SCIF_RMA_USECPU)) { + /* + * Proxy the DMA only for P2P reads with transfer size + * greater than proxy DMA threshold. scif_vreadfrom(..) + * and scif_vwriteto(..) is not supported since the peer + * does not have the page lists required to perform the + * proxy DMA. 
+ */ + if (ep->remote_dev->sd_proxy_dma_reads && + !addr && dir == REMOTE_TO_LOCAL && + ep->rma_info.proxy_dma_va && + len >= ms_info.mi_proxy_dma_threshold) { + copy_work.len = len; + copy_work.src_offset = roffset; + copy_work.dst_offset = loffset; + /* Fall through if there were errors */ + if (!(err = micscif_proxy_dma(epd, ©_work))) + goto error; + } + } +#endif + mutex_lock(&ep->rma_info.rma_lock); + if (addr) { + req.out_window = &window; + req.nr_bytes = ALIGN(len + ((uint64_t)addr & ~PAGE_MASK), PAGE_SIZE); + if (mmn) + req.head = &mmn->tc_reg_list; + req.va_for_temp = (void*)((uint64_t)addr & PAGE_MASK); + req.prot = (LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE | VM_READ); + /* Does a valid local window exist? */ + + pr_debug("%s %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n", + __func__, __LINE__, req.va_for_temp, addr, req.nr_bytes, len); + spin_lock(&ep->rma_info.tc_lock); + if (!mmn || (err = micscif_query_tcw(ep, &req))) { + pr_debug("%s %d err %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n", + __func__, __LINE__, err, req.va_for_temp, addr, req.nr_bytes, len); + spin_unlock(&ep->rma_info.tc_lock); + mutex_unlock(&ep->rma_info.rma_lock); + if (cache) + if (!micscif_rma_tc_can_cache(ep, req.nr_bytes)) + cache = false; + if ((err = micscif_register_temp(epd, req.va_for_temp, req.nr_bytes, + req.prot, + &loffset, &window))) { + goto error; + } + mutex_lock(&ep->rma_info.rma_lock); + pr_debug("New temp window created addr %p\n", addr); + if (cache) { + atomic_inc(&ep->rma_info.tcw_refcount); + atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tcw_total_pages); + if (mmn) { + spin_lock(&ep->rma_info.tc_lock); + micscif_insert_tcw(window, &mmn->tc_reg_list); + spin_unlock(&ep->rma_info.tc_lock); + } + } + insert_window = true; + } else { + spin_unlock(&ep->rma_info.tc_lock); + pr_debug("window found for addr %p\n", addr); + BUG_ON(window->va_for_temp > addr); + } + loffset = window->offset + ((uint64_t)addr - (uint64_t)window->va_for_temp); + pr_debug("%s %d addr %p loffset 0x%lx window->nr_pages 0x%llx" + " window->va_for_temp %p\n", __func__, __LINE__, + addr, loffset, window->nr_pages, window->va_for_temp); + RMA_MAGIC(window); + } + + /* Does a valid remote window exist? */ + if ((err = micscif_query_window(&remote_req))) { + pr_debug("%s %d err %d roffset 0x%lx len 0x%lx\n", + __func__, __LINE__, err, roffset, len); + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + RMA_MAGIC(remote_window); + if (!addr) { + req.out_window = &window; + req.offset = loffset; + /* + * If transfer is from local to remote then the self window + * must be readable and vice versa. + */ + req.prot = LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE; + req.nr_bytes = len; + req.type = WINDOW_PARTIAL; + req.head = &ep->rma_info.reg_list; + /* Does a valid local window exist? */ + if ((err = micscif_query_window(&req))) { + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + RMA_MAGIC(window); + } + + /* + * Preprare copy_work for submitting work to the DMA kernel thread + * or CPU copy routine. 
+ */ + copy_work.len = len; + copy_work.loopback = loopback; + copy_work.remote_dev = ep->remote_dev; + copy_work.dma_chan_released = false; + if (LOCAL_TO_REMOTE == dir) { + copy_work.src_offset = loffset; + copy_work.src_window = window; + copy_work.dst_offset = roffset; + copy_work.dst_window = remote_window; + } else { + copy_work.src_offset = roffset; + copy_work.src_window = remote_window; + copy_work.dst_offset = loffset; + copy_work.dst_window = window; + } + + if (!(flags & SCIF_RMA_USECPU)) { + chan = ep->rma_info.dma_chan; + if ((err = request_dma_channel(chan))) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + err = micscif_rma_list_dma_copy_wrapper(epd, ©_work, + chan, loffset); + if (!copy_work.dma_chan_released) + free_dma_channel(chan); + } + if (flags & SCIF_RMA_USECPU) { + /* Initiate synchronous CPU copy */ + micscif_rma_list_cpu_copy(©_work); + } + if (insert_window && !cache) { + atomic_inc(&ep->rma_info.tw_refcount); + atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages); + } + + mutex_unlock(&ep->rma_info.rma_lock); + + if (last_chunk) { + if (DO_DMA_POLLING == copy_work.fence_type) + err = drain_dma_poll(ep->rma_info.dma_chan); + else if (DO_DMA_INTR == copy_work.fence_type) + err = drain_dma_intr(ep->rma_info.dma_chan); + } + + micscif_dec_node_refcnt(ep->remote_dev, 1); + if (insert_window && !cache) + micscif_queue_for_cleanup(window, &ms_info.mi_rma); + return err; +error: + if (err) { + if (addr && window && !cache) + micscif_destroy_window(ep, window); + printk(KERN_ERR "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len); + } + micscif_dec_node_refcnt(ep->remote_dev, 1); + return err; +} + +/** + * micscif_send_fence_mark: + * @epd: end point descriptor. + * @out_mark: Output DMA mark reported by peer. + * + * Send a remote fence mark request. + */ +int micscif_send_fence_mark(scif_epd_t epd, int *out_mark) +{ + int err; + struct nodemsg msg; + struct fence_info *fence_req; + struct endpt *ep = (struct endpt *)epd; + + if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_waitqueue_head(&fence_req->wq); + + msg.src = ep->port; + msg.uop = SCIF_MARK; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (uint64_t)fence_req; + + if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + goto error; + +retry: + err = wait_event_timeout(fence_req->wq, + (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (err < 0) { + mutex_lock(&ep->rma_info.rma_lock); + if (OP_IN_PROGRESS == fence_req->state) + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + } + if (OP_COMPLETED == fence_req->state) + *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark; + + if (OP_FAILED == fence_req->state && !err) + err = -ENOMEM; + mutex_lock(&ep->rma_info.rma_lock); + mutex_unlock(&ep->rma_info.rma_lock); + kfree(fence_req); +error: + return err; +} + +/** + * micscif_send_fence_wait: + * @epd: end point descriptor. + * @mark: DMA mark to wait for. + * + * Send a remote fence wait request. 
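+ *
+ * Sketch of the intended pairing with micscif_send_fence_mark()
+ * (illustrative only; the real callers live outside this file and are
+ * assumed to strip the SCIF_REMOTE_FENCE tag from the returned mark, as
+ * shown):
+ *
+ *	int mark = 0, err;
+ *	err = micscif_send_fence_mark(epd, &mark);
+ *	if (!err)
+ *		err = micscif_send_fence_wait(epd, mark & ~SCIF_REMOTE_FENCE);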
+ */ +int micscif_send_fence_wait(scif_epd_t epd, int mark) +{ + int err; + struct nodemsg msg; + struct fence_info *fence_req; + struct endpt *ep = (struct endpt *)epd; + + if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_waitqueue_head(&fence_req->wq); + + msg.src = ep->port; + msg.uop = SCIF_WAIT; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (uint64_t)fence_req; + msg.payload[2] = mark; + + if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + goto error; +retry: + err = wait_event_timeout(fence_req->wq, + (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (err < 0) { + mutex_lock(&ep->rma_info.rma_lock); + if (OP_IN_PROGRESS == fence_req->state) + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + } + if (OP_FAILED == fence_req->state && !err) + err = -ENOMEM; + mutex_lock(&ep->rma_info.rma_lock); + mutex_unlock(&ep->rma_info.rma_lock); + kfree(fence_req); +error: + return err; +} + +/** + * micscif_send_fence_signal: + * @epd - endpoint descriptor + * @loff - local offset + * @lval - local value to write to loffset + * @roff - remote offset + * @rval - remote value to write to roffset + * @flags - flags + * + * Sends a remote fence signal request + */ +int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval, + off_t loff, uint64_t lval, int flags) +{ + int err = 0; + struct nodemsg msg; + struct fence_info *fence_req; + struct endpt *ep = (struct endpt *)epd; + + if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_waitqueue_head(&fence_req->wq); + + msg.src = ep->port; + if (flags & SCIF_SIGNAL_LOCAL) { + msg.uop = SCIF_SIG_LOCAL; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = roff; + msg.payload[2] = rval; + msg.payload[3] = (uint64_t)fence_req; + if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + goto error_free; +retry1: + err = wait_event_timeout(fence_req->wq, + (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry1; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (err < 0) { + mutex_lock(&ep->rma_info.rma_lock); + if (OP_IN_PROGRESS == fence_req->state) + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + } + if (OP_FAILED == fence_req->state && !err) { + err = -ENXIO; + goto error_free; + } + } + fence_req->state = OP_IN_PROGRESS; + + if (flags & SCIF_SIGNAL_REMOTE) { + msg.uop = SCIF_SIG_REMOTE; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = loff; + msg.payload[2] = lval; + msg.payload[3] = (uint64_t)fence_req; + if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) + goto error_free; +retry2: + err = wait_event_timeout(fence_req->wq, + (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry2; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (err < 0) { + mutex_lock(&ep->rma_info.rma_lock); + if (OP_IN_PROGRESS == fence_req->state) + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + } + if (OP_FAILED == fence_req->state && !err) { + err = -ENXIO; + goto error_free; + } + } +error_free: + mutex_lock(&ep->rma_info.rma_lock); + mutex_unlock(&ep->rma_info.rma_lock); + kfree(fence_req); +error: + return err; +} + +/* + * micscif_fence_mark: + * + 
* @epd - endpoint descriptor + * Set up a mark for this endpoint and return the value of the mark. + */ +int micscif_fence_mark(scif_epd_t epd) +{ + int mark = 0; + struct endpt *ep = (struct endpt *)epd; + struct dma_channel *chan = ep->rma_info.dma_chan; + + if ((mark = request_dma_channel(chan))) + goto error; + + mark = program_dma_mark(chan); + + free_dma_channel(chan); +error: + return mark; +} + +/** + * micscif_rma_destroy_temp_windows: + * + * This routine destroys temporary registered windows created + * by scif_vreadfrom() and scif_vwriteto(). + */ +void micscif_rma_destroy_temp_windows(void) +{ + struct list_head *item, *tmp; + struct reg_range_t *window; + struct endpt *ep; + struct dma_channel *chan; + might_sleep(); +restart: + spin_lock(&ms_info.mi_rmalock); + list_for_each_safe(item, tmp, &ms_info.mi_rma) { + window = list_entry(item, + struct reg_range_t, list_member); + ep = (struct endpt *)window->ep; + chan = ep->rma_info.dma_chan; + + list_del(&window->list_member); + spin_unlock(&ms_info.mi_rmalock); + micscif_inc_node_refcnt(ep->remote_dev, 1); + if (!chan || + !scifdev_alive(ep) || + (!is_current_dma_mark(chan, window->dma_mark) && + is_dma_mark_processed(chan, window->dma_mark)) || + !drain_dma_intr(chan)) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + /* Remove window from global list */ + window->unreg_state = OP_COMPLETED; + } else { + micscif_dec_node_refcnt(ep->remote_dev, 1); + /* DMA engine hung ?? */ + printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d " + "window->dma_mark 0x%x channel_mark 0x%x\n", + __func__, __LINE__, get_chan_num(chan), + ep->sd_state, window->dma_mark, get_dma_mark(chan)); + WARN_ON(1); + micscif_queue_for_cleanup(window, &ms_info.mi_rma); + goto restart; + } + + if (OP_COMPLETED == window->unreg_state) { + BUG_ON(atomic_sub_return((int32_t)window->nr_pages, + &ep->rma_info.tw_total_pages) < 0); + if (RMA_WINDOW_SELF == window->type) + micscif_destroy_window(ep, window); + else + micscif_destroy_remote_window(ep, window); + BUG_ON(atomic_dec_return( + &ep->rma_info.tw_refcount) < 0); + } + goto restart; + } + spin_unlock(&ms_info.mi_rmalock); +} + +/** + * micscif_rma_destroy_tcw: + * + * This routine destroys temporary registered windows created + * by scif_vreadfrom() and scif_vwriteto(). 
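For context, a minimal user-side sketch, not part of this patch: scif_vreadfrom() on a buffer that was never scif_register()ed makes the driver pin the pages into a temporary cached window, which the cleanup routines above later tear down. A connected endpoint epd and a peer window registered at roffset are assumed; SCIF_RMA_USECACHE is the rma_flags bit from scif.h that asks the driver to keep the temporary window cached.

#include <stdio.h>
#include <stdlib.h>
#include <scif.h>

/* Read len bytes from the peer window at roffset into a plain malloc()ed
 * buffer; the driver services this through a temporary cached window. */
static int read_via_temp_window(scif_epd_t epd, off_t roffset, size_t len)
{
    void *buf = malloc(len);
    int err;

    if (!buf)
        return -1;
    err = scif_vreadfrom(epd, buf, len, roffset, SCIF_RMA_USECACHE);
    if (err < 0)
        perror("scif_vreadfrom");
    free(buf);
    return err;
}
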
+ */ +static +void __micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn, + struct endpt *ep, bool inrange, + uint64_t start, uint64_t len) +{ + struct list_head *item, *tmp; + struct reg_range_t *window; + uint64_t start_va, end_va; + uint64_t end = start + len; + list_for_each_safe(item, tmp, &mmn->tc_reg_list) { + window = list_entry(item, + struct reg_range_t, list_member); + ep = (struct endpt *)window->ep; + if (inrange) { + if (0 == len) + break; + start_va = (uint64_t)window->va_for_temp; + end_va = start_va+ (window->nr_pages << PAGE_SHIFT); + if (start < start_va) { + if (end <= start_va) { + break; + } else { + } + + } else { + if (start >= end_va) { + continue; + } else { + } + } + } + __micscif_rma_destroy_tcw_helper(window); + } +} + +static inline +void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn, + struct endpt *ep, bool inrange, + uint64_t start, uint64_t len) +{ + unsigned long sflags; + + spin_lock_irqsave(&ep->rma_info.tc_lock, sflags); + __micscif_rma_destroy_tcw(mmn, ep, inrange, start, len); + spin_unlock_irqrestore(&ep->rma_info.tc_lock, sflags); +} + +static void __micscif_rma_destroy_tcw_ep(struct endpt *ep) +{ + struct list_head *item, *tmp; + struct rma_mmu_notifier *mmn; + spin_lock(&ep->rma_info.tc_lock); + list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { + mmn = list_entry(item, + struct rma_mmu_notifier, list_member); + __micscif_rma_destroy_tcw(mmn, ep, false, 0, 0); + } + spin_unlock(&ep->rma_info.tc_lock); +} + +void micscif_rma_destroy_tcw_ep(struct endpt *ep) +{ + struct list_head *item, *tmp; + struct rma_mmu_notifier *mmn; + list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { + mmn = list_entry(item, + struct rma_mmu_notifier, list_member); + micscif_rma_destroy_tcw(mmn, ep, false, 0, 0); + } +} + +/** + * micscif_rma_destroy_tcw: + * + * This routine destroys temporary registered windows created + * by scif_vreadfrom() and scif_vwriteto(). + */ +void micscif_rma_destroy_tcw_invalid(struct list_head *list) +{ + struct list_head *item, *tmp; + struct reg_range_t *window; + struct endpt *ep; + struct dma_channel *chan; + might_sleep(); +restart: + spin_lock(&ms_info.mi_rmalock); + list_for_each_safe(item, tmp, list) { + window = list_entry(item, + struct reg_range_t, list_member); + ep = (struct endpt *)window->ep; + chan = ep->rma_info.dma_chan; + list_del(&window->list_member); + spin_unlock(&ms_info.mi_rmalock); + micscif_inc_node_refcnt(ep->remote_dev, 1); + mutex_lock(&ep->rma_info.rma_lock); + if (!chan || + !scifdev_alive(ep) || + (!is_current_dma_mark(chan, window->dma_mark) && + is_dma_mark_processed(chan, window->dma_mark)) || + !drain_dma_intr(chan)) { + micscif_dec_node_refcnt(ep->remote_dev, 1); + BUG_ON(atomic_sub_return((int32_t)window->nr_pages, + &ep->rma_info.tcw_total_pages) < 0); + micscif_destroy_window(ep, window); + BUG_ON(atomic_dec_return( + &ep->rma_info.tcw_refcount) < 0); + } else { + /* DMA engine hung ?? */ + printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d " + "window->dma_mark 0x%x channel_mark 0x%x\n", + __func__, __LINE__, get_chan_num(chan), + ep->sd_state, window->dma_mark, get_dma_mark(chan)); + WARN_ON(1); + mutex_unlock(&ep->rma_info.rma_lock); + micscif_dec_node_refcnt(ep->remote_dev, 1); + micscif_queue_for_cleanup(window, &ms_info.mi_rma); + goto restart; + } + mutex_unlock(&ep->rma_info.rma_lock); + goto restart; + } + spin_unlock(&ms_info.mi_rmalock); +} + +/** + * micscif_rma_handle_remote_fences: + * + * This routine services remote fence requests. 
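As a small illustration, assuming only the SCIF_REMOTE_FENCE flag bit already used by the code in this file: a remotely initiated fence carries its DMA mark with that bit set, so the handler below can distinguish it from a locally generated mark and strip it before waiting.

/* Sender side: the mark travels in msg.payload[2] with the flag bit set. */
static inline int ex_tag_remote_mark(int mark)
{
    return mark | SCIF_REMOTE_FENCE;
}

/* Receiver side: the flag bit is cleared before calling dma_mark_wait(). */
static inline int ex_untag_remote_mark(int tagged)
{
    return tagged & ~SCIF_REMOTE_FENCE;
}
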
+ */ +void micscif_rma_handle_remote_fences(void) +{ + struct list_head *item, *tmp; + struct remote_fence_info *fence; + struct endpt *ep; + int mark; + + might_sleep(); + mutex_lock(&ms_info.mi_fencelock); + list_for_each_safe(item, tmp, &ms_info.mi_fence) { + fence = list_entry(item, + struct remote_fence_info, list_member); + /* Remove fence from global list */ + list_del(&fence->list_member); + + /* Initiate the fence operation */ + ep = (struct endpt *)fence->msg.payload[0]; + mark = (int)fence->msg.payload[2]; + BUG_ON(!(mark & SCIF_REMOTE_FENCE)); + if (dma_mark_wait(ep->rma_info.dma_chan, + mark & ~SCIF_REMOTE_FENCE, false)) { + printk(KERN_ERR "%s %d err\n", __func__, __LINE__); + fence->msg.uop = SCIF_WAIT_NACK; + } else { + fence->msg.uop = SCIF_WAIT_ACK; + } + micscif_inc_node_refcnt(ep->remote_dev, 1); + fence->msg.payload[0] = ep->remote_ep; + /* No error handling for Notification messages. */ + micscif_nodeqp_send(ep->remote_dev, &fence->msg, ep); + micscif_dec_node_refcnt(ep->remote_dev, 1); + kfree(fence); + /* + * Decrement ref count and wake up + * any thread blocked in the EP close routine waiting + * for all such remote fence requests to complete. + */ + ep->rma_info.fence_refcount--; + wake_up(&ep->rma_info.fence_wq); + } + mutex_unlock(&ms_info.mi_fencelock); +} + +#ifdef CONFIG_MMU_NOTIFIER +void micscif_mmu_notif_handler(struct work_struct *work) +{ + struct list_head *pos, *tmpq; + struct endpt *ep; +restart: + micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc); + spin_lock(&ms_info.mi_rmalock); + list_for_each_safe(pos, tmpq, &ms_info.mi_mmu_notif_cleanup) { + ep = list_entry(pos, struct endpt, mmu_list); + list_del(&ep->mmu_list); + spin_unlock(&ms_info.mi_rmalock); + BUG_ON(list_empty(&ep->rma_info.mmn_list)); + + micscif_rma_destroy_tcw_ep(ep); + ep_unregister_mmu_notifier(ep); + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); + goto restart; + } + spin_unlock(&ms_info.mi_rmalock); +} +#endif + +/** + * micscif_reserve_dma_chan: + * @ep: Endpoint Descriptor. + * + * This routine reserves a DMA channel for a particular + * endpoint. All DMA transfers for an endpoint are always + * programmed on the same DMA channel. + */ +int micscif_reserve_dma_chan(struct endpt *ep) +{ + int err = 0; +#ifndef _MIC_SCIF_ + /* + * Host Loopback cannot use DMA by design and hence + * reserving DMA channels is a nop. + */ + if (is_self_scifdev(ep->remote_dev)) + return 0; +#endif + mutex_lock(&ep->rma_info.rma_lock); + if (!ep->rma_info.dma_chan) { + struct dma_channel **chan = &ep->rma_info.dma_chan; + unsigned long ts = jiffies; +#ifndef _MIC_SCIF_ + mic_ctx_t *mic_ctx = + get_per_dev_ctx(ep->remote_dev->sd_node - 1); + BUG_ON(!ep->remote_dev->sd_node); +#endif + while (true) { + if (!(err = allocate_dma_channel((struct mic_dma_ctx_t *) +#ifdef _MIC_SCIF_ + mic_dma_handle, +#else + mic_ctx->dma_handle, +#endif + chan))) + break; + schedule(); + if (time_after(jiffies, + ts + NODE_ALIVE_TIMEOUT)) { + err = -EBUSY; + goto error; + } + } + mic_dma_thread_free_chan(*chan); + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +/* + * micscif_prog_signal: + * @epd - Endpoint Descriptor + * @offset - registered address + * @val - Value to be programmed in SUD. + * @type - Type of the window. + * + * Program a status update descriptor adter ensuring that the offset + * provided is indeed valid. 
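For orientation, a hedged caller-side sketch: scif_fence_signal() is the user-visible entry point whose local and remote writes are ultimately programmed by micscif_prog_signal() below. The SCIF_FENCE_INIT_SELF flag name is taken from scif.h and is an assumption here; epd, loff and roff are assumed to be a connected endpoint and offsets registered with SCIF_PROT_WRITE.

#include <scif.h>

/* Once DMAs initiated on this endpoint complete, write 1 into the local
 * window at loff and 1 into the peer's window at roff. */
static int signal_both_sides(scif_epd_t epd, off_t loff, off_t roff)
{
    return scif_fence_signal(epd, loff, 1 /* lval */, roff, 1 /* rval */,
                             SCIF_FENCE_INIT_SELF |
                             SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE);
}
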
+ */ +int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val, + enum rma_window_type type) +{ + struct endpt *ep = (struct endpt *)epd; + struct dma_channel *chan = ep->rma_info.dma_chan; + struct reg_range_t *window = NULL; + struct micscif_rma_req req; + int err; + dma_addr_t phys; + + mutex_lock(&ep->rma_info.rma_lock); + req.out_window = &window; + req.offset = offset; + req.nr_bytes = sizeof(uint64_t); + req.prot = SCIF_PROT_WRITE; + req.type = WINDOW_SINGLE; + if (RMA_WINDOW_SELF == type) + req.head = &ep->rma_info.reg_list; + else + req.head = &ep->rma_info.remote_reg_list; + /* Does a valid window exist? */ + if ((err = micscif_query_window(&req))) { + printk(KERN_ERR "%s %d err %d\n", + __func__, __LINE__, err); + goto unlock_ret; + } + RMA_MAGIC(window); + +#ifndef _MIC_SCIF_ + if (unlikely(is_self_scifdev(ep->remote_dev))) { + void *dst_virt; + if (RMA_WINDOW_SELF == type) + dst_virt = get_local_va(offset, window, + sizeof(uint32_t)); + else { + struct page **pages = ((struct reg_range_t *) + (window->peer_window))->pinned_pages->pages; + int page_nr = (int) ( (offset - window->offset) >> PAGE_SHIFT ); + off_t page_off = offset & ~PAGE_MASK; + dst_virt = (void *)((uint64_t)phys_to_virt(page_to_phys( + pages[page_nr])) | page_off); + } + *(uint64_t*)dst_virt = val; + goto unlock_ret; + } +#endif + phys = micscif_get_dma_addr(window, offset, NULL, NULL, NULL); + if ((err = request_dma_channel(chan))) + goto unlock_ret; + err = do_status_update(chan, phys, val); + free_dma_channel(chan); +unlock_ret: + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +/* + * __micscif_kill_apps_with_mmaps: + * @ep - The SCIF endpoint + * + * Kill the applications which have valid remote memory mappings + * created via scif_mmap(..). + */ +static void __micscif_kill_apps_with_mmaps(struct endpt *ep) +{ + struct list_head *item; + struct rma_task_info *info; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.task_list) { + info = list_entry(item, struct rma_task_info, list_member); + kill_pid(info->pid, SIGKILL, 1); + pr_debug("%s ep %p pid %p ref %d\n", + __func__, ep, info->pid, info->ref_count); + } + spin_unlock(&ep->lock); +} + +/* + * _micscif_kill_apps_with_mmaps: + * @node - remote node id. + * @head - head of the list of endpoints to kill. + * + * Traverse the list of endpoints for a particular remote node and + * kill applications with valid remote memory mappings. + */ +static void _micscif_kill_apps_with_mmaps(int node, struct list_head *head) +{ + struct endpt *ep; + unsigned long sflags; + struct list_head *item; + + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(item, head) { + ep = list_entry(item, struct endpt, list); + if (ep->remote_dev->sd_node == node) + __micscif_kill_apps_with_mmaps(ep); + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); +} + +/* + * micscif_kill_apps_with_mmaps: + * @node - remote node id. + * + * Wrapper for killing applications with valid remote memory mappings + * for a particular node. This API is called by peer nodes as part of + * handling a lost node. + */ +void micscif_kill_apps_with_mmaps(int node) +{ + _micscif_kill_apps_with_mmaps(node, &ms_info.mi_connected); + _micscif_kill_apps_with_mmaps(node, &ms_info.mi_disconnected); +} + +/* + * micscif_query_apps_with_mmaps: + * @node - remote node id. + * @head - head of the list of endpoints to query. + * + * Query if any applications for a remote node have valid remote memory + * mappings. 
+ */ +static bool micscif_query_apps_with_mmaps(int node, struct list_head *head) +{ + struct endpt *ep; + unsigned long sflags; + struct list_head *item; + bool ret = false; + + spin_lock_irqsave(&ms_info.mi_connlock, sflags); + list_for_each(item, head) { + ep = list_entry(item, struct endpt, list); + if (ep->remote_dev->sd_node == node && + !list_empty(&ep->rma_info.task_list)) { + ret = true; + break; + } + } + spin_unlock_irqrestore(&ms_info.mi_connlock, sflags); + return ret; +} + +/* + * micscif_rma_do_apps_have_mmaps: + * @node - remote node id. + * + * Wrapper for querying if any applications have remote memory mappings + * for a particular node. + */ +bool micscif_rma_do_apps_have_mmaps(int node) +{ + return (micscif_query_apps_with_mmaps(node, &ms_info.mi_connected) || + micscif_query_apps_with_mmaps(node, &ms_info.mi_disconnected)); +} + +/* + * __micscif_cleanup_rma_for_zombies: + * @ep - The SCIF endpoint + * + * This API is only called while handling a lost node: + * a) Remote node is dead. + * b) All endpoints with remote memory mappings have been killed. + * So we can traverse the remote_reg_list without any locks. Since + * the window has not yet been unregistered we can drop the ref count + * and queue it to the cleanup thread. + */ +static void __micscif_cleanup_rma_for_zombies(struct endpt *ep) +{ + struct list_head *pos, *tmp; + struct reg_range_t *window; + + list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(pos, struct reg_range_t, list_member); + /* If unregistration is complete then why is it on the list? */ + WARN_ON(window->unreg_state == OP_COMPLETED); + if (window->ref_count) + put_window_ref_count(window, window->nr_pages); + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + atomic_add_return((int32_t)window->nr_pages, + &ep->rma_info.tw_total_pages); + list_del(&window->list_member); + micscif_queue_for_cleanup(window, &ms_info.mi_rma); + } + } +} + +/* + * micscif_cleanup_rma_for_zombies: + * @node - remote node id. + * + * Cleanup remote registration lists for zombie endpoints. + */ +void micscif_cleanup_rma_for_zombies(int node) +{ + struct endpt *ep; + unsigned long sflags; + struct list_head *item; + + spin_lock_irqsave(&ms_info.mi_eplock, sflags); + list_for_each(item, &ms_info.mi_zombie) { + ep = list_entry(item, struct endpt, list); + if (ep->remote_dev && ep->remote_dev->sd_node == node) { + /* + * If the zombie endpoint remote node matches the lost + * node then the scifdev should not be alive. + */ + WARN_ON(scifdev_alive(ep)); + __micscif_cleanup_rma_for_zombies(ep); + } + } + spin_unlock_irqrestore(&ms_info.mi_eplock, sflags); +} + +/* + * micscif_rma_get_task: + * + * Store the parent task struct and bump up the number of remote mappings. + * If this is the first remote memory mapping for this endpoint then + * create a new rma_task_info entry in the epd task list. + */ +int micscif_rma_get_task(struct endpt *ep, int nr_pages) +{ + struct list_head *item; + struct rma_task_info *info; + int err = 0; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.task_list) { + info = list_entry(item, struct rma_task_info, list_member); + if (info->pid == task_tgid(current)) { + info->ref_count += nr_pages; + pr_debug("%s ep %p existing pid %p ref %d\n", + __func__, ep, info->pid, info->ref_count); + goto unlock; + } + } + spin_unlock(&ep->lock); + + /* A new task is mapping this window. 
Create a new entry */ + if (!(info = kzalloc(sizeof(*info), GFP_KERNEL))) { + err = -ENOMEM; + goto done; + } + + info->pid = get_pid(task_tgid(current)); + info->ref_count = nr_pages; + pr_debug("%s ep %p new pid %p ref %d\n", + __func__, ep, info->pid, info->ref_count); + spin_lock(&ep->lock); + list_add_tail(&info->list_member, &ep->rma_info.task_list); +unlock: + spin_unlock(&ep->lock); +done: + return err; +} + +/* + * micscif_rma_put_task: + * + * Bump down the number of remote mappings. if the ref count for this + * particular task drops to zero then remove the rma_task_info from + * the epd task list. + */ +void micscif_rma_put_task(struct endpt *ep, int nr_pages) +{ + struct list_head *item; + struct rma_task_info *info; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.task_list) { + info = list_entry(item, struct rma_task_info, list_member); + if (info->pid == task_tgid(current)) { + info->ref_count -= nr_pages; + pr_debug("%s ep %p pid %p ref %d\n", + __func__, ep, info->pid, info->ref_count); + if (!info->ref_count) { + list_del(&info->list_member); + put_pid(info->pid); + kfree(info); + } + goto done; + } + } + /* Why was the task not found? This is a bug. */ + WARN_ON(1); +done: + spin_unlock(&ep->lock); + return; +} + +/* Only debug API's below */ +void micscif_display_window(struct reg_range_t *window, const char *s, int line) +{ + int j; + + printk("%s %d window %p type %d temp %d offset 0x%llx" + " nr_pages 0x%llx nr_contig_chunks 0x%llx" + " prot %d ref_count %d magic 0x%llx peer_window 0x%llx" + " unreg_state 0x%x va_for_temp %p\n", + s, line, window, window->type, window->temp, + window->offset, window->nr_pages, window->nr_contig_chunks, + window->prot, window->ref_count, window->magic, + window->peer_window, window->unreg_state, window->va_for_temp); + + for (j = 0; j < window->nr_contig_chunks; j++) + pr_debug("page[%d] = dma_addr 0x%llx num_pages 0x%x\n", + j, + window->dma_addr[j], + window->num_pages[j]); + + if (RMA_WINDOW_SELF == window->type && window->pinned_pages) + for (j = 0; j < window->nr_pages; j++) + pr_debug("page[%d] = pinned_pages %p address %p\n", + j, window->pinned_pages->pages[j], + page_address(window->pinned_pages->pages[j])); + +#ifdef CONFIG_ML1OM + if (window->temp_phys_addr) + for (j = 0; j < window->nr_contig_chunks; j++) + pr_debug("page[%d] = temp_phys_addr 0x%llx\n", + j, window->temp_phys_addr[j]); + if (window->phys_addr) + for (j = 0; j < window->nr_pages; j++) + pr_debug("page[%d] = phys_addr 0x%llx\n", + j, window->phys_addr[j]); +#endif + RMA_MAGIC(window); +} diff --git a/micscif/micscif_rma_dma.c b/micscif/micscif_rma_dma.c new file mode 100644 index 0000000..9fafc4c --- /dev/null +++ b/micscif/micscif_rma_dma.c @@ -0,0 +1,982 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. 
Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic/micscif.h" +#include "mic/micscif_smpt.h" +#include "mic/mic_dma_api.h" +#include "mic/micscif_kmem_cache.h" +#include "mic/micscif_rma.h" +#include "mic/micscif_rma_list.h" +#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT) +#include +#endif +#include +#ifndef _MIC_SCIF_ +#include "mic_common.h" +#endif + +static __always_inline +void *get_local_va(off_t off, struct reg_range_t *window, size_t len) +{ + uint64_t page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + void *va; + + if (RMA_WINDOW_SELF == window->type) { + struct page **pages = window->pinned_pages->pages; + va = (void *)((uint64_t) + (page_address(pages[page_nr])) | page_off); + } else { + dma_addr_t phys = micscif_get_dma_addr(window, off, NULL, NULL, NULL); +#ifdef CONFIG_ML1OM + if (RMA_ERROR_CODE == phys) + return NULL; +#endif + va = (void *)((uint64_t) (phys_to_virt(phys))); + } + return va; +} + +#ifdef _MIC_SCIF_ +static __always_inline +void *ioremap_remote(off_t off, struct reg_range_t *window, + size_t len, bool loopback, struct micscif_dev *dev, int *index, uint64_t *start_off) +{ + void *ret; + dma_addr_t phys = micscif_get_dma_addr(window, off, NULL, index, start_off); + +#ifdef CONFIG_ML1OM + if (RMA_ERROR_CODE == phys) + return NULL; +#endif + if (!loopback) + ret = ioremap_nocache(phys, len); + else + ret = (void *)((uint64_t)phys_to_virt(phys)); + return ret; +} + +static __always_inline +void *ioremap_remote_gtt(off_t off, struct reg_range_t *window, + size_t len, bool loopback, struct micscif_dev *dev, int ch_num, struct mic_copy_work *work) +{ + return ioremap_remote(off, window, len, loopback, dev, NULL, NULL); +} +#else +static __always_inline +void *ioremap_remote_gtt(off_t off, struct reg_range_t *window, + size_t len, bool loopback, struct micscif_dev *dev, int ch_num, struct mic_copy_work *work) +{ + void *ret; + uint64_t page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + if (!loopback) { + dma_addr_t phys = micscif_get_dma_addr(window, off, NULL, NULL, NULL); + /* Ideally there should be a helper to do the +/-1 */ + ret = get_per_dev_ctx(dev->sd_node - 1)->aper.va + phys; + } else { + struct page **pages = ((struct reg_range_t *) + (window->peer_window))->pinned_pages->pages; + ret = (void *)((uint64_t)phys_to_virt(page_to_phys(pages[page_nr])) + | page_off); + } + return ret; +} + +static __always_inline +void *ioremap_remote(off_t off, struct reg_range_t *window, + size_t len, bool 
loopback, struct micscif_dev *dev, int *index, uint64_t *start_off) +{ + void *ret; + int page_nr = (int)((off - window->offset) >> PAGE_SHIFT); + off_t page_off = off & ~PAGE_MASK; + + if (!loopback) { + dma_addr_t phys; + mic_ctx_t *mic_ctx = get_per_dev_ctx(dev->sd_node - 1); + phys = micscif_get_dma_addr(window, off, NULL, index, start_off); + ret = mic_ctx->aper.va + phys; + } else { + struct page **pages = ((struct reg_range_t *) + (window->peer_window))->pinned_pages->pages; + ret = (void *)((uint64_t)phys_to_virt(page_to_phys(pages[page_nr])) + | page_off); + } + return ret; +} +#endif + +static __always_inline void +iounmap_remote(void *virt, size_t size, struct mic_copy_work *work) +{ +#ifdef _MIC_SCIF_ + if (!work->loopback) + iounmap(virt); +#endif +} + +/* + * Takes care of ordering issue caused by + * 1. Hardware: Only in the case of cpu copy from host to card because of WC memory. + * 2. Software: If memcpy reorders copy instructions for optimization. This could happen + * at both host and card. + */ +static inline void ordered_memcpy(volatile char *dst, + const char *src, size_t count) +{ + if (!count) + return; + + memcpy_toio(dst, src, --count); + wmb(); + *(dst + count) = *(src + count); +} + +static inline void micscif_unaligned_memcpy(volatile char *dst, + const char *src, size_t count, bool ordered) +{ + if (unlikely(ordered)) + ordered_memcpy(dst, src, count); + else + memcpy_toio(dst, src, count); +} + +/* + * Copy between rma window and temporary buffer + */ +void micscif_rma_local_cpu_copy(uint64_t offset, struct reg_range_t *window, uint8_t *temp, size_t remaining_len, bool to_temp) +{ + void *window_virt; + size_t loop_len; + int offset_in_page; + uint64_t end_offset; + struct list_head *item; + + BUG_ON(RMA_WINDOW_SELF != window->type); + + offset_in_page = offset & ~PAGE_MASK; + loop_len = PAGE_SIZE - offset_in_page; + + if (remaining_len < loop_len) + loop_len = remaining_len; + + if (!(window_virt = get_local_va(offset, window, loop_len))) + return; + if (to_temp) + memcpy(temp, window_virt, loop_len); + else + memcpy(window_virt, temp, loop_len); + + offset += loop_len; + temp += loop_len; + remaining_len -= loop_len; + + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + while (remaining_len) { + if (offset == end_offset) { + item = ( + &window->list_member)->next; + window = list_entry(item, + struct reg_range_t, + list_member); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + + loop_len = min(PAGE_SIZE, remaining_len); + + if (!(window_virt = get_local_va(offset, window, loop_len))) + return; + + if (to_temp) + memcpy(temp, window_virt, loop_len); + else + memcpy(window_virt, temp, loop_len); + + offset += loop_len; + temp += loop_len; + remaining_len -= loop_len; + } +} + +/* + * Comment this + * + */ +static int micscif_rma_list_dma_copy_unaligned(struct mic_copy_work *work, uint8_t *temp, struct dma_channel *chan, bool src_local) +{ + struct dma_completion_cb *comp_cb = work->comp_cb; + dma_addr_t window_dma_addr, temp_dma_addr; +#ifndef _MIC_SCIF_ + dma_addr_t temp_phys = comp_cb->temp_phys; +#endif + size_t loop_len, nr_contig_bytes = 0, remaining_len = work->len; + int offset_in_page; + uint64_t end_offset = 0, offset = 0; + struct reg_range_t *window = NULL; + struct list_head *item = NULL; + int ret = 0; + void *window_virt_addr = NULL; + size_t tail_len = 0; + + if (src_local) { + offset = work->dst_offset; + window = work->dst_window; + } else { + offset = work->src_offset; + window = work->src_window; 
+ } + + offset_in_page = offset & (L1_CACHE_BYTES - 1); + if (offset_in_page) { + loop_len = L1_CACHE_BYTES - offset_in_page; + loop_len = min(loop_len, remaining_len); + + if (!(window_virt_addr = ioremap_remote_gtt(offset, window, loop_len, + work->loopback, work->remote_dev, + get_chan_num(chan), work))) + return -ENOMEM; + + if (src_local) { + micscif_unaligned_memcpy(window_virt_addr, temp, loop_len, work->ordered && + !(remaining_len - loop_len)); + serializing_request(window_virt_addr); + } else { + memcpy_fromio(temp, window_virt_addr, loop_len); + serializing_request(temp); + } +#ifdef RMA_DEBUG + atomic_long_add_return(loop_len, &ms_info.rma_unaligned_cpu_cnt); +#endif + smp_mb(); + iounmap_remote(window_virt_addr, loop_len, work); + + offset += loop_len; + temp += loop_len; +#ifndef _MIC_SCIF_ + temp_phys += loop_len; +#endif + remaining_len -= loop_len; + } + + offset_in_page = offset & ~PAGE_MASK; + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + + tail_len = remaining_len & (L1_CACHE_BYTES - 1); + remaining_len -= tail_len; + while (remaining_len) { + if (offset == end_offset) { + item = (&window->list_member)->next; + window = list_entry(item, + struct reg_range_t, + list_member); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } +#ifndef _MIC_SCIF_ + temp_dma_addr = temp_phys; +#else + temp_dma_addr = (dma_addr_t)virt_to_phys(temp); +#endif + window_dma_addr = micscif_get_dma_addr(window, offset, &nr_contig_bytes, NULL, NULL); + +#ifdef CONFIG_ML1OM + if (RMA_ERROR_CODE == window_dma_addr) + return -ENXIO; +#endif + loop_len = min(nr_contig_bytes, remaining_len); + + if (src_local) { + if (unlikely(work->ordered && !tail_len && + !(remaining_len - loop_len) && + loop_len != L1_CACHE_BYTES)) { + /* + * Break up the last chunk of the transfer into two steps + * if there is no tail to gurantee DMA ordering. + * Passing DO_DMA_POLLING inserts a status update descriptor + * in step 1 which acts as a double sided synchronization + * fence for the DMA engine to ensure that the last cache line + * in step 2 is updated last. + */ + /* Step 1) DMA: Body Length - L1_CACHE_BYTES. 
*/ + ret = do_dma(chan, DO_DMA_POLLING, temp_dma_addr, window_dma_addr, + loop_len - L1_CACHE_BYTES, NULL); + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + offset += (loop_len - L1_CACHE_BYTES); + temp_dma_addr += (loop_len - L1_CACHE_BYTES); + window_dma_addr += (loop_len - L1_CACHE_BYTES); + remaining_len -= (loop_len - L1_CACHE_BYTES); + loop_len = remaining_len; + + /* Step 2) DMA: L1_CACHE_BYTES */ + ret = do_dma(chan, 0, temp_dma_addr, window_dma_addr, + loop_len, NULL); + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + } else { + int flags = 0; + if (remaining_len == loop_len + L1_CACHE_BYTES) + flags = DO_DMA_POLLING; + ret = do_dma(chan, flags, temp_dma_addr, window_dma_addr, + loop_len, NULL); + } + } else { + ret = do_dma(chan, 0, window_dma_addr, temp_dma_addr, + loop_len, NULL); + } + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + offset += loop_len; + temp += loop_len; +#ifndef _MIC_SCIF_ + temp_phys += loop_len; +#endif + remaining_len -= loop_len; + offset_in_page = 0; + } + if (tail_len) { + if (offset == end_offset) { + item = (&window->list_member)->next; + window = list_entry(item, + struct reg_range_t, + list_member); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + if (!(window_virt_addr = ioremap_remote_gtt(offset, window, tail_len, + work->loopback, work->remote_dev, + get_chan_num(chan), work))) + return -ENOMEM; + + /* + * The CPU copy for the tail bytes must be initiated only once previous + * DMA transfers for this endpoint have completed to guarantee + * ordering. + */ + if (unlikely(work->ordered)) { + free_dma_channel(chan); + work->dma_chan_released = true; + if ((ret = drain_dma_intr(chan))) + return ret; + } + + if (src_local) { + micscif_unaligned_memcpy(window_virt_addr, temp, tail_len, work->ordered); + serializing_request(window_virt_addr); + } else { + memcpy_fromio(temp, window_virt_addr, tail_len); + serializing_request(temp); + } +#ifdef RMA_DEBUG + atomic_long_add_return(tail_len, &ms_info.rma_unaligned_cpu_cnt); +#endif + smp_mb(); + iounmap_remote(window_virt_addr, tail_len, work); + } + if (work->dma_chan_released) { + if ((ret = request_dma_channel(chan))) + return ret; + /* Callee frees the DMA channel lock, if it is held */ + work->dma_chan_released = false; + } + ret = do_dma(chan, DO_DMA_INTR, 0, 0, 0, comp_cb); + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + return 0; +} + +static inline bool is_local_dma_addr(uint64_t addr) +{ +#ifdef _MIC_SCIF_ + return (addr >> PAGE_SHIFT < num_physpages); +#else + return is_syspa(addr); +#endif +} + +/* + * micscif_rma_list_dma_copy_aligned: + * + * Traverse all the windows and perform DMA copy. 
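A standalone sketch of the split performed below, with EX_CACHE_BYTES standing in for L1_CACHE_BYTES and not part of the driver: when source and destination share the same cache-line misalignment, the transfer is broken into a CPU-copied head up to the next cache line, a DMA body of whole cache lines, and a CPU-copied tail.

#define EX_CACHE_BYTES 64UL   /* stand-in for L1_CACHE_BYTES */

static void ex_split_copy(unsigned long offset, unsigned long len,
                          unsigned long *head, unsigned long *body,
                          unsigned long *tail)
{
    unsigned long mis = offset & (EX_CACHE_BYTES - 1);

    *head = mis ? EX_CACHE_BYTES - mis : 0;   /* CPU copy to reach alignment */
    if (*head > len)
        *head = len;
    len -= *head;
    *tail = len & (EX_CACHE_BYTES - 1);       /* CPU copy after the last full line */
    *body = len - *tail;                      /* whole cache lines, done by DMA */
}
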
+ */ +static int micscif_rma_list_dma_copy_aligned(struct mic_copy_work *work, struct dma_channel *chan) +{ + dma_addr_t src_dma_addr, dst_dma_addr; + size_t loop_len, remaining_len, tail_len, src_contig_bytes = 0, dst_contig_bytes = 0; + int src_cache_off, dst_cache_off, src_last_index = 0, dst_last_index = 0; + uint64_t end_src_offset, end_dst_offset; + void *src_virt, *dst_virt; + struct reg_range_t *src_window = work->src_window; + struct reg_range_t *dst_window = work->dst_window; + uint64_t src_offset = work->src_offset, dst_offset = work->dst_offset; + uint64_t src_start_offset = src_window->offset, dst_start_offset = dst_window->offset; + struct list_head *item; + int ret = 0; + + remaining_len = work->len; + + src_cache_off = src_offset & (L1_CACHE_BYTES - 1); + dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1); + if (src_cache_off != dst_cache_off) { + BUG_ON(1); + } else if (src_cache_off != 0) { + /* Head */ + loop_len = L1_CACHE_BYTES - src_cache_off; + loop_len = min(loop_len, remaining_len); + src_dma_addr = micscif_get_dma_addr(src_window, src_offset, NULL, NULL, NULL); + dst_dma_addr = micscif_get_dma_addr(dst_window, dst_offset, NULL, NULL, NULL); +#ifdef CONFIG_ML1OM + if (RMA_ERROR_CODE == src_dma_addr) + return -ENXIO; + if (RMA_ERROR_CODE == dst_dma_addr) + return -ENXIO; + get_window_ref_count(src_window, 1); + get_window_ref_count(dst_window, 1); +#endif + if (is_local_dma_addr(src_dma_addr)) + src_virt = get_local_va(src_offset, src_window, loop_len); + else + src_virt = ioremap_remote_gtt(src_offset, src_window, + loop_len, work->loopback, + work->remote_dev, get_chan_num(chan), work); + if (!src_virt) { +#ifdef CONFIG_ML1OM + put_window_ref_count(src_window, 1); + put_window_ref_count(dst_window, 1); +#endif + return -ENOMEM; + } + if (is_local_dma_addr(dst_dma_addr)) + dst_virt = get_local_va(dst_offset, dst_window, loop_len); + else + dst_virt = ioremap_remote_gtt(dst_offset, dst_window, + loop_len, work->loopback, + work->remote_dev, get_chan_num(chan), work); +#ifdef CONFIG_ML1OM + put_window_ref_count(src_window, 1); + put_window_ref_count(dst_window, 1); +#endif + if (!dst_virt) { + if (!is_local_dma_addr(src_dma_addr)) + iounmap_remote(src_virt, loop_len, work); + return -ENOMEM; + } + if (is_local_dma_addr(src_dma_addr)){ + micscif_unaligned_memcpy(dst_virt, src_virt, loop_len, + remaining_len == loop_len ? 
work->ordered : false); + } + else{ + memcpy_fromio(dst_virt, src_virt, loop_len); + } + serializing_request(dst_virt); + smp_mb(); + if (!is_local_dma_addr(src_dma_addr)) + iounmap_remote(src_virt, loop_len, work); + if (!is_local_dma_addr(dst_dma_addr)) + iounmap_remote(dst_virt, loop_len, work); + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } + + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + tail_len = remaining_len & (L1_CACHE_BYTES - 1); + remaining_len -= tail_len; + while (remaining_len) { + if (src_offset == end_src_offset) { + item = (&src_window->list_member)->next; + src_window = list_entry(item, + struct reg_range_t, + list_member); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + src_last_index = 0; + src_start_offset = src_window->offset; + } + if (dst_offset == end_dst_offset) { + item = (&dst_window->list_member)->next; + dst_window = list_entry(item, struct reg_range_t, list_member); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + dst_last_index = 0; + dst_start_offset = dst_window->offset; + } + + /* compute dma addresses for transfer */ + src_dma_addr = micscif_get_dma_addr(src_window, src_offset, &src_contig_bytes, &src_last_index, &src_start_offset); + dst_dma_addr = micscif_get_dma_addr(dst_window, dst_offset, &dst_contig_bytes, &dst_last_index, &dst_start_offset); +#ifdef CONFIG_ML1OM + if (RMA_ERROR_CODE == src_dma_addr) + return -ENXIO; + if (RMA_ERROR_CODE == dst_dma_addr) + return -ENXIO; +#endif + loop_len = min(src_contig_bytes, dst_contig_bytes); + loop_len = min(loop_len, remaining_len); + if (unlikely(work->ordered && !tail_len && + !(remaining_len - loop_len) && + loop_len != L1_CACHE_BYTES)) { + /* + * Break up the last chunk of the transfer into two steps + * if there is no tail to gurantee DMA ordering. + * Passing DO_DMA_POLLING inserts a status update descriptor + * in step 1 which acts as a double sided synchronization + * fence for the DMA engine to ensure that the last cache line + * in step 2 is updated last. + */ + /* Step 1) DMA: Body Length - L1_CACHE_BYTES. 
*/ + ret = do_dma(chan, DO_DMA_POLLING, src_dma_addr, dst_dma_addr, + loop_len - L1_CACHE_BYTES, NULL); + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + src_offset += (loop_len - L1_CACHE_BYTES); + dst_offset += (loop_len - L1_CACHE_BYTES); + src_dma_addr += (loop_len - L1_CACHE_BYTES); + dst_dma_addr += (loop_len - L1_CACHE_BYTES); + remaining_len -= (loop_len - L1_CACHE_BYTES); + loop_len = remaining_len; + + /* Step 2) DMA: L1_CACHE_BYTES */ + ret = do_dma(chan, 0, src_dma_addr, dst_dma_addr, + loop_len, NULL); + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + } else { + int flags = 0; + if (remaining_len == loop_len + L1_CACHE_BYTES) + flags = DO_DMA_POLLING; + ret = do_dma(chan, flags, src_dma_addr, dst_dma_addr, + loop_len, NULL); + if (ret < 0) { + printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; + } + } + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } +#ifdef CONFIG_MK1OM + BUG_ON(remaining_len != 0); +#endif +#ifdef CONFIG_ML1OM + if (remaining_len) + return - ENXIO; +#endif + remaining_len = tail_len; + if (remaining_len) { + loop_len = remaining_len; + if (src_offset == end_src_offset) { + item = (&src_window->list_member)->next; + src_window = list_entry(item, + struct reg_range_t, + list_member); + } + if (dst_offset == end_dst_offset) { + item = (&dst_window->list_member)->next; + dst_window = list_entry(item, struct reg_range_t, list_member); + } + + src_dma_addr = micscif_get_dma_addr(src_window, src_offset, NULL, NULL, NULL); + dst_dma_addr = micscif_get_dma_addr(dst_window, dst_offset, NULL, NULL, NULL); +#ifdef CONFIG_ML1OM + if (RMA_ERROR_CODE == src_dma_addr) + return -ENXIO; + if (RMA_ERROR_CODE == dst_dma_addr) + return -ENXIO; +#endif + /* + * The CPU copy for the tail bytes must be initiated only once previous + * DMA transfers for this endpoint have completed to guarantee + * ordering. 
+ */ + if (unlikely(work->ordered)) { + free_dma_channel(chan); + work->dma_chan_released = true; + if ((ret = drain_dma_poll(chan))) + return ret; + } +#ifdef CONFIG_ML1OM + get_window_ref_count(src_window, 1); + get_window_ref_count(dst_window, 1); +#endif + if (is_local_dma_addr(src_dma_addr)) + src_virt = get_local_va(src_offset, src_window, loop_len); + else + src_virt = ioremap_remote_gtt(src_offset, src_window, + loop_len, work->loopback, + work->remote_dev, get_chan_num(chan), work); + if (!src_virt) { +#ifdef CONFIG_ML1OM + put_window_ref_count(src_window, 1); + put_window_ref_count(dst_window, 1); +#endif + return -ENOMEM; + } + + if (is_local_dma_addr(dst_dma_addr)) + dst_virt = get_local_va(dst_offset, dst_window, loop_len); + else + dst_virt = ioremap_remote_gtt(dst_offset, dst_window, + loop_len, work->loopback, + work->remote_dev, get_chan_num(chan), work); +#ifdef CONFIG_ML1OM + put_window_ref_count(src_window, 1); + put_window_ref_count(dst_window, 1); +#endif + if (!dst_virt) { + if (!is_local_dma_addr(src_dma_addr)) + iounmap_remote(src_virt, loop_len, work); + return -ENOMEM; + } + + if (is_local_dma_addr(src_dma_addr)){ + micscif_unaligned_memcpy(dst_virt, src_virt, loop_len, work->ordered); + } + else{ + memcpy_fromio(dst_virt, src_virt, loop_len); + } + serializing_request(dst_virt); + smp_mb(); + if (!is_local_dma_addr(src_dma_addr)) + iounmap_remote(src_virt, loop_len, work); + + if (!is_local_dma_addr(dst_dma_addr)) + iounmap_remote(dst_virt, loop_len, work); + + remaining_len -= loop_len; +#ifdef CONFIG_MK1OM + BUG_ON(remaining_len != 0); +#endif +#ifdef CONFIG_ML1OM + if (remaining_len) + return - ENXIO; +#endif + } + + return ret; +} + +int micscif_rma_list_dma_copy_wrapper(struct endpt *epd, struct mic_copy_work *work, struct dma_channel *chan, off_t loffset) +{ + int src_cache_off, dst_cache_off; + uint64_t src_offset = work->src_offset, dst_offset = work->dst_offset; + uint8_t *temp = NULL; + bool src_local = true, dst_local = false; + struct dma_completion_cb *comp_cb; + dma_addr_t src_dma_addr, dst_dma_addr; +#ifndef _MIC_SCIF_ + struct pci_dev *pdev; +#endif + + src_cache_off = src_offset & (L1_CACHE_BYTES - 1); + dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1); + if (dst_cache_off == src_cache_off) + return micscif_rma_list_dma_copy_aligned(work, chan); + + if (work->loopback) { +#ifdef _MIC_SCIF_ + BUG_ON(micscif_rma_list_cpu_copy(work)); + return 0; +#else + BUG_ON(1); +#endif + } + + src_dma_addr = micscif_get_dma_addr(work->src_window, src_offset, NULL, NULL, NULL); + dst_dma_addr = micscif_get_dma_addr(work->dst_window, dst_offset, NULL, NULL, NULL); + + if (is_local_dma_addr(src_dma_addr)) + src_local = true; + else + src_local = false; + + if (is_local_dma_addr(dst_dma_addr)) + dst_local = true; + else + dst_local = false; + + dst_local = dst_local; + BUG_ON(work->len + (L1_CACHE_BYTES << 1) > KMEM_UNALIGNED_BUF_SIZE); + + /* Allocate dma_completion cb */ + if (!(comp_cb = kzalloc(sizeof(*comp_cb), GFP_KERNEL))) + goto error; + + work->comp_cb = comp_cb; + comp_cb->cb_cookie = (uint64_t)comp_cb; + comp_cb->dma_completion_func = &micscif_rma_completion_cb; + + if (work->len + (L1_CACHE_BYTES << 1) < KMEM_UNALIGNED_BUF_SIZE) { + comp_cb->is_cache = false; + if (!(temp = kmalloc(work->len + (L1_CACHE_BYTES << 1), GFP_KERNEL))) + goto free_comp_cb; + comp_cb->temp_buf_to_free = temp; + /* kmalloc(..) 
does not guarantee cache line alignment */ + if ((uint64_t)temp & (L1_CACHE_BYTES - 1)) + temp = (uint8_t*)ALIGN((uint64_t)temp, L1_CACHE_BYTES); + } else { + comp_cb->is_cache = true; + if (!(temp = micscif_kmem_cache_alloc())) + goto free_comp_cb; + comp_cb->temp_buf_to_free = temp; + } + + if (src_local) { + temp += dst_cache_off; + comp_cb->tmp_offset = dst_cache_off; + micscif_rma_local_cpu_copy(work->src_offset, work->src_window, temp, work->len, true); + } else { + comp_cb->dst_window = work->dst_window; + comp_cb->dst_offset = work->dst_offset; + work->src_offset = work->src_offset - src_cache_off; + comp_cb->len = work->len; + work->len = ALIGN(work->len + src_cache_off, L1_CACHE_BYTES); + comp_cb->header_padding = src_cache_off; + } + comp_cb->temp_buf = temp; + +#ifndef _MIC_SCIF_ + micscif_pci_dev(work->remote_dev->sd_node, &pdev); + comp_cb->temp_phys = mic_map_single(work->remote_dev->sd_node - 1, + pdev, temp, KMEM_UNALIGNED_BUF_SIZE); + + if (mic_map_error(comp_cb->temp_phys)) { + goto free_temp_buf; + } + + comp_cb->remote_node = work->remote_dev->sd_node; +#endif + if (0 > micscif_rma_list_dma_copy_unaligned(work, temp, chan, src_local)) + goto free_temp_buf; + if (!src_local) + work->fence_type = DO_DMA_INTR; + return 0; +free_temp_buf: + if (comp_cb->is_cache) + micscif_kmem_cache_free(comp_cb->temp_buf_to_free); + else + kfree(comp_cb->temp_buf_to_free); +free_comp_cb: + kfree(comp_cb); +error: + printk(KERN_ERR "Unable to malloc %s %d\n", __func__, __LINE__); + return -ENOMEM; +} + +#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT) +static int softlockup_threshold = 60; +static void avert_softlockup(unsigned long data) +{ + *(unsigned long*)data = 1; +} + +/* + * Add a timer to handle the case of hogging the cpu for + * time > softlockup_threshold. + * Add the timer every softlockup_threshold / 3 so that even if + * there is a huge delay in running our timer, we will still don't hit + * the softlockup case.(softlockup_tick() is run in hardirq() context while + * timers are run at softirq context) + * + */ +static inline void add_softlockup_timer(struct timer_list *timer, unsigned long *data) +{ + setup_timer(timer, avert_softlockup, (unsigned long) data); + timer->expires = jiffies + usecs_to_jiffies(softlockup_threshold * 1000000 / 3); + add_timer(timer); +} + +static inline void del_softlockup_timer(struct timer_list *timer) +{ + /* We need delete synchronously since the variable being touched by + * timer interrupt is on the stack + */ + del_timer_sync(timer); +} +#endif + +/* + * micscif_rma_list_cpu_copy: + * + * Traverse all the windows and perform CPU copy. 
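A small sketch of the chunking rule used in the copy loop below, assuming 4 KiB pages: each pass copies only up to the nearer of the two page boundaries, so one temporary mapping per side is enough for each iteration.

static unsigned long ex_cpu_copy_chunk(unsigned long src_off,
                                       unsigned long dst_off,
                                       unsigned long remaining)
{
    unsigned long src_in_page = src_off & (4096UL - 1);
    unsigned long dst_in_page = dst_off & (4096UL - 1);
    unsigned long worst = src_in_page > dst_in_page ? src_in_page : dst_in_page;
    unsigned long chunk = 4096UL - worst;     /* bytes left to the nearer boundary */

    return chunk < remaining ? chunk : remaining;
}
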
+ */ +int micscif_rma_list_cpu_copy(struct mic_copy_work *work) +{ + void *src_virt, *dst_virt; + size_t loop_len, remaining_len; + int src_cache_off, dst_cache_off; + uint64_t src_offset = work->src_offset, dst_offset = work->dst_offset; + struct reg_range_t *src_window = work->src_window; + struct reg_range_t *dst_window = work->dst_window; + uint64_t end_src_offset, end_dst_offset; + struct list_head *item; + int srcchunk_ind = 0; + int dstchunk_ind = 0; + uint64_t src_start_offset, dst_start_offset; + int ret = 0; +#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT) + unsigned long timer_fired = 0; + struct timer_list timer; + int cpu = smp_processor_id(); + add_softlockup_timer(&timer, &timer_fired); +#endif + + remaining_len = work->len; + src_start_offset = src_window->offset; + dst_start_offset = dst_window->offset; + + while (remaining_len) { +#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT) + /* Ideally we should call schedule only if we didn't sleep + * in between. But there is no way to know that. + */ + if (timer_fired) { + timer_fired = 0; + if (smp_processor_id() == cpu) + touch_softlockup_watchdog(); + else + cpu = smp_processor_id(); + add_softlockup_timer(&timer, &timer_fired); + } +#endif + src_cache_off = src_offset & ~PAGE_MASK; + dst_cache_off = dst_offset & ~PAGE_MASK; + loop_len = PAGE_SIZE - + ((src_cache_off > dst_cache_off) ? + src_cache_off : dst_cache_off); + if (remaining_len < loop_len) + loop_len = remaining_len; + + if (RMA_WINDOW_SELF == src_window->type) + src_virt = get_local_va(src_offset, src_window, loop_len); + else + src_virt = ioremap_remote(src_offset, + src_window, loop_len, work->loopback, work->remote_dev, &srcchunk_ind, &src_start_offset); + if (!src_virt) { + ret = -ENOMEM; + goto error; + } + + if (RMA_WINDOW_SELF == dst_window->type) + dst_virt = get_local_va(dst_offset, dst_window, loop_len); + else + dst_virt = ioremap_remote(dst_offset, + dst_window, loop_len, work->loopback, work->remote_dev, &dstchunk_ind, &dst_start_offset); + if (!dst_virt) { + if (RMA_WINDOW_PEER == src_window->type) + iounmap_remote(src_virt, loop_len, work); + ret = -ENOMEM; + goto error; + } + + if (work->loopback) + memcpy(dst_virt, src_virt, loop_len); + else { + + if (RMA_WINDOW_SELF == src_window->type){ + memcpy_toio(dst_virt, src_virt, loop_len); + } + else{ + memcpy_fromio(dst_virt, src_virt, loop_len); + } + serializing_request(dst_virt); + smp_mb(); + } + if (RMA_WINDOW_PEER == src_window->type) + iounmap_remote(src_virt, loop_len, work); + + if (RMA_WINDOW_PEER == dst_window->type) + iounmap_remote(dst_virt, loop_len, work); + + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + if (remaining_len) { + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + if (src_offset == end_src_offset) { + item = ( + &src_window->list_member)->next; + src_window = list_entry(item, + struct reg_range_t, + list_member); + srcchunk_ind = 0; + src_start_offset = src_window->offset; + } + if (dst_offset == end_dst_offset) { + item = ( + &dst_window->list_member)->next; + dst_window = list_entry(item, + struct reg_range_t, + list_member); + dstchunk_ind = 0; + dst_start_offset = dst_window->offset; + } + } + } +error: +#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT) + del_softlockup_timer(&timer); +#endif + return ret; +} diff --git a/micscif/micscif_rma_list.c b/micscif/micscif_rma_list.c new file mode 100644 index 0000000..9052c1f --- 
/dev/null +++ b/micscif/micscif_rma_list.c @@ -0,0 +1,533 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include "mic/micscif.h" +#include "mic/micscif_smpt.h" +#include "mic/mic_dma_api.h" +#include "mic/micscif_kmem_cache.h" +#ifdef CONFIG_MMU_NOTIFIER +#include +#include +#endif +#ifndef _MIC_SCIF_ +#include "mic_common.h" +#endif +#include "mic/micscif_map.h" + +/* + * micscif_insert_tcw: + * + * Insert a temp window to the temp registration list sorted by va_for_temp. + * RMA lock must be held. + */ +void micscif_insert_tcw(struct reg_range_t *window, + struct list_head *head) +{ + struct reg_range_t *curr = NULL, *prev = NULL; + struct list_head *item; + BUG_ON(!window); + INIT_LIST_HEAD(&window->list_member); + /* + * HSD 4845254 + * Hack for worst case performance + * Compare with tail and if the new entry is new tail add it to the end + */ + if (!list_empty(head)) { + curr = list_entry(head->prev, struct reg_range_t, list_member); + if ((uint64_t) curr->va_for_temp < (uint64_t) window->va_for_temp) { + list_add_tail(&window->list_member, head); + return; + } + } + /* + * We don't need the if(!prev) code but I am gonna leave it as + * is for now. If someone touches the above code it is likely that they + * will miss that they have to add if(!prev) block + */ + list_for_each(item, head) { + curr = list_entry(item, struct reg_range_t, list_member); + if ((uint64_t) curr->va_for_temp > (uint64_t) window->va_for_temp) + break; + prev = curr; + } + if (!prev) + list_add(&window->list_member, head); + else + list_add(&window->list_member, &prev->list_member); +} +/* + * micscif_insert_window: + * + * Insert a window to the self registration list sorted by offset. + * RMA lock must be held. 
+ */ +void micscif_insert_window(struct reg_range_t *window, struct list_head *head) +{ + struct reg_range_t *curr = NULL, *prev = NULL; + struct list_head *item; + BUG_ON(!window); + INIT_LIST_HEAD(&window->list_member); + list_for_each(item, head) { + curr = list_entry(item, struct reg_range_t, list_member); + if (curr->offset > window->offset) + break; + prev = curr; + } + if (!prev) + list_add(&window->list_member, head); + else + list_add(&window->list_member, &prev->list_member); +} + +/* + * micscif_query_tcw: + * + * Query the temp cached registration list of ep and check if a valid contiguous + * range of windows exist. + * If there is a partial overlap, delete the existing window and create a new one + * that encompasses the previous window and a new range + * RMA lock must be held. + */ +int micscif_query_tcw(struct endpt *ep, struct micscif_rma_req *req) +{ + struct list_head *item, *temp; + struct reg_range_t *window; + uint64_t start_va_window, start_va_req = (uint64_t) req->va_for_temp; + uint64_t end_va_window, end_va_req = start_va_req + req->nr_bytes; + + /* + * HSD 4845254 + * Hack for the worst case scenario + * Avoid traversing the entire list to find out that there is no + * entry that matches + */ + if (!list_empty(req->head)) { + temp = req->head->prev; + window = list_entry(temp, + struct reg_range_t, list_member); + end_va_window = (uint64_t) window->va_for_temp + + (window->nr_pages << PAGE_SHIFT); + if (start_va_req > end_va_window) + return -ENXIO; + } + list_for_each_safe(item, temp, req->head) { + window = list_entry(item, + struct reg_range_t, list_member); + start_va_window = (uint64_t) window->va_for_temp; + end_va_window = (uint64_t) window->va_for_temp + + (window->nr_pages << PAGE_SHIFT); + pr_debug("%s %d start_va_window 0x%llx end_va_window 0x%llx" + " start_va_req 0x%llx end_va_req 0x%llx req->nr_bytes 0x%lx\n", + __func__, __LINE__, start_va_window, end_va_window, + start_va_req, end_va_req, req->nr_bytes); + if (start_va_req < start_va_window) { + if (end_va_req < start_va_window) { + /* No overlap */ + } else { + if ((window->prot & req->prot) != req->prot) { + + } else { + req->nr_bytes += ((end_va_req > end_va_window) ? 0:(end_va_window - end_va_req)); + pr_debug("%s %d Extend req->va_for_temp %p req->nr_byte 0x%lx\n", + __func__, __LINE__, req->va_for_temp, req->nr_bytes); + } + __micscif_rma_destroy_tcw_helper(window); + } + break; + } else { + if (start_va_req > end_va_window) { + /* No overlap */ + continue; + } else { + if ((window->prot & req->prot) != req->prot) { + __micscif_rma_destroy_tcw_helper(window); + break; + } + if (end_va_req > end_va_window) { + req->va_for_temp = (void*) start_va_window; + req->nr_bytes = end_va_req - start_va_window; + pr_debug("%s %d Extend req->va_for_temp %p req->nr_byte 0x%lx\n", + __func__, __LINE__, req->va_for_temp, req->nr_bytes); + __micscif_rma_destroy_tcw_helper(window); + return -ENXIO; + } else { + *(req->out_window) = window; + return 0; + } + } + } + } + pr_debug("%s %d ENXIO\n", __func__, __LINE__); + return -ENXIO; +} + +/* + * micscif_query_window: + * + * Query the registration list and check if a valid contiguous + * range of windows exist. + * RMA lock must be held. 
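A usage sketch mirroring micscif_prog_signal() earlier in this patch, with every field and constant name taken from that call site: the caller fills a micscif_rma_req and lets micscif_query_window() locate the window backing a single 8-byte offset.

static int ex_find_window(struct endpt *ep, off_t offset,
                          struct reg_range_t **out_window)
{
    struct micscif_rma_req req;

    req.out_window = out_window;
    req.offset     = offset;
    req.nr_bytes   = sizeof(uint64_t);
    req.prot       = SCIF_PROT_WRITE;
    req.type       = WINDOW_SINGLE;
    req.head       = &ep->rma_info.reg_list;   /* self registration list */

    return micscif_query_window(&req);          /* 0, -ENXIO or -EPERM */
}
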
+ */ +int micscif_query_window(struct micscif_rma_req *req) +{ + struct list_head *item; + struct reg_range_t *window; + uint64_t end_offset, offset = req->offset; + uint64_t tmp_min, nr_bytes_left = req->nr_bytes; + + list_for_each(item, req->head) { + window = list_entry(item, + struct reg_range_t, list_member); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + if (offset < window->offset) + /* Offset not found! */ + return -ENXIO; + if (offset < end_offset) { + /* Check read/write protections. */ + if ((window->prot & req->prot) != req->prot) + return -EPERM; + if (nr_bytes_left == req->nr_bytes) + /* Store the first window */ + *(req->out_window) = window; + tmp_min = min(end_offset - offset, nr_bytes_left); + nr_bytes_left -= tmp_min; + offset += tmp_min; + /* + * Range requested encompasses + * multiple windows contiguously. + */ + if (!nr_bytes_left) { + /* Done for partial window */ + if (req->type == WINDOW_PARTIAL || + req->type == WINDOW_SINGLE) + return 0; + /* Extra logic for full windows */ + if (offset == end_offset) + /* Spanning multiple whole windows */ + return 0; + /* Not spanning multiple whole windows */ + return -ENXIO; + } + if (req->type == WINDOW_SINGLE) + break; + } + } + printk(KERN_ERR "%s %d ENXIO\n", __func__, __LINE__); + return -ENXIO; +} + +/* + * micscif_rma_list_mmap: + * + * Traverse the remote registration list starting from start_window: + * 1) Check read/write protections. + * 2) Create VtoP mappings via remap_pfn_range(..) + * 3) Once step 1) and 2) complete successfully then traverse the range of + * windows again and bump the reference count. + * RMA lock must be held. + */ +int micscif_rma_list_mmap(struct reg_range_t *start_window, + uint64_t offset, int nr_pages, struct vm_area_struct *vma) +{ + struct list_head *item, *head; + uint64_t end_offset, loop_offset = offset; + struct reg_range_t *window; + int64_t start_page_nr, loop_nr_pages, nr_pages_left = nr_pages; + struct endpt *ep = (struct endpt *)start_window->ep; + int i, err = 0; + uint64_t j =0; + dma_addr_t phys_addr; + + might_sleep(); + BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock)); + + /* Start traversing from the previous link in the list */ + head = ((&start_window->list_member))->prev; + list_for_each(item, head) { + window = list_entry(item, struct reg_range_t, + list_member); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + start_page_nr = (loop_offset - window->offset) >> PAGE_SHIFT; + loop_nr_pages = min((int64_t)((end_offset - loop_offset) >> PAGE_SHIFT), + nr_pages_left); + for (i = (int)start_page_nr; + i < ((int)start_page_nr + (int)loop_nr_pages); i++, j++) { + + phys_addr = +#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM) + is_self_scifdev(ep->remote_dev) ? + micscif_get_dma_addr(window, loop_offset, + NULL, NULL, NULL) : window->phys_addr[i]; +#else + get_phys_addr(micscif_get_dma_addr(window, loop_offset, + NULL, NULL, NULL), ep->remote_dev); +#endif + /* + * Note: + * 1) remap_pfn_rnage returns an error if there is an + * attempt to create MAP_PRIVATE COW mappings. + */ + if ((err = remap_pfn_range(vma, + ((vma)->vm_start) + (j * PAGE_SIZE), + phys_addr >> PAGE_SHIFT, + PAGE_SIZE, + ((vma)->vm_page_prot)))) + goto error; + loop_offset += PAGE_SIZE; + } + nr_pages_left -= loop_nr_pages; + if (!nr_pages_left) + break; + } + BUG_ON(nr_pages_left); + /* + * No more failures expected. Bump up the ref count for all + * the windows. 
Another traversal from start_window required + * for handling errors encountered across windows during + * remap_pfn_range(..). + */ + loop_offset = offset; + nr_pages_left = nr_pages; + head = (&(start_window->list_member))->prev; + list_for_each(item, head) { + window = list_entry(item, struct reg_range_t, + list_member); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + start_page_nr = (loop_offset - window->offset) >> PAGE_SHIFT; + loop_nr_pages = min((int64_t)((end_offset - loop_offset) >> PAGE_SHIFT), + nr_pages_left); + get_window_ref_count(window, loop_nr_pages); + nr_pages_left -= loop_nr_pages; + loop_offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages_left) + break; + } + BUG_ON(nr_pages_left); +error: + if (err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + return err; +} + +/* + * micscif_rma_list_munmap: + * + * Traverse the remote registration list starting from window: + * 1) Decrement ref count. + * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer. + * RMA lock must be held. + */ +void micscif_rma_list_munmap(struct reg_range_t *start_window, + uint64_t offset, int nr_pages) +{ + struct list_head *item, *tmp, *head; + struct nodemsg msg; + uint64_t loop_offset = offset, end_offset; + int64_t loop_nr_pages, nr_pages_left = nr_pages; + struct endpt *ep = (struct endpt *)start_window->ep; + struct reg_range_t *window; + + BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock)); + + msg.uop = SCIF_MUNMAP; + msg.src = ep->port; + loop_offset = offset; + nr_pages_left = nr_pages; + /* Start traversing from the previous link in the list */ + head = (&(start_window->list_member))->prev; + list_for_each_safe(item, tmp, head) { + window = list_entry(item, struct reg_range_t, + list_member); + RMA_MAGIC(window); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min((int64_t)((end_offset - loop_offset) >> PAGE_SHIFT), + nr_pages_left); + put_window_ref_count(window, loop_nr_pages); + if (!window->ref_count) { + if (scifdev_alive(ep)) + drain_dma_intr(ep->rma_info.dma_chan); + /* Inform the peer about this munmap */ + msg.payload[0] = window->peer_window; + /* No error handling for Notification messages. */ + micscif_nodeqp_send(ep->remote_dev, &msg, ep); + list_del(&window->list_member); + /* Destroy this window from the peer's registered AS */ + micscif_destroy_remote_window(ep, window); + } + nr_pages_left -= loop_nr_pages; + loop_offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages_left) + break; + } + BUG_ON(nr_pages_left); +} + +/* + * micscif_rma_list_unregister: + * + * Traverse the self registration list starting from window: + * 1) Call micscif_unregister_window(..) + * RMA lock must be held. 
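+ * Returns 0 on success, or the error from the first
+ * micscif_unregister_window(..) call that fails.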
+ */ +int micscif_rma_list_unregister(struct reg_range_t *window, + uint64_t offset, int nr_pages) +{ + struct list_head *item, *tmp, *head; + uint64_t end_offset; + int err = 0; + int64_t loop_nr_pages; + struct endpt *ep = (struct endpt *)window->ep; + + BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock)); + /* Start traversing from the previous link in the list */ + head = (&window->list_member)->prev; + list_for_each_safe(item, tmp, head) { + window = list_entry(item, struct reg_range_t, + list_member); + RMA_MAGIC(window); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min((int)((end_offset - offset) >> PAGE_SHIFT), + nr_pages); + if ((err = micscif_unregister_window(window))) + return err; + nr_pages -= (int)loop_nr_pages; + offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages) + break; + } + BUG_ON(nr_pages); + return 0; +} + +/* + * micscif_unregister_all_window: + * + * Traverse all the windows in the self registration list and: + * 1) Call micscif_unregister_window(..) + * RMA lock must be held. + */ +int micscif_unregister_all_windows(scif_epd_t epd) +{ + struct list_head *item, *tmp; + struct reg_range_t *window; + struct endpt *ep = (struct endpt *)epd; + struct list_head *head = &ep->rma_info.reg_list; + int err = 0; + + queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work); + mutex_lock(&ep->rma_info.rma_lock); +retry: + item = NULL; + tmp = NULL; + list_for_each_safe(item, tmp, head) { + window = list_entry(item, + struct reg_range_t, list_member); + ep->rma_info.async_list_del = 0; + if ((err = micscif_unregister_window(window))) + pr_debug("%s %d err %d\n", + __func__, __LINE__, err); + /* + * Need to restart list traversal if there has been + * an asynchronous list entry deletion. + */ + if (ep->rma_info.async_list_del) + goto retry; + } + mutex_unlock(&ep->rma_info.rma_lock); + + /* + * The following waits cannot be interruptible since they are + * from the driver release() entry point. + */ + err = wait_event_timeout(ep->rma_info.fence_wq, + !ep->rma_info.fence_refcount, NODE_ALIVE_TIMEOUT); + /* Timeout firing is unexpected. Is the DMA engine hung? */ + if (!err) + printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err); + +#ifdef CONFIG_MMU_NOTIFIER + if (!list_empty(&ep->rma_info.mmn_list)) { + spin_lock(&ms_info.mi_rmalock); + list_add_tail(&ep->mmu_list, &ms_info.mi_mmu_notif_cleanup); + spin_unlock(&ms_info.mi_rmalock); + queue_work(ms_info.mi_mmu_notif_wq, &ms_info.mi_mmu_notif_work); + } +#endif + return err; +} + +/* + * micscif_rma_list_get_pages_check: + * + * Traverse the remote registration list and return 0 if all the + * scif_get_pages()/scif_put_pages() ref_counts are zero else return -1. 
+ */ +int micscif_rma_list_get_pages_check(struct endpt *ep) +{ + struct list_head *item, *head = &ep->rma_info.remote_reg_list; + struct reg_range_t *window; + int err = 0; + + mutex_lock(&ep->rma_info.rma_lock); + list_for_each(item, head) { + window = list_entry(item, + struct reg_range_t, list_member); + if (window->get_put_ref_count) { + err = -1; + break; + } + } + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +/* Only debug API's below */ +void micscif_display_all_windows(struct list_head *head) +{ + struct list_head *item; + struct reg_range_t *window; + pr_debug("\nWindow List Start\n"); + list_for_each(item, head) { + window = list_entry(item, + struct reg_range_t, list_member); + micscif_display_window(window, __func__, __LINE__); + } + pr_debug("Window List End\n\n"); +} diff --git a/micscif/micscif_select.c b/micscif/micscif_select.c new file mode 100644 index 0000000..c6f125f --- /dev/null +++ b/micscif/micscif_select.c @@ -0,0 +1,446 @@ +/* + * Implementation of select and poll + * + * Copyright 2011-2012 Intel Corporation. + * + * This file is a derivative of fs/select.c from within the Linux kernel + * source distribution, version 2.6.34; it has been modified (starting + * in May 2011) to work within the context of the SCIF driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA. + * + * Initial comment from fs/select.c: + * + * This file contains the procedures for the handling of select and poll + * + * Created for Linux based loosely upon Mathius Lattner's minix + * patches by Peter MacDonald. Heavily edited by Linus. + * + * 4 February 1994 + * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS + * flag set in its personality we do *not* modify the given timeout + * parameter to reflect time remaining. + * + * 24 January 2000 + * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation + * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + */ + +#include +#include +#include +#include +#include + +#include "mic/micscif.h" + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +#include +#endif + +struct poll_table_page { + struct poll_table_page *next; + struct poll_table_entry *entry; + struct poll_table_entry entries[0]; +}; + +/* + * Estimate expected accuracy in ns from a timeval. + * + * After quite a bit of churning around, we've settled on + * a simple thing of taking 0.1% of the timeout as the + * slack, with a cap of 100 msec. + * "nice" tasks get a 0.5% slack instead. + * + * Consider this comment an open invitation to come up with even + * better solutions.. 
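+ * For example, a 1 second timeout gets roughly 1 ms of slack (0.1%);
+ * the 100 msec cap is only reached for timeouts of 100 seconds or more
+ * (20 seconds for "nice" tasks at 0.5%).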
+ */ + +#define MAX_SLACK (100 * NSEC_PER_MSEC) + +static long __estimate_accuracy(struct timespec *tv) +{ + long slack; + int divfactor = 1000; + + if (tv->tv_sec < 0) + return 0; + + if (task_nice(current) > 0) + divfactor = divfactor / 5; + + if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) + return MAX_SLACK; + + slack = tv->tv_nsec / divfactor; + slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + + if (slack > MAX_SLACK) + return MAX_SLACK; + + return slack; +} + +static long estimate_accuracy(struct timespec *tv) +{ + unsigned long ret; + struct timespec now; + + /* + * Realtime tasks get a slack of 0 for obvious reasons. + */ + + if (rt_task(current)) + return 0; + + ktime_get_ts(&now); + now = timespec_sub(*tv, now); + ret = __estimate_accuracy(&now); + if (ret < current->timer_slack_ns) + return current->timer_slack_ns; + return ret; +} + +#define POLL_TABLE_FULL(table) \ + ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) + +/* + * Ok, Peter made a complicated, but straightforward multiple_wait() function. + * I have rewritten this, taking some shortcuts: This code may not be easy to + * follow, but it should be free of race-conditions, and it's practical. If you + * understand what I'm doing here, then you understand how the linux + * sleep/wakeup mechanism works. + * + * Two very simple procedures, poll_wait() and poll_freewait() make all the + * work. poll_wait() is an inline-function defined in , + * as all select/poll functions have to call it to add an entry to the + * poll table. + */ +static void __pollwait(struct file *filp __attribute__((unused)), wait_queue_head_t *wait_address, + poll_table *p); + +static void scif_poll_initwait(struct poll_wqueues *pwq) +{ + init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; + pwq->triggered = 0; + pwq->error = 0; + pwq->table = NULL; + pwq->inline_index = 0; +} + +static void free_poll_entry(struct poll_table_entry *entry) +{ + remove_wait_queue(entry->wait_address, &entry->wait); +} + +static void scif_poll_freewait(struct poll_wqueues *pwq) +{ + struct poll_table_page * p = pwq->table; + int i; + for (i = 0; i < pwq->inline_index; i++) + free_poll_entry(pwq->inline_entries + i); + while (p) { + struct poll_table_entry *entry; + struct poll_table_page *old; + + entry = p->entry; + do { + entry--; + free_poll_entry(entry); + } while (entry > p->entries); + old = p; + p = p->next; + free_page((unsigned long) old); + } +} + +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) +{ + struct poll_table_page *table = p->table; + + if (p->inline_index < N_INLINE_POLL_ENTRIES) + return p->inline_entries + p->inline_index++; + + if (!table || POLL_TABLE_FULL(table)) { + struct poll_table_page *new_table; + + new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); + if (!new_table) { + p->error = -ENOMEM; + return NULL; + } + new_table->entry = new_table->entries; + new_table->next = table; + p->table = new_table; + table = new_table; + } + + return table->entry++; +} + +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. 
+ */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + +/* Add a new entry */ +static void __pollwait(struct file *filp __attribute__((unused)), wait_queue_head_t *wait_address, + poll_table *p) +{ + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); + if (!entry) + return; + entry->filp = NULL; + entry->wait_address = wait_address; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + entry->key = p->_key; +#else + entry->key = p->key; +#endif + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; + add_wait_queue(wait_address, &entry->wait); +} + +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. + * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0)) + smp_store_mb(pwq->triggered, 0); +#else + set_mb(pwq->triggered, 0); +#endif + + return rc; +} + +static unsigned int scif_poll_kernel(poll_table *pwait, struct endpt *ep) +{ + return __scif_pollfd(NULL, pwait, ep); +} + +/* + * Fish for pollable events on the pollfd->fd file descriptor. We're only + * interested in events matching the pollfd->events mask, and the result + * matching that mask is both recorded in pollfd->revents and returned. The + * pwait poll_table will be used by the fd-provided poll handler for waiting, + * if non-NULL. + */ +static inline unsigned int do_pollfd(struct scif_pollepd *pollfd, poll_table *pwait) +{ + unsigned int mask; + scif_epd_t epd; + + mask = 0; + epd = pollfd->epd; + if (epd) { + mask = POLLNVAL; + mask = DEFAULT_POLLMASK; + if (pwait) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) + pwait->_key = pollfd->events | POLLERR | POLLHUP; +#else + pwait->key = pollfd->events | POLLERR | POLLHUP; +#endif + mask = scif_poll_kernel(pwait, epd); + /* Mask out unneeded events. 
*/ + mask &= pollfd->events | POLLERR | POLLHUP; + } + pollfd->revents = mask; + + return mask; +} + +static int do_poll(unsigned int nfds, struct scif_pollepd *ufds, + struct poll_wqueues *wait, struct timespec *end_time) +{ + poll_table* pt = &wait->pt; + ktime_t expire, *to = NULL; + int timed_out = 0, count = 0, i = 0; + unsigned long slack = 0; + + /* Optimise the no-wait case */ + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + pt = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = estimate_accuracy(end_time); + + for (;;) { + for (i = 0; i < nfds; i++) { + /* + * Fish for events. If we found one, record it + * and kill the poll_table, so we don't + * needlessly register any other waiters after + * this. They'll get immediately deregistered + * when we break out and return. + */ + if (do_pollfd(ufds + i, pt)) { + count++; + pt = NULL; + } + } + /* + * All waiters have already been registered, so don't provide + * a poll_table to them on the next loop iteration. + */ + pt = NULL; + if (!count) { + count = wait->error; + if (signal_pending(current)) + count = -EINTR; + } + if (count || timed_out) + break; + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) + timed_out = 1; + } + return count; +} + +static int do_scif_poll(struct scif_pollepd *ufds, unsigned int nfds, + struct timespec *end_time) +{ + struct poll_wqueues table; + int epdcount; + + scif_poll_initwait(&table); + epdcount = do_poll(nfds, ufds, &table, end_time); + scif_poll_freewait(&table); + + return epdcount; +} + +/* + * Add two timespec values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0) + */ +static struct timespec scif_timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} +/** + * poll_select_set_timeout - helper function to setup the timeout value + * @to: pointer to timespec variable for the final timeout + * @sec: seconds (from user space) + * @nsec: nanoseconds (from user space) + * + * Note, we do not use a timespec for the user space value here, That + * way we can use the function for timeval and compat interfaces as well. + * + * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. 
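+ * scif_poll() below calls this with timeout_msecs split into whole
+ * seconds and the remaining nanoseconds.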
+ */ +static int scif_poll_select_set_timeout(struct timespec *to, long sec, long nsec) +{ + struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + + if (!timespec_valid(&ts)) + return -EINVAL; + + /* Optimize for the zero timeout value here */ + if (!sec && !nsec) { + to->tv_sec = to->tv_nsec = 0; + } else { + ktime_get_ts(to); + *to = scif_timespec_add_safe(*to, ts); + } + return 0; +} + +int scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs) +{ + struct timespec end_time, *to = NULL; + if (timeout_msecs >= 0) { + to = &end_time; + scif_poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, + NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); + } + + return do_scif_poll(ufds, nfds, to); +} +EXPORT_SYMBOL(scif_poll); diff --git a/micscif/micscif_smpt.c b/micscif/micscif_smpt.c new file mode 100644 index 0000000..35c0ec2 --- /dev/null +++ b/micscif/micscif_smpt.c @@ -0,0 +1,457 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#if defined(HOST) || defined(WINDOWS) +#include "mic_common.h" +#endif + +struct _mic_ctx_t; +// Figure out which SMPT entry based on the host addr +#define SYSTEM_ADDR_TO_SMPT(sysaddr) ((sysaddr) >> (MIC_SYSTEM_PAGE_SHIFT)) +#define HOSTMIC_PA_TO_SMPT(hostmic_pa) (((hostmic_pa) - MIC_SYSTEM_BASE)\ + >> MIC_SYSTEM_PAGE_SHIFT) + +#define NUM_SMPT_ENTRIES_IN_USE 32 +#define SMPT_TO_MIC_PA(smpt_index) (MIC_SYSTEM_BASE + ((smpt_index) * \ + MIC_SYSTEM_PAGE_SIZE)) +#define MAX_HOST_MEMORY ((NUM_SMPT_ENTRIES_IN_USE) * MIC_SYSTEM_PAGE_SIZE) +#define MAX_SYSTEM_ADDR ((MIC_SYSTEM_BASE) + (MAX_HOST_MEMORY) - (1)) +#define IS_MIC_SYSTEM_ADDR(addr) (((addr) >= MIC_SYSTEM_BASE) && \ + ((addr) <= MAX_SYSTEM_ADDR)) + +#define _PAGE_OFFSET(x) ((x) & ((PAGE_SIZE) - (1ULL))) +#define SMPT_OFFSET(x) ((x) & MIC_SYSTEM_PAGE_MASK) +#define PAGE_ALIGN_LOW(x) ALIGN(((x) - ((PAGE_SIZE) - 1ULL)), (PAGE_SIZE)) +#define PAGE_ALIGN_HIGH(x) ALIGN((x), (PAGE_SIZE)) +#define SMPT_ALIGN_LOW(x) ALIGN(((x) - (MIC_SYSTEM_PAGE_MASK)), \ + (MIC_SYSTEM_PAGE_SIZE)) +#define SMPT_ALIGN_HIGH(x) ALIGN((x), (MIC_SYSTEM_PAGE_SIZE)) + +#if defined(HOST) +#define SMPT_LOGGING 0 +#if SMPT_LOGGING +static int64_t smpt_ref_count_g[MAX_BOARD_SUPPORTED]; +static int64_t map_count_g; +static int64_t unmap_count_g; +#endif +#endif + +void mic_smpt_set(volatile void *mm_sbox, uint64_t dma_addr, uint64_t index) +{ + uint32_t smpt_reg_val = BUILD_SMPT(SNOOP_ON, dma_addr >> MIC_SYSTEM_PAGE_SHIFT); + writel(smpt_reg_val, (uint8_t*)mm_sbox + SBOX_SMPT00 + (4 * index)); +} + +#if defined(HOST) +/* + * Called once per board as part of starting a MIC + * to restore the SMPT state to the previous values + * as stored in SMPT SW data structures. + */ +void mic_smpt_restore(mic_ctx_t *mic_ctx) +{ + int i; + dma_addr_t dma_addr; + uint32_t *smpt = (uint32_t*)(mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS + SBOX_SMPT00); + uint32_t smpt_reg_val; + + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) { + dma_addr = mic_ctx->mic_smpt[i].dma_addr; + if (mic_ctx->bi_family == FAMILY_KNC) { + smpt_reg_val = BUILD_SMPT(SNOOP_ON, + dma_addr >> MIC_SYSTEM_PAGE_SHIFT); + writel(smpt_reg_val, &smpt[i]); + } + } +} + +/* + * Called once per board as part of smpt init + * This does a 0-512G smpt mapping, + */ +void mic_smpt_init(mic_ctx_t *mic_ctx) +{ + int i; + dma_addr_t dma_addr; + uint32_t *smpt = (uint32_t*)(mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS + SBOX_SMPT00); + uint32_t smpt_reg_val; +#if SMPT_LOGGING + smpt_ref_count_g[mic_ctx->bi_id] = 0; +#endif + + spin_lock_init(&mic_ctx->smpt_lock); + mic_ctx->mic_smpt = kmalloc(sizeof(mic_smpt_t) + * NUM_SMPT_ENTRIES_IN_USE, GFP_KERNEL); + + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) { + dma_addr = i * MIC_SYSTEM_PAGE_SIZE; + mic_ctx->mic_smpt[i].dma_addr = dma_addr; + mic_ctx->mic_smpt[i].ref_count = 0; + if (mic_ctx->bi_family == FAMILY_KNC) { + smpt_reg_val = BUILD_SMPT(SNOOP_ON, + dma_addr >> MIC_SYSTEM_PAGE_SHIFT); + writel(smpt_reg_val, &smpt[i]); + } + } +} + +/* + * Called during mic exit per ctx (i.e once for every board) + * If ref count is non-zero, then it means that some module + * did not call mic_unmap_single/mic_ctx_unmap_single correctly. 
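+ * On DEBUG builds every entry whose ref_count is still non-zero triggers
+ * a WARN_ON so such leaks are visible at teardown.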
+ */ +void +mic_smpt_uninit(mic_ctx_t *mic_ctx) +{ +#if SMPT_LOGGING + printk("global ref count for node = %d is %lld\n", + mic_ctx->bi_id+1, smpt_ref_count_g[mic_ctx->bi_id]); + printk("mic map calls = %lld, mic unmap calls = %lld \n", + map_count_g, unmap_count_g); + + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) { + printk("[smpt_san%d] smpt_entry[%d] dma_addr = 0x%llX" + " ref_count = %lld \n", mic_ctx->bi_id+1, i, + mic_ctx->mic_smpt[i].dma_addr, + mic_ctx->mic_smpt[i].ref_count); + } +#endif +#ifdef DEBUG + { + int i; + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) + WARN_ON(mic_ctx->mic_smpt[i].ref_count); + } +#endif + + kfree(mic_ctx->mic_smpt); + mic_ctx->mic_smpt = NULL; + ; +} + +dma_addr_t mic_ctx_map_single(mic_ctx_t *mic_ctx, void *p, size_t size) +{ + struct pci_dev *hwdev = mic_ctx->bi_pdev; + int bid = mic_ctx->bi_id; + + return mic_map_single(bid, hwdev, p, size); +} + +void mic_unmap_single(int bid, struct pci_dev *hwdev, dma_addr_t mic_addr, + size_t size) +{ + dma_addr_t dma_addr = mic_to_dma_addr(bid, mic_addr); + mic_unmap(bid, mic_addr, size); + pci_unmap_single(hwdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); +} + +void mic_ctx_unmap_single(mic_ctx_t *mic_ctx, dma_addr_t dma_addr, + size_t size) +{ + struct pci_dev *hwdev = mic_ctx->bi_pdev; + int bid = mic_ctx->bi_id; + mic_unmap_single(bid, hwdev, dma_addr, size); +} + +dma_addr_t mic_map_single(int bid, struct pci_dev *hwdev, void *p, + size_t size) +{ + dma_addr_t mic_addr = 0; + dma_addr_t dma_addr; + + dma_addr = pci_map_single(hwdev, p, size, PCI_DMA_BIDIRECTIONAL); + + if (!pci_dma_mapping_error(hwdev, dma_addr)) + if (!(mic_addr = mic_map(bid, dma_addr, size))) { + printk(KERN_ERR "mic_map failed board id %d\ + addr %#016llx size %#016zx\n", + bid, dma_addr, size); + pci_unmap_single(hwdev, dma_addr, + size, PCI_DMA_BIDIRECTIONAL); + } + return mic_addr; +} + +void add_smpt_entry(int spt, int64_t *ref, uint64_t dma_addr, int entries, mic_ctx_t *mic_ctx) +{ + + struct nodemsg msg; + dma_addr_t addr = dma_addr; + mic_smpt_t *mic_smpt = mic_ctx->mic_smpt; + int dev_id = mic_ctx->bi_id + 1; + void *mm_sbox = mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS; + int i; + + for (i = spt; i < spt + entries; i++, addr += MIC_SYSTEM_PAGE_SIZE) { +#ifdef CONFIG_ML1OM + /* + * For KNF if the ref count is 0 and the entry number is greater + * than 16 then we must resend a SMPT_SET message in case the uOS + * was rebooted and lost SMPT register state (example during host + * suspend/hibernate. + */ + if (!mic_smpt[i].ref_count && i >= (NUM_SMPT_ENTRIES_IN_USE >> 1)) { +#else + if (!mic_smpt[i].ref_count && (mic_smpt[i].dma_addr != addr)) { +#endif + /* + * ref count was zero and dma_addr requested did not + * match the dma address in the table. So, this is a + * new entry in the table. + * KNF: Send a message to the card + * to update its smpt table with a new value. + * KNC: write to the SMPT registers from host since + * they are accessible. 
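+ * The FAMILY_ABR branch below takes the KNF message path; any other
+ * family writes the SMPT register directly via mic_smpt_set() (the
+ * KNC case).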
+ */ + if (mic_ctx->bi_family == FAMILY_ABR) { + msg.uop = SMPT_SET; + msg.payload[0] = addr; + msg.payload[1] = i; + msg.dst.node = scif_dev[dev_id].sd_node; + msg.src.node = 0; +#if SMPT_LOGGING + printk("[smpt_node%d] ==> sending msg to " + " node = %d dma_addr = 0x%llX, entry =" + "0x%llX\n" , mic_ctx->bi_id + 1, + scif_dev[dev_id].sd_node, + msg.payload[0], msg.payload[1]); +#endif + micscif_inc_node_refcnt(&scif_dev[dev_id], 1); + micscif_nodeqp_send(&scif_dev[dev_id], &msg, NULL); + micscif_dec_node_refcnt(&scif_dev[dev_id], 1); + } + else + mic_smpt_set(mm_sbox, addr, i); + mic_smpt[i].dma_addr = addr; + } + mic_smpt[i].ref_count += ref[i - spt]; + } +} + +dma_addr_t smpt_op(int bid, uint64_t dma_addr, + int entries, int64_t *ref) +{ + int spt = -1; /* smpt index */ + int ee = 0; /* existing entries */ + int fe = 0; /* free entries */ + int i; + unsigned long flags; + dma_addr_t mic_addr = 0; + dma_addr_t addr = dma_addr; + mic_ctx_t *mic_ctx = get_per_dev_ctx(bid); + mic_smpt_t *mic_smpt = mic_ctx->mic_smpt; + + if (micpm_get_reference(mic_ctx, true)) + goto exit; + spin_lock_irqsave(&mic_ctx->smpt_lock, flags); + + /* find existing entries */ + for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) { + if (mic_smpt[i].dma_addr == addr) { + ee++; + addr += MIC_SYSTEM_PAGE_SIZE; + } + else if (ee) /* cannot find contiguous entries */ + goto not_found; + + if (ee == entries) + goto found; + } + + /* find free entry */ +#ifdef CONFIG_ML1OM + /* + * For KNF the SMPT registers are not host accessible so we maintain a + * 1:1 map for SMPT registers from 0-256GB i.e. the first 16 entries and + * look for SMPT entries for P2P and IB etc from the 16th entry onwards. + * This allows the KNF card to boot on Host systems with < 256GB system + * memory and access VNET/SCIF buffers without crashing. P2P and IB SMPT + * entries are setup after SCIF driver load/reload via SCIF Node QP + * SMPT_SET messages. + */ + for (i = NUM_SMPT_ENTRIES_IN_USE / 2 ; i < NUM_SMPT_ENTRIES_IN_USE; i++) { +#else + for (i = 0 ; i < NUM_SMPT_ENTRIES_IN_USE; i++) { +#endif + fe = (mic_smpt[i].ref_count == 0) ? 
fe + 1: 0; + if (fe == entries) + goto found; + } + +not_found: + spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags); + micpm_put_reference(mic_ctx); +exit: + return mic_addr; +found: + spt = i - entries + 1; + mic_addr = SMPT_TO_MIC_PA(spt); + add_smpt_entry(spt, ref, dma_addr, entries, mic_ctx); + spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags); + micpm_put_reference(mic_ctx); + return mic_addr; +} + + +/* + * Returns number of smpt entries needed for dma_addr to dma_addr + size + * also returns the reference count array for each of those entries + * and the starting smpt address + */ +int get_smpt_ref_count(int64_t *ref, dma_addr_t dma_addr, size_t size, + uint64_t *smpt_start) +{ + uint64_t start = dma_addr; + uint64_t end = dma_addr + size; + int i = 0; + + while (start < end) { + ref[i++] = min(SMPT_ALIGN_HIGH(start + 1), end) - start; + start = SMPT_ALIGN_HIGH(start + 1); + } + + if (smpt_start) + *smpt_start = SMPT_ALIGN_LOW(dma_addr); + + return i; +} + +/* + * Maps dma_addr to dma_addr + size memory in the smpt table + * of board bid + */ +dma_addr_t mic_map(int bid, dma_addr_t dma_addr, size_t size) +{ + dma_addr_t mic_addr = 0; + int entries; + int64_t ref[NUM_SMPT_ENTRIES_IN_USE]; + uint64_t smpt_start; +#if SMPT_LOGGING + unsigned long flags; + mic_ctx_t *mic_ctx = get_per_dev_ctx(bid); + spin_lock_irqsave(&mic_ctx->smpt_lock, flags); + map_count_g++; + smpt_ref_count_g[bid] += (int64_t)size; + spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags); +#endif + if (!size) + return mic_addr; + + /* + * Get number of smpt entries to be mapped, ref count array + * and the starting smpt address to start the search for + * free or existing smpt entries. + */ + entries = get_smpt_ref_count(ref, dma_addr, size, &smpt_start); + + /* Set the smpt table appropriately and get 16G aligned mic address */ + mic_addr = smpt_op(bid, smpt_start, entries, ref); + + /* + * If mic_addr is zero then its a error case + * since mic_addr can never be zero. 
+ * else generate mic_addr by adding the 16G offset in dma_addr + */ + if (!mic_addr) { + WARN_ON(1); + return mic_addr; + } + else + return (mic_addr + (dma_addr & MIC_SYSTEM_PAGE_MASK)); +} + +/* + * Unmaps mic_addr to mic_addr + size memory in the smpt table + * of board bid + */ +void mic_unmap(int bid, dma_addr_t mic_addr, size_t size) +{ + mic_ctx_t *mic_ctx = get_per_dev_ctx(bid); + mic_smpt_t *mic_smpt = mic_ctx->mic_smpt; + int64_t ref[NUM_SMPT_ENTRIES_IN_USE]; + int num_smpt; + int spt = HOSTMIC_PA_TO_SMPT(mic_addr); + int i; + unsigned long flags; + + if (!size) + return; + + if (!IS_MIC_SYSTEM_ADDR(mic_addr)) { + WARN_ON(1); + return; + } + + /* Get number of smpt entries to be mapped, ref count array */ + num_smpt = get_smpt_ref_count(ref, mic_addr, size, NULL); + + spin_lock_irqsave(&mic_ctx->smpt_lock, flags); + +#if SMPT_LOGGING + unmap_count_g++; + smpt_ref_count_g[bid] -= (int64_t)size; +#endif + + for (i = spt; i < spt + num_smpt; i++) { + mic_smpt[i].ref_count -= ref[i - spt]; + WARN_ON(mic_smpt[i].ref_count < 0); + } + spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags); +} + +dma_addr_t mic_to_dma_addr(int bid, dma_addr_t mic_addr) +{ + mic_ctx_t *mic_ctx = get_per_dev_ctx(bid); + int spt = HOSTMIC_PA_TO_SMPT(mic_addr); + dma_addr_t dma_addr; + + if (!IS_MIC_SYSTEM_ADDR(mic_addr)) { + WARN_ON(1); + return 0; + } + dma_addr = mic_ctx->mic_smpt[spt].dma_addr + SMPT_OFFSET(mic_addr); + return dma_addr; +} + +#endif + +bool is_syspa(dma_addr_t pa) +{ + return IS_MIC_SYSTEM_ADDR(pa); +} diff --git a/micscif/micscif_sysfs.c b/micscif/micscif_sysfs.c new file mode 100644 index 0000000..c38a383 --- /dev/null +++ b/micscif/micscif_sysfs.c @@ -0,0 +1,234 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include + +unsigned long scif_get_maxid(void); +static ssize_t show_scif_maxid(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_maxid); +} +static DEVICE_ATTR(maxnode, S_IRUGO, show_scif_maxid, NULL); + +unsigned long scif_get_total(void); +static ssize_t show_scif_total(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_total); +} +static DEVICE_ATTR(total, S_IRUGO, show_scif_total, NULL); + +unsigned long scif_get_nodes(void); +static ssize_t show_scif_nodes(struct device *dev, struct device_attribute *attr, char *buf) +{ + int len = 0; + int node; + + len += snprintf(buf + len, PAGE_SIZE, "%d:", ms_info.mi_total); + len += snprintf(buf + len, PAGE_SIZE, "%d", ms_info.mi_nodeid); + + for (node = 0; node <= ms_info.mi_maxid; node++) { + if (scif_dev[node].sd_state == SCIFDEV_RUNNING || + scif_dev[node].sd_state == SCIFDEV_SLEEPING || + is_self_scifdev(&scif_dev[node])) { + len += snprintf(buf + len, PAGE_SIZE, ",%d", scif_dev[node].sd_node); + } + } + + len += snprintf(buf + len, PAGE_SIZE, "\n"); + return len; +} +static DEVICE_ATTR(nodes, S_IRUGO, show_scif_nodes, NULL); + +static ssize_t show_watchdog_to(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_watchdog_to); +} + +static ssize_t store_watchdog_to(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int i, ret; + + if (sscanf(buf, "%d", &i) != 1) + goto invalid; + + if (i <= 0) + goto invalid; + + ms_info.mi_watchdog_to = i; + ret = strlen(buf); + printk("Current watchdog timeout %d seconds\n", ms_info.mi_watchdog_to); + goto bail; + +invalid: + printk(KERN_ERR "Attempt to set invalid watchdog timeout\n"); + ret = -EINVAL; +bail: + return ret; +} +static DEVICE_ATTR(watchdog_to, S_IRUGO | S_IWUSR, show_watchdog_to, store_watchdog_to); + +static ssize_t show_watchdog_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_watchdog_enabled); +} + +static ssize_t store_watchdog_enabled(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int i, ret; +#ifndef _MIC_SCIF_ + struct micscif_dev *scifdev; + int node; +#endif + + if (sscanf(buf, "%d", &i) != 1) + goto invalid; + + if (i < 0) + goto invalid; + + if (i && !ms_info.mi_watchdog_enabled) { + ms_info.mi_watchdog_enabled = 1; +#ifndef _MIC_SCIF_ + for (node = 1; node <= ms_info.mi_maxid; node++) { + scifdev = &scif_dev[node]; + if (scifdev->sd_ln_wq) + queue_delayed_work(scifdev->sd_ln_wq, + &scifdev->sd_watchdog_work, NODE_ALIVE_TIMEOUT); + } +#endif + } + + if (!i) + ms_info.mi_watchdog_enabled = 0; + + ret = strlen(buf); + printk("Watchdog timeout enabled = %d\n", ms_info.mi_watchdog_enabled); + goto bail; +invalid: + ret = -EINVAL; +bail: + return ret; +} +static DEVICE_ATTR(watchdog_enabled, S_IRUGO | S_IWUSR, show_watchdog_enabled, store_watchdog_enabled); + +static ssize_t show_watchdog_auto_reboot(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_watchdog_auto_reboot); +} + +static ssize_t store_watchdog_auto_reboot(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int i, ret; + + if (sscanf(buf, "%d", &i) != 1) + goto invalid; + + if (i < 0) + goto invalid; + + if (i && 
!ms_info.mi_watchdog_auto_reboot) + ms_info.mi_watchdog_auto_reboot = 1; + + if (!i) + ms_info.mi_watchdog_auto_reboot = 0; + + ret = strlen(buf); + printk("Watchdog auto reboot enabled = %d\n", ms_info.mi_watchdog_auto_reboot); + goto bail; +invalid: + ret = -EINVAL; +bail: + return ret; +} +static DEVICE_ATTR(watchdog_auto_reboot, S_IRUGO | S_IWUSR, show_watchdog_auto_reboot, store_watchdog_auto_reboot); + +static ssize_t show_proxy_dma_threshold(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lld\n", ms_info.mi_proxy_dma_threshold); +} + +static ssize_t store_proxy_dma_threshold(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int ret; + uint64_t i; + + if (sscanf(buf, "%lld", &i) != 1) + goto invalid; + + ms_info.mi_proxy_dma_threshold = i; + ret = strlen(buf); + printk("P2P proxy DMA Threshold = %lld bytes\n", ms_info.mi_proxy_dma_threshold); + goto bail; +invalid: + ret = -EINVAL; +bail: + return ret; +} +static DEVICE_ATTR(proxy_dma_threshold, S_IRUGO | S_IWUSR, show_proxy_dma_threshold, store_proxy_dma_threshold); + +static struct attribute *scif_attributes[] = { + &dev_attr_maxnode.attr, + &dev_attr_total.attr, + &dev_attr_nodes.attr, + &dev_attr_watchdog_to.attr, + &dev_attr_watchdog_enabled.attr, + &dev_attr_watchdog_auto_reboot.attr, + &dev_attr_proxy_dma_threshold.attr, + NULL +}; + +struct attribute_group scif_attr_group = { + .attrs = scif_attributes +}; diff --git a/micscif/micscif_va_gen.c b/micscif/micscif_va_gen.c new file mode 100644 index 0000000..7338a57 --- /dev/null +++ b/micscif/micscif_va_gen.c @@ -0,0 +1,480 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +/* ************************************************************************* *\ +generate a virtual address for a given size +\* ************************************************************************* */ +#include "mic/micscif.h" + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::Initialize + +DESCRIPTION: Initialize VaGenAddress to point to one node of size = range +\* ************************************************************************* */ +static int +va_gen_init_internal(struct va_gen_addr *addr, uint64_t range) +{ + struct va_node *node; + int err; + + va_node_init(&addr->allocator); + if ((err = va_node_alloc(&addr->allocator, &addr->hole_list)) < 0) + goto init_err; + if (va_node_is_valid(addr->hole_list)) { + node = va_node_get(&addr->allocator, addr->hole_list); + node->next = invalid_va_node_index; + node->base = 0; + node->range = range; + } + addr->claims_list = invalid_va_node_index; +init_err: + return err; +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::Alloc +Allocate virtual memory by searching through free virtual memory +linked list for first range >= desired range. + +Note: Free list is sorted by base, we are searching for range. + +Return: Offset to allocated virtual address if successful (in pages). +INVALID_VA_PAGE_INDEX if failed +\* ************************************************************************* */ +static uint64_t +va_gen_alloc_internal(struct va_gen_addr *addr, uint64_t range) +{ + //========================================================================== + // Search for a sufficiently large memory hole (first-fit). + //-------------------------------------------------------------------------- + + // Search for first available hole of sufficient size. + uint32_t index = addr->hole_list; + struct va_node *pFind; + // Used to handle case of an exact range match. + struct va_node *pPrev = 0; + uint64_t base; + + if (0 == range || !va_node_is_valid(addr->hole_list)) + return INVALID_VA_PAGE_INDEX; + + pFind = va_node_get(&addr->allocator, index); + + for ( ; ; ) { + if (pFind->range >= range) + break; + else { + index = pFind->next; + // No hole sufficiently large. + if (!va_node_is_valid(index)) + return INVALID_VA_PAGE_INDEX; + pPrev = pFind; + pFind = va_node_get(&addr->allocator, index); + } + } + + // Found an adequate hole. Get its base. + base = pFind->base; + + //============================================================================ + // Uncommon case: pFind->range == in_range + // Remove node from the hole list when exact fit. Note, could leave the + // hole list empty. + //---------------------------------------------------------------------------- + + if (pFind->range == range) { + // first node? + if (addr->hole_list == index) + addr->hole_list = pFind->next; + else { + BUG_ON(!pPrev); + pPrev->next = pFind->next; + } + va_node_free(&addr->allocator, index); + return base; + } + + //================================================================================ + // Shrink an existing node that is too large. + //-------------------------------------------------------------------------------- + + else { + pFind->base += range; + pFind->range -= range; + } + + return base; +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::FreeClaim + +DESCRIPTION: +Removes claimed range from the claims list. 
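+Three cases are handled: the freed range starts at a claim node's base,
+ends at the node's end, or falls in the middle, in which case the node
+is split into two nodes.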
+\* ************************************************************************* */ +static void +va_gen_free_claim(struct va_gen_addr *addr, uint64_t base, uint64_t range) +{ + struct va_node *pNode = 0; + struct va_node *pPrev = 0; + uint32_t index, new_index; + struct va_node *pNewNode; + int err; + + if (0 == range) + return; + + for (index = addr->claims_list; va_node_is_valid(index); index = pNode->next) { + pNode = va_node_get(&addr->allocator, index); + + if (pNode->base <= base && pNode->base + pNode->range >= base + range) { + if (pNode->base == base) { + pNode->base += range; + pNode->range -= range; + if (0 == pNode->range) { + if (pPrev) + pPrev->next = pNode->next; + else + addr->claims_list = pNode->next; + va_node_free(&addr->allocator, index); + } + } else if (pNode->base + pNode->range == base + range) { + pNode->range -= range; + } else { + err = va_node_alloc(&addr->allocator, &new_index); + BUG_ON(err < 0); + pNewNode = va_node_get(&addr->allocator, new_index); + pNewNode->base = base + range; + pNewNode->range = pNode->range - pNewNode->base; + pNewNode->next = pNode->next; + pNode->range = base - pNode->base; + pNode->next = new_index; + } + return; + } + if (pNode->base > base + range) { + pr_debug("Freed claim not found in the list\n"); + return; + } + + if ((pNode->base < base) ? + (pNode->base + pNode->range > base) : + (base + range > pNode->base)) { + pr_debug("Freed claim partially overlaps the list\n"); + return; + } + pPrev = pNode; + } +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::InsertAndCoalesce + +DESCRIPTION: +O(n) search through free list sorted by base +should average O(n/2), and free list should be much less than the # allocated +coalesce with node before/after if possible +3 possible outcomes: +1. freed node is inserted into list (0 deallocated) +2. freed node range coalesced with existing node, +so freed node can be deallocated (1 deallocated) +3. freed node + another node are coalesced + deallocated +(2 deallocated) +Fails if there is full or partial overlap between inserted +range and ranges in the list + +returns false if insert failed +\* ************************************************************************* */ +static int +va_gen_insert_and_coalesce(struct va_node_allocator *allocator, uint32_t *list, + uint64_t base, uint64_t range) +{ + // search through free list, insert ordered + // also check for coalesce + uint32_t findPtr = *list; + uint32_t prev = *list; + uint64_t end_range = base + range; + uint32_t nextPtr, ptr; + struct va_node *nextNode, *node; + int err; + + while (va_node_is_valid(findPtr)) { + struct va_node *find = va_node_get(allocator, findPtr); + // overlap? + // A.start < B.start && A.end > B.start A-B==A-B A-B==B-A otherwise A-A B-B + // B.start < A.start && B.end > A.start B-A==B-A B-A==A-B otherwise B-B A-A + // => + // A.start < B.start ? A.end > B.start : B.end > A.start + + if ((find->base < base) ? + (find->base + find->range > base) : + (end_range > find->base)) { + return -1; + } + //---------------------------------------------------------- + // coalesce? 2 possibilities: + // 1. (pFind->base + pFind->range) == current.base + // coalesce, check next node base = endrange, + // coalesce with next if possible, deallocate next, exit + // 2. 
end_range == pFind->base + // coalesce, exit + if (end_range == find->base) { + // pr_debug("Coalesce base %lld before %lld\n", base, find->base); + find->base = base; + find->range += range; + return 0; + } else if ((find->base + find->range) == base) { + // pr_debug("Coalesce base %lld after %lld\n", base, find->base); + // leave the base unchanged + find->range += range; + // check the next node to see if it coalesces too + nextPtr = find->next; + if (va_node_is_valid(nextPtr)) { + nextNode = va_node_get(allocator, nextPtr); + // end_range is the same after prior coalesce + if (nextNode->base == end_range) { + // pr_debug("Double Coalesce index %d before %d\n", findPtr, nextPtr); + find->range += nextNode->range; + find->next = nextNode->next; + va_node_free(allocator, nextPtr); + } + } + return 0; + } + // end coalesce + + //---------------------------------------------------------- + // insert if found a node at a greater address + else if (find->base > end_range) + // exit loop, insert node + break; + // nothing found yet, next index + prev = findPtr; + findPtr = find->next; + } + + //---------------------------------------------------------- + // insert or append if node + // could be at the end or empty free list (find index = INVALID) + // or, next node has larger base + //---------------------------------------------------------- + err = va_node_alloc(allocator, &ptr); + BUG_ON(err < 0); + if (!va_node_is_valid(ptr)) { + printk(KERN_ERR "FAILED to add hole! base = %lld, range = %lld\n", base, range); + return 0; + } + node = va_node_get(allocator, ptr); + node->base = base; + node->range = range; + node->next = findPtr; + // First node or empty list (Alloc() can empty the list) + if (findPtr == *list) + // pr_debug("List now starts with %d\n", ptr); + *list = ptr; + else { // reached the end of the list or insertion + BUG_ON(!va_node_is_valid(prev)); + // pr_debug("Append index %d after %d\n", ptr, prev); + (va_node_get(allocator, prev))->next = ptr; + } + return 0; +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::Free + +DESCRIPTION: +Frees allocated Virtual Address. Inserts freed range in the list of holes +(available virtual addresses) +\* ************************************************************************* */ +static void +va_gen_free_internal(struct va_gen_addr *addr, uint64_t base, uint64_t range) +{ + int result = va_gen_insert_and_coalesce(&addr->allocator, &addr->hole_list, base, range); + BUG_ON(result < 0); +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::Alloc +Allocate virtual memory space. + +Note: "Quick and dirty" implementation of aligned Alloc on top of +non-aligned Alloc. + +Return: Offset to allocated virtual address if successful (in pages). 
+INVALID_VA_PAGE_INDEX if failed +\* ************************************************************************* */ +static uint64_t +va_gen_alloc_aligned(struct va_gen_addr *addr, uint64_t range, uint32_t unit_align) +{ + uint64_t base_address = va_gen_alloc_internal(addr, range + unit_align - 1); + uint64_t aligned_base = base_address; + if (0 == range || 0 == unit_align) + return INVALID_VA_PAGE_INDEX; + //BUG_ON(IsPowerOfTwo(in_unitAlign)); + + if (unit_align == 1 || base_address == INVALID_VA_PAGE_INDEX) + return base_address; + + if (aligned_base > base_address) + va_gen_free_internal(addr, base_address, aligned_base - base_address); + + if (aligned_base + range < base_address + unit_align - 1) + va_gen_free_internal(addr, aligned_base + range, + base_address + unit_align - 1 - aligned_base - range); + return aligned_base; +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddress::Claim + +DESCRIPTION: +Claims a SVAS range. Checks if range was claimed before; if not, records +the claim in the claims list + +returns false if claim failed +\* ************************************************************************* */ +static int +va_gen_claim_internal(struct va_gen_addr *addr, uint64_t base, uint64_t range) +{ + return va_gen_insert_and_coalesce(&addr->allocator, &addr->claims_list, base, range); +} + +/* ************************************************************************* *\ +FUNCTION: VaGenAddressMutex::Alloc +Allocate virtual memory space. + +Note: Wrapper for unit-testable address generator to add critical +section and convert bytes to pages. +Note: Free() selects between Free[Alloc] and FreeClaim based on +the address range of the freed address. + +Return: Allocated virtual address if successful (in bytes) +INVALID_VA_GEN_ADDRESS if failed +\* ************************************************************************* */ +uint64_t +va_gen_alloc(struct va_gen_addr *addr, uint64_t num_bytes, uint32_t align_bytes) +{ + // Convert input bytes to pages which is our unit for the address generator. + uint64_t num_pages = (uint64_t)(((PAGE_SIZE - 1) + num_bytes) / PAGE_SIZE); + uint64_t align_pages = align_bytes / PAGE_SIZE; + uint64_t va_page_index, ret; + + if (align_bytes < PAGE_SIZE) { + ret = INVALID_VA_GEN_ADDRESS; + WARN_ON(1); + goto done; + } + + if (num_bytes > (0xffffffffULL * PAGE_SIZE)) { + ret = INVALID_VA_GEN_ADDRESS; + WARN_ON(1); + goto done; + } + va_page_index = va_gen_alloc_aligned(addr, num_pages, (uint32_t)(align_pages % 0xffffffff) ); + + if (va_page_index == INVALID_VA_PAGE_INDEX) + return INVALID_VA_GEN_ADDRESS; + + // Convert page number to virtual address, adding base. 
+ ret = va_page_index << PAGE_SHIFT; + ret += addr->base; +done: + return ret; +} + +// Claims ownership of a memory region +uint64_t +va_gen_claim(struct va_gen_addr *addr, uint64_t address, uint64_t num_bytes) +{ + uint64_t va, num_pages; + int result; + + if (address + num_bytes > addr->base) + address = INVALID_VA_GEN_ADDRESS; + else if (address & (PAGE_SIZE - 1)) + // address not aligned + address = INVALID_VA_GEN_ADDRESS; + else { + va = (uint64_t)(address >> PAGE_SHIFT); + // pr_debug("%s %d (%#llx,%llx)\n", __func__, __LINE__, va, num_bytes); + // convert input bytes to pages, our unit for the address generator + num_pages = (uint64_t)(((PAGE_SIZE-1) + num_bytes) / PAGE_SIZE); + if ((result = va_gen_claim_internal(addr, va, num_pages)) < 0) + address = INVALID_VA_GEN_ADDRESS; + } + return address; +} + +// frees the address range so the pages may be re-assigned +void +va_gen_free(struct va_gen_addr *addr, uint64_t address, uint64_t num_bytes) +{ + uint64_t va, num_pages; + + if (address >= addr->base) { + // convert virtual address to page number, subtracting base + address -= addr->base; + va = (uint64_t)(address >> PAGE_SHIFT); + // pr_debug("%s %d (%#llx,%llx)\n", __func__, __LINE__, va, num_bytes); + // convert input bytes to pages, our unit for the address generator + num_pages = (uint64_t)(((PAGE_SIZE-1) + num_bytes) / PAGE_SIZE); + va_gen_free_internal(addr, va, num_pages); + } else { + va = (uint64_t)(address >> PAGE_SHIFT); + // pr_debug("%s %d (%#llx,%llx)\n", __func__, __LINE__, va, num_bytes); + // convert input bytes to pages, our unit for the address generator + num_pages = (uint64_t)(((PAGE_SIZE-1) + num_bytes) / PAGE_SIZE); + va_gen_free_claim(addr, va, num_pages); + } +} + +// base and range in bytes, though internal va generator works in pages +int +va_gen_init(struct va_gen_addr *addr, uint64_t base, uint64_t range) +{ + uint64_t rangeInPages = (uint64_t)(range >> PAGE_SHIFT); + int ret; + + if (!(ret = va_gen_init_internal(addr, rangeInPages))) + addr->base = base; + return ret; +} + +void +va_gen_destroy(struct va_gen_addr *addr) +{ + va_node_destroy(&addr->allocator); +} diff --git a/micscif/micscif_va_node.c b/micscif/micscif_va_node.c new file mode 100644 index 0000000..363b471 --- /dev/null +++ b/micscif/micscif_va_node.c @@ -0,0 +1,187 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. 
Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/***************************************************************************\ +manage available nodes for VaGenAddress +\***************************************************************************/ +#include "mic/micscif.h" + +/***************************************************************************\ +FUNCTION: va_node_init + +DESCRIPTION: constructor for allocator for GfxGenAddress +\***************************************************************************/ +void va_node_init(struct va_node_allocator *node) +{ + node->pp_slab_directory = 0; + node->slab_shift = 7; /* 2^7 -> 128 nodes in the slab */ + node->nodes_in_slab = 1 << node->slab_shift; + node->slab_mask = (node->nodes_in_slab-1); + node->num_slabs = 0; + node->num_free_slabs = 0; + node->free_list = invalid_va_node_index; +} + +int va_node_is_valid(uint32_t index) +{ + return invalid_va_node_index != index; +} + +/************************************************************************** *\ +FUNCTION: va_node_destroy + +DESCRIPTION: destructor for allocator for GfxGenAddress +\************************************************************************** */ +void va_node_destroy(struct va_node_allocator *node) +{ + uint32_t i; + if (node->pp_slab_directory) { + for (i = 0; i < node->num_slabs; i++) { + kfree(node->pp_slab_directory[i]); + node->pp_slab_directory[i] = NULL; + } + kfree(node->pp_slab_directory); + node->pp_slab_directory = NULL; + } +} + +/* ************************************************************************* *\ +FUNCTION: va_node_realloc + +DESCRIPTION: va_node_realloc to add more node arrays +\* ************************************************************************* */ +static int va_node_realloc(struct va_node_allocator *node) +{ + uint32_t growSlabs = 2 * (node->num_slabs) + 1; + struct va_node **ppGrowDirectory = + kzalloc(sizeof(struct va_node *) * growSlabs, GFP_KERNEL); + uint32_t i; + + if (!ppGrowDirectory) + return -ENOMEM; + + if (node->num_slabs) { + for (i = 0; i < node->num_slabs; i++) + ppGrowDirectory[i] = node->pp_slab_directory[i]; + kfree(node->pp_slab_directory); + node->pp_slab_directory = NULL; + } + node->pp_slab_directory = ppGrowDirectory; + node->num_free_slabs = growSlabs - node->num_slabs; + return 0; +} + +/* ************************************************************************* *\ +FUNCTION: va_node_grow + +DESCRIPTION: add a node array +\* ************************************************************************* */ +static int va_node_grow(struct va_node_allocator *node) +{ + struct va_node *pNewSlab; + uint32_t i, start; + int ret; + + if (!node->num_free_slabs) + if ((ret = va_node_realloc(node)) < 0) + return ret; + + pNewSlab = kzalloc(sizeof(struct va_node) * + node->nodes_in_slab, GFP_KERNEL); + if (pNewSlab) + node->pp_slab_directory[node->num_slabs] = pNewSlab; + else + return -ENOMEM; + + 
/*-------------------------------------------------------- + * add new nodes to free list + * slightly better than just calling free() for each index + */ + start = node->num_slabs * node->nodes_in_slab; + for (i = 0; i < (node->nodes_in_slab-1); i++) + /* we could optimize this, but why bother? */ + pNewSlab[i].next = start + i + 1; + /* add new allocations to start of list */ + pNewSlab[node->nodes_in_slab-1].next = node->free_list; + node->free_list = start; + /*-------------------------------------------------------*/ + + /* update bookkeeping for array of arrays */ + node->num_slabs++; + node->num_free_slabs--; + return 0; +} + +/* ************************************************************************* *\ +FUNCTION: va_node_get + +DESCRIPTION: return a node reference from index +\* ************************************************************************* */ +struct va_node *va_node_get(struct va_node_allocator *node, uint32_t index) +{ + uint32_t slabIndex = index >> node->slab_shift; + uint32_t nodeIndex = index & node->slab_mask; + + return &node->pp_slab_directory[slabIndex][nodeIndex]; +} + +/* ************************************************************************* *\ +FUNCTION: va_node_alloc + +DESCRIPTION: return 0 on success with valid index in out_alloc or errno on failure. +\* ************************************************************************* */ +int va_node_alloc(struct va_node_allocator *node, uint32_t *out_alloc) +{ + int ret; + + if (!va_node_is_valid(node->free_list)) + if ((ret = va_node_grow(node)) < 0) + return ret; + *out_alloc = node->free_list; + node->free_list = (va_node_get(node, *out_alloc))->next; + return 0; +} + +/* ************************************************************************* *\ +FUNCTION: va_node_free + +DESCRIPTION: make a node available +\* ************************************************************************* */ +void va_node_free(struct va_node_allocator *node, uint32_t index) +{ + struct va_node *tmp = va_node_get(node, index); + tmp->next = node->free_list; + node->free_list = index; +} diff --git a/mpssboot/Kbuild b/mpssboot/Kbuild new file mode 100644 index 0000000..c58d6c8 --- /dev/null +++ b/mpssboot/Kbuild @@ -0,0 +1 @@ +obj-m := mpssboot.o diff --git a/mpssboot/mpssboot.c b/mpssboot/mpssboot.c new file mode 100644 index 0000000..7939613 --- /dev/null +++ b/mpssboot/mpssboot.c @@ -0,0 +1,238 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. 
Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ACPT_BOOTED 1 +#define ACPT_BOOT_ACK 2 +#define ACPT_NACK_VERSION 3 +#define ACPT_REQUEST_TIME 4 +#define ACPT_TIME_DATA 5 + +#define ACPT_VERSION 1 + +static dev_t dev; +static struct class *class; +static struct device *mbdev; + +static int host_notified; +static struct timespec tod; +static int timeset = 0; + +static ssize_t +show_timesync(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "Time: %s\n", timeset? "set" : "not set"); +} + +static ssize_t +set_synctime(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + struct scif_portID port = {0, MIC_NOTIFY}; + static scif_epd_t epd; + int proto = ACPT_REQUEST_TIME; + int version = ACPT_VERSION; + int err; + + epd = scif_open(); + + if ((err = scif_connect(epd, &port))) { + printk("MPSSBOOT error, synctime connect failed: %d\n", err); + goto close_synctime; + } + + if ((err = scif_send(epd, &version, sizeof(version), 0)) != sizeof(version)) { + printk("MPSSBOOT send version failed: %d\n", err); + goto close_synctime; + } + + if ((err = scif_send(epd, &proto, sizeof(proto), 0)) != sizeof(proto)) { + printk("MPSSBOOT send boot finished failed: %d\n", err); + goto close_synctime; + } + + if ((err = scif_recv(epd, &proto, sizeof(proto), SCIF_RECV_BLOCK)) != sizeof(proto)) { + printk("MPSSBOOT protocol recv ack failed: %d\n", err); + goto close_synctime; + } + + if (proto != ACPT_TIME_DATA) { + printk("MPSSBOOT failed to receive time data packet %d\n", proto); + goto close_synctime; + } + + if ((err = scif_recv(epd, &tod, sizeof(tod), SCIF_RECV_BLOCK)) != sizeof(tod)) { + printk("MPSSBOOT time data read size failed: %d\n", err); + goto close_synctime; + } + + do_settimeofday(&tod); + printk("MPSSBOOT Time of day sycned with host\n"); + timeset = 1; + +close_synctime: + scif_close(epd); + return count; +} +static DEVICE_ATTR(synctime, S_IRUGO | S_IWUSR, show_timesync, set_synctime); + +static ssize_t +show_host_notified(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", host_notified); +} + +static ssize_t +set_host_notified(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + struct scif_portID port = {0, MIC_NOTIFY}; + static scif_epd_t epd; + int proto = ACPT_BOOTED; + int version = ACPT_VERSION; + int err; + + epd = scif_open(); + + if ((err = scif_connect(epd, &port))) { + printk("MPSSBOOT error, notify 
connect failed: %d\n", err); + goto close_notify; + } + + if ((err = scif_send(epd, &version, sizeof(version), 0)) != sizeof(version)) { + printk("MPSSBOOT send version failed: %d\n", err); + goto close_notify; + } + + if ((err = scif_send(epd, &proto, sizeof(proto), 0)) != sizeof(proto)) { + printk("MPSSBOOT send boot finished failed: %d\n", err); + goto close_notify; + } + + if ((err = scif_recv(epd, &proto, sizeof(proto), SCIF_RECV_BLOCK)) != sizeof(proto)) { + printk("MPSSBOOT protocol recv ack failed: %d\n", err); + goto close_notify; + } + + if (proto != ACPT_BOOT_ACK) + printk("MPSSBOOT failed to receive boot ACK, got %d\n", proto); + else + printk("MPSSBOOT Boot acknowledged\n"); + +close_notify: + scif_close(epd); + return count; +} +static DEVICE_ATTR(host_notified, S_IRUGO | S_IWUSR, show_host_notified, set_host_notified); + +static struct attribute *mb_attributes[] = { + &dev_attr_synctime.attr, + &dev_attr_host_notified.attr, + NULL +}; + +struct attribute_group mb_attr_group = { + .attrs = mb_attributes +}; + +/* This function closes the endpoint established on init */ +static void +mpssboot_exit(void) +{ + sysfs_remove_group(&mbdev->kobj, &mb_attr_group); + device_destroy(class, dev); + class_destroy(class); +} + +static char * +mpssboot_devnode(struct device *dev, mode_t *mode) +{ + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +/* This function initializes a SCIF connection to the host */ +static int +mpssboot_init(void) +{ + //static struct device dev; + int result; + + alloc_chrdev_region(&dev, 0, 2, "micnotify"); + class = class_create(THIS_MODULE, "micnotify"); + class->devnode = mpssboot_devnode; + mbdev = device_create(class, NULL, dev, NULL, "notify"); + + result = sysfs_create_group(&mbdev->kobj, &mb_attr_group); + result = result; + return 0; +} + +module_init(mpssboot_init); +module_exit(mpssboot_exit); +MODULE_LICENSE("GPL"); + diff --git a/pm_scif/Kbuild b/pm_scif/Kbuild new file mode 100644 index 0000000..4f49d0d --- /dev/null +++ b/pm_scif/Kbuild @@ -0,0 +1 @@ +obj-m := pm_scif.o diff --git a/pm_scif/pm_scif.c b/pm_scif/pm_scif.c new file mode 100644 index 0000000..aa18b7a --- /dev/null +++ b/pm_scif/pm_scif.c @@ -0,0 +1,439 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. 
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pm_scif.h" + +#define PM_DB(fmt, ...) printk(KERN_ALERT"[ %s : %d ]:"fmt,__func__, __LINE__, ##__VA_ARGS__) +#define FUNCTION_ENTRY PM_DB("==> %s\n", __func__) +#define FUNCTION_EXIT PM_DB("<== %s\n", __func__) + +#define PM_SCIF_RETRY_COUNT 5 + +DEFINE_RWLOCK(pmscif_send); + +static atomic_t epinuse = ATOMIC_INIT(0); +void pm_scif_exit(void); + +typedef struct _mic_pm_scif { + scif_epd_t ep; + int lport; + struct scif_portID rport_id; + struct workqueue_struct *pm_recvq; + struct work_struct pm_recv; + PM_CONNECTION_STATE con_state; +} mic_pm_scif; + +mic_pm_scif *pm_scif; + +void +pm_dump(char *buf, size_t len) +{ + int i = 0; + + for ( i=0; i < len; i++) { + + if (i % 8) + printk(KERN_ALERT"\n"); + printk(KERN_ALERT"%x ", buf[i]); + } +} + +static void pm_handle_open (void *msg, size_t len) +{ + FUNCTION_ENTRY; + pm_dump((char*)msg, len); +} + +static void pm_handle_test (void *msg, size_t len) +{ + FUNCTION_ENTRY; + pm_dump((char*)msg, len); + +} +typedef void (*_pm_msg_handler)(void*, size_t); + +typedef struct _pm_msg_call { + _pm_msg_handler handler; + char *name; +}pm_msg_call; + +#define PM_HANDLE_ADD(opcode, function) [(opcode)] = {(function), #function} + +pm_msg_call pm_msg_caller[PM_MESSAGE_MAX] = { + PM_HANDLE_ADD(PM_MESSAGE_OPEN, pm_handle_open), + PM_HANDLE_ADD(PM_MESSAGE_TEST, pm_handle_test) +}; + +int +pm_send_to_host(PM_MESSAGE opcode, void *msg, size_t len) +{ +// FUNCTION_ENTRY; + int err = 0; + size_t psize = sizeof(pm_msg_header) + len; + char *payload; + unsigned long flags; + + if (pm_scif->con_state != PM_CONNECTED) { + err = -EINVAL; + goto error; + } + + if (!(payload = kmalloc(psize, GFP_ATOMIC))) { + err = -ENOMEM; + goto error; + } + read_lock_irqsave(&pmscif_send,flags); + + if (atomic_xchg(&epinuse,1) != 0) { + read_unlock_irqrestore(&pmscif_send,flags); + kfree(payload); + return -1; + } + + ((pm_msg_header*)payload)->opcode = opcode; + ((pm_msg_header*)payload)->len = len; + if (len) + memcpy((char*)payload + sizeof(pm_msg_header), msg, len); + + //0 for non blocking + if ((err = scif_send(pm_scif->ep, payload, psize, 0)) < 0) { + PM_DB("scif_recv failed\n"); + } + atomic_set(&epinuse,0); + //for (i = 0; i < psize; i++) + // printk(KERN_ALERT" buff: %X\n", payload[i]); + read_unlock_irqrestore(&pmscif_send,flags); + kfree(payload); +// FUNCTION_EXIT; +error: + return err; +} + +EXPORT_SYMBOL(pm_send_to_host); + +static struct mic_pmscif_handle micpmscif = { + .pm_scif_uos2host = pm_send_to_host, + .pm_scif_host2uos = NULL, + .owner = THIS_MODULE, +}; + + + +static void pm_send_to_uos(pm_msg_header *header, char *msg) +{ + if(micpmscif.pm_scif_host2uos) { + micpmscif.pm_scif_host2uos(header, msg); + } +} + +static void +pm_recv_from_host(struct work_struct *work) +{ + int err = 0; + char *msg = NULL; + pm_msg_header *header; + mic_pm_scif *pm_scif_info = container_of(work, mic_pm_scif, pm_recv); + + FUNCTION_ENTRY; + if (pm_scif->con_state != PM_CONNECTED) + goto exit; + + 
header = kmalloc(sizeof(pm_msg_header), GFP_KERNEL); + + if ((err = scif_recv(pm_scif_info->ep, header, sizeof(pm_msg_header), + SCIF_RECV_BLOCK)) < 0) { + PM_DB("scif_recv failed\n"); + goto end_con; + } + + msg = kmalloc(header->len, GFP_KERNEL); + + if ((err = scif_recv(pm_scif_info->ep, msg, header->len, + SCIF_RECV_BLOCK)) < 0) { + PM_DB("scif_recv failed\n"); + goto end_con; + } + if(header->opcode < PM_MESSAGE_MAX) { + if ((header->opcode != PM_MESSAGE_CLOSE) && + (header->opcode != PM_MESSAGE_CLOSE_ACK)) { + if(pm_msg_caller[header->opcode].handler) + pm_msg_caller[header->opcode].handler(msg, header->len); + pm_send_to_uos(header, msg); + } else { + if (header->opcode == PM_MESSAGE_CLOSE) { + pm_send_to_uos(header,msg); + pm_send_to_host(PM_MESSAGE_CLOSE_ACK, NULL, 0); + } + pm_scif->con_state = PM_DISCONNECTING; + goto end_con; + } + } + else + printk("pm_scif: Recvd scif message with bad opcode %d\n", + header->opcode); + kfree(header); + kfree(msg); + queue_work(pm_scif->pm_recvq, &pm_scif->pm_recv); + return; + +end_con: + kfree(header); + kfree(msg); +exit: + FUNCTION_EXIT; +} + +#ifdef PM_SCIF_IOCTL +static int +spm_ioctl(struct inode *in, struct file *f, unsigned int cmd, unsigned long arg) +{ + int i = 0; + uint32_t payload = 0xc0de0000; + + FUNCTION_ENTRY; + for (i = 0; i < PM_MESSAGE_TEST; i++) { + payload++; + //PM_DB("sending %s with payload = %x \n", + // pm_msg_caller[i].name, payload); + pm_send_to_host(i, &payload, sizeof(payload)); + } + + return 0; +} + +static long +spm_unlocked_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + return (long) spm_ioctl(f->f_path.dentry->d_inode, f, cmd, arg); +} + +static int +spm_release(struct inode *in, struct file *f) +{ + return 0; +} + +static char * +spm_devnode(struct device *dev, mode_t *mode) +{ + return kasprintf(GFP_KERNEL, "spm/%s", dev_name(dev)); +} + + +static int +spm_open(struct inode *in, struct file *f) +{ + return 0; +} + +struct file_operations spm_ops = { + .owner = THIS_MODULE, + .unlocked_ioctl = spm_unlocked_ioctl, + .open = spm_open, + .release = spm_release, +}; + +int spm_major; +int spm_minor; +dev_t spmdev; +struct cdev spmcdev; +struct class *spmclass; + +static void +spm_dev_deinit(void) +{ + device_destroy(spmclass,spmdev); + class_destroy(spmclass); + cdev_del(&spmcdev); + unregister_chrdev_region(spmdev, 1); +} + +static int +spm_dev_init(void) +{ + int err = 0; + + if (spm_major) { + spmdev = MKDEV(spm_major, spm_minor); + err = register_chrdev_region(spmdev, 1, "spm"); + } + else { + err = alloc_chrdev_region(&spmdev, spm_minor, 1, "spm"); + spm_major = MAJOR(spmdev); + } + + if (err < 0) { + unregister_chrdev_region(spmdev, 1); + goto done; + } + + spmdev = MKDEV(spm_major, spm_minor); + cdev_init(&spmcdev, &spm_ops); + spmcdev.owner = THIS_MODULE; + err = cdev_add(&spmcdev, spmdev, 1); + + if (err) + goto err; + + spmclass = class_create(THIS_MODULE, "spm"); + if (IS_ERR(spmclass)) { + err = PTR_ERR(spmclass); + goto err; + } + + spmclass->devnode = spm_devnode; + device_create(spmclass, NULL, spmdev, NULL, "spm"); + if (IS_ERR(spmclass)) { + err = PTR_ERR(spmclass); + goto err; + } +done: + return err; +err: + spm_dev_deinit(); + return err; +} +#endif + +int pm_scif_init(void) +{ + int err = 1; + int retry = 0; + + FUNCTION_ENTRY; + PM_DB("pm_scif insmoded \n"); +#ifdef PM_SCIF_IOCTL + if ((err = spm_dev_init())) { + PM_DB(" spm_dev_init failed\n"); + goto done; + } +#endif + atomic_set(&epinuse,0); + pm_scif = kzalloc(sizeof(mic_pm_scif), GFP_KERNEL); + + if (!pm_scif) { + 
err = -ENOMEM; + goto end_con; + } + + pm_scif_register(&micpmscif); + + if ((pm_scif->ep = scif_open()) == NULL) { + PM_DB(" scif_open failed\n"); + goto end_con; + } + + if ((pm_scif->lport = scif_bind(pm_scif->ep, 0)) < 0) { + PM_DB(" scif_bind failed\n"); + goto end_con; + } + + PM_DB(" scif_bind successfull. Local port number = %d, ep = \n", + pm_scif->lport); + dump_ep(pm_scif->ep, __func__,__LINE__); + pm_scif->rport_id.node = 0; + pm_scif->rport_id.port = SCIF_PM_PORT_0; + + while ((err = scif_connect(pm_scif->ep, &pm_scif->rport_id)) != 0) { + PM_DB(" scif_connect failed with err = %d ep %p\n",err, + pm_scif->ep); + msleep(1000); + if (retry++ > PM_SCIF_RETRY_COUNT) + goto end_con; + } + + pm_scif->pm_recvq = create_singlethread_workqueue("pm_recvq"); + INIT_WORK(&pm_scif->pm_recv, pm_recv_from_host); + queue_work(pm_scif->pm_recvq, &pm_scif->pm_recv); + pm_scif->con_state = PM_CONNECTED; + err = 0; +#ifdef PM_SCIF_IOCTL +done: +#endif + return err; +end_con: + pm_scif_exit(); + FUNCTION_EXIT; + return err; +} +EXPORT_SYMBOL(pm_scif_init); + +void pm_scif_exit(void) +{ + unsigned long flags; + + FUNCTION_ENTRY; + PM_DB("Good Bye!, pm scif \n"); + + pm_send_to_host(PM_MESSAGE_CLOSE, NULL, 0); + write_lock_irqsave(&pmscif_send,flags); + atomic_set(&epinuse,1); + write_unlock_irqrestore(&pmscif_send,flags); + + if (pm_scif) { + if(pm_scif->pm_recvq) { + flush_workqueue(pm_scif->pm_recvq); + PM_DB("calling destroy\n"); + destroy_workqueue(pm_scif->pm_recvq); + } + + PM_DB("closing ep \n"); + if (pm_scif->ep) + scif_close(pm_scif->ep); + + pm_scif_unregister(&micpmscif); + pm_scif->con_state = PM_DISCONNECTED; + kfree(pm_scif); + } + #ifdef PM_SCIF_IOCTL + spm_dev_deinit(); + #endif + FUNCTION_EXIT; +} + +EXPORT_SYMBOL(pm_scif_exit); + +module_init(pm_scif_init); +module_exit(pm_scif_exit); +MODULE_LICENSE("GPL"); diff --git a/pm_scif/pm_scif.h b/pm_scif/pm_scif.h new file mode 100644 index 0000000..ca275cd --- /dev/null +++ b/pm_scif/pm_scif.h @@ -0,0 +1,48 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. 
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#if !defined(__PM_SCIF_H) +#define __PM_SCIF_H + +struct mic_pmscif_handle{ + int (*pm_scif_uos2host)(PM_MESSAGE opcode, void *msg, size_t len); + int (*pm_scif_host2uos)(pm_msg_header *header, void *msg); + struct module *owner; +}; + +extern int pm_scif_register(struct mic_pmscif_handle *pmscif); +extern void pm_scif_unregister(struct mic_pmscif_handle *pmscif); + +#endif //__PM_SCIF_H diff --git a/ramoops/Kbuild b/ramoops/Kbuild new file mode 100644 index 0000000..53b0def --- /dev/null +++ b/ramoops/Kbuild @@ -0,0 +1 @@ +obj-m := ramoops.o diff --git a/ramoops/ramoops.c b/ramoops/ramoops.c new file mode 100644 index 0000000..76cd53f --- /dev/null +++ b/ramoops/ramoops.c @@ -0,0 +1,163 @@ +/* + * RAM Oops/Panic logger + * + * Copyright (C) 2009 Marco Stornelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kmsg_dump.h> +#include <linux/time.h> +#include <linux/io.h> +#include <linux/ioport.h> + +#define RAMOOPS_KERNMSG_HDR "====" +#define RAMOOPS_HEADER_SIZE (5 + sizeof(struct timeval)) + +#define RECORD_SIZE 4096 + +static ulong mem_address; +module_param(mem_address, ulong, 0600); +MODULE_PARM_DESC(mem_address, + "start of reserved RAM used to store oops/panic logs"); + +static ulong mem_size; +module_param(mem_size, ulong, 0600); +MODULE_PARM_DESC(mem_size, + "size of reserved RAM used to store oops/panic logs"); + +static int dump_oops = 1; +module_param(dump_oops, int, 0600); +MODULE_PARM_DESC(dump_oops, + "set to 1 to dump oopses, 0 to only dump panics (default 1)"); + +static struct ramoops_context { + struct kmsg_dumper dump; + void *virt_addr; + phys_addr_t phys_addr; + unsigned long size; + int count; + int max_count; +} oops_cxt; + +static void ramoops_do_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + struct ramoops_context *cxt = container_of(dumper, + struct ramoops_context, dump); + unsigned long s1_start, s2_start; + unsigned long l1_cpy, l2_cpy; + int res; + char *buf; + struct timeval timestamp; + + /* Only dump oopses if dump_oops is set */ + if ((reason != KMSG_DUMP_OOPS) || !dump_oops) + return; + + buf = (char *)(cxt->virt_addr + (cxt->count * RECORD_SIZE)); + memset(buf, '\0', RECORD_SIZE); + res = sprintf(buf, "%s", RAMOOPS_KERNMSG_HDR); + buf += res; + do_gettimeofday(&timestamp); + res = sprintf(buf, "%lu.%lu\n", (long)timestamp.tv_sec, (long)timestamp.tv_usec); + buf += res; + + l2_cpy = min(l2, 
(unsigned long)(RECORD_SIZE - RAMOOPS_HEADER_SIZE)); + l1_cpy = min(l1, (unsigned long)(RECORD_SIZE - RAMOOPS_HEADER_SIZE) - l2_cpy); + + s2_start = l2 - l2_cpy; + s1_start = l1 - l1_cpy; + + memcpy(buf, s1 + s1_start, l1_cpy); + memcpy(buf + l1_cpy, s2 + s2_start, l2_cpy); + + cxt->count = (cxt->count + 1) % cxt->max_count; +} + +static int __init ramoops_init(void) +{ + struct ramoops_context *cxt = &oops_cxt; + int err = -EINVAL; + + if (!mem_size) { + printk(KERN_ERR "Invalid size specification"); + goto fail3; + } + + rounddown_pow_of_two(mem_size); + + if (mem_size < RECORD_SIZE) { + printk(KERN_ERR "size too small"); + goto fail3; + } + + cxt->max_count = mem_size / RECORD_SIZE; + cxt->count = 0; + cxt->size = mem_size; + cxt->phys_addr = mem_address; + + if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) { + printk(KERN_ERR "ramoops: request mem region failed"); + err = -EINVAL; + goto fail3; + } + + cxt->virt_addr = ioremap(cxt->phys_addr, cxt->size); + if (!cxt->virt_addr) { + printk(KERN_ERR "ramoops: ioremap failed"); + goto fail2; + } + + cxt->dump.dump = ramoops_do_dump; + err = kmsg_dump_register(&cxt->dump); + if (err) { + printk(KERN_ERR "ramoops: registering kmsg dumper failed"); + goto fail1; + } + + return 0; + +fail1: + iounmap(cxt->virt_addr); +fail2: + release_mem_region(cxt->phys_addr, cxt->size); +fail3: + return err; +} + +static void __exit ramoops_exit(void) +{ + struct ramoops_context *cxt = &oops_cxt; + + if (kmsg_dump_unregister(&cxt->dump) < 0) + printk(KERN_WARNING "ramoops: could not unregister kmsg_dumper\n"); + + iounmap(cxt->virt_addr); + release_mem_region(cxt->phys_addr, cxt->size); +} + + +module_init(ramoops_init); +module_exit(ramoops_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Stornelli "); +MODULE_DESCRIPTION("RAM Oops/Panic logger/driver"); + diff --git a/ras/Kbuild b/ras/Kbuild new file mode 100644 index 0000000..7778866 --- /dev/null +++ b/ras/Kbuild @@ -0,0 +1,6 @@ +obj-m := micras.o + +micras-y := micras_main.o micras_common.o +micras-y += micras_core.o micras_uncore.o micras_elog.o +micras-$(CONFIG_ML1OM) += micras_knf.o +micras-$(CONFIG_MK1OM) += micras_knc.o micras_pm.o diff --git a/ras/Makefile b/ras/Makefile new file mode 100644 index 0000000..f5550a7 --- /dev/null +++ b/ras/Makefile @@ -0,0 +1,210 @@ +# +# Build RAS drivers +# +# In Linux 2.6 kernels modules must be built by the kernel's kbuild +# system, with a path to the kernel module source directory. Kbuild +# expects a general purpose Makefile to exist and optionally an extra +# file named Kbuild with the kernel module build details. +# This Makefile is a 'backwards compatible' (see file "modules.txt"). +# +DEBUG = n + +ifneq ($(KERNELRELEASE),) + +# +# Kbuild backwards compatibility part: +# Load Kbuild to specify module targets and options. 
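+# (Editor's note: when kbuild re-invokes this Makefile, KERNELRELEASE is
+# set, so only the include of Kbuild below runs and the cross-compile
+# setup in the else branch is skipped.)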
+# +include Kbuild + +else + +# +# Standard invocation: +# +# Export variables to environment and pass control to kernel tools +# ARCH Target architecture: l1om or k1om +# KERNELDIR Top of MIC kernel tree (not repo source tree) +# DRIVERDIR Top of MPSS drivers build tree (not repo source tree) +# + +ARCH := $(or $(ARCH), $(shell cat $(CURDIR)/../.arch 2>/dev/null)) + +ifeq ($(DRIVERDIR),) +ifeq ($(shell /usr/bin/test -d ../source-root/$(ARCH)-hybrid && echo Y),Y) +DRIVERDIR = $(PWD)/../source-root/$(ARCH)-hybrid +KERNELDIR ?= $(DRIVERDIR)/card/kernel +else ifeq ($(shell /usr/bin/test -d ../source-root/$(ARCH)-internal && echo Y),Y) +DRIVERDIR = $(PWD)/../source-root/$(ARCH)-internal +KERNELDIR ?= $(DRIVERDIR)/card/kernel +endif +endif +KERNELDIR ?= ../../miclinux + +SCIF_SYM = $(DRIVERDIR)/card/driver/Module.symvers +SCIF_LIB = $(DRIVERDIR)/host/scif_lib +SCIF_HEADER = $(DRIVERDIR)/include + +EXTRA_CFLAGS += $(KERNWARNFLAGS) +ifeq ($(ARCH),l1om) + EXTRA_CFLAGS += -DMIC_IS_L1OM +else ifeq ($(ARCH),k1om) + EXTRA_CFLAGS += -DMIC_IS_K1OM +else + $(error $$(ARCH) must be l1om or k1om) +endif +EXTRA_CFLAGS += -DINTERNAL_REG=1 -Wall +EXTRA_CFLAGS += $(SPOOKY_MIC_CFLAGS) + +CROSS_COMPILE = x86_64-$(ARCH)-linux- + +ifeq ($(shell which $(CROSS_COMPILE)gcc 2>/dev/null),) + ifeq ($(shell which ../cross/bin/$(CROSS_COMPILE)gcc 2>/dev/null),) + $(error $$(PATH) must include $(CROSS_COMPILE)gcc) + else + CROSS_COMPILE = $(PWD)/../cross/bin/x86_64-$(ARCH)-linux- + endif +endif + +default: modules tests + +modules: + @ echo "$(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) modules" + @ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) \ + V=0 DEBUG=$(DEBUG) \ + SPOOKY_MIC_CFLAGS=$(SPOOKY_MIC_CFLAGS) \ + CROSS_COMPILE=$(CROSS_COMPILE) \ + KBUILD_EXTRA_SYMBOLS=$(SCIF_SYM) \ + modules + +install: modules_install + +modules_install: + @ echo "$(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) install" + @ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) \ + V=0 DEBUG=$(DEBUG) \ + CROSS_COMPILE=$(CROSS_COMPILE) \ + SPOOKY_MIC_CFLAGS=$(SPOOKY_MIC_CFLAGS) \ + KBUILD_EXTRA_SYMBOLS=$(SCIF_SYM) \ + INSTALL_MOD_PATH=$(DESTDIR) \ + modules_install + +# +# Test programs, expects that compilers and SCIF libraries are present. 
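+# (Editor's note, example invocation assuming the cross toolchain and the
+# SCIF headers/libraries are installed: "make ARCH=k1om tests" builds the
+# host tools, host tests and card-side test binaries listed below.)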
+# +host-tools = edecode gdecode +host-tests = cp mc ttl tmp cutl proc ukill fan smc fsc pm trbo ptrig cp32 p-in-host p-out-host +card-tests = p-in-card p-out-card suid load + +tests: $(host-tools) $(host-tests) $(card-tests) + +cp: cp.c micras_api.h + @ echo gcc -O2 cp.c -o cp -lscif + @ gcc -O2 cp.c -o cp $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +cp32: cp32.c micras_api.h + @ echo gcc -O2 cp32.c -o cp32 -lscif + @ gcc -O2 cp32.c -o cp32 $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +mc: mc.c micmca_api.h + @ echo gcc -O2 mc.c -o mc -lscif + @ gcc -O2 mc.c -o mc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +cutl: cutl.c micras_api.h + @ echo gcc -O2 cutl.c -o cutl -lscif -lncurses + @ gcc -O2 cutl.c -o cutl $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif -lncurses + +cutl2: cutl2.c micras_api.h + @ echo gcc -O2 cutl2.c -o cutl2 -lscif -lncurses + @ gcc -O2 cutl2.c -o cutl2 $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif -lncurses + +proc: proc.c micras_api.h + @ echo gcc -O2 proc.c -o proc -lscif -lncurses + @ gcc -O2 proc.c -o proc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif -lncurses + +trbo: trbo.c micras_api.h + @ echo gcc -O2 trbo.c -o trbo -lscif + @ gcc -O2 trbo.c -o trbo $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +fan: fan.c micras_api.h + @ echo gcc -O2 fan.c -o fan -lscif + @ gcc -O2 fan.c -o fan $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +smc: smc.c micras_api.h + @ echo gcc -O2 smc.c -o smc -lscif + @ gcc -O2 smc.c -o smc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +pm_tst: pm_tst.c micras_api.h + @ echo gcc -O2 pm_tst.c -o pm_tst -lscif + @ gcc -O2 pm_tst.c -o pm_tst $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +fsc: fsc.c micras_api.h + @ echo gcc -O2 fsc.c -o fsc -lscif + @ gcc -O2 fsc.c -o fsc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +ptrig: ptrig.c micras_api.h + @ echo gcc -O2 ptrig.c -o ptrig -lscif + @ gcc -O2 ptrig.c -o ptrig $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +pm: pm.c micras_api.h micpm_api.h + @ echo gcc -O2 pm.c -o pm -lscif + @ gcc -O2 pm.c -o pm $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +tmp: tmp.c micras_api.h + @ echo gcc -O2 tmp.c -o tmp -lscif + @ gcc -O2 tmp.c -o tmp $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +ttl: ttl.c micras_api.h micpm_api.h + @ echo gcc -O2 ttl.c -o ttl -lscif + @ gcc -O2 ttl.c -o ttl $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +ukill: ukill.c micras_api.h + @ echo gcc -O2 ukill.c -o ukill -lscif + @ gcc -O2 ukill.c -o ukill $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif + +edecode: edecode.c + @ echo gcc -O2 edecode.c -o edecode + @ gcc -O2 -Wall edecode.c -o edecode + +gdecode: gdecode.c + @ echo gcc -O2 gdecode.c -o gdecode + @ gcc -O2 -Wall gdecode.c -o gdecode + +p-in-host: p-in.c Makefile + @ echo gcc -O2 p-in.c -o p-in-host + @ gcc -O2 p-in.c -o p-in-host -DIOK=16 + +p-out-host: p-out.c Makefile + @ echo gcc -O2 p-out.c -o p-out-host + @ gcc -O2 p-out.c -o p-out-host -DIOK=16 -DTXG=64 + +suid: suid.c + @ echo cross-gcc -O2 suid.c -o suid + @ $(CROSS_COMPILE)gcc -O2 suid.c -o suid + +p-in-card: p-in.c Makefile + @ echo cross-gcc -O2 p-in.c -o p-in-card + @ $(CROSS_COMPILE)gcc -O2 p-in.c -o p-in-card -DIOK=64 + +p-out-card: p-out.c Makefile + @ echo cross-gcc -O2 p-out.c -o p-out-card + @ $(CROSS_COMPILE)gcc -O2 p-out.c -o p-out-card -DIOK=64 -DTXG=16 + +load: load.c + @ echo cross-gcc load.c -o load -pthread -lpthread + @ $(CROSS_COMPILE)gcc 
load.c -o load $(EXTRA_CFLAGS) -pthread -lpthread + +cpptest: + @ echo Dumping compiler defines + @ echo > nil.c + @ $(CROSS_COMPILE)gcc -E -dM nil.c | sort + @ rm nil.c + +endif + +clean: + @ echo " Cleaning .." + @ rm -fr *.o *~ core .*.sw? .depend .*.cmd *.ko *.mod.c \ + .tmp_versions modules.order Module.symvers + @ rm -f $(host-tools) $(host-tests) $(card-tests) + diff --git a/ras/micmca_api.h b/ras/micmca_api.h new file mode 100644 index 0000000..8008ad0 --- /dev/null +++ b/ras/micmca_api.h @@ -0,0 +1,135 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Definition of the public MC interface. + * Access to MC event features provided through SCIF only. + */ + +#ifndef _MICMCA_API_H_ +#define _MICMCA_API_H_ 1 + +#ifdef __cplusplus +extern "C" { /* C++ guard */ +#endif + +/* + * Configuration manifests + */ + +#pragma pack(push, 4) /* Windows requirement */ + + +/* + * Machine check info is reported on this port. Only one consumer can + * (and must) connect in order to be notified about MC events. + */ + +#define MR_MCE_PORT SCIF_RAS_PORT_1 + + +/* + * MC events are provide in raw form, i.e. as close to the + * contents of MCA register banks as possible. It is not + * the responsibility of the MCA event handler to perform + * analysis and interpretation of these registers, beyond + * determining whether the event was deadly to the uOS. + * + * Any data or context corruption _IS_ deadly by definition! 
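+ *
+ * Illustrative example (editor's note): a fatal SBOX event that was also
+ * recorded in EEPROM would arrive with org = MC_ORG_SBOX, id = 0 and
+ * flags = MC_FLG_FATAL | MC_FLG_LOG, using the encodings defined below.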
+ * + * Source identifiers: + * org id + * 0 Bank 0 CPU #, core event, range 0..CPU_MAX + * 1 Bank 1 CPU #, core event, range 0..CPU_MAX + * 2 Bank 2 CPU #, core event, range 0..CPU_MAX + * 3 DBOX #, uncore event, range 0..DBOX_MAX + * 4 SBOX, uncore event, range 0 + * 5 GBOX #, uncore event, range 0..GBOX_MAX + * 6 TBOX #, uncore event, range 0..TBOX_MAX + * + * Report flags bits (when set) representing: + * [31:5] Unused (and reserved) + * [4] Filter event, uOS side disabled this event + * [3] Status event, no failure (just MCA bank dump) + * [2] Injected or artificially generated event + * [1] This event has been recorded in EEPROM + * [0] Fatal, the uOS is toast (card needs reset) + * + * MCA bank register sizes are not the same on all banks: + * + * CTL STATUS ADDR MISC Notes + * CPU 0: 32 64 - - A,M not implemented, always 0 + * CPU 1: 32 64 64 32 + * CPU 2: 32 64 64 - M not implemented, always 0 + * DBOX: 32 64 64 - M not implemented, always 0 + * SBOX: 32 64 64 64 + * GBOX: 64 64 64 32 + * TBOX: 64 64 32 - M not implemented, not there + */ + +#define MC_ORG_BNK0 0 +#define MC_ORG_BNK1 1 +#define MC_ORG_BNK2 2 +#define MC_ORG_DBOX 3 +#define MC_ORG_SBOX 4 +#define MC_ORG_GBOX 5 +#define MC_ORG_TBOX 6 + +#define MC_FLG_FATAL (1 << 0) +#define MC_FLG_LOG (1 << 1) +#define MC_FLG_FALSE (1 << 2) +#define MC_FLG_STATUS (1 << 3) +#define MC_FLG_FILTER (1 << 4) + +typedef struct mce_info { + uint16_t org; /* Source of event */ + uint16_t id; /* Identifier of source */ + uint16_t flags; /* Report flags */ + uint16_t pid; /* Alternate source ID */ + uint64_t stamp; /* Time stamp of event */ + uint64_t ctl; /* MCA bank register 'CTL' */ + uint64_t status; /* MCA bank register 'STATUS' */ + uint64_t addr; /* MCA bank register 'ADDR' */ + uint64_t misc; /* MCA bank register 'MISC' */ +} MceInfo; + + +#pragma pack(pop) /* Restore to entry conditions */ + +#ifdef __cplusplus +} /* C++ guard */ +#endif + +#endif /* Recursion block */ diff --git a/ras/micpm_api.h b/ras/micpm_api.h new file mode 100644 index 0000000..d86ceeb --- /dev/null +++ b/ras/micpm_api.h @@ -0,0 +1,307 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. 
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Definition of the PM interface to the RAS module. + * + * Throttle event interface is similar to the MC interface. + * If a connection is made to MR_TTL_PORT then event records + * will be sent to the host. Events are sent non-blocking, + * so if the SCIF buffer runs full, events are dropped until + * the block disappear (or the session is closed). + * + * Queries are technically implemented as an extension to the + * MT interface, and thus are accessible from the host. + * Except for the risk of conflicting commands written to the + * two power limit registers, there are no side effects from + * host side access via SCIF. + * + * Currently there are no plans to expose this in SysFs nodes. + * These routines are just wrappers for read/write access to + * SMC registers. No precious IP here. + */ + +#ifndef _MICPM_API_H_ +#define _MICPM_API_H_ 1 + +#ifdef __cplusplus +extern "C" { /* C++ guard */ +#endif + + +/* +** +** Configuration manifests +** +*/ + +#pragma pack(push, 4) /* Weird Windos requirement */ + + +/* + * Throttle events are reported on this port. Only one consumer can + * connect in order to be notified about PM throttling events. + */ + +#define MR_TTL_PORT SCIF_RAS_PORT_2 + + +/* + * Throttle events are provided in raw form, i.e. with as + * little processing on the card side as possible. + * For nicer throttle state display, use MT command MR_REQ_TTL. + * + * To compensate for the chance of lost events, the full + * throttle state is transfered in one byte on every message: + * + * Bit# Content + * 0 Power trottle state changed + * 1 New/Current power throttle state + * 2 Thermal throttle state changed + * 3 New/Current thermal throttle state + * 4 Power alert state changed + * 5 New/Current power alert state + * + * By definition, when power and thermal throttle are in effect + * the KnC is forced to run at reduced speed (600 MHz or so) and + * with lower operating voltages, i.e. software is not in control. + * During power alerts the KnC is consuming more power than PLim1 + * and the PM module can reduce speed and/or voltages to reduce + * power consumption. If power consumption goes beyond PLim0, the + * hardware (SMC really) will start real power throttles. + * In effect time spent in power throttle, will also be counted + * as being in the power alert state. See MT request MR_REQ_TTL. + */ + +#define PM_PWR_TTL_CHG (1 << 0) /* Power throttle change */ +#define PM_PWR_TTL (1 << 1) /* Power Trottle state */ +#define PM_TRM_TTL_CHG (1 << 2) /* Thermal throttle change */ +#define PM_TRM_TTL (1 << 3) /* Thermal Trottle state */ +#define PM_ALRT_TTL_CHG (1 << 4) /* Power alert change */ +#define PM_ALRT_TTL (1 << 5) /* Power alert state */ + +typedef struct ttl_info { + uint8_t upd; /* Throttle state update */ + uint8_t die; /* Die temperature (as per SBOX) */ +} TtlInfo; + + + +/* + * PM specific MT opcodes + * Leave one empty slot in callout table between + * this and the official MT API entries. 
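+ * (Editor's note: hence the first PM opcode below, PM_REQ_PL0, is
+ * MR_REQ_MAX + 2, and the slot MR_REQ_MAX + 1 stays unused.)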
+ */ + +#define PM_REQ_PL0 (MR_REQ_MAX + 2) /* Get power limit 0 */ +#define PM_SET_PL0 (MR_REQ_MAX + 3) /* Set power limit 0 */ +#define PM_REQ_PL1 (MR_REQ_MAX + 4) /* Get power limit 1 */ +#define PM_SET_PL1 (MR_REQ_MAX + 5) /* Set power limit 1 */ +#define PM_REQ_PAVG (MR_REQ_MAX + 6) /* Get average power */ +#define PM_REQ_PTTL (MR_REQ_MAX + 7) /* Get power throttle */ +#define PM_REQ_VOLT (MR_REQ_MAX + 8) /* Get voltage */ +#define PM_REQ_TEMP (MR_REQ_MAX + 9) /* Get temperatures */ +#define PM_REQ_TACH (MR_REQ_MAX + 10) /* Get fan tachometer */ +#define PM_REQ_TTTL (MR_REQ_MAX + 11) /* Get thermal throttle */ +#define PM_REQ_FTTL (MR_REQ_MAX + 12) /* Get force throttle */ +#define PM_SET_FTTL (MR_REQ_MAX + 13) /* Set force throttle */ +#define PM_REQ_MAX PM_SET_FTTL /* Last PM command */ + + +/* +** +** Response container structures below. +** +*/ + + +/* + * Get power limit + * REQ_PL{0/1} notes: + * - Only power limit 0 have a guard band defined. + */ +typedef struct pm_rsp_plim { + uint32_t pwr_lim; /* Power limit, in Watt */ + uint32_t time_win; /* Time Window, in mSec */ + uint32_t guard_band; /* Guard band, in Watt */ +} PmRspPlim; + + +/* + * Set power limit + */ +typedef struct pm_cmd_plim { + uint32_t pwr_lim; /* Power limit, in Watt */ + uint32_t time_win; /* Time Window, in mSec */ +} PmCmdPlim; + + +/* + * Get average power + * REQ_PAVG notes: + * - Both values are subject to availability in the SMC. + * The top two status bit of each SMC register is provided + * separately (and stripped from the read value). Decode as + * 00 Data OK + * 01 Lower threshold reached + * 10 Upper threshold reached + * 11 Data unavailable + * It is unclear if data is good if outside thresholds. + */ +typedef struct pm_rsp_pavg { + uint8_t stat_0; /* Status bits for window 0 */ + uint8_t stat_1; /* Status bits for window 1 */ + uint32_t pwr_0; /* Average over window 0, in Watt */ + uint32_t pwr_1; /* Average over window 1, in Watt */ +} PmRspPavg; + + +/* + * Get Power throttle status + * REQ_PTTL notes: + * - Duration value is subject to availability in the SMC. + * The top two status bit of this SMC register is provided + * separately (and stripped from the read value). Decode as + * 00 Data OK + * 01 Reserved + * 10 Reserved + * 11 Data unavailable + */ +typedef struct pm_rsp_pttl { + uint8_t pwr_ttl; /* Power throttle asserted */ + uint8_t stat_dur; /* Status bits duration */ + uint32_t duration; /* Power throttle duration, in mSec */ +} PmRspPttl; + + +/* + * Get voltages + * REQ_VOLT notes: + * - VR values are subject to availability in the SMC. + * The top two status bit of each SMC register is provided + * separately (and stripped from the read value). Decode as + * 00 Data OK + * 01 Lower threshold reached + * 10 Upper threshold reached + * 11 Data unavailable + * It is unclear if data is good if outside thresholds. + */ +typedef struct pm_rsp_volt { + uint8_t stat_vccp; /* Status bits for Vddc */ + uint8_t stat_vddg; /* Status bits for Vddg */ + uint8_t stat_vddq; /* Status bits for Vddq */ + uint32_t vccp; /* Vccp, in mV */ + uint32_t vddg; /* Vddg, in mV */ + uint32_t vddq; /* Vddq, in mV */ +} PmRspVolt; + + +/* + * Get temperatures + * REQ_TEMP notes: + * - These values are subject to availability in the SMC. + * The top two status bit of each SMC register is provided + * separately (and stripped from the read value). 
Decode as + * 00 Data OK + * 01 Lower threshold reached + * 10 Upper threshold reached + * 11 Data unavailable + * It is unclear if data is good if outside thresholds. + */ +typedef struct pm_rsp_temp { + uint8_t stat_cpu; /* Status bits for Tcpu */ + uint8_t stat_vccp; /* Status bits for Tvddc */ + uint8_t stat_vddg; /* Status bits for Tvddg */ + uint8_t stat_vddq; /* Status bits for Tvddq */ + uint32_t cpu; /* CPU temp, in C */ + uint32_t vccp; /* Vccp VR temp, in C */ + uint32_t vddg; /* Vddg VR temp, in C */ + uint32_t vddq; /* Vddq VR temp, in C */ +} PmRspTemp; + + +/* + * Get fan tachometer + * REQ_TACH notes: + * - These values are subject to availability in the SMC. + * The top two status bit of each SMC register is provided + * separately (and stripped from the read value). Decode as + * 00 Data OK + * 01 Lower threshold reached (tach only) + * 10 Reserved + * 11 Data unavailable + * It is unclear if data is good if outside thresholds. + */ +typedef struct pm_rsp_tach { + uint8_t stat_pwm; /* Status bits for PWM */ + uint8_t stat_tach; /* Status bits for TACH */ + uint32_t fan_pwm; /* Fan power, in % */ + uint32_t fan_tach; /* Fan speed, in RPM */ +} PmRspTach; + + +/* + * Get thermal throttle status + * REQ_THRM notes: + * - Duration value is subject to availability in the SMC. + * The top two status bit of this SMC register is provided + * separately (and stripped from the read value). Decode as + * 00 Data OK + * 01 Reserved + * 10 Reserved + * 11 Data unavailable + */ +typedef struct pm_rsp_tttl { + uint8_t thrm_ttl; /* Power throttle asserted */ + uint8_t stat_dur; /* Status bits duration */ + uint32_t duration; /* Thermal throttle duration, in mSec */ +} PmRspTttl; + + +/* + * Get/Set force trottle control + */ +typedef struct pm_rsp_fttl { + uint8_t forced; /* Forced power throttle asserted */ +} PmRspFttl; + + +#pragma pack(pop) /* Restore to sane conditions */ + +#ifdef __cplusplus +} /* C++ guard */ +#endif + +#endif /* Recursion block */ diff --git a/ras/micras.h b/ras/micras.h new file mode 100644 index 0000000..faa3e91 --- /dev/null +++ b/ras/micras.h @@ -0,0 +1,536 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. 
+ * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS module common internal declarations + * + * Configuration flags, constants and function prototypes + * for the RAS sysfs, MT and MC module. + */ + +#ifndef _MICRAS_H_ +#define _MICRAS_H_ 1 + + +/* + * Public APIs first. + * Must be self-contained and independent of local tunables. + */ + +#include "micras_api.h" +#include "micmca_api.h" +#include "micpm_api.h" + + +/* + * Local configurables & tunables + */ + +#define USE_PM 1 /* Support power management */ + +#define RAS_HALT 1 /* Panic on uncorrectable MCAs */ + +#define I2C_SLOW 1 /* Default to lowest speed on I2C */ + +#define USE_FSC 1 /* Allow using FSC MGBR/MGBSR protocol */ +#define USE_SVID 0 /* Allow using SVID for VR info */ +#define USE_SMC 1 /* Prefer SMC over SBOX (telemetry) */ + +#define MT_TIMER 1 /* Enable periodic wakeup */ +#define MT_PERIOD 999 /* Period sleep (mS) */ + +#define MCU_NMI 1 /* Use NMI in SBOX redirection table */ + +#define EE_VERIFY 0 /* Verify all EEPROM writes */ +#define EE_PROC 1 /* Enable access to EEPROM from /proc/elog */ +#define EE_PROC_NEW 0 /* Only display events between head & tail */ +#define EE_INJECT 0 /* Enable writes to EEPROM via /proc/elog */ + +#define BEAM_TEST 0 /* Neuter MC handling for beam test */ + +#define MT_VERBOSE 0 /* Track MT activity in kernel log */ +#define MC_VERBOSE 0 /* Track MC activity in kernel log */ +#define PM_VERBOSE 0 /* Track PM activity in kernel log */ + +#define GBOX_WORKING 0 /* Set to one when GBOX writes are stable */ + +#define WA_4845465 0 /* Use HSD #4845465 workaround */ + +#define ADD_DIE_TEMP 1 /* Embed die temperature in event reports */ + +#define NOT_YET 0 /* 'Hide' code that's not currently in use */ + + +/* + * Useful macros + *TBD: Cast everything to 64 bit (ULL)? + * For now all is 32 bit (U) + */ + +#define GET_BITS(l,r,v) (((v) >> (r)) & ((1U << ((l) - (r) +1)) -1)) +#define PUT_BITS(l,r,v) (((v) & ((1U << ((l) - (r) +1)) -1)) << (r)) + +#define GET_BIT(n,v) GET_BITS((n), (n), (v)) +#define PUT_BIT(n,v) PUT_BITS((n), (n), (v)) + + +/* + * Init/Exit functions + */ + +extern void mr_mt_init(void); +extern void mr_mt_exit(void); +extern void mr_mt_card_init(void); +extern void mr_mt_card_exit(void); + + +/* + * Command line options (exported from generic MCE handler) + */ + +extern int mce_disabled; + + +/* + * MT opcode/function table. + * Resides in micras_main() and gates access though sysctls and SCIF. 
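+ * (Editor's note: each fnc_tab entry below binds one MT opcode (cmd) to a
+ * handler (fnc), with flags marking the call as simple and/or privileged.)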
+ */ + +struct fnc_tab { + uint16_t cmd; + uint8_t simple; + uint8_t privileged; + int (*fnc)(void *); +}; + +extern int micras_priv; +extern int micras_mt_call(uint16_t, void *); + + +/* + * MT get functions + * Spread over micras_{common,knf,knc}.c + */ +extern int mr_get_hwinf(void *); +extern int mr_get_vers(void *); +extern int mr_get_pver(void *); +extern int mr_get_freq(void *); +extern int mr_get_volt(void *); +extern int mr_get_power(void *); +extern int mr_get_plim(void *); +extern int mr_get_clst(void *); +extern int mr_get_gddr(void *); +extern int mr_get_gfreq(void *); +extern int mr_get_gvolt(void *); +extern int mr_get_temp(void *); +extern int mr_get_fan(void *); +extern int mr_get_ecc(void *); +extern int mr_get_trc(void *); +extern int mr_get_trbo(void *); +extern int mr_get_oclk(void *); +extern int mr_get_cutl(void *); +extern int mr_get_mem(void *); +extern int mr_get_os(void *); +extern int mr_get_proc(void *); +extern int mr_get_pmcfg(void *); + +/* + * MT set functions + * Spread over micras_{common,knf,knc}.c + */ +extern int mr_set_freq(void *); +extern int mr_set_volt(void *); +extern int mr_set_plim(void *); +extern int mr_set_gfreq(void *); +extern int mr_set_gvolt(void *); +extern int mr_set_fan(void *); +extern int mr_set_trc(void *); +extern int mr_set_trbo(void *); +extern int mr_set_oclk(void *); + + +/* + * MT cmd functions + */ +extern int mr_cmd_pkill(void *); +extern int mr_cmd_ukill(void *); + + +#if defined(CONFIG_ML1OM) && USE_FSC +/* + * MT FSC access functions + * KnF specific, located in micras_knf.c + */ +extern int mr_get_fsc(void *); +extern int mr_set_fsc(void *); +#endif + +#if defined(CONFIG_MK1OM) +/* + * MT SMC access functions + * KnC specific, located in micras_knc.c + */ +extern int mr_get_smc(void *); +extern int mr_get_led(void *); +extern int mr_get_prochot(void *); +extern int mr_get_pwralt(void *); +extern int mr_get_perst(void *); +extern int mr_get_ttl(void *); + +extern int mr_set_smc(void *); +extern int mr_set_led(void *); +extern int mr_set_prochot(void *); +extern int mr_set_pwralt(void *); +extern int mr_set_perst(void *); +#endif + + +#if defined(CONFIG_MK1OM) && USE_PM +/* + * PM get functions + */ +extern int pm_get_pl0(void *); +extern int pm_get_pl1(void *); +extern int pm_get_pavg(void *); +extern int pm_get_pttl(void *); +extern int pm_get_volt(void *); +extern int pm_get_temp(void *); +extern int pm_get_tach(void *); +extern int pm_get_tttl(void *); +extern int pm_get_fttl(void *); + +/* + * PM set functions + */ +extern int pm_set_pl0(void *); +extern int pm_set_pl1(void *); +extern int pm_set_fttl(void *); +#endif + + +/* + * MC & TTL event distribution functions + * Spread over micras_{main,elog,core}.c + */ + +#ifdef MR_MCE_PORT +extern int micras_mc_send(struct mce_info *, int); +extern void micras_mc_ipmi(struct mce_info *, int); +extern void micras_mc_log(struct mce_info *); +extern uint32_t micras_mc_filter(struct mce_info *, uint64_t, int); +#endif +#ifdef MR_TTL_PORT +extern void micras_ttl_send(struct ttl_info *); +#endif + + +/* + * BOX constants (card variations). 
+ */ + +#ifdef CONFIG_ML1OM +#define DBOX_NUM 1 +#define GBOX_NUM 4 +#endif + +#ifdef CONFIG_MK1OM +#define DBOX_NUM 2 +#define GBOX_NUM 8 /* Max count, SKU dependent */ +#define TBOX_NUM 8 /* Max count, SKU dependent */ +#endif + +#ifndef COMMON_MMIO_BOX_SIZE +#define COMMON_MMIO_BOX_SIZE (1<<16) +#endif + + +/* + * BOX utility functions + * Most located in micras_main.c + */ + +extern char *mr_sku(void); +extern int mr_mch(void); +extern int mr_txs(void); + +extern uint8_t *micras_sbox; +extern uint8_t *micras_dbox[DBOX_NUM]; +extern uint8_t *micras_gbox[GBOX_NUM]; +#ifdef CONFIG_MK1OM +extern uint8_t *micras_tbox[TBOX_NUM]; +#endif + +extern uint8_t *mr_sbox_base(int); +extern uint32_t mr_sbox_rl(int, uint32_t); +extern void mr_sbox_wl(int, uint32_t, uint32_t); +extern uint64_t mr_sbox_rq(int, uint32_t); +extern void mr_sbox_wq(int, uint32_t, uint64_t); + +extern uint8_t *mr_dbox_base(int); +extern uint32_t mr_dbox_rl(int, uint32_t); +extern void mr_dbox_wl(int, uint32_t, uint32_t); +extern uint64_t mr_dbox_rq(int, uint32_t); +extern void mr_dbox_wq(int, uint32_t, uint64_t); + +extern uint8_t *mr_gbox_base(int); +extern uint32_t mr_gbox_rl(int, uint32_t); +extern void mr_gbox_wl(int, uint32_t, uint32_t); +extern uint64_t mr_gbox_rq(int, uint32_t); +extern void mr_gbox_wq(int, uint32_t, uint64_t); + +#ifdef CONFIG_MK1OM +extern uint8_t *mr_tbox_base(int); +extern uint32_t mr_tbox_rl(int, uint32_t); +extern void mr_tbox_wl(int, uint32_t, uint32_t); +extern uint64_t mr_tbox_rq(int, uint32_t); +extern void mr_tbox_wq(int, uint32_t, uint64_t); +#endif + + +/* + * Un-core MCA register offsets + * Some #defines stolen from FreeBSD uOS. + * + *TBD: check again when we get real register include files + */ + +#define SBOX_MCX_CTL_LO 0x00003090 +#define SBOX_MCX_STATUS_LO 0x00003098 +#define SBOX_MCX_STATUS_HI 0x0000309C +#define SBOX_MCX_ADDR_LO 0x000030A0 +#define SBOX_MCX_ADDR_HI 0x000030A4 +#define SBOX_MCX_MISC 0x000030A8 +#define SBOX_MCX_MISC2 0x000030AC +#define SBOX_MCA_INT_STAT 0x0000AB00 +#define SBOX_MCA_INT_EN 0x0000AB04 +#define SBOX_COMPONENT_ID 0x00004134 + +#define DBOX_MC2_CTL 0x00000340 +#define DBOX_MC2_STATUS 0x00000348 +#define DBOX_MC2_ADDR 0x00000350 + +#define GBOX_FBOX_MCA_CTL_LO 0x0000005C +#define GBOX_FBOX_MCA_CTL_HI 0x00000060 +#define GBOX_FBOX_MCA_STATUS_LO 0x00000064 +#define GBOX_FBOX_MCA_STATUS_HI 0x00000068 +#define GBOX_FBOX_MCA_ADDR_LO 0x0000006C +#define GBOX_FBOX_MCA_ADDR_HI 0x00000070 +#define GBOX_FBOX_MCA_MISC 0x00000074 + +#ifdef CONFIG_MK1OM +#define TXS_MCX_CONTROL 0x00003700 +#define TXS_MCX_STATUS 0x00003740 +#define TXS_MCX_ADDRESS 0x00003780 +#endif + + +/* + * Thermal register offsets + */ + +#if defined(CONFIG_MK1OM) && WA_4845465 +#ifndef SBOX_MICROCONTROLLER_FAN_STATUS +#define SBOX_MICROCONTROLLER_FAN_STATUS 0x1020 +#endif +#endif +#if defined(CONFIG_MK1OM) && (WA_4845465 || ADD_DIE_TEMP || USE_PM) +#ifndef SBOX_THERMAL_STATUS_2 +#define SBOX_THERMAL_STATUS_2 0x1080 +#endif +#endif + + +/* + * SMP utilities + * Located in micras_main.c + */ + +extern uint32_t rd_cr4_on_cpu(int); +extern void set_in_cr4_on_cpu(int, uint32_t); +extern void clear_in_cr4_on_cpu(int, uint32_t); +extern uint64_t rdtsc(void); + + +/* + * General EEPROM and POST card UART access + * Located in micras_elog.c + */ + +#define EE_BUF_COUNT 100 +#define EE_BUF_LINELEN 256 +extern char ee_buf[]; +extern atomic_t ee_msg; +extern atomic_t ee_seen; + +extern char * ee_fmt(char *, va_list); +extern int ee_printk(char *, ...); +extern int ee_print(char *, ...); +#ifdef 
CONFIG_MK1OM
+extern void ee_list(void);
+extern void ee_wipe(void);
+#endif
+extern int ee_init(void);
+extern int ee_exit(void);
+
+extern void myDELAY(uint64_t);
+
+
+/*
+ * SMC access API
+ * Provided by the kernel
+ */
+
+extern int gmbus_i2c_read(uint8_t, uint8_t, uint8_t, uint8_t *, uint16_t);
+extern int gmbus_i2c_write(uint8_t, uint8_t, uint8_t, uint8_t *, uint16_t);
+
+
+/*
+ * RAS core MCA handling
+ * Located in micras_core.c
+ */
+
+extern uint8_t xlat_cpu[NR_CPUS];
+extern void mcc_sync(void);
+extern int mcc_init(void);
+extern int mcc_exit(void);
+extern void mcc_flt_parm(uint8_t *);
+
+
+/*
+ * RAS un-core MCA handling
+ * Located in micras_uncore.c
+ */
+
+extern void box_reset(int);
+extern int mcu_init(void);
+extern int mcu_exit(void);
+
+
+#if defined(CONFIG_MK1OM) && USE_PM
+/*
+ * RAS PM handling
+ * Located in micras_pm.c
+ *
+ * Power management registration exchange records:
+ * The RAS module populates a 'params' record and passes it to
+ * the PM module through the micpm_ras_register() function.
+ * In return the PM module populates the passed 'callbacks' record.
+ * The PM module is responsible for populating the lists of
+ * supported core frequencies and core voltages. In contrast to
+ * KnF, where the lists reflect the hardware capabilities, these
+ * reflect the actual frequencies and voltages that the core-freq
+ * module can use to lower power consumption.
+ */
+
+struct micpm_params {
+ uint32_t * freq_lst; /* Core frequency list */
+ uint32_t * freq_len; /* Core freq count */
+ uint32_t freq_siz; /* Space in core freq list */
+ uint32_t * volt_lst; /* Core voltage list */
+ uint32_t * volt_len; /* Core voltage count */
+ uint32_t volt_siz; /* Space in core volt list */
+ int (* mt_call)(uint16_t, void *); /* Access MT function */
+ void (* mt_ttl)(int, int); /* Throttle notifier */
+};
+
+struct micpm_callbacks {
+ int (*micpm_get_turbo)(void); /* Get PM turbo setting */
+ void (*micpm_set_turbo)(int); /* Notify PM of new turbo setting */
+ void (*micpm_vf_refresh)(void); /* Refresh core V/F lists */
+ int (*micpm_get_pmcfg)(void); /* Get PM operating mode */
+};
+
+extern struct micpm_params pm_reg;
+extern struct micpm_callbacks pm_cb;
+
+
+/*
+ * Args for mt_ttl() function
+ */
+
+#define TTL_OFF 0
+#define TTL_ON 1
+
+#define TTL_POWER 0
+#define TTL_THERMAL 1
+
+
+/*
+ * Bit locations for micpm_get_turbo() and micpm_set_turbo()
+ */
+
+#define MR_PM_MODE (1 << 0) /* Turbo mode */
+#define MR_PM_STATE (1 << 1) /* Current turbo state */
+#define MR_PM_AVAIL (1 << 2) /* Turbo mode available */
+
+
+/*
+ * Bit positions for the different features turned on/off
+ * in the uOS PM configuration, for micpm_get_pmcfg().
+ */
+
+#define PMCFG_PSTATES_BIT 0
+#define PMCFG_COREC6_BIT 1
+#define PMCFG_PC3_BIT 2
+#define PMCFG_PC6_BIT 3
+
+
+/*
+ * Register/Unregister functions in the micpm driver that RAS calls
+ * during module init/exit. Pointers to the exchanged data
+ * structures are passed during registration.
+ * The RAS module guarantees that the pointers are valid until
+ * the unregister function is called. That way the PM module can
+ * modify the core frequency/voltage lists if they get changed.
+ * The callbacks must always either be a valid function pointer
+ * or a null pointer.
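+ *
+ * Illustrative sketch only (the throttle notifier name is hypothetical,
+ * not part of this source): module init code would wire the two
+ * records up roughly as
+ *
+ *   pm_reg.mt_call = micras_mt_call;
+ *   pm_reg.mt_ttl  = my_ttl_notify;
+ *   micpm_ras_register(&pm_cb, &pm_reg);
+ *
+ * after which pm_cb holds whatever callbacks the PM module filled in,
+ * each either a valid function pointer or NULL as stated above; the
+ * int return of micpm_ras_register() should be checked for errors.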
+ */ + +extern int micpm_ras_register(struct micpm_callbacks *, struct micpm_params *); +extern void micpm_ras_unregister(void); + +extern int mr_pm_ttl(struct mr_rsp_ttl *); +extern int pm_init(void); +extern void pm_exit(void); +#endif + + +/* + * Debug tools + */ + +extern void dmp_hex(void *, int, const char *, ...); + +#endif /* Recursion block */ diff --git a/ras/micras_api.h b/ras/micras_api.h new file mode 100644 index 0000000..7456fb2 --- /dev/null +++ b/ras/micras_api.h @@ -0,0 +1,1006 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Definition of the public RAS Monitoring Thread interface. + * Access to RAS features are expected from SCIF and through + * nodes under '/sys/class/micras'. Both interfaces ends up + * in the same code and thus present the exact same data. + * + * Some information that are available elsewhere through standard + * Linux mechanism are included in this API, though things like + * process status (/proc//stat), cpu status (/proc/stat), + * and memory status (/proc/vmstat) are better from the source. + */ + +#ifndef _MICRAS_API_H_ +#define _MICRAS_API_H_ 1 + +#ifdef __cplusplus +extern "C" { /* C++ guard */ +#endif + +/* +** +** Configuration manifests +** +*/ + +#pragma pack(push, 4) /* Windos requirement */ + + +/* + * RAS module version info: M.NP + */ + +#define RAS_MAJOR "1" +#define RAS_MINOR "0" +#define RAS_PATCH " " +#define RAS_VER RAS_MAJOR "." RAS_MINOR RAS_PATCH + + +/* + * RAS services in uOS kernel listens on this port for incoming queries. + * Consumers may establish multiple connections to this port, though no + * guarantee on connection processing order will be given. Transactions + * on a connection will be processed and replied to in order recieved. 
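+ *
+ * Host-side sketch (illustrative only; assumes a SCIF endpoint 'ep'
+ * already connected to MR_MON_PORT, using the scif_send/scif_recv
+ * calls declared in include/scif.h):
+ *
+ *   struct mr_hdr q = { MR_REQ_VERS, 0, 0, 0, 0 }, a;
+ *   struct mr_rsp_vers vers;
+ *
+ *   scif_send(ep, &q, sizeof(q), SCIF_SEND_BLOCK);
+ *   scif_recv(ep, &a, sizeof(a), SCIF_RECV_BLOCK);
+ *   scif_recv(ep, &vers, a.len, SCIF_RECV_BLOCK);
+ *
+ * The reply header has MR_RESP (or MR_ERROR) set in a.cmd and is
+ * followed by a.len bytes of payload, here a MrRspVers.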
+ */ + +#define MR_MON_PORT SCIF_RAS_PORT_0 +#define MR_SCIF_MAX 32 + + +/* + * Some array max sizes. + * These may be replaced by system wide constants + * if they become available in the source tree. + */ + +#define MR_VERS_LEN 120 /* Version string lengths */ +#define MR_GUID_LEN 16 /* Global unique ID length (bytes) */ +#define MR_SENO_LEN 12 /* Serial number length (bytes) */ +#define MR_PVER_LEN 8 /* API version string length */ +#define MR_PTAB_LEN 64 /* PM freq/volt pairs */ +#define MR_DIES_LEN 9 /* Die temperatures */ +#define MR_BRDS_LEN 4 /* Board temp sensors */ +#define MR_GVND_LEN 16 /* GDDR vendor string length */ +#define MR_CORE_LEN 62 /* Max number of CPU cores */ + + +/* +** Transaction header for requests and responses is a fixed size +** record followed by an optional variable length data block. +** +** Fields usage: +** cmd [15] data field is error record +** cmd [14] response to opcode +** cmd [13:0] opcode +** len length of payload +** parm command parameter +** stamp host side cookie, performance monitoring +** spent processing time, performance monitoring +** +** Command codes: +** Codes that directly relate to cores may set the 'parm' field to a +** non-zero value to address one core (base 1) instead of them all. +** +*/ + +typedef struct mr_hdr { + uint16_t cmd; /* Command field */ + uint16_t len; /* Size of data payload */ + uint32_t parm; /* Parameter field */ + uint64_t stamp; /* Time stamp of 'send' (set by host) */ + uint64_t spent; /* Time used on response (rdtsc delta) */ +} MrHdr; + +#define MR_RESP (1 << 14) /* Response bit */ +#define MR_ERROR (1 << 15) /* Error bit */ +#define MR_OP_MASK (MR_RESP - 1) /* Opcode mask */ + +#define MR_REQ_HWINF 1 /* Get hardware info */ +#define MR_REQ_VERS 2 /* Get version strings */ +#define MR_REQ_CFREQ 3 /* Get core frequencies */ +#define MR_SET_CFREQ 4 /* Set core frequency */ +#define MR_REQ_CVOLT 5 /* Get core voltages */ +#define MR_SET_CVOLT 6 /* Set core voltage */ +#define MR_REQ_PWR 7 /* Get power metrics */ +#define MR_REQ_PLIM 8 /* Get power limit */ +#define MR_SET_PLIM 9 /* Set power limit */ +#define MR_REQ_CLST 10 /* Get core list */ +#define MR_ENB_CORE 11 /* Enable core */ +#define MR_DIS_CORE 12 /* Disable core */ +#define MR_REQ_GDDR 13 /* Get GDDR device info */ +#define MR_REQ_GFREQ 14 /* Get GDDR frequencies */ +#define MR_SET_GFREQ 15 /* Set GDDR frequency */ +#define MR_REQ_GVOLT 16 /* Get GDDR voltages */ +#define MR_SET_GVOLT 17 /* Set GDDR voltage */ +#define MR_REQ_TEMP 18 /* Get board temperatures */ +#define MR_REQ_FAN 19 /* Get fan status */ +#define MR_SET_FAN 20 /* Set fan power */ +#define MR_REQ_ECC 21 /* Get ECC mode */ +#define MR_SET_ECC 22 /* Set ECC mode */ +#define MR_REQ_TRC 23 /* Get debug trace level */ +#define MR_SET_TRC 24 /* Set debug trace level */ +#define MR_REQ_TRBO 25 /* Get turbo mode status */ +#define MR_SET_TRBO 26 /* Set turbo mode status */ +#define MR_REQ_OCLK 27 /* Get overclocking status */ +#define MR_SET_OCLK 28 /* Set overclocking status */ +#define MR_REQ_CUTL 29 /* Get core utilization */ +#define MR_REQ_MEM 30 /* Get memory utilization */ +#define MR_REQ_OS 31 /* Get OS status & process list */ +#define MR_REQ_PROC 32 /* Get process details */ +#define MR_REQ_THRD 33 /* Get thread details */ +#define MR_REQ_PVER 34 /* Get API version */ +#define MR_CMD_PKILL 35 /* Kill process */ +#define MR_CMD_UKILL 36 /* Kill processes owned by user */ +#define MR_GET_SMC 37 /* Get SMC register */ +#define MR_SET_SMC 38 /* Write SMC register */ +#define 
MR_REQ_PMCFG 39 /* Get PM config mode */ +#define MR_REQ_LED 40 /* Get LED mode */ +#define MR_SET_LED 41 /* Set LED mode */ +#define MR_REQ_PROCHOT 42 /* Get PROC hot trigger */ +#define MR_SET_PROCHOT 43 /* Set PROC hot trigger */ +#define MR_REQ_GPUHOT 42 /* Get GPU hot trigger */ +#define MR_SET_GPUHOT 43 /* Set GPU hot trigger */ +#define MR_REQ_PWRALT 44 /* Get power alert trigger */ +#define MR_SET_PWRALT 45 /* Set power alert trigger */ +#define MR_REQ_PERST 46 /* Get persistent triggers flag */ +#define MR_SET_PERST 47 /* Set persistent triggers flag */ +#define MR_REQ_TTL 48 /* Get Throttle state */ +#define MR_REQ_MAX 48 /* Max command code */ + + +/* +** +** Transaction error record: +** If an error occurs during the handling of a request, an +** error record is returned, possibly with supplemental info. +** +** Fields usage: +** err code indication error condition +** len size of additional data +** +** For now there is no definition on what supplemental info +** should look like, but the idea is to open for a possibility +** of giving very precise specification on what the error was. +** Consider it a place holder for future use. +** +** Error codes: +** Code 'NOMEM' means that space for response generation was unavailable. +** Code 'NOVAL' is used to indicate that a valid request (i.e. a query +** on something temporarily unavailable, like processor utilization on +** a core in a sleep state) has no valid response. +** +*/ + +typedef struct mr_err { + uint16_t err; /* Error code field */ + uint16_t len; /* Length of additional error info */ +} MrErr; + +#define MR_ERR_INVOP 1 /* Dofus, command/opcode invalid */ +#define MR_ERR_INVLEN 2 /* Dofus, length not valid for opcode */ +#define MR_ERR_INVAUX 3 /* Dofus, parm field not valid for opcode */ +#define MR_ERR_INVDATA 4 /* Dofus, content of data block invalid */ +#define MR_ERR_PERM 5 /* Failure, privileged command */ +#define MR_ERR_NOMEM 6 /* Failure, out of memory */ +#define MR_ERR_SMC 7 /* Failure, SMC communication */ +#define MR_ERR_NOVAL 8 /* Failure, no valid value to report */ +#define MR_ERR_UNSUP 9 /* Failure, not implemented (temporary) */ +#define MR_ERR_RANGE 10 /* Failure, parameter out of range */ +#define MR_ERR_PEND 11 /* Pending, internal use only */ + + +/* +** +** Response container structures below. +** +** Strings are returned in Pascal format (why?), i.e. pre-fixed +** with a 1 byte length field and post-fixed with a 0 byte. +** +*/ + + +/* + * MIC Hardware Info + * REQ_HWINF Notes: + * - no idea how to determine PCI-E slot, it's a host side thing. + * - assume revision is same as model ID in the component ID register + * - unique ID not available in all flash versions + * - Hardware version codes are reported as-is, anticipating + * recipient to know what the codes means. + */ + +typedef struct mr_rsp_hwinf { + uint8_t guid[MR_GUID_LEN]; /* Unique ID, from SMC */ + uint8_t board; /* Board type, SMC HW 17:16 */ + uint8_t fab; /* Fab version, SMC HW 10:8 */ + uint8_t sku; /* SKU #, SMC HW 2:0 */ + uint8_t slot; /* PCI-E slot, get from where ? */ + uint8_t rev; /* Revision, component ID 16:19 */ + uint8_t step; /* Stepping, component ID 12:15 */ + uint8_t substep; /* Sub-stepping, component ID 8:11 */ + uint8_t serial[MR_SENO_LEN]; /* Serial number, from SMC */ +} MrRspHwInf; + + + +/* + * MIC API version + * REQ_PVER Notes: + * - returns RAS_VER string the module was built with. 
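+ * - the string follows the Pascal convention noted above, so a host
+ *   client could print it as (illustrative only):
+ *       printf("RAS API %.*s\n", rsp.api[0], rsp.api + 1);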
+ */ + +typedef struct mr_rsp_pver { + char api[MR_PVER_LEN]; /* Ras module version */ +} MrRspPver; + + + +/* + * MIC uOS/Flash version + * REQ_VERS Notes: + * - unclear at this point what the lengths of these strings are. + * The limit of 128 bytes is a 'best safe guess' and may change. + * - KnF: My card has 3 flash strings, for now that's the count. + * - KnC: Has fewer defined version strings, currently only fboot0 + * string has been defined. + */ + +typedef struct mr_rsp_vers { + char fboot0[MR_VERS_LEN]; /* Fboot 0 version */ + char fboot1[MR_VERS_LEN]; /* Fboot 1 version */ + char flash[3][MR_VERS_LEN]; /* Flash block versions */ + char uos[MR_VERS_LEN]; /* uOS kernel version */ + char fsc[MR_VERS_LEN]; /* Fan controller version */ +} MrRspVers; + + + +/* + * Core frequency + * REQ_CFREQ Notes: + * - current is clock read from CURRENTRATIO register. + * - default/requested clock is read from COREFREQ register. + * In KnF, the CURRENTRATIO is not used and therefore + * COREFREQ s reported as current speed and the default + * is simply the first value registered (at module load). + * - supported speeds are part of freq/voltage pairs maintained + * by the cpu_freq driver as part of PM (cpu_freq driver). + * - unclear if we should allow manual control (writes). + */ + +typedef struct mr_rsp_freq { + uint32_t cur; /* Actual core speed in kHz */ + uint32_t def; /* Set core speed in kHz */ + uint32_t slen; /* Supported count */ + uint32_t supt[MR_PTAB_LEN]; /* Supported speed list in kHz */ +} MrRspFreq; + +/* + * Set core frequency + * New frequency (in kHz) passed in MrHdr.parm + * SET_CFREQ Notes: + * - need to turn off PM for this to stick + */ + + + +/* + * Core voltage + * REQ_CVOLT Notes: + * - KnF: Two core voltages; current voltage set from COREVOLT + * register and sense1 read in the BOARD_VOLTAGE_SENSE register. + * - KnC: 3 potential sources; SVID, SMC, and SBOX registers. + * SBOX regs require SMC telemetry which is uncertain. + * SVID does not work in A0, B0 is TBD. + * SMC will eventually relay VR data. + * Only SVID gives both set and actual values. + * Only SMC sets c_val field, zero is good. + * - Supported voltages are either determined from what the VRs + * can support or if PM is active it is part of the freq/voltage pairs + * maintained by the cpu_freq driver as part of PM (cpu_freq driver). + */ + +typedef struct mr_rsp_volt { + uint32_t cur; /* Core voltage read in uV */ + uint32_t set; /* Core voltage set in uV */ + uint8_t c_val; /* Valid bits, volt read */ + uint32_t slen; /* Supported count */ + uint32_t supt[MR_PTAB_LEN]; /* Supported voltage list in uV */ +} MrRspVolt; + +/* + * Set core voltage + * New voltage passed in MrHdr.parm + * SET_CVOLT Notes: + * - need to turn off PM for this to stick + * - Unclear if we should allow manual control through this API. + */ + + + +/* + * Card power + * REQ_PWR Notes + * - Power status only avalable on KnC via SMC query + * - VR status on KnC may come from VRs directly or from SMC query + * - VR status on KnF comes from SBOX registers (telemtry) + * - If available, status bits from query is provided, zero is good. 
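+ * - illustrative host-side check (sketch, 'pwr' being a MrRspPower
+ *   reply): a reading is usable when its valid bits are zero, e.g.
+ *       if (pwr.tot0.p_val == 0)
+ *               printf("Total power: %u W\n", pwr.tot0.prr / 1000000);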
+ */ + +typedef struct mr_rsp_pws { /* Power sensor status */ + uint32_t prr; /* Current reading, in uW */ + uint8_t p_val; /* Valid bits, power */ +} MrRspPws; + +typedef struct mr_rsp_vrr { /* Voltage regulator status */ + uint32_t pwr; /* Power reading, in uW */ + uint32_t cur; /* Current, in uA */ + uint32_t volt; /* Voltage, in uV */ + uint8_t p_val; /* Valid bits, power */ + uint8_t c_val; /* Valid bits, current */ + uint8_t v_val; /* Valid bits, voltage */ +} MrRspVrr; + +typedef struct mr_rsp_power { + MrRspPws tot0; /* Total power, win 0 */ + MrRspPws tot1; /* Total power, win 1 */ + MrRspPws inst; /* Instantaneous power */ + MrRspPws imax; /* Max instantaneous power */ + MrRspPws pcie; /* PCI-E connector power */ + MrRspPws c2x3; /* 2x3 connector power */ + MrRspPws c2x4; /* 2x4 connector power */ + MrRspVrr vccp; /* Core rail */ + MrRspVrr vddg; /* Uncore rail */ + MrRspVrr vddq; /* Memory subsystem rail */ +} MrRspPower; + + + +/* + * Power envelope + * REQ_PLIM Notes: + * - power envelope is a PM property. A physical limit + * is given to PM, which then calculate derivative high + * and low water mark figures. + * - values are retrieved from PM module + */ + +typedef struct mr_rsp_plim { + uint32_t phys; /* Physical limit, in W */ + uint32_t hmrk; /* High water mark, in W */ + uint32_t lmrk; /* Low water mark, in W */ +} MrRspPlim; + +/*TBD + * Set power envelope + * New value passed in MrHdr.parm + * SET_PLIM Notes: + * - not sure if setting this should be allowed at all. + */ + + + +/* + * Core information + * REQ_CLST Notes: + * - for the average user a core count is all required, since + * logically the cores are _always_ enumerated 0 .. -1. + * Physical enumeration, such as ring stop, are not useful. + * - perhaps this request should return the CPU bitfields from + * the uOS of offline, online, possible, and present masks. + * Would allow watching of PM activity. + */ + +typedef struct mr_rsp_clst { + uint16_t count; /* Cores present */ + uint16_t thr; /* Threads per core */ +} MrRspClst; + + +/* + * Set core enable/disable + * Core id & set/reset value passed in MrHdr.parm + * ENB_CORE/DIS_CORE Notes: + * - uOS Linux does not have write access to HW config in SPI flash. + * No way to enable/disable cores + * - only listed here since if compatibility with FreeBSD is needed. + */ + + + +/* + * Memory device info + * REQ_GDDR Notes: + * - This is read from scratch9, i.e. provided by bootstrap. + */ + +typedef struct mr_rsp_gddr { + char dev[MR_GVND_LEN]; /* Device vendor */ + uint16_t rev; /* Device revision */ + uint32_t size; /* Device size, in Mbit/device */ + uint32_t speed; /* Transactions speed, kT/sec */ +} MrRspGddr; + + + +/* + * GDDR frequencies + * REQ_GFREQ Notes: + * - current clock can be read from MEMORYFREQ register + * - the GDDR nominal frequency is reported + * - the supported frequency list contains values that PLLs + * are capable of producing. Info is of limited use, since + * there is no way to control the GDDR frequency (locked by fuses). + */ + +typedef struct mr_rsp_gfreq { + uint32_t cur; /* Current GDDR speed in kHz */ + uint32_t def; /* Default GDDR speed in kHz */ + uint32_t slen; /* Supported count */ + uint32_t supt[MR_PTAB_LEN]; /* Supported speeds list in kHz */ +} MrRspGfreq; + +/* + * Set GDDR frequency + * New frequency passed in MrHdr.parm + * SET_GFREQ Notes: + * - uOS cannot alter the PLLs because it requires retraining, which + * causes loss of memory content. 
+ * - KnF: uOS does not have write access to SPI flash, which is required + * to modify the GDDR frequency at next reboot. + * - KnC: GDDR frequency is hard locked by fuses, cannot change, ever!!! + */ + + + +/* + * GDDR voltages + * REQ_GVOLT Notes: + * - KnF: Two GDDR voltages; current voltage set from MEMVOLT + * register and sense2 from BOARD_VOLTAGE_SENSE register. + * MEMVOLT register always returns zero, only sense2 + * actually returns something useful in current Si. + * - KnC: 3 potential sources; SVID, SMC, and SBOX registers. + * SBOX regs require SMC telemetry which is uncertain. + * SVID does not work in A0, B0 is TBD. + * SMC will eventually relay VR data + * Only SVID gives both set and actual values. + * Only SMC sets c_val field, zero is good. + * - Supported voltages reported are voltages the VRs can be programmed + * to supply. Info is of limited use, since there is no way to control + * the GDDR voltage (locked by fuses). + */ + +typedef struct mr_rsp_gvolt { + uint32_t cur; /* GDDR voltage read in uV */ + uint32_t set; /* GDDR voltage set in uV */ + uint8_t c_val; /* Valid bits, volt read */ + uint32_t slen; /* Supported count */ + uint32_t supt[MR_PTAB_LEN]; /* Supported voltage list in uV */ +} MrRspGvolt; + +/* + * Set GDDR voltage + * New voltage passed in MrHdr.parm + * SET_GVOLT Notes: + * - uOS cannot alter the VR settings at all. Even if it could + * then it still clash with the need to retrain and memory loss. + * - KnF: uOS does not have write access to SPI flash, which is required + * to modify the GDDR voltage at next reboot. + * - KnC: GDDR voltage is hard locked by fuses, cannot change, ever!!! + */ + + + +/* + * Board temperatures + * REQ_TEMP Notes: + * - CPU die temps can be read from THERMAL_STATUS (highest + * of several sensors) and CURRENT_DIE_TEMP registers. + * The die sensors values do not match the status + * value, so the conversion formula or calibration + * needs a re-visit. + * - If we could get at them, we could provide readings + * from the following devices, but are they all useful? + * Fan inlet sensor + * Fan exhaust sensor + * GDDR temp (one chip is measured) sensor + * Vccp VR + * Vddg VR + * Vddq VR + * - most devices report current and maximum temperatures in + * degrees Celcius as a signed integer, 9 bits for die temp + * and 8 bits for voltage regulators, 12 bit for sensors. + */ + +typedef struct mr_rsp_tsns { + int16_t cur; /* Current temperature, in C */ + int8_t c_val; /* Valid bits, if available */ +} MrRspTsns; + +typedef struct mr_rsp_tdie { + int16_t cur; /* Current temperature, in C */ + int16_t max; /* Maximum temperature, in C */ +} MrRspTdie; + +typedef struct mr_rsp_temp { + MrRspTsns die; /* Highest on-die measure */ + MrRspTdie dies[MR_DIES_LEN]; /* All on-die measures */ + MrRspTsns brd; /* Highest board measure */ + MrRspTsns fin; /* Fan inlet */ + MrRspTsns fout; /* Fan outlet */ + MrRspTsns gddr; /* Gddr device */ + MrRspTsns vccp; /* Vccp VR */ + MrRspTsns vddg; /* Vddg VR */ + MrRspTsns vddq; /* Vddq VR */ +} MrRspTemp; + + + +/* + * Fan speed + * REQ_FAN Notes: + * - fan status is reported in RPM and it's control is + * a pulse with modulation ratio to 255, i.e. 0 is min, + * 127 is ~50% and 255 is max. + * - the card has logic for controlling two fans. + * Only one is used and we only report status for one. 
+ */
+
+typedef struct mr_rsp_fan {
+ uint16_t rpm; /* Fan speed, rpm */
+ uint8_t pwm; /* Active PWM ratio, 0..255 */
+ uint8_t override; /* Override flag */
+ uint8_t r_val; /* Valid bits, speed */
+ uint8_t p_val; /* Valid bits, PWM */
+} MrRspFan;
+
+/*
+ * Set fan speed
+ * Control is passed in MrHdr.parm (struct fits into 32 bit)
+ * SET_FAN Notes:
+ * - this may collide with OOB methods (such as IPMI)
+ *   that have priority, no guarantee this will stick.
+ * - changing fan speed parameters may interfere
+ *   with PM in undefined ways.
+ */
+
+typedef struct mr_set_fan {
+ uint8_t override; /* Override enable flag */
+ uint8_t pwm; /* Force PWM ratio, 0..255 */
+} MrSetFan;
+
+
+
+/*
+ * Error correction mode
+ * REQ_ECC Notes:
+ * - retrieve this info from one (any) of the gboxes.
+ */
+
+typedef struct mr_rsp_ecc {
+ uint32_t enable; /* ECC mode: 1 enabled, 0 disabled */
+} MrRspEcc;
+
+/*
+ * Set error correction mode
+ * New mode passed in MrHdr.parm
+ * SET_ECC Notes:
+ * - ECC cannot be changed on the fly by uOS, requires retraining
+ *   of GDDR which causes loss of memory content.
+ * - uOS Linux does not have write access to HW config in SPI flash.
+ *   No way to change ECC enable/disable setting.
+ */
+
+
+
+/*
+ * Trace level
+ * REQ_TRC Notes:
+ * - No idea what support this has in uOS Linux.
+ */
+
+typedef struct mr_rsp_trc {
+ uint32_t lvl; /* Debug trace level */
+} MrRspTrc;
+
+/*
+ * Set trace level
+ * New level passed in MrHdr.parm
+ * SET_TRC Notes:
+ * - No idea what this does in uOS Linux (nothing yet).
+ */
+
+
+
+/*
+ * Turbo setting
+ * REQ_TRBO Notes:
+ * - Retrieve current actual turbo mode and state
+ * - 'set' value: 1 if enabled, 0 otherwise
+ * - 'state' value: 1 if active, 0 otherwise
+ * - 'avail' value: 1 if TRBO supported, 0 otherwise
+ */
+
+typedef struct mr_rsp_trbo {
+ uint8_t set; /* Turbo mode */
+ uint8_t state; /* Turbo state */
+ uint8_t avail; /* Turbo mode available */
+ uint8_t pad; /* Pad to 32 bit */
+} MrRspTrbo;
+
+/*
+ * Set turbo mode
+ * New mode passed in MrHdr.parm
+ * SET_TRBO Notes:
+ * - Set always allowed, but silently ignored if not available.
+ */
+
+
+
+/*
+ * LED override
+ * REQ_LED Notes:
+ * - KnC: Retrieve current LED mode setting, 0=normal, 1=identify
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+typedef struct mr_rsp_led {
+ uint32_t led; /* LED mode setting */
+} MrRspLed;
+
+/*
+ * Set LED mode
+ * New mode passed in MrHdr.parm
+ * SET_LED Notes:
+ * - KnC: Mode values
+ *   0 is normal SMC control (fast blink)
+ *   1 is identify mode (2 blinks every 2 seconds)
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+
+
+/*
+ * Overclocking
+ * REQ_OCLK Notes:
+ * - Currently no idea how to represent overclocking state
+ * - Overclocking not supported, return MR_RSP_NOVAL
+ */
+
+typedef struct mr_rsp_oclk {
+ uint32_t freq; /* Overclocking setting */
+} MrRspOclk;
+
+/*
+ * Set overclocking mode
+ * New mode passed in MrHdr.parm
+ * SET_OCLK Notes:
+ * - Overclocking not supported, return MR_RSP_NOVAL
+ */
+
+
+
+/*
+ * Processor utilization (OS status)
+ * REQ_CUTL Notes:
+ * - returned info is a simple sum of 4 logical CPUs
+ * - the counter units returned are Linux kernel jiffies,
+ *   typically in range 1 - 10 ms, based on continuous
+ *   counters maintained by the kernel. The number of
+ *   jiffies per second is reported for scaling purposes.
+ * In order to get a current 'utilization' figure, the + * host needs to query the counters at regular intervals + * and use this formula to achieve a percentage: + * u = ((c2 - c1) / (t2 - t1)) * 100 + * or + * u = ((c2 - c1) * 100) / (t2 - t1) + * where t2 - t1 = elapsed jiffies between samples + * c2 - c1 = usage jiffy counts between samples + * - the listed counters does not add up to cover the + * wall clock time exactly, sampling errors do occur. + * - counters for iowait, irq, and softirq are not included. + * - jiffy counters are updated by the timer tick interrupt + * handler. It's accuracy is known to be limited, see + * Documentation/cpu-load.txt for details. + * - counters are reported regardless of core sleep states + */ + +typedef struct mr_rsp_ccnt { + uint64_t user; /* Normal user mode jiffies */ + uint64_t nice; /* 'Nice' user mode jiffies */ + uint64_t sys; /* System mode jiffies */ + uint64_t idle; /* Idle time jiffies */ +} MrRspCcnt; + +typedef struct mr_rsp_cutl { + uint32_t tck; /* Actual jiffs/sec (scaled by 256) */ + uint16_t core; /* Cores reported on */ + uint16_t thr; /* Threads per core */ + uint64_t jif; /* Jiffy counter at query time */ + MrRspCcnt sum; /* System wide counters */ + MrRspCcnt cpu[MR_CORE_LEN]; /* Counters per core */ +} MrRspCutl; + + + +/* + * Memory utilization (OS status) + * REQ_MEM Notes: + * - memory snapshot is obtained from kernel structs. + * No walk of page descriptors is performed. + * - Not all memory stats are visible (exported to) modules. + * + *TBD: + * - Need clarification on what memory utilization means. + * For now the total, free and buffer memory is reported. + */ + +typedef struct mr_rsp_mem { + uint32_t total; /* Total usable RAM in kB */ + uint32_t free; /* Free memory in kB */ + uint32_t bufs; /* Buffer storage in kB */ +} MrRspMem; + + + +/* + * Process management (OS status) + * REQ_OS/REQ_PROC/REQ_THRD Notes: + * - split in 3 levels of detail: + * 1) Get set of applications (exclude kernel processes and threads) + * 2) Get details on specified application (pid in MrHdr.parm), + * which includes a thread pid list (up to 256 threads). + * 3) Get details on specific thread (thread id in MrHdr.parm) + * Opcodes 2 and 3 will, apart from thread list, mostly report the same + * set of details. What needs monitoring (see 'man proc', section on + * /proc//stat and /proc//status for what's available)? + * - process time counters are continuous, so if any ratio between + * the time a process/thread spends and actual wall clock time is + * to be calculated, the same logic for dynamic display applies as + * for the CUTL counters. I.e. a jiffy stamp is needed in the reply. + *TBD: + * - Introduce some sanity in time measurements. + * - Level 3 (thread details) is not implemented (is it needed ?). + * - Add ppid & credentials in MrRspProc? Needed to make a "top" display. 
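+ *
+ * Illustrative host-side use of the CUTL formula above (sketch, not
+ * part of the interface), given two MrRspCutl samples a and b taken
+ * some seconds apart:
+ *
+ *   busy = (b.sum.user + b.sum.nice + b.sum.sys)
+ *        - (a.sum.user + a.sum.nice + a.sum.sys);
+ *   util = (100 * busy) / (b.jif - a.jif);
+ *
+ * Summed over all CPUs the result can exceed 100; the same delta
+ * logic applies per core via the cpu[] counters and to the
+ * utime/stime fields of MrRspProc.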
+ */ + +typedef struct mr_rsp_os { + uint64_t uptime; /* Seconds since OS boot */ + uint64_t loads[3]; /* 1, 5, 15 minute load average */ + uint32_t alen; /* Application count */ + uint32_t apid[256]; /* Application PIDs */ +} MrRspOs; + +typedef struct mr_rsp_proc { + uint32_t pid; /* Process ID */ + char name[16]; /* Program name (less path) */ + uint64_t utime; /* User time in uS */ + uint64_t stime; /* System time in uS */ + uint64_t etime; /* Elapsed time in uS */ + uint32_t rss; /* Resident set, in kB */ + uint32_t vm; /* VM size, in kB */ + uint32_t tlen; /* Thread count */ + uint32_t tpid[256]; /* Process threads */ +} MrRspProc; + + + +/* + * Terminate process + * Signal passed in MrHdr.parm bits 31:24 (see 'kill -l') + * Process ID passed in MrHdr.parm bits 23:0 (see /proc/sys/kernel/pid_max) + * CMD_PKILL Notes: + * - This is specifically for MPI style cluster managers + * who wants to rid the card of a specific process. + * - Processes owned by users ID's less than 500 are immune to this. + */ + + + +/* + * Terminate user + * Signal passed in MrHdr.parm bits 31:24 (see 'kill -l') + * User ID passed in MrHdr.parm bits 23:0 (see /etc/login.defs). + * CMD_UKILL Notes: + * - This is specifically for MPI style cluster managers to + * rid the card of processes owned by a specific user ID. + * - User ID's below 500 will silently be ignored. + */ + + + +/* + * Read SMC register + * MR_GET_SMC Notes: + * - Both SMC and FSC devices are accessed through I2C busses, which + * means that retrieval will be slow (order of milli seconds). + * - KnC: allows direct access to the SMC CSRs, which can be read + * or written in any random order. + * SMC CSR definitions are not within the scope of this API. + * Register number passed in MrHdr.parm bits 7:0 (8 bits). + * SMC registers are 32 bit, except one (UUID) that is 16 byte. + * - KnF: allows direct access to the fan speed controller (FSC) + * status registers on board temp and power sensors. + * The FSC execute command register every 50 mSec, which means + * that register needs 'SET' and hold for 50 mSec before any + * value can be returned. For telemetry data the SET is done + * implicitly, all other has to execute a 'SET' before running + * a 'GET' command. + * + FSC register definitions are not within the scope of this API. + * All sensor data returns are 8 bit wide. + */ + +typedef struct mr_rsp_smc { + uint8_t reg; /* Register number */ + uint16_t width; /* Valid return bytes (4 or 16) */ + union { + uint32_t val; /* Requested register value */ + uint8_t uuid[16]; /* Unique identifier */ + uint8_t serial[12]; /* Card serial number */ + } rtn; +} MrRspSmc; + +/* + * Write SMC register + * Register number passed in MrHdr.parm bits 31:24 (8-bit address decode). + * Register value passed in MrHdr.parm bits 23:0 (24 bit data). + * MR_SET_SMC Notes: + * - Improper use of this command can cause thermal shutdown of the card. + * - Improper use can interfere with power management. + * - KnC: For security reasons only the following registers are writeable: + * 20, 22 IPMI + * 2b, 2c, 2d, 2f, 30, 31, 32, 33 PM control parameters + * 4b Fan Adder + * 60 LED control + * No SMC registers of interest are more than 16 bits wide. + * - KnF: For security reasons only the followingregisters are writable: + * 0 Fan 1 Speed Override + * 1 Power Management and Control Config + * 11 General Status command + * Selector is 8 bits wide and only valid values are + * 20, 21, 22, 23 Power sensors, 1s avg. 
+ * 30, 31, 32, 33 Power sensors, 1 sample + * a1, a2, a3, a4, a5 Max temps + */ + + + +/* + * Get PM config mode + * REQ_PMCFG notes: + * - Return value is reported 'as-is' from the PM module. + */ + +typedef struct mr_rsp_pmcfg { + uint32_t mode; /* Current PM operation mode */ +} MrRspPmcfg; + + + +/* + * Read Power triggers + * Consist of two trigger points (power,time), which can be calculated + * from SKU at card power-on or be persistent across reboots. + * At trigger (PROCHOT), GPU Hot gets asserted + * At trigger (PWRALT), Power Alert gets asserted + * + * MR_REQ_PROCHOT, MR_REQ_PWRALT Notes: + * - KnC: Read SMC registers for trigger 0 and 1 respectively. + * GPUHOT: registers 0x2c and 0x2d + * PWRALT: registers 0x2f and 0x30 + * - KnF: not implemented (error MR_ERR_UNSUP) + */ + +typedef struct mr_rsp_ptrig { + uint16_t power; /* Power limit, Watt */ + uint16_t time; /* Time windows, mSec */ +} MrRspPtrig; + +/* + * Write Power triggers + * MR_SET_PROCHOT, MR_SET_PWRALT Notes + * Structure MrRspPtrig passed in MrHdr.parm + * Trigger PROCHOT.power must be higher than trigger PWRALT.power. + * - KnC: Write SMC registers for trigger 0 and 1 respectively. + * GPUHOT: registers 0x2c and 0x2d + * PWRALT: registers 0x2f and 0x30 + * - KnF: not implemented (error MR_ERR_UNSUP) + * Warning: MT does not check for GPUHOT.power >= PWRALT.power. + *TBD: Should it? + * It is anticipated that changes follows reads, i.e. checking + * can be checked in application software. + */ + + + +/* + * Read Persistent Power triggers flag + * If set, changes to Power Triggers will be permanent + * MR_REQ_PERST Notes: + * - KnC: Reads bit 0 of SMC register 0x32 + * - KnF: not implemented (error MR_ERR_UNSUP) + */ + +typedef struct mr_rsp_perst { + uint32_t perst; /* Persistent power triggers */ +} MrRspPerst; + +/* + * Write Persistent Power triggers flag + * New value passed in MrHdr.parm + * MR_SET_PERST Notes: + * - KnC: Writes bit 0 of SMC register 0x32 + * - KnF: not implemented (error MR_ERR_UNSUP) + */ + + +/* + * Read Throttle states + * Returns status of current and previous throttle state + * retrieved from the card side PM module. + * MR_REQ_TTL Notes: + * - KnC: Calls PM for latest information. + * Note that the 'active' flags can toggle very often, + * which may make it less informative for display. + * Time tracked in jiffies, not true mSec resolution. + * - KnF: not implemented (error MR_ERR_UNSUP) + */ + +typedef struct mr_rsp_tstat { + uint8_t active; /* Currently active */ + uint32_t since; /* Length of current throttle, mSec */ + uint32_t count; /* Number of throttles */ + uint32_t time; /* Total time throttled, mSec */ +} MrRspTstat; + +typedef struct mr_rsp_ttl { + MrRspTstat thermal; /* Thermal throttle state */ + MrRspTstat power; /* Power throttle state */ + MrRspTstat alert; /* Power alert state */ +} MrRspTtl; + + +#pragma pack(pop) /* Restore to entry conditions */ + +#ifdef __cplusplus +} /* C++ guard */ +#endif + +#endif /* Recursion block */ diff --git a/ras/micras_common.c b/ras/micras_common.c new file mode 100644 index 0000000..4011ec0 --- /dev/null +++ b/ras/micras_common.c @@ -0,0 +1,968 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS MT module, common code + * + * Code and data structures to handle get/set tasks for KnC and KnF. + * Parties accessing the data structures are supposed to use the + * micras_mt_tsk() routines to ensure integrity and consistency. + * Particularly important when handling sysfs nodes and actions + * requested from SCIF connections must use that method in order + * to guarantee serialized access. + * + * Even if read-only access to latest valid data is required, + * it should go through micras_mt_tsk() using dedicated handlers + * in this module. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" + + +/* + * Persistent data accessible through the CP api. + * Some functions just read/modify hardware CSRs + * and thus need no storage between invocations. + */ + + struct mr_rsp_hwinf hwinf; /* Card specific */ + struct mr_rsp_vers vers; /* Card specific */ +static struct mr_rsp_pver pver; + struct mr_rsp_freq freq; /* Card specific */ + struct mr_rsp_volt volt; /* Card specific */ + struct mr_rsp_power power; /* Card specific */ + struct mr_rsp_plim plim; /* Card specific */ +static struct mr_rsp_clst clst; + struct mr_rsp_gddr gddr; + struct mr_rsp_gfreq gfreq; /* Card Specific */ + struct mr_rsp_gvolt gvolt; /* Card specific */ + struct mr_rsp_temp temp; /* Card specific */ + struct mr_rsp_ecc ecc; /* Card specific */ +static struct mr_rsp_trc trc; + struct mr_rsp_trbo trbo; /* Card specific */ + struct mr_rsp_pmcfg pmcfg; /* Card specific */ + + +/* + * Map of SKUs for KnX cards (currently known, will change) + * The SKU is identified solely from the PCIe ID and sub-ID. + * A zero sub-ID is a don't care. + * + *TBD: core counts in KnF needs update, not all have 32. 
+ * + * Notes: + * - Unless the PCIe subID differs, there are two 2250 cards + * that can't be distinguished from each other, one has 8 TXs + * and the other has none. PO cards -> impact only internal. + * - Not sure exactly what 2254 is, suspect MPI prototype. + */ + +#define VD(v, d) (PUT_BITS(15,0,(v)) | PUT_BITS(31,16,(d))) + +static struct sku { + uint32_t devID; /* PCIe Vendor and device ID */ + uint32_t subID; /* PCIe Sub- Vendor and device ID */ + uint8_t revNo; /* PCIe Revision number */ + uint8_t cr; /* Core count */ + uint8_t ch; /* Memory channels */ + uint8_t tx; /* TX samplers (only in KnC) */ + char * name; /* SKU name */ +} skuList[] = { + { VD(0x8086, 0x2240), 0, 0x00, 32, 8, 0, "E1" }, /* KnF */ + { VD(0x8086, 0x2241), 0, 0x00, 32, 8, 0, "E2" }, /* KnF */ + { VD(0x8086, 0x2242), 0, 0x00, 32, 8, 0, "E3" }, /* KnF */ + { VD(0x8086, 0x2243), 0, 0x00, 32, 8, 0, "E3" }, /* KnF */ + { VD(0x8086, 0x2249), VD(0x8086, 0xed08), 0, 32, 4, 0, "Ed" }, /* KnF */ + { VD(0x8086, 0x2249), VD(0x8086, 0xed0a), 0, 32, 4, 0, "Eb" }, /* KnF */ + { VD(0x8086, 0x224a), 0, 0x00, 32, 8, 0, "Eb" }, /* KnF */ + + { VD(0x8086, 0x2250), 0, 0x00, 60, 16, 0, "SKU1/SKU2" }, /* KnC: ES1, ES1B */ + { VD(0x8086, 0x2250), 0, 0x10, 60, 16, 0, "SKU2" }, /* KnC: ES2 */ + { VD(0x8086, 0x2250), 0, 0x11, 60, 16, 0, "SKU2" }, /* KnC: Mkt2 */ + { VD(0x8086, 0x2250), 0, 0x20, 60, 16, 0, "SKU2" }, + { VD(0x8086, 0x2251), 0, 0x00, 48, 16, 8, "SKU2" }, + { VD(0x8086, 0x2252), 0, 0x00, 48, 16, 0, "SKU3" }, + { VD(0x8086, 0x2253), 0, 0x00, 40, 8, 0, "SKU4/SKU5" }, /* KnC: ES0, ES1 */ + { VD(0x8086, 0x2253), 0, 0x10, 40, 8, 0, "SKU5" }, + { VD(0x8086, 0x2254), 0, 0x00, 62, 16, 0, "??" }, /* KnC: ?? */ + { VD(0x8086, 0x2255), 0, 0x00, 62, 16, 8, "SKUX" }, /* KnC: A0-PO */ + { VD(0x8086, 0x2256), 0, 0x00, 48, 12, 7, "SKU5" }, /* KnC: A0-PO */ + { VD(0x8086, 0x2257), 0, 0x00, 4, 16, 0, "SKUZ" }, + { VD(0x8086, 0x2258), 0, 0x00, 62, 16, 0, "SKU1" }, /* KnC: ES1, ES1B */ + { VD(0x8086, 0x2258), 0, 0x10, 62, 16, 0, "SKU1" }, + { VD(0x8086, 0x2259), 0, 0x00, 52, 16, 0, "SKU3" }, /* KnC: ES1 */ + { VD(0x8086, 0x225a), 0, 0x00, 48, 12, 0, "SKU4" }, /* KnC: ES1, ES1B */ + { VD(0x8086, 0x225a), 0, 0x10, 48, 12, 0, "SKU4" }, /* KnC: ES2 */ + { VD(0x8086, 0x225a), 0, 0x11, 48, 12, 0, "SKU4" }, /* KnC: Int5 */ + { VD(0x8086, 0x225b), 0, 0x00, 52, 12, 0, "SKU3" }, + { VD(0x8086, 0x225b), 0, 0x10, 52, 12, 0, "SKU3" }, + { VD(0x8086, 0x225c), 0, 0x10, 61, 16, 0, "SKU1" }, /* KnC: Mkt1 */ + { VD(0x8086, 0x225c), 0, 0x11, 61, 16, 0, "SKU1" }, /* KnC: Mkt1 */ + { VD(0x8086, 0x225c), 0, 0x20, 61, 16, 0, "SKU1" }, /* KnC: Mkt1 */ + { VD(0x8086, 0x225d), 0, 0x10, 57, 12, 0, "SKU4" }, /* KnC: Mkt4 */ + { VD(0x8086, 0x225d), 0, 0x11, 57, 12, 0, "SKU4" }, /* KnC: Mkt3, Mkt4 */ + { VD(0x8086, 0x225d), 0, 0x20, 57, 12, 0, "SKU4" }, + { VD(0x8086, 0x225e), 0, 0x11, 57, 16, 0, "GZ" }, + { VD(0x8086, 0x225e), 0, 0x20, 57, 16, 0, "GZ" }, +}; + + +/* + * Map of GDDR vendor ID vs company names + */ + +static struct { + int id; + char * vendor; +} GddrVendors[] = { + { 1, "Samsung" }, + { 2, "Quimonda" }, + { 3, "Elpida" }, + { 6, "Hynix" }, +}; + + + +/* +** +** Initializations +** +** This has two intended purposes: +** - Do a on-time effort to collect info on properties that +** are not going to change after the initial setup by +** either bootstrap or kernel initialization. +** - Collect initial values on things we can modify. +** Intent is that unloading the ras module should reset +** all state to that of the time the module was loaded. 
+** +*/ + +void __init +mr_mt_init(void) +{ + static int only_once = 1; + uint32_t scr4, scr9, scr13; + uint32_t eax, ebx, ecx, edx; + uint32_t thr, hwt; + uint32_t id; + int i; + + if (! only_once) + return; + only_once = 0; + + /* + * HWINF: + * Scratch register 13 has more info than the hwinf record + * currently can contain, may revisit. + * 3:0 Substepping + * 7:4 Stepping (0 A, 2&3 B, 4 C, 6 D) + * 11:8 Model + * 15:12 Family (11 KnF) + * 17:16 Processor + * 19:18 Platform (0 Silicon, 1 FSIM, 2 MCEMU) + * 23:20 Extended model + * 31:24 Extended family + * + * Valid KnF steppings (Step + Substep): + * "A0" (0 + 0), "A1" (0 + 1), "A2" (0 + 2), + * "B0" (2 + 0), "B1" (3 + 1), "C0" (4 + 0), + * "D0" (6 + 0) + * Valid KnC steppings (Step + Substep): + * TBD: + */ + scr13 = mr_sbox_rl(0, SBOX_SCRATCH13); + hwinf.rev = GET_BITS(11, 8, scr13); + hwinf.step = GET_BITS( 7, 4, scr13); + hwinf.substep = GET_BITS( 3, 0, scr13); + + /* + * VERS: + * Add OS version + */ + vers.uos[0] = scnprintf(vers.uos + 1, MR_VERS_LEN -2, + "Linux version: %s (build %s)", + init_uts_ns.name.release, + init_uts_ns.name.version); + + /* + * PVERS: + * Make MicRas version available + */ + pver.api[0] = scnprintf(pver.api + 1, MR_PVER_LEN -2, + "%s", RAS_VER); + + /* + * CLST: + * On regular CPU's this is read from CPUID 2 (htt cores) + * and CPUID 4 (die cores), threads per cores is htt/die. + * This does not work the same way in MIC, cores & threads + * per core on various SKUs is not reflected by the CPUIDs. + * All we have is the number of registered APIC IDs, which + * happens to be the same as logical CPUs (htt cores). + * The threads per core (die cores) is given by bootstrap in + * scratch register #4 as a bit field. + * 3:0 Threads per core (mask) + * 5:4 Cache size (0,1,2: 512K, 3: 256K) + * 9:6 GBOX channel count (0 based) + * 29:25 ICC divider for MCLK + * 30 Soft reset boot + * 31 Internal flash build + */ + cpuid(1, &eax, &ebx, &ecx, &edx); + hwt = GET_BITS(23, 16, ebx); + if (hwt > nr_cpu_ids) + hwt = nr_cpu_ids; + scr4 = mr_sbox_rl(0, SBOX_SCRATCH4); + thr = GET_BITS(3, 0, scr4); + thr = bitmap_weight((const unsigned long *) &thr, 4); + if (thr) { + if (hwt % thr) + printk("mr_mt_init: cpu/thr mismatch: hwt %d, thr %d, cor %d, (%d)\n", + hwt, thr, hwt / thr, hwt % thr); + clst.thr = thr; + } + else { + printk("Who trashed scratch #4? Val 0x%08x => 0 threads/core?\n", scr4); + clst.thr = 4; /* Best guess */ + } + clst.count = hwt / 4; + + /* + * GDDR: + * Bootstrap leaves information in scratch register #9 + * about the GDDR devices. The layout is: + * 3:0 Vendor ID, see table GddrVendors above + * 7:4 Revision + * 9:8 Density (00 = 512, 01 = 1024, 02 = 2048) + * 11:10 FIFO depth + * 15:12 DRAM info ?? 
+ * 29 ECC enable + */ + scr9 = mr_sbox_rl(0, SBOX_SCRATCH9); + id = GET_BITS(3, 0, scr9); + for(i = 0; i < ARRAY_SIZE(GddrVendors); i++) + if (GddrVendors[i].id == id) { + gddr.dev[0] = scnprintf(gddr.dev +1, MR_GVND_LEN -2, + "%s", GddrVendors[i].vendor); + break; + } + if (i == ARRAY_SIZE(GddrVendors)) + gddr.dev[0] = scnprintf(gddr.dev +1, MR_GVND_LEN -2, "Vendor %d", id); + gddr.rev = GET_BITS(7, 4, scr9); + gddr.size = 512 * (1 << GET_BITS(9, 8, scr9)); + + /* + * Card specific initialization + */ + mr_mt_card_init(); + + /* + *TBD: Save commmon registers this module may change + */ +} + +void __exit +mr_mt_exit(void) +{ + /* + * Card specific clean-up + */ + mr_mt_card_exit(); + + /* + *TBD: Restore commmon registers this module may change + */ +} + + +/* + * Return SKU properties for this card (as string) + * Processor can be identified on it's own easily, + * but the SKU reflects the impact of fuse changes + * which don't alter the CPU id. + * + * SKU properties: + * - name Name of sku (if known) + * - mch Number of memory channels + * - txs Number of texture samplers + */ + +/* + * Why are these not defined in the includes? + */ + +#ifndef SBOX_PCIE_VENDOR_ID_DEVICE_ID +#define SBOX_PCIE_VENDOR_ID_DEVICE_ID 0x00005800 +#endif +#ifndef SBOX_PCIE_PCI_SUBSYSTEM +#define SBOX_PCIE_PCI_SUBSYSTEM 0x0000582c +#endif +#ifndef SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 +#define SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 0x00005808 +#endif + +static struct sku * +get_sku(void) +{ + static struct sku * sku; + uint32_t dev, sub, rev, fuse; + char * grp; + int i; + + if (sku) + return sku; + + dev = mr_sbox_rl(0, SBOX_PCIE_VENDOR_ID_DEVICE_ID); + rev = mr_sbox_rl(0, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8); + sub = mr_sbox_rl(0, SBOX_PCIE_PCI_SUBSYSTEM); + fuse = mr_sbox_rl(0, SBOX_SCRATCH7); + rev = GET_BITS(7, 0, rev); + fuse = GET_BITS(15, 0, fuse); + + /* + * Usually the fuse revision define a group of SKUs. + * Once that's determined we'll use the other details + * to identify the SKU within that group. + */ + if (fuse >= 0 && fuse <= 1) + grp = "A0 PO"; + else if (fuse >= 2 && fuse <= 3) + grp = "A0 ES1"; + else if (fuse >= 4 && fuse <= 50) + grp = "A0 ES1B"; + else if (fuse >= 51 && fuse <= 100) + grp = "B0 PO"; + else if (fuse >= 101 && fuse <= 150) + grp = "B0 ES2"; + else if (fuse >= 151 && fuse <= 152) + grp = "B1 PO"; + else if (fuse >= 153 && fuse <= 154) + grp = "B1 PO"; + else if (fuse == 155) + grp = "B1 QS"; + else if (fuse == 156) + grp = "B1 PRQ"; + else if (fuse == 157) + grp = "B1 PRQ/GZ"; + else if (fuse >= 158 && fuse <= 159) + grp = "B1 PRQ"; + else if (fuse >= 201 && fuse <= 203) + grp = "B2 PRQ/QS"; + else if (fuse == 253) + grp = "C0 PO"; + else if (fuse == 254) + grp = "C0 QS"; + else + grp = "???"; + + /* + * Now determine which member of the group. + * Take hints from PCIe device ID and revision. + * Device ID mappings is a mess, see table above. 
+ * Revision has a simple mapping (follows fuses): + * 0x00 => A0 cards + * 0x10 => B0 cards + * 0x11 => B1 cards + * 0x20 => C0 cards + * 0x21 => C1 cards (if ever to be made) + */ + for(i = 0; i < ARRAY_SIZE(skuList); i++) { + if (dev == skuList[i].devID) { + if (skuList[i].subID && sub != skuList[i].subID) + continue; + if (rev != skuList[i].revNo) + continue; + + /* + * Found one, this is the place to cross reference it + * - memory channels should match SCR4 bits 9:6 + */ + break; + } + } + + if (i < ARRAY_SIZE(skuList)) { + sku = skuList + i; + printk("RAS: card %x:%x:%x is a \"%s %s\" (%d cores, %d memch, %d txs)\n", + dev, sub, rev, grp, sku->name, sku->cr, sku->ch, sku->tx); + } + + return sku; +} + +#if NOT_YET +char * +mr_sku(void) +{ + struct sku * sku; + + sku = get_sku(); + return sku ? sku->name : 0; +} +#endif + +int +mr_mch(void) +{ + struct sku * sku; + + sku = get_sku(); + return sku ? sku->ch : 0; +} + +int +mr_txs(void) +{ + struct sku * sku; + + sku = get_sku(); + return sku ? sku->tx : 0; +} + + +/* +** +** MT Get functions +** +** All works the same way; they get an opague pointer to +** a place where the return structure can be placed. The +** return value is either the amount (bytes) to be shipped +** back in response or one of the MR_* error codes. +** +*/ + +int +mr_get_hwinf(void * p) +{ + struct mr_rsp_hwinf * r; + + r = (struct mr_rsp_hwinf *) p; + *r = hwinf; + return sizeof(*r); +} + + +int +mr_get_vers(void * p) +{ + struct mr_rsp_vers * r; + + r = (struct mr_rsp_vers *) p; + *r = vers; + return sizeof(*r); +} + + +int +mr_get_pver(void * p) +{ + struct mr_rsp_pver * r; + + r = (struct mr_rsp_pver *) p; + *r = pver; + return sizeof(*r); +} + + +int +mr_get_clst(void * p) +{ + struct mr_rsp_clst * r; + + r = (struct mr_rsp_clst *) p; + *r = clst; + return sizeof(*r); +} + + +int +mr_get_gddr(void * p) +{ + struct mr_rsp_gddr * r; + + r = (struct mr_rsp_gddr *) p; + *r = gddr; + return sizeof(*r); +} + + +int +mr_get_trc(void * p) +{ + struct mr_rsp_trc * r; + + r = (struct mr_rsp_trc *) p; + *r = trc; + return sizeof(*r); +} + + +int +mr_get_cutl(void * p) +{ + struct mr_rsp_cutl * r; + struct timespec tp; + struct cpu_usage_stat * u; + uint64_t user, nice, sys, idle; + int i, n; + + r = (struct mr_rsp_cutl *) p; + memset(r, '\0', sizeof(*r)); + r->tck = ACTHZ; + r->core = clst.count; + r->thr = clst.thr; + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + r->jif = timespec_to_jiffies(&tp); + + for_each_possible_cpu(i) { + u = & kstat_cpu(i).cpustat; + + user = u->user; + nice = u->nice; + sys = u->system + u->irq + u->softirq; + idle = u->idle + u->iowait; + + r->sum.user += user; + r->sum.nice += nice; + r->sum.sys += sys; + r->sum.idle += idle; + + /* + * Currently the boot processor is thread 0 of the last + * enabled core. Thus, on a 32 core machine, we get: + * + * cpu # 0 1 2 3 4 5 .. 124 125 126 127 + * core # 31 0 0 0 0 1 .. 30 31 31 31 + * apic ID 124 0 1 2 3 4 .. 123 125 126 127 + * + * The core is included in the per-cpu CpuInfo struct, + * and it should be safe to get it from there. 
+ */ + n = cpu_data(i).cpu_core_id; + if (n < r->core) { + r->cpu[n].user += user; + r->cpu[n].nice += nice; + r->cpu[n].sys += sys; + r->cpu[n].idle += idle; + } + } + + return sizeof(*r); +} + + +int +mr_get_mem(void * p) +{ + struct mr_rsp_mem * r; + struct sysinfo si; + + si_meminfo(&si); + + r = (struct mr_rsp_mem *) p; + memset(r, '\0', sizeof(*r)); + r->total = si.totalram << (PAGE_SHIFT - 10); + r->free = si.freeram << (PAGE_SHIFT - 10); + r->bufs = si.bufferram << (PAGE_SHIFT - 10); + + return sizeof(*r); +} + + +int +mr_get_os(void * p) +{ + struct mr_rsp_os * r; + uint16_t i; + struct timespec tp; + struct task_struct * t; + + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + + r = (struct mr_rsp_os *) p; + memset(r, '\0', sizeof(*r)); + r->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + r->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + r->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + r->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + /* + * Walk process list and indentify processes that + * are associated with user programs. For now we + * exclude kernel threads and non-stable processes. + * + *TBD: Really just wanted to take the task_lock, but + * it is not exported to modules. It seems to be + * tied into the RCU logic, so locking the whole + * RCU should do the trick as long as it's just + * for a very short time. + */ + i = 0; + rcu_read_lock(); + for_each_process(t) { + if ((t->flags & (PF_KTHREAD | PF_STARTING | PF_EXITING)) || + (t->group_leader && t->group_leader != t)) + continue; + + if (i < ARRAY_SIZE(r->apid)) + r->apid[i] = t->pid; + i++; + } + rcu_read_unlock(); + r->alen = i; + + return sizeof(*r); +} + + +int +mr_get_proc(void * p) +{ + struct mr_rsp_proc * r; + struct task_struct * t, * s; + struct mm_struct * mm; + struct timespec uptime, start, ts; + cputime_t utime, stime; + pid_t pid; + int err, i; + + err = -MR_ERR_NOVAL; + pid = * (uint32_t *) p; + if (! pid) + return err; + + r = (struct mr_rsp_proc *) p; + memset(r, '\0', sizeof(*r)); + do_posix_clock_monotonic_gettime(&uptime); + + rcu_read_lock(); + t = pid_task(find_pid_ns(pid, &init_pid_ns), PIDTYPE_PID); + if (t) { + /* + * Found process, get base stats + */ + r->pid = pid; + strncpy(r->name +1, t->comm, sizeof(r->name) -1); + start = t->start_time; + utime = t->utime; + stime = t->stime; + mm = get_task_mm(t); + if (mm) { +#ifdef SPLIT_RSS_COUNTING + r->rss = atomic_long_read(& mm->rss_stat.count[MM_FILEPAGES]) + + atomic_long_read(& mm->rss_stat.count[MM_ANONPAGES]); +#else + r->rss = mm->rss_stat.count[MM_FILEPAGES] + + mm->rss_stat.count[MM_ANONPAGES]; +#endif + r->vm = mm->total_vm; + mmput(mm); + } + + /* + * Next try get list of threads (if any) + */ + i = 0; + if (!t->group_leader || t->group_leader == t) { + s = t; + do { + if (s->pid != pid) { + if (i < ARRAY_SIZE(r->tpid)) + r->tpid[i++] = s->pid; + } + } while_each_thread(t, s); + } + r->tlen = i; + err = sizeof(*r); + } + rcu_read_unlock(); + + /* + * Convert values into API formats (uSec, kB). + */ + if (err > 0) { + r->name[0] = strlen(r->name +1); + ts = timespec_sub(uptime, start); + r->etime = timespec_to_ns(&ts) / NSEC_PER_USEC; + r->utime = jiffies_to_usecs(utime); + r->stime = jiffies_to_usecs(stime); + r->vm = r->vm << (PAGE_SHIFT - 10); + r->rss = r->rss << (PAGE_SHIFT - 10); + } + + return err; +} + + + +/* +** +** MT Set functions +** +** All works the same way; they get an opague pointer to +** a location where the 'set' parameter from the request is +** placed. 
Return code is one of the MR_* error codes. +** +** Input screening takes place here (to the extent possible). +** +*/ + + +#if NOT_YET +int +mr_set_gvolt(void * p) +{ + /* + * Cannot be set from uOS, pretend success + */ + return 0; +} + + +int +mr_set_gfreq(void * p) +{ + /* + * Cannot be set from uOS, pretend success + */ + return 0; +} +#endif + + +int +mr_set_trc(void * p) +{ + /* + * No idea on what to do with this + */ + trc.lvl = *(uint32_t *) p; + return 0; +} + + + +/* +** +** MT Process controls +** +*/ + +int +mr_cmd_pkill(void * p) +{ + struct task_struct * t; + const struct cred * cred; + pid_t pid; + uint32_t val; + int sig, ret; + + val = *(uint32_t *) p; + pid = GET_BITS(23, 0, val); + sig = GET_BITS(31, 24, val); + + ret = -MR_ERR_INVAUX; + rcu_read_lock(); + t = pid_task(find_pid_ns(pid, &init_pid_ns), PIDTYPE_PID); + if (t) { + if (!(t->flags & (PF_KTHREAD | PF_STARTING | PF_EXITING)) && + !(t->group_leader && t->group_leader != t)) { + + cred = __task_cred(t); + if (cred->euid >= 500) { + if (!send_sig(sig, t, 1)) + ret = 0; + } + else + ret = -MR_ERR_PERM; + } + } + rcu_read_unlock(); + + return ret; +} + + +int +mr_cmd_ukill(void * p) +{ + struct task_struct * t; + const struct cred * cred; + uid_t uid; + uint32_t val; + int sig, ret; + + val = *(uint32_t *) p; + uid = GET_BITS(23, 0, val); + sig = GET_BITS(31, 24, val); + + if (uid < 500) + return -MR_ERR_PERM; + + ret = 0; + rcu_read_lock(); + for_each_process(t) { + if ((t->flags & (PF_KTHREAD | PF_STARTING | PF_EXITING)) || + (t->group_leader && t->group_leader != t)) + continue; + + cred = __task_cred(t); + if (cred->euid == uid) { + ret = send_sig(sig, t, 1); + if (ret) + break; + } + } + rcu_read_unlock(); + + return ret ? -MR_ERR_INVAUX : 0; +} + + +/* +** +** Debug utilities. +** Remove or comment out when development complete! +** +*/ + +#if EE_VERIFY +/* + * Hex dumper + */ + +#include + +#define ALEN 9 /* Digits of address shown */ + +void +dmp_hex(void *ptr, int len, const char *msg, ...) +{ + unsigned char * d; + unsigned char * prev; + int n, m; + int star; + char asc[16 + 1]; + + star = 0; + prev = 0; + + /* + * Print message (if any). + * It is treated as a 'printf' format strings with arguments. + */ + if (msg) { + va_list ap; + + va_start(ap, msg); + vprintk(msg, ap); + va_end(ap); + printk("\n"); + } + + /* + * Loop trying to dump 16 bytes at a time + */ + for(d = (unsigned char *) ptr;; d += 16) { + + /* + * Locate dump area from input buffer; + */ + n = (len > 16) ? 16 : len; + len -= n; + + /* + * Skip repeated lines. + * I want the last line shown on the output. + */ + if (d != ptr && n == 16 && !memcmp(d, prev, 16)) { + if (len) { + if (!star) { + star = 1; + printk("%*s\n", ALEN + 3, "*"); + } + continue; + } + } + + /* + * Print one line of hex dump. + */ + if (n) { + printk("%*lx ", ALEN, ((long) d) & ((1L << 4 * ALEN) - 1)); + for(m = 0; m < n; m++) { + printk("%02x ", d[m]); + if (m == 7) + printk(" "); + asc[m] = (isascii(d[m]) && isprint(d[m])) ? d[m] : '.'; + } + asc[m] = '\0'; + printk("%*s %s\n", 3 * (16 - m) + (m < 8), "", asc); + } + + /* + * We are done when end of buffer reached + */ + if (!len) + break; + + /* + * Reset repeat line suppression + */ + star = 0; + prev = d; + } +} +#endif diff --git a/ras/micras_core.c b/ras/micras_core.c new file mode 100644 index 0000000..2cdbb4b --- /dev/null +++ b/ras/micras_core.c @@ -0,0 +1,973 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS handler for core MC events + * + * Contains code to intercept MC events, collect information + * from core MCA banks on originating core and possibly on + * all active cores if necessary. + * + * In case of a severe event, defined by corrupted context, + * the handler will add a record of the event in the designated + * EEPROM hanging off the Over-Clocking I2C bus. Next a message + * will be sent to the SMC (enabling IPMI notifications) and at + * last a message is sent to host via the MC SCIF connection + * (if MC SCIF session has been established). + * + * Lesser events will also be sent to the host on a 'FYI' basis, + * but no record will be stored in the event log, nor will the + * SMC be notified. + * + * Special cases of high rate correctable errors may also cause + * events to be recorded in EEPROM on the assumption that the + * root cause will be detectable from maintenance mode. + * + * The handler cannot expect any support from the OS while in + * exception (NMI) context. Therefore, NMI-safe routines has + * been added to mimic some kernel services, e.g. ee_print(). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" + + +/* +** +** Brief design notes: +** There are two ways this code normally will be entered. +** +** 1) From standard interrupt context (bottom-half). +** This is supporting MC events picked up by the +** machine_check_poll(), i.e. events that aren't +** causing state corrruption (UC bit not set). +** +** 2) From exception/NMI context. +** This handles errors that _did_ flag processor +** state corruption (UC bit set, or other condition +** causing the kernel exception handler to pick it up). 
+** +** Both cases can happen simultaneously on different CPUs, +** which requires careful consideration about re-entrant code +** behaviour here. Particularly nasty is exception context where +** normal spinlocks won't work (FYI: x86 spinlocks assume interrupt +** disable can protect a critical region, an assumption that is +** false when an exception/NMI occurs). +** +** Standard interrupt context entries occur when non-fatal and +** thus non-critical MC events are handled. In most cases this just +** results in a regular SCIF send of McInfo structs to the host. +** Note that the call chain origin is a callout from the timer +** thread, not from an interrupt service routine, so to name +** it as standard interrupt context is somewhat misleading. +** +** Exception context messages are usually fatal and must be +** dealt with immediately, because otherwise the generic machine +** handler may panic() the system when exiting exception handler +** (default behavior, may be tweaked by altering 'threshold'). +** +** In order to proceed we can either implement a locking mechanism +** at every API function entry, or we can let every function do its +** thing independently. The latter is preferred, though it gets +** somewhat complicated because the API between the generic MC +** handling and RAS module is in fact composed of several calls. +** +** If state between API calls needs to be tracked then that can be +** done by means of pre-allocated arrays, similar to the generic +** handling in the Linux kernel. Currently the only state variable +** is the mask of CPUs that have been sent an IPI. +** +** Core MC events can be simulated by using the 'mce-inject' tool, +** consisting of a kernel module and a text mode application program. +** The 'mce-inject' module knows the difference between fatal and +** non-fatal events (defined by the UC bit) and acts differently +** in the two cases. Non-fatal injections cause machine_check_poll() +** to be called on all CPUs, resulting in events being reported to +** function mce_poll(). Fatal injections cause do_machine_check() +** to be called on all CPUs, resulting in calls to the mcc_exc_* +** routines below. Activities triggered by mce-inject are flagged +** as 'fake', and shall _NOT_ be logged in the EEPROM. +** +** Warning: +** Controls in the generic MC handling may cause the kernel to +** panic, _ALSO_ even if no event was found in any MCA banks!! +** Not sure exactly how to capture that sort of event. +** +** Warning: +** The 'mce-inject' module uses different methods of invoking error +** handling routines, depending on the mce record (inject_flags). +** Specifically, the 'mce-inject' module may make use of broadcast NMIs +** to invoke machine_check_poll() or do_machine_check() on all CPUs, +** which will make these functions execute in exception context. +** The NMI broadcast mechanism is based on registering a handler on +** the 'die' notifier chain and then doing an +** apic->send_IPI_mask(.., NMI_VECTOR), +** knowing that do_nmi() will invoke this notifier chain when no +** genuine cause of NMI was found (i.e. if inb(61) returns 0xc0, +** [which is SERR + IOCHK on chipset register NSR]). +** Long story short: if 'mce-inject' is used we cannot expect that +** polling is done in standard interrupt context, and need to set +** the 'in exception context' flag for SCIF access. +** +*/ + + +/* + * Hooks placed in the native machine check handler + * See file arch/x86/kernel/cpu/mcheck/mce.c for placement.
+ * + * poll After entering a non-UC event into mce_log. + * This happens in normal thread context, which + * means that kernel services are avaialble. + * exc_flt Filter on correctable errors. If events occur + * at a very high rate they can severely slow + * down the system and/or crash it entirely. + * Logic here will disable reporting of some + * events if they are seen too often. + * exc_entry Entering MC exception handler. + * Called _after_ reading MCG_STATUS and the early + * severity assesment by mce_severity() has been + * performed on all banks, such that we get to + * know if the native MC handler will panic. + * exc_log After entering a UC event into mce_log. + * The logged mce record has all available + * details on the event, and this point is the + * best place to perform our RAS activities. + * exc_panic Right before the MC exception handler calls + * the panic function. + * exc_exit Exit the MC exception handler + * print Exception context safe printf to POST-card UART + */ + +extern void (*mca_poll)(struct mce *, uint64_t, int); +extern void (*mca_exc_flt)(struct mce *, uint64_t, int); +extern void (*mca_exc_entry)(struct mce *, int, int, int, char *); +extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int); +extern void (*mca_exc_panic)(struct mce *, char *, char *, int); +extern void (*mca_exc_exit)(struct mce *, int, int, int, int); +extern int (*mca_print)(char *, ...); + +extern struct mce_log mcelog; /* Export from kernel */ +extern struct mutex mce_read_mutex; /* Export from kernel */ +static unsigned mcc_seen; /* Last event in kernel log */ +int in_sync; /* Flag when sync'ing */ + + +/* + * Convert a kernel mce record into a MC API format + */ + +static void +mcc_conv(struct mce * mce, struct mce_info * mc) +{ + mc->org = mce->bank; + mc->id = mce->extcpu; +#ifdef CONFIG_MK1OM + mc->pid = xlat_cpu[cpu_data(mc->id).apicid]; +#endif + mc->stamp = mce->time; + mc->status = mce->status; + mc->addr = mce->addr; + mc->misc = mce->misc; + mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0; +} + + +/* + * Filter for correctable errors, may modify CTL value. + * The filter is pretty crude, we just want to protect + * ourselves from being run over by fast recurring events. + * We keep tabs of events seen in a static array. + * + * Algorithm is like this: + * - test if event is in filter list; if not exit filter. + * - search for instance of this event in history. + * - if not found, insert event in history (strike 1). + * - if found but time since last seen exceeds window, + * then treat event as new in history (new strike 1). + * - if found and within time window, bump strike counter. + * - if strike counter reach maximum, we're fed up and + * turn this event off by clearing the associated + * bit in the offending MCA bank's CTL register and + * send a 'filter' event notification to the host. + * + * Advantages of this design is: + * - individual parameters for every filtered event. + * - only one event history array. + * - no periodic aging of events in history array. + * - no averaging over time required. + * - no moving/reordering of event history entries. + * - new events do not replace older seen event + * - filter reacts immediately when max reached. + * + * Disadvantages are: + * - linear search through filter array. + * - linear search through history array. + * - time parameter not obvious, it's really a limit + * on how old events in history are allowed to be. 
+ * - in pathological cases the filter's reaction time + * will be max * window (when events trickle in at + * a rate just below the window size). + * - data in ADDR and MISC registers are not used to + * match current event with history. Should they be? + * + * For now, both lists are short enough that introducing + * more advanced searches probably are not going to help. + * + * On KnC the flash may have overrides of the mc_turnoff table. + */ + +#define FT ((17 * 60) + 30) * 60 /* Default time window: 17.5 hours */ + +static struct mc_hist { + uint32_t count; /* How many times seen */ + uint64_t last; /* TSC last time seen */ + struct mce_info mc; /* Local MC event record */ +} mc_history[32]; + +static struct mc_disc { + uint8_t bank, ctl; /* Bank selector and control bit # */ + uint16_t win; /* Time window (seconds) */ + uint16_t max; /* Max count */ + uint16_t mca_code; /* MCA code, status[15:0] */ + uint16_t mdl_code; /* Model code, status[31:16] */ +} mc_turnoff[] = { + { 0, 3, FT, 2, 0x0150, 0x0000 }, /* MC0: J-Cache error */ + { 1, 0, FT, 2, 0x010a, 0x0001 }, /* MC1: L2 Tag error */ + { 1, 4, FT, 2, 0x010a, 0x0010 }, /* MC1: L2 Data error */ + { 2, 2, FT, 2, 0x010d, 0x0100 }, /* MC2: Tag State, ext TD */ + { 2, 2, FT, 2, 0x010d, 0x0101 }, /* MC2: Tag State, int TD */ + { 2, 3, FT, 2, 0x012d, 0x0110 }, /* MC2: Core Valid, ext TD */ + { 2, 3, FT, 2, 0x012d, 0x0111 }, /* MC2: Core Valid, int TD */ + { 3, 2, FT, 2, 0x010d, 0x0100 }, /* DBOX: Tag State error, ext TD */ + { 3, 2, FT, 2, 0x010d, 0x0101 }, /* DBOX: Tag State error, int TD */ + { 3, 3, FT, 2, 0x012d, 0x0110 }, /* DBOX: Core Valid error, ext TD */ + { 3, 3, FT, 2, 0x012d, 0x0111 }, /* DBOX: Core Valid error, int TD */ + { 4, 4, FT, 2, 0x0e0b, 0x0030 }, /* SBOX: PCI-e */ + { 5, 0, FT, 2, 0x0001, 0x0000 }, /* GBOX: Ch-0 retraining */ + { 5, 1, FT, 2, 0x0001, 0x0001 }, /* GBOX: Ch-1 retraining */ + { 5, 2, FT, 2, 0x0001, 0x0002 }, /* GBOX: Ch-0 ECC error */ + { 5, 3, FT, 2, 0x0001, 0x0003 }, /* GBOX: Ch-1 ECC error */ + { 6, 3, FT, 2, 0x010e, 0x0008 }, /* TBOX: T2 CRC error */ +}; + + +#ifdef CONFIG_MK1OM + +#define MC_FLT_SIG1 0x0e13c20f /* Start signature */ +#define MC_FLT_SIG2 0xf1ec3df0 /* End signature */ +#define MC_FLT_SIZE 0x200 /* Filter block length */ + +void +mcc_flt_parm(uint8_t * p) +{ + uint16_t fnum; + + /* + * Check signatures + */ + if (*((uint32_t *) p) != MC_FLT_SIG1 || + *((uint32_t *)(p + MC_FLT_SIZE - 4)) != MC_FLT_SIG2) { + printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n", + *((uint32_t *) p), *((uint32_t *)(p + MC_FLT_SIZE - 4))); + return; + } + + /* + * After start signature comes filter count (uint16_t) + * followed by 'count' filter descriptors (struct mc_disc). + */ + fnum = *(uint16_t *)(p + 4); + if (fnum > ARRAY_SIZE(mc_turnoff) || + fnum * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) { + printk("mcc_flt_parm: filter count %d not valid\n", fnum); + return; + } + + /* + * Seems the table is legit, copy it over defaults. 
+ */ + memset(mc_turnoff, '\0', sizeof(mc_turnoff)); + memcpy(mc_turnoff, p + 6, fnum * sizeof(struct mc_disc)); +#if MC_VERBOSE + { + int i; + + for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { + printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n", + i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win, + mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code); + } + } +#endif +} + +#endif + + +/* + * Frequency filter for core and un-core MC events + */ + +uint32_t +micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc) +{ + struct mc_disc * dsc; + struct mc_hist * hst; + uint64_t ostamp; + int i, oldest; + + if (mc->status & MCI_STATUS_UC) + return 0; + + /* + * Check if this event may be filtered + */ + dsc = mc_turnoff; + for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { + if (dsc->bank == mc->org && + dsc->mca_code == GET_BITS(15, 0, mc->status) && + dsc->mdl_code == GET_BITS(31, 16, mc->status)) + break; + dsc++; + } + if (i == ARRAY_SIZE(mc_turnoff)) + return 0; + + /* + * Have a candidate for filter. + * Have we seen this one before? + */ + oldest = 0; + ostamp = tsc; + hst = mc_history; + for(i = 0; i < ARRAY_SIZE(mc_history); i++) { + /* + * While scanning, find the oldest event too + */ + if (hst->last < ostamp) { + ostamp = hst->last; + oldest = i; + } + + /* + * Does this match event in filter history? + * TBD: how much needs to match? + * For now: cpu (or box), bank, mca_code and model_code. + */ + if (hst->last && + hst->mc.id == mc->id && + hst->mc.org == mc->org && + GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) && + GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status)) + break; + hst++; + } + if (i == ARRAY_SIZE(mc_history)) { + /* + * Not seen this event before. + * 'oldest' is where to store this event. + */ + hst = mc_history + oldest; + hst->count = 1; + hst->last = tsc; + hst->mc = *mc; + return 0; + } + + /* + * Already 'on file in history', test expiration date + */ + if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) { + /* + * Matching history element had expired, just overwrite it + */ + hst->count = 1; + hst->last = tsc; + hst->mc = *mc; + return 0; + } + + /* + * Filter element active, bump count and set last seen. + * We do _NOT_ want injected events to enter the EEPROM, + * so that flag is preserved over all event history + */ + hst->count++; + if (mc->flags & MC_FLG_FALSE) + hst->mc.flags |= MC_FLG_FALSE; + if (hst->count < dsc->max) { + hst->last = tsc; + return 0; + } + + /* + * Threshold reached, event source needs to be silenced. + * Store a record of this in the EEPROM and send a + * notification to host about it. Once duly reported, clear + * event from the filter; it is not expected to show up again. + * Note: we report the _first_ event seen, not the + * event at hand. We could save array space + * by sending latest event (less info to keep). + */ + ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (mS)\n", + dsc - mc_turnoff, dsc->bank, dsc->ctl, dsc->max, (tsc - hst->last) / cpu_khz); + hst->mc.flags |= MC_FLG_FILTER; +#ifdef CONFIG_MK1OM + if (!(hst->mc.flags & MC_FLG_FALSE)) { + micras_mc_log(&hst->mc); + hst->mc.flags |= MC_FLG_LOG; + } +#endif + micras_mc_send(&hst->mc, exc); + hst->last = 0; + + /* + * MC events are disabled by caller when a + * non-zero mask is returned by this routine. + */ + return (1 << dsc->ctl); +} + + +/* + * Remove/mask an 'enable-bit' from a core MCA bank. + * Note: This applies to _current_ cpu only. 
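Editorial note: the strike/window logic implemented by micras_mc_filter above can be exercised in isolation. The sketch below is a simplified user-space model (a single filter rule, plain integer timestamps instead of TSC reads, no event matching) showing how a source is admitted until 'max' occurrences fall inside the time window, after which the filter fires once and resets:

#include <stdio.h>
#include <stdint.h>

struct rule { uint64_t win; uint32_t max; };       /* window & strike limit */
struct hist { uint32_t count; uint64_t last; };    /* one history slot */

/* Return non-zero when the event source should be silenced. */
static int filter(struct rule *r, struct hist *h, uint64_t now)
{
    if (!h->last || now - h->last > r->win) {      /* new or expired: strike 1 */
        h->count = 1;
        h->last = now;
        return 0;
    }
    h->count++;                                    /* within window: bump strikes */
    if (h->count < r->max) {
        h->last = now;
        return 0;
    }
    h->last = 0;                                   /* threshold hit: report & reset */
    return 1;
}

int main(void)
{
    struct rule r = { .win = 100, .max = 3 };
    struct hist h = { 0, 0 };
    uint64_t t[] = { 10, 50, 90, 500, 520, 540, 560 };

    for (unsigned i = 0; i < sizeof(t) / sizeof(t[0]); i++)
        printf("t=%3llu -> %s\n", (unsigned long long)t[i],
               filter(&r, &h, t[i]) ? "SILENCE" : "pass");
    return 0;
}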
It is not explicitly + * linked to the cpu that was ID'd in the incoming mce struct. + * Happens to be OK for mcc_exc_flt() and mcc_poll() and mcc_exc_log(). + */ + +static void +mcc_ctl_mask(int bank, uint32_t msk) +{ + uint32_t ctl_lo, ctl_hi; + + rdmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); + ctl_lo &= ~msk; + wrmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); + +#if MC_VERBOSE + ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, ctl_lo); +#endif +} + + +/* + * Filtering of correctable core MC events + * Called from the exception handler. + */ + +static void +mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake) +{ + struct mce_info mc; + uint32_t msk; + + if (!mce) + return; + + if (mce->status & MCI_STATUS_UC) + return; + + mcc_conv(mce, &mc); + mc.ctl = ctl; + mc.flags = fake ? MC_FLG_FALSE : 0; + msk = micras_mc_filter(&mc, mce->tsc, 1); + if (msk) + mcc_ctl_mask(mce->bank, msk); +} + + +/* + * Only action required for polled MC events is to + * pass the event on to the SCIF channel (if connected). + * The event should already have caused an excption (the + * exception handler choses to ignore corrected errors) + * which means it already has been filtered. + * Injected corrected events do not cause MCE exceptions + * and thus escaped filtering, so we'll filter them here. + */ + +static void +mcc_poll(struct mce * mce, uint64_t ctl, int fake) +{ + struct mce_info mc; + +#if MC_VERBOSE + ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status); +#endif + + mcc_conv(mce, &mc); + mc.ctl = ctl; + mc.flags = fake ? MC_FLG_FALSE : 0; + +#if BEAM_TEST + /* + * Under beam test we only want to send the SCIF message + */ + micras_mc_send(&mc, fake); + return; +#endif + + if (micras_mc_send(&mc, fake)) + mcc_seen = mcelog.next; + + /* + * According to MCA HAS the MCI_STATUS_VAL will only + * be set when an event's enable bit is set, in which + * case it is difficult to imagine how events without + * the MCI_STATUS_EN can appear here. The second clause + * of the test may never actually happen on Kn{F,C}. + * Note: MC polling does not capture TSCs + */ + if (fake || !(mc.status & MCI_STATUS_EN)) { + uint32_t msk; + + msk = micras_mc_filter(&mc, rdtsc(), fake); + if (msk) + mcc_ctl_mask(mce->bank, msk); + } +} + + +/* + * One CPU entered do_machine_check(). + * We get the initial mce record (which has cpu ID), early + * control variables and whether the event is injected. + * + * Since KnF and KnC deviate from the standard IA by not + * having the core MCAs broadcast to all CPU's we'll try + * to fake standard behavior in order to keep the generic + * machine check code intact. + * Therefore, if event is real (fake flag unset) and this + * CPU is the first seeing it (mcc_exc_mask is empty), + * then send IPI to all other CPU's listed in the online + * cpumask for vector #18. Later CPUs will see themselves + * marked in mcc_exc_mask and return quickly. + */ + +struct cpumask mcc_exc_mask; /* CPU's in mce ctx */ +static atomic_t ipi_lock = ATOMIC_INIT(0); /* Lock on exc mask */ + +static void +mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg) +{ + unsigned int cpu; + + /* + *TBD: should we use 'extcpu' from the MCE record instead? + */ + cpu = smp_processor_id(); + + /* + * Injected events invokes all CPUs automatically + * by hooking into the NMI notify_die call_chain. + * Nothing to do here. 
+ */ + if (fake) + return; + +#if 1 + /* + * Avoid the IPI corralling circus on corrected errors, + * based on assessment entirely done by mce_severity(). + * If the result (no_way_out) is MCE_NO_SEVERITY (=0), then + * at worst we may have a correctable error, and that does + * not warrant the system lockdown managed by mce_start() + * and mce_end(). + * Note that MICs do not support newer status bits (MCG_SER_P) + * which causes variable mce_ser always to be zero and thus + * the test in the inner loop of do_machine_check() will be + * reduced to just testing for the UC bit. + */ + if (! no_way_out) + return; +#endif + + /* + * Test for entry from MT thread IPIs (testing) + * or a 'soft' exception from a IPI issued from + * the handler of the first exception. + * No further action needed in both cases. + */ + if (cpumask_test_cpu(cpu, &mcc_exc_mask)) + return; + + /* + * Create mcc_exc_mask to flag which CPU's are + * to be included in the IPI. This mask is later + * used to determine who needs to EOI the local + * APIC after MC event handling. + */ + while(atomic_xchg(&ipi_lock, 1)) + cpu_relax(); + smp_rmb(); + if (cpumask_test_cpu(cpu, &mcc_exc_mask)) { + /* + * Another CPU got here first + */ + atomic_xchg(&ipi_lock, 0); + return; + } + cpumask_copy(&mcc_exc_mask, cpu_online_mask); + cpumask_clear_cpu(cpu, &mcc_exc_mask); + smp_wmb(); + atomic_xchg(&ipi_lock, 0); + + /* + * Simulate a broadcast ny sending IPI to all + * other CPUs. + */ + // apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR); + apic->send_IPI_allbutself(MCE_VECTOR); +} + + +/* + * In do_machine_check() bank scan loop. + * Called from a lockdown, no synchronization needed. + * MC bank scan is complete and the mce event has been + * entered into the kernel MC log + * + *TBD: revise logic on HALT on UC events? + * From a state corruption point of view this + * _is_ a fatal error because UC bit was set. + * However, if the tolerance setting is set + * high enough, the generic MC handler may + * not chose to panic on this event. + * We currently do not have the tolerance value + * when recording this event, nor do we have + * other factors that mce_reign() use to determine + * what to do after reporting event to the host. + */ + +static void +mcc_exc_log(struct mce * mce, uint64_t ctl, int fake, + int no_way_out, char * msg, int severity, int worst) +{ + struct mce_info mc; + uint32_t msk; + +#if MC_VERBOSE + ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n", + mce->extcpu, mce->time, no_way_out, msg, severity, worst); +#endif + + /* + * Create a message for the host. + */ + mcc_conv(mce, &mc); + mc.ctl = ctl; + mc.flags |= fake ? MC_FLG_FALSE : 0; + +#if BEAM_TEST + /* + * Under beam test we only want to send the SCIF message + * This is guaranteed not to be called re-entrantly. + */ + micras_mc_send(&mc, 1); + return; +#endif + +#ifdef CONFIG_MK1OM + /* + * If this is a true event then log it in the EEPROM and + * notify SMC that we've had a serious machine check error. + */ + if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) { + micras_mc_log(&mc); + mc.flags |= MC_FLG_LOG; + + /* + *TBD: Should this be deferred until the actual panic? + * The user can raise tolerance such that we in + * fact continue operating; in which case the SMC + * notification would be (somewhat) misleading. 
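Editorial note: because ordinary spinlocks cannot be trusted in exception/NMI context, mcc_exc_entry above guards mcc_exc_mask with a bare atomic exchange on ipi_lock. A user-space sketch of the same idea using C11 atomics (the kernel code uses atomic_xchg and cpu_relax instead):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int lock;                 /* 0 = free, 1 = taken */

static void xchg_lock(atomic_int *l)
{
    /* Spin until the previous value was 0, i.e. we flipped it 0 -> 1. */
    while (atomic_exchange(l, 1))
        ;                               /* kernel version calls cpu_relax() here */
}

static void xchg_unlock(atomic_int *l)
{
    atomic_exchange(l, 0);              /* release the lock */
}

int main(void)
{
    xchg_lock(&lock);
    printf("critical section: build the IPI mask here\n");
    xchg_unlock(&lock);
    return 0;
}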
+ */ + micras_mc_ipmi(&mc, 1); + } +#endif + + /* + * Always notify host and sync to kernel log + */ + if (micras_mc_send(&mc, 1)) + mcc_seen = mcelog.next; + +#if RAS_HALT + if ((mc.flags & MC_FLG_FATAL) && !fake) + panic("FATAL core machine check event:\n" + "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", + mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc); +#endif + + /* + * Correctable events can in fact reach us here if + * mce_no_way_out() tags them as critical (for other + * reasons than the UC flag, e.g. MCIP missing). + * If the tolerance setting is high enough to prevent + * such events to panic, we'd still want filtering. + */ + msk = micras_mc_filter(&mc, mce->tsc, 1); + if (msk) + mcc_ctl_mask(mce->bank, msk); +} + + +/* + * In mce_panic(). + * Current event is about to make the kernel panic. + * Sources of this call are + * do_machine_check(), when no_way_out set + * mce_timed_out(), CPU rendez-vous failed + * mce_reign(), when severety high, a CPU hung, or no events + */ + +static void +mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake) +{ + /* + * Should host be notified in this case? + * And if so, how should be presented, we might not + * even have a mce record to show when this happens! + * If an mce is passed, it has already been seen and + * reported to the host by a call to mcc_exc_log(). + * If mce is NULL, then this _is_ an MC relatedi panic, + * but we have no data fitting for a host notification. + * Create a pseudo event and ship that? + */ + ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n", + mce->extcpu, mce->time, msg, exp, fake); +} + + +/* + * A CPU is leaving do_machine_check(). + * We get this after the monarch has 'reigned' and + * the response to the event has been completed. + */ + +static void +mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order) +{ + unsigned int cpu; + int eoi; + + cpu = smp_processor_id(); + + /* + * Assuming test_and_clear_bit() is atomic. + */ + smp_rmb(); + eoi = cpumask_test_and_clear_cpu(cpu, &mcc_exc_mask); + smp_wmb(); + if (eoi) + ack_APIC_irq(); +} + + +/* + * Routine to scan the kernel's MC log. + * Called when SCIF MC session has been created, to bring the host + * side up to date with prior unreported MC events, such as events + * occurring when MC session was not active (no peer was listening + * on the host) and events occurring before RAS module was loaded. + * + * Notes: + * - This is always called in thread context. + * - There are no injection flags in the kernel + * MC log, i.e. no guarantee events are genuine. + * - The MC kernel log has been exported explicitly for this. + * + * On synchronization (or the lack thereof): + * Effectively the mcelog holds a static array of mce's where the + * 'finished' flag says whether mce content is valid or not. The + * 'next' field is the index of the first element in the array that + * has not been assigned for an MC event. It is incremented when a + * new event is entered, and reset to zero on reads to /dev/mcelog. + * The kernel's event log does not wrap, so it is safe to use it as + * an indicator of how many events (finished or not) are in it. + * The mcelog's next field is protected by RCU style mechanisms + * in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c). + * For obvious reasons it is not genuine RCU, e.g. access to 'next' + * isn't within rcu_read_lock()/rcu_read_unlock() pair, just a clever + * masking use of a lock in an RCU macro definition. 
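Editorial note: the mcelog hand-off described in the comment above (and continued below) boils down to a publish protocol: the producer fills an entry, issues a write barrier, then sets 'finished'; the consumer only trusts entries whose 'finished' flag it has observed. A minimal single-producer model, with C11 release/acquire ordering standing in for the kernel's wmb()/rmb() and a fixed ring in place of the real mcelog:

#include <stdatomic.h>
#include <stdio.h>

struct entry {
    int data;
    atomic_int finished;       /* 0 until the record is complete */
};

static struct entry log_ring[8];
static atomic_int next_idx;    /* index of first unused slot */

static void produce(int val)
{
    int i = atomic_fetch_add(&next_idx, 1);
    log_ring[i].data = val;                                    /* fill record  */
    atomic_store_explicit(&log_ring[i].finished, 1,
                          memory_order_release);               /* then publish */
}

static void consume(void)
{
    int n = atomic_load(&next_idx);
    for (int i = 0; i < n; i++) {
        if (!atomic_load_explicit(&log_ring[i].finished,
                                  memory_order_acquire))
            continue;                                          /* not ready yet */
        printf("entry %d: %d\n", i, log_ring[i].data);
    }
}

int main(void)
{
    produce(42);
    produce(43);
    consume();
    return 0;
}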
+ * There is no RCU moving data around, the mce array does not move, + * and the 'finished' flag is set after a wmb() on the mce contents + * which means this routine will not clash with the MCE handler. + * Collisions with memset() on reads from /dev/mcelog are prevented + * by locking of mce_read_mutex. + */ + +void +mcc_sync(void) +{ + struct mce_info mc; + unsigned seen; + + if (mce_disabled) + return; + +#if 0 + /* + * Can't do this until bootstrap scrubs MC banks on all cards. + * It has been observed that MCA banks may _not_ be reset on card + * reboot which means events picked up by the kernel before loading + * the RAS module may have occured in a previous uOS run. + * Should be OK post early Jan '12 (flash ver 262, HSD 4115351). + */ + return; +#endif + + /* + * Lock out kernel log access through /dev/mcelog + */ + mutex_lock(&mce_read_mutex); + + /* + * Start over if the log has been cleared cleared + */ + if (mcc_seen > mcelog.next) + mcc_seen = 0; + + for(seen = mcc_seen; seen < mcelog.next; seen++) { + /* + * Basic checks. Index, CPU & bank must be reasonable. + */ + if (mcelog.entry[seen].finished) { + if (mcelog.entry[seen].cpu >= NR_CPUS || + mcelog.entry[seen].bank >= 3) { + printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n", + seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank); + continue; + } + + /* + * Have good entry, can be UC, but it is 'old'. + */ + mcc_conv(&mcelog.entry[seen], &mc); + mc.ctl = 0; + +#ifdef CONFIG_MK1OM + /* + * Log this event in the eeprom and notify + * that we've had a serious machine check error. + */ + if (mc.flags & MC_FLG_FATAL) { + in_sync = 1; + micras_mc_log(&mc); + in_sync = 0; + mc.flags |= MC_FLG_LOG; + micras_mc_ipmi(&mc, 0); + } +#endif + + /* + * Notify host about this too + */ + if (! micras_mc_send(&mc, 0)) + break; + } + } + mcc_seen = mcelog.next; + + /* + * Done, release lock + */ + mutex_unlock(&mce_read_mutex); +} + + +/* + * Setup excetion handlers by hooking into the + * kernel's native MCA handler. + */ + +int __init +mcc_init(void) +{ + if (mce_disabled) { + printk("RAS.core: disabled\n"); + } + else { + mca_poll = mcc_poll; + mca_exc_flt = mcc_exc_flt; + mca_exc_entry = mcc_exc_entry; + mca_exc_log = mcc_exc_log; + mca_exc_panic = mcc_exc_panic; + mca_exc_exit = mcc_exc_exit; + mca_print = 0; /* For debug: ee_printk; */ + printk("RAS.core: init complete\n"); + } + + return 0; +} + + +/* + * Cleanup for module unload. + * Clear/restore hooks in the native MCA handler. + */ + +int __exit +mcc_exit(void) +{ + mca_poll = 0; + mca_exc_flt = 0; + mca_exc_entry = 0; + mca_exc_log = 0; + mca_exc_panic = 0; + mca_exc_exit = 0; + mca_print = 0; + + /* + * Links from kernel's MCE handler cut, + * wait for everybody in handler to leave. + */ + while(atomic_read(&mce_entry)) + cpu_relax(); + + printk("RAS.core: exit complete\n"); + return 0; +} + diff --git a/ras/micras_elog.c b/ras/micras_elog.c new file mode 100644 index 0000000..349c4cb --- /dev/null +++ b/ras/micras_elog.c @@ -0,0 +1,3136 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS EEPROM log driver + * + * Contains code to handle creation of MC event records in + * the designated EEPROM hanging off the 'OverClocking' I2C bus. + * + * Since it is not clear for the moment for how long the serial + * port on the POST card needs to (or will) be supported, it is + * not safe to assume we just can tap into the Linux I2C frame + * work to access the 'OverClocking' I2C bus. + * + * Furthermore, we need access from exception context, and cannot + * run a driver that has spinlocks, mutexes and sleeps in it's path + * like the current PXA-derived driver has. + * + * Therefore, a local exception safe driver is included here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" + +#ifdef MIC_IS_EMULATION +/* + * Emulation does not handle I2C busses. + * Therefore all code that deals with I2C needs to be + * replaced with harmless substitutes in emulation. + * The following stubs are for emulation only. + */ + +#if 0 +/* + * Probably don't need exclusive locks in emulation + */ +atomic_t pxa_block = ATOMIC_INIT(0); + +static void +ee_lock(void) +{ + while(atomic_xchg(&pxa_block, 1)) + myDELAY(50); +} + +static void +ee_unlock(void) +{ + atomic_xchg(&pxa_block, 0); +} +#endif + +char ee_buf[EE_BUF_COUNT * EE_BUF_LINELEN]; +atomic_t ee_msg = ATOMIC_INIT(-1); +atomic_t ee_seen = ATOMIC_INIT(0); +int ee_rdy; + +char * +ee_fmt(char * fmt, va_list args) +{ + char * buf; + int msg_id, msg_btm; + + msg_btm = atomic_read(&ee_seen); + msg_id = atomic_inc_return(&ee_msg); + if ((msg_id - msg_btm) < (EE_BUF_COUNT - 1)) { + buf = ee_buf + (msg_id % EE_BUF_COUNT) * EE_BUF_LINELEN; + vsnprintf(buf, EE_BUF_LINELEN - 1, fmt, args); + return buf; + } + return 0; +} + +int +ee_printk(char * fmt, ...) +{ + va_list args; + char * buf; + + va_start(args, fmt); + buf = ee_fmt(fmt, args); + va_end(args); + + return buf ? strlen(buf) : 0; +} + +int +ee_print(char * fmt, ...) +{ + va_list args; + char * buf; + + va_start(args, fmt); + buf = ee_fmt(fmt, args); + va_end(args); + + return buf ? 
strlen(buf) : 0; +} +EXPORT_SYMBOL_GPL(ee_print); + + +int +ee_init(void) +{ + ee_rdy = 1; + + if (mce_disabled) + printk("RAS.elog (EMU): disabled\n"); + else + printk("RAS.elog (EMU): init complete\n"); + return 0; +} + +int +ee_exit(void) +{ + ee_rdy = 0; + + printk("RAS.elog (EMU): exit complete\n"); + return 0; +} + +void +micras_mc_log(struct mce_info * event) +{ + if (mce_disabled) + return; + + /* + * Print entry on serial console (copy in kernel log) + */ + ee_printk("RAS.elog (EMU): bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", + event->org, event->id, event->ctl, event->status, event->addr, event->misc); +} + +#else + +/* +** +** Exception safe I2C driver for the 'OverClocking' bus. +** The driver is a derivative of the FreeBSD driver that +** Ben W wrote. I.e. it is safe to re-use here because we +** wrote it in the first place, copyright is ours. +** +** NOTE: This I2C bus is usually run by the PXA driver, +** which means that the activities of this driver +** may interrupt the PXA driver's activity, i.e. +** interrupt the serial console. +** This is by design, the alternative was major +** hacking of the PXA driver to support use in +** exception context. +** +** NOTE: This code is currently exclusively designed to +** run on a KnF or KnC device, i.e. we know what +** hardware is present and we know the location +** of the CSRs. This code does very little for +** niceties like device discovery and registration. +** +** NOTE: Timing is altered slightly from the FreeBSD code. +** The I2C bus should run in 400 kHz mode, which at +** optimal conditions can transmit a byte in about +** 25 uSec (8 bits + ack/nak + a little overhead). +** Therefore it does not make much sense to poll +** much faster than 1 uSec anywhere in this driver. +** However, experiments show that timing is far +** from optimal, though it is not clear whether +** it is the UART or the controller that's slow. +** Update: In fact some of the boards cannot run +** reliably at 400 kHz, so we switched to 100 kHz. +*/ + +#define REG_DBG 0 /* Debug I2C Layer 1 */ +#define I2C_DBG 0 /* Debug I2C Layer 2 */ +#define XFR_DBG 0 /* Debug I2C Layer 3 */ +#define CON_DBG 0 /* Debug I2C UART */ +#define EPR_DBG 0 /* Debug EEPROM log */ + +#if REG_DBG +#define REG_REG reg_dmp +#else +#define REG_REG(s); /* As nothing */ +#endif + +#if I2C_DBG +#define I2C_PRT ee_printk +#else +#define I2C_PRT(s,...); /* As nothing */ +#endif + +#if XFR_DBG +#define XFR_PRT ee_printk +#else +#define XFR_PRT(s,...); /* As nothing */ +#endif + +#if CON_DBG +#define CON_PRT ee_printk +#else +#define CON_PRT(s,...); /* As nothing */ +#endif + +#if EPR_DBG +#define EPR_PRT ee_printk +#else +#define EPR_PRT(s,...); /* As nothing */ +#endif + + +#include +#include "monahan.h" + + +/* + *TBD: Get rid of Pascal relics! + */ + +#ifndef FALSE +#define FALSE false +#endif +#ifndef TRUE +#define TRUE true +#endif + + +/* + * Local timer routine. + * Similar to the udelay function, just simpler. + * + * The delay instruction can only go upto 1023 clocks, + * and larger delay needs to be split into two or more + * delay instructions. + * According to Kn{F|C} errata, delay disables interrupts. + * Want to play nice and allow interrupts every 250 clocks. + * For now the overhead of the loop is ignored. + */ + +#define MAX_DELAY 250 + +void +myDELAY(uint64_t usec) +{ + uint64_t num_cpu_clks, tick; + + /* + * Convert usec count into CPU clock cycles. 
+ * Similar to set_cyc2ns_scale() we have: + * us = cycles / (freq / us_per_sec) + * us = cycles * (us_per_sec / freq) + * us = cycles * (10^6 / (cpu_khz * 10^3)) + * us = cycles * (10^3 / cpu_khz) + * cycles = us / ((10^3 / cpu_khz)) + * cycles = (us * cpu_khz) / 10^3 + */ + num_cpu_clks = (usec * tsc_khz) / 1000; + + if (num_cpu_clks <= MAX_DELAY) { + __asm__ __volatile__("delay %0"::"r"(num_cpu_clks):"memory"); + } else { + for(tick = MAX_DELAY; num_cpu_clks > tick; num_cpu_clks -= tick) + __asm__ __volatile__("delay %0"::"r"(tick):"memory"); + __asm__ __volatile__("delay %0"::"r"(num_cpu_clks):"memory"); + } +} + + +/* + * Layer 1 abstraction: device bus (controller register access) + * + * Access API to provide read/write to the I2C controller. + * Simply use a local copy of the SBOX MMIO routines, where the + * 'OverClocking' I2C controller CSRs starts at offset 0x1000. + * We use a local copy in order to not mix I2C register traces + * with those of the SBOX MMIO routines in micras_main.c. + * + *TBD: Shall debug features stay in the code? + */ + +#if REG_DBG + +/* + * I2C controller register dump utilities. + * Traces go to the kernel log. + */ + +struct bits { + uint32_t mask; + char *set; + char *unset; +}; + +#define PXA_BIT(m, s, u) { .mask = m, .set = s, .unset = u } + +static struct bits icr_bits[] = { + PXA_BIT(ICR_START, "START", 0), + PXA_BIT(ICR_STOP, "STOP", 0), + PXA_BIT(ICR_ACKNAK, "NAK", "ACK"), + PXA_BIT(ICR_TB, "TB", 0), + PXA_BIT(ICR_MA, "MA", 0), + PXA_BIT(ICR_SCLE, "SCLE", 0), + PXA_BIT(ICR_IUE, "IUE", 0), + PXA_BIT(ICR_GCD, "GCD", 0), + PXA_BIT(ICR_ITEIE, "ITEIE", 0), + PXA_BIT(ICR_DRFIE, "DRFIE", 0), + PXA_BIT(ICR_BEIE, "BEIE", 0), + PXA_BIT(ICR_SSDIE, "SSDIE", 0), + PXA_BIT(ICR_ALDIE, "ALDIE", 0), + PXA_BIT(ICR_SADIE, "SADIE", 0), + PXA_BIT(ICR_UR, "UR", 0), +}; + +static struct bits isr_bits[] = { + PXA_BIT(ISR_RWM, "RX", "TX"), + PXA_BIT(ISR_ACKNAK, "NAK", "ACK"), + PXA_BIT(ISR_UB, "UB", 0), + PXA_BIT(ISR_IBB, "IBB", 0), + PXA_BIT(ISR_SSD, "SSD", 0), + PXA_BIT(ISR_ALD, "ALD", 0), + PXA_BIT(ISR_ITE, "ITE", 0), + PXA_BIT(ISR_IRF, "IRF", 0), + PXA_BIT(ISR_GCAD, "GCAD", 0), + PXA_BIT(ISR_SAD, "SAD", 0), + PXA_BIT(ISR_BED, "BED", 0), +}; + + +static void +decode_bits(char *prefix, struct bits *bits, int num, uint32_t val) +{ + char * str; + + printk(" %s: ", prefix); + while (num--) { + str = (val & bits->mask) ? bits->set : bits->unset; + if (str) + printk("%s ", str); + bits++; + } +} + +static void reg_ICR(uint32_t val) +{ + decode_bits("ICR", icr_bits, ARRAY_SIZE(icr_bits), val); + printk("\n"); +} + +static void reg_ISR(uint32_t val) +{ + decode_bits("ISR", isr_bits, ARRAY_SIZE(isr_bits), val); + printk("\n"); +} + + +static void +reg_dmp(char * str) +{ + printk("%s: ICR %08x, ISR %08x, ISAR %08x, IDBR %08x, IBMR %08x\n", str, + mr_sbox_rl(0, SBOX_OC_I2C_ICR + ICR_OFFSET), + mr_sbox_rl(0, SBOX_OC_I2C_ICR + ISR_OFFSET), + mr_sbox_rl(0, SBOX_OC_I2C_ICR + ISAR_OFFSET), + mr_sbox_rl(0, SBOX_OC_I2C_ICR + IDBR_OFFSET), + mr_sbox_rl(0, SBOX_OC_I2C_ICR + IBMR_OFFSET)); +} + +#endif /* REG_DBG */ + + +/* + * Local versions of SBOX access routines, that + * does not leave trace messages in kernel log. 
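Editorial note: myDELAY above converts a microsecond count into clock cycles with cycles = (usec * tsc_khz) / 1000 and then issues the cycle budget in chunks of at most MAX_DELAY, because the card's 'delay' instruction takes a bounded operand and blocks interrupts while it runs. The chunking arithmetic can be checked on its own; the sketch below only counts how many chunks a request decomposes into (there is no 'delay' instruction in user space):

#include <stdio.h>
#include <stdint.h>

#define MAX_DELAY 250                          /* max cycles per chunk, as above */

/* Return how many delay chunks a request needs, given an assumed tsc_khz. */
static unsigned chunks(uint64_t usec, uint64_t tsc_khz)
{
    uint64_t clks = (usec * tsc_khz) / 1000;   /* usec -> CPU clock cycles */
    unsigned n = 0;

    while (clks > MAX_DELAY) {                 /* full MAX_DELAY chunks */
        clks -= MAX_DELAY;
        n++;
    }
    return n + 1;                              /* plus the final remainder */
}

int main(void)
{
    /* e.g. 25 us at an assumed 1.1 GHz clock (tsc_khz = 1100000) */
    printf("25 us -> %u chunks\n", chunks(25, 1100000));
    printf("1 us  -> %u chunks\n", chunks(1, 1100000));
    return 0;
}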
+ */ + +uint32_t +lmr_sbox_rl(int dummy, uint32_t roff) +{ + uint32_t val; + + val = * (volatile uint32_t *)(micras_sbox + roff); + return val; +} + +void +lmr_sbox_wl(int dummy, uint32_t roff, uint32_t val) +{ + * (volatile uint32_t *)(micras_sbox + roff) = val; +} + +static uint32_t +reg_read(uint32_t reg) +{ + uint32_t val; + + val = lmr_sbox_rl(0, SBOX_OC_I2C_ICR + reg); + +#if REG_DBG + printk("%s: %4x -> %08x", "rd", SBOX_OC_I2C_ICR + reg, val); + switch(reg) { + case ICR_OFFSET: reg_ICR(val); break; + case ISR_OFFSET: reg_ISR(val); break; + default: + printk("\n"); + } +#endif + + return val; +} + +static void +reg_write(uint32_t reg, uint32_t val) +{ +#if REG_DBG + printk("%s: %4x <- %08x", "wr", SBOX_OC_I2C_ICR + reg, val); + switch(reg) { + case ICR_OFFSET: reg_ICR(val); break; + default: + printk("\n"); + } +#endif + + lmr_sbox_wl(0, SBOX_OC_I2C_ICR + reg, val); +} + + +/* + * Layer 2 abstraction: I2C bus driver (byte access to I2C bus) + * + * Mostly a re-implementation of Ben W's low level FreeBSD driver. + * Provides an API to control what goes onto the I2C bus on a + * per individual byte basis. + * + * i2c_reset Reset bus controller + * i2c_init Setup trasaction parameters (speed & mode) + * i2c_start Send slave address + R/W bit + * i2c_rd_byte Read data byte + * i2c_wr_byte Send data byte + * i2c_stop Stop current transaction + * + * NOTE: It seems that the controller lacks means to reset the + * I2C bus (i.e. other devices on it). The controller + * resets fine, but at least the UART has been seen + * locking up and blocking the bus entirely. + */ + +static uint8_t hnd_addr = 0; /* Target address */ +static int hnd_freq = FREQ_100K; /* Target speed */ + +static uint8_t bus_slave_addr = ISAR_SLADDR; /* Our I2C slave address */ +static int bus_start_op = I2C_NOP; /* Bus command: R or W */ +static int bus_freq = 0; /* Bus speed (actual) */ +static int bus_inited = 0; /* Bus initialized */ + + +/* + * Master abort. + * Flip the ICR:MA bit long enough for current + * byte transfer to clock in/out on the wire. + */ + +static int +i2c_master_abort(void) { + I2C_PRT("i2c_master_abort: entry\n"); + + reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) | ICR_MA); + myDELAY(25); + reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) & ~ICR_MA); + + I2C_PRT("i2c_master_abort: exit\n"); + return 0; +} + + +/* + * Receive completion helper. + * Transmission ended (we got IRF), check if it was OK. + * We get ISR and whether a stop condition was expected. + */ + +static int +check_rx_isr(uint32_t isr, bool stop) +{ + I2C_PRT("check_rx_isr: entry, isr %02x, stop %d\n", isr, stop); + REG_REG("+check_rx_isr"); + + if (stop) { + /* + * Last byte read, controller is expected to give a + * NAK to slave. Verify that indeed is set in ISR. + */ + if (!(isr & ISR_ACKNAK)) { + REG_REG("-check_rx_isr"); + I2C_PRT("check_rx_isr: !ISR_ACKNAK, rtn %d\n", RX_SEVERE_ERROR); + return RX_SEVERE_ERROR; + } + + /* + * The controller is expected to set the STOP condition. + * Once completed the controller clears the RWM bit of the ISR. + * Wait for this to happen in max 200 uSec. + */ + if (isr & ISR_RWM) { + int counter; + + I2C_PRT("check_rx_isr: RWM\n"); + counter = 100; + while((reg_read(ISR_OFFSET) & ISR_RWM) && --counter) + myDELAY(2); + if(! 
counter) { + REG_REG("-check_rx_isr"); + I2C_PRT("check_rx_isr: timeout, RWM wait %d uSec, rtn %d\n", 2 * 100, RX_BIZARRE_ERROR); + return RX_BIZARRE_ERROR; + } + I2C_PRT("check_rx_isr: RWM clear, waited %d uSec\n", 2 * (100 - counter)); + } + } else { + /* + * Mid-message, verify that unit is still busy, received + * no NAK and that message operation is still 'read'. + */ + if (!(isr & ISR_UB)) { + REG_REG("-check_rx_isr"); + I2C_PRT("check_rx_isr: !UB, rtn %d\n", RX_SEVERE_ERROR); + return RX_SEVERE_ERROR; + } + + if (isr & ISR_ACKNAK) { + REG_REG("-check_rx_isr"); + I2C_PRT("check_rx_isr: ISR_ACKNAK, rtn %d\n", RX_SEVERE_ERROR); + return RX_SEVERE_ERROR; + } + + if (!(isr & ISR_RWM)) { + REG_REG("-check_rx_isr"); + I2C_PRT("check_rx_isr: !ISR_RWM, rtn %d\n", RX_BIZARRE_ERROR); + return RX_BIZARRE_ERROR; + } + } + + REG_REG("-check_rx_isr"); + I2C_PRT("check_rx_isr: done, rtn %d\n", XFER_SUCCESS); + return XFER_SUCCESS; +} + +/* + * Wait for receive completion. + * We get if stop condition expected. + */ + +static int +i2c_wait_rx_full(bool stop) +{ + int uwt, counter, err; + uint32_t temp; + + I2C_PRT("i2c_wait_rx_full: entry, stop %d\n", stop); + REG_REG("+i2c_wait_rx_full"); + + /* + * Guess on how long one I2C clock cycle is (in uSec) + */ + uwt = (bus_freq == FREQ_400K) ? 3 : 10; + + /* + * Wait for receive to end (IRF set). + * Since slave can hold the SCL to reduce the speed + * we wait longer than we expect the receive to last. + */ + counter = 100; + err = INCOMPLETE_XFER; + while(counter) { + temp = reg_read(ISR_OFFSET); + if (temp & ISR_IRF) { + I2C_PRT("i2c_wait_rx_full: IRF, ISR %02x\n", temp); + err = check_rx_isr(temp, stop); + reg_write(ISR_OFFSET, reg_read(ISR_OFFSET) | ISR_IRF); + switch(err) { + case XFER_SUCCESS: + break; + case RX_SEVERE_ERROR: + break; + case RX_END_WITHOUT_STOP: + i2c_master_abort(); + break; + default: + /* + * This is odd/unexpected, but not + * something we can do anything about. + */ + err = XFER_SUCCESS; + } + break; + } + myDELAY(uwt); + counter--; + } + + REG_REG("-i2c_wait_rx_full"); + I2C_PRT("i2c_wait_rx_full: done, IRF wait %d uSec, err %d\n", uwt * (100 - counter), err); + return err; +} + + +/* + * Transmit completion helper. + * Transmission ended (we got ITE), check if it was OK. + * We get ISR, the current operation and whether a stop + * condition was expected (last byte of transmission). + */ + +static int +check_tx_isr(uint32_t isr, bool stop, int op) +{ + I2C_PRT("check_tx_isr: entry, isr %02x, stop %d, op %d\n", isr, stop, op); + REG_REG("+check_tx_isr"); + + if (isr & ISR_BED) { /* Bus error */ + REG_REG("-check_tx_isr"); + I2C_PRT("check_tx_isr: BED, rtn %d\n", TX_NAK); + return TX_NAK; + } + + if(stop) { + /* + * Last byte write, controller expected to + * set the stop condition. This may take a + * while to complete, controller holds the + * UB flag of ISR until finished. + */ + if(isr & ISR_UB) { + int counter; + + I2C_PRT("check_rx_isr: UB\n"); + counter = 100; + while((reg_read(ISR_OFFSET) & ISR_UB) && --counter) + myDELAY(2); + if (! counter) { + REG_REG("-check_tx_isr"); + I2C_PRT("check_tx_isr: UB, timeout %d uSec, rtn %d\n", 2 * 100, TX_CONTROLLER_ERROR); + return TX_CONTROLLER_ERROR; + } + I2C_PRT("check_tx_isr: !UB, waited %d uSec\n", 2 * (100 - counter)); + } + } else { + /* + * Mid-message, the bus is expected to be busy. 
+ */ + if(!(isr & ISR_UB)) { + REG_REG("-check_tx_isr"); + I2C_PRT("check_tx_isr: !UB, rtn %d\n", TX_CONTROLLER_ERROR); + return TX_CONTROLLER_ERROR; + } + } + + /* + * Assert that message operation hasn't changed + */ + if ((isr & 0x1) != op) { + REG_REG("-check_tx_isr"); + I2C_PRT("check_tx_isr: ISR %d != %d, rtn %d\n", isr & 0x1, op, TX_CONTROLLER_ERROR); + return TX_CONTROLLER_ERROR; + } + + REG_REG("-check_tx_isr"); + I2C_PRT("check_tx_isr: done, rtn %d\n", XFER_SUCCESS); + return XFER_SUCCESS; +} + +/* + * Wait for transmit completion + * We get the current operation and if a stop + * condition was expected (last byte of transmission). + */ + +static int +i2c_wait_tx_empty(bool stop, int op) +{ + int counter, uwt, err; + uint32_t temp; + + I2C_PRT("i2c_wait_tx_empty: entry, stop %d, op %d\n", stop, op); + REG_REG("+i2c_wait_tx_empty"); + + /* + * Guess on how long one I2C clock cycle is (in uSec) + */ + uwt = (bus_freq == FREQ_400K) ? 3 : 10; + + /* + * Wait for transmission to end (ITE set) + * Since slave can hold the SCL to lower the speed + * we wait longer than we expect the transmission + * to last. + */ + counter = 100; + err = INCOMPLETE_XFER; + while(counter) { + temp = reg_read(ISR_OFFSET); + if (temp & ISR_ITE) { + I2C_PRT("i2c_wait_tx_empty: ITE, ISR %02x\n", temp); + myDELAY(uwt); + temp = reg_read(ISR_OFFSET); + err = check_tx_isr(temp, stop, op); + reg_write(ISR_OFFSET, reg_read(ISR_OFFSET) | ISR_ITE); + break; + } + myDELAY(uwt); + counter--; + } + + REG_REG("-i2c_wait_tx_empty"); + I2C_PRT("i2c_wait_tx_empty: done, ITE wait %d uSec, err %d\n", uwt * (100 - counter), err); + return err; +} + + +/* + * Setup for a transaction. + * Determine transmission speed and program ICR accordingly. + * Also sets ISAR, though we probably don't neeed that. + */ + +static int +i2c_init(uint8_t slave_addr) +{ + uint32_t speed; + + I2C_PRT("i2c_init: entry, slave_addr %02x, hnd_speed %d\n", slave_addr, hnd_freq); + REG_REG("+i2c_init"); + + switch(hnd_freq) { + case FREQ_MAX: + speed = I2C_HS_FAST; + break; + case FREQ_400K: + speed = I2C_FAST; + break; + case FREQ_100K: + speed = I2C_STANDARD; + break; + case FREQ_AUTO: +#if I2C_SLOW + hnd_freq = FREQ_100K; + speed = I2C_STANDARD; +#else + hnd_freq = FREQ_400K; + speed = I2C_FAST; +#endif + break; + default: + return -EINVAL; + } + if (bus_inited && hnd_freq == bus_freq) { + REG_REG("-i2c_init"); + I2C_PRT("i2c_init: exit, bus_inited %d, hnd_freq %d\n", bus_inited, hnd_freq); + return 0; + } + I2C_PRT("i2c_init: speed %d, hnd_freq %d\n", bus_inited, hnd_freq); + + bus_slave_addr = ISAR_SLADDR; + reg_write(ISAR_OFFSET, bus_slave_addr); + reg_write(ICR_OFFSET, (reg_read(ICR_OFFSET) & ~ICR_MODE) | ICR_ON | speed); + bus_freq = hnd_freq; + bus_inited = 1; + + REG_REG("-i2c_init"); + I2C_PRT("i2c_init: done, bus_inited %d, bus_freq %d\n", bus_inited, bus_freq); + return 0; +} + + +/* + * Stop current transaction. + * If transmitting then do a master abort, otherwise + * just ensure that no new transmission starts. 
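Editorial note: i2c_wait_rx_full and i2c_wait_tx_empty above share one pattern: poll a status register for a flag, waiting roughly one I2C byte-time per iteration, and give up after a fixed number of tries so a wedged slave cannot hang the caller. A generic user-space sketch of that bounded-poll helper, with the register read and the delay stubbed out as assumptions:

#include <stdio.h>
#include <stdint.h>

static uint32_t fake_isr;                         /* stand-in for reg_read(ISR) */
static uint32_t read_status(void) { return fake_isr; }
static void     tiny_delay(int usec) { (void)usec; /* would be myDELAY() */ }

/* Poll for 'bit' in the status register; 0 on success, -1 on timeout. */
static int wait_for_bit(uint32_t bit, int tries, int usec_per_try)
{
    while (tries--) {
        if (read_status() & bit)
            return 0;
        tiny_delay(usec_per_try);
    }
    return -1;
}

int main(void)
{
    fake_isr = 0x40;                              /* pretend IRF is already set */
    printf("wait IRF: %d\n", wait_for_bit(0x40, 100, 10));
    printf("wait ITE: %d\n", wait_for_bit(0x80, 100, 10));   /* times out */
    return 0;
}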
+ */ + +static int +i2c_stop(void) +{ + I2C_PRT("i2c_stop: entry, bus_inited %d, bus_start_op %d\n", bus_inited, bus_start_op); + REG_REG("+i2c_stop"); + + if (reg_read(ISR_OFFSET) & ISR_UB) { + I2C_PRT("i2c_stop: Unit busy\n"); + i2c_master_abort(); + } + + switch(bus_start_op) { + case I2C_WRITE: + I2C_PRT("i2c_stop: Stop Write\n"); + reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) & ~(ICR_STOP | ICR_TB)); + break; + case I2C_READ: + I2C_PRT("i2c_stop: Stop Read\n"); + reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) & ~(ICR_STOP | ICR_TB | ICR_ACKNAK)); + break; + } + bus_start_op = I2C_NOP; + + REG_REG("-i2c_stop"); + I2C_PRT("i2c_stop: bus_start_op %d\n", bus_start_op); + return 0; +} + + +/* + * Reset I2C controller. + * Try to be nice and wait for current transaction to finish + */ + +static int +i2c_reset(void) +{ + I2C_PRT("i2c_reset: entry, bus_inited %d\n", bus_inited); + REG_REG("+i2c_reset"); + + i2c_stop(); + + reg_write(ICR_OFFSET, ICR_UR); + myDELAY(1); + reg_write(ISR_OFFSET, ~ISR_RESERVED); + myDELAY(1); + reg_write(ICR_OFFSET, 0); + myDELAY(1); + reg_write(ISAR_OFFSET, 0); + myDELAY(1); + reg_write(ICR_OFFSET, ICR_INIT_BITS); + bus_inited = 0; + + REG_REG("-i2c_reset"); + I2C_PRT("i2c_reset: exit, bus_inited %d\n", bus_inited); + return 0; +} + + +/* + * Start transaction using current setup. + * This is always a send of the target id and the R/W bit. + */ + +static int +i2c_start(int rw) +{ + int err; + uint32_t temp; + + I2C_PRT("i2c_start: entry, rw %d, bus_slave_addr %02x, bus_start_op %d\n", rw, bus_slave_addr, bus_start_op); + REG_REG("+i2c_start"); + + if (hnd_addr == bus_slave_addr) { + bus_slave_addr = bus_slave_addr - 1; + I2C_PRT("i2c_start: reset slave %02x\n", bus_slave_addr); + reg_write(ISAR_OFFSET, bus_slave_addr); + } + + reg_write(IDBR_OFFSET, (hnd_addr << 1) | rw); + temp = reg_read(ICR_OFFSET); + temp |= ICR_START | ICR_TB; + temp &= ~(ICR_STOP | ICR_ALDIE); + reg_write(ISR_OFFSET, ~ISR_RESERVED); + reg_write(ICR_OFFSET, temp); + + err = i2c_wait_tx_empty(FALSE, rw); + if (err) { + i2c_reset(); + I2C_PRT("i2c_start: exit, err %d\n", err); + REG_REG("-i2c_start"); + return err; + } + bus_start_op = rw; + + REG_REG("-i2c_start"); + I2C_PRT("i2c_start: done, bus_start_op %d\n", bus_start_op); + return 0; +} + + +/* + * Read next byte of transaction + * Must follow a 'start' in READ mode. + */ + +static int +i2c_rd_byte(bool sendStop, uint8_t *data) +{ + int retval; + uint32_t temp; + + I2C_PRT("i2c_rd_byte: entry, stop %d\n", sendStop); + + if (bus_start_op != I2C_READ) { + I2C_PRT("i2c_rd_byte: exit, called during WR\n"); + return -EINVAL; + } + + REG_REG("+i2c_rd_byte"); + + temp = reg_read(ICR_OFFSET); + temp |= (ICR_ALDIE | ICR_TB); + temp &= ~(ICR_START | ICR_STOP | ICR_ACKNAK); + if (sendStop) + temp |= ICR_STOP | ICR_ACKNAK; + + reg_write(ISR_OFFSET, ~ISR_RESERVED); + reg_write(ICR_OFFSET, temp); + retval = i2c_wait_rx_full(sendStop); + if (retval) { + REG_REG("-i2c_rd_byte"); + I2C_PRT("i2c_rd_byte: exit, err %d\n", retval); + return retval; + } + + temp = reg_read(IDBR_OFFSET); + if (data) + *data = temp; + + if (sendStop) + i2c_stop(); + + REG_REG("-i2c_rd_byte"); + I2C_PRT("i2c_rd_byte: done, data %02x\n", temp); + return 0; +} + +/* + * Write next byte of transaction + * Must follow a 'start' in WRITE mode. 
+ */ + +static int +i2c_wr_byte(bool sendStop, uint8_t data) +{ + int retval; + uint32_t temp; + + I2C_PRT("i2c_wr_byte: entry, stop %d, data %02x\n", sendStop, data); + + if (bus_start_op != I2C_WRITE) { + I2C_PRT("i2c_wr_byte: exit, called during RD\n"); + return EINVAL; + } + + REG_REG("+i2c_wr_byte"); + + reg_write(IDBR_OFFSET, data); + + temp = reg_read(ICR_OFFSET); + temp |= (ICR_ALDIE | ICR_TB); + temp &= ~(ICR_START | ICR_STOP); + if (sendStop) + temp |= ICR_STOP; + + reg_write(ISR_OFFSET, ~ISR_RESERVED); + reg_write(ICR_OFFSET, temp); + retval = i2c_wait_tx_empty(sendStop, I2C_WRITE); + if (retval) { + REG_REG("-i2c_wr_byte"); + I2C_PRT("i2c_wr_byte: exit, err %d\n", retval); + return retval; + } + + if (sendStop) + i2c_stop(); + + REG_REG("-i2c_wr_byte"); + I2C_PRT("i2c_wr_byte: done\n"); + return 0; +} + + +/* + * Get exclusive access to the I2C bus at _any_ given time. + * + * If a transaction is in progress then try to complete it + * in a non-destructive way. We know that the interupted + * activity was from the console access to the UART, which + * boils down to just two possible sequences, read UART + * register or write UART register. The acting code paths is + * sc16is_serial_in() + * -> i2c_smbus_read_byte_data + * -> i2c_smbus_xfer + * -> i2c_smbus_xfer_emulated + * -> i2c_transfer + * -> i2c_pxa_pio_xfer + * -> i2c_pxa_do_pio_xfer + * -> i2c_pxa_set_master + * -> i2c_pxa_start_message + * -> i2c_pxa_handler (repeat for all bytes) + * -> i2c_pxa_irq_txempty (on writes) + * -> i2c_pxa_irq_rxfull (on reads) + * -> i2c_pxa_stop_message + * + * Function i2c_pxa_handler (designed as an interrupt handler) + * is polled every 10 uSec, which is pretty fast for a line that + * clocks at 400 kHz (minimum 20 uSec to send one byte). + * + * The two sequences on the I2C bus for the UART are: + * + * Write: S A A A P + * Read: S A A Sr A A P + * + * where + * S Start sequence + * P Stop sequence + * Sr Repeated start + * W Write flag + * R Read flag + * A Ack (send or recv) + * + * We need the abilitity to 'borrow' the I2C bus from the PXA driver + * both when it is running (say on another CPU) or when it has been + * interrupted (NMI and Exception context). + * + * From trackers in the PXA driver we get to know the current state + * of the I2C transaction with the following granularity: + * + * '-' Idle + * 'B' Waiting for bus free + * 'I' Initiating transfer (i.e. send addr & direction flag) + * 'S' Sending byte + * 'R' Receving byte + * + * Last byte of the transaction can be identified by the STOP flag. + * + * The take-over sequence starts by setting an atomic variable which + * tells the PXA driver to wait (and retry the I2C transaction when + * the variable gets cleared). Then we look at the controller status + * and command registers to determine whether it is active or not. + * + * Simple cases: + * ------------- + * state = '-' + * Controller is not in use by PXA driver. + * + * state 'B' + * Controller not actively in use yet. + * At worst the SCLE bit will be set, which won't affect + * anything in this driver since we always run as master. + * + * STOP bit set + * This is last byte of a transaction, we have two cases: + * a) Last part of a write UART register transaction. + * - Wait for the byte to clock out + * b) Last part of a read UART register transaction. + * - Wait for the byte to clock in, then preserve IDBR. 
+ * + * Other cases: + * ------------ + * state 'I' + * Starting an I2C command (Start or Start-Repeat), + * we have 3 sub-cases of this: + * a) Starting a write UART register transaction: + * - Wait for the byte to clock out, then transmit a + * 0 byte with STOP bit set. This selects RX/TX + * UART register without accessing it. + * b) Starting a read UART register transaction: + * - Same as case a), turn it into a NOP. + * c) Reversing direction during read UART register, + * probably need to finish the read operation: + * - Wait for the byte to clock out, send STOP + ACK + * and wait for the receive to clock in. + * + * state 'S' + * Since STOP bit is not set, then this is the + * index being transfered, two sub-cases: + * a) Sending of a write UART register. + * - Wait for the byte to clock out, then transmit a + * 0 byte with the STOP bit set. This inadvertantly + * and temporarily clears a random UART register, + * which may result in a null byte transmitted + * Since there is a retry associated, the intended + * register value will be written later. + * b) Sending of a read UART register. + * - Same as state 'I' case c). + * + * state 'R' + * Should not occur, because communications with the + * UART only have single byte reads, which always is + * accompanied by a STOP bit, and thus is covered by + * the simple case above. If multi-byte reads were to + * be used then we'd have to terminate it: + * - Wait for the byte to clock in, send STOP + ACK + * and wait for the 2nd byte to clock in. + * Both bytes received can be discarded, as there + * is no easy way to pass them to the PXA driver. + * + * Warning: + * Beyond this being an ugly hack, it is also not re-entrant. + * It can reliably interrupt the console and return it without + * causing too much breakage, but it cannot grab the I2C bus + * from itself due to the use of global variables. + * + * Warning: + * The synchronization between i2c_grap/i2c_release and the + * PXA driver can still wreck the I2C controller. Cause not + * known, but when it happens the PXA driver ends up repeating + * these log messages: + * i2c: error: pxa_pio_set_master: timeout + * i2c: msg_num: 0 msg_idx: 1 msg_ptr: 0 + * i2c: ICR: 000017e0 ISR: 00000044 + * i2c: log: [000000c6:000017e0:00:9a] + * i2c i2c-0: i2c_pxa: timeout waiting for bus free + * pxa_do_pio_xfer: timeout to become master + * pxa_pio_set_master 'B': ISR 00044, ICR 7e0, IDBR 28, IBMR 1 + * Looks like the I2C controller gets stuck, ISR: IRF + IBB, + * The code failing is i2c_pxa_pio_set_master(), which points + * to the I2C UART as the culprit. One such case was during + * module load on KnF, where the only activity in the module + * was one ee_lock/ee_release pair, which in state 'B' should + * be straight forward to handle. 
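+ *
+ * Timing note (back-of-envelope, not measured): at 400 kHz one byte
+ * needs at least ~20 uSec on the wire, while the grab loop below polls
+ * ISR about once per I2C clock for up to 100 iterations (~300 uSec in
+ * fast mode, ~1 mSec in standard mode), so a byte already in flight
+ * should have ample time to complete unless the slave stretches SCL
+ * excessively.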
+ */ + +#ifdef CONFIG_I2C_PXA +#define PXA_SYNC 1 +#else +#define PXA_SYNC 0 +#endif + +#if PXA_SYNC +static uint32_t sv_icr, sv_isr, sv_isar, sv_idbr, ee_term; +extern char pxa_state; +extern atomic_t pxa_block; +#endif + +static void +i2c_grab(void) +{ + int uwt, n; + uint32_t icr, isr; + char * w; + + I2C_PRT("i2c_grab: entry\n"); + REG_REG("+i2c_grab"); + +#if PXA_SYNC + sv_isar = reg_read(ISAR_OFFSET); + sv_idbr = reg_read(IDBR_OFFSET); + sv_icr = reg_read(ICR_OFFSET); + isr = sv_isr = reg_read(ISR_OFFSET); + if ((pxa_state == '-' || pxa_state == 'B') && !(isr & ISR_UB)) { + REG_REG("-i2c_grab"); + I2C_PRT("i2c_grab: controller idle, isr %08x\n", isr); + return; + } + ee_term = 1; + I2C_PRT("i2c_grab: controller active, pxa %c\n", pxa_state); +#else + isr = reg_read(ISR_OFFSET); + if (!(isr & ISR_UB)) { + REG_REG("-i2c_grab"); + I2C_PRT("i2c_grab: controller idle, isr %08x\n", isr); + return; + } + I2C_PRT("i2c_grab: controller active\n"); + w = "-"; +#endif + + /* + * Guess on how long one I2C clock cycle is (in uSec) + * Note: ignore High-Speed modes, they are not used. + */ + icr = reg_read(ICR_OFFSET); + uwt = (icr & ICR_FAST_MODE) ? 3 : 10; + + /* + * Wait here long enough that current byte transaction + * on the I2C controller must have clocked all on its bus. + * Imperically, we've determined that length of this wait + * can to be in range up to a dozen I2C clocks. + * We probe state once per I2C clock cycle. + */ + for(n = 0; n < 100 && (isr & ISR_UB); n++) { + /* + * Controller busy doing something. Whatever it is + * doing, it should set either ITE or IRF when done. + * Need to check for this independently because UB + * is asserted all the way from START thru STOP. + */ + if (isr & (ISR_ITE | ISR_IRF)) + break; + myDELAY(uwt); + isr = reg_read(ISR_OFFSET); + } + I2C_PRT("i2c_grab: ITE/IRF wait %d uSec, isr %02x, UB %d\n", + n * uwt, isr, (isr & ISR_UB) == ISR_UB); + + /* + * Controller should have finished current byte transfer by now. + * If it was last byte of a transaction, we are done. + * In read mode we preserve the received data. + */ + if (icr & ICR_STOP) { +#if PXA_SYNC + if (isr & ISR_RWM) + sv_idbr = reg_read(IDBR_OFFSET); +#endif + for(n = 0; n < 100 && (isr & ISR_UB); n++) { + myDELAY(uwt); + isr = reg_read(ISR_OFFSET); + } + + REG_REG("-i2c_grab"); + I2C_PRT("i2c_grab: easy case, UB wait %d uSec, bus %sclear, icr %08x, isr %08x\n", + n * uwt, (isr & ISR_UB) ? "NOT " : "", icr, isr); + return; + } + +#if PXA_SYNC + w = "?"; + + if (pxa_state == 'I') { + isr &= ~ISR_INTS; + reg_write(ISR_OFFSET, isr); + + if (isr & ISR_RWM) { + /* + * Sub-case c) + * Start byte read and send nak+stop when received. + */ + I2C_PRT("i2c_grab: state 'I', sub-case c\n"); + icr = (icr & ~ICR_START) | (ICR_STOP | ICR_ACKNAK | ICR_TB); + reg_write(ICR_OFFSET, icr); + w = "c"; + } + else { + /* + * Sub-case a) and b) + * Send a null byte and stop the transaction. 
+ */ + I2C_PRT("i2c_grab: state 'I', sub-case a & b\n"); + icr = (icr & ~ICR_START) | (ICR_STOP | ICR_TB); + reg_write(IDBR_OFFSET, 0); + reg_write(ICR_OFFSET, icr); + w = "a & b"; + } + + myDELAY(8 * uwt); + isr = reg_read(ISR_OFFSET); + for(n = 0; n < 100 && (isr & ISR_UB); n++) { + myDELAY(uwt); + isr = reg_read(ISR_OFFSET); + } + if (*w == 'c') + sv_idbr = reg_read(IDBR_OFFSET); + } + + if (pxa_state == 'S') { + isr &= ~ISR_INTS; + reg_write(ISR_OFFSET, isr); + + if (isr & ISR_RWM) { + I2C_PRT("i2c_grab: state 'S', sub-case b\n"); + icr = (icr & ~ICR_START) | (ICR_STOP | ICR_ACKNAK | ICR_TB); + reg_write(ICR_OFFSET, icr); + w = "b"; + } + else { + I2C_PRT("i2c_grab: state 'S', sub-case a\n"); + icr = (icr & ~ICR_START) | (ICR_STOP | ICR_TB); + reg_write(IDBR_OFFSET, 0); + reg_write(ICR_OFFSET, icr); + w = "a"; + } + + myDELAY(8 * uwt); + isr = reg_read(ISR_OFFSET); + for(n = 0; n < 100 && (isr & ISR_UB); n++) { + myDELAY(uwt); + isr = reg_read(ISR_OFFSET); + } + if (*w == 'b') + sv_idbr = reg_read(IDBR_OFFSET); + } +#endif /* PXA_SYNC */ + + REG_REG("-i2c_grab"); + I2C_PRT("i2c_grab: controller %sclear, icr %08x, isr %08x, w %s\n", + (isr & ISR_UB) ? "NOT " : "", icr, isr, w); +} + +static void +i2c_release(void) +{ + I2C_PRT("i2c_release: entry\n"); + REG_REG("+i2c_release"); + +#if PXA_SYNC +#if 0 + /* + * Reset I2C controller before returning it to PXA driver + *TBD: Usually not necessary, remove? + */ + if (ee_term) { + I2C_PRT("i2c_release: resetting bus\n"); + reg_write(ICR_OFFSET, ICR_UR); + myDELAY(2); + reg_write(ICR_OFFSET, 0); + } +#endif + + I2C_PRT("i2c_release: restore controller state\n"); + reg_write(ISR_OFFSET, sv_isr); + reg_write(ICR_OFFSET, sv_icr & ~ICR_TB); + reg_write(ISAR_OFFSET, sv_isar); + reg_write(IDBR_OFFSET, sv_idbr); + + if (ee_term) + ee_term = 0; +#endif /* PXA_SYNC */ + + if (reg_read(IBMR_OFFSET) != 3) + I2C_PRT("i2c_release: WARNING: bus active!!!\n"); + + REG_REG("-i2c_release"); + I2C_PRT("i2c_release: exit\n"); +} + + +/* + * Layer 3 abstraction: I2C driver API (message passing). + * + * Controls data transfers to/from devices on the I2C bus. + * This is what device drivers should use. 
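+ *
+ * Illustration only (hypothetical caller, assuming the routines
+ * summarized below; 'reg' and 'val' are placeholders), a one-byte
+ * register read from a device at slave address 0x4d:
+ *
+ *   xfr_configure(0x4d, FREQ_AUTO);   select target and bus speed
+ *   xfr_start(I2C_WRITE);             S + addr + W
+ *   xfr_write(FALSE, 1, &reg);        send register index, no STOP
+ *   xfr_rept_start(I2C_READ);         Sr + addr + R
+ *   xfr_read(TRUE, 1, &val);          read one byte, then STOP
+ *
+ * cons_getreg() further down follows exactly this pattern.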
+ * + * xfr_configure Set target address and speed + * xfr_start Start R/W operation + * xfr_write Write buffer to target + * xfr_read Read buffer from target + * xfr_rept_start Repeat-start new R/W operation + * xfr_reset Reset driver + */ + +static int +xfr_configure(uint8_t addr, int freq) +{ + XFR_PRT("xfr_configure: entry, addr %02x, freq %d\n", addr, freq); + + if (freq > FREQ_AUTO || freq <= FREQ_MAX) { + XFR_PRT("xfr_configure: exit, invalid freq\n"); + return -EINVAL; + } + + if (addr & 0x80) { + XFR_PRT("xfr_configure: exit, invalid addr\n"); + return -EINVAL; + } + + hnd_addr = addr; + hnd_freq = freq; + XFR_PRT("xfr_configure: done, hnd_addr %02x, hnd_freq %d\n", hnd_addr, hnd_freq); + return 0; +} + + +static int +xfr_start(int rw) +{ + int err; + + XFR_PRT("xfr_start: entry, rw %d, hnd_addr %02x\n", rw, hnd_addr); + + if (rw != I2C_WRITE && rw != I2C_READ) { + XFR_PRT("xfr_start: exit, op invalid\n"); + return -EINVAL; + } + + if (hnd_addr & 0x80) { + XFR_PRT("xfr_start: exit, hnd_addr %02x invalid\n", hnd_addr); + return -EINVAL; + } + + err = i2c_init(hnd_addr); + if (err) { + XFR_PRT("xfr_start: i2c_init failed, err %d\n", err); + i2c_reset(); + return -EIO; + } + + err = i2c_start(rw); + if (err) + XFR_PRT("xfr_start: i2c_start failed, err %d\n", err); + switch(err) { + case INCOMPLETE_XFER: + i2c_stop(); + err = -EBUSY; + break; + case TX_CONTROLLER_ERROR: + i2c_reset(); + err = -ENODEV; + break; + case TX_NAK: + i2c_stop(); + err = -ENXIO; + break; + } + + XFR_PRT("xfr_start: done, err %d\n", err); + return err; +} + + +static int +xfr_rept_start(int rw) +{ + int err; + + XFR_PRT("xfr_rept_start: entry, rw %d, bus_start_op %d\n", rw, bus_start_op); + + if (bus_start_op != I2C_READ && bus_start_op != I2C_WRITE) { + XFR_PRT("xfr_rept_start: exit, mode change %d\n", -ENXIO); + return -ENXIO; + } + + err = i2c_start(rw); + if (err) + XFR_PRT("xfr_rept_start: i2c_start err %d\n", err); + switch(err) { + case INCOMPLETE_XFER: + i2c_stop(); + err = -EBUSY; + break; + case TX_CONTROLLER_ERROR: + i2c_reset(); + err = -ENODEV; + break; + case TX_NAK: + i2c_stop(); + err = -ENXIO; + break; + } + + XFR_PRT("xfr_rept_start: done, err %d\n", err); + return err; +} + + +static int +xfr_write(bool sendStop, int cnt, uint8_t *data) +{ + int retval, i; + + XFR_PRT("xfr_write: entry, sendStop %d, cnt %d\n", sendStop, cnt); + + if (cnt < 0) { + XFR_PRT("xfr_write: exit, bad count %d\n", cnt); + return -EINVAL; + } + + if (! 
cnt) { + XFR_PRT("xfr_write: null write\n"); + retval = i2c_stop(); + goto out; + } + + if (cnt == 1) { + XFR_PRT("xfr_write: 1-byte write, '%02x'\n", *data); + retval = i2c_wr_byte(sendStop, *data); + goto out; + } + + for (i = 0; i < cnt - 1; i++) { + XFR_PRT("xfr_write: multi-byte write %d, '%02x'\n", i, data[i]); + retval = i2c_wr_byte(FALSE, data[i]); + if (retval) + goto out; + } + + XFR_PRT("xfr_write: last of multi-byte write %d, '%02x'\n", cnt - 1, data[cnt - 1]); + retval = i2c_wr_byte(sendStop, data[cnt - 1]); + +out: + if (retval) + XFR_PRT("xfr_write: post val %d\n", retval); + switch(retval) { + case INCOMPLETE_XFER: + i2c_stop(); + retval = -EBUSY; + break; + case TX_CONTROLLER_ERROR: + i2c_reset(); + retval = -ENODEV; + break; + case TX_NAK: + i2c_stop(); + retval = -ENXIO; + break; + } + + XFR_PRT("xfr_write: done, val %d\n", retval); + return retval; +} + + +static int +xfr_read(bool sendStop, int cnt, uint8_t *data) +{ + int retval, i; + + XFR_PRT("xfr_read: entry, stop %d, cnt %d\n", sendStop, cnt); + + if (cnt < 0) { + XFR_PRT("xfr_read: exit, bad count %d\n", cnt); + return -EINVAL; + } + + if (! cnt) { + XFR_PRT("xfr_read: null read\n"); + retval = i2c_stop(); + goto out; + } + + if (cnt == 1) { + XFR_PRT("xfr_read: 1-byte read\n"); + retval = i2c_rd_byte(sendStop, data); + goto out; + } + + for (i = 0; i < cnt - 1; i++) { + XFR_PRT("xfr_read: multi-byte read %d\n", i); + retval = i2c_rd_byte(FALSE, data ? &data[i] : data); + if (retval) + goto out; + } + + XFR_PRT("xfr_read: last of multi-byte read %d\n", cnt - 1); + retval = i2c_rd_byte(sendStop, data ? &data[cnt - 1] : data); + +out: + if (retval) { + XFR_PRT("xfr_read: post val %d\n", retval); + i2c_reset(); + retval = -ENXIO; + } + + XFR_PRT("xfr_read: done, err %d\n", retval); + return retval; +} + + +#if NOT_YET +static void +xfr_reset(void) +{ + i2c_reset(); +} +#endif + + + +/* +** +** UART support for printing from exception context. +** A somewhat crude implementation of two low level +** routines that write/read CSRs on the I2C UART. +** On top of these two functions, a set of mid-layer +** routines adds init/exit and character based I/O. +** We try not to alter the UART's transmission setup +** in order lower the risk of corrupting normal use. +** +** All UART support routines assume I2C controller +** to be initialized by xfr_configure() and expects +** exclusive access to the device +** +*/ + + +/* + * Weird way to say that the I2C UART has slave address + * 0x4D (or 0x48) and the UART registers are in bits + * [6:3] of the register address byte. + * KnF has both I2C UART address pins wired to Vss. + * KnC MPI has the address pins wired to Vdd instead. + *TBD: That's according to the schematics, in reality + * on A0 CRBs the address of the onboard UART is + * 0x4D, which matches address pins wired to Vss. + * Not sure why that changed. + */ + +#ifdef CONFIG_ML1OM +#define SC16IS_ADDR_0 1 +#define SC16IS_ADDR_1 1 +#endif +#ifdef CONFIG_MK1OM /* KAA: MPI specific or KnC specific ? */ +#define SC16IS_ADDR_0 1 +#define SC16IS_ADDR_1 1 +#endif +#define SC16IS_ADDR(a1, a0) \ + (0x40 | (((a1 + 8) + (a1 * 3)) | a0)) +#define SC16IS_SUBADDR(addr, ch) \ + ((addr & 0xf) << 3) | ((ch & 3) << 1) + + +static uint8_t +cons_getreg(int reg) +{ + uint8_t sub, val; + int err; + + CON_PRT("cons_getreg: reg %02x\n", reg); + + /* + * The SC16IS740 device reads 8-bit UART registers + * by first writing the register index and then in + * an subsequent read operation gets the register + * value. 
The two operations can (and probably + * should) be joined by a repeated start to save + * the intermediate stop signaling. + */ + val = 0; + sub = (uint8_t) SC16IS_SUBADDR(reg, 0); + err = xfr_start(I2C_WRITE); + if (err) { + CON_PRT("cons_getreg: xfr_start (WR) err %d\n", err); + return 0; + } + err = xfr_write(FALSE, 1, &sub); + if (err) { + CON_PRT("cons_getreg: xfr_write (%02x) err %d\n", sub, err); + return 0; + } + err = xfr_rept_start(I2C_READ); + if (err) { + CON_PRT("cons_getreg: xfr_rept_start (RD) err %d\n", err); + return 0; + } + err = xfr_read(TRUE, 1, &val); + if (err) { + CON_PRT("cons_getreg: xfr_read err %d\n", err); + return 0; + } + + CON_PRT("cons_getreg: reg %02x, val %02x\n", reg, val); + return val; +} + + +static void +cons_setreg(int reg, int val) +{ + uint8_t payload[2]; + int err; + + CON_PRT("cons_setreg: reg %02x, val %02x\n", reg, val); + + payload[0] = (uint8_t) SC16IS_SUBADDR(reg, 0); + payload[1] = (uint8_t) val; + CON_PRT("cons_setreg: I2C payload %02x, %02x\n", payload[0], payload[1]); + err = xfr_start(I2C_WRITE); + if (err) { + CON_PRT("cons_setreg: xfr_start (WR) err %d\n", err); + return; + } + err = xfr_write(TRUE, 2, payload); + if (err) + CON_PRT("cons_getreg: xfr_write (%02x, %02x) err %d\n", payload[0], payload[1], err); +} + + +static void +cons_init(void) +{ + /* + * For now assume that the kernel LXA driver or the + * bootstrap code has setup the I2C uart properly, i.e. + * we don't need to alter speed/databits/stopbits/parity + * or any other serial properties. + * + *WARNING: Since the switch of console from the I2C uart to + * the virtual console, the uart is left with default + * serial port speed of 9600 baud. Bootstrap blasts + * it's messages at 115200 baud, so now the choice + * of getting garbage from this routine or from the + * bootstrap. Using program stty from userspace may + * set any baudrate, we cannot override it here! + * # stty 115200 < /dev/ttyS0 + *TBD: make 115200 baud default on I2C uart! + */ + CON_PRT("cons_init: pass\n"); +} + + +static void +cons_exit(void) +{ + CON_PRT("cons_exit: pass\n"); +} + + +#if NOT_YET +static int +cons_rxrdy(void) +{ + int val; + + CON_PRT("cons_rxrdy: check console RxRdy\n"); + + val = (cons_getreg(UART_LSR) & UART_LSR_DR) ? 1 : 0; + + CON_PRT("cons_rxrdy: RxRdy %d\n", val); + return val; +} + + +static int +cons_getc(void) +{ + int c; + + CON_PRT("cons_getc: rd from console\n"); + + while((cons_getreg(UART_LSR) & UART_LSR_DR) == 0) + myDELAY(1000); + c = cons_getreg(UART_RX); + + CON_PRT("cons_getc: read '%02x'\n", c); + return c; +} +#endif + + +static void +cons_putc(int c) +{ + int limit; + + CON_PRT("cons_putc: wr '%02x' to console\n", c); + + limit = 10; + while((cons_getreg(UART_LSR) & UART_LSR_THRE) == 0 && --limit) ; + CON_PRT("cons_putc: THRE ready, limit %d\n", limit); + cons_setreg(UART_TX, c); + +#if 0 + /* + * No reason to wait for it to clock out + */ + limit = 10; + while((cons_getreg(UART_LSR) & UART_LSR_TEMT) == 0 && --limit) ; + CON_PRT("cons_putc: TEMT ready, limit %d\n", limit); +#endif + + CON_PRT("cons_putc: done printing '%02x'\n", c); +} + + +/* + * Simple exclusive access method for the 'OverClock' I2C bus. + * The POST-card UART is the only known other party using this + * bus under normal circumstances (because it is the console). + * If the POST-card UART is built into the kernel, the lock is + * in file 'drivers/serial/8250_sc16is7xx.c'. Otherwise the lock + * is local to the RAS module. 
+ * + * Warning: + * This locking works perfectly in standard contexts and in + * the MCA handling contexts. However, they do not mix safely. + * If the ee_lock is taken from standard context, then an + * MCA event may hang because it cannot get the lock, ever! + * This can happen when/if ee_print() is used. + */ + +#ifdef CONFIG_I2C_PXA +extern atomic_t pxa_block; +extern char pxa_state; +#else +atomic_t pxa_block = ATOMIC_INIT(0); +char pxa_state = '-'; +#endif + +static void +ee_lock(void) +{ + /* + * Wait here until lock ackquired + */ + while(atomic_xchg(&pxa_block, 1)) + myDELAY(50); + + /* + * Lock taken, I2C transaction could be underway. + * Wait for it to end or forcefully terminate it. + */ + i2c_grab(); +} + +static void +ee_unlock(void) +{ + i2c_release(); + atomic_xchg(&pxa_block, 0); +} + + +/* + * Printf to the POST card UART. + * + * Function ee_printk() and ee_print() both creates + * a message into a local buffer from where the RAS + * timer will synch them into the kernel log about + * once a second. ee_printk() is thread safe. + * + * Function ee_print() will also attempt to write to + * the POST card serial port, which may be useful + * from exception context where OS services are out + * of the question. + * + * WARNING: ee_print() takes the same lock as + * the machine checks does, so if a machine check + * happens while a standard context thread are in + * this code we'll have an instant kernel hang. + */ + +char ee_buf[EE_BUF_COUNT * EE_BUF_LINELEN]; +atomic_t ee_msg = ATOMIC_INIT(-1); +atomic_t ee_seen = ATOMIC_INIT(-1); +int ee_rdy; + +#define EE_TSC 0 /* 1 to get rdtsc() included */ + +char * +ee_fmt(char * fmt, va_list args) +{ + char * buf; + int msg_id, tsl; +#if EE_TSC + uint64_t ts = rdtsc(); +#endif + + msg_id = atomic_inc_return(&ee_msg); + buf = ee_buf + (msg_id % EE_BUF_COUNT) * EE_BUF_LINELEN; + if (! *buf) { +#if EE_TSC + tsl = snprintf(buf, EE_BUF_LINELEN - 1, "[%lld] ", ts); +#else + tsl = 0; +#endif + vsnprintf(buf + tsl, EE_BUF_LINELEN - 1 - tsl, fmt, args); + return buf; + } + return 0; +} + +int +ee_printk(char * fmt, ...) +{ + va_list args; + char * buf; + + va_start(args, fmt); + buf = ee_fmt(fmt, args); + va_end(args); + + return buf ? strlen(buf) : 0; +} + +int +ee_print(char * fmt, ...) +{ + char ch, * buf; + va_list args; + int len; + + va_start(args, fmt); + buf = ee_fmt(fmt, args); + va_end(args); + + len = 0; + if (ee_rdy && buf) { + /* + * Get I2C bus exclusive access, + * setup for targeting the UART and + * send string one byte at a time + * with lf -> lr/cr translation. + */ + ee_lock(); + xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO); + while((ch = *(buf++))) { + if (ch == '\n') { + cons_putc('\r'); + len++; + } + cons_putc(ch); + len++; + } + ee_unlock(); + } + + return len; +} +EXPORT_SYMBOL_GPL(ee_print); + + + +/* +** +** EEPROM support routines +** +** The device is a 1 Mbit Atmel AT24C1024 which has 128 +** KByte addressable storage over 2 slave addresses. +** Lower 64 KB is at slave address 0x54 and upper +** 64KB is at slave address 0x55, i.e. it uses LSB of +** the slave address as bit 16 of the byte address. 
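+**
+** For example (illustration only): linear byte address 0x1A2B3 has
+** bit 16 set, so it is reached through slave 0x55 at in-device
+** offset 0xA2B3, whereas address 0x0FFFF stays on slave 0x54 at
+** offset 0xFFFF.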
+** +** All EEPROM support routines assume I2C controller +** to be initialized by xfr_configure() and expects +** exclusive access to the device +** +** Only KnC has this storage +*/ + +#ifdef CONFIG_MK1OM + +#define MR_ELOG_SIZE (128 * 1024) /* 1 Mbit */ +#define MR_ELOG_ADDR_LO 0x54 /* Lo 64K slave */ +#define MR_ELOG_ADDR_HI 0x55 /* Hi 64K slave */ +#define EE_PG_SIZ 256 /* Device page size */ + + +/* + * Layout of the EEPROM is roughly like this: + * + * Bytes Content + * 0 - 15 Fixed log header + * 16 - 17 Log head index (last written) + * 18 - 19 Log tail index (last read) + * 20 - end Log entries + * + * By definition, the log is fully read when head and + * tail pointer are equal (initial value: last entry). + * The effective log size is + * (device_size - sizeof(McaHeader))/sizeof(McaRecord). + * + * Fields of interest in the log entry 'id' are + * bits 7:0 Source index, 8 bit + * bits 18:16 Source type, 3 bit + * bits 22:22 Injected error flag + * bits 23:23 Repaired flag + * bits 24:24 Filtered flag + * bits 31:31 Valid flag + * + * Enumeration details are in file micras_mca.h + * + * Time stamps in the MCA header and event records are supposed to be + * standard 32-bit Unix format, i.e. seconds since 00:00 Jan 1 1979 GMT. + * This will wrap some time Jan 19th 2038, which is about 25 years from + * the release of KnC. Given the use of 386's (introduced 1985) in the + * modern data center anno '12, 32 bit will last for all practical purposes. + */ + +typedef struct _mca_header { + uint8_t signature[8]; /* Magic */ + uint8_t header_ver; /* Format revision */ + uint8_t rec_start; /* Offset of 1st record */ + uint16_t rec_size; /* Size of an MCA record */ + uint16_t entries; /* Log size */ + uint8_t logfull; /* Log has wrapped (reserved) */ + uint8_t hwtype; /* Board type (reserved) */ + uint16_t rec_head; /* Head index */ + uint16_t rec_tail; /* Tail index */ +} McaHeader; + +typedef struct _mca_record { + uint32_t id; /* Event origin & flags */ + uint32_t stamp; /* Low 32 bit of system time */ + uint64_t ctl; /* MCA bank register 'CTL' */ + uint64_t status; /* MCA bank register 'STATUS' */ + uint64_t addr; /* MCA bank register 'ADDR' */ + uint64_t misc; /* MCA bank register 'MISC' */ +} McaRecord; + + +/* + * Header to drop onto un-initalized EEPROM + * By definition, the EEPROM is uninitialised + * if the magic signature is wrong. + */ + +#define MR_ELOG_NUM (MR_ELOG_SIZE - sizeof(McaHeader))/sizeof(McaRecord) + +static McaHeader elog_preset = { + .signature = {"MCA_LOG"}, + .header_ver = 1, + .rec_start = sizeof(McaHeader), + .rec_size = sizeof(McaRecord), + .entries = MR_ELOG_NUM, + .logfull = -1, + .hwtype = 0, + .rec_head = MR_ELOG_NUM - 1, + .rec_tail = MR_ELOG_NUM - 1, +}; + +static uint16_t ee_num, ee_head, ee_tail; /* Cached log state */ + + +#if EPR_DBG || EE_VERIFY +/* + * Printk from EEPROM code. + * We have the lock, and the I2C target address is + * set for the Atmel device, we must reset I2C for + * the UART on every entry, and reset it back to the + * EEPROM in order to keep this function transparent. + * + * Warning: this call is highly risky, particularly + * in error conditions where the I2C bus is involved. + * Do not call it during an EEPROM I2C transaction!! + * Use for internal debug _ONLY_ and at own risk. + */ + +int +elog_print(char * fmt, ...) +{ + char * buf, ch; + va_list args; + int len; + + va_start(args, fmt); + buf = ee_fmt(fmt, args); + va_end(args); + + if (! 
buf) + return 0; + + xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO); + + len = 0; + while((ch = *(buf++))) { + if (ch == '\n') { + cons_putc('\r'); + len++; + } + cons_putc(ch); + len++; + } + + return len; +} +#endif /* EPR_DBG */ + + +/* + * Write block of data to EEPROM + * The Atmel device does not allow writes to cross the + * internal page size, which is 256 bytes on the 1 Mbit part. + * Given the size of an McaRecord this is likely to occur, but + * cannot happen more than once per call. + * Must preset slave address on every call. + */ + +static void +ee_wr(uint8_t addr, uint16_t ofs, uint8_t *buf, uint8_t len) +{ + uint16_t pix, swp; + uint8_t wl; + int err; + + if (mce_disabled) + return; + + if ((ofs + len) < ofs) { + EPR_PRT("ee_wr: address overrun\n"); + return; + } + + xfr_configure(addr, FREQ_AUTO); + + pix = ofs & (EE_PG_SIZ - 1); + while(len) { + wl = (uint8_t) min((uint16_t)len, (uint16_t)(EE_PG_SIZ - pix)); + + err = xfr_start(I2C_WRITE); + if (err) { + EPR_PRT("ee_wr: xfr_start (WR) err %d\n", err); + return; + } + + /* + * Byte swap, send Most significant byte first + */ + swp = (ofs >> 8) | (ofs << 8); + err = xfr_write(FALSE, 2, (uint8_t *) &swp); + if (err) { + EPR_PRT("ee_wr: xfr_write offset (%02x, %02x) err %d\n", ofs >> 8, ofs & 0xff, err); + return; + } + + /* + * Write payload to device + */ + err = xfr_write(TRUE, wl, buf); + if (err) { + EPR_PRT("ee_wr: xfr_write %d bytes (%02x, %02x ..) err %d\n", wl, buf[0], buf[1], err); + return; + } + ofs += wl; + buf += wl; + len -= wl; + pix = 0; + + /* + * Data sheet says wait 5 mSec before next + * transaction to the device after a write. + */ + myDELAY(5000); + } +} + + +/* + * Read block of data from EEPROM + * Must preset slave address on every call. + */ + +static void +ee_rd(uint8_t addr, uint16_t ofs, uint8_t *buf, uint8_t len) +{ + uint16_t swp; + int err; + + if ((ofs + len) < ofs) { + EPR_PRT("ee_rd: address overrun\n"); + return; + } + + xfr_configure(addr, FREQ_AUTO); + + err = xfr_start(I2C_WRITE); + if (err) { + EPR_PRT("ee_rd: xfr_start (WR) err %d\n", err); + return; + } + + /* + * Byte swap, send Most significant byte first + */ + swp = (ofs >> 8) | (ofs << 8); + err = xfr_write(FALSE, 2, (uint8_t *) &swp); + if (err) { + EPR_PRT("ee_rd: xfr_write (%02x, %02x) err %d\n", ofs >> 8, ofs & 0xff, err); + return; + } + + /* + * Change bus direction and read payload + */ + err = xfr_rept_start(I2C_READ); + if (err) { + EPR_PRT("ee_rd: xfr_rept_start (RD) err %d\n", err); + return; + } + err = xfr_read(TRUE, len, buf); + if (err) { + EPR_PRT("ee_rd: xfr_read err %d\n", err); + return; + } +} + + +/* + * Read one MCA event record from EEPROM + * Handles crossing device addresses. + */ + +static void +ee_get(McaRecord * rec, int no) +{ + uint32_t pos, mid, low; + + mid = MR_ELOG_SIZE / 2; + memset(rec, '\0', sizeof(*rec)); + pos = sizeof(McaHeader) + no * sizeof(McaRecord); + if (pos < (mid - sizeof(McaRecord))) { + /* + * Record fit entirely in lower half of EEPROM + */ + ee_rd(MR_ELOG_ADDR_LO, pos, (uint8_t *) rec, sizeof(*rec)); + } + else + if (pos > mid) { + /* + * Record fit entirely in upper half of EEPROM + */ + ee_rd(MR_ELOG_ADDR_HI, pos - mid, (uint8_t *) rec, sizeof(*rec)); + } + else { + /* + * Record spans both halves, need 2 reads. + */ + low = mid - pos; + ee_rd(MR_ELOG_ADDR_LO, pos, (uint8_t *) rec, low); + ee_rd(MR_ELOG_ADDR_HI, 0, ((uint8_t *) rec) + low, sizeof(*rec) - low); + } +} + + +/* + * Write one MCA event record to EEPROM + * Handles crossing device addresses. 
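+ *
+ * Worked example (illustration, assuming the structures pack to 20 and
+ * 40 bytes with no padding): record 1637 starts at byte 20 + 1637 * 40
+ * = 65500, i.e. within 40 bytes of the 64 KB boundary (65536), so its
+ * first 36 bytes are written via slave 0x54 at offset 65500 and the
+ * remaining 4 bytes via slave 0x55 at offset 0.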
+ */ + +static void +ee_put(McaRecord * rec, int no) +{ + uint32_t loc, mid, low; + + mid = MR_ELOG_SIZE / 2; + loc = sizeof(McaHeader) + no * sizeof(McaRecord); + if (loc < (mid - sizeof(McaRecord))) { + /* + * Record fit entirely in lower half of EEPROM + */ + ee_wr(MR_ELOG_ADDR_LO, loc, (uint8_t *) rec, sizeof(*rec)); + } + else + if (loc > mid) { + /* + * Record fit entirely in upper half of EEPROM + */ + ee_wr(MR_ELOG_ADDR_HI, loc - mid, (uint8_t *) rec, sizeof(*rec)); + } + else { + /* + * Record spans both halves, need 2 writes. + */ + low = mid - loc; + ee_wr(MR_ELOG_ADDR_LO, loc, (uint8_t *) rec, low); + ee_wr(MR_ELOG_ADDR_HI, 0, ((uint8_t *) rec) + low, sizeof(*rec) - low); + } +} + + +/* + * Add one MCA event to the EEPROM + * Store the passed event info in the EEPROM, and update write + * position to next entry, just in case if there are more than + * one MC event detected that needs checking in maintenance mode. + * + * This can be called in exception context, and therefore must + * work without any kernel support whatsoever. We must assume + * kernel services are not reliable at this point. + */ + +void +micras_mc_log(struct mce_info * event) +{ + McaRecord mr; + uint16_t nxt, id; + + if (mce_disabled) + return; + + /* + * Print entry on serial console (copy in kernel log) + */ +#if MC_VERBOSE + ee_printk("RAS.elog: bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", + event->org, event->id, event->ctl, event->status, event->addr, event->misc); +#endif + + /* + * Bail if EEPROM not in order (I2C lock-up or faulty device) + */ + if (! ee_num) + return; + + /* + * Prepare MCA error log record. + * We use the pysical CPU ID in the EEPROM records. + */ + id = (event->org <= 2) ? event->pid : event->id; + mr.id = PUT_BITS( 7, 0, id) | + PUT_BITS(18, 16, event->org) | + PUT_BIT(22, (event->flags & MC_FLG_FALSE) != 0) | + PUT_BIT(24, (event->flags & MC_FLG_FILTER) != 0) | + PUT_BIT(31, 1); + mr.stamp = (uint32_t) event->stamp; + mr.ctl = event->ctl; + mr.status = event->status; + mr.addr = event->addr; + mr.misc = event->misc; + +#if ADD_DIE_TEMP + { + uint32_t tmp; + tmp = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2); + mr.id |= PUT_BITS(15, 8, GET_BITS(19, 10, tmp)); + } +#endif + + /* + * Get I2C bus exclusive access + */ + ee_lock(); + +#if EE_VERIFY + { + /* + * Check for header corruption. + * Time sink, only enable for debugging + */ + extern int in_sync; + McaHeader hdr; + + ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr)); + if (memcmp(hdr.signature, elog_preset.signature, + sizeof(elog_preset.signature))) { + if (in_sync) { + printk("mc_log: Header corruption detected\n"); + dmp_hex(&hdr, sizeof(hdr), "mc_log: EEPROM header (entry)"); + } + else { + elog_print("mc_log: Header corruption detected (entry)\n"); + elog_print("EEPROM header: signature bad, ver %d, type %d\n", + hdr.header_ver, hdr.hwtype); + elog_print("EEPROM capacity: %d events, size %d, start %d\n", + hdr.entries, hdr.rec_size, hdr.rec_start); + elog_print("EEPROM state: head %d, tail %d, full %d\n", + hdr.rec_head, hdr.rec_tail, hdr.logfull); + } + } + } +#endif + + nxt = (ee_head + 1) % ee_num; + if (nxt == ee_tail) { + ee_printk("RAS.elog: EEPROM full, dropping event\n"); + ee_unlock(); + return; + } + ee_put(&mr, nxt); + +#if EE_VERIFY + { + /* + * Read back and verify with memory buffer + * Note: only works on 1st half of device. 
+ * Time sink, only enable for debugging + */ + McaRecord tst; + + ee_rd(MR_ELOG_ADDR_LO, loc, (uint8_t *) &tst, sizeof(tst)); + if (memcmp(&mr, &tst, sizeof(tst))) + elog_print("Write event verify failed\n"); + else + elog_print("Write event verify OK\n"); + } +#endif + + /* + * Update head pointer in EEPROM header + */ + ee_wr(MR_ELOG_ADDR_LO, offsetof(McaHeader, rec_head), (uint8_t *) &nxt, sizeof(nxt)); + ee_head = nxt; + +#if EE_VERIFY + { + /* + * Read back and verify with memory buffer + * Time sink, only enable for debugging + */ + uint16_t tst; + + ee_rd(MR_ELOG_ADDR_LO, 16, (uint8_t *) &tst, 2); + if (tst != nxt) + elog_print("Write index verify failed\n"); + else + elog_print("Write index verify OK\n"); + } + + { + /* + * Check again for header corruption + * Time sink, only enable for debugging + */ + extern int in_sync; + McaHeader hdr; + + ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr)); + if (memcmp(hdr.signature, elog_preset.signature, + sizeof(elog_preset.signature))) { + if (in_sync) { + printk("mc_log: Header corruption detected (exit)\n"); + dmp_hex(&hdr, sizeof(hdr), "mc_log: EEPROM header"); + } + else { + elog_print("mc_log: Header corruption detected (exit)\n"); + elog_print("EEPROM header: signature bad, ver %d, type %d\n", + hdr.header_ver, hdr.hwtype); + elog_print("EEPROM capacity: %d events, size %d, start %d\n", + hdr.entries, hdr.rec_size, hdr.rec_start); + elog_print("EEPROM state: head %d, tail %d, full %d\n", + hdr.rec_head, hdr.rec_tail, hdr.logfull); + } + } + } +#endif + + /* + * Release I2C bus exclusive lock + */ + ee_unlock(); +} + + +/* + * Reset the EEPROM to mint condition + */ + +#define BSIZ 0xf0 + +static void +ee_mint(void) +{ + uint8_t buf[EE_PG_SIZ]; + McaHeader hdr; + uint32_t loc, mid; + uint16_t ofs; + uint8_t addr; + + + if (ee_rdy && ! mce_disabled) { + printk("EEPROM erase started ..\n"); + memset(buf, 0xff, sizeof(buf)); + + ee_lock(); + + /* + * Several cheats in this loop. + * - Despite maximum transfer per write command is 255 (8 bit count), + * we send only half a 'page', i.e. 128 byte, per call to ee_wr(). + * - Picking exactly half a page, starting page aligned, ensures there + * will be no writes across a page boundary, i.e. ee_wr() will always + * result in exactly one I2C write command per call. + * - We know that MR_ELOG_SIZE / (EE_PG_SIZ / 2) is a clean integer, + * and therefore will be no end condition to special case. + * - Same will be true for the 'mid-chip' limit where the target + * address is bumped by one. + */ + mid = MR_ELOG_SIZE / 2; + for(loc = 0; loc < MR_ELOG_SIZE; loc += (EE_PG_SIZ / 2)) { + addr = (loc < mid) ? MR_ELOG_ADDR_LO : MR_ELOG_ADDR_HI; + ofs = loc & 0xffff; + // printk(" -- loc %5x: addr %2x, offs %4x, len %4x\n", loc, addr, ofs, EE_PG_SIZ / 2); + ee_wr(addr, ofs, buf, EE_PG_SIZ / 2); + } + + /* + * Put in a fresh header + */ + ee_wr(MR_ELOG_ADDR_LO, 0, (uint8_t *) &elog_preset, sizeof(elog_preset)); + ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr)); + printk("EEPROM erase complete\n"); + + ee_unlock(); + + /* + * Verify that the header stuck. + * If not, then complain to kernel log and set event capacity to 0 + */ + if (memcmp(hdr.signature, elog_preset.signature, sizeof(elog_preset.signature)) || + hdr.header_ver != elog_preset.header_ver || + hdr.rec_start != elog_preset.rec_start || + hdr.rec_size != elog_preset.rec_size || + hdr.hwtype != elog_preset.hwtype) { + /* + * Write EEPROM header failed. + * Leave a message in the kernel log about it. 
+ */ + printk("Error: EEPROM initialization failed!\n"); + printk("MCA events cannot be logged to EEPROM\n"); + ee_num = 0; + } + else { + ee_num = hdr.entries; + ee_head = hdr.rec_head; + ee_tail = hdr.rec_tail; + printk("EEPROM ready!\n"); + } + + + } +} + + +#if EE_PROC +/* + * Support for user space access to the EEPROM event log. + * Implemented as a 'proc' file named elog, who returns + * MCE events on read and on writes of 6 hex values + * per line creates new event(s) to be entered. + * + * Compile time configurable for disabling writes and + * choice of whether to dump new events or everything. + */ + +static struct proc_dir_entry * elog_pe; + +/* + * Write is just a simple file operation. + * We do not care about file offset since the specified event is to + * be added to the EEPROM at head+1, not at any arbitrary location. + */ + +static ssize_t +elog_write(struct file * file, const char __user * buff, size_t len, loff_t * off) +{ + char * buf; + uint16_t nxt; + McaRecord mr; + uint64_t ull[6]; + char * ep, * cp; + int i, err; + + /* + * Get input line into kernel space + */ + if (len > PAGE_SIZE -1) + len = PAGE_SIZE -1; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (! buf) + return -ENOMEM; + if (copy_from_user(buf, buff, len)) { + err = -EFAULT; + goto wr_out; + } + buf[len] = '\0'; + cp = ep = (char *) buf; + + /* + * Special case EEPROM reset option, + * first 5 letters form the word 'reset' + */ + if (!strncmp(buf, "reset", 5)) { + ee_mint(); + goto wr_one; + } + + /* + * Need 6 numbers for an event record + */ + for(i = 0; i < 6; i++) { + while(isspace(*cp)) + cp++; + ull[i] = simple_strtoull(cp, &ep, 16); + if (ep == cp || (*ep != '\0' && !isspace(*ep))) { + err = -EINVAL; + goto wr_out; + } + cp = ep; + } + +#if 0 + /* + * If we were to screen this the we should ensure that + * id[7:0] < CPU_MAX on org 0, 1, 2 + * < DBOX_NUM on org 3 + * == 0 on org 4 + * < GBOX_NUM on org 5 + * < TBOX_NUM on org 6 + * id[18:16] <= 6 + * id[23] == 0 + * id[31] == 1 + */ +#endif + + if (ee_num) { + mr.id = (uint32_t) ull[0]; + mr.stamp = (uint32_t) ull[1]; + mr.ctl = ull[2]; + mr.status = ull[3]; + mr.addr = ull[4]; + mr.misc = ull[5]; + + /* + * Add event record under I2C bus exclusive access + */ + ee_lock(); + nxt = (ee_head + 1) % ee_num; + ee_put(&mr, nxt); + ee_wr(MR_ELOG_ADDR_LO, offsetof(McaHeader, rec_head), (uint8_t *) &nxt, sizeof(nxt)); + ee_head = nxt; + ee_unlock(); + } + + /* + * Swallow any trailing junk up to next newline + */ +wr_one: + ep = strchr(buf, '\n'); + if (ep) + cp = ep + 1; + err = cp - buf; + +wr_out: + kfree(buf); + return err; +} + + +/* + * Use the sequencer to read one event at a time, + * in order of occurrence in the EEPROM. Sequence + * position is event index in range 0 .. ee_num, + * which will be offset by (ee_tail + 1) modulo + * ee_num if EE_PROC_NEW flag is set. + */ + +static int elog_eof; /* Elog end-of-file marker */ + +static int +elog_seq_show(struct seq_file * f, void * v) +{ + McaRecord mr; + int pos, nxt; + static int inv; + + pos = *(loff_t *) v; + + /* + * Print nice header on 1st read from /proc/elog + */ + if (! 
pos) { + extern struct mr_rsp_hwinf hwinf; + struct mr_rsp_hwinf * r = &hwinf; + + inv = 0; + seq_printf(f, "Card %c%c%c%c%c%c%c%c%c%c%c%c: " + "brd %d, fab %d, sku %d, rev %d, stp %d, sub %d\n", + r->serial[0], r->serial[1], r->serial[2], r->serial[3], + r->serial[4], r->serial[5], r->serial[6], r->serial[7], + r->serial[8], r->serial[9], r->serial[10], r->serial[11], + r->board, r->fab, r->sku, r->rev, r->step, r->substep); + if (ee_num) { + seq_printf(f, "Head %d, tail %d, cap %d\n", ee_head, ee_tail, ee_num); + seq_printf(f, "%5s %8s %12s %8s %16s %16s %16s %16s\n", + "index", "id", "id decode", "time", "ctrl", "status", "addr", "misc"); + } + else + seq_printf(f, "Error: EEPROM not initialized\n"); + } + + /* + * Set EOF and quit if EEPROM not accessible + */ + if (! ee_num) { + elog_eof = 1; + return 0; + } + + /* + * Get event under I2C bus exclusive access + */ +#if EE_PROC_NEW + nxt = (pos + ee_tail + 1) % ee_num; +#else + nxt = pos; +#endif + ee_lock(); + ee_get(&mr, nxt); + ee_unlock(); + +#if ! EE_PROC_NEW + /* + * We refuse to print invalid entries. + * However, a freshly reset EEPROM contains all 1s and + * therefore we won't rely on the valid-bit alone. + * Instead rely on the unused areas of 'id' to be 0s. + * Probably need to stop sequencer once a bad entry is + * seen because in all likelihood we've reached the + * log end and reading the remainder of the EEPROM will + * just be waste of time. + */ + if (GET_BITS(30, 25, mr.id) == 0x3f && + GET_BITS(21, 19, mr.id) == 0x07 && + GET_BITS(15, 8, mr.id) == 0xff) { + if (inv++ > 10) + elog_eof = 1; + return 0; + } +#endif + + seq_printf(f, "%5d %08x [%d %3d %c%c%c%c] %08x %016llx %016llx %016llx %016llx\n", + nxt, mr.id, + GET_BITS(18,16,mr.id), + GET_BITS(7,0,mr.id), + GET_BIT(22,mr.id) ? 'I' : ' ', + GET_BIT(23,mr.id) ? 'R' : ' ', + GET_BIT(24,mr.id) ? 'F' : ' ', + GET_BIT(31,mr.id) ? 'V' : ' ', + mr.stamp, mr.ctl, mr.status, mr.addr, mr.misc); + + return 0; +} + +static void * +elog_seq_start(struct seq_file * f, loff_t * pos) +{ + if (ee_num) { + if (*pos >= ee_num) + return NULL; +#if EE_PROC_NEW + /* + * Skip checks if we are dumping full log + */ + if (ee_head == ee_tail) + return NULL; + if (*pos && ((*pos + ee_tail) % ee_num) == ee_head) + return NULL; +#endif + } + + elog_eof = 0; + + return pos; +} + +static void * +elog_seq_next(struct seq_file * f, void * v, loff_t * pos) +{ + if (elog_eof) + return NULL; + + (*pos)++; + if (*pos >= ee_num) + return NULL; + +#if EE_PROC_NEW + /* + * No wrap checks if we are dumping full log + */ + { + int nxt; + + nxt = ((*pos) + ee_tail) % ee_num; + if (nxt == ee_head) + return NULL; + } +#endif + + return pos; +} + +static void +elog_seq_stop(struct seq_file * f, void * v) +{ +} + +static const struct seq_operations elog_seq_ops = { + .start = elog_seq_start, + .next = elog_seq_next, + .stop = elog_seq_stop, + .show = elog_seq_show, +}; + +static int +elog_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &elog_seq_ops); +} + +static struct file_operations proc_elog_operations = { + .open = elog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = elog_write, +}; + +#endif /* EE_PROC */ + + + +/* +** +** Validation hooks. +** +** ee_list List EEPROM contents to kernel log +** ee_wipe Clear EEPROM (after RAS testing) +** +** Used by validation, exported entry point +** Do not enable this in production code. 
+** +*/ + +void +ee_list(void) +{ + McaHeader hdr; + McaRecord rec; + int pos, i; + + /* + * Get I2C bus exclusive access + */ + ee_lock(); + + /* + * Read header + */ + ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr)); + if (! strncmp(hdr.signature, "MCA_LOG", sizeof(hdr.signature))) { + printk("MCE log header: signature OK, ver %d, type %d\n", + hdr.header_ver, hdr.hwtype); + printk("MCE log capacity: %d events, size %d, start %d\n", + hdr.entries, hdr.rec_size, hdr.rec_start); + printk("MCE log state: head %d, tail %d, full %d\n", + hdr.rec_head, hdr.rec_tail, hdr.logfull); + if (hdr.entries != MR_ELOG_NUM) { + printk("MCE log check: invalid capacity, expected %ld\n", MR_ELOG_NUM); + goto ee_bad; + } + if (hdr.rec_size != sizeof(McaRecord)) { + printk("MCE log check: invalid rec size, expected %ld\n", sizeof(McaRecord)); + goto ee_bad; + } + if (hdr.rec_tail != ee_tail || + hdr.rec_head != ee_head) { + printk("MCE log check: cached h/t mismatch %d/%d\n", ee_head, ee_tail); + goto ee_bad; + } + if (hdr.entries != ee_num) { + printk("MCE log check: cached capacity mismatch %d\n", ee_num); + goto ee_bad; + } + + /* + * Header looks OK, + * Dump all valid entries in eeprom + */ + for(i = 0; i < hdr.entries; i++) { + ee_get(&rec, i); + + /* + * Uninitialized parts have all FFs in them, + * need to screen those before testing the valid bit + */ + if (rec.id != 0xffffffff && GET_BIT(31, rec.id)) { +#if EE_VERIFY + dmp_hex(&rec, sizeof(rec), "ee_list: Entry[%d]", i); +#endif + pos = hdr.rec_start + i * hdr.rec_size; + printk("Log %4d (pos %06x): id %08x, " + "ctrl %016llx, stat %016llx, addr %016llx, misc %016llx, time %d\n", + i, pos, rec.id, rec.ctl, rec.status, + rec.addr, rec.misc, rec.stamp); + } + } + } + else { + printk("MCE log header: bad signature %02x%02x%02x%02x%02x%02x%02x%02x\n", + hdr.signature[0], hdr.signature[1], hdr.signature[2], hdr.signature[3], + hdr.signature[4], hdr.signature[5], hdr.signature[6], hdr.signature[7]); + } + +ee_bad: + /* + * Release I2C bus exclusive lock + */ + ee_unlock(); +} +EXPORT_SYMBOL_GPL(ee_list); + +void +ee_wipe(void) +{ +#if 1 + printk("Wiping EEPROM disabled, call ignored\n"); +#else + ee_mint(); +#endif +} +EXPORT_SYMBOL_GPL(ee_wipe); +#endif /* CONFIG_MK1OM */ + + +/* +** +** Setup access to the EEPROM on KnC +** This include initializing the local I2C driver and +** locating the next write position in the EEPROM. +** We want to limit the exception time activity to +** a minimum and thus make preparations up front. +** This is expected to happen before enabling the +** MC event intercepts. +** +*/ + +int +ee_init(void) +{ +#if 0 + /* + * Clocking the delay loop. + * Average results over 3 runs: + * uSec % off + * 1 12.46 + * 2 6.22 + * 4 4.34 + * 8 3.41 + * 16 2.90 + * 32 2.65 + * 64 2.52 + * 128 2.46 + * 256 2.43 + * 512 2.41 + * 1024 2.41 + * 2048 6.30 + * 4096 2.43 + * 8192 3.28 + * 16384 3.30 + * 32768 3.42 + * , which is fine for the purposes in this driver. + */ + { + uint64_t t1, t2; + uint64_t usec, pwr; + + printk("RAS.test: tsc_khz %d\n", tsc_khz); + for(pwr = 0; pwr < 16; pwr++) { + usec = 1UL << pwr; + t1 = rdtsc(); + myDELAY(usec); + t2 = rdtsc(); + printk("RAS.test: myDelay(%lld) => %lld clocks\n", usec, t2 - t1); + } + } +#endif + +#ifdef CONFIG_MK1OM + if (! mce_disabled) { + McaHeader hdr; + +#ifndef CONFIG_I2C_PXA + /* + * Reset I2C controller if PXA driver is not included in the kernel. + */ + i2c_reset(); +#endif + + /* + * Get I2C bus exclusive access + */ + ee_lock(); + + /* + * Paranoia!! 
+ * At this point the I2C controller should be inactive and + * the I2C bus should be idle. Verify this to be true. + * Note: This check is only applied on this very first + * access to the I2C controller. If it passed the + * two criterias we _assume_ we have good hardware. + * TBD: should we assume that the I2C subsystem can go bad + * at runtime and add more checking? + */ + ee_num = 0; + if ((reg_read(ISR_OFFSET) & ISR_UB) || (reg_read(IBMR_OFFSET) != 3)) { + printk("RAS.elog: I2C unit out of control, cannot access EEPROM\n"); + } + else { + /* + * Get EEPROM header and cache log state. + */ + ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr)); + if (memcmp(hdr.signature, elog_preset.signature, sizeof(elog_preset.signature)) || + hdr.header_ver != elog_preset.header_ver || + hdr.rec_start != elog_preset.rec_start || + hdr.rec_size != elog_preset.rec_size || + hdr.hwtype != elog_preset.hwtype) { + printk("RAS.elog: Found un-initialized EEPROM, initializing ..\n"); + ee_wr(MR_ELOG_ADDR_LO, 0, (uint8_t *) &elog_preset, sizeof(elog_preset)); + ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr)); + } + + if (memcmp(hdr.signature, elog_preset.signature, sizeof(elog_preset.signature)) || + hdr.header_ver != elog_preset.header_ver || + hdr.rec_start != elog_preset.rec_start || + hdr.rec_size != elog_preset.rec_size || + hdr.hwtype != elog_preset.hwtype) { + /* + * Write to EEPROM header failed. + * Leave a message in the kernel log about it and set capacity to 0. + */ + printk("RAS.elog: Error: EEPROM initialization failed!\n"); + } + else { + ee_num = hdr.entries; + ee_head = hdr.rec_head; + ee_tail = hdr.rec_tail; + printk("RAS.elog: rev %d, size %d, head %d, tail %d\n", + hdr.header_ver, ee_num, ee_head, ee_tail); + if (ee_head != ee_tail) { + /* + *TBD: should we be aggressive and replay these events to the host + * when it opens the MC SCIF channel to force the issue? + */ + printk("RAS.elog: Warning: MCA log has unprocessed entries\n"); + } + } + } + if (!ee_num) + printk("RAS.elog: MCA events cannot be logged to EEPROM\n"); + + /* + * Release I2C bus exclusive lock + */ + ee_unlock(); + } +#endif /* CONFIG_MK1OM */ + + /* + * Reset I2C bus & UART (sort of, internal reset only) + */ + xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO); + cons_init(); + ee_rdy = 1; + +#if defined(CONFIG_MK1OM) && EE_PROC + /* + * Create proc file + * We allow writes if EE_INJECT is defined or during manufacturing. + */ + { + int mode; +#if EE_INJECT + mode = 0644; +#else + uint32_t smc_err, smc_val, smc_fwv; + + /* + * HSD 4846538 + * Needs SMC FW 1.8 or later to be safe to use. + * Read FW version; if failed then not at manufacturing. + * If FW version 1.8 or later go read Zombie register. 
+ * If zombie register responded we're at manufacturing, + */ + mode = 0444; + smc_err = gmbus_i2c_read(2, 0x28, 0x11, (uint8_t *) &smc_fwv, sizeof(smc_fwv)); + if (smc_err == sizeof(smc_fwv) && GET_BITS(31, 16, smc_fwv) >= 0x0108) { + smc_err = gmbus_i2c_read(2, 0x28, 0x1b, (uint8_t *) &smc_val, sizeof(smc_val)); + if (smc_err == sizeof(uint32_t)) + mode = 0644; + } + if (mode == 0444) + proc_elog_operations.write = 0; +#endif + elog_pe = proc_create("elog", mode, 0, &proc_elog_operations); + } +#endif + +#if 0 + /* + * Say hello on the console + */ + ee_printk("RAS: ee_print ready, uart adr %02x\n", + SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0)); +#endif + + if (mce_disabled) + printk("RAS.elog: disabled\n"); + else + printk("RAS.elog: init complete\n"); + return 0; +} + + +/* + * Cleanup for module unload. + * Free any resources held by this driver + */ + +int +ee_exit(void) +{ +#if defined(CONFIG_MK1OM) && EE_PROC + if (elog_pe) { + remove_proc_entry("elog", 0); + elog_pe = 0; + } +#endif + + + /* + * Reset I2C bus & UART (sort of, internal reset only) + */ + ee_rdy = 0; + xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO); + cons_exit(); + + printk("RAS.elog: exit complete\n"); + return 0; +} + +#endif /* EMULATION */ diff --git a/ras/micras_knc.c b/ras/micras_knc.c new file mode 100644 index 0000000..86ec013 --- /dev/null +++ b/ras/micras_knc.c @@ -0,0 +1,2794 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS MT module driver + * + * Code and data structures to handle get/set tasks for KnC. + * Parties accessing the data structures are supposed to use the + * micras_mt_tsk() routines to ensure integrity and consistency. 
+ * Particularly important when handling sysfs nodes and actions + * requested from SCIF connections must use that method in order + * to guarantee serialized access. + * + * Even if read-only access to latest valid data is required, + * it should go through micras_mt_tsk() using dedicated handlers + * in this module. + * + * Apologies for the messy code, but hardware support to report + * board properties at this time (Power-On of A0) is so erratic + * that odd ways of obtaining the info had to replace the POR + * methods. The SMC support is sporadic, A0 has issues with SVID + * and some SBOX registers are invalid because they depend on + * TMU telemetry transmissions from the SMC which some reason + * has been forgotten/missed/defeatured (does not happen). + * + * TBD: Once the dust settles there will be code to remove. + * But until then, lots of #ifdef's remains. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" + + +/* + * Persistent data accessible through the CP api. + * Some functions just read/modify hardware CSRs + * and thus need no storage between invocations. + */ + +extern struct mr_rsp_hwinf hwinf; +extern struct mr_rsp_vers vers; +extern struct mr_rsp_volt volt; +extern struct mr_rsp_freq freq; +extern struct mr_rsp_power power; +extern struct mr_rsp_plim plim; +extern struct mr_rsp_gddr gddr; +extern struct mr_rsp_gvolt gvolt; +extern struct mr_rsp_gfreq gfreq; +extern struct mr_rsp_temp temp; +extern struct mr_rsp_ecc ecc; +extern struct mr_rsp_trbo trbo; +extern struct mr_rsp_pmcfg pmcfg; + +#if USE_SVID +static uint8_t vccp_cap, vddq_cap, vddg_cap; +static uint8_t vccp_imax, vddq_imax, vddg_imax; +#endif + +uint8_t xlat_cpu[NR_CPUS]; + +#define FIX_DBOX 1 + +#if FIX_DBOX +/* + * Pre-emptive restoring DBOX-0 register access. + * A glitch during clock speed changes (PM or GPU_HOT) + * may under some rare circumstances break access to DBOX + * registers. It is very rare, requires hours of tailored + * simulation to reproduce, never seen in the wild (yet). + * The gmbus controller sits in the DBOX and is affected. + * Calling this routine prior to every gmbus read/write + * reduces risk of hitting this bug to a single SMC register, + * which has been deemed acceptable for B-step KnCs. + * Only alternative is to perform repeated transaction(s) + * until a stable result is obtained, which will be costly + * in performance. + */ +static void +mr_smc_deglitch(void) +{ + mr_dbox_rl(0, 0x600); + mr_dbox_rl(0, 0x2440); +} +#else +#define mr_smc_deglitch(); /* As nothing */ +#endif + + +/* +** +** Conversion between CP formats (uV, MHz, etc.) +** and hardware register formats (SMC and VRs mostly). +** +*/ + + +/* + * PLL tables used to map between hw scale register + * value and actual frequencies given a fixed base. + * + * The core frequency (MCLK) formula is + * freq = Icc * (Feedback / Feedforward) + * where + * Icc = Frequency generated from ICC, nominal 200 MHz + * FeedBack = ratio bits 8:1 (valid range: 8 .. 16) + * FeedForward = ratio bits 10:9 (01 -> 4, 10 -> 2, 11 -> 1) + * + * The gddr frequency (PGCLK) formula is + * freq = (X / 2) * Feedback / Feedforward + * where + * X = SBPLL (ICC) Table 1, FB range 10..22 + * X = LCVCO (ICC/2) Table 2, FB range 44..65 + * X = Bypass (ICC/2) Table 3, FB range 20..44 + * which is why there's three gddr tables. The divide by 2 of + * 'X' is represented as doubling the FF dividers in the tables. 
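+ *
+ * Worked example (illustrative numbers only): with ICC at its nominal
+ * 200 MHz, a core ratio with feedforward code 11 (divide by 1) and
+ * feedback 11 encodes 200 * 11 / 1 = 2200 MHz; a gddr ratio from the
+ * first table with doubled divider 2 and feedback 20 encodes
+ * 200 * 20 / 2 = 2000 MHz.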
+ * + * Overlapping ranges over feedback and feedforward values are + * handled by range table(s) below such that lower frequencies + * can be selected at a finer granularity. The tables themselves + * do not allow overlaps, i.e. two ways to specify the same + * PLL output frequency. + * + * Note that ICC clocks have their own PLL built in which uses + * the PCI-E 100 MHz clock, adds SSC and scale it by a pair of + * dividers. One divider is (I'm told) fixed at 40, the other + * is fused, and none of them can be read from uOS at runtime. + * The fused dividers are nominally 20, which is what the + * tables below is based on. Some SKUs tweak the core ICC PLL + * by fuses, so to counter it that divider is reported in scr #4. + * No means to know if gddr ICC PLL gets tweaked too. + * + *WARNING: there are overlabs on the divider codes for GDDR PLLs, + * which theoretically can cause false reporting of GDDR + * device speeds (example: FB dividers 20, 21, and 22 are + * defined both in gddr_tab1 and gddr_tab3). Currently + * there is no way to determine which table is used. + */ + +struct pll_tab { + uint8_t clk_div; /* Feed forward */ + uint8_t min_mul; /* Lower feedback */ + uint8_t max_mul; /* Upper feedback */ + uint16_t min_clk; /* Lower frequency */ + uint16_t max_clk; /* Upper frequency */ + uint8_t step_size; /* Granularity */ +} cpu_tab[] = { /* CPU PLL, ICC @ ~200 MHz */ + {1, 8, 16, 1600, 3200, 200}, + {2, 8, 15, 800, 1500, 100}, + {4, 8, 15, 400, 750, 50}, +}, gddr_tab1[] = { /* GDDR PLL, ICC @ 200 MHz */ + {2, 10, 22, 1000, 2200, 100}, + {4, 10, 22, 500, 1100, 50}, + {8, 10, 22, 250, 550, 25}, +}, gddr_tab2[] = { /* GDDR PLL, LCVCO @ 100 MHz */ + {2, 44, 65, 2200, 3250, 50}, +}, gddr_tab3[] = { /* GDDR PLL, ICC bypass @ 100 MHz */ + {2, 20, 44, 1000, 2200, 100}, + {4, 20, 44, 500, 1100, 50}, + {8, 20, 44, 250, 550, 25}, +}; + +#define ICC_NOM 20 /* Nominal ICC feed back divider */ + +static uint16_t +ratio2freq(uint16_t ratio, struct pll_tab * tab, int tablen, uint16_t base) +{ + uint16_t fwd, bck; + + fwd = GET_BITS(10, 9, ~ratio); + bck = GET_BITS(8, 1, ratio); + + if (tab == gddr_tab3 && (bck & 1)) + return 0; + + if (fwd < tablen && bck >= tab[fwd].min_mul && bck <= tab[fwd].max_mul) + return (base * bck) / tab[fwd].clk_div; + + return 0; +} + +static uint16_t +freq2ratio(uint16_t freq, struct pll_tab * tab, int tablen, uint16_t base) +{ + int fwd; + + for(fwd = tablen - 1; fwd >= 0; fwd--) { + if (freq >= tab[fwd].min_clk && freq <= tab[fwd].max_clk) { + /* + * Why bother check for accurate input? + * Ignoring it just rounds down to nearest supported! + */ + if (freq % tab[fwd].step_size) + break; + + return PUT_BITS(10, 9, ~fwd) | + PUT_BITS( 8, 1, (freq * tab[fwd].clk_div) / base); + } + } + + return 0; +} + +static uint32_t +icc_fwd(void) +{ + uint32_t scr4, div; + + scr4 = mr_sbox_rl(0, SBOX_SCRATCH4); + div = GET_BITS(29, 25, scr4); + + return div ? div : ICC_NOM; +} + +static uint32_t +mr_mt_gf_r2f(uint16_t pll) +{ + uint64_t freq; + + /* + * As per HSD 4118175, ICC clock at 200 MHz is currently not + * used on any SKUs, and is unlikely to be used in the future. + * Therefore, the 100 MHz tables are searched first. + */ + freq = ratio2freq(pll, gddr_tab3, ARRAY_SIZE(gddr_tab3), 100); + if (! freq) + freq = ratio2freq(pll, gddr_tab2, ARRAY_SIZE(gddr_tab2), 100); + if (! 
freq) + freq = ratio2freq(pll, gddr_tab1, ARRAY_SIZE(gddr_tab1), 200); + + return 1000 * freq; +} + +static uint32_t +mr_mt_cf_r2f(uint16_t pll) +{ + uint64_t freq; + + freq = ratio2freq(pll, cpu_tab, ARRAY_SIZE(cpu_tab), 200); + + return (1000 * freq * ICC_NOM) / icc_fwd(); +} + + +#if USE_SVID +/* + * VRM12 voltage converters + * Only bits 7:0 are being used as follows: + * Volt = Min + Res * (Bits -1) + * Bits = 1 + (Volt - Min) / Res + * Bits value of 0 reserved for turning VR off. + */ + +#define VRM12_MAX 1520000 /* 1.52 V */ +#define VRM12_MIN 250000 /* 250 mV */ +#define VRM12_RES 5000 /* 5.0 mV */ + +static uint32_t +svid2volt(uint8_t svid) +{ + uint32_t bits; + + bits = GET_BITS(7, 0, svid); + if (bits) + return VRM12_MIN + VRM12_RES * (bits - 1); + else + return 0; +} + +static uint8_t +volt2svid(uint32_t uv) +{ + uint32_t delta, bits; + + bits = 0; + if (uv >= VRM12_MIN && uv <= VRM12_MAX) { + delta = uv - VRM12_MIN; + /* + * Why bother check for accurate input? + * Ignoring it just rounds up to nearest! + */ + if (! (delta % VRM12_RES)) + bits = 1 + delta / VRM12_RES; + } + return PUT_BITS(7, 0, bits); +} + + +/* + * SVID register scaling: + * + * Vin = SVID_REG(0x1A) + * Iin = SVID_REG(0x19) 1:1 A + * Pin = SVID_REG(0x1B) 1:1 W + * Vout = SVID_REG(0x16) / 128 V + * Iout = SVID_REG(0x15) 1:1 A + * Pout = SVID_REG(0x18) 1:1 W + * Iout = (SVID_REG(0x15) / ADCmax) * (SVID_REG(0x21) A + * Temp = SVID_REG(0x17) 1:1 C + * + * Note: SVID_REG(0x06) bit 7 tells Iout formula. + * Assuming 8-bit ADC => ADCmax to be 0xff. + * + * Inputs are SVID register values, outputs are u{V|A|W}. + */ + +static uint32_t +vout2volt(uint8_t vout) +{ + /* + * Linear range from 0 to 2 volt + */ + return (((uint32_t) vout) * 1000000) / 128; +} + +static uint32_t +vin2volt(uint8_t vin) +{ + /* + * Formula not known. + */ + return (((uint32_t) vin) * 1000000) / 128; +} + +static uint32_t +one2one(uint8_t in) +{ + return ((uint32_t) in) * 1000000; +} + +static uint32_t +iout2amp(uint8_t iout, uint8_t cap, uint8_t imax) +{ + if (GET_BITS(7, 7, cap)) + return (((uint32_t) iout) * ((uint32_t) imax) * 1000000) / 256; + else + return one2one(iout); +} + +#define iin2amp(iin) one2one(iin) +#define pin2watt(pin) one2one(pin) +#define pout2watt(pout) one2one(pout) + + + +/* +** +** Simple SVIDCONTROL interface. +** +** 0 Parity bit out +** 8:1 SVID data out +** 13:9 SVID command +** 17:14 SVID address +** 18 Parity bit in (if any) +** 26:19 SVID data in (if any) +** 27 ACK #0 +** 28 ACK #1 +** 29 SVID Error +** 30 CTL Idle +** 31 CMD Start +** +** See SBOX HAS for more details. +** One transaction is expected to finish +** in less than 2 uSec (15.625 MHz clock) +** and busy waiting here should be OK. 
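+**
+** Worked example (illustrative, using the encodings above and the
+** VR12 defines below): reading register 0x16 (Vout) from the VCCP
+** rail uses dev 0x0, command 0x07, data 0x16, so bits 17:14 = 0,
+** 13:9 = 0x07 and 8:1 = 0x16, i.e. 0x0E2C. Six bits are set, so the
+** even-parity bit 0 stays 0 and the transaction is started by writing
+** 0x0E2C with bit 31 set to SVIDCONTROL.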
+** +** Return values: +** 0 OK +** 1-7 Controller bits 29:27 +** 8 Parameter error (invalid device or opcode) +** +*/ + +/* + * SVID command set + * Source: SVID Protocol rev 1.5 + */ +#define VR12Cmd_Extend 0x00 /* Req */ +#define VR12Cmd_SetVID_Fast 0x01 /* Req */ +#define VR12Cmd_SetVID_Slow 0x02 /* Req */ +#define VR12Cmd_SetVID_Decay 0x03 /* Req */ +#define VR12Cmd_SetPS 0x04 /* Req */ +#define VR12Cmd_SetRegADR 0x05 /* Req */ +#define VR12Cmd_SetRegDAT 0x06 /* Req */ +#define VR12Cmd_GetReg 0x07 /* Req */ +#define VR12Cmd_TestMode 0x08 /* Req */ + +/* + * SVID registers of interest + * Source: SVID Protocol rev 1.5 + * + * Notes on the capability register: + * bit 0 Iout (0x15) + * bit 1 Vout (0x16) + * bit 2 Pout (0x18) + * bit 3 Iin (0x19) + * bit 4 Vin (0x1a) + * bit 5 Pin (0x1b) + * bit 6 Temp (0x17) + * bit 7 Iout format of register 0x15 + * 0 -> value in Amps + * 1 -> value scaled to Icc_Max + */ + +#define VR12Reg_VendorID 0x00 /* Req */ +#define VR12Reg_ProductID 0x01 /* Req */ +#define VR12Reg_ProductRev 0x02 /* Req */ +#define VR12Reg_ProductDate 0x03 /* Opt */ +#define VR12Reg_LotCode 0x04 /* Opt */ +#define VR12Reg_ProtocolID 0x05 /* Req */ +#define VR12Reg_Capability 0x06 /* Req */ +#define VR12Reg_Iout 0x15 /* Req */ +#define VR12Reg_Vout 0x16 /* Opt */ +#define VR12Reg_Temp 0x17 /* Opt */ +#define VR12Reg_Pout 0x18 /* Opt */ +#define VR12Reg_Iin 0x19 /* Opt */ +#define VR12Reg_Vin 0x1a /* Opt */ +#define VR12Reg_Pin 0x1b /* Opt */ +#define VR12Reg_Icc_Max 0x21 /* Req */ +#define VR12Reg_Temp_Max 0x22 /* Req */ +#define VR12Reg_Vout_Max 0x30 /* Req */ +#define VR12Reg_VID_Set 0x31 /* Req */ + +/* + * SVID addresses on KnC + */ +#define SVID_VCCP 0x0 /* Core rail */ +#define SVID_VDDQ 0x2 /* Memory rail (1st loop) */ +#define SVID_VDDG 0x3 /* Uncore rail (2nd loop) */ + +static DEFINE_SPINLOCK(svidcontrol_lock); + +static int +SvidCmd(uint8_t dev, uint8_t op, uint8_t in) +{ + uint32_t cmd, ret, err; + + /* + * The SVID Controller does not work in A0 (HSD 3498464) + * Pretend success, but return 0 always + */ + return 0; + + /* + * For now just check that command can be contructed. + * + *TBD: Add stricter parameter check? + */ + if (dev > GET_BITS(17, 14, ~0) || + op > GET_BITS(13, 9, ~0)) + return -MR_ERR_SMC; + + /* + * Craft 18 bit command with even parity + */ + cmd = PUT_BITS( 8, 1, in) | + PUT_BITS(13, 9, op) | + PUT_BITS(17, 14, dev); + if (bitmap_weight((unsigned long *) &cmd, 18) & 1) + cmd |= 1; + + /* + * Wait until controller in idle state, + * write command + start bit and then + * wait for controller to be idle again. + */ + spin_lock(&svidcontrol_lock); + for( ;; ) { + ret = mr_sbox_rl(0, SBOX_SVIDCONTROL); + if (GET_BITS(31, 30, ret) == 0x1) + break; + } + mr_sbox_wl(0, SBOX_SVIDCONTROL, cmd | PUT_BIT(31, 1)); + for( ;; ) { + ret = mr_sbox_rl(0, SBOX_SVIDCONTROL); + if (GET_BITS(31, 30, ret) == 0x1) + break; + } + spin_lock(&svidcontrol_lock); + + /* + * Report command status + * Only if SVID_Error = 0, Ack #1 = 1, and Ack #0 = 0 + * did we have a successful transfer, and have data + * to return (SBOX HAS table 9). + */ + err = GET_BITS(29, 27, ret); + return (err == 0x2) ? GET_BITS(26, 19, ret) : -MR_ERR_SMC; +} +#endif + + + +/* +** +** SMC API +** +** See "Knights Corner System Managment Architecture Specification" +** for details on the SMC internals and supported APIs. 
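+**
+** For orientation (register value assumed purely for illustration,
+** field layout as used in mr_mt_card_init below): a 4-byte read of
+** register 0x11 returning 0x01080014 would decode as SMC firmware
+** 1.8, build 20 (major in bits 31:24, minor in 23:16, build in 15:0).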
+** +** This module is based on rev 0.31 +** +*/ + +#define MR_SMC_ADDR 0x28 /* SMC DVO-B Slave address */ + +#define MR_SMC_PCI_VID 0x00 /* PCI Vendor ID, 4 */ +#define MR_SMC_PCI_DID 0x02 /* PCI Device ID, 4 */ +#define MR_SMC_PCI_BCC 0x04 /* PCI Base Class Code, 4 */ +#define MR_SMC_PCI_SCC 0x05 /* PCI Sub Class Code, 4 */ +#define MR_SMC_PCI_PI 0x06 /* PCI Programming Interface, 4 */ +#define MR_SMC_PCI_SMBA 0x07 /* PCI MBus Manageability Address, 4 */ +#define MR_SMC_UUID 0x10 /* Universally Unique Identification, 16 */ +#define MR_SMC_FW_VERSION 0x11 /* SMC Firmware Version, 4 */ +#define MR_SMC_EXE_DOMAIN 0x12 /* SMC Execution Domain, 4 */ +#define MR_SMC_STS_SELFTEST 0x13 /* SMC Self-Test Results, 4 */ +#define MR_SMC_HW_REVISION 0x14 /* SMC Hardware Revision, 4 */ +#define MR_SMC_SERIAL 0x15 /* Card serial number, 12 */ +#define MR_SMC_SMB_RESTRT 0x17 /* Restart SMBus addr negotiation, 4 */ + +#define MR_SMC_CPU_POST 0x1a /* POST Register, 4 */ +#define MR_SMC_ZOMBIE 0x1b /* Zombie Mode Enable, 4 */ +#define MR_SMC_CPU_ID 0x1c /* CPU Identifier, 4 */ + +#define MR_SMC_SEL_ENTRY_SEL 0x20 /* SEL Entry Selection Register, 4 */ +#define MR_SMC_SEL_DATA 0x21 /* SEL Data register, */ +#define MR_SMC_SDR_ENTRY_SEL 0x22 /* SDR Entry Selection Register, 4 */ +#define MR_SMC_SDR_DATA 0x23 /* SDR Data register, */ + +#define MR_SMC_PWR_PCIE 0x28 /* PCIe Power Reading, 4 */ +#define MR_SMC_PWR_2X3 0x29 /* 2x3 Power Reading, 4 */ +#define MR_SMC_PWR_2X4 0x2a /* 2x4 Power Reading, 4 */ +#define MR_SMC_FORCE_TTL 0x2b /* Forced Throttle, 4 */ +#define MR_SMC_PWR_LIM_0 0x2c /* Power Limit 0, 4 */ +#define MR_SMC_TIME_WIN_0 0x2d /* Time Window 0, 4 */ +#define MR_SMC_PWR_LIM0_GRD 0x2e /* Power Limit 0 Guardband, 4 */ +#define MR_SMC_PWR_LIM_1 0x2f /* Power Limit 1, 4 */ +#define MR_SMC_TIME_WIN_1 0x30 /* Time Window 1, 4 */ +#define MR_SMC_INCL_3V3 0x31 /* Include 3.3 V, 4 */ +#define MR_SMC_PWR_LIM_PERS 0x32 /* Power Limit Persistence, 4 */ +#define MR_SMC_CLAMP_MODE 0x33 /* Clamp Mode, 4 */ +#define MR_SMC_ENERGY_STS_0 0x34 /* Energy Status 0, 4 */ +#define MR_SMC_AVG_PWR_0 0x35 /* Average Power 0, 4 */ +#define MR_SMC_AVG_PWR_1 0x36 /* Average Power 1, 4 */ +#define MR_SMC_MIN_PWR 0x37 /* Min Power, 4 */ +#define MR_SMC_PWR_TTL_DUR 0x38 /* Power Throttle Duration, 4 */ +#define MR_SMC_PWR_TTL 0x39 /* Power Throttling, 4 */ +#define MR_SMC_PWR_INST 0x3a /* Instantaneous Power Reading, 4 */ +#define MR_SMC_PWR_IMAX 0x3b /* Maximum Power Reading, 4 */ +#define MR_SMC_VOLT_VCCP 0x3c /* VCCP VR Output Voltage, 4 */ +#define MR_SMC_VOLT_VDDQ 0x3d /* VDDQ VR Output Voltage, 4 */ +#define MR_SMC_VOLT_VDDG 0x3e /* VDDG VR Output Voltage, 4 */ + +#define MR_SMC_TEMP_CPU 0x40 /* CPU DIE Temperature, 4 */ +#define MR_SMC_TEMP_EXHAUST 0x41 /* Card Exhaust Temperature, 4 */ +#define MR_SMC_TEMP_INLET 0x42 /* Card Inlet Temperature, 4 */ +#define MR_SMC_TEMP_VCCP 0x43 /* VCCP VR Temperature, 4 */ +#define MR_SMC_TEMP_VDDG 0x44 /* VDDG VR Temperature, 4 */ +#define MR_SMC_TEMP_VDDQ 0x45 /* VDDQ VR Temperature, 4 */ +#define MR_SMC_TEMP_GDDR 0x46 /* GDDR Temperature, 4 */ +#define MR_SMC_TEMP_EAST 0x47 /* East Temperature, 4 */ +#define MR_SMC_TEMP_WEST 0x48 /* West Temperature, 4 */ +#define MR_SMC_FAN_TACH 0x49 /* Fan RPM, 4 */ +#define MR_SMC_FAN_PWM 0x4a /* Fan PWM Percent, 4 */ +#define MR_SMC_FAN_PWM_ADD 0x4b /* Fan PWM Adder, 4 */ +#define MR_SMC_TCRITICAL 0x4c /* KNC Tcritical temperature, 4 */ +#define MR_SMC_TCONTROL 0x4d /* KNC Tcontrol temperature, 4 */ +#define MR_SMC_TRM_TTL_DUR 0x4e /* Thermal 
Throttle Duration, 4 */ +#define MR_SMC_TRM_TTL 0x4f /* Thermal Throttling, 4 */ +#define MR_SMC_TRM_PUSH 0x50 /* Target for die temp push, 4 */ + +#define MR_SMC_PWR_VCCP 0x58 /* VCCP VR Output Power, 4 */ +#define MR_SMC_PWR_VDDQ 0x59 /* VDDQ VR Output Power, 4 */ +#define MR_SMC_PWR_VDDG 0x5a /* VDDG VR Output Power, 4 */ + +#define MR_SMC_LED_CODE 0x60 /* LED blink code, 4 */ + + +/* + * Simple I/O access routines for most SMC registers. + * All but UUID & SERIAL are 4 bytes in size. + */ +#define SMC_TRACK 0 + +#if SMC_TRACK +#define RL printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, reg, *val, rl) +#define WL printk("%s: %2x <- %08x, rtn %d\n", __FUNCTION__, reg, *val, rl) +#else +#define RL /* As nothing */ +#define WL /* As nothing */ +#endif + +#ifdef MIC_IS_EMULATION +/* + * Emulation does not handle I2C busses. + * Therefore all code that deals with I2C needs to be + * replaced with harmless substitutes in emulation. + * The following stubs are for emulation only. + */ +int +gmbus_i2c_read(uint8_t d, uint8_t a, uint8_t r, uint8_t * v, uint16_t l) +{ + if (v && l) + memset(v, 0, l); + return l; +} + +int +gmbus_i2c_write(uint8_t d, uint8_t a, uint8_t r, uint8_t * v, uint16_t l) +{ + return l; +} +#endif /* EMULATION */ + +static char * +gm_err(int err) +{ + char * str = "unknown"; + + switch(err) { + case -1: str = "timeout"; break; + case -2: str = "ack timeout"; break; + case -3: str = "interrupted"; break; + case -4: str = "invalid command"; break; + } + + return str; +} + + +int +mr_smc_rd(uint8_t reg, uint32_t * val) +{ + int rl; + + mr_smc_deglitch(); + rl = gmbus_i2c_read(2, MR_SMC_ADDR, reg, (uint8_t *) val, sizeof(*val)); + RL; + if (rl == sizeof(uint32_t)) + return 0; + + /* + * Something failed, do a dummy read to get I2C bus in a known good state. + *TBD: Do retries, and if so how many? + */ + printk("smc_rd: error %d (%s), reg %02x\n", rl, gm_err(rl), reg); + mr_smc_deglitch(); + gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_FW_VERSION, (uint8_t *) &rl, sizeof(rl)); + *val = 0; + return 1; +} + +int +mr_smc_wr(uint8_t reg, uint32_t * val) +{ + int rl; + + WL; + mr_smc_deglitch(); + rl = gmbus_i2c_write(2, MR_SMC_ADDR, reg, (uint8_t *) val, sizeof(*val)); + if (rl == sizeof(uint32_t)) + return 0; + + /* + * Something failed, do a dummy read to get I2C bus in a known good state. + *TBD: Do retries, and if so how many? + */ + printk("smc_wr: error %d (%s), reg %02x\n", rl, gm_err(rl), reg); + mr_smc_deglitch(); + gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_FW_VERSION, (uint8_t *) &rl, sizeof(rl)); + return 0; +} +#undef RL +#undef WL + + +/* + * Bypass for SMC access. + * Kind of a backdoor really as it allows for raw access to the SMC which + * may be device dependent and vary significantly between SMC firmware + * revisions. This is intended for host side tools that (hopefully) know + * what they are receiving through this interface. There is a 'set' command + * too, which we screen heavily since the SMC controls board cooling and + * therefore is critical for the cards safe operation envolope. + */ + +int +mr_get_smc(void * p) +{ + int rtn; + uint32_t parm; + struct mr_rsp_smc * r; + + parm = * (uint32_t *) p; + if (GET_BITS(31, 8, parm)) + return -MR_ERR_RANGE; + r = (struct mr_rsp_smc *) p; + + r->reg = GET_BITS(7, 0, parm); + + /* + * These cannot be read by anybody + */ + if (r->reg > MR_SMC_LED_CODE || + r->reg == MR_SMC_ZOMBIE) + return -MR_ERR_PERM; + + /* + * These can only be read by root + */ + if (! 
micras_priv) + switch(r->reg) { + case MR_SMC_SEL_ENTRY_SEL: + case MR_SMC_SEL_DATA: + case MR_SMC_SDR_ENTRY_SEL: + case MR_SMC_SDR_DATA: + return -MR_ERR_PERM; + } + + /* + * Determine how wide the SMC register is + */ + switch(r->reg) { + case MR_SMC_UUID: + r->width = 16; + break; + case MR_SMC_SERIAL: + r->width = 12; + break; + default: + r->width = 4; + } + + mr_smc_deglitch(); + rtn = gmbus_i2c_read(2, MR_SMC_ADDR, r->reg, (uint8_t *) &r->rtn, r->width); +#if SMC_TRACK + printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, r->reg, r->rtn.val, rtn); +#endif + if (rtn != r->width) { + /* + * Failed once, try one more time + *TBD: insert a known good read before the actual retry? + */ + mr_smc_deglitch(); + rtn = gmbus_i2c_read(2, MR_SMC_ADDR, r->reg, (uint8_t *) &r->rtn, r->width); +#if SMC_TRACK + printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, r->reg, r->rtn.val, rtn); +#endif + + if (r->reg == MR_SMC_SERIAL) { + memcpy((uint8_t *) &r->rtn, hwinf.serial, r->width); + rtn = r->width; + } + } + + if (rtn != r->width) + return -MR_ERR_SMC; + + return sizeof(*r); +} + + +int +mr_set_smc(void * p) +{ + uint8_t reg; + uint16_t width; + int rtn; + uint32_t val, parm; + + parm = * (uint32_t *) p; + reg = GET_BITS(31, 24, parm); + + /* + * Screen for registers we allow setting. + * POST register is accessible to everyone, + * only root can 'SET' anything beyond that. + */ + if (micras_priv) { + switch (reg) { + case MR_SMC_CPU_POST: + case MR_SMC_SEL_ENTRY_SEL: + case MR_SMC_SDR_ENTRY_SEL: + case MR_SMC_SMB_RESTRT: + case MR_SMC_FORCE_TTL: + case MR_SMC_PWR_LIM_0: + case MR_SMC_TIME_WIN_0: + case MR_SMC_PWR_LIM_1: + case MR_SMC_TIME_WIN_1: + case MR_SMC_INCL_3V3: + case MR_SMC_PWR_LIM_PERS: + case MR_SMC_CLAMP_MODE: + case MR_SMC_FAN_PWM_ADD: + case MR_SMC_LED_CODE: + break; + default: + return -MR_ERR_PERM; + } + } + else { + switch (reg) { + case MR_SMC_CPU_POST: + break; + default: + return -MR_ERR_PERM; + } + } + + /* + * Screen against known SMC register widths. + * We insist that unused upper bits are zeros + */ + switch (reg) { + case MR_SMC_SEL_ENTRY_SEL: + case MR_SMC_SDR_ENTRY_SEL: + case MR_SMC_FAN_PWM_ADD: + val = GET_BITS(7, 0, parm); /* 8-bit registers */ + break; + case MR_SMC_PWR_LIM_0: + case MR_SMC_TIME_WIN_0: + case MR_SMC_PWR_LIM_1: + case MR_SMC_TIME_WIN_1: + val = GET_BITS(15, 0, parm); /* 16 bit registers */ + break; + case MR_SMC_CPU_POST: + val = GET_BITS(23, 0, parm); /* 24 bit registers */ + break; + default: + val = GET_BIT(0, parm); /* Booleans */ + } + if (val != GET_BITS(23, 0, parm)) + return -MR_ERR_INVAUX; + + width = 4; + mr_smc_deglitch(); + rtn = gmbus_i2c_write(2, MR_SMC_ADDR, reg, (uint8_t *) & val, width); +#if SMC_TRACK + printk("%s: %2x <- %08x, rtn %d\n", __FUNCTION__, reg, val, rtn); +#endif + if (rtn != width) + return -MR_ERR_SMC; + + return 0; +} + + +/* + * IPMI interface. + * The SMC has a connection to the host's board management software, which + * usually resides in a dedicated Board Management Controller, of which the + * SMC is supposed to be a registered satellite controller (aka. additional + * management controller). As such the SMC can receive controls originating + * from any valid IPMI session on things like power limits, but it can also + * add events to the non-volatile IPMI System Events Log for things like + * reporting catastrophic failures that otherwise might be lost because the + * main processors might be disabled (section 1.7.6 in IPMI spec 2.0 E5). 
+ * In RAS context we'd want to let the SM know if fatal MC events occur + * and possibly also if the uOS crashes, such that remote management can + * be alerted via standard IPMI mechanisms. + * + * Input to this routine is an MceInfo record and an 'in-exception context' + * flag. It is still TBD what exactly to tell the SMC, but it is expected + * that all relevant info is in the MceInfo record. + */ + +void +micras_mc_ipmi(struct mce_info * mc, int ctx) +{ +} + + +#if !(USE_SVID || USE_SMC) +/* + * Board voltage sense converter + * Two 10 bit read-outs from SBOX register 0x1038. + * The format is very poorly documented, so no + * warranty on this conversion. Assumption is + * the reading is a binary fixed point number. + * bit 15 Valid reading if set + * bit 9:8 2 bit integer part + * bit 7:0 8 bit fraction part + * Return value is 0 (invalid) or voltage i uV. + */ + +uint32_t +bvs2volt(uint16_t sense) +{ + uint32_t res, f, msk; + + if (! GET_BIT(15, sense)) + return 0; + + /* + * First get integer contribution + * Then accumulate fraction contributions. + * Divide and add fraction if corresponding bit set. + */ + res = 1000000 * GET_BITS(9, 8, sense); + for(msk = (1 << 7), f = 1000000/2; msk && f; msk >>= 1, f >>= 1) + if (sense & msk) + res += f; + + return res; +} +#endif + + + +/* +** +** Initializations +** +** This has two intended purposes: +** - Do a on-time effort to collect info on properties that +** are not going to change after the initial setup by +** either bootstrap or kernel initialization. +** - Collect initial values on things we can modify. +** Intent is that unloading the ras module should reset +** all state to that of the time the module was loaded. +** +*/ + + +/* + *TBD: substitute with official defines when availble. + */ +#define KNC_FLASH_TAB 0x0FFF76000 /* Yes, it's below 4GB */ +#define KNC_FLASH_FILT 0x400 /* Correctable MC event filter */ +#define KNC_FLASH_BASE 0x0FFFA8000 /* Yes, it's below 4GB */ +#define KNC_FLASH_SIZE 0x2000 /* 8 KB according to Scott */ +#define KNC_FLASH_BOOT1 0x1274 /* Fboot1 version string */ +#define KNC_FLASH_BOOTB 0x02b8 /* Fboot1 backup version string */ +#define KNC_MP_PHYS 0x9e000 /* Location of MP table */ +#define KNC_MPF_SIG 0xa0afb2a0 /* String "_PM_" inverted */ +#define KNC_MPC_SIG 0x504d4350 /* String "PCMP" */ + +static void +get_cpu_table(void) +{ + struct mpf_intel * mpf; + struct mpc_table * mpc; + struct mpc_cpu * mpp; + uint8_t * ptr, * ep; + + mpf = phys_to_virt((phys_addr_t) KNC_MP_PHYS); + if (mpf) { + if (*((uint32_t *) mpf->signature) != KNC_MPF_SIG) { + printk("MP FP signature not found, %02x %02x %02x %02x\n", + mpf->signature[0], mpf->signature[1], + mpf->signature[2], mpf->signature[3]); + return; + } + mpc = phys_to_virt((phys_addr_t) mpf->physptr); + if (mpc) { + if (*((uint32_t *) mpc->signature) != KNC_MPC_SIG) { + printk("MP header signature not found, %02x %02x %02x %02x\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return; + } + ptr = (uint8_t *)(mpc + 1); + ep = ptr + mpc->length; + while(ptr < ep) { + switch(*ptr) { + case 0x00: /* CPU */ + mpp = (struct mpc_cpu *) ptr; + if (GET_BIT(0, mpp->cpuflag) && mpp->apicid < nr_cpu_ids) + xlat_cpu[mpp->apicid] = GET_BITS(7, 0, mpp->reserved[1]); + ptr += 20; + break; + case 0x01: /* BUS */ + ptr += 8; + break; + case 0x02: /* I/O-APIC */ + ptr += 8; + break; + case 0x03: /* INT source */ + ptr += 8; + break; + case 0x04: /* LINT source */ + ptr += 8; + break; + default: /* Table out of spec */ + ptr = ep; + } + 
} + } +#if 0 + { + uint32_t eax, ebx, ecx, edx; + uint32_t hwt, i; + + cpuid(1, &eax, &ebx, &ecx, &edx); + hwt = GET_BITS(23, 16, ebx); + if (hwt > nr_cpu_ids) + hwt = nr_cpu_ids; + printk("RAS.card: CPU thread table:\n"); + for(i=0; i < hwt; i++) + printk(" cpu %d -> thr %d\n", i, xlat_cpu[i]); + } +#endif + } +} + + +static void __init +mr_mk_cf_lst(void) +{ + int i, n; + uint16_t f; + + /* + * If PM module interface is in place, then the + * core voltage list may already be populated. + */ + if (freq.supt[0] && freq.slen) + return; + + n = 0; + for(i = ARRAY_SIZE(cpu_tab) -1; i >= 0; i--) { + for(f = cpu_tab[i].min_clk; + f <= cpu_tab[i].max_clk; + f += cpu_tab[i].step_size) { + freq.supt[n] = 1000 * f; + freq.slen = ++n; + if (n >= MR_PTAB_LEN) + return; + } + } +} + +static void __init +mr_mk_gf_lst(void) +{ + int i, n; + uint16_t f; + + n = 0; + for(i = ARRAY_SIZE(gddr_tab1) -1; i >= 0; i--) { + for(f = gddr_tab1[i].min_clk; + f <= gddr_tab1[i].max_clk; + f += gddr_tab1[i].step_size) { + gfreq.supt[n] = 1000 * f; + gfreq.slen = ++n; + if (n == MR_PTAB_LEN) + return; + } + } + for(i = ARRAY_SIZE(gddr_tab2) -1; i >= 0; i--) { + for(f = gddr_tab2[i].min_clk; + f <= gddr_tab2[i].max_clk; + f += gddr_tab2[i].step_size) { + gfreq.supt[n] = 1000 * f; + gfreq.slen = ++n; + if (n == MR_PTAB_LEN) + return; + } + } +} + +/* + * We can only list 64 values in this list, but on + * a VRM12 device there is 256 values to chose from. + * For now we'll list values from 0.7 to 1.3 volt + * in 10 mV increments (61 values). + */ + +#define VRM_MIN 600000 +#define VRM_MAX 1300000 +#define VRM_RES 10000 + +static void __init +mr_mk_cv_lst(void) +{ + int n; + uint32_t cv; + + /* + * If PM module interface is in place, then the + * core voltage list may already be populated. + */ + if (volt.supt[0] && volt.slen) + return; + + n = 0; + for(cv = VRM_MIN; cv <= VRM_MAX; cv += VRM_RES) { + volt.supt[n] = cv; + volt.slen = ++n; + if (n >= MR_PTAB_LEN) + return; + } +} + + +void __init +mr_mt_card_init(void) +{ + uint32_t scr7, scr9, cf; + uint32_t smc, ci; + int rtn; +#ifndef MIC_IS_EMULATION + uint8_t * parm; +#endif +#if ! USE_SMC + uint32_t gv; +#endif +#if USE_SVID + int svid; + uint8_t vr; +#else +#if ! USE_SMC + uint32_t cv; +#endif +#endif +#if USE_PM + int (* fnc)(void); +#endif + + /* + * Make CPU->phys ID translation table + */ + get_cpu_table(); + + /* + * Build numbers for fboot0 and fboot 1 repectively + */ + scr7 = mr_sbox_rl(0, SBOX_SCRATCH7); + + /* + * VERS: + * Map flash and look for version strings. + */ +#ifdef MIC_IS_EMULATION + vers.fboot1[0] = scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2, + "No emulation flash version string (build %d)", + GET_BITS(31, 16, scr7)); +#else + parm = ioremap(KNC_FLASH_BASE, KNC_FLASH_SIZE); + if (!parm) { + printk("mr_mt_card_init: ioremap failure: parm %x\n", KNC_FLASH_BASE); + goto fail_iomap; + } + + /* + * The fboot0 version (hardwired in the chip) is placed in flash + * by bootstrap at a fixed location, and is less than 16 byte long. 
+ */ + if (strnlen(parm + KNC_FLASH_BOOT1, 16) < 16) + vers.fboot1[0] = scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2, + "fboot1 version: %s (build %d)", + parm + KNC_FLASH_BOOT1, GET_BITS(31, 16, scr7)); + else + vers.fboot1[0] =scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2, + "No valid version string found"); + iounmap(parm); + + /* + * While at it, check if there is a MC filter list in flash + */ + parm = ioremap(KNC_FLASH_TAB, KNC_FLASH_SIZE); + if (!parm) { + printk("mr_mt_card_init: ioremap failure: parm %x\n", KNC_FLASH_TAB); + goto fail_iomap; + } + mcc_flt_parm(parm + KNC_FLASH_FILT); + iounmap(parm); + +fail_iomap: +#endif + + /* + * Retrieve ID details from the SMC + * UUID, 16 byte + * serial, 12 byte + * FW version, + * 15:0 Build number + * 23:16 Minor version + * 31:24 Major version + * Note: Ancient systems, like Berta, runs on cards with an older + * version on the SMC firmware that does not support serial. + */ + mr_smc_deglitch(); + rtn = gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_UUID, hwinf.guid, 16); +#if SMC_TRACK + printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, MR_SMC_UUID, *(uint32_t *) hwinf.guid, rtn); +#endif + if (rtn != 16) + memset(hwinf.guid, '\0', 16); + mr_smc_deglitch(); + rtn = gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_SERIAL, hwinf.serial, 12); +#if SMC_TRACK + printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, MR_SMC_SERIAL, *(uint32_t *) hwinf.serial, rtn); +#endif + if (rtn != 12) + memcpy(hwinf.serial, "Update_SMC!!", sizeof(hwinf.serial)); + if (! mr_smc_rd(MR_SMC_FW_VERSION, &smc)) + vers.fsc[0] = scnprintf(vers.fsc + 1, MR_VERS_LEN -2, + "SMC firmware rev. %d.%d (build %d)", + GET_BITS(31, 24, smc), + GET_BITS(23, 16, smc), + GET_BITS(15, 0, smc)); + + /* + * HWINF: + * Get processor details from SBOX componentID. + * 19:16 Model ID => aka revision + * 15:12 Stepping ID => stepping + * 11:8 Substepping ID => substep + * + * Get Card Revision details from the SMC. + * 17:16 board (0=MPI, CRB, SFF, Product) + * 10:8 fab version (0='A' .. 7='H') + * 2:0 PBA SKU # (need name table here?) + */ + ci = mr_sbox_rl(0, SBOX_COMPONENT_ID); + hwinf.rev = GET_BITS(19, 16, ci); + hwinf.step = GET_BITS(15, 12, ci); + hwinf.substep = GET_BITS(11, 8, ci); + if (! mr_smc_rd(MR_SMC_HW_REVISION, &smc)) { + hwinf.board = GET_BITS(17, 16, smc); + hwinf.fab = GET_BITS(10, 8, smc); + hwinf.sku = GET_BITS( 2, 0, smc); + } + + /* + * VOLT: + * By definition, reference voltage is 1st value seen. + * Order of preference is SVID, then SMC and lastly SBOX. + * SMC register bits 15:0 is voltage in mV. + * SBOX_COREVOLT should be in SVID voltage format. + */ +#if USE_SVID + svid = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_VID_Set); + if (svid >= 0) + volt.set = svid2volt(svid); +#else +#if USE_SMC + if (!mr_smc_rd(MR_SMC_VOLT_VCCP, &smc) && GET_BITS(31, 30, smc) != 0x3) + volt.set = GET_BITS(15, 0, smc) * 1000; +#else + cv = mr_sbox_rl(0, SBOX_COREVOLT); + volt.set = svid2volt(GET_BITS(7, 0, cv)); +#endif +#endif + mr_mk_cv_lst(); + + /* + * FREQ + * By definition, reference frequency is 1st value seen. + */ + cf = mr_sbox_rl(0, SBOX_COREFREQ); + freq.def = mr_mt_cf_r2f(GET_BITS(11, 0, cf)); + mr_mk_cf_lst(); + + /* + * GDDR: + * See layout of scratch #9 in 'common'. + * 26:16 Clock ratio encoding + * 27 ClamShell + */ + scr9 = mr_sbox_rl(0, SBOX_SCRATCH9); + gddr.speed = 2 * mr_mt_gf_r2f(GET_BITS(26, 16, scr9)); + + /* + * GVOLT: + * Report all values the hardware can set, kind + * of silly as these cannot be changed from uOS. + * Order of preference is SVID, then SMC and lastly SBOX. 
+ * SMC register bits 15:0 is voltage in mV. + * + *TBD: Seriously suspect SBOX register to be wrong. + */ +#if USE_SVID + svid = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_VID_Set); + if (svid >= 0) + gvolt.set = svid2volt(svid); +#else +#if USE_SMC + if (!mr_smc_rd(MR_SMC_VOLT_VDDQ, &smc) && GET_BITS(31, 30, smc) != 0x3) + gvolt.set = GET_BITS(15, 0, smc) * 1000; +#else + gv = mr_sbox_rl(0, SBOX_MEMVOLT); + gvolt.set = svid2volt(GET_BITS(7, 0, gv)); +#endif +#endif + + /* + * GFREQ: + * Report all values the hardware can set, kind + * of silly as these cannot be changed from uOS. + */ + gfreq.def = mr_mt_gf_r2f(GET_BITS(26, 16, scr9)); + mr_mk_gf_lst(); + + /* + * PWR: + * If we are going to use SVID registers we'd need + * to know the VRs capabilities and ICC_MAX setting. + */ +#if USE_SVID + vr = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Capability); + if (vr >= 0) + vccp_cap = vr; + vr = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Capability); + if (vr >= 0) + vddq_cap = vr; + vr = SvidCmd(SVID_VDDG, VR12Cmd_GetReg, VR12Reg_Capability); + if (vr >= 0) + vddg_cap = vr; + vr = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Icc_Max); + if (vr >= 0) + vccp_imax = vr; + vr = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Icc_Max); + if (vr >= 0) + vddq_imax = vr; + vr = SvidCmd(SVID_VDDG, VR12Cmd_GetReg, VR12Reg_Icc_Max); + if (vr >= 0) + vddg_imax = vr; +#endif + + /* + * ECC: + * + *TBD: Where to find ECC setting? + * There are several GBOX registers that has something + * named ECC in them. Scott to tell once PO is done. + */ + ecc.enable = GET_BIT(29, scr9); + + /* + * TRBO + * The PM module have the inital turbo mode setting. + * Get it now, so we don't need to call PM to report it. + */ +#if USE_PM + fnc = pm_cb.micpm_get_turbo; + if (fnc) + trbo.set = fnc(); +#endif + + /* + *TBD: Save registers this module may change + */ +} + +void __exit +mr_mt_card_exit(void) +{ + /* + *TBD: Restore registers this module may change + */ +} + + + +/* +** +** Card specific 'Get' functions +** +*/ + +int +mr_get_volt(void * p) +{ + struct mr_rsp_volt * r; +#if USE_PM + void (* fnc)(void); +#endif + + /* + * Preference is VR out. + * Not sure if board sensors work in KnC + */ +#if USE_SVID + { + int vout; + + vout = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_VID_Set); + if (vout < 0) + return vout; + volt.set = svid2volt(vout); + + vout = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Vout); + if (vout < 0) + return vout; + volt.cur = vout2volt(vout); + } +#else +#if USE_SMC + { + uint32_t smc; + + volt.cur = 0; + volt.c_val = 3; + if (! mr_smc_rd(MR_SMC_VOLT_VCCP, &smc)) { + volt.c_val = GET_BITS(31, 30, smc); + if (volt.c_val != 0x3) + volt.cur = GET_BITS(15, 0, smc) * 1000; + } + + /* + *TBD: override 'set' value ? 
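+ *
+ * Illustrative decode (raw value assumed): a VCCP reading of
+ * 0x000003E8 has status bits 31:30 = 00 (data OK) and bits 15:0 =
+ * 1000, i.e. 1000 mV, reported here as 1000000 uV.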
+ */ + } +#else + { + uint32_t fsc, cv; + + cv = mr_sbox_rl(0, SBOX_COREVOLT); + volt.set = svid2volt(GET_BITS(7, 0, cv)); + + fsc = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE); + volt.cur = bvs2volt(GET_BITS(15, 0, fsc)); + } +#endif +#endif + +#if USE_PM + /* + * Ask PM for table refresh + */ + fnc = pm_cb.micpm_vf_refresh; + if (fnc) + fnc(); +#endif + + r = (struct mr_rsp_volt *) p; + *r = volt; + return sizeof(*r); +} + + +int +mr_get_freq(void * p) +{ + struct mr_rsp_freq * r; + uint32_t cf, cr; +#if USE_PM + void (* fnc)(void); +#endif + + /* + * Current Ratio: + * 11:0 Current core ratio + * 15 Enable 600 MHz + * 27:16 Goal ratio + * 31 OC disable + * Goal ratio is a product of base ratio and fuse overrides + * Current ration is a product of goal, fuse limits and themal throttle + * + * Core Frequency: + * 11:0 Base ratio + * 15 Fuse override + * 31 Select ratio + * Base ratio accepted only if (bit 15 | bit 31 | OC disble) == 010 + * + *TBD: How to detect clock bypasses? + * ICC bypass cuts the core and reference base in half. + */ + cr = mr_sbox_rl(0, SBOX_CURRENTRATIO); + cf = mr_sbox_rl(0, SBOX_COREFREQ); + freq.cur = mr_mt_cf_r2f(GET_BITS(11, 0, cr)); + freq.def = mr_mt_cf_r2f(GET_BITS(11, 0, cf)); + if (GET_BITS(11, 0, cf) != GET_BITS(11, 0, cr)) + printk("RAS.get_freq: core not running at expected frequency\n"); + +#if USE_PM + /* + * Ask PM for table refresh + */ + fnc = pm_cb.micpm_vf_refresh; + if (fnc) + fnc(); +#endif + + r = (struct mr_rsp_freq *) p; + *r = freq; + return sizeof(*r); +} + + +#if USE_SVID +int +mr_get_svid(uint8_t vr, uint8_t cap, uint8_t imax, struct mr_rsp_vrr * vrr) +{ + int v, a, p; + + p = SvidCmd(vr, VR12Cmd_GetReg, VR12Reg_Pout); + a = SvidCmd(vr, VR12Cmd_GetReg, VR12Reg_Iout); + v = SvidCmd(vr, VR12Cmd_GetReg, VR12Reg_Vout); + + if (p < 0 || a < 0 || v < 0) + return -MR_ERR_SMC; + + vrr->pwr = pout2watt(p); + vrr->cur = iout2amp(a, cap, imax); + vrr->volt = vout2volt(v); + + return 0; +} +#endif + +#define KNC_DFF_BOARD 2 /* DFF/SFF board */ + +int +mr_get_power(void * p) +{ + struct mr_rsp_power * r; +#if USE_SMC + static struct mr_rsp_vrr vnil = { 0, 0, 0, 3, 3, 3 }; + static struct mr_rsp_pws pnil = { 0, 3 }; + uint32_t vccp, vddg, vddq; + uint32_t prd0, prd1, pcie, p2x3, p2x4; +#endif + +#if USE_SVID + /* + * Get VR status over SVID. + */ + if (mr_get_svid(SVID_VCCP, vccp_cap, vccp_imax, &power.vccp) < 0 || + mr_get_svid(SVID_VDDQ, vddq_cap, vddq_imax, &power.vddq) < 0 || + mr_get_svid(SVID_VDDG, vddg_cap, vddg_imax, &power.vddq) < 0) + return -MR_ERR_SMC; +#else +#if USE_SMC + /* + * Get VR status from SMC. + * Only voltages are available currently. + * Still need to screen for good data. + * Top 2 bits decode as + * 00 Data OK + * 01 Upper threshold reached + * 10 Lower threshold reached + * 11 Data unavailable + * Assume data is valid even if a threshold reached + */ + power.vccp = power.vddg = power.vddq = vnil; + if (! mr_smc_rd(MR_SMC_VOLT_VCCP, &vccp)) { + power.vccp.v_val = GET_BITS(31, 30, vccp); + if (power.vccp.v_val != 0x3) + power.vccp.volt = 1000 * GET_BITS(15, 0, vccp); + } + if (! mr_smc_rd(MR_SMC_VOLT_VDDG, &vddg)) { + power.vddg.v_val = GET_BITS(31, 30, vddg); + if (power.vddg.v_val != 0x3) + power.vddg.volt = 1000 * GET_BITS(15, 0, vddg); + } + if (! mr_smc_rd(MR_SMC_VOLT_VDDQ, &vddq)) { + power.vddq.v_val = GET_BITS(31, 30, vddq); + if (power.vddq.v_val != 0x3) + power.vddq.volt = 1000 * GET_BITS(15, 0, vddq); + } + if (! 
mr_smc_rd(MR_SMC_PWR_VCCP, &vccp)) { + power.vccp.p_val = GET_BITS(31, 30, vccp); + if (power.vccp.p_val != 0x3) + power.vccp.pwr = 1000000 * GET_BITS(15, 0, vccp); + } + if (! mr_smc_rd(MR_SMC_PWR_VDDG, &vddg)) { + power.vddg.p_val = GET_BITS(31, 30, vddg); + if (power.vddg.p_val != 0x3) + power.vddg.pwr = 1000000 * GET_BITS(15, 0, vddg); + } + if (! mr_smc_rd(MR_SMC_PWR_VDDQ, &vddq)) { + power.vddq.p_val = GET_BITS(31, 30, vddq); + if (power.vddq.p_val != 0x3) + power.vddq.pwr = 1000000 * GET_BITS(15, 0, vddq); + } +#endif +#endif + +#if USE_SMC + /* + * Get reads on VRs and power sensors from SMC. + * This is a mess: + * - total power may or may not include 3.3 V rail. + * If it is then it's not measured, just "guessed". + * - there are two averaging windows for total power, + * though it is not clear who controls these windows. + * For now we assume window 0 is shorter than window 1 + * and thus power 0 is 'current' reading and power 1 + * is the '20 sec' reading. + * TBD: Who controls the time windows and is is true + * that Window 0 is shorter than Window 1? + * - No specifics on how power sensors are averaged, + * i.e. is Window 0/1 used or is is a third window. + * Need to know, otherwise Ptot may not be sum(sources). + * - There still is no 'max' value from SMC + * + * Still need to screen for good data. + * Top 2 bits decode as + * 00 Data OK + * 01 Upper threshold reached + * 10 Lower threshold reached + * 11 Data unavailable + * Assume data is valid even if a threshold reached + */ + power.tot0 = power.tot1 = + power.inst = power.imax = + power.pcie = power.c2x3 = power.c2x4 = pnil; + + if (! mr_smc_rd(MR_SMC_AVG_PWR_0, &prd0)) { + power.tot0.p_val = GET_BITS(31, 30, prd0); + if (power.tot0.p_val != 0x3) + power.tot0.prr = 1000000 * GET_BITS(29, 0, prd0); + } + if (! mr_smc_rd(MR_SMC_AVG_PWR_1, &prd1)) { + power.tot1.p_val = GET_BITS(31, 30, prd1); + if (power.tot1.p_val != 0x3) + power.tot1.prr = 1000000 * GET_BITS(29, 0, prd1); + } + power.inst = power.imax = pnil; + if (! mr_smc_rd(MR_SMC_PWR_INST, &prd0)) { + power.inst.p_val = GET_BITS(31, 30, prd0); + if (power.inst.p_val != 0x3) + power.inst.prr = 1000000 * GET_BITS(29, 0, prd0); + } + if (! mr_smc_rd(MR_SMC_PWR_IMAX, &prd1)) { + power.imax.p_val = GET_BITS(31, 30, prd1); + if (power.imax.p_val != 0x3) + power.imax.prr = 1000000 * GET_BITS(29, 0, prd1); + } + if (! mr_smc_rd(MR_SMC_PWR_PCIE, &pcie)) { + power.pcie.p_val = GET_BITS(31, 30, pcie); + if (power.pcie.p_val != 0x3) + power.pcie.prr = 1000000 * GET_BITS(15, 0, pcie); + } + if (hwinf.board != KNC_DFF_BOARD) { + if (! mr_smc_rd(MR_SMC_PWR_2X3, &p2x3)) { + power.c2x3.p_val = GET_BITS(31, 30, p2x3); + if (power.c2x3.p_val != 0x3) + power.c2x3.prr = 1000000 * GET_BITS(15, 0, p2x3); + } + if (! mr_smc_rd(MR_SMC_PWR_2X4, &p2x4)) { + power.c2x4.p_val = GET_BITS(31, 30, p2x4); + if (power.c2x4.p_val != 0x3) + power.c2x4.prr = 1000000 * GET_BITS(15, 0, p2x4); + } + } +#endif + + r = (struct mr_rsp_power *) p; + *r = power; + return sizeof(*r); +} + + +int +mr_get_plim(void * p) +{ + uint32_t pl0, pl1, grd; + struct mr_rsp_plim * r; + + /* + * Get values from PM + */ + if (! mr_smc_rd(MR_SMC_PWR_LIM_0, &pl0)) + plim.hmrk = GET_BITS(15, 0, pl0); + + if (! mr_smc_rd(MR_SMC_PWR_LIM_1, &pl1)) + plim.lmrk = GET_BITS(15, 0, pl1); + + if (! 
mr_smc_rd(MR_SMC_PWR_LIM0_GRD, &grd)) + plim.phys = plim.hmrk + GET_BITS(15, 0, grd); + + r = (struct mr_rsp_plim *) p; + *r = plim; + return sizeof(*r); +} + + +int +mr_get_gfreq(void * p) +{ + struct mr_rsp_gfreq * r; + uint32_t gbr; + + /* + * SBOX register MEMFREQ bits 7:0 now holds 10 x rate in GTps. + */ + gbr = mr_sbox_rl(0, SBOX_MEMORYFREQ); + gfreq.cur = GET_BITS(7, 0, gbr) * 100000 / 2; + + r = (struct mr_rsp_gfreq *) p; + *r = gfreq; + return sizeof(*r); +} + + +int +mr_get_gvolt(void * p) +{ + struct mr_rsp_gvolt * r; + + /* + * Preference is VR out. + * Not sure if board sensors work in KnC + */ +#if USE_SVID + { + int vout; + + vout = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_VID_Set); + if (vout < 0) + return vout; + gvolt.set = svid2volt(vout); + + vout = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Vout); + if (vout < 0) + return vout; + gvolt.cur = vout2volt(vout); + } +#else +#if USE_SMC + { + uint32_t smc; + + gvolt.cur = 0; + gvolt.c_val = 3; + if (! mr_smc_rd(MR_SMC_VOLT_VDDQ, &smc)) { + gvolt.c_val = GET_BITS(31, 30, smc); + if (gvolt.c_val != 0x3) + gvolt.cur = GET_BITS(15, 0, smc) * 1000; + } + if (!gvolt.set) + gvolt.set = gvolt.cur; + } +#else + { + uint32_t bvs; + + bvs = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE); + gvolt.cur = bvs2volt(GET_BITS(31, 16, bvs)); + } +#endif +#endif + + r = (struct mr_rsp_gvolt *) p; + *r = gvolt; + return sizeof(*r); +} + + +/* + * Card has 3 dedicated temp sensors (read from SMC): + * 0 Air Inlet (aka West) + * 1 Air exhaust (aka East) + * 2 GDDR memory (not sure which chip) + * + * VRs can measure temperature too, which may be read + * from SMC (via I2C bus) or the VRs directly (via SVID). + * 3 Vccp VR (IR3538) temp + * 4 Vddq VR (IR3541, loop 1) temp + * 5 Vddg VR (IR3541, loop 2) temp + * Note: Vddg and Vddq are measured on the same VR, + * likely will be the same reading (or very close). + * + * SBOX board temperature sensors are not connected + * in KnC (SBOX HAS vol 1, section 1.40.1). Instead it + * relies on SMC to 'broadcast' sensor telemetry into + * the KnC's TMU unit via it's I2C bus. + * Currently it doesn't, though a DCR has been filed. + */ + +int +mr_get_temp(void * p) +{ + struct mr_rsp_temp * r; + uint32_t die1, die2, die3; /* Die temps */ + uint32_t dmx1, dmx2, dmx3; /* Max die temps */ +#if USE_SVID + int tvccp, tvddq, tvddg; /* VR temps */ +#endif +#if USE_SMC + static struct mr_rsp_tsns tnil = { 0, 3 }; +#endif + +#if USE_SVID + /* + * Get VR temperatures over SVID. + * These are _all_ positive numbers. + */ + tvccp = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Temp); + tvddq = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Temp); + tvddg = SvidCmd(SVID_VDDG, VR12Cmd_GetReg, VR12Reg_Temp); + if (tvccp < 0 || tvddq < 0 || tvddg < 0) + return -MR_ERR_SMC; + temp.vccp.cur = GET_BITS(7, 0, tvccp); + temp.vddq.cur = GET_BITS(7, 0, tvddq); + temp.vddg.cur = GET_BITS(7, 0, tvddg); +#endif + +#if USE_SMC + /* + * Get temp sensor readings from SMC. + * According to MAS 0.30 it presents + * - CPU die temp (just one value) + * - Fan exhaust temp + * - Fan inlet temp + * - Vccp VR temp + * - Vddg VR temp + * - Vddq VR temp + * - GDDR temp + * + * Still need to screen for good data. 
+ * Top 2 bits decode as + * 00 Data OK + * 01 Upper threshold reached + * 10 Lower threshold reached + * 11 Data unavailable + * Assume data is valid even if a threshold reached + */ + { + uint32_t fin, fout, gddr; /* Sensor temps */ + uint32_t vccp, vddg, vddq; /* VR temps */ + uint32_t die; /* Die summary */ + + temp.die = temp.fin = temp.fout = + temp.vccp = temp.vddg = temp.vddq = tnil; + if (! mr_smc_rd(MR_SMC_TEMP_CPU, &die)) { + temp.die.c_val = GET_BITS(31, 30, die); + if (temp.die.c_val != 0x3) + temp.die.cur = GET_BITS(15, 0, die); + } + if (! mr_smc_rd(MR_SMC_TEMP_EXHAUST, &fout)) { + temp.fout.c_val = GET_BITS(31, 30, fout); + if (temp.fout.c_val != 0x3) + temp.fout.cur = GET_BITS(15, 0, fout); + } + if (! mr_smc_rd(MR_SMC_TEMP_INLET, &fin)) { + temp.fin.c_val = GET_BITS(31, 30, fin); + if (temp.fin.c_val != 0x3) + temp.fin.cur = GET_BITS(15, 0, fin); + } + if (! mr_smc_rd(MR_SMC_TEMP_VCCP, &vccp)) { + temp.vccp.c_val = GET_BITS(31, 30, vccp); + if (temp.vccp.c_val != 0x3) + temp.vccp.cur = GET_BITS(15, 0, vccp); + } + if (! mr_smc_rd(MR_SMC_TEMP_VDDG, &vddg)) { + temp.vddg.c_val = GET_BITS(31, 30, vddg); + if (temp.vddg.c_val != 0x3) + temp.vddg.cur = GET_BITS(15, 0, vddg); + } + if (! mr_smc_rd(MR_SMC_TEMP_VDDQ, &vddq)) { + temp.vddq.c_val = GET_BITS(31, 30, vddq); + if (temp.vddq.c_val != 0x3) + temp.vddq.cur = GET_BITS(15, 0, vddq); + } + if (! mr_smc_rd(MR_SMC_TEMP_GDDR, &gddr)) { + temp.gddr.c_val = GET_BITS(31, 30, gddr); + if (temp.gddr.c_val != 0x3) + temp.gddr.cur = GET_BITS(15, 0, gddr); + } + } +#else + /* + * The TMU registers relies on telemetry broadcasts from + * the SMC in order to report current data, early SMC + * firmware does not provide telemetry at all. + * Mapping of 'board temps' to physical sensors isn't + * really defined anywhere. Based on FreeBSD comments + * they map is: + * 0 Air Inlet + * 1 VCCP VR + * 2 GDDR (not sure which chip) + * 3 GDDR VR + * + *TBD: verify map on actual CRB + */ + { + uint32_t btr1, btr2; /* Board temps */ + uint32_t tsta; /* Thermal status */ + uint32_t fsc; /* Fan controller status */ + + fsc = mr_sbox_rl(0, SBOX_STATUS_FAN2); + btr1 = mr_sbox_rl(0, SBOX_BOARD_TEMP1); + btr2 = mr_sbox_rl(0, SBOX_BOARD_TEMP2); + tsta = mr_sbox_rl(0, SBOX_THERMAL_STATUS); + temp.fin.cur = (btr1 & (1 << 15)) ? GET_BITS( 8, 0, btr1) : 0; + temp.vccp.cur = (btr1 & (1 << 31)) ? GET_BITS(24, 16, btr1) : 0; + temp.gddr.cur = (btr2 & (1 << 15)) ? GET_BITS( 8, 0, btr2) : 0; + temp.vddq.cur = (btr2 & (1 << 31)) ? GET_BITS(24, 16, btr2) : 0; + temp.vddg.cur = GET_BITS(19, 12, fsc); + temp.brd.cur = 0; + if (temp.fin.cur > temp.brd.cur) + temp.brd.cur = temp.fin.cur; + if (temp.vccp.cur > temp.brd.cur) + temp.brd.cur = temp.vccp.cur; + if (temp.gddr.cur > temp.brd.cur) + temp.brd.cur = temp.gddr.cur; + if (temp.vddq.cur > temp.brd.cur) + temp.brd.cur = temp.vddq.cur; + if (tsta & (1 << 31)) + temp.die.cur = GET_BITS(30, 22, tsta); + } +#endif + + /* + * Raw SBOX data for die temperatures. + * + *TBD: do these depend on SMC telemetry? + * If so they probably won't work until DCR in place. + */ + die1 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP0); + die2 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP1); + die3 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP2); + dmx1 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP0); + dmx2 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP1); + dmx3 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP2); + + /* + * Die temperatures. 
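+ * Each CURRENT/MAX_DIE_TEMP register packs three 10-bit sensor
+ * fields (bits 9:0, 19:10 and 29:20), unpacked one per element below.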
+ * Always positive numbers (or zero for unfused parts) + */ + temp.dies[0].cur = GET_BITS( 9, 0, die1); + temp.dies[1].cur = GET_BITS(19, 10, die1); + temp.dies[2].cur = GET_BITS(29, 20, die1); + temp.dies[3].cur = GET_BITS( 9, 0, die2); + temp.dies[4].cur = GET_BITS(19, 10, die2); + temp.dies[5].cur = GET_BITS(29, 20, die2); + temp.dies[6].cur = GET_BITS( 9, 0, die3); + temp.dies[7].cur = GET_BITS(19, 10, die3); + temp.dies[8].cur = GET_BITS(29, 20, die3); + + /* + * Die max temp (probably 0 for unfused parts) + */ + temp.dies[0].max = GET_BITS( 9, 0, dmx1); + temp.dies[1].max = GET_BITS(19, 10, dmx1); + temp.dies[2].max = GET_BITS(29, 20, dmx1); + temp.dies[3].max = GET_BITS( 9, 0, dmx2); + temp.dies[4].max = GET_BITS(19, 10, dmx2); + temp.dies[5].max = GET_BITS(29, 20, dmx2); + temp.dies[6].max = GET_BITS( 9, 0, dmx3); + temp.dies[7].max = GET_BITS(19, 10, dmx3); + temp.dies[8].max = GET_BITS(29, 20, dmx3); + + r = (struct mr_rsp_temp *) p; + *r = temp; + return sizeof(*r); +} + + +int +mr_get_fan(void * p) +{ + struct mr_rsp_fan * r; + uint32_t fs, fp; +#if USE_SMC + uint32_t fa; +#endif + + r = (struct mr_rsp_fan *) p; + + /* + * Preference is SMC data. + * Not sure if SBOX registers work sensors work in KnC + */ +#if USE_SMC + /* + * Read fan state from SMC. + * No info on override available. + */ + r->override = 0; + r->r_val = r->p_val = 3; + if (mr_smc_rd(MR_SMC_FAN_TACH, &fs)) + fs = PUT_BITS(31, 30, 3); + if (mr_smc_rd(MR_SMC_FAN_PWM, &fp)) + fp = PUT_BITS(31, 30, 3); + if (mr_smc_rd(MR_SMC_FAN_PWM_ADD, &fa)) + fa = PUT_BITS(31, 30, 3); + + /* + * Still need to screen for good data. + * Top 2 bits decode as + * 00 Data OK + * 01 Reserved + * 10 Lower threshold reached (or reserved) + * 11 Data unavailable + * Assume data is still valid if a threshold reached + */ + if (GET_BITS(31, 30, fs) != 0x3) { + /* + * The override concept from KnF (and SBOX registers) + * seems to have been replaced with a PWM adder. + * Propose to set override flag if adder is non-zero. + */ + r->r_val = 0; + r->rpm = GET_BITS(15, 0, fs); + if (GET_BITS(31, 30, fp) != 0x3) { + r->p_val = 0; + r->pwm = GET_BITS(7, 0, fp); + if (GET_BITS(31, 30, fa) != 0x3) { + fa = GET_BITS(7, 0, fa); + if (fa) { + r->override = 1; + r->pwm += fa; + if (r->pwm > 100) + r->pwm = 100; + } + } + } + } +#else + /* + * Read fan state from SBOX registers + * Require SMC telemetry to work. + */ + fs = mr_sbox_rl(0, SBOX_STATUS_FAN1); + fp = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN); + + r->override = GET_BIT(15, fp); + r->rpm = GET_BITS(15, 0, fs); + if (r->override) + r->pwm = GET_BITS( 7, 0, fp); + else + r->pwm = GET_BITS(23, 16, fs); +#endif + + return sizeof(*r); +} + + +int +mr_get_ecc(void * p) +{ + struct mr_rsp_ecc * r; + + r = (struct mr_rsp_ecc *) p; + *r = ecc; + return sizeof(*r); +} + + +int +mr_get_trbo(void * p) +{ + struct mr_rsp_trbo * r; + + /* + * Get current value from PM + */ +#if USE_PM + int (* fnc)(void); + + fnc = pm_cb.micpm_get_turbo; + if (fnc) { + uint32_t pm; + + pm = fnc(); + trbo.state = GET_BIT(1, pm); + trbo.avail = GET_BIT(2, pm); + if (! 
trbo.avail) + trbo.set = 0; + } +#endif + + r = (struct mr_rsp_trbo *) p; + *r = trbo; + return sizeof(*r); +} + + +int +mr_get_pmcfg(void * p) +{ + struct mr_rsp_pmcfg * r; + +#if USE_PM + int (* fnc)(void); + + fnc = pm_cb.micpm_get_pmcfg; + if (fnc) + pmcfg.mode = fnc(); +#endif + + r = (struct mr_rsp_pmcfg *) p; + *r = pmcfg; + return sizeof(*r); +} + + +int +mr_get_led(void * p) +{ + struct mr_rsp_led * r; + uint32_t led; + + if (mr_smc_rd(MR_SMC_LED_CODE, &led)) + return -MR_ERR_SMC; + + r = (struct mr_rsp_led *) p; + r->led = GET_BIT(0, led); + return sizeof(*r); +} + + +int +mr_get_prochot(void * p) +{ + struct mr_rsp_ptrig * r; + uint32_t pwr0; + uint32_t time0; + + if (mr_smc_rd(MR_SMC_PWR_LIM_0, &pwr0) || + mr_smc_rd(MR_SMC_TIME_WIN_0, &time0)) + return -MR_ERR_SMC; + + r = (struct mr_rsp_ptrig *) p; + r->power = GET_BITS(15, 0, pwr0); + r->time = GET_BITS(15, 0, time0); + return sizeof(*r); +} + + +int +mr_get_pwralt(void * p) +{ + struct mr_rsp_ptrig * r; + uint32_t pwr1; + uint32_t time1; + + if (mr_smc_rd(MR_SMC_PWR_LIM_1, &pwr1) || + mr_smc_rd(MR_SMC_TIME_WIN_1, &time1)) + return -MR_ERR_SMC; + + r = (struct mr_rsp_ptrig *) p; + r->power = GET_BITS(15, 0, pwr1); + r->time = GET_BITS(15, 0, time1); + return sizeof(*r); +} + + +int +mr_get_perst(void * p) +{ + struct mr_rsp_perst * r; + uint32_t perst; + + if (mr_smc_rd(MR_SMC_PWR_LIM_PERS, &perst)) + return -MR_ERR_SMC; + + r = (struct mr_rsp_perst *) p; + r->perst = GET_BIT(0, perst); + return sizeof(*r); +} + + +int +mr_get_ttl(void * p) +{ + struct mr_rsp_ttl * r; + + r = (struct mr_rsp_ttl *) p; + +#if USE_PM + mr_pm_ttl(r); +#endif + + return sizeof(*r); +} + + +/* +** +** Card specific 'Set' functions +** Input screening takes place here (to the extent possible). +** +*/ + + +int +mr_set_volt(void * p) +{ +#if USE_SVID + uint32_t err, val; + uint8_t svid; + + /* + * Ensure it's a supported value + * Which limits to use, physical or PM list? + */ + val = *(uint32_t *) p; + svid = volt2svid(val); +#if 1 + { + if (!svid) + return -MR_ERR_RANGE; + } +#else + { + int i; + + for(i = 0; i < MR_PTAB_LEN; i++) + if (volt.supt[i] == val) + break; + if (i == MR_PTAB_LEN) + return -MR_ERR_RANGE; + } +#endif + + /* + * Read-modify-write the core voltage VID register + */ + err = SvidCmd(SVID_VCCP, VR12Cmd_SetVID_Slow, svid); + printk("SetVolt: %d -> %08x (err %08x)\n", val, svid, err); + + return err ? -MR_ERR_SMC : 0; +#else + return -MR_ERR_INVOP; +#endif +} + + +int +mr_set_freq(void * p) +{ + uint32_t cf, msk, new, val; + uint16_t rat; + int i; + + /* + * Ensure it's a supported value + */ + val = *(uint32_t *) p; + for(i = 0; i < MR_PTAB_LEN; i++) + if (freq.supt[i] == val) + break; + if (i == MR_PTAB_LEN) + return -MR_ERR_RANGE; + + /* + * Core Frequency: + * 11:0 Base ratio + * 15 Fuse override + * 31 Select ratio + * Base ratio accepted only if (bit 15 | bit 31 | OC disble) == 010 + * Pre-scale frequency to counter for any ICC trickery. + * Not nice, makes exact table matches difficult!! + */ + val = (val * icc_fwd()) / ICC_NOM; + rat = freq2ratio(val/1000, cpu_tab, ARRAY_SIZE(cpu_tab), 200); + cf = mr_sbox_rl(0, SBOX_COREFREQ); + msk = ~(PUT_BITS(11, 0, ~0) | PUT_BIT(15, 1) | PUT_BIT(31, 1)); + new = (cf & msk) | PUT_BITS(11, 0, rat) | PUT_BIT(31, 1); + mr_sbox_wl(0, SBOX_COREFREQ, new); + printk("SetFreq: %d -> %08x (%08x)\n", val, new, cf); + + /* + *TBD: + * We just changed the system's base clock without + * re-calibrating the APIC timer tick counters. 
+ * There is probably a function call for the cpu-freq + * driver to deal with this, so should we call it? + */ + + return 0; +} + + +int +mr_set_plim(void * p) +{ + plim.phys = *(uint32_t *) p; + + /* + * Notify PM of change + *TBD: not supported, remove? + */ + return 0; +} + + +int +mr_set_fan(void * p) +{ + struct mr_set_fan * fc; + + /* + * Ensure operation is valid, i.e. no garbage + * in override flag (only 1 and 0 allowed) and + * that pwm in in range 0 through 99. + */ + fc = (struct mr_set_fan *) p; + if (GET_BITS(7, 1, fc->override) || fc->pwm >= 100) + return -MR_ERR_RANGE; + +#if USE_SMC + { + uint32_t dat; + + /* + * Determine the PWM-adder value, and send it to the SMC. + * Subsequent 'GET' fan will add the calculated PWM and + * this adder to report current PWM percentage. + * Only way to retrieve the adder is via GET_SMC(0x4b). + */ + if (fc->override) + dat = fc->pwm; + else + dat = 0; + + if (mr_smc_wr(MR_SMC_FAN_PWM_ADD, &dat)) + return -MR_ERR_SMC; + } +#else + /* + * Read-modify-write the fan override register + * Control of fan #1 only, don't touch #2 + * Note: require SMC to support SBOX registers + * which is not on the radar right now. + */ + { + uint32_t fcor, fco1, fco2; + + fcor = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN); + fco2 = GET_BITS(31, 16, fcor); + if (fc->override) + fco1 = PUT_BIT(15, 1) | fc->pwm; + else + fco1 = 0; + mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN, + PUT_BITS(31, 16, fco2) | PUT_BITS(15, 0, fco1)); + } +#endif + + return 0; +} + + +int +mr_set_trbo(void * p) +{ + uint32_t tmp; +#if USE_PM + void (* fnc)(int); +#endif + + /* + * Only values 0 and 1 allowed + */ + tmp = *(uint32_t *) p; + if (GET_BITS(31, 1, tmp)) + return -MR_ERR_RANGE; + trbo.set = tmp; + +#if USE_PM + /* + * Notify PM of new value + */ + fnc = pm_cb.micpm_set_turbo; + if (fnc) + fnc(trbo.set); +#endif + + return 0; +} + + +int +mr_set_led(void * p) +{ + uint32_t led; + + /* + * Only values 0 and 1 allowed + */ + led = *(uint32_t *) p; + if (GET_BITS(31, 1, led)) + return -MR_ERR_RANGE; + + if (mr_smc_wr(MR_SMC_LED_CODE, &led)) + return -MR_ERR_SMC; + + return 0; +} + + +int +mr_set_prochot(void * p) +{ + struct mr_rsp_ptrig * trig; + uint32_t pwr0; + uint32_t time0; + + trig = (struct mr_rsp_ptrig *) p; + pwr0 = trig->power; + time0 = trig->time; + + /* + * Check for sane values + *TBD: check pwr0 higher than current pwr1? + */ + if (pwr0 < 50 || pwr0 > 400) + return -MR_ERR_RANGE; + if (time0 < 50 || time0 > 1000) + return -MR_ERR_RANGE; + + if (mr_smc_wr(MR_SMC_PWR_LIM_0, &pwr0) || + mr_smc_wr(MR_SMC_TIME_WIN_0, &time0)) + return -MR_ERR_SMC; + + return 0; +} + + +int +mr_set_pwralt(void * p) +{ + struct mr_rsp_ptrig * trig; + uint32_t pwr1; + uint32_t time1; + + trig = (struct mr_rsp_ptrig *) p; + pwr1 = trig->power; + time1 = trig->time; + + /* + * Check for sane values + *TBD: check pwr1 lower than current pwr0? 
+ */ + if (pwr1 < 50 || pwr1 > 400) + return -MR_ERR_RANGE; + if (time1 < 50 || time1 > 1000) + return -MR_ERR_RANGE; + + if (mr_smc_wr(MR_SMC_PWR_LIM_1, &pwr1) || + mr_smc_wr(MR_SMC_TIME_WIN_1, &time1)) + return -MR_ERR_SMC; + + return 0; +} + + +int +mr_set_perst(void * p) +{ + uint32_t perst; + + /* + * Only values 0 and 1 allowed + */ + perst = *(uint32_t *) p; + if (GET_BITS(31, 1, perst)) + return -MR_ERR_RANGE; + + if (mr_smc_wr(MR_SMC_PWR_LIM_PERS, &perst)) + return -MR_ERR_SMC; + + return 0; +} + + +#if USE_PM +/* +** +** API functions dedicated for PM support +** +** These functions are embedded within the MT callout table +** and thus needs to follow the calling convention, which +** for 'get' functions is to pass an opague pointer to a buffer +** to hold retrieved data and on return get a staus code (positive +** on success, negative on failures) and for 'put' functions is +** to pass an opague pointer to a buffer holding input data. +** +** Function list as per PM needs: +** +** pm_get_pl0 reads 0x2c, 0x2d and 0x2e +** pm_set_pl0 writes 0x2c and 0x2d +** +** pm_get_pl1 reads 0x2f and 0x30 +** pm_set_pl1 writes 0x2f and 0x30 +** +** pm_get_pavg reads 0x35 and 0x36 +** +** pm_get_pttl reads 0x38 and 0x39 +** +** pm_get_volt reads 0x3c, 0x3d and 0x3e +** +** pm_get_temp reads 0x40, 0x43, 0x44 and 0x45 +** +** pm_get_tach reads 0x49 and 0x4a +** +** pm_get_tttl reads 0x4e and 0x4f +** +** pm_get_fttl reads 0x2b +** pm_set_fttl writes 0x2b +** +*/ + +#include "micpm_api.h" + +int +pm_get_pl0(void * p) +{ + struct pm_rsp_plim * r; + uint32_t lim, win, grd; + + lim = 0; + win = 0; + grd = 0; + mr_smc_rd(MR_SMC_PWR_LIM_0, &lim); + mr_smc_rd(MR_SMC_TIME_WIN_0, &win); + mr_smc_rd(MR_SMC_PWR_LIM0_GRD, &grd); + + r = (struct pm_rsp_plim *) p; + r->pwr_lim = GET_BITS(15, 0, lim); + r->time_win = GET_BITS(15, 0, win); + r->guard_band = GET_BITS(15, 0, grd); + + return sizeof(*r); +} + +int +pm_set_pl0(void * p) +{ + struct pm_cmd_plim * r; + + /* + * Only lower 16 bit used + */ + r = (struct pm_cmd_plim *) p; + if (GET_BITS(31, 16, r->pwr_lim)) + return -MR_ERR_RANGE; + if (GET_BITS(31, 16, r->time_win)) + return -MR_ERR_RANGE; + + /* + * This does not allow caller to tell which failed. + *TBD: do we care? + */ + if (mr_smc_wr(MR_SMC_PWR_LIM_0, &r->pwr_lim)) + return -MR_ERR_SMC; + if (mr_smc_wr(MR_SMC_TIME_WIN_0, &r->time_win)) + return -MR_ERR_SMC; + + return 0; +} + +int +pm_get_pl1(void * p) +{ + struct pm_rsp_plim * r; + uint32_t lim, win; + + lim = 0; + win = 0; + mr_smc_rd(MR_SMC_PWR_LIM_1, &lim); + mr_smc_rd(MR_SMC_TIME_WIN_1, &win); + + r = (struct pm_rsp_plim *) p; + r->pwr_lim = GET_BITS(15, 0, lim); + r->time_win = GET_BITS(15, 0, win); + r->guard_band = 0; + + return sizeof(*r); +} + +int +pm_set_pl1(void * p) +{ + struct pm_cmd_plim * r; + + /* + * Only lower 16 bit used + */ + r = (struct pm_cmd_plim *) p; + if (GET_BITS(31, 16, r->pwr_lim)) + return -MR_ERR_RANGE; + if (GET_BITS(31, 16, r->time_win)) + return -MR_ERR_RANGE; + + /* + * This does not allow caller to tell which failed. + *TBD: do we care? 
+ */ + if (mr_smc_wr(MR_SMC_PWR_LIM_1, &r->pwr_lim)) + return -MR_ERR_SMC; + if (mr_smc_wr(MR_SMC_TIME_WIN_1, &r->time_win)) + return -MR_ERR_SMC; + + return 0; +} + +int +pm_get_pavg(void * p) +{ + struct pm_rsp_pavg * r; + uint32_t pwr0, pwr1; + + pwr0 = PUT_BITS(31, 30, 3); + pwr1 = PUT_BITS(31, 30, 3); + mr_smc_rd(MR_SMC_AVG_PWR_0, &pwr0); + mr_smc_rd(MR_SMC_AVG_PWR_1, &pwr1); + + r = (struct pm_rsp_pavg *) p; + r->stat_0 = GET_BITS(31, 30, pwr0); + r->stat_1 = GET_BITS(31, 30, pwr1); + r->pwr_0 = GET_BITS(29, 0, pwr0); + r->pwr_1 = GET_BITS(29, 0, pwr1); + + return sizeof(*r); +} + +int +pm_get_pttl(void * p) +{ + struct pm_rsp_pttl * r; + uint32_t dur, ttl; + + if (mr_smc_rd(MR_SMC_PWR_TTL, &ttl)) + return -MR_ERR_SMC; + + r = (struct pm_rsp_pttl *) p; + r->pwr_ttl = GET_BIT(0, ttl); + dur = PUT_BITS(31, 30, 3); + if (r->pwr_ttl) + mr_smc_rd(MR_SMC_PWR_TTL_DUR, &dur); + r->stat_dur = GET_BITS(31, 30, dur); + r->duration = GET_BITS(15, 0, dur); + + return sizeof(*r); +} + +int +pm_get_volt(void * p) +{ + struct pm_rsp_volt * r; + uint32_t vccp, vddg, vddq; + + vccp = PUT_BITS(31, 30, 3); + vddg = PUT_BITS(31, 30, 3); + vddq = PUT_BITS(31, 30, 3); + mr_smc_rd(MR_SMC_VOLT_VCCP, &vccp); + mr_smc_rd(MR_SMC_VOLT_VDDG, &vddg); + mr_smc_rd(MR_SMC_VOLT_VDDQ, &vddq); + + r = (struct pm_rsp_volt *) p; + r->stat_vccp = GET_BITS(31, 30, vccp); + r->stat_vddg = GET_BITS(31, 30, vddg); + r->stat_vddq = GET_BITS(31, 30, vddq); + r->vccp = GET_BITS(15, 0, vccp); + r->vddg = GET_BITS(15, 0, vddg); + r->vddq = GET_BITS(15, 0, vddq); + + return sizeof(*r); +} + +int +pm_get_temp(void * p) +{ + struct pm_rsp_temp * r; + uint32_t cpu, vccp, vddg, vddq; + + cpu = PUT_BITS(31, 30, 3); + vccp = PUT_BITS(31, 30, 3); + vddg = PUT_BITS(31, 30, 3); + vddq = PUT_BITS(31, 30, 3); + mr_smc_rd(MR_SMC_TEMP_CPU, &cpu); + mr_smc_rd(MR_SMC_TEMP_VCCP, &vccp); + mr_smc_rd(MR_SMC_TEMP_VDDG, &vddg); + mr_smc_rd(MR_SMC_TEMP_VDDQ, &vddq); + + r = (struct pm_rsp_temp *) p; + r->stat_cpu = GET_BITS(31, 30, cpu); + r->stat_vccp = GET_BITS(31, 30, vccp); + r->stat_vddg = GET_BITS(31, 30, vddg); + r->stat_vddq = GET_BITS(31, 30, vddq); + r->cpu = GET_BITS(15, 0, cpu); + r->vccp = GET_BITS(15, 0, vccp); + r->vddg = GET_BITS(15, 0, vddg); + r->vddq = GET_BITS(15, 0, vddq); + + return sizeof(*r); +} + +int +pm_get_tach(void * p) +{ + struct pm_rsp_tach * r; + uint32_t pwm, tach; + + pwm = PUT_BITS(31, 30, 3); + tach = PUT_BITS(31, 30, 3); + mr_smc_rd(MR_SMC_FAN_PWM, &pwm); + mr_smc_rd(MR_SMC_FAN_TACH, &tach); + + r = (struct pm_rsp_tach *) p; + r->stat_pwm = GET_BITS(31, 30, pwm); + r->stat_tach = GET_BITS(31, 30, tach); + r->fan_pwm = GET_BITS( 7, 0, pwm); + r->fan_tach = GET_BITS(15, 0, tach); + + return sizeof(*r); +} + +int +pm_get_tttl(void * p) +{ + struct pm_rsp_tttl * r; + uint32_t dur, ttl; + + if (mr_smc_rd(MR_SMC_TRM_TTL, &ttl)) + return -MR_ERR_SMC; + + r = (struct pm_rsp_tttl *) p; + r->thrm_ttl = GET_BIT(0, ttl); + dur = PUT_BITS(31, 30, 3); + if (r->thrm_ttl) + mr_smc_rd(MR_SMC_TRM_TTL_DUR, &dur); + r->stat_dur = GET_BITS(31, 30, dur); + r->duration = GET_BITS(15, 0, dur); + + return sizeof(*r); +} + +int +pm_get_fttl(void * p) +{ + struct pm_rsp_fttl * r; + uint32_t ttl; + + if (mr_smc_rd(MR_SMC_FORCE_TTL, &ttl)) + return MR_ERR_SMC; + + r = (struct pm_rsp_fttl *) p; + r->forced = GET_BIT(0, ttl); + + return sizeof(*r); +} + +int +pm_set_fttl(void * p) +{ + uint32_t ttl; + + /* + * Only values 0 and 1 allowed + */ + ttl = ((struct pm_rsp_fttl *) p)->forced; + if (GET_BITS(31, 1, ttl)) + return -MR_ERR_RANGE; + + if 
(mr_smc_wr(MR_SMC_FORCE_TTL, &ttl)) + return -MR_ERR_SMC; + + return 0; +} + +#endif diff --git a/ras/micras_knf.c b/ras/micras_knf.c new file mode 100644 index 0000000..cda0637 --- /dev/null +++ b/ras/micras_knf.c @@ -0,0 +1,1432 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS MT module driver + * + * Code and data structures to handle get/set tasks for KnF. + * Parties accessing the data structures are supposed to use the + * micras_mt_tsk() routines to ensure integrity and consistency. + * Particularly important when handling sysfs nodes and actions + * requested from SCIF connections must use that method in order + * to guarantee serialized access. + * + * Even if read-only access to latest valid data is required, + * it should go through micras_mt_tsk() using dedicated handlers + * in this module. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras_api.h" +#include "micmca_api.h" +#include "micras.h" + + +/* + * Persistent data accessible through the CP api. + * Some functions just read/modify hardware CSRs + * and thus need no storage between invocations. 
+ */ + +extern struct mr_rsp_vers vers; +extern struct mr_rsp_volt volt; +extern struct mr_rsp_freq freq; +extern struct mr_rsp_power power; +extern struct mr_rsp_plim plim; +extern struct mr_rsp_gddr gddr; +extern struct mr_rsp_gvolt gvolt; +extern struct mr_rsp_gfreq gfreq; +extern struct mr_rsp_temp temp; +extern struct mr_rsp_ecc ecc; +extern struct mr_rsp_trbo trbo; +extern struct mr_rsp_pmcfg pmcfg; + +#if USE_FSC +/* +** +** FSC API +** +** The FSC has a back-door communication channel, not documented +** anywhere in the register spec nor in any HAS or LLD that is +** available on recent KnF cards (later than rev ??). +** Found a .35 proposal for it, so it better do. In short, this +** backdoor relies on fan #2 is not used on KnF and the fact that +** controls for fan #2 is transmitted over I2C to the fan speed +** controller (FSC) unaltered, such that it can chose an alternate +** interpretation of received data. +** +** The Fan Speed Override register (SBOX 0x8007d102c) has this +** definition in the register spec: +** +** Bit(s) Usage +** ------ ---------- +** 7:0 Fan 1 override ratio +** 14 Fan 1 Set max speed +** 15 Fan 1 Enable override +** 23:16 Fan 2 override ratio +** 30 Fan 2 Set max speed +** 31 Fan 2 Enable override +** +** This register has been repurposed into a Message Gain Bit Bang Register +** (MGBR) with a 4 bit command and a 16 bit data field, layout is: +** +** Bit(s) Usage +** ------ ---------- +** 7:0 MGBR data 7:0 +** 21:14 MGBR data 15:8 +** 23:22 MGBR command 1:0 +** 31:30 MBGR command 3:2 +** +** Command Usage +** 0 Fan 1 Speed Override +** 1 Power Management and Control Config +** 7 PMC PCIe Alert Override +** 8 PMC 2x3 Alert Override +** 9 PMC 2x4 Alert Override +** 10 Temperature Override Command +** 11 General Status Command +** 12-15 PID Gain Command(s) +** +** Fan 1 control works as MGBR command 0, though the spec is unclear on +** whether the resulting FSO register format is same as the original spec. +** Specifically, old spec has Fan 1 override enable in FSO bit 15, whereas +** the MGBR spec has it in MGBR data bit 15 (corresponds to FSO bit 20). +** Test shows it has to be MGBR bit 9, i.e. compatible with register spec. +** +** Fan #2 Status Register (SBOX 0x8007d1028) has been redefined into a +** Message Gain Bit Bang Status (MGBSR) used to hold return data from +** the MGBR General Status command in this layout: +** +** Bit(s) Usage +** ------ ---------- +** 23:0 MGBSR data +** 31:28 MGBR Gen. Sts. selector (bits 23:0 source). +** +** To get access to KnF telemetry data, only MGBR command 11 is needed. +** Bits 7:0 of MGBR data for this command selects the sensor which FSC +** will report to MGBSR (not sure if one-time or repeatedly). 
The actual
+** encoding is as follows:
+**
+**	0x00	Fan2Status
+**	0x01	PMC Configuration Command Settings
+**	0x07	Reads the 2x4 IR3275 Configuration Register
+**	0x08	Reads the 2x3 IR3275 Configuration Register
+**	0x09	Reads the PCIe IR3275 Configuration Register
+**	0x0A	Reads the Temperature Command Settings
+**	0x20	Maximum Total Card Power - 1s Moving Average (20 Samples)
+**	0x21	Maximum 2x4 Connector Power - 1s Moving Average (20 Samples)
+**	0x22	Maximum 2x3 Connector Power - 1s Moving Average (20 Samples)
+**	0x23	Maximum PCIe Connector Power - 1s Moving Average (20 Samples)
+**	0x30	Maximum Total Card Power - Single Sample
+**	0x31	Maximum 2x4 Connector Power - Single Sample
+**	0x32	Maximum 2x3 Connector Power - Single Sample
+**	0x33	Maximum PCIe Connector Power - Single Sample
+**	0xA0	Returns the current Fan Tcontrol setting for the GPU temperature
+**	0xA1	Maximum Temperature for Temperature Sensor 1 - VCCP
+**	0xA2	Maximum Temperature for Temperature Sensor 2 - Air Inlet
+**	0xA3	Maximum Temperature for Temperature Sensor 3 - NW GDDR
+**	0xA4	Maximum Temperature for Temperature Sensor 4 - V1P5 VDD VR
+**	0xA5	Maximum Temperature for Temperature Sensor 5 - Display Transmitter
+**	0xA6	Maximum Temperature for GPU
+**
+** The 'return' values in MGBSR are 16 bit only, power in Watts, Temp in C.
+**
+** Implementation notes:
+**  > The MGBR API is timing sensitive. FSC reads the MGBR register
+**    at ~50 mSec intervals over an I2C bus and performs the command
+**    on every read, which in case of the General Status command will
+**    result in writing FSC internal data to the MGBSR register.
+**    A delay is required after every write to MGBR in order to
+**    ensure the FSC actually sees it.
+**
+**  > I2C bus reads are 7 bytes, writes are 6 bytes, 1 clock at 100 kHz
+**    is 10 uSec, 1 byte roughly translates to 10 bits, so the minimum
+**    delay on I2C from a command being written until a valid return
+**    value is available becomes
+**	10 * (6 + 7) * 10 uSec = 1.3 mSec
+**    The I2C bus on KnF runs slower than 100 kHz, causing transfers
+**    to take more time than that to finish.
+**    After the initial delay, we may need to wait for a result
+**    to arrive in the MGBSR register.
+**
+**  > It seems that fan 1 override is a dynamic act, i.e. for it to
+**    be in effect the MGBR command needs to be set accordingly.
+**    Therefore, when reading telemetry, the MGBR command is set
+**    just for a period long enough for it to be seen by FSC and the
+**    result to be latched into the MGBSR register. After that period
+**    (when fan speed override is active) the MGBR is rewritten to
+**    restore the fan 1 override.
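+**
+** As a worked example of the layout above: selecting sensor 0x20
+** (Maximum Total Card Power, 1s average) via the General Status
+** command (11 = 0b1011) packs into MGBR as
+**
+**	bits 31:30 = command 3:2 = 0b10
+**	bits 23:22 = command 1:0 = 0b11
+**	bits 21:14 = data 15:8   = 0x00
+**	bits  7:0  = data 7:0    = 0x20
+**
+** i.e. a register value of 0x80c00020; this is exactly what
+** fsc_mgbr_write() below constructs with PUT_BITS().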
+** +*/ + +#define MR_FSC_MGBR_OVR_CMD 0 /* Fan 1 Speed Override */ +#define MR_FSC_MGBR_GEN_CMD 11 /* General Status command */ + +#define MR_FSC_STATUS 0x00 /* FSC Status & version */ +#define MR_FSC_PMC_CFG 0x01 /* PMC Configuration */ + +#define MR_FSC_PWR_TOT 0x20 /* Total Power (1 sec avg) */ +#define MR_FSC_PWR_2X4 0x21 /* 2x4 Power (1 sec avg) */ +#define MR_FSC_PWR_2X3 0x22 /* 2x3 Power (1 sec avg) */ +#define MR_FSC_PWR_PCIE 0x23 /* PCIe Power (1 sec avg) */ + +#define MR_FSC_PWR1_TOT 0x30 /* Total Power (single sample) */ +#define MR_FSC_PWR1_2X4 0x31 /* 2x4 Power (single sample) */ +#define MR_FSC_PWR1_2X3 0x32 /* 2x3 Power (single sample) */ +#define MR_FSC_PWR1_PCIE 0x33 /* PCIe Power (single sample) */ + +#define MR_FSC_TEMP_VCCP 0xA1 /* VCCP VR Temperature */ +#define MR_FSC_TEMP_INLET 0xA2 /* Card Inlet Temperature */ +#define MR_FSC_TEMP_GDDR 0xA3 /* GDDR Temperature */ +#define MR_FSC_TEMP_VDD 0xA4 /* VDD VR Temperature */ +#define MR_FSC_TEMP_DISP 0xA5 /* Display Transmitter */ + + +/* + * Simple I/O access routines for FSC registers + */ + +#ifdef MIC_IS_EMULATION +/* + * Emulation does not handle I2C busses in general. + * Not sure if FSC is emulated, but won't rely on it. + * The following stubs are for emulation only. + */ + +int +fsc_mgbr_read(uint32_t * v) +{ + if (v) + memset(v, 0, 4); + + return 0; +} + +void +fsc_mgbr_write(uint8_t c, uint32_t v) +{ +} + +#else + +#if 0 +#define RL printk("%s: %2x -> %08x\n", __FUNCTION__, mgbr_cmd, *val) +#define WL printk("%s: %2x <- %08x\n", __FUNCTION__, mgbr_cmd, *val) +#else +#define RL /* As nothing */ +#define WL /* As nothing */ +#endif + +static uint8_t mgbr_cmd; /* Last MGBR command */ +static uint32_t mgbr_dat; /* Last MGBR data */ +static uint32_t fan1_ovr; /* Current fan 1 override command */ + +/* + * Read MGBSR from SBOX + * + * This function only support MGBR commands MR_FSC_MGBR_{OVR|GEN}_CMD. + * The operation mode is that the command is written to MGBR and after + * a while the response shows up in MGBSR, which has fields that tell + * which command caused the response (bits 31:28), and for GEN command + * also which sensor was read. This function checks both fields. + * + * We'll poll at 1 mSec rate and allow up to 200 mSec for the + * FSC to provide the measure in the SBOX register. + */ + +int +fsc_mgbsr_read(uint32_t * val) +{ + uint32_t mgbsr; + int n; + + for(n = 0; n < 200; n++) { + mgbsr = mr_sbox_rl(0, SBOX_STATUS_FAN2); + if ((GET_BITS(31, 28, mgbsr) == mgbr_cmd) || + mgbr_cmd != MR_FSC_MGBR_GEN_CMD || mgbr_dat == 0) { + if (mgbr_cmd != MR_FSC_MGBR_GEN_CMD || + mgbr_dat <= 1) { + *val = GET_BITS(23, 0, mgbsr); + RL; + return 0; + } + if (GET_BITS(23, 16, mgbsr) == mgbr_dat) { + *val = GET_BITS(15, 0, mgbsr); + RL; + return 0; + } + } + myDELAY(1000); + } + + /* + * Timeout + */ + return 1; +} + + +/* + * Write MGBR on SBOX + * + * This function only support MGBR commands MR_FSC_MGBR_{OVR|GEN}_CMD. + * The OVR command only when fan 1 speed override is active. + * The GEN command is meant to cause a new selectable telemetry to be + * pushed into the MBGSR register by the FSC. Any necessary delays + * are handled here. Not by the read function. 
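+ *
+ * For illustration, a typical telemetry poll with this write/read
+ * pair (as done by mr_get_power()/get_fsc_pwr() further below):
+ *
+ *	uint32_t sel = MR_FSC_PWR_TOT, raw;
+ *
+ *	fsc_mgbr_write(MR_FSC_MGBR_GEN_CMD, &sel);
+ *	if (! fsc_mgbsr_read(&raw))
+ *		... reading is GET_BITS(15, 0, raw), power in Watts ...
+ *	if (fan1_ovr)
+ *		fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr);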
+ */ + +void +fsc_mgbr_write(uint8_t c, uint32_t * val) +{ + uint32_t prev_cmd, prev_dat; + uint32_t mgbr_reg, mgbr_sel; + uint32_t mgbsr, n; + + prev_cmd = mgbr_cmd; + prev_dat = mgbr_dat; + mgbr_cmd = GET_BITS(3, 0, c); + mgbr_dat = GET_BITS(15, 0, *val); + + mgbr_reg = PUT_BITS(31, 30, (mgbr_cmd >> 2)) | + PUT_BITS(23, 22, mgbr_cmd) | + PUT_BITS(21, 14, (mgbr_dat >> 8)) | + PUT_BITS( 7, 0, mgbr_dat); + WL; + mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN, mgbr_reg); + + /* + * Special for Set Fan Speed, we keep track of that one + */ + if (mgbr_cmd == MR_FSC_MGBR_OVR_CMD) { + if (GET_BIT(9, mgbr_dat)) + fan1_ovr = GET_BITS(9, 0, mgbr_dat); + else + fan1_ovr = 0; + } + + /* + * If the command issued is the same as the previous command, + * there is no way to determine if the MGBSR register is result + * of this or the previous command. It is not possible to clear + * MGBSR (read-only register), so if it is the same register, + * we'll just have to wait long enough for FSC to respond. + * Not all MGBR commands are mirrored into top 4 bits of MGBSR, + * those gets the simple delay treatment. + */ + if ((mgbr_cmd == prev_cmd && mgbr_dat == prev_dat) || + mgbr_cmd != MR_FSC_MGBR_GEN_CMD || mgbr_dat <= 1) { + myDELAY(100 * 1000); + return; + } + mgbr_sel = GET_BITS(7, 0, mgbr_dat); + for(n = 0; n < 200; n++) { + mgbsr = mr_sbox_rl(0, SBOX_STATUS_FAN2); + if (GET_BITS(31, 28, mgbsr) == mgbr_cmd) { + if (mgbr_cmd != MR_FSC_MGBR_GEN_CMD) + return; + if (GET_BITS(23, 16, mgbsr) == mgbr_sel) + return; + } + myDELAY(1000); + } +} +#undef RL +#undef WL +#endif /* EMULATION */ + + +/* + * Bypass for FSC access. + * Somewhat bizarre backdoor to the FSC's MGBR and MGBSR registers. + * The FSC interface is asymmetrical by nature since only the General + * Status MGBR command can cause data to be returned through MGBSR. + * To make it appear as telemetry registers can be read directly + * and without need for privileges, the Read operation is rigged to + * issue the appropriate MGBR registers itself when necessary. + * + * To protect the FSC integrity, the SET command are restricted + * to privileged users and is only accepting commands that cannot + * harm the FSC integrity. For now the whitelist consists of + * 0 Fan 1 Speed Override + * 1 Power Management and Control Config + * 11 General Status command + * + * To read back the response from a SET command the exact same value + * of 'parm' must be passed to a subsequent GET, in which case the + * the GET routine will not insert it's own MGBR command to select + * contents of the MGBSR to return. + * + * Notice that FSC read is equivalent of reading Fan #2 Status register + * and FSC write is equivalent of writing Fan Speed Override register. + * + * This reuse the SMC interface structs, but the semantics are different. + * + * Return: + * r->reg MGBSR sensor select (if applicable) or 0 + * r->width always 3 (24 bit wide field) + * r->rtn.val MGBSR sensor data + * + * Input: + * parm 31:24 MGBR command (must be 0xb) + * parm 15:0 MGBR data (sensor select) + */ + +int +mr_get_fsc(void * p) +{ + int rtn; + uint32_t raw; + struct mr_rsp_smc * r; + uint8_t cmd; + uint32_t dat, parm; + + /* + * Extract MGBR command and dat + */ + parm = * (uint32_t *) p; + cmd = GET_BITS(31, 24, parm); + dat = GET_BITS(15, 0, parm); + + /* + * If the request is different from the last issued + * 'SET' command in any way then 'GET' will issue the + * corresponding MGBR command, if allowed. 
+ */ + if (mgbr_cmd != cmd || mgbr_dat != dat) { + /* + * Only allow 'General Status' command + */ + if (cmd != MR_FSC_MGBR_GEN_CMD) + return -MR_ERR_PERM; + + /* + * Screen against known FSC register widths. + * All commands seems to be 16 bit wide. + * We insist that unused upper bits are zeros. + */ + if (dat != GET_BITS(23, 0, parm)) + return -MR_ERR_INVAUX; + + /* + * Better way to single out these numbers? + * 0 1 20 21 22 23 30 31 32 33 a1 a2 a3 a4 a5 + */ + if (! ((dat <= 1) || + (dat >= 0x20 && dat <= 0x23) || + (dat >= 0x30 && dat <= 0x33) || + (dat >= 0xa1 && dat <= 0xa5))) + return -MR_ERR_PERM; + + /* + * Write MGBR command + */ + fsc_mgbr_write(cmd, &dat); + } + + /* + * Read MGBSR result + */ + rtn = fsc_mgbsr_read(&raw); + if (rtn) + return -MR_ERR_SMC; + + /* + * Revert to normal if fan 1 speed override mode if needed. + */ + if (fan1_ovr) + fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr); + + r = (struct mr_rsp_smc *) p; + if (cmd == MR_FSC_MGBR_GEN_CMD) + r->reg = GET_BITS(7, 0, dat); + r->width = 3; + r->rtn.val = GET_BITS(23, 0, raw); + + return sizeof(*r); +} + + +int +mr_set_fsc(void * p) +{ + uint8_t cmd; + uint32_t dat, parm; + + parm = * (uint32_t *) p; + cmd = GET_BITS(31, 24, parm); + dat = GET_BITS(15, 0, parm); + + /* + * Screen against known FSC register widths. + * All commands seems to be 16 bit wide. + * We insist that unused upper bits are zeros. + */ + if (dat != GET_BITS(23, 0, parm)) + return -MR_ERR_INVAUX; + + /* + * 4-bit command code for FSC. + * Mask of valid codes needs just 16 bits. + * Max valid codes 0..1, 7..15, mask 0xff83. + * Non-debug registers reduce mask to 0x0803. + */ + if (! ((1 << cmd) & 0x0803)) + return -MR_ERR_PERM; + + /* + * Write MGBR command and revert to fan 1 speed override mode + * if needed (override in effect). Side effect of reverting + * is that any reponse in MGBSR must to be read before next + * FSC sample happens, i.e. within 50 mSec. + */ + fsc_mgbr_write(cmd, &dat); + if (fan1_ovr) + fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr); + + return 0; +} +#endif + + +/* +** +** Conversion between CP formats (uV, MHz, etc.) +** and hardware register formats (SBOX mostly). +** +*/ + + +/* + * VRM11 voltage converters + * Only bits 6:1 are being used as follows: + * Volt = Max - Res * (Bits -1) + * Bits = 1 + (Max - Volt) / Res + * The delta divided by resolution is 62. + * Bits value of 0 reserved for turning VR off. + */ + +#define VRM11_MAX 1600000 /* 1.60 V */ +#define VRM11_MIN 825000 /* 825 mV */ +#define VRM11_RES 12500 /* 12.5 mV */ + +uint32_t +vid2volt(uint8_t vid) +{ + uint32_t bits; + + bits = GET_BITS(6, 1, vid); + if (bits) + return VRM11_MAX - VRM11_RES * (bits - 1); + else + return 0; +} + +uint8_t +volt2vid(uint32_t uv) +{ + uint32_t delta, bits; + + bits = 0; + if (uv >= VRM11_MIN && uv <= VRM11_MAX) { + delta = VRM11_MAX - uv; + /* + * Why bother check for accurate input? + * Ignoring it just rounds up to nearest! + */ + if (! (delta % VRM11_RES)) + bits = 1 + delta / VRM11_RES; + } + return PUT_BITS(6, 1, bits); +} + + +/* + * PLL tables used to map between hw scale register + * value and actual frequencies given a fixed base. 
+ * The formula is (probably KnF specific) + * freq = Base * Feedback / Feedforward + * where + * Base = 100 MHz + * FeedBack = ratio bits 5:0 + * FeedForward = ratio bits 7:6 (00 -> 8, 01 -> 4, 10 -> 2, 11 -> 1) + * + * Overlapping ranges over feedback and feedforward values are + * handled by range table(s) below such that lower frequencies + * can be selected at a finer granularity. + */ + +struct pll_tab { + uint8_t clk_div; /* Feed forward */ + uint8_t min_mul; /* Lower feedback */ + uint8_t max_mul; /* Upper feedback */ + uint16_t min_clk; /* Lower frequency */ + uint16_t max_clk; /* Upper frequency */ + uint8_t step_size; /* Granularity */ +} cpu_tab[] = { /* CPU PLL */ + { 1, 20, 40, 2000, 4000, 100}, + { 2, 20, 39, 1000, 1950, 50}, + { 4, 20, 39, 500, 975, 25}, +}, gddr_tab[] = { /* GDDR PLL */ + {1, 14, 30, 1400, 3000, 100}, + {2, 12, 27, 600, 1350, 50}, +}; + +#define B_CLK 100 /* Base clock (MHz) */ + +static uint16_t +ratio2freq(uint8_t ratio, struct pll_tab * tab, int tablen) +{ + uint16_t fwd, bck; + + fwd = GET_BITS(7, 6, ~ratio); + bck = GET_BITS(5, 0, ratio); + + if (fwd < tablen && bck >= tab[fwd].min_mul && bck <= tab[fwd].max_mul) + return (B_CLK * bck) / tab[fwd].clk_div; + + return 0; +} + +static uint8_t +freq2ratio(uint16_t freq, struct pll_tab * tab, int tablen) +{ + int fwd; + + for(fwd = tablen - 1; fwd >= 0; fwd--) { + if (freq >= tab[fwd].min_clk && freq <= tab[fwd].max_clk) { + /* + * Why bother check for accurate input? + * Ignoring just rounds down to nearest supported! + */ + if (freq % tab[fwd].step_size) + break; + + return PUT_BITS(7, 6, ~fwd) | + PUT_BITS(5, 0, (freq * tab[fwd].clk_div) / B_CLK); + } + } + + return 0; +} + +static uint32_t +mr_mt_gf_r2f(uint8_t pll) +{ + return 1000 * ratio2freq(pll, gddr_tab, ARRAY_SIZE(gddr_tab)); +} + +static uint32_t +mr_mt_cf_r2f(uint8_t pll) +{ + return 1000 * ratio2freq(pll, cpu_tab, ARRAY_SIZE(cpu_tab)); +} + + +/* + * Board voltage sense converter + * Two 10 bit read-outs from SBOX register 0x1038. + * The format is very poorly documented, so no + * warranty on this conversion. Assumption is + * the reading is a binary fixed point number. + * bit 15 Valid reading if set + * bit 9:8 2 bit integer part + * bit 7:0 8 bit fraction part + * Return value is 0 (invalid) or voltage i uV. + */ + +uint32_t +bvs2volt(uint16_t sense) +{ + uint32_t res, f, msk; + + if (! GET_BIT(15, sense)) + return 0; + + /* + * First get integer contribution + * Then accumulate fraction contributions. + * Divide and add fraction if corresponding bit set. + */ + res = 1000000 * GET_BITS(9, 8, sense); + for(msk = (1 << 7), f = 1000000/2; msk && f; msk >>= 1, f >>= 1) + if (sense & msk) + res += f; + + return res; +} + + + +/* +** +** Initializations +** +** This has two intended purposes: +** - Do a on-time effort to collect info on properties that +** are not going to change after the initial setup by +** either bootstrap or kernel initialization. +** - Collect initial values on things we can modify. +** Intent is that unloading the ras module should reset +** all state to that of the time the module was loaded. 
+** +*/ + +static void __init +mr_mk_cf_lst(void) +{ + int i, n; + uint16_t f; + + n = 0; + for(i = ARRAY_SIZE(cpu_tab) -1; i >= 0; i--) { + for(f = cpu_tab[i].min_clk; + f <= cpu_tab[i].max_clk; + f += cpu_tab[i].step_size) { + freq.supt[n] = 1000 * f; + freq.slen = ++n; + if (n >= MR_PTAB_LEN) + return; + } + } +} + +static void __init +mr_mk_gf_lst(void) +{ + int i, n; + uint16_t f; + + n = 0; + for(i = ARRAY_SIZE(gddr_tab) -1; i >= 0; i--) { + for(f = gddr_tab[i].min_clk; + f <= gddr_tab[i].max_clk; + f += gddr_tab[i].step_size) { + gfreq.supt[n] = 1000 * f; + gfreq.slen = ++n; + if (n == MR_PTAB_LEN) + return; + } + } +} + +static void __init +mr_mk_cv_lst(void) +{ + int n; + uint32_t cv; + + n = 0; + for(cv = VRM11_MIN; cv <= VRM11_MAX; cv += VRM11_RES) { + volt.supt[n] = cv; + volt.slen = ++n; + if (n >= MR_PTAB_LEN) + return; + } +} + + +void __init +mr_mt_card_init(void) +{ + uint8_t * boot, * stage2, * parm; + uint32_t scr7, scr9, fsc; + uint32_t cv, cf, gv; + int i, j; + + /* + * VERS: + * Map flash and scan for version strings. + * Different methods for KnF and KnC. + */ + boot = ioremap(MIC_SPI_BOOTLOADER_BASE, MIC_SPI_BOOTLOADER_SIZE); + stage2 = ioremap(MIC_SPI_2ND_STAGE_BASE, MIC_SPI_2ND_STAGE_SIZE); + parm = ioremap(MIC_SPI_PARAMETER_BASE, MIC_SPI_PARAMETER_SIZE); + if (!boot || !stage2 || !parm) { + printk("mr_mt_init: ioremap failure: boot %p, stage2 %p, par %p\n", + boot, stage2, parm); + goto fail_iomap; + } + + /* + * Build numbers for fboot0 and fboot 1 repectively + */ + scr7 = mr_sbox_rl(0, SBOX_SCRATCH7); + + /* + * Boot block scan: + * Scan for string 'fboot0 version:' or use a 16 bit offset af offset 0xfff8. + * The latter points directly to the numeral, not to the string mentioned. + */ + for(i = 0; i < MIC_SPI_BOOTLOADER_SIZE - 32; i++) { + if (boot[i] != 'f') + continue; + + if (! memcmp(boot + i, "fboot0 version:", 15)) { + vers.fboot0[0] = scnprintf(vers.fboot0 + 1, MR_VERS_LEN -2, + "%s (build %d)", boot + i, GET_BITS(15, 0, scr7)); + break; + } + } + + /* + * Stage 2 scan: + * Scan for the magic string that locates the bootstrap version. This + * area is formatted as ' (<\0>, )', so the string we are + * looking for is 23 bytes later. + */ + for(i = 0; i < MIC_SPI_2ND_STAGE_SIZE - 32; i++) { + if (stage2[i] != 'L') + continue; + + if (! memcmp(stage2 + i, "Larrabee bootstrap", 18)) { + vers.fboot1[0] = scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2, + "fboot1 version: %s", stage2 + i + 23); + vers.fboot1[0] = scnprintf(vers.fboot1 + vers.fboot1[0], MR_VERS_LEN -2, + " (build %d)", GET_BITS(31, 16, scr7)); + break; + } + } + + /* + * Parameter block scan: + * On 4 byte aligned locations, look for chars 'EOB_'. + * Numerical values for that string is 0x5f424f45. + */ + for(i = j = 0; i < MIC_SPI_PARAMETER_SIZE; i += sizeof(uint32_t)) + if (*(uint32_t *)(parm + i) == 0x5f424f45) { + vers.flash[j][0] = scnprintf(vers.flash[j] + 1, MR_VERS_LEN -2, + "flash %c%c%c%c version: %s", + parm[i+4], parm[i+5], parm[i+6], parm[i+7], parm + i + 32); + if (++j >= ARRAY_SIZE(vers.flash)) + break; + } + +fail_iomap: + if (boot) + iounmap(boot); + if (stage2) + iounmap(stage2); + if (parm) + iounmap(parm); + +#if USE_FSC + /* + * Reset SMC registers to default (MGBR cmd 0, data 0). + */ + mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN, 0); + + /* + * The MGBR Status has this layout for (MGBR command 0). 
+ * 7:0	Firmware version
+ * 10:8	Card straps
+ * 11	Fan disable
+ * 20:12	Temperature sensor 5
+ * 27:21	Reserved
+ * 31:28	Command (0)
+ */
+#else
+ /*
+ * Contrary to register spec, the fan speed controller
+ * 2 status register has been redefined to hold version
+ * information of the FSC firmware.
+ * 7:0	Revision
+ * 10:8	FSC straps
+ * 11	Fan disable
+ * 19:12	Temperature sensor 5
+ * 27:20	Reserved
+ * 28	BIOS clear
+ * 31:29	Reserved
+ * This is probably an early version of the MGBR hack.
+ */
+#endif
+
+	/*
+	 * Retrieve FSC version and strap config
+	 */
+	fsc = mr_sbox_rl(0, SBOX_STATUS_FAN2);
+	vers.fsc[0] = scnprintf(vers.fsc + 1, MR_VERS_LEN -2,
+		"FSC firmware revision: %02x, straps %x",
+		GET_BITS(7, 0, fsc), GET_BITS(10, 8, fsc));
+
+	/*
+	 * VOLT:
+	 * Report all voltages the hardware can set.
+	 */
+	cv = mr_sbox_rl(0, SBOX_COREVOLT);
+	volt.set = vid2volt(GET_BITS(7, 0, cv));
+	mr_mk_cv_lst();
+
+	/*
+	 * FREQ:
+	 * In FreeBSD uOS the reference (nominal) frequency
+	 * is simply the value read from the SBOX at boot time.
+	 * We'll do the same and set 'def' to the same as 'current'.
+	 * Report all frequencies the hardware can set.
+	 */
+	cf = mr_sbox_rl(0, SBOX_COREFREQ);
+	freq.def = mr_mt_cf_r2f(GET_BITS(7, 0, cf));
+	mr_mk_cf_lst();
+
+	/*
+	 * GDDR:
+	 * See layout of scratch #9 in 'common'.
+	 * 23:16	Clock ratio encoding
+	 * 28:24	External clock frequency
+	 */
+	scr9 = mr_sbox_rl(0, SBOX_SCRATCH9);
+	gddr.speed = 2 * mr_mt_gf_r2f(GET_BITS(23, 16, scr9));
+
+	/*
+	 * GVOLT:
+	 * Report all voltages the hardware can set.
+	 * Kind of silly as these cannot be changed from uOS.
+	 * Cheat and set 'def' to the same as 'current'.
+	 */
+	gv = mr_sbox_rl(0, SBOX_MEMVOLT);
+	gvolt.set = vid2volt(GET_BITS(7, 0, gv));
+
+	/*
+	 * GFREQ:
+	 * Report all values the hardware can set.
+	 * Kind of silly as these cannot be changed from uOS.
+	 * Cheat and set 'ref' to the same as 'current'.
+	 */
+	gfreq.def = mr_mt_gf_r2f(GET_BITS(23, 16, scr9));
+	mr_mk_gf_lst();
+
+	/*
+	 * POWER:
+	 * In case the FSC is not working, or FSC support is not
+	 * compiled in, preset all power readings as invalid.
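+	 *
+	 * (The 3's in the initializer below are the status fields being
+	 * preset to "invalid"; get_fsc_pwr() later in this file sets
+	 * p_val = 3 in the same way when an FSC read fails.)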
+ */ + { + struct mr_rsp_power tmp = {{0, 3}, {0, 3}, {0, 3}, + {0, 3}, {0, 3}, {0, 3}, {0, 3}, + {0, 0, 0, 3, 3, 3}, + {0, 0, 0, 3, 3, 3}, + {0, 0, 0, 3, 3, 3}}; + power = tmp; + } + + /* + *TBD: Save card registers this module may change + */ +} + +void __exit +mr_mt_card_exit(void) +{ + /* + *TBD: Restore card registers this module may change + */ +} + + + +/* +** +** Card specific 'Get' functions +** +*/ + +int +mr_get_volt(void * p) +{ + struct mr_rsp_volt * r; + uint32_t cv, fsc; + + + cv = mr_sbox_rl(0, SBOX_COREVOLT); + volt.set = vid2volt(GET_BITS(7, 0, cv)); + + fsc = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE); + volt.cur = bvs2volt(GET_BITS(15, 0, fsc)); + + r = (struct mr_rsp_volt *) p; + *r = volt; + return sizeof(*r); +} + +int +mr_get_freq(void * p) +{ + struct mr_rsp_freq * r; + uint32_t cf; + + cf = mr_sbox_rl(0, SBOX_COREFREQ); + freq.cur = mr_mt_cf_r2f(GET_BITS(7, 0, cf)); + + r = (struct mr_rsp_freq *) p; + *r = freq; + return sizeof(*r); +} + +#if USE_FSC +/* + * Get Power stats from the FSC + */ +static void +get_fsc_pwr(uint32_t req, struct mr_rsp_pws * pws) +{ + uint32_t fsc; + + /* + * Read the FSC status + */ + fsc_mgbr_write(MR_FSC_MGBR_GEN_CMD, &req); + if (fsc_mgbsr_read(&fsc)) + pws->p_val = 3; + else { + pws->p_val = 0; + pws->prr = 1000000 * GET_BITS(15, 0, fsc); + } +} +#endif + +int +mr_get_power(void * p) +{ + struct mr_rsp_power * r; + +#if USE_FSC + uint8_t prev_cmd; + uint32_t prev_dat; + + /* + * Backup current OVERRIDE register + */ + prev_cmd = mgbr_cmd; + prev_dat = mgbr_dat; + + /* + * Get Power stats from the FSC + */ + get_fsc_pwr(MR_FSC_PWR_TOT, &power.tot0); + get_fsc_pwr(MR_FSC_PWR1_TOT, &power.inst); + get_fsc_pwr(MR_FSC_PWR_PCIE, &power.pcie); + get_fsc_pwr(MR_FSC_PWR_2X3, &power.c2x3); + get_fsc_pwr(MR_FSC_PWR_2X4, &power.c2x4); + + /* + * Revert to normal or fan 1 speed override mode if needed. + */ + if (fan1_ovr) + fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr); + else + fsc_mgbr_write(prev_cmd, &prev_dat); +#endif + + r = (struct mr_rsp_power *) p; + *r = power; + return sizeof(*r); +} + + +int +mr_get_plim(void * p) +{ + struct mr_rsp_plim * r; + +#if USE_FSC + uint32_t fsc, req, ofs; + + /* + * Read the FSC status + */ + req = MR_FSC_PMC_CFG; + fsc_mgbr_write(MR_FSC_MGBR_GEN_CMD, &req); + if (! 
fsc_mgbsr_read(&fsc)) { + ofs = 5 * GET_BITS(3, 0, fsc); + if (GET_BIT(4, fsc)) + plim.phys = 300 - ofs; + else + plim.phys = 300 + ofs; + plim.hmrk = plim.lmrk = plim.phys; + } +#endif + + r = (struct mr_rsp_plim *) p; + *r = plim; + return sizeof(*r); +} + + +int +mr_get_gfreq(void * p) +{ + struct mr_rsp_gfreq * r; + uint32_t gbr; + + gbr = mr_sbox_rl(0, SBOX_MEMORYFREQ); + gfreq.cur = mr_mt_gf_r2f(GET_BITS(7, 0, gbr)); + + r = (struct mr_rsp_gfreq *) p; + *r = gfreq; + return sizeof(*r); +} + + +int +mr_get_gvolt(void * p) +{ + struct mr_rsp_gvolt * r; + uint32_t gv, fsc; + + gv = mr_sbox_rl(0, SBOX_MEMVOLT); + gvolt.set = vid2volt(GET_BITS(7, 0, gv)); + + fsc = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE); + gvolt.cur = bvs2volt(GET_BITS(31, 16, fsc)); + + r = (struct mr_rsp_gvolt *) p; + *r = gvolt; + return sizeof(*r); +} + +int +mr_get_temp(void * p) +{ + struct mr_rsp_temp * r; + uint32_t btr1, btr2; /* Board temps */ + uint32_t die1, die2, die3; /* Die temps */ + uint32_t dmx1, dmx2, dmx3; /* Max die temps */ + uint32_t tsta, fsc; /* Thermal status */ + + btr1 = mr_sbox_rl(0, SBOX_BOARD_TEMP1); + btr2 = mr_sbox_rl(0, SBOX_BOARD_TEMP2); + die1 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP0); + die2 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP1); + die3 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP2); + dmx1 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP0); + dmx2 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP1); + dmx3 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP2); + tsta = mr_sbox_rl(0, SBOX_THERMAL_STATUS); + fsc = mr_sbox_rl(0, SBOX_STATUS_FAN2); + + /* + * Board temperatures. + * No idea of where on the board they are located, but + * guessing from FreeBSD comments they are: + * 0 Air Inlet + * 1 VCCP VR + * 2 GDDR (not sure which chip) + * 3 GDDR VR + * The temperature read from FSC #2 seems valid, but + * there's no mention of where it's measured. + * The readings does not make much sense. + * Sample readings are like this: + * fin 32 + * vccp 28 (vccp VR) + * vddq 33 (gddr VR) + * vddg 28 (FSC 2) + * So, at least 'fin' is wrong (or fan in reverse). + */ + temp.fin.cur = (btr1 & (1 << 15)) ? GET_BITS( 8, 0, btr1) : 0; + temp.vccp.cur = (btr1 & (1 << 31)) ? GET_BITS(24, 16, btr1) : 0; + temp.gddr.cur = (btr2 & (1 << 15)) ? GET_BITS( 8, 0, btr2) : 0; + temp.vddq.cur = (btr2 & (1 << 31)) ? GET_BITS(24, 16, btr2) : 0; + temp.vddg.cur = GET_BITS(19, 12, fsc); + temp.brd.cur = 0; + if (temp.fin.cur > temp.brd.cur) + temp.brd.cur = temp.fin.cur; + if (temp.vccp.cur > temp.brd.cur) + temp.brd.cur = temp.vccp.cur; + if (temp.gddr.cur > temp.brd.cur) + temp.brd.cur = temp.gddr.cur; + if (temp.vddq.cur > temp.brd.cur) + temp.brd.cur = temp.vddq.cur; + temp.fout.c_val = 3; + temp.gddr.c_val = 3; + + /* + * Die temperatures. + */ + temp.die.cur = (tsta & (1 << 31)) ? GET_BITS(30, 22, tsta) : 0; + temp.dies[0].cur = GET_BITS( 8, 0, die1); + temp.dies[1].cur = GET_BITS(17, 9, die1); + temp.dies[2].cur = GET_BITS(26, 18, die1); + temp.dies[3].cur = GET_BITS( 8, 0, die2); + temp.dies[4].cur = GET_BITS(17, 9, die2); + temp.dies[5].cur = GET_BITS(26, 18, die2); + temp.dies[6].cur = GET_BITS( 8, 0, die3); + temp.dies[7].cur = GET_BITS(17, 9, die3); + temp.dies[8].cur = GET_BITS(26, 18, die3); + + /* + * Die max temp (min is not reported to CP). 
+ */ + temp.dies[0].max = GET_BITS( 8, 0, dmx1); + temp.dies[1].max = GET_BITS(17, 9, dmx1); + temp.dies[2].max = GET_BITS(26, 18, dmx1); + temp.dies[3].max = GET_BITS( 8, 0, dmx2); + temp.dies[4].max = GET_BITS(17, 9, dmx2); + temp.dies[5].max = GET_BITS(26, 18, dmx2); + temp.dies[6].max = GET_BITS( 8, 0, dmx3); + temp.dies[7].max = GET_BITS(17, 9, dmx3); + temp.dies[8].max = GET_BITS(26, 18, dmx3); + + r = (struct mr_rsp_temp *) p; + *r = temp; + return sizeof(*r); +} + + +int +mr_get_fan(void * p) +{ + struct mr_rsp_fan * r; + uint32_t fan1, fovr; + + r = (struct mr_rsp_fan *) p; + fan1 = mr_sbox_rl(0, SBOX_STATUS_FAN1); + +#if USE_FSC + fovr = fan1_ovr; + r->override = GET_BIT(9, fovr); +#else + fovr = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN); + r->override = GET_BIT(15, fovr); +#endif + + r->rpm = GET_BITS(15, 0, fan1); + if (r->override) + r->pwm = GET_BITS( 7, 0, fovr); + else + r->pwm = GET_BITS(23, 16, fan1); + + return sizeof(*r); +} + + +int +mr_get_ecc(void * p) +{ + struct mr_rsp_ecc * r; + + r = (struct mr_rsp_ecc *) p; + *r = ecc; + return sizeof(*r); +} + + +int +mr_get_trbo(void * p) +{ + struct mr_rsp_trbo * r; + + r = (struct mr_rsp_trbo *) p; + *r = trbo; + return sizeof(*r); +} + + +int +mr_get_pmcfg(void * p) +{ + struct mr_rsp_pmcfg * r; + + r = (struct mr_rsp_pmcfg *) p; + *r = pmcfg; + return sizeof(*r); +} + + +/* +** +** Card specific 'Set' functions +** Input screening takes place here (to the extent possible). +** +*/ + + +int +mr_set_volt(void * p) +{ + uint32_t cv, msk, new, val; + uint8_t vid; + int i; + + /* + * Ensure it's a supported value + */ + val = *(uint32_t *) p; + for(i = 0; i < MR_PTAB_LEN; i++) + if (volt.supt[i] == val) + break; + if (i == MR_PTAB_LEN) + return -MR_ERR_RANGE; + + /* + * Read-modify-write the core voltage VID register + */ + vid = volt2vid(val); + cv = mr_sbox_rl(0, SBOX_COREVOLT); + msk = ~PUT_BITS(7, 0, ~0); + new = (cv & msk) | PUT_BITS(7, 0, vid); + mr_sbox_wl(0, SBOX_COREVOLT, new); + printk("SetVolt: %d -> %08x (%08x)\n", val, new, cv); + + return 0; +} + + +int +mr_set_freq(void * p) +{ + uint32_t cf, msk, new, val; + uint8_t rat; + int i; + + /* + * Ensure it's a supported value + */ + val = *(uint32_t *) p; + for(i = 0; i < MR_PTAB_LEN; i++) + if (freq.supt[i] == val) + break; + if (i == MR_PTAB_LEN) + return -MR_ERR_RANGE; + + /* + * Read-modify-write the core frequency PLL register + * + *TBD: or should we just overwrite it? + * Register fields (of relevance): + * 7:0 New PLL encoding + * 16 Async Operation + * 31 Override fuse setting + */ + rat = freq2ratio(val/1000, cpu_tab, ARRAY_SIZE(cpu_tab)); + cf = mr_sbox_rl(0, SBOX_COREFREQ); + msk = ~(PUT_BITS(7, 0, ~0) | PUT_BIT(16, 1) | PUT_BIT(31, 1)); + new = (cf & msk) | PUT_BITS(7, 0, rat) | PUT_BIT(31, 1); + mr_sbox_wl(0, SBOX_COREFREQ, new); + printk("SetFreq: %d -> %08x (%08x)\n", val, new, cf); + + /* + *TBD: + * We just changed the system's base clock without + * re-calibrating the APIC timer tick counters. + * There is probably a function call for the cpu-freq + * driver to deal with this, so should we call it? + */ + + return 0; +} + + +int +mr_set_plim(void * p) +{ + plim.phys = *(uint32_t *) p; + return 0; +} + + +int +mr_set_fan(void * p) +{ + struct mr_set_fan * fc; + + /* + * Ensure operation is valid, i.e. no garbage + * in override flag (only 1 and 0 allowed) and + * that pwm is not zero (or above lower limit?) 
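+ *
+ * For example (values for illustration only): override = 1, pwm = 70
+ * is passed to the FSC below as OVR command data PUT_BIT(9, 1) | 70,
+ * forcing fan 1 to that PWM setting, while override = 0 clears the
+ * override and presumably returns fan 1 to automatic speed control.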
+ */ + fc = (struct mr_set_fan *) p; + if (GET_BITS(7, 1, fc->override) || !fc->pwm) + return -MR_ERR_RANGE; + +#if USE_FSC + { + uint32_t dat; + + /* + * Craft the default OVERRIDE command and write it to FSC + * through the MGBR register (command 0). + * This does not change the telemetry in MGBSR, so only way + * to ensure it gets registered by FSC is to wait it out + * (happens in fsc_mgbr_write function). + */ + if (fc->override) + dat = PUT_BIT(9, 1) | fc->pwm; + else + dat = 0; + fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &dat); + } +#else + /* + * Read-modify-write the fan override register + * Control of fan #1 only, don't touch #2 + */ + { + uint32_t fcor, fco1, fco2; + + fcor = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN); + fco2 = GET_BITS(31, 16, fcor); + if (fc->override) + fco1 = PUT_BIT(15, 1) | fc->pwm; + else + fco1 = 0; + mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN, + PUT_BITS(31, 16, fco2) | PUT_BITS(15, 0, fco1)); + } +#endif + + return 0; +} + + +int +mr_set_trbo(void * p) +{ + return 0; +} + diff --git a/ras/micras_main.c b/ras/micras_main.c new file mode 100644 index 0000000..7e92fed --- /dev/null +++ b/ras/micras_main.c @@ -0,0 +1,2650 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS module driver + * + * Contains code to handle module install/deinstall + * and handling proper registration(s) to SCIF, sysfs + * pseudo file system, timer ticks, I2C driver and + * other one-time tasks. 
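+ *
+ * Other kernel modules can reach the same query/set functionality
+ * through the exported micras_mt_call() defined below.  A minimal
+ * usage sketch, assuming the caller provides a buffer large enough
+ * for the response struct:
+ *
+ *	struct mr_rsp_temp temp;
+ *	int err;
+ *
+ *	err = micras_mt_call(MR_REQ_TEMP, &temp);
+ *	if (err < 0)
+ *		... failed, err is a -MR_ERR_* code ...
+ *	else
+ *		... temp now holds die/board temperatures ...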
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" + +#if MT_VERBOSE || MC_VERBOSE || PM_VERBOSE +/* + * For making scif_epd_t non-opague + */ +#define _MIC_MICBASEDEFINE_REGISTERS_H_ 1 +#include +#endif + +/* +** Lookup table to map API opcode into MT function. +** +** As we have to deal with both KnF and KnC, functions to +** retrieve information may be generic, in micras_common.c, +** or platform specific, in micras_kn{cf}.c. +** Code location is transparent to this table. +** +** Some MT functions can safely be called without +** serialization, e.g. if they are read-only or use +** atomics to get/set variables. The 'simple' flag tells +** which functions are safe to call without serialization. +** Other functions should be called thru micras_mt_call(). +** +** See micras_api.h and micpm_api.h for function details. +*/ + +static struct fnc_tab fnc_map[] = { + { 0, 0, 0, 0 }, + { MR_REQ_HWINF, 1, 0, mr_get_hwinf }, + { MR_REQ_VERS, 1, 0, mr_get_vers }, + { MR_REQ_CFREQ, 0, 0, mr_get_freq }, + { MR_SET_CFREQ, 0, 1, mr_set_freq }, + { MR_REQ_CVOLT, 0, 0, mr_get_volt }, + { MR_SET_CVOLT, 0, 1, mr_set_volt }, + { MR_REQ_PWR, 0, 0, mr_get_power }, + { MR_REQ_PLIM, 0, 0, mr_get_plim }, + { MR_SET_PLIM, 0, 1, mr_set_plim }, + { MR_REQ_CLST, 0, 0, mr_get_clst }, + { MR_ENB_CORE, 0, 1, 0 }, + { MR_DIS_CORE, 0, 1, 0 }, + { MR_REQ_GDDR, 1, 0, mr_get_gddr }, + { MR_REQ_GFREQ, 1, 0, mr_get_gfreq }, + { MR_SET_GFREQ, 1, 1, 0 }, + { MR_REQ_GVOLT, 1, 0, mr_get_gvolt }, + { MR_SET_GVOLT, 1, 1, 0 }, + { MR_REQ_TEMP, 0, 0, mr_get_temp }, + { MR_REQ_FAN, 0, 0, mr_get_fan }, + { MR_SET_FAN, 0, 1, mr_set_fan }, + { MR_REQ_ECC, 1, 0, mr_get_ecc }, + { MR_SET_ECC, 0, 1, 0 }, + { MR_REQ_TRC, 1, 0, mr_get_trc }, + { MR_SET_TRC, 1, 1, mr_set_trc }, + { MR_REQ_TRBO, 0, 0, mr_get_trbo }, + { MR_SET_TRBO, 0, 1, mr_set_trbo }, + { MR_REQ_OCLK, 0, 0, 0 }, + { MR_SET_OCLK, 0, 1, 0 }, + { MR_REQ_CUTL, 0, 0, mr_get_cutl }, + { MR_REQ_MEM, 0, 0, mr_get_mem }, + { MR_REQ_OS, 0, 0, mr_get_os }, + { MR_REQ_PROC, 0, 0, mr_get_proc }, + { MR_REQ_THRD, 0, 0, 0 }, + { MR_REQ_PVER, 1, 0, mr_get_pver }, + { MR_CMD_PKILL, 0, 1, mr_cmd_pkill }, + { MR_CMD_UKILL, 0, 1, mr_cmd_ukill }, +#if defined(CONFIG_MK1OM) + { MR_GET_SMC, 0, 0, mr_get_smc }, + { MR_SET_SMC, 0, 0, mr_set_smc }, +#else +#if defined(CONFIG_ML1OM) && USE_FSC + { MR_GET_SMC, 0, 0, mr_get_fsc }, + { MR_SET_SMC, 0, 1, mr_set_fsc }, +#else + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, +#endif +#endif + { MR_REQ_PMCFG, 0, 0, mr_get_pmcfg }, +#if defined(CONFIG_MK1OM) + { MR_REQ_LED, 0, 0, mr_get_led }, + { MR_SET_LED, 0, 1, mr_set_led }, + { MR_REQ_PROCHOT, 0, 0, mr_get_prochot }, + { MR_SET_PROCHOT, 0, 1, mr_set_prochot }, + { MR_REQ_PWRALT, 0, 0, mr_get_pwralt }, + { MR_SET_PWRALT, 0, 1, mr_set_pwralt }, + { MR_REQ_PERST, 0, 0, mr_get_perst }, + { MR_SET_PERST, 0, 1, mr_set_perst }, + { MR_REQ_TTL, 0, 0, mr_get_ttl }, +#else + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, +#endif +#if defined(CONFIG_MK1OM) && USE_PM + { 0, 0, 0, 0 }, + { PM_REQ_PL0, 1, 0, pm_get_pl0 }, + { PM_SET_PL0, 1, 1, pm_set_pl0 }, + { PM_REQ_PL1, 1, 0, pm_get_pl1 }, + { PM_SET_PL1, 1, 1, pm_set_pl1 }, + { PM_REQ_PAVG, 1, 0, pm_get_pavg }, + { PM_REQ_PTTL, 1, 0, 
pm_get_pttl }, + { PM_REQ_VOLT, 1, 0, pm_get_volt }, + { PM_REQ_TEMP, 1, 0, pm_get_temp }, + { PM_REQ_TACH, 1, 0, pm_get_tach }, + { PM_REQ_TTTL, 1, 0, pm_get_tttl }, + { PM_REQ_FTTL, 1, 0, pm_get_fttl }, + { PM_SET_FTTL, 1, 1, pm_set_fttl }, +#endif +}; + + + +/* +** +** The monitoring thread. +** In fact this is a work_queue, that receive work items +** from several independent parties, such as SCIF, sysfs, +** out of band telemetry, PM and possibly timers. +** +** These parties pass a structure with information necessary +** for the call-out function called by the MT thread to operate. +** These structures must include the work item structure, such +** that the container_of() mechanism can be used to locate it. +** +** The MT thread does not by itself provide any feed-back on +** when a task was executed nor the results from it. Therefore +** if a feedback is requred, then the callout needs to provide +** their own methods, such as the wait queue used by function +** micras_mt_data() below. Experiments has shown that it is not +** safe to place work item or the wait queue on a stack (no +** idea why, could be a bug). +** +*/ + +static int micras_stop; /* Module shutdown */ +static struct delayed_work micras_wq_init; /* Setup work item */ +static struct delayed_work micras_wq_tick; /* Timer tick token */ +static struct workqueue_struct * micras_wq; /* Monitor thread */ + int micras_priv; /* Call-out privileged */ + + +typedef struct wq_task { + int req; /* Request opcode */ + int rtn; /* Return value */ + int priv; /* Privileged */ + void * ptr; /* Response buffer */ + int (* fnc)(void *); /* Call out */ + struct work_struct wrk; /* Work item */ + wait_queue_head_t wqh; /* Wait queue header */ +} WqTask; + + +#if defined(CONFIG_MK1OM) && WA_4845465 +/* + * SMC die temp update job. + * + * As per HSD #4845465 we push the die temperature + * to the SMC instead of the usual reverse direction. + * This has to happen at around 50 mSec intervals, which should + * be possible with a work queue implementation. If that turns out + * not to be reliable enough we may need a more direct approach. + * During the experiment, we want to override the pushed temp. + */ + +#define DIE_PROC 1 /* Enable die temp override */ +#define SMC_PERIOD 50 /* SMC update interval, mSec */ +#define JITTER_STATS 1 /* Enable jitter measurements */ + +static struct delayed_work micras_wq_smc; /* SMC update token */ +static int smc_4845465; /* SMC push capable */ +#if DIE_PROC +static int die_override; /* Temperature override */ +#endif + +static void +micras_mt_smc(struct work_struct *work) +{ + extern int mr_smc_wr(uint8_t, uint32_t *); + static uint64_t n; + uint32_t tmp; + uint32_t ts2, mfs; + + if (! micras_stop) { + /* + * Re-arm for a callback in about 1 second. + * There is no guarantee this will be more than approximate. + */ + queue_delayed_work(micras_wq, &micras_wq_smc, msecs_to_jiffies(SMC_PERIOD)); + } + +#if JITTER_STATS + /* + * Time the interval in order to get some + * measurement on what jitter to expect. + * Leave a log message once every minute. + */ + { + static uint64_t d, t1, t2, s, hi, lo = ~0; + + t2 = rdtsc(); + if (n) { + d = t2 - t1; + s += d; + if (d > hi) + hi = d; + if (d < lo) + lo = d; +#if 1 + { + /* + * Show jitter in buckets representing 5 mSec. + * The center (#20) represent +- 2.5 mSec from reference. + * It is assumed TSC running at 1.1 GHz here, if PM kicks + * in the mesurements may be way off because it manipulate + * the system clock and indirectly the jiffy counter. 
+ * It is assumed TSC running at 1.1 GHz here. + */ + static uint64_t buckets[41]; + int bkt; + int64_t err; + + err = ((d * 10) / 11) - (50 * 1000 * 1000); + if (err < -(25 * 100 * 1000)) + bkt = 19 + (err + (25 * 100 * 1000)) / (5 * 1000 * 1000); + else + if (err > (25 * 100 * 1000)) + bkt = 21 + (err - (25 * 100 * 1000)) / (5 * 1000 * 1000); + else + bkt = 20; + if (bkt < 0) + bkt = 0; + if (bkt > 40) + bkt = 40; + buckets[bkt]++; + if ((n % ((10 * 1000)/SMC_PERIOD)) == ((10 * 1000)/SMC_PERIOD) - 1) { + printk("smc_upd: dist"); + for(bkt = 0; bkt < 41; bkt++) { + if (bkt == 20) + printk(" | %lld |", buckets[bkt]); + else + printk(" %lld", buckets[bkt]); + } + printk("\n"); + } + } +#endif + if ((n % ((60 * 1000)/SMC_PERIOD)) == ((60 * 1000)/SMC_PERIOD) - 1) + printk("smc_upd: %lld, min %lld, max %lld, avg %lld\n", n, lo, hi, s / n); + } + t1 = t2; + } +#endif /* JITTER_STATS */ + + /* + * Send update to SMC to register 0x50. + * The value to push at the SMC must have following content + * + * Bits 9:0 Device Temperature + * -> THERMAL_STATUS_2 bits 19:10 + * Bit 10 Valid bit + * -> THERMAL_STATUS_2 bit 31 + * Bits 20:11 Thermal Monitor Control value + * -> THERMAL_STATUS_2 bits 9:0 + * Bits 30:21 Fan Thermal Control value + * -> MICROCONTROLLER_FAN_STATUS bits 17:8 + */ + + n++; + ts2 = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2); + mfs = mr_sbox_rl(0, SBOX_MICROCONTROLLER_FAN_STATUS); + +#if DIE_PROC + if (die_override) + tmp = GET_BITS(9, 0, die_override); + else +#endif + tmp = PUT_BITS(9, 0, GET_BITS(19, 10, ts2)); + tmp |= PUT_BIT(10, GET_BIT(31, ts2)) | + PUT_BITS(20, 11, GET_BITS(9, 0, ts2)) | + PUT_BITS(30, 21, GET_BITS(17, 8, mfs)); + + if (mr_smc_wr(0x50, &tmp)) + printk("smc_upd: %lld, tmp %d, SMC write failed\n", n, tmp); +} + + +#if DIE_PROC +/* + * Test proc file to override die temperature push. + * A value of 0 means no override, any other value is + * pushed as if it was a 'device temperature'. + */ + +static struct proc_dir_entry * die_pe; + +/* + * On writes: scan input line for single number. + */ + +static ssize_t +die_write(struct file * file, const char __user * buff, size_t len, loff_t * off) +{ + char * buf; + char * ep, * cp; + unsigned long ull; + int err; + + /* + * Get input line into kernel space + */ + if (len > PAGE_SIZE -1) + len = PAGE_SIZE -1; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (! buf) + return -ENOMEM; + if (copy_from_user(buf, buff, len)) { + err = -EFAULT; + goto wr_out; + } + buf[len] = '\0'; + cp = ep = (char *) buf; + + /* + * Read a number in strtoul format 0. + */ + while(isspace(*cp)) + cp++; + ull = simple_strtoull(cp, &ep, 0); + if (ep == cp || (*ep != '\0' && !isspace(*ep))) { + printk("Invalid die temp given\n"); + err = -EINVAL; + goto wr_out; + } + + die_override = GET_BITS(9, 0, ull); + printk("Die temp override set to %d C\n", die_override); + + /* + * Swallow any trailing junk up to next newline + */ + ep = strchr(buf, '\n'); + if (ep) + cp = ep + 1; + err = cp - buf; + +wr_out: + kfree(buf); + return err; +} + + +/* + * On reads: return string of current override temp. 
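+ *
+ * Usage sketch (the proc node is created elsewhere; the path below
+ * is only a placeholder):
+ *
+ *	echo 85 > /proc/<die-temp-node>		push 85 C instead of sensor
+ *	echo 0  > /proc/<die-temp-node>		revert to real die readings
+ *	cat /proc/<die-temp-node>		show current override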
+ */ + +static ssize_t +die_read(struct file * file, char __user * buff, size_t count, loff_t *ppos) +{ + char buf[32]; + size_t len; + + len = snprintf(buf, sizeof(buf), "%d\n", die_override); + return simple_read_from_buffer(buff, count, ppos, buf, len); +} + + +static const struct file_operations proc_die_operations = { + .read = die_read, + .write = die_write, + .llseek = no_llseek, +}; +#endif /* DIE_PROC */ +#endif /* WA_4845465 */ + + +/* + * Timer tick job + * + * This is for periodic updates from the SMC, + * which (with a little luck) can be avoided + * at the cost of I2C communications during + * actual CP queries. + */ + +static void +micras_mt_tick(struct work_struct *work) +{ +#if MT_TIMER + static int n; + + n++; + if (! micras_stop) { + /* + * Re-arm for a callback in about 1 second. + * There is no guarantee this will be more than approximate. + */ + queue_delayed_work(micras_wq, &micras_wq_tick, msecs_to_jiffies(MT_PERIOD)); + } + + /* + * Dump elog prints into the kernel log + *TBD: debug tool, time-shifts messages, remove eventually. + */ + { + int msg_top, msg_id; + char * buf; + + msg_id = atomic_read(&ee_seen); + msg_top = atomic_read(&ee_msg); + while(++msg_id <= msg_top) { + buf = ee_buf + (msg_id % EE_BUF_COUNT) * EE_BUF_LINELEN; + if (! *buf) + break; + printk("%s", buf); + *buf = '\0'; + atomic_inc(&ee_seen); + } + } +#endif +} + + +/* + * Handle SCIF & sysfs show/store requests + * + * By convention we know that the work item is member of + * a larger struct, which can readily be found using the + * container_of mechanism. + * + * Otherwise this routine just calls the function stored + * in the larger struct's mt_data element, and on its + * return wake up whoever is waiting for it's completion. + */ + +static void +micras_mt_data(struct work_struct * work) +{ + struct wq_task * wq; + + wq = container_of(work, struct wq_task, wrk); + micras_priv = wq->priv; + wq->rtn = wq->fnc(wq->ptr); + micras_priv = 0; + wake_up_all(& wq->wqh); +} + + +/* + * Helper to pass jobs (work items) to the monitoring thread. + * + * As input it receives details on function to be called, one + * argument to pass to that function, the opcode associated + * with the function and a function return value. The latter + * will be set to -MR_ERR_PEND, and we'll expect the callout + * function to change it. + * + * The work item is the only piece of information passed to + * the work queue callout, so we'll wrap it into a larger + * structure along with the received details such that the + * work queue can perform a function call on our behalf. + */ + +static int +micras_mt_tsk(struct wq_task * wq) +{ + int err; + +#if MT_VERBOSE + uint64_t start, stop; + start = rdtsc(); +#endif + + /* + * Create a work item for the RAS thread, + * enqueue and wait for it's completion. + * + *TBD: Timeout length to be revisited + */ + wq->rtn = -MR_ERR_PEND; + INIT_WORK_ONSTACK(&wq->wrk, micras_mt_data); + init_waitqueue_head(&wq->wqh); + queue_work(micras_wq, &wq->wrk); + err = wait_event_interruptible_timeout(wq->wqh, + wq->rtn != -MR_ERR_PEND, msecs_to_jiffies(1000)); + + /* + * Check for potential errors, which for now can only be + * "interrupted" or "timeout". In both cases try cancel the work + * item from MT thread. If cancel succeds (returns true) then + * the work item was still "pending" and is now removed from the + * work queue, i.e. it is safe to continue (with error). 
+ * Otherwise, the cancel operation will wait for the work item's + * call-out function to finish, which kind of defies the purpose + * of "interruptable". However, we cannot leave until it is certain + * that it will not be accessed by the RAS thread. + */ + if (err == -ERESTARTSYS || err == 0) { + printk("MT tsk: interrupted or failure, err %d\n", err); + printk("MT tsk: FAILED: cmd %d, rtn %d, fnc %p, ptr %p\n", + wq->req, wq->rtn, wq->fnc, wq->ptr); + + err = cancel_work_sync(&wq->wrk); + printk("MT tsk: work canceled (%d)\n", err); + } + + /* + * Completed, turn interrupts and timeouts into MR errors. + */ + err = wq->rtn; + if (err == -MR_ERR_PEND) + err = -MR_ERR_NOVAL; + +#if MT_VERBOSE + stop = rdtsc(); + printk("MT tsk: cmd %d, err %d, time %llu\n", wq->req, err, stop - start); +#endif + return err; +} + + +/* + * Public interface to the MT functions + * Caller responsible for passing a buffer large enough + * to hold data for reads or writes (1 page will do, + * but structs matching the commands are recommended). + * Returned data are structs defined in micras.h + */ + +int +micras_mt_call(uint16_t cmd, void * buf) +{ + struct wq_task * wq; + int err; + + if (micras_stop) + return -MR_ERR_UNSUP; + + if (cmd > MR_REQ_MAX) + return -MR_ERR_INVOP; + + err = -MR_ERR_UNSUP; + if (fnc_map[cmd].fnc) { + if (fnc_map[cmd].simple) { + /* + * Fast access, just call function + */ + err = fnc_map[cmd].fnc(buf); + } + else { + /* + * Slow access, go through serializer. + * We allocate a work queue task for the MT thread, + * stuff arguments in it, run task, and then free + * work queue task. + */ + wq = kmalloc(sizeof(* wq), GFP_KERNEL); + if (! wq) { + printk("Scif: CP work task alloc failed\n"); + return -MR_ERR_NOMEM; + } + + memset(wq, '\0', sizeof(*wq)); + wq->req = cmd; + wq->priv = 1; + wq->fnc = (int (*)(void *)) fnc_map[cmd].fnc; + wq->ptr = buf; + err = micras_mt_tsk(wq); + + kfree(wq); + } + } + + return err; +} +EXPORT_SYMBOL_GPL(micras_mt_call); + + + +/* +** +** The sysfs nodes provided by this module is not really associated +** with a 'struct device', since we don't create device entries for +** access through '/dev'. Instead we register a 'struct class' +** with nodes defined with the CLASS_ATTR macro. +** Reasons for this choice are: +** - we don't want a device node created +** - we don't need (at least now) to create udev events +** - we don't act on suspend/resume transitions +** - we don't want to have our files unnecessarily deep +** in the sysfs file system. +** +** The sysfs layout is intended to look like: +** +** /sys/class/micras/ Root of this driver +** /clst Core information +** /cutl Core utilization +** /ecc Error correction mode +** /fan Fan controller +** /freq Core frequency +** /gddr GDDR devices +** /gfreq GDDR speed +** /gvolt GDDR voltage +** /hwinf Hardware Info +** /mem Memory utilization +** /os OS status +** /plim Power envelope +** /power Card power +** /temp Board tempearatures +** /trbo Turbo mode +** /trc Trace level +** /vers uOS/Flash version +** /volt Core voltage +** +** The following should be removed as there are better tools +** available in /proc//{stat|status|smap}, /proc/meminfo, +** /proc/stat, /proc/uptime, /proc/loadavg, and /proc/cpuinfo: +** clst, cutl, mem, os +** +** Below we hand-craft a 'micras' class to sit under '/sys/class' +** with attribute nodes directly under it. 
Each attribute may +** have a 'show' and a 'store' handler, both called with a reference +** to its class (ras_class, may hold private data), it's class_attribute, +** a buffer reference, and for 'store's a string length. The buffer +** passed to 'show' is one page (PAGE_SIZE, 4096) which sets the +** upper limit on the return string(s). Return value of 'store' +** has to be either an error code (negative) or the count of bytes +** consumed. If consumed less than what's passed in, the store routine +** will be called again until all input data has been consumed. +** +** Function pointers are hardwired by the macros below since it +** is easy and simpler than using the fnc_map table. This may +** change if the command set expands uncontrolled. +** We have local helper funtions to handle array prints. +** Any locking required is handled in called routines, not here. +** +** Note: This is not coded for maximum performance, since the +** use of the MT thread to serialize access to card data +** has a cost of two task switches attached, both which +** may cause delays due to other system activity. +** +*/ + + +/* + * Hack alert! + * Formatting routines for arrays of 16/32/64 bit unsigned ints. + * This reduces the printf argument list in _SHOW() macros below + * considerably, though perhaps at a cost in code efficiency. + * They need a scratch buffer in order to construct long lines. + * A quick swag at the largest possible response tells that we'll + * never exceed half if the page we are given to scribble into. + * So, instead of allocating print space, we'll simply use 2nd + * half of the page as scratch buffer. + */ + +#define BP (buf + (PAGE_SIZE/2)) /* Scratch pad location */ +#define BL (PAGE_SIZE/2 - 1) /* Scratch size */ + + +static char * +arr16(int16_t * arr, int len, char * buf, int siz) +{ + int n, bs; + + bs = 0; + for(n = 0; n < len && bs < siz; n++) + bs += scnprintf(buf + bs, siz - bs, "%s%u", n ? " " : "", arr[n]); + buf[bs] = '\0'; + + return buf; +} + + +static char * +arr32(uint32_t * arr, int len, char * buf, int siz) +{ + int n, bs; + + bs = 0; + for(n = 0; n < len && bs < siz; n++) + bs += scnprintf(buf + bs, siz - bs, "%s%u", n ? " " : "", arr[n]); + buf[bs] = '\0'; + + return buf; +} + + +static char * +arr64(uint64_t * arr, int len, char * buf, int siz) +{ + int n, bs; + + bs = 0; + for(n = 0; n < len && bs < siz; n++) + bs += scnprintf(buf + bs, siz - bs, "%s%llu", n ? " " : "", arr[n]); + buf[bs] = '\0'; + + return buf; +} + + +#define _SHOW(op,rec,nam,str...) \ + static ssize_t \ + micras_show_##nam(struct class *class, \ + struct class_attribute *attr, \ + char *buf) \ + { \ + struct mr_rsp_##rec * r; \ + struct wq_task * wq; \ + int len; \ + int err; \ +\ + wq = kmalloc(sizeof(* wq) + sizeof(* r), GFP_KERNEL); \ + if (! 
wq) \ + return -ENOMEM; \ +\ + memset(wq, '\0', sizeof(* wq)); \ + r = (struct mr_rsp_##rec *)(wq + 1); \ + wq->req = MR_REQ_##op; \ + wq->fnc = (int (*)(void *)) mr_get_##nam; \ + wq->ptr = r; \ + err = micras_mt_tsk(wq); \ +\ + if (err < 0) { \ + len = 0; \ + *buf = '\0'; \ + } \ + else { \ + len = scnprintf(buf, PAGE_SIZE, ##str); \ + } \ +\ + kfree(wq); \ + return len; \ + } + +_SHOW(HWINF, hwinf, hwinf, "%u %u %u %u %u %u " + "%c%c%c%c%c%c%c%c%c%c%c%c " + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + r->rev, r->step, r->substep, r->board, r->fab, r->sku, + r->serial[0], r->serial[1], r->serial[2], r->serial[3], + r->serial[4], r->serial[5], r->serial[6], r->serial[7], + r->serial[8], r->serial[9], r->serial[10], r->serial[11], + r->guid[0], r->guid[1], r->guid[2], r->guid[3], + r->guid[4], r->guid[5], r->guid[6], r->guid[7], + r->guid[8], r->guid[9], r->guid[10], r->guid[11], + r->guid[12], r->guid[13], r->guid[14], r->guid[15]); + +_SHOW(VERS, vers, vers, "\"%s\" \"%s\" \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n", + r->fboot0 +1, r->fboot1 +1, r->flash[0] +1, + r->flash[1] +1, r->flash[2] +1, r->fsc +1, r->uos +1) + +_SHOW(CFREQ, freq, freq, "%u %u %s\n", + r->cur, r->def, arr32(r->supt, r->slen, BP, BL)) + +_SHOW(CVOLT, volt, volt, "%u %u %s\n", + r->cur, r->set, arr32(r->supt, r->slen, BP, BL)) + +#if defined(CONFIG_MK1OM) || (defined(CONFIG_ML1OM) && USE_FSC) +_SHOW(PWR, power, power, "%d\n%d\n%d\n%d\n%d\n%d\n%d\n%s\n%s\n%s\n", + r->tot0.prr, + r->tot1.prr, + r->inst.prr, + r->imax.prr, + r->pcie.prr, + r->c2x3.prr, + r->c2x4.prr, + arr32(&r->vccp.pwr, 3, BP, 32), + arr32(&r->vddg.pwr, 3, BP + 32, 32), + arr32(&r->vddq.pwr, 3, BP + 64, 32)) + +_SHOW(PLIM, plim, plim, "%u %u %u\n", + r->phys, r->hmrk, r->lmrk) +#endif + +_SHOW(CLST, clst, clst, "%u %u\n", + r->count, r->thr) + +_SHOW(GDDR, gddr, gddr, "\"%s\" %u %u %u\n", + r->dev +1, r->rev, r->size, r->speed) + +_SHOW(GFREQ, gfreq, gfreq, "%u %u\n", + r->cur, r->def) + +_SHOW(GVOLT, gvolt, gvolt, "%u %u\n", + r->cur, r->set) + +_SHOW(TEMP, temp, temp, "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", + arr16(&r->die.cur, 2, BP, 32), + arr16(&r->brd.cur, 2, BP + 32, 32), + arr16(&r->fin.cur, 2, BP + 64, 32), + arr16(&r->fout.cur, 2, BP + 96, 32), + arr16(&r->gddr.cur, 2, BP + 128, 32), + arr16(&r->vccp.cur, 2, BP + 160, 32), + arr16(&r->vddg.cur, 2, BP + 224, 32), + arr16(&r->vddq.cur, 2, BP + 256, 32)) + +_SHOW(FAN, fan, fan, "%u %u %u\n", + r->override, r->pwm, r->rpm) + +#ifdef CONFIG_MK1OM +_SHOW(ECC, ecc, ecc, "%d\n", + r->enable) +#endif + +_SHOW(TRC, trc, trc, "%d\n", + r->lvl) + +_SHOW(TRBO, trbo, trbo, "%d %d %d\n", + r->set, r->state, r->avail) + +#ifdef CONFIG_MK1OM +_SHOW(LED, led, led, "%d\n", + r->led) + +_SHOW(PROCHOT, ptrig, prochot, "%d %d\n", + r->power, r->time); + +_SHOW(PWRALT, ptrig, pwralt, "%d %d\n", + r->power, r->time); + +_SHOW(PERST, perst, perst, "%d\n", + r->perst); + +_SHOW(TTL, ttl, ttl, "%u %u %u %u\n%u %u %u %u\n%u %u %u %u\n", + r->thermal.active, r->thermal.since, r->thermal.count, r->thermal.time, + r->power.active, r->power.since, r->power.count, r->power.time, + r->alert.active, r->alert.since, r->alert.count, r->alert.time); +#endif + +_SHOW(CUTL, cutl, cutl, "%u %u %u %llu\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n...\n", + r->tck, r->core, r->thr, r->jif, + arr64(&r->sum.user, 4, BP, 80), + arr64(&r->cpu[0].user, 4, BP + 80, 80), + arr64(&r->cpu[1].user, 4, BP + 160, 80), + arr64(&r->cpu[2].user, 4, BP + 240, 80), + arr64(&r->cpu[3].user, 4, BP + 320, 80), + arr64(&r->cpu[4].user, 4, BP + 
400, 80), + arr64(&r->cpu[5].user, 4, BP + 480, 80), + arr64(&r->cpu[6].user, 4, BP + 560, 80), + arr64(&r->cpu[7].user, 4, BP + 640, 80)) + +_SHOW(MEM, mem, mem, "%u %u %u\n", + r->total, r->free, r->bufs) + +_SHOW(OS, os, os, "%llu %llu %llu %llu %u [%s]\n", + r->uptime, r->loads[0], r->loads[1], r->loads[2], + r->alen, arr32(r->apid, r->alen, BP, BL)) + + +/* + * Ensure caller's creditials is root on all 'set' files. + * Even though file creation mode should prevent writes? + * + *TBD: + * - How many of the 'store's are to be permitted? + */ + +#define _STORE(op, nam) \ + static ssize_t \ + micras_store_##nam (struct class *class, \ + struct class_attribute *attr, \ + const char *buf, \ + size_t count) \ + { \ + struct wq_task * wq; \ + size_t ocount; \ + uint32_t val; \ + int err; \ + char * ep; \ +\ + if (current_euid() != 0) \ + return -EPERM; \ +\ + ocount = count; \ + if (count && buf[count - 1] == '\n') \ + ((char *) buf)[--count] = '\0'; \ +\ + err = -EINVAL; \ + if (count && *buf) { \ + val = simple_strtoul(buf, &ep, 0); \ + if (ep != buf && !*ep) { \ + wq = kmalloc(sizeof(* wq), GFP_KERNEL); \ + if (! wq) \ + return -ENOMEM; \ +\ + wq->req = MR_SET_##op; \ + wq->fnc = (int (*)(void *)) mr_set_##nam; \ + wq->ptr = (void *) &val; \ + if (! micras_mt_tsk(wq)) \ + err = ocount; \ + kfree(wq); \ + } \ + } \ +\ + return err; \ + } + +_STORE(CFREQ, freq) +_STORE(CVOLT, volt) + +#if defined(CONFIG_MK1OM) || (defined(CONFIG_ML1OM) && USE_FSC) +_STORE(PLIM, plim) +#endif + +_STORE(FAN, fan) +_STORE(TRC, trc) +_STORE(TRBO, trbo) + +#ifdef CONFIG_MK1OM +_STORE(LED, led) +_STORE(PERST, perst) +#endif + + +/* + *TBD: + * - Remove entries clst, cutl, mem, and os. + * Only included here for comparison with what cp/micinfo displays. + * They really need to go. + */ + +static struct class_attribute micras_attr[] = { + __ATTR(hwinf, 0444, micras_show_hwinf, 0), + __ATTR(vers, 0444, micras_show_vers, 0), + __ATTR(freq, 0644, micras_show_freq, micras_store_freq), + __ATTR(volt, 0644, micras_show_volt, micras_store_volt), +#if defined(CONFIG_MK1OM) || (defined(CONFIG_ML1OM) && USE_FSC) + __ATTR(power, 0444, micras_show_power, 0), + __ATTR(plim, 0644, micras_show_plim, micras_store_plim), +#endif + __ATTR(clst, 0444, micras_show_clst, 0), + __ATTR(gddr, 0444, micras_show_gddr, 0), + __ATTR(gfreq, 0444, micras_show_gfreq, 0), + __ATTR(gvolt, 0444, micras_show_gvolt, 0), + __ATTR(fan, 0644, micras_show_fan, micras_store_fan), + __ATTR(temp, 0444, micras_show_temp, 0), +#ifdef CONFIG_MK1OM + __ATTR(ecc, 0444, micras_show_ecc, 0), +#endif + __ATTR(trc, 0644, micras_show_trc, micras_store_trc), + __ATTR(trbo, 0644, micras_show_trbo, micras_store_trbo), +#ifdef CONFIG_MK1OM + __ATTR(led, 0644, micras_show_led, micras_store_led), + __ATTR(prochot, 0444, micras_show_prochot, 0), + __ATTR(pwralt, 0444, micras_show_pwralt, 0), + __ATTR(perst, 0644, micras_show_perst, micras_store_perst), + __ATTR(ttl, 0444, micras_show_ttl, 0), +#endif + __ATTR(cutl, 0444, micras_show_cutl, 0), + __ATTR(mem, 0444, micras_show_mem, 0), + __ATTR(os, 0444, micras_show_os, 0), + __ATTR_NULL, +}; + + +static struct class ras_class = { + .name = "micras", + .owner = THIS_MODULE, + .class_attrs = micras_attr, +}; + + + +/* +** +** SCIF interface & services are mostly handled here, including +** all aspects of setting up and tearing down SCIF channels. +** We create three listening SCIF sockets and create a workqueue +** with the initial task of waiting for 'accept's to happen. 
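+**
+** For orientation: after scif_connect() to the CP port (MR_MON_PORT)
+** a host-side peer speaks a simple request/response protocol. The
+** fragment below is only an illustrative sketch; the field names
+** follow the mr_hdr/mr_err usage in the session loop further down,
+** and the exact layouts live in micras.h:
+**
+**   struct mr_hdr q, a;
+**   q.cmd = MR_REQ_HWINF;                 opcode, MR_RESP bit clear
+**   q.len = 0;                            requests carry no payload
+**   q.parm = 0;                           new value for 'set' opcodes
+**   scif_send(ep, &q, sizeof(q), SCIF_SEND_BLOCK);
+**   scif_recv(ep, &a, sizeof(a), SCIF_RECV_BLOCK);
+**   if (a.len > 0)
+**     scif_recv(ep, buf, a.len, SCIF_RECV_BLOCK);
+**
+** In the reply a.cmd is q.cmd with MR_RESP set; if MR_ERROR is set
+** as well, the a.len payload is a struct mr_err rather than the
+** response struct defined in micras.h.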
+** +** When TTL or MC accept incoming connections, their workqueue +** task spawns one thread just to detect if/when peer closes +** the session and will block any further connects until thes +** service thread terminates (peer closes session). +** The TTL or MC event handler, executing in interrupt context, +** will check for an open session and if one is present, deliver +** their event record(s) on it by using scif_send(). +** +** When CP accept incoming connections, its workqueue task spawns +** a new thread to run a session with the peer and then proceeds +** to accepting a new connection. Thus, there are no strict +** bounds on number of incoming connections, but for internal +** house-keeping sessions are limited to MR_SCIF_MAX (32). +** Accepted requests from the peer are fulfilled through the +** MT thread in a similar fashion as the sysctl interface, i.e. +** though function micras_mt_tsk(), who guarantee synchronized +** (serialized) access to MT core data and handle waits as needed. +** Function pointers corresponding to request opcodes are found +** by lookup in the fnc_map table. +** +** Note: This is not coded for maximum performance, since the +** use of the MT thread to serialize access to card data +** has a cost of two task switches attached, both which +** may cause delays due to other system activity. +*/ + + +static scif_epd_t micras_cp_lstn; /* CP listener handle */ +static struct workqueue_struct * micras_cp_wq; /* CP listener thread */ +static atomic_t micras_cp_rst; /* CP listener restart */ +static struct delayed_work micras_cp_tkn; /* CP accept token */ +static DECLARE_BITMAP(micras_cp_fd, MR_SCIF_MAX); /* CP free slots */ +static volatile struct scif_portID micras_cp_si[MR_SCIF_MAX]; /* CP sessions */ +static volatile struct task_struct * micras_cp_kt[MR_SCIF_MAX]; /* CP threads */ +static volatile scif_epd_t micras_cp_ep[MR_SCIF_MAX]; /* CP handles */ + +static scif_epd_t micras_mc_lstn; /* MC listener handle */ +static struct workqueue_struct * micras_mc_wq; /* MC listener thread */ +static struct delayed_work micras_mc_tkn; /* MC accept token */ +static volatile struct task_struct * micras_mc_kt; /* MC session */ +static volatile scif_epd_t micras_mc_ep; /* MC handle */ + +static scif_epd_t micras_ttl_lstn; /* TTL listener handle */ +static struct workqueue_struct * micras_ttl_wq; /* TTL listener thread */ +static struct delayed_work micras_ttl_tkn; /* TTL accept token */ +static volatile struct task_struct * micras_ttl_kt; /* TTL session */ +static volatile scif_epd_t micras_ttl_ep; /* TTL handle */ + + +/* + * SCIF CP session thread + */ + +static int +micras_cp_sess(void * _slot) +{ + struct wq_task * wq; + struct mr_hdr q, a; + scif_epd_t ep; + uint32_t slot; + void * buf; + uint64_t start, stop; + int blen, len, priv; + + slot = (uint32_t)((uint64_t) _slot); + priv = (micras_cp_si[slot].port < 1024) ? 1 : 0; +#if MT_VERBOSE + printk("Scif: CP session %d running%s\n", slot, priv ? " privileged" : ""); +#endif + + /* + * Allocate local buffer from kernel + * Since the I/O buffers in SCIF is just one page, + * we'd never expect to need larger buffers here. + */ + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (! buf) { + printk("Scif: CP scratch pad alloc failed\n"); + return 0; + } + + /* + * Allocate a work queue task for the MT thread + */ + wq = kmalloc(sizeof(* wq), GFP_KERNEL); + if (! 
wq) { + printk("Scif: CP work task alloc failed\n"); + goto cp_sess_end; + } + + /* + * Start servicing MT protocol + */ + ep = micras_cp_ep[slot]; + for( ;; ) { + + /* + * Get a message header + */ + len = scif_recv(ep, &q, sizeof(q), SCIF_RECV_BLOCK); + start = rdtsc(); + if (len < 0) { + if (len != -ECONNRESET) + printk("Scif: CP recv error %d\n", len); + goto cp_sess_end; + } + if (len != sizeof(q)) { + printk("Scif: CP short recv (%d), discarding\n", len); + continue; + } + + /* + * Validate the query: + * - known good opcode, + * - expected length (zero) + * - have callout in jump table + * - check requestor's port ID on privileged opcodes. + * + *TBD: opcodes above MR_REQ_MAX is really only meant for + * use by the PM module. Should it be host accessible? + */ + blen = 0; + if (q.cmd < MR_REQ_HWINF || +#if defined(CONFIG_MK1OM) && USE_PM + q.cmd > PM_REQ_MAX +#else + q.cmd > MR_REQ_MAX +#endif + ) { + printk("Scif: CP opcode %d invalid\n", q.cmd); + blen = -MR_ERR_INVOP; + } + else + if (q.len != 0) { + printk("Scif: CP command length %d invalid\n", q.len); + blen = -MR_ERR_INVLEN; + } + else + if (! fnc_map[q.cmd].fnc) { + printk("Scif: CP opcode %d un-implemented\n", q.cmd); + blen = -MR_ERR_UNSUP; + } + else + if (fnc_map[q.cmd].privileged && !priv) { + printk("Scif: CP opcode %d privileged, remote %d:%d\n", + q.cmd, micras_cp_si[slot].node, micras_cp_si[slot].port); + blen = -MR_ERR_PERM; + } + + /* + *TBD: If there is an error at this point, it might + * be a good idea to drain the SCIF channel. + * If garbage has entered the channel somehow, + * then how else can we get in sync such that + * next recv really is a command header? + * More radical solution is closing this session. + */ + + /* + * If header is OK (blen still zero) then pass + * a work queue item to MT and wait for response. + * The result will end up in buf (payload for response) + * or an error code that can be sent back to requestor. + * Since we don't want to care about whether it is a + * get or set command here, the 'parm' value is copied + * into buf prior to passing the work item to MT. + * Thus, functions expecting an 'uint32_t *' to + * point to a new value will be satisfied. + */ + if (blen == 0) { + if (fnc_map[q.cmd].simple) { + *((uint32_t *) buf) = q.parm; + blen = fnc_map[q.cmd].fnc(buf); + } + else { + memset(wq, '\0', sizeof(*wq)); + wq->req = q.cmd; + wq->priv = priv; + wq->fnc = (int (*)(void *)) fnc_map[q.cmd].fnc; + wq->ptr = buf; + *((uint32_t *) buf) = q.parm; + blen = micras_mt_tsk(wq); + } + } + stop = rdtsc(); + + /* + * Craft response header + */ + a.cmd = q.cmd | MR_RESP; + if (blen < 0) { + /* + * MT thread reported a failure. 
+ * Set error bit and make error record in buf + */ + a.cmd |= MR_ERROR; + ((struct mr_err *) buf)->err = -blen; + ((struct mr_err *) buf)->len = 0; + a.len = sizeof(struct mr_err); + } + else { + /* + * Payload size is set by call-out + */ + a.len = blen; + } + a.stamp = q.stamp; + a.spent = stop - start; + + /* + * Send response header (always) + */ + len = scif_send(ep, &a, sizeof(a), SCIF_SEND_BLOCK); + if (len < 0) { + printk("Scif: header send error %d\n", len); + goto cp_sess_end; + } + if (len != sizeof(a)) { + printk("Scif: CP short header send (%d of %lu)\n", len, sizeof(a)); + goto cp_sess_end; + } + + /* + * Send payload (if any, defined by a.len) + */ + if (a.len > 0) { + len = scif_send(ep, buf, a.len, SCIF_SEND_BLOCK); + if (len < 0) { + printk("Scif: CP payload send error %d\n", len); + goto cp_sess_end; + } + if (len != a.len) { + printk("Scif: CP short payload send (%d of %d)\n", len, a.len); + goto cp_sess_end; + } + } + + } + +cp_sess_end: + if (wq) + kfree(wq); + if (buf) + kfree(buf); + ep = (scif_epd_t) atomic64_xchg((atomic64_t *)(micras_cp_ep + slot), 0); + if (ep) + scif_close(ep); + micras_cp_kt[slot] = 0; + set_bit(slot, micras_cp_fd); +#if MT_VERBOSE + printk("Scif: CP session %d terminated, sess mask %lx\n", slot, micras_cp_fd[0]); +#endif + + if (atomic_xchg(&micras_cp_rst, 0)) { + printk("Scif: resume listener\n"); + queue_delayed_work(micras_cp_wq, &micras_cp_tkn, 0); + } + + return 0; +} + + +/* + * SCIF CP session launcher + */ + +static void +micras_cp(struct work_struct * work) +{ + struct task_struct * thr; + scif_epd_t sess_ep; + struct scif_portID sess_id; + int slot; + int err; + + /* + * Wait for somebody to connect to us + * We stop listening on any error whatsoever + */ + err = scif_accept(micras_cp_lstn, &sess_id, &sess_ep, SCIF_ACCEPT_SYNC); + if (err == -EINTR) { + printk("Scif: CP accept interrupted, error %d\n", err); + return; + } + if (err < 0) { + printk("Scif: CP accept failed, error %d\n", err); + return; + } +#if MT_VERBOSE + printk("Scif: CP accept: remote %d:%d, local %d:%d\n", + sess_id.node, sess_id.port, + micras_cp_lstn->port.node, micras_cp_lstn->port.port); +#endif + + /* + * Spawn a new thread to run session with connecting peer + * We support only a limited number of connections, so first + * get a free "slot" for this session. + * The use of non-atomic ffs() below is safe as long as this + * function is never run by more than one thread at a time + * and all other manipulations of micras_cp_fd are atomic. + */ + slot = find_first_bit(micras_cp_fd, MR_SCIF_MAX); + if (slot < MR_SCIF_MAX) { + if (micras_cp_kt[slot] || micras_cp_ep[slot]) { + printk("Scif: CP slot %d busy (bug)\n", slot); + return; + } + + clear_bit(slot, micras_cp_fd); + micras_cp_ep[slot] = sess_ep; + micras_cp_si[slot] = sess_id; + thr = kthread_create(micras_cp_sess, (void *)(uint64_t) slot, "RAS CP svc %d", slot); + if (IS_ERR(thr)) { + printk("Scif: CP service thread creation failed\n"); + scif_close(sess_ep); + micras_cp_ep[slot] = 0; + set_bit(slot, micras_cp_fd); + return; + } + micras_cp_kt[slot] = thr; +#if MT_VERBOSE + printk("Scif: CP session %d launched, pid %d\n", slot, thr->pid); +#endif + wake_up_process(thr); + } + else { + printk("Scif: No open session slots, closing session\n"); + scif_close(sess_ep); + } + + /* + * Keep listening until session limit reached. 
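+ * When no free slot remains the accept work is deliberately not
+ * re-queued; micras_cp_rst is set instead, and the session thread
+ * that terminates next re-queues micras_cp_tkn from the tail of
+ * micras_cp_sess(), which resumes the listener.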
+ */ + if (bitmap_weight(micras_cp_fd, MR_SCIF_MAX)) + queue_delayed_work(micras_cp_wq, &micras_cp_tkn, 0); + else { + printk("Scif: CP connection limit reached\n"); + atomic_xchg(&micras_cp_rst, 1); + } +} + + +/* + * SCIF MC session thread + */ + +static int +micras_mc_sess(void * dummy) +{ + scif_epd_t ep; + char buf[8]; + int len; + +#if MC_VERBOSE + printk("Scif: MC session running\n"); +#endif + + /* + * Start servicing. + * This is just to get indication if peer closes connection + */ + for( ;; ) { + /* + * Sync with kernel MC event log. + */ + mcc_sync(); + + /* + * Try read 1 byte from host (turns into a wait-point + * keeping the connection open till host closes it) + */ + len = scif_recv(micras_mc_ep, buf, 1, SCIF_RECV_BLOCK); + if (len < 0) { + if (len != -ECONNRESET) + printk("Scif: MC recv error %d\n", len); + goto mc_sess_end; + } + + /* + * Ignore any received content. + */ + } + +mc_sess_end: + ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_mc_ep, 0); + if (ep) + scif_close(ep); + micras_mc_kt = 0; +#if MC_VERBOSE + printk("Scif: MC session terminated\n"); +#endif + return 0; +} + + +/* + * SCIF MC session launcher + */ + +static void +micras_mc(struct work_struct * work) +{ + struct task_struct * thr; + scif_epd_t sess_ep; + struct scif_portID sess_id; + int err; + + /* + * Wait for somebody to connect to us + * We stop listening on any error whatsoever + */ + err = scif_accept(micras_mc_lstn, &sess_id, &sess_ep, SCIF_ACCEPT_SYNC); + if (err == -EINTR) { + printk("Scif: MC accept interrupted, error %d\n", err); + return; + } + if (err < 0) { + printk("Scif: MC accept failed, error %d\n", err); + return; + } +#if MC_VERBOSE + printk("Scif: MC accept: remote %d:%d, local %d:%d\n", + sess_ep->peer.node, sess_ep->peer.port, + sess_ep->port.node, sess_ep->port.port); +#endif + + /* + * Spawn a new thread to run session with connecting peer + * We support only one connection, so if one already is + * running this one will be rejected. + */ + if (! micras_mc_ep) { + micras_mc_ep = sess_ep; + thr = kthread_create(micras_mc_sess, 0, "RAS MC svc"); + if (IS_ERR(thr)) { + printk("Scif: MC service thread creation failed\n"); + scif_close(sess_ep); + micras_mc_ep = 0; + return; + } + micras_mc_kt = thr; + wake_up_process(thr); + } + else { + printk("Scif: MC connection limit reached\n"); + scif_close(sess_ep); + } + + /* + * Keep listening + */ + queue_delayed_work(micras_mc_wq, &micras_mc_tkn, 0); +} + + +/* + * Ship a pre-packaged machine check event record to host + */ + +#ifndef SCIF_BLAST +#define SCIF_BLAST 2 +#endif + +int +micras_mc_send(struct mce_info * mce, int exc) +{ + if (micras_mc_ep) { + int err; + +#if ADD_DIE_TEMP + err = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2); + mce->flags |= PUT_BITS(15, 8, GET_BITS(19, 10, err)); +#endif + + if (exc) { + /* + * Exception context SCIF access, can't sleep and can't + * wait on spinlocks either. May be detrimental to + * other scif communications, but this _is_ an emergency + * and we _do_ need to ship this message to the host. + */ + err = scif_send(micras_mc_ep, mce, sizeof(*mce), SCIF_BLAST); + if (err != sizeof(*mce)) + ee_printk("micras_mc_send: scif_send failed, err %d\n", err); + } + else { + /* + * Thread context SCIF access. + * Just send message. 
+ */ + err = scif_send(micras_mc_ep, mce, sizeof(*mce), SCIF_SEND_BLOCK); + if (err != sizeof(*mce)) + printk("micras_mc_send: scif_send failed, err %d\n", err); + } + return err == sizeof(*mce); + } + return 0; +} + + +/* + * SCIF TTL session thread + */ + +static int +micras_ttl_sess(void * dummy) +{ + scif_epd_t ep; + char buf[8]; + int len; + +#if PM_VERBOSE + printk("Scif: TTL session running\n"); +#endif + + /* + * Start servicing. + * This is just to get indication if peer closes connection + */ + for( ;; ) { + /* + * Try read 1 byte from host (turns into a wait-point + * keeping the connection open till host closes it) + */ + len = scif_recv(micras_ttl_ep, buf, 1, SCIF_RECV_BLOCK); + if (len < 0) { + if (len != -ECONNRESET) + printk("Scif: TTL recv error %d\n", len); + goto ttl_sess_end; + } + + /* + * Ignore any received content. + */ + } + +ttl_sess_end: + ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_ttl_ep, 0); + if (ep) + scif_close(ep); + micras_ttl_kt = 0; +#if PM_VERBOSE + printk("Scif: TTL session terminated\n"); +#endif + return 0; +} + + +/* + * SCIF TTL session launcher + */ + +static void +micras_ttl(struct work_struct * work) +{ + struct task_struct * thr; + scif_epd_t sess_ep; + struct scif_portID sess_id; + int err; + + /* + * Wait for somebody to connect to us + * We stop listening on any error whatsoever + */ + err = scif_accept(micras_ttl_lstn, &sess_id, &sess_ep, SCIF_ACCEPT_SYNC); + if (err == -EINTR) { + printk("Scif: TTL accept interrupted, error %d\n", err); + return; + } + if (err < 0) { + printk("Scif: TTL accept failed, error %d\n", err); + return; + } +#if PM_VERBOSE + printk("Scif: TTL accept: remote %d:%d, local %d:%d\n", + sess_ep->peer.node, sess_ep->peer.port, + sess_ep->port.node, sess_ep->port.port); +#endif + + /* + * Spawn a new thread to run session with connecting peer + * We support only one connection, so if one already is + * running this one will be rejected. + */ + if (! micras_ttl_ep) { + micras_ttl_ep = sess_ep; + thr = kthread_create(micras_ttl_sess, 0, "RAS TTL svc"); + if (IS_ERR(thr)) { + printk("Scif: TTL service thread creation failed\n"); + scif_close(sess_ep); + micras_ttl_ep = 0; + return; + } + micras_ttl_kt = thr; + wake_up_process(thr); + } + else { + printk("Scif: TTL connection limit reached\n"); + scif_close(sess_ep); + } + + /* + * Keep listening + */ + queue_delayed_work(micras_ttl_wq, &micras_ttl_tkn, 0); +} + + +/* + * Ship a pre-packaged throttle event record to host + */ + +void +micras_ttl_send(struct ttl_info * ttl) +{ + static struct ttl_info split_rec; + static int split_rem; + int err; + char * cp; + + if (micras_ttl_ep) { + + if (split_rem) { + cp = ((char *) &split_rec) + (sizeof(*ttl) - split_rem); + err = scif_send(micras_ttl_ep, cp, split_rem, 0); + if (err == split_rem) { + /* + * Tx of pendig buffer complete + */ + split_rem = 0; + } + else { + if (err < 0) { + /* + * SCIF failed squarely, just drop the message. + * TBD: close end point? + */ + } + else { + /* + * Another partial send + */ + split_rem -= err; + } + } + } + + if (! split_rem) { + /* + * Send message + */ + err = scif_send(micras_ttl_ep, ttl, sizeof(*ttl), 0); + if (err != sizeof(*ttl)) { + /* + * Did not send all the message + */ + if (err < 0) { + /* + * SCIF failed squarely, drop the message. + * TBD: close end point? 
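+ * A short (but non-negative) return is handled in the else branch
+ * below: the record is parked in split_rec and split_rem counts the
+ * bytes still outstanding, so the next call flushes that tail before
+ * it transmits a new record.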
+ */ + } + else { + split_rec = *ttl; + split_rem = sizeof(*ttl) - err; + } + } + } + } +} + + + +/* +** +** MMIO regions used by RAS module +** Until some common strategy on access to BOXes and other CSRs +** we'll map them ourselves. All MMIO accesses are performed +** through 32 bit unsigned integers, but a 64 bit abstraction +** is provided for convenience (low 32 bit done first). +** +** We need access to the SBOX, all GBOXs, TBOXs and DBOXs. +** +** Note: I2C driver code for exception context in micras_elog.c +** has its own set of I/O routines in order to allow +** separate debugging. +** +*/ + +uint8_t * micras_sbox; /* SBOX mmio region */ +uint8_t * micras_dbox[DBOX_NUM]; /* DBOX mmio region */ +uint8_t * micras_gbox[GBOX_NUM]; /* GBOX mmio regions */ +#ifdef CONFIG_MK1OM +uint8_t * micras_tbox[TBOX_NUM]; /* TBOX mmio regions */ +#endif + +/* + * Specials: some defines are currently missing + */ + +#ifdef CONFIG_MK1OM +#define DBOX1_BASE 0x0800620000ULL + +#define GBOX4_BASE 0x08006D0000ULL +#define GBOX5_BASE 0x08006C0000ULL +#define GBOX6_BASE 0x08006B0000ULL +#define GBOX7_BASE 0x08006A0000ULL +#endif + + +/* + * MMIO I/O dumpers (for debug) + * Exception mode code needs to use the ee_print dumpers + * because printk is not safe to use (works most of the time + * though, but may hang the system eventually). + */ +#if 0 +#if 0 +extern atomic_t pxa_block; +#define RL if (! atomic_read(&pxa_block)) ee_print("%s: %4x -> %08x\n", __FUNCTION__, roff, val) +#define RQ if (! atomic_read(&pxa_block)) ee_print("%s: %4x -> %016llx\n", __FUNCTION__, roff, val) +#define WL if (! atomic_read(&pxa_block)) ee_print("%s: %4x <- %08x\n", __FUNCTION__, roff, val) +#define WQ if (! atomic_read(&pxa_block)) ee_print("%s: %4x <- %016llx\n", __FUNCTION__, roff, val) +#else +#define RL printk("%s: %4x -> %08x\n", __FUNCTION__, roff, val) +#define RQ printk("%s: %4x -> %016llx\n", __FUNCTION__, roff, val) +#define WL printk("%s: %4x <- %08x\n", __FUNCTION__, roff, val) +#define WQ printk("%s: %4x <- %016llx\n", __FUNCTION__, roff, val) +#endif +#else +#define RL /* As nothing */ +#define RQ /* As nothing */ +#define WL /* As nothing */ +#define WQ /* As nothing */ +#endif + + +/* + * SBOX MMIO I/O routines + * mr_sbox_base Return SBOX MMIO region + * mr_sbox_rl Read 32-bit register + * mr_sbox_rq Read 64-bit register (really two 32-bit reads) + * mr_sbox_wl Write 32-bit register + * mr_sbox_wq Write 64-bit register (really two 32-bit writes) + */ + +#if NOT_YET +uint8_t * +mr_sbox_base(int dummy) +{ + return micras_sbox; +} +#endif + +uint32_t +mr_sbox_rl(int dummy, uint32_t roff) +{ + uint32_t val; + + val = * (volatile uint32_t *)(micras_sbox + roff); + RL; + return val; +} + +uint64_t +mr_sbox_rq(int dummy, uint32_t roff) +{ + uint32_t hi, lo; + uint64_t val; + + lo = * (volatile uint32_t *)(micras_sbox + roff); + hi = * (volatile uint32_t *)(micras_sbox + roff + 4); + val = ((uint64_t) hi << 32) | (uint64_t) lo; + RQ; + return val; +} + +void +mr_sbox_wl(int dummy, uint32_t roff, uint32_t val) +{ + WL; + * (volatile uint32_t *)(micras_sbox + roff) = val; +} + +void +mr_sbox_wq(int dummy, uint32_t roff, uint64_t val) +{ + uint32_t hi, lo; + + WQ; + lo = val; + hi = val >> 32; + + * (volatile uint32_t *)(micras_sbox + roff) = lo; + * (volatile uint32_t *)(micras_sbox + roff + 4) = hi; +} + + +/* + * DBOX MMIO I/O routines + * mr_dbox_base Return DBOX MMIO region + * mr_dbox_rl Read 32-bit register + * mr_dbox_rq Read 64-bit register (really two 32-bit reads) + * mr_dbox_wl Write 32-bit register + * 
mr_dbox_wq Write 64-bit register (really two 32-bit writes) + */ + +#if NOT_YET +uint8_t * +mr_dbox_base(int unit) +{ + return micras_dbox[unit]; +} +#endif + +uint32_t +mr_dbox_rl(int unit, uint32_t roff) +{ + uint32_t val; + + val = * (volatile uint32_t *)(micras_dbox[unit] + roff); + RL; + return val; +} + +uint64_t +mr_dbox_rq(int unit, uint32_t roff) +{ + uint32_t hi, lo; + uint64_t val; + + lo = * (volatile uint32_t *)(micras_dbox[unit] + roff); + hi = * (volatile uint32_t *)(micras_dbox[unit] + roff + 4); + val = ((uint64_t) hi << 32) | (uint64_t) lo; + RQ; + return val; +} + +void +mr_dbox_wl(int unit, uint32_t roff, uint32_t val) +{ + WL; + * (volatile uint32_t *)(micras_dbox[unit] + roff) = val; +} + +void +mr_dbox_wq(int unit, uint32_t roff, uint64_t val) +{ + uint32_t hi, lo; + + WQ; + lo = val; + hi = val >> 32; + + * (volatile uint32_t *)(micras_dbox[unit] + roff) = lo; + * (volatile uint32_t *)(micras_dbox[unit] + roff + 4) = hi; +} + + +/* + * GBOX MMIO I/O routines + * mr_gbox_base Return GBOX MMIO region + * mr_gbox_rl Read 32-bit register + * mr_gbox_rq Read 64-bit register (really two 32-bit reads) + * mr_gbox_wl Write 32-bit register + * mr_gbox_wq Write 64-bit register (really two 32-bit writes) + * + * Due to a Si bug, MMIO writes can be dropped by the GBOXs + * during heavy DMA activity (HSD #4844222). The risk of it + * happening is low enough that a 'repeat until it sticks' + * workaround is sufficient. No 'read' issues so far. + * + *TBD: Ramesh asked that GBOX MMIOs check for sleep states. + * Not sure how to do that, but here is a good spot to + * add such check, as all GBOX access comes thru here. + */ + +#if NOT_YET +uint8_t * +mr_gbox_base(int unit) +{ + return micras_gbox[unit]; +} +#endif + +uint32_t +mr_gbox_rl(int unit, uint32_t roff) +{ + uint32_t val; + + val = * (volatile uint32_t *)(micras_gbox[unit] + roff); + RL; + return val; +} + +uint64_t +mr_gbox_rq(int unit, uint32_t roff) +{ + uint32_t hi, lo; + uint64_t val; + + lo = * (volatile uint32_t *)(micras_gbox[unit] + roff); + if (roff == 0x5c) { + /* + * Instead of placing HI part of MCA_STATUS + * at 0x60 to form a natural 64-bit register, + * it located at 0xac, against all conventions. + */ + hi = * (volatile uint32_t *)(micras_gbox[unit] + 0xac); + } + else + hi = * (volatile uint32_t *)(micras_gbox[unit] + roff + 4); + val = ((uint64_t) hi << 32) | (uint64_t) lo; + RQ; + return val; +} + +void +mr_gbox_wl(int unit, uint32_t roff, uint32_t val) +{ +#if !GBOX_WORKING + { + int rpt; + uint32_t rb; + + /* + * Due to bug HSD 4844222 loop until value sticks + */ + for(rpt = 10; rpt-- ; ) { +#endif + + WL; + * (volatile uint32_t *)(micras_gbox[unit] + roff) = val; + +#if !GBOX_WORKING + rb = mr_gbox_rl(unit, roff); + if (rb == val) + break; + } + } +#endif +} + +void +mr_gbox_wq(int unit, uint32_t roff, uint64_t val) +{ + uint32_t hi, lo; + + lo = val; + hi = val >> 32; + +#if !GBOX_WORKING + { + int rpt; + uint64_t rb; + + /* + * Due to bug HSD 4844222 loop until value sticks + * Note: this may result in bad things happening if + * wrinting to a MMIO MCA STATUS register + * since there is a non-zero chance that the + * NMI handler can fire and change the register + * inside this loop. Require that the caller + * is on same CPU as the NMI handler (#0). 
+ */ + for(rpt = 10; rpt-- ; ) { +#endif + + WQ; + * (volatile uint32_t *)(micras_gbox[unit] + roff) = lo; + if (roff == 0x5c) { + /* + * Instead of placing HI part of MCA_STATUS + * at 0x60 to form a natural 64-bit register, + * it located at 0xac, against all conventions. + */ + * (volatile uint32_t *)(micras_gbox[unit] + 0xac) = hi; + } + else + * (volatile uint32_t *)(micras_gbox[unit] + roff + 4) = hi; + +#if !GBOX_WORKING + rb = mr_gbox_rq(unit, roff); + if (rb == val) + break; + } + } +#endif +} + + +#ifdef CONFIG_MK1OM +/* + * TBOX MMIO I/O routines + * mr_tbox_base Return TBOX MMIO region + * mr_tbox_rl Read 32-bit register + * mr_tbox_rq Read 64-bit register (really two 32-bit reads) + * mr_tbox_wl Write 32-bit register + * mr_tbox_wq Write 64-bit register (really two 32-bit writes) + * + * Some SKUs don't have TBOXs, in which case the + * micras_tbox array will contain null pointers. + * We do not test for this here, but expect that + * caller either know what he's doing or consult + * the mr_tbox_base() function first. + */ + +#if NOT_YET +uint8_t * +mr_tbox_base(int unit) +{ + return micras_tbox[unit]; +} +#endif + +uint32_t +mr_tbox_rl(int unit, uint32_t roff) +{ + uint32_t val; + + val = * (volatile uint32_t *)(micras_tbox[unit] + roff); + RL; + return val; +} + +uint64_t +mr_tbox_rq(int unit, uint32_t roff) +{ + uint32_t hi, lo; + uint64_t val; + + lo = * (volatile uint32_t *)(micras_tbox[unit] + roff); + hi = * (volatile uint32_t *)(micras_tbox[unit] + roff + 4); + val = ((uint64_t) hi << 32) | (uint64_t) lo; + RQ; + return val; +} + +void +mr_tbox_wl(int unit, uint32_t roff, uint32_t val) +{ + WL; + * (volatile uint32_t *)(micras_tbox[unit] + roff) = val; +} + +void +mr_tbox_wq(int unit, uint32_t roff, uint64_t val) +{ + uint32_t hi, lo; + + WQ; + lo = val; + hi = val >> 32; + + * (volatile uint32_t *)(micras_tbox[unit] + roff) = lo; + * (volatile uint32_t *)(micras_tbox[unit] + roff + 4) = hi; +} +#endif + + + +/* +** +** SMP utilities for CP and MC. +** The kernel offers routines for MSRs, but as far +** as I could find then there isn't any for some +** CPU registers we need, like CR4. +** +** rd_cr4_on_cpu Read a CR4 value on CPU +** set_in_cr4_on_cpu Set bits in CR4 on a CPU +** clear_in_cr4_on_cpu Guess... +** rdtsc Read time stamp counter +** +**TBD: Special case when CPU happens to be current? +*/ + +#if NOT_YET +static void +_rd_cr4_on_cpu(void * p) +{ + *((uint32_t *) p) = read_cr4(); +} + +uint32_t +rd_cr4_on_cpu(int cpu) +{ + uint32_t cr4; + + smp_call_function_single(cpu, _rd_cr4_on_cpu, &cr4, 1); + return cr4; +} + +static void +_set_in_cr4_on_cpu(void * p) +{ + uint32_t cr4; + + cr4 = read_cr4(); + cr4 |= * (uint32_t *) p; + write_cr4(cr4); +} + +void +set_in_cr4_on_cpu(int cpu, uint32_t m) +{ + smp_call_function_single(cpu, _set_in_cr4_on_cpu, &m, 1); +} + +static void +_clear_in_cr4_on_cpu(void * p) +{ + uint32_t cr4; + + cr4 = read_cr4(); + cr4 &= ~ *(uint32_t *) p; + write_cr4(cr4); +} + +void +clear_in_cr4_on_cpu(int cpu, uint32_t m) +{ + smp_call_function_single(cpu, _clear_in_cr4_on_cpu, &m, 1); +} +#endif + +uint64_t +rdtsc(void) { + uint32_t lo, hi; + __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); + return ((uint64_t) hi) << 32 | lo; +} + + + +/* +** +** Module load/unload logic +** +*/ + + +/* + * Startup job (run by MT thread) + * Intended to handle tasks that cannot impact + * module load status, such as kicking off service + * work queues, etc. 
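+ * It runs once on the MT work queue, roughly half a second after
+ * micras_init() queues it, and the module_put() at its end pairs
+ * with the try_module_get() taken in micras_init() just before the
+ * work is queued.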
+ */ + +static void +micras_init2(struct work_struct * work) +{ + /* + * Make MT one-time setup and kick + * off 1 sec timer and SCIF listeners + */ + if (! micras_stop) { + + INIT_DELAYED_WORK(&micras_wq_tick, micras_mt_tick); + queue_delayed_work(micras_wq, &micras_wq_tick, msecs_to_jiffies(5000)); + + bitmap_fill(micras_cp_fd, MR_SCIF_MAX); + INIT_DELAYED_WORK(&micras_cp_tkn, micras_cp); + queue_delayed_work(micras_cp_wq, &micras_cp_tkn, 0); + + INIT_DELAYED_WORK(&micras_mc_tkn, micras_mc); + queue_delayed_work(micras_mc_wq, &micras_mc_tkn, 0); + + INIT_DELAYED_WORK(&micras_ttl_tkn, micras_ttl); + queue_delayed_work(micras_ttl_wq, &micras_ttl_tkn, 0); + +#if defined(CONFIG_MK1OM) && WA_4845465 && DIE_PROC + if (smc_4845465) + die_pe = proc_create("die", 0644, 0, &proc_die_operations); +#endif + + printk("RAS.init: module operational\n"); + module_put(THIS_MODULE); + } +} + + +static int __init +micras_init(void) +{ + int i; + int err; + + printk("Loading RAS module ver %s. Build date: %s\n", RAS_VER, __DATE__); + + /* + * Create work queue for the monitoring thread + * and pass it some initial work to start with. + */ +#if defined(CONFIG_MK1OM) && WA_4845465 + micras_wq = alloc_workqueue("RAS MT", WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, 1); +#else + micras_wq = create_singlethread_workqueue("RAS MT"); +#endif + if (! micras_wq) { + err = -ESRCH; + printk("RAS.init: cannot start work queue, error %d\n", err); + goto fail_wq; + } + + /* + * Register top sysfs class (directory) and attach attributes (files) + * beneath it. No 'device's involved. + */ + err = class_register(&ras_class); + if (err) { + printk("RAS.init: cannot register class 'micras', error %d\n", err); + goto fail_class; + } + + /* + * Setup CP SCIF port in listening mode + */ + micras_cp_lstn = scif_open(); + if (! micras_cp_lstn) { + printk("RAS.init: cannot get SCIF CP endpoint\n"); + goto fail_cp; + } + err = scif_bind(micras_cp_lstn, MR_MON_PORT); + if (err < 0) { + printk("RAS.init: cannot bind SCIF CP endpoint, error %d\n", err); + goto fail_cp_ep; + } + err = scif_listen(micras_cp_lstn, MR_SCIF_MAX); + if (err < 0) { + printk("RAS.init: cannot make SCIF CP listen, error %d\n", err); + goto fail_cp_ep; + } + micras_cp_wq = create_singlethread_workqueue("RAS CP listen"); + if (! micras_cp_wq) { + err = -ESRCH; + printk("RAS.init: cannot start CP listener work queue, error %d\n", err); + goto fail_cp_ep; + } + + /* + * Setup MC SCIF port in listening mode + */ + micras_mc_lstn = scif_open(); + if (! micras_mc_lstn) { + printk("RAS.init: cannot get SCIF MC endpoint\n"); + goto fail_mc; + } + err = scif_bind(micras_mc_lstn, MR_MCE_PORT); + if (err < 0) { + printk("RAS.init: cannot bind SCIF MC endpoint, error %d\n", err); + goto fail_mc_ep; + } + err = scif_listen(micras_mc_lstn, MR_SCIF_MAX); + if (err < 0) { + printk("RAS.init: cannot make SCIF MC listen, error %d\n", err); + goto fail_mc_ep; + } + micras_mc_wq = create_singlethread_workqueue("RAS MC listen"); + if (! micras_mc_wq) { + err = -ESRCH; + printk("RAS.init: cannot start listener work queue, error %d\n", err); + goto fail_mc_ep; + } + + /* + * Setup TTL SCIF port in listening mode + */ + micras_ttl_lstn = scif_open(); + if (! 
micras_ttl_lstn) { + printk("RAS.init: cannot get SCIF TTL endpoint\n"); + goto fail_ttl; + } + err = scif_bind(micras_ttl_lstn, MR_TTL_PORT); + if (err < 0) { + printk("RAS.init: cannot bind SCIF TTL endpoint, error %d\n", err); + goto fail_ttl_ep; + } + err = scif_listen(micras_ttl_lstn, MR_SCIF_MAX); + if (err < 0) { + printk("RAS.init: cannot make SCIF TTL listen, error %d\n", err); + goto fail_ttl_ep; + } + micras_ttl_wq = create_singlethread_workqueue("RAS TTL listen"); + if (! micras_ttl_wq) { + err = -ESRCH; + printk("RAS.init: cannot start listener work queue, error %d\n", err); + goto fail_ttl_ep; + } + + /* + * Make the MMIO maps we need. + */ + micras_sbox = ioremap(SBOX_BASE, COMMON_MMIO_BOX_SIZE); + if (! micras_sbox) + goto fail_iomap; + + micras_dbox[0] = ioremap(DBOX0_BASE, COMMON_MMIO_BOX_SIZE); + if (! micras_dbox[0]) + goto fail_iomap; + +#ifdef CONFIG_MK1OM + micras_dbox[1] = ioremap(DBOX1_BASE, COMMON_MMIO_BOX_SIZE); + if (! micras_dbox[1]) + goto fail_iomap; +#endif + + micras_gbox[0] = ioremap(GBOX0_BASE, COMMON_MMIO_BOX_SIZE); + micras_gbox[1] = ioremap(GBOX1_BASE, COMMON_MMIO_BOX_SIZE); + micras_gbox[2] = ioremap(GBOX2_BASE, COMMON_MMIO_BOX_SIZE); + micras_gbox[3] = ioremap(GBOX3_BASE, COMMON_MMIO_BOX_SIZE); + if (!micras_gbox[0] || !micras_gbox[1] || + !micras_gbox[2] || !micras_gbox[3]) + goto fail_iomap; + +#ifdef CONFIG_MK1OM + micras_gbox[4] = ioremap(GBOX4_BASE, COMMON_MMIO_BOX_SIZE); + micras_gbox[5] = ioremap(GBOX5_BASE, COMMON_MMIO_BOX_SIZE); + micras_gbox[6] = ioremap(GBOX6_BASE, COMMON_MMIO_BOX_SIZE); + micras_gbox[7] = ioremap(GBOX7_BASE, COMMON_MMIO_BOX_SIZE); + if (!micras_gbox[4] || !micras_gbox[5] || + !micras_gbox[6] || !micras_gbox[7]) + goto fail_iomap; +#endif + +#ifdef CONFIG_MK1OM + /* + * Most SKUs don't have TBOXes. + * If not, then don't map to their MMIO space + */ + if (mr_txs()) { + micras_tbox[0] = ioremap(TXS0_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[1] = ioremap(TXS1_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[2] = ioremap(TXS2_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[3] = ioremap(TXS3_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[4] = ioremap(TXS4_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[5] = ioremap(TXS5_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[6] = ioremap(TXS6_BASE, COMMON_MMIO_BOX_SIZE); + micras_tbox[7] = ioremap(TXS7_BASE, COMMON_MMIO_BOX_SIZE); + if (!micras_tbox[0] || !micras_tbox[1] || + !micras_tbox[2] || !micras_tbox[3] || + !micras_tbox[4] || !micras_tbox[5] || + !micras_tbox[6] || !micras_tbox[7]) + goto fail_iomap; + } +#endif + + /* + * Setup non-volatile MC error logging device. + */ + if (ee_init()) + goto fail_iomap; + + /* + * Setup core MC event handler. + * If this can't fail, move into micras_wq_init instead. + */ + if (mcc_init()) + goto fail_ee; + + /* + * Setup un-core MC event handler. + * If this can't fail, move into micras_wq_init instead. + */ + if (mcu_init()) + goto fail_core; + + /* + * Prepare MT drivers + */ + mr_mt_init(); + +#if defined(CONFIG_MK1OM) && USE_PM + /* + * Setup PM interface + */ + if (pm_init()) + goto fail_uncore; +#endif + +#if defined(CONFIG_MK1OM) && WA_4845465 + /* + * Launch SMC temperature push work. + * Supported by SMC firmware later than 121.11 (build 4511). 
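+ * The revision word packs major, minor and build number into one
+ * 32-bit value; assuming PUT_BITS(hi, lo, v) places v in bits hi..lo,
+ * the 121.11/4511 threshold computed below works out to 0x790B119F
+ * (121 = 0x79, 11 = 0x0B, 4511 = 0x119F).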
+ */ + { + extern int mr_smc_rd(uint8_t, uint32_t *); + int rev, ref; + + mr_smc_rd(0x11, &rev); + if (rev) { + ref = PUT_BITS(31, 24, 121) | + PUT_BITS(23, 16, 11) | + PUT_BITS(15, 0, 4511); + + if (rev >= ref) + smc_4845465 = rev; + } + + if (smc_4845465) { + INIT_DELAYED_WORK(&micras_wq_smc, micras_mt_smc); + queue_delayed_work(micras_wq, &micras_wq_smc, 0); + printk("RAS.init: HSD 4845465 workaround active, fw %x\n", rev); + } + else + printk("RAS.init: SMC too old for HSD 4845465 workaround, fw %x\n", rev); + } +#endif + + /* + * Launch deferable setup work + */ + try_module_get(THIS_MODULE); + INIT_DELAYED_WORK(&micras_wq_init, micras_init2); + queue_delayed_work(micras_wq, &micras_wq_init, msecs_to_jiffies(500)); + printk("RAS module load completed\n"); + return err; + + /* + * Error exits: unwind all setup done so far and return failure + * + *TBD: consider calling exit function. Requires that it can tell + * with certainty what has been setup and what hasn't. + */ +#if defined(CONFIG_MK1OM) && USE_PM +fail_uncore: + mr_mt_exit(); + mcu_exit(); +#endif +fail_core: + mcc_exit(); +fail_ee: +#ifdef CONFIG_MK1OM + ee_exit(); +#endif +fail_iomap: + if (micras_sbox) + iounmap(micras_sbox); + for(i = 0; i < ARRAY_SIZE(micras_dbox); i++) + if (micras_dbox[i]) + iounmap(micras_dbox[i]); + for(i = 0; i < ARRAY_SIZE(micras_gbox); i++) + if (micras_gbox[i]) + iounmap(micras_gbox[i]); +#ifdef CONFIG_MK1OM + for(i = 0; i < ARRAY_SIZE(micras_tbox); i++) + if (micras_tbox[i]) + iounmap(micras_tbox[i]); +#endif + + destroy_workqueue(micras_ttl_wq); + +fail_ttl_ep: + scif_close(micras_ttl_lstn); + +fail_ttl: + destroy_workqueue(micras_mc_wq); + +fail_mc_ep: + scif_close(micras_mc_lstn); + +fail_mc: + destroy_workqueue(micras_cp_wq); + +fail_cp_ep: + scif_close(micras_cp_lstn); + +fail_cp: + class_unregister(&ras_class); + +fail_class: + micras_stop = 1; + flush_workqueue(micras_wq); + destroy_workqueue(micras_wq); + +fail_wq: + printk("RAS module load failed\n"); + return err; +} + + +static void __exit +micras_exit(void) +{ + int i; + scif_epd_t ep; + + printk("Unloading RAS module\n"); + micras_stop = 1; + + /* + * Disconnect MC event handlers and + * close the I2C eeprom interfaces. + */ + mcu_exit(); + mcc_exit(); + ee_exit(); + + /* + * Close SCIF listeners (no more connects). + */ + scif_close(micras_cp_lstn); + scif_close(micras_mc_lstn); + scif_close(micras_ttl_lstn); + msleep(10); + destroy_workqueue(micras_cp_wq); + destroy_workqueue(micras_mc_wq); + destroy_workqueue(micras_ttl_wq); + + /* + * Terminate active sessions by closing their end points. + * Session threads then should clean up after themselves. 
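+ * Closing an end point makes the blocking scif_recv() in the owning
+ * session thread return with an error, so the loops below just poll,
+ * a millisecond at a time for up to a second, until the CP slot
+ * bitmap is full again and the MC and TTL task pointers are cleared.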
+ */ + for(i = 0; i < MR_SCIF_MAX; i++) { + if (micras_cp_kt[i]) { + printk("RAS.exit: force closing CP session %d\n", i); + ep = (scif_epd_t) atomic64_xchg((atomic64_t *)(micras_cp_ep + i), 0); + if (ep) + scif_close(ep); + } + } + for(i = 0; i < 1000; i++) { + if (bitmap_weight(micras_cp_fd, MR_SCIF_MAX) == MR_SCIF_MAX) + break; + msleep(1); + } + if (micras_mc_kt) { + printk("RAS.exit: force closing MC session\n"); + ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_mc_ep, 0); + if (ep) + scif_close(ep); + for(i = 0; (i < 1000) && micras_mc_kt; i++) + msleep(1); + } + if (micras_ttl_kt) { + printk("RAS.exit: force closing TTL session\n"); + ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_ttl_ep, 0); + if (ep) + scif_close(ep); + for(i = 0; (i < 1000) && micras_ttl_kt; i++) + msleep(1); + } + + /* + * Tear down sysfs class and its nodes + */ + class_unregister(&ras_class); + +#if defined(CONFIG_MK1OM) && USE_PM + /* + * De-register with the PM module. + */ + pm_exit(); +#endif + + /* + * Shut down the work queues + */ +#if defined(CONFIG_MK1OM) && WA_4845465 + if (smc_4845465) + cancel_delayed_work(&micras_wq_smc); +#endif + cancel_delayed_work(&micras_wq_tick); + cancel_delayed_work(&micras_wq_init); + flush_workqueue(micras_wq); + destroy_workqueue(micras_wq); + + /* + * Restore MT state + */ + mr_mt_exit(); + + /* + * Remove MMIO region maps + */ + iounmap(micras_sbox); + for(i = 0; i < ARRAY_SIZE(micras_dbox); i++) + if (micras_dbox[i]) + iounmap(micras_dbox[i]); + for(i = 0; i < ARRAY_SIZE(micras_gbox); i++) + if (micras_gbox[i]) + iounmap(micras_gbox[i]); +#ifdef CONFIG_MK1OM + for(i = 0; i < ARRAY_SIZE(micras_tbox); i++) + if (micras_tbox[i]) + iounmap(micras_tbox[i]); +#endif + +#if defined(CONFIG_MK1OM) && WA_4845465 && DIE_PROC + if (smc_4845465 && die_pe) { + remove_proc_entry("die", 0); + die_pe = 0; + } +#endif + + printk("RAS module unload completed\n"); +} + +module_init(micras_init); +module_exit(micras_exit); + +MODULE_AUTHOR("Intel Corp. 2013 (" __DATE__ ") ver " RAS_VER); +MODULE_DESCRIPTION("RAS and HW monitoring module for MIC"); +MODULE_LICENSE("GPL"); + diff --git a/ras/micras_pm.c b/ras/micras_pm.c new file mode 100644 index 0000000..77172aa --- /dev/null +++ b/ras/micras_pm.c @@ -0,0 +1,1050 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. 
Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS PM interface + * + * Contains code to handle interaction with the PM driver. + * This includes the initial upload of core voltages and + * frequencies, handling of 'turbo' mode, and accounting + * for and reporting of card throttles. + * This really is for KnC only. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" +#include "monahan.h" +#include + +#if USE_PM + +static atomic_t pm_entry; /* Active calls from PM */ + + +/* + * Local variables to keep track of throttle states + * + * onoff Set to 1 if throttling is in effect, otherwise 0 + * count Count of complete throttles (not counting current). + * time Time spent in complete throttles + * start Time when current throttle started (or 0) + * + * Units of time is measured in jiffies and converted to mSecs + * at the end of a throttle period. Jiffies are lower resolution + * than mSec. If a throttle starts and ends within same jiffy, + * a standard penalty of 1/2 jiffy gets added. + * + *TBD: perhaps it's better simply to add 1/2 jiffy to every throttle + * period to compensate for rounding down errors. Would be fair + * if average throttle period is more than 1 jiffy long. + * + *TBD: Using atomics may be overkill. Calls from the RAS MT thread + * will be serialized (guaranteed), i.e. the report routine needs + * not to care about re-entrancy. + */ + +static atomic_t tmp_onoff; +static atomic_t tmp_count; +static atomic_long_t tmp_time; +static atomic_long_t tmp_start; + +static atomic_t pwr_onoff; +static atomic_t pwr_count; +static atomic_long_t pwr_time; +static atomic_long_t pwr_start; + +static atomic_t alrt_onoff; +static atomic_t alrt_count; +static atomic_long_t alrt_time; +static atomic_long_t alrt_start; + + +static void +mr_pwr_enter(void) +{ + if (atomic_xchg(&pwr_onoff, 1)) + return; + + atomic_long_set(&pwr_start, jiffies); +} + +static void +mr_pwr_leave(void) { + unsigned long then; + + if (! atomic_xchg(&pwr_onoff, 0)) + return; + + then = atomic_long_xchg(&pwr_start, 0); + atomic_inc(&pwr_count); + + if (jiffies == then) + atomic_long_add(jiffies_to_msecs(1) / 2, &pwr_time); + else + atomic_long_add(jiffies_to_msecs(jiffies - then), &pwr_time); +} + + +static void +mr_tmp_enter(void) +{ + if (atomic_xchg(&tmp_onoff, 1)) + return; + + atomic_long_set(&tmp_start, jiffies); +} + +static void +mr_tmp_leave(void) +{ + unsigned long then; + + if (! 
atomic_xchg(&tmp_onoff, 0)) + return; + + then = atomic_long_xchg(&tmp_start, 0); + atomic_inc(&tmp_count); + if (jiffies == then) + atomic_long_add(jiffies_to_msecs(1) / 2, &tmp_time); + else + atomic_long_add(jiffies_to_msecs(jiffies - then), &tmp_time); +} + + +static void +mr_alrt_enter(void) +{ + if (atomic_xchg(&alrt_onoff, 1)) + return; + + atomic_long_set(&alrt_start, jiffies); +} + +static void +mr_alrt_leave(void) +{ + unsigned long then; + + if (! atomic_xchg(&alrt_onoff, 0)) + return; + + then = atomic_long_xchg(&alrt_start, 0); + atomic_inc(&alrt_count); + if (jiffies == then) + atomic_long_add(jiffies_to_msecs(1) / 2, &alrt_time); + else + atomic_long_add(jiffies_to_msecs(jiffies - then), &alrt_time); +} + + + +/* + * Report current throttle state(s) to MT. + * Simple copy of local variables, except for the time + * measurement, where current throttle (if any) is included. + * Don't want a lock to gate access to the local variables, + * so the atomics needs to be read in the correct order. + * First throttle state, then adder if throttle is in + * progress, then counters. If PM enters or leave throttle + * while reading stats, the worst is that time for the + * current trottle is not included until next read. + */ + +int +mr_pm_ttl(struct mr_rsp_ttl * rsp) +{ + unsigned long then; + + rsp->power.since = 0; + rsp->power.active = (uint8_t) atomic_read(&pwr_onoff); + if (rsp->power.active) { + then = atomic_long_read(&pwr_start); + if (then) + rsp->power.since = jiffies_to_msecs(jiffies - then); + } + rsp->power.count = atomic_read(&pwr_count); + rsp->power.time = atomic_long_read(&pwr_time); + + rsp->thermal.since = 0; + rsp->thermal.active = (uint8_t) atomic_read(&tmp_onoff); + if (rsp->thermal.active) { + then = atomic_long_read(&tmp_start); + if (then) + rsp->thermal.since = jiffies_to_msecs(jiffies - then); + } + rsp->thermal.count = atomic_read(&tmp_count); + rsp->thermal.time = atomic_long_read(&tmp_time); + + rsp->alert.since = 0; + rsp->alert.active = (uint8_t) atomic_read(&alrt_onoff); + if (rsp->alert.active) { + then = atomic_long_read(&alrt_start); + if (then) + rsp->alert.since = jiffies_to_msecs(jiffies - then); + } + rsp->alert.count = atomic_read(&alrt_count); + rsp->alert.time = atomic_long_read(&alrt_time); + + return 0; +} + + +/* + * Throttle signaling function (call from PM) + */ + +static int ttl_tcrit; + +void +mr_throttle(int which, int state) +{ + struct ttl_info ttl; + uint32_t tmp; + + atomic_inc(&pm_entry); + + tmp = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2); + ttl.die = GET_BITS(19, 10, tmp); + + /* + * PM is weird in the destinction of thermal and power throttle. + * Power below PLIM should be quiet. Power between PLim1 and PLim0 + * results in TTL_POWER events. Power above PLim0 results in both + * TTL_POWER and TTL_THERMAL events, _even_ if temperature is well + * below Tcrit. We handle this by maintaining 3 throttle related + * event types: thermal throttles, power throttles and power alert. + * The power alert is flaggend on entry as TTL_POWER, no problems. + * The two throttles both come in as TTL_THERMAL, so we use current + * die temperature to determine whether it was a thermal threshold + * or the power limit that was exceeded. Point is power throttles + * arriving while temperature is above Tcrit _will_ be counted as + * thermal throttles, period. + */ + ttl.upd = 0; + switch(which) { + case TTL_POWER: + (state == TTL_OFF) ? mr_alrt_leave() : mr_alrt_enter(); + ttl.upd |= PM_ALRT_TTL_CHG; + ttl.upd |= atomic_read(&alrt_onoff) ? 
PM_ALRT_TTL : 0; + break; + + case TTL_THERMAL: +#if 1 + /* + * Careful here: may get throttle ON while die > tcrit + * and select thermal throttle correctly and then get + * the corresponding throttle OFF when die has fallen + * below tcrit in which case we must de-assert thermal + * trottle. + * As a shortcut, we deassert both throttles if the + * GPU_HOT signal gets de-asserted (which is correct). + */ + if (state == TTL_OFF) { + if (atomic_read(&pwr_onoff)) + ttl.upd |= PM_PWR_TTL_CHG; + if (atomic_read(&tmp_onoff)) + ttl.upd |= PM_TRM_TTL_CHG; + mr_pwr_leave(); + mr_tmp_leave(); + } + else { + if (ttl_tcrit && ttl.die < ttl_tcrit) { + if (! atomic_read(&pwr_onoff)) + ttl.upd |= (PM_PWR_TTL_CHG | PM_PWR_TTL); + mr_pwr_enter(); + } + else { + if (! atomic_read(&tmp_onoff)) + ttl.upd |= (PM_TRM_TTL_CHG | PM_TRM_TTL); + mr_tmp_enter(); + } + } +#else + if (ttl_tcrit && ttl.die < ttl_tcrit) { + (state == TTL_OFF) ? mr_pwr_leave() : mr_pwr_enter(); + ttl.upd |= PM_PWR_TTL_CHG; + ttl.upd |= atomic_read(&pwr_onoff) ? PM_PWR_TTL : 0; + } + else { + (state == TTL_OFF) ? mr_tmp_leave() : mr_tmp_enter(); + ttl.upd |= PM_TRM_TTL_CHG; + ttl.upd |= atomic_read(&tmp_onoff) ? PM_TRM_TTL : 0; + } +#endif + break; + } + + micras_ttl_send(&ttl); + +#if 0 + printk("ttl - args: which %d, state %d\n", which, state); + + printk("ttl - therm: on %d, count %d, time %ld, start %ld\n", + atomic_read(&tmp_onoff), atomic_read(&tmp_count), + atomic_long_read(&tmp_time), atomic_long_read(&tmp_start)); + + printk("ttl - power: on %d, count %d, time %ld, start %ld\n", + atomic_read(&pwr_onoff), atomic_read(&pwr_count), + atomic_long_read(&pwr_time), atomic_long_read(&pwr_start)); + + printk("ttl - alert: on %d, count %d, time %ld, start %ld\n", + atomic_read(&alrt_onoff), atomic_read(&alrt_count), + atomic_long_read(&alrt_time), atomic_long_read(&alrt_start)); +#endif + + atomic_dec(&pm_entry); +} + + +/* + * Throttle signaling function (call from notifier chain) + * + * TBD: should we test for odd state transitions and recursions? + */ + +static int +mr_pm_throttle_callback(struct notifier_block *nb, unsigned long event, void *msg) +{ + atomic_inc(&pm_entry); + + switch(event) { + + case EVENT_PROCHOT_ON: + mr_throttle(TTL_THERMAL, TTL_ON); + break; + + case EVENT_PROCHOT_OFF: + mr_throttle(TTL_THERMAL, TTL_OFF); + break; + + case EVENT_PWR_ALERT_ON: + mr_throttle(TTL_POWER, TTL_ON); + break; + + case EVENT_PWR_ALERT_OFF: + mr_throttle(TTL_POWER, TTL_OFF); + break; + + default: + /* + * Ignore whatever else is sent this way + */ + break; + } + + atomic_dec(&pm_entry); + return 0; +} + + + + +/* +** +** Power management routines +** +** one_mmio_rd Read one MMIO register into memory safe +** one_mmio_wr Write one MMIO register from memory safe +** +** one_msr_rd Read one MSR register into memory safe +** one_msr_wr Write one MSR register from memory safe +** +** mc_suspend Prepare for suspend, preserve CSRs to safe +** mc_suspend_cancel Suspend canceled, restore operating mode +** mc_resume Recover from suspend, restore CSRs from safe +** +** For now this stores all registers that are used by this module. +** In reality, only those registers on power planes turned off in +** deep sleep states needs to be stored, but at this point it is +** not known which registers are in that group. This is a table +** driven mechanism that _only_ handles RAS related registers. +** +**TBD: Turn off MC handlers while in suspend? 
+** Both pro's and con's on this one, such as +** + Disabling uncore is easy, just clear INT_EN +** + prevents MC to interfere with PM state transitions +** - can hide corruption due to UC errors +** - requires a lot of IPIs to shut down core MC handling +** + there's nobody to handle MCs when cores are asleep. +** ? can events hide in *BOX banks during suspend/resume +** and fire when restoring the INT_EN register? +** - Disabling core is not that easy (from a module). +** Enabling core MCEs requires setting flag X86_CR4_MCE +** in CR4 on every core _and_ writing ~0 to MSR IA32_MCG_CAP +** on every CPU. Probably better to let per-CPU routines +** like mce_suspend() and mce_resume() handle it, with +** some care because we'd want to save all CTLs before +** mce_suspend() runs and restore them after mce_resume(). +** Problem is how to get at these functions; they are not +** exported and seems not to be hooked into the kernel's PM +** call chains. Perhaps sysclass abstraction ties into PM. +** Even so, who's to invoke it and how? +*/ + +#define SAVE_BLOCK_MCA 1 /* Disable MC handling in suspend */ +#define RAS_SAVE_MSR 1 /* Include global MSRs in suspend */ +#define RAS_SAVE_CPU_MSR 0 /* Include per-CPU MSRs in suspend */ + +#define SBOX 1 /* SBOX register (index 0) */ +#define DBOX 2 /* DBOX register (index 0..1) */ +#define GBOX 3 /* GBOX register (index 0..7) */ +#define TBOX 4 /* TBOX register (index 0..7) */ +#define GMSR 5 /* Global MSR (index 0) */ +#define LMSR 6 /* Per-CPU MSR (index 0..CONFIG_NR_CPUS-1) */ + +#define W64 (1 << 6) /* 64 bit MMIO register (32 bit default) */ +#define VLD (1 << 7) /* Register value valid, can be restored */ + +typedef struct _regrec { + uint8_t box; /* Box type + width bit + valid bit */ + uint8_t num; /* Box index (or 0) */ + uint16_t ofs; /* MMIO byte offset / MSR number */ + uint64_t reg; /* Register value */ +} RegRec; + + +/* + * Rumor has it that SBOX CSRs below 0x7000 will survive deep sleep + * Think it's safer to save/restore CSRs that RAS writes to anyways. + * We'll leave out a bunch of RO CSRs, most of which are HW status. + * SCRATCH CSRs are above 0x7000 and needs to be preserved. + * + *TBD: Somebody else to preserve scratch CSRs not used by RAS? + * For now I'll save and restore all of them. 
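+ *
+ * Editor's note (illustrative addition, not part of the original source):
+ * each entry in the table below is consumed by one_mmio_rd() and
+ * one_mmio_wr() further down. For example
+ *
+ *     { GBOX | W64, 3, GBOX_FBOX_MCA_CTL_LO, 0 }
+ *
+ * requests a 64-bit save/restore of GBOX instance 3 at that offset; the
+ * VLD flag is set by the save pass and cleared again on restore.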
+ */ + +static RegRec susp_mmio[] = { /* Used in file */ + { SBOX, 0, SBOX_MCA_INT_EN, 0 }, /* Uncore, must be 1st */ + { SBOX, 0, SBOX_SCRATCH0, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH1, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH2, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH3, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH4, 0 }, /* Common, knc, */ + { SBOX, 0, SBOX_SCRATCH5, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH6, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH7, 0 }, /* Knc, knf */ + { SBOX, 0, SBOX_SCRATCH8, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH9, 0 }, /* Common, knc, knf */ + { SBOX, 0, SBOX_SCRATCH10, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH11, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH12, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH13, 0 }, /* Common */ + { SBOX, 0, SBOX_SCRATCH14, 0 }, /* - */ + { SBOX, 0, SBOX_SCRATCH15, 0 }, /* - */ +// { SBOX, 0, SBOX_COMPONENT_ID, 0 }, /* Knc */ +// { SBOX, 0, SBOX_SVIDCONTROL, 0 }, /* Knc */ +// { SBOX, 0, SBOX_PCIE_PCI_SUBSYSTEM, 0 }, /* Common */ +// { SBOX, 0, SBOX_PCIE_VENDOR_ID_DEVICE_ID, 0 }, /* Common */ +// { SBOX, 0, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8, 0 },/* Common */ + { SBOX, 0, SBOX_OC_I2C_ICR + ICR_OFFSET, 0 }, /* Elog */ + { SBOX, 0, SBOX_OC_I2C_ICR + ISR_OFFSET, 0 }, /* Elog */ + { SBOX, 0, SBOX_OC_I2C_ICR + ISAR_OFFSET, 0 }, /* Elog */ + { SBOX, 0, SBOX_OC_I2C_ICR + IDBR_OFFSET, 0 }, /* Elog */ +// { SBOX, 0, SBOX_OC_I2C_ICR + IBMR_OFFSET, 0 }, /* Elog */ +// { SBOX, 0, SBOX_COREVOLT, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_COREFREQ, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_MEMVOLT, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_MEMORYFREQ, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_CURRENTRATIO, 0 }, /* Knc */ +// { SBOX, 0, SBOX_BOARD_VOLTAGE_SENSE, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_THERMAL_STATUS, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_BOARD_TEMP1, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_BOARD_TEMP2, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_CURRENT_DIE_TEMP0, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_CURRENT_DIE_TEMP1, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_CURRENT_DIE_TEMP2, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_MAX_DIE_TEMP0, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_MAX_DIE_TEMP1, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_MAX_DIE_TEMP2, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_STATUS_FAN1, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_STATUS_FAN2, 0 }, /* Knc, knf */ +// { SBOX, 0, SBOX_SPEED_OVERRIDE_FAN, 0 }, /* Knc, knf */ + { SBOX, 0, SBOX_MCA_INT_STAT, 0 }, /* Uncore */ +// { SBOX, 0, SBOX_APICRT16, 0 }, /* Uncore */ + { SBOX, 0, SBOX_MCX_CTL_LO, 0 }, /* Uncore */ + { DBOX, 0, DBOX_MC2_CTL, 0 }, /* Uncore */ +#ifdef CONFIG_MK1OM + { DBOX, 1, DBOX_MC2_CTL, 0 }, /* Uncore */ +#endif + { GBOX | W64, 0, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ + { GBOX | W64, 1, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ + { GBOX | W64, 2, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ + { GBOX | W64, 3, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ +#ifdef CONFIG_MK1OM + { GBOX | W64, 4, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ + { GBOX | W64, 5, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ + { GBOX | W64, 6, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ + { GBOX | W64, 7, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */ +#endif +#ifdef CONFIG_MK1OM + { TBOX | W64, 0, TXS_MCX_CONTROL, 0 }, /* Uncore */ + { TBOX | W64, 1, TXS_MCX_CONTROL, 0 }, /* Uncore */ + { TBOX | W64, 2, TXS_MCX_CONTROL, 0 }, /* Uncore */ + { TBOX | W64, 3, TXS_MCX_CONTROL, 0 }, /* Uncore */ + { TBOX | W64, 4, TXS_MCX_CONTROL, 0 }, /* Uncore */ + { TBOX | W64, 5, TXS_MCX_CONTROL, 0 }, /* Uncore */ + { TBOX | W64, 6, TXS_MCX_CONTROL, 0 }, /* Uncore */ + 
{ TBOX | W64, 7, TXS_MCX_CONTROL, 0 }, /* Uncore */ +#endif +}; + +#if RAS_SAVE_MSR +static RegRec susp_msr[] = { /* Used in file */ + { GMSR, 0, MSR_IA32_MCG_STATUS, 0 }, /* Uncore, kernel */ +}; + +#if RAS_SAVE_CPU_MSR +static RegRec susp_lcl_msr[4 * CONFIG_NR_CPUS] = { /* Used in file */ + { LMSR, 0, MSR_IA32_MCx_CTL(0), 0 }, /* Core, kernel */ + { LMSR, 0, MSR_IA32_MCx_CTL(1), 0 }, /* Core, kernel */ + { LMSR, 0, MSR_IA32_MCx_CTL(2), 0 }, /* Core, kernel */ + { LMSR, 0, MSR_IA32_MCG_CTL, 0 }, /* kernel */ + /* + * The remaining entries is setup/replicated by pm_init() + */ +}; +#endif +#endif + + +static void +one_mmio_rd(RegRec * r) +{ + switch(r->box & 0xf) { + case SBOX: + if (r->box & W64) + r->reg = mr_sbox_rq(0, r->ofs); + else + r->reg = (uint64_t) mr_sbox_rl(0, r->ofs); + break; + case DBOX: + if (r->box & W64) + r->reg = mr_dbox_rq(r->num, r->ofs); + else + r->reg = (uint64_t) mr_dbox_rl(r->num, r->ofs); + break; + case GBOX: + if (r->box & W64) + r->reg = mr_gbox_rq(r->num, r->ofs); + else + r->reg = (uint64_t) mr_gbox_rl(r->num, r->ofs); + break; + case TBOX: + if (mr_txs()) { + if (r->box & W64) + r->reg = mr_tbox_rq(r->num, r->ofs); + else + r->reg = (uint64_t) mr_tbox_rl(r->num, r->ofs); + } + break; + default: + r->box &= ~VLD; + return; + } + r->box |= VLD; + +#if PM_VERBOSE + printk("mmio_rd: box %d, idx %3d, ofs %04x -> %llx\n", + r->box & 0xf, r->num, r->ofs, r->reg); +#endif +} + +static void +one_mmio_wr(RegRec * r) +{ + if (! (r->box & VLD)) + return; + + switch(r->box & 0xf) { + case SBOX: + if (r->box & W64) + mr_sbox_wq(0, r->ofs, r->reg); + else + mr_sbox_wl(0, r->ofs, (uint32_t) r->reg); + break; + case DBOX: + if (r->box & W64) + mr_dbox_wq(r->num, r->ofs, r->reg); + else + mr_dbox_wl(r->num, r->ofs, (uint32_t) r->reg); + break; + case GBOX: + if (r->box & W64) + mr_gbox_wq(r->num, r->ofs, r->reg); + else + mr_gbox_wl(r->num, r->ofs, (uint32_t) r->reg); + break; + case TBOX: + if (mr_txs()) { + if (r->box & W64) + mr_tbox_wq(r->num, r->ofs, r->reg); + else + mr_tbox_wl(r->num, r->ofs, (uint32_t) r->reg); + } + break; + } + r->box &= ~VLD; + +#if PM_VERBOSE + printk("mmio_wr: box %d, idx %3d, ofs %04x <- %llx\n", + r->box & 0xf, r->num, r->ofs, r->reg); +#endif +} + + +#if RAS_SAVE_MSR +static void +one_msr_rd(RegRec * r) +{ + uint32_t hi, lo; + + switch(r->box & 0xf) { + case GMSR: + rdmsr(r->ofs, lo, hi); + break; +#if RAS_SAVE_CPU_MSR + case LMSR: + rdmsr_on_cpu(r->num, r->ofs, &lo, &hi); + break; +#endif + default: + r->box &= ~VLD; + return; + } + r->reg = ((uint64_t) hi) << 32 | (uint64_t) lo; + r->box |= VLD; + +#if PM_VERBOSE + printk("msr_rd: box %d, idx %3d, ofs %04x -> %llx\n", + r->box & 0xf, r->num, r->ofs, r->reg); +#endif +} + +static void +one_msr_wr(RegRec * r) +{ + uint32_t hi, lo; + + if (! (r->box & VLD)) + return; + + hi = r->reg >> 32; + lo = r->reg & 0xffffffff; + switch(r->box & 0xf) { + case GMSR: + wrmsr(r->ofs, lo, hi); + break; +#if RAS_SAVE_CPU_MSR + case LMSR: + wrmsr_on_cpu(r->num, r->ofs, lo, hi); + break; +#endif + } + r->box &= ~VLD; + +#if PM_VERBOSE + printk("msr_wr: box %d, idx %3d, ofs %04x <- %llx\n", + r->box & 0xf, r->num, r->ofs, r->reg); +#endif +} +#endif /* RAS_SAVE_MSR */ + + +/* + * Preserve all HW registers that will be lost in + * deep sleep states. This will be SBOX registers + * above offset 0x7000 and all other BOX registers. + */ + +static void +mr_suspend(void) +{ + int i; + + atomic_inc(&pm_entry); + + /* + * Save SBOX_MCA_INT_EN first and clear it. + * No more uncore MCAs will get through. 
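+ *
+ * (Editor's note, added for clarity: mr_resume() below mirrors this
+ * ordering - it restores every other saved register first and writes
+ * SBOX_MCA_INT_EN back last, so uncore MCAs stay blocked until the
+ * rest of the state has been put back.)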
+ */ + one_mmio_rd(susp_mmio + 0); +#if SAVE_BLOCK_MCA + mr_sbox_wl(0, SBOX_MCA_INT_EN, 0); +#endif + + /* + * Save remaining BOX MMIOs + */ + for(i = 1; i < ARRAY_SIZE(susp_mmio); i++) + one_mmio_rd(susp_mmio + i); + +#if RAS_SAVE_MSR + /* + * Save global MSRs and set MCIP + * No new exceptions will be asserted + */ + for(i = 0; i < ARRAY_SIZE(susp_msr); i++) + one_msr_rd(susp_msr + i); +#if SAVE_BLOCK_MCA + wrmsr(MSR_IA32_MCG_STATUS, MCG_STATUS_MCIP, 0); +#endif + +#if RAS_SAVE_CPU_MSR + /* + * Save per-CPU MSRs + */ + for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++) + one_msr_rd(susp_lcl_msr + i); +#endif +#endif + + atomic_dec(&pm_entry); +} + + +/* + * Undo side effects of a suspend call. + * Nothing to do unless we turned MC handlers off. + */ + +static void +mr_cancel(void) +{ + int i; + + atomic_inc(&pm_entry); + + /* + * Restore SBOX_MCA_INT_EN to unblock uncore MCs + * Invalidate all other saved MMIO registers. + */ + one_mmio_wr(susp_mmio + 0); + for(i = 1; i < ARRAY_SIZE(susp_mmio); i++) + susp_mmio[i].box &= ~VLD; + +#if RAS_SAVE_MSR + /* + * Restore IA32_MCG_STATUS to unblock core MCs + * Invalidate all other saved MSR registers. + */ + one_msr_wr(susp_msr + 0); + for(i = 1; i < ARRAY_SIZE(susp_msr); i++) + susp_msr[i].box &= ~VLD; + +#if RAS_SAVE_CPU_MSR + for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++) + susp_lcl_msr[i].box &= ~VLD; +#endif +#endif + + atomic_dec(&pm_entry); +} + + +/* + * Restore all HW registers that we use. + */ + +static void +mr_resume(void) +{ + int i; + + atomic_inc(&pm_entry); + + /* + * Clear uncore MCA banks (just in case) + */ + if (susp_mmio[0].box & VLD) + box_reset(0); + + /* + * Restore all BOX MMIOs but SBOX_MCA_INT_EN + */ + for(i = 1; i < ARRAY_SIZE(susp_mmio); i++) + one_mmio_wr(susp_mmio + i); + + /* + * Then restore SBOX_MCA_INT_EN to enable uncore MCAs + */ + one_mmio_wr(susp_mmio + 0); + +#if RAS_SAVE_MSR + /* + * Restore all global MSRs but IA32_MCG_STATUS + */ + for(i = 1; i < ARRAY_SIZE(susp_msr); i++) + one_msr_wr(susp_msr + i); + + /* + * Then restore IA32_MCG_STATUS to allow core MCAs + */ + one_msr_wr(susp_msr + 0); + +#if RAS_SAVE_CPU_MSR + /* + * Restore all per-cpu MSRs + */ + for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++) + one_msr_wr(susp_lcl_msr + i); +#endif +#endif + + atomic_dec(&pm_entry); +} + + +/* + * Callback from PM notifier chain. + * TBD: should we test for odd state transitions and recursions? + */ + +static int +mr_pm_callback(struct notifier_block *nb, unsigned long event, void *msg) +{ + + switch(event) { + case MICPM_DEVEVENT_SUSPEND: + mr_suspend(); + break; + + case MICPM_DEVEVENT_RESUME: + mr_resume(); + break; + + case MICPM_DEVEVENT_FAIL_SUSPEND: + mr_cancel(); + break; + + default: + /* + * Ignore whatever else is sent this way + */ + break; + } + + return 0; +} + + + +/* +** +** The PM module loads before RAS, so we must setup +** the API to support power management, i.e register. +** PM needs: +** - Notification when MT changes certain variables. +** Provided by a call-out list that the PM sets +** at registration time. +** - Access to MT calls. +** The PM module can use micras_mt_call() for access. +** Since PM loads first, this function needs to +** be passed at registration time. +** RAS needs: +** - list of core voltages (for CVOLT query). +** We pass a pointer to the voltage list and the +** voltage list counter to PM module, who will +** fill in the actual values (not available until +** core-freq driver loads). +** - list of core frequencies (for CFREQ query). +** Same solution as for CVOLT. 
+** - Notifications for throttle state changes. +** - Power management notifications for suspend/resume. +** +** Note: can one notifier block be inserted in multiple +** chains? Its assume not, which require two blocks +** both pointing to the same local function. +*/ + +extern struct mr_rsp_freq freq; +extern struct mr_rsp_volt volt; + +struct micpm_params pm_reg; /* Our data for PM */ +struct micpm_callbacks pm_cb; /* PM data for us */ + +extern void micpm_device_register(struct notifier_block *n); +extern void micpm_device_unregister(struct notifier_block *n); +extern void micpm_atomic_notifier_register(struct notifier_block *n); +extern void micpm_atomic_notifier_unregister(struct notifier_block *n); + +static struct notifier_block ras_deviceevent = { + .notifier_call = mr_pm_callback, +}; + +static struct notifier_block ras_throttle_event_ns = { + .notifier_call = mr_pm_throttle_callback, +}; + +static struct notifier_block ras_throttle_event = { + .notifier_call = mr_pm_throttle_callback, +}; + + +/* + * Setup PM callbacks and SCIF handler. + */ + +static int +pm_mt_call(uint16_t cmd, void * buf) +{ + int err; + + atomic_inc(&pm_entry); + err = micras_mt_call(cmd, buf); + atomic_dec(&pm_entry); + + return err; +} + + +int __init +pm_init(void) +{ + extern int mr_smc_rd(uint8_t, uint32_t *); + +#if RAS_SAVE_CPU_MSR + /* + * Preset MCA bank MSR register descriptions + * + *TBD: We have to use IPIs to read MSRs, which will wake + * up cores at sleep when this function is called. + * PM module may not like this at all. + */ + int i, j; + for(i = 1; i < nr_cpu_ids; i++) { + j = 4 * i; + susp_lcl_msr[j] = susp_lcl_msr[0]; + susp_lcl_msr[j + 1] = susp_lcl_msr[1]; + susp_lcl_msr[j + 2] = susp_lcl_msr[2]; + susp_lcl_msr[j + 3] = susp_lcl_msr[3]; + susp_lcl_msr[j].num = i; + susp_lcl_msr[j + 1].num = i; + susp_lcl_msr[j + 2].num = i; + susp_lcl_msr[j + 3].num = i; + } +#endif + + /* + * Get temperature where power throttle becomes thermal throttle + */ + mr_smc_rd(0x4c, &ttl_tcrit); + + /* + * Register with the MIC Power Management driver. + */ + pm_reg.volt_lst = volt.supt; + pm_reg.volt_len = &volt.slen; + pm_reg.volt_siz = ARRAY_SIZE(volt.supt); + pm_reg.freq_lst = freq.supt; + pm_reg.freq_len = &freq.slen; + pm_reg.freq_siz = ARRAY_SIZE(freq.supt); + pm_reg.mt_call = pm_mt_call; + pm_reg.mt_ttl = mr_throttle; + if (micpm_ras_register(&pm_cb, &pm_reg)) + goto fail_pm; + + /* + * Get into the PM notifier lists + * MicPm reports events in 2 chains, one atomic and one + * blocking. Our callback will not block! + */ + micpm_atomic_notifier_register(&ras_throttle_event_ns); + micpm_notifier_register(&ras_throttle_event); + + if (boot_cpu_data.x86_mask == KNC_C_STEP) + micpm_device_register(&ras_deviceevent); + + printk("RAS.pm: init complete\n"); + return 0; + +fail_pm: + printk("RAS.pm: init failed\n"); + return 1; +} + + +/* + * Cleanup for module unload. + * Clear/restore hooks in the native MCA handler. + */ + +void __exit +pm_exit(void) +{ + /* + * Get off the PM notifier list + */ + micpm_atomic_notifier_unregister(&ras_throttle_event_ns); + micpm_notifier_unregister(&ras_throttle_event); + + if (boot_cpu_data.x86_mask == KNC_C_STEP) + micpm_device_unregister(&ras_deviceevent); + + /* + * De-register with the PM module. + */ + micpm_ras_unregister(); + + /* + * Wait for an calls to module to finish. 
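+ *
+ * (Editor's note, added for clarity: pm_entry is incremented and
+ * decremented around every path the PM module can call into - the
+ * throttle and device notifier callbacks, mr_throttle() and
+ * pm_mt_call() - so spinning here until it drops to zero ensures no
+ * such call is still executing inside this module when it unloads.)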
+ */ + while(atomic_read(&pm_entry)) + cpu_relax(); + + printk("RAS.pm: exit complete\n"); +} + +#endif /* USE_PM */ diff --git a/ras/micras_uncore.c b/ras/micras_uncore.c new file mode 100644 index 0000000..af1e3a4 --- /dev/null +++ b/ras/micras_uncore.c @@ -0,0 +1,1194 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * RAS handler for uncore MC events + * + * Contains code to intercept MC events, collect information + * from uncore MCA banks and handle the situation. + * + * In case of a severe event, defined by corrupted context, + * the handler will add a record of the event in the designated + * EEPROM hanging off the Over Clocking I2C bus. After that + * a message will be sent to the SMC (enabling IPMI notifications) + * and at last a message is sent to the host via the MC SCIF + * connection. + * + * Lesser events will also be sent to the host on a 'FYI' basis, + * but no rocord will be stored in the event log. + * + * This is in all aspects similar to the reaction to a severe + * core MC event. Differences are in the MC bank access (mmio), + * and that the event is delivered via an interrupt instead of + * an exception. Still, the handler cannot expect any support + * from the OS. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "micras.h" + + +/* + * Hooks placed in the native machine check handler + * See file arch/x86/kernel/traps.c for placement + * + * nmi Entered NMI exception handler. + * Called before any other tests, which allow us + * to test for and handle un-core MCA events before + * the traditional NMI handling. + * Note that the mce-inject mechanism also uses + * NMI's to distribute calls to do_machine_check(). 
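+ *
+ * Editor's sketch (an assumption - the patched traps.c is not shown in
+ * this file): the hook is expected to be consulted at the top of the
+ * kernel's NMI path, roughly as
+ *
+ *     if (mca_nmi && mca_nmi(smp_processor_id()))
+ *             return;         /* un-core event handled, swallow the NMI */
+ *
+ * so a non-zero return from mcu_nmi() below claims the NMI.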
+ */ + +extern int (*mca_nmi)(int); + + + +/* + * Table of un-core MCA banks. + * Though there are differences in register count and sizes, un-core bank + * registers are always spaced 8 bytes apart, so all we need to know is + * the location of the first MCA bank register (CTL) to find them. + * If bank is present, the bank register offsets for ctl, status, addr, + * and misc are thus 0, 8, 16, and 24 respectively. + * Default CTL masks pulled from the register documentation + * Some SKUs don't have support for all BOXs but that will be handled + * at runtime in the support code, not at compile time by this table. + */ + + +#ifdef CONFIG_ML1OM +#define SBOX_DEF 0x000e /* All (7) */ +#define DBOX_DEF 0x0003 /* All (2) */ +#define GBOX_DEF 0x0003 /* All (2) */ +#endif +#ifdef CONFIG_MK1OM +#define SBOX_DEF 0x03ce /* All - PCIe errors (7) */ +#define DBOX_DEF 0x000f /* All (4) */ +#define GBOX_DEF 0x3ffffffff /* All (34) */ +#define TBOX_DEF 0x001f /* All (5) */ +#endif + +#define MCU_CTL_64 (1 << 0) /* Bank has 64 bit CTL register */ +#define MCU_NO_ADDR (1 << 1) /* Bank has no ADDR register */ +#define MCU_ADDR_32 (1 << 2) /* Bank has 32 bit ADDR register */ +#define MCU_NO_MISC (1 << 3) /* Bank has no MISC register */ +#define MCU_MISC_64 (1 << 4) /* Bank has 64 bit MISC register */ + +#define MCU_CTRL 0 +#define MCU_STAT 8 +#define MCU_ADDR 16 +#define MCU_MISC 24 + +typedef struct _mcu_rec { + uint8_t num; /* 'BOX' count */ + uint8_t org; /* Origin code */ + uint8_t qflg; /* Quirk flags */ + uint16_t ofs; /* MCA bank base offset */ + uint64_t ctl; /* Initial CTL mask */ + uint32_t (*rl)(int, uint32_t); /* 32-bit MMIO read */ + void (*wl)(int, uint32_t, uint32_t); /* 32-bit MMIO write */ + uint64_t (*rq)(int, uint32_t); /* 64-bit MMIO read */ + void (*wq)(int, uint32_t, uint64_t); /* 64-bit MMIO write */ +} McuRec; + + +static McuRec mcu_src[] = { + { 1, MC_ORG_SBOX, MCU_MISC_64, SBOX_MCX_CTL_LO, + SBOX_DEF, mr_sbox_rl, mr_sbox_wl, mr_sbox_rq, mr_sbox_wq }, + { DBOX_NUM, MC_ORG_DBOX, MCU_NO_MISC, DBOX_MC2_CTL, + DBOX_DEF, mr_dbox_rl, mr_dbox_wl, mr_dbox_rq, mr_dbox_wq }, + { GBOX_NUM, MC_ORG_GBOX, MCU_CTL_64, GBOX_FBOX_MCA_CTL_LO, + GBOX_DEF, mr_gbox_rl, mr_gbox_wl, mr_gbox_rq, mr_gbox_wq }, +#ifdef CONFIG_MK1OM + { TBOX_NUM, MC_ORG_TBOX, MCU_CTL_64 | MCU_NO_MISC | MCU_ADDR_32, TXS_MCX_CONTROL, + TBOX_DEF, mr_tbox_rl, mr_tbox_wl, mr_tbox_rq, mr_tbox_wq }, +#endif +}; + +#define GBOX_BROKEN 1 /* Set if GBOX MCA bank is borken */ + +#if GBOX_BROKEN +/* + * Si design managed to break the GBOX MCA bank concept + * by not filling useful data into ADDR and MISC registers. + * Instead they use a bunch of registers in another part + * of the GBOX (mbox to be specific) to hold this info. + * In order to get at the right register it is necesary + * to partially decode the STATUS register and from there + * select an GBOX.MBOX register. + * Since the new registers are all 32 bits wide, we'll stick + * the value into MISC register if Misc_V bit of STATUS is + * not set. 
The following table is used for register selection + * + * model code base width Chan Notes + * 0 017c 32 0 26 bit address, CRC (retrain) + * 1 097c 32 1 26 bit address, CRC (retrain) + * 2 01e0 32 0 26 bit address, ECC + * 3 09e0 32 1 26 bit address, ECC + * 4 01dc 32 0 26 bit address, UC CAPE + * 5 09dc 32 1 26 bit address, UC CAPE + * 31 01a4 32 0 26 bit address, UC ECC + * 32 09a4 32 1 26 bit address, UC ECC + * + * Note: model code is simply the enable bit number in CTL + */ + +static struct liu { + uint16_t mcode; + uint16_t base; +} liu[] = { + { 0, 0x17c }, /* Correctable CRC (retrain) ch 0 */ + { 1, 0x97c }, /* Correctable CRC (retrain) ch 1 */ + { 2, 0x1e0 }, /* Correctable ECC, ch 0 */ + { 3, 0x9e0 }, /* Correctable ECC, ch 1 */ + { 4, 0x1dc }, /* Uncorrectable CAPE, ch 0 */ + { 5, 0x9dc }, /* Uncorrectable CAPE, ch 1 */ + { 31, 0x1a4 }, /* Uncorrectable ECC, ch 0 */ + { 32, 0x9a4 } /* Uncorrectable ECC, ch 1 */ +}; + +static void +mcu_gbox_fixup(McuRec * mr, int num, MceInfo * mi) +{ + int i; + uint16_t mcode; + + /* + * Skip if Status.Misc_v set + */ + if (mi->status & (1ULL << 59)) + return; + + /* + * Get model code and if it's in the array, then read + * the addressed register into MISC. We don't set the + * Status.Misc_v bit because we want to distinguish + * this hack from the real MCA bank register. + */ + mcode = GET_BITS(31, 16, mi->status); + for(i = 0; i < ARRAY_SIZE(liu); i++) + if (liu[i].mcode == mcode) { + mi->misc = (uint64_t) mr->rl(num, liu[i].base); + break; + } +} +#endif + +/* + * Read Ctrl, Addr and Misc registers from an un-core MCA bank. + * The Status register is read/cleared in mcu_scan(). + */ + +static void +mcu_read(McuRec * mr, int num, MceInfo * mi) +{ + if (mr->qflg & MCU_CTL_64) + mi->ctl = mr->rq(num, mr->ofs + MCU_CTRL); + else + mi->ctl = (uint64_t) mr->rl(num, mr->ofs + MCU_CTRL); + + if (mr->qflg & MCU_NO_ADDR) + mi->addr = 0; + else { + if (mr->qflg & MCU_ADDR_32) + mi->addr = (uint64_t) mr->rl(num, mr->ofs + MCU_ADDR); + else + mi->addr = mr->rq(num, mr->ofs + MCU_ADDR); + } + + if (mr->qflg & MCU_NO_MISC) + mi->misc = 0; + else { + if (mr->qflg & MCU_MISC_64) + mi->misc = mr->rq(num, mr->ofs + MCU_MISC); + else + mi->misc = (uint64_t) mr->rl(num, mr->ofs + MCU_MISC); + } + +#if GBOX_BROKEN + if (mr->org == MC_ORG_GBOX) + mcu_gbox_fixup(mr, num, mi); +#endif +} + + +/* + * Reset one un-core MCA bank + * Any quirks go here. + */ + +static void +mcu_reset(McuRec * mr, int num, int arm) +{ + uint64_t ctl; + + mr->wq(num, mr->ofs + MCU_STAT, 0); + + if (! (mr->qflg & MCU_NO_ADDR)) { + if (mr->qflg & MCU_ADDR_32) + mr->wl(num, mr->ofs + MCU_ADDR, 0); + else + mr->wq(num, mr->ofs + MCU_ADDR, 0); + } + + if (! (mr->qflg & MCU_NO_MISC)) { + if (mr->qflg & MCU_MISC_64) + mr->wq(num, mr->ofs + MCU_MISC, 0); + else + mr->wl(num, mr->ofs + MCU_MISC, 0); + } + + ctl = arm ? mr->ctl : 0; + +#ifdef CONFIG_MK1OM + if (ctl && mr->org == MC_ORG_SBOX && mic_hw_stepping(0) == KNC_A_STEP) + ctl &= ~PUT_BIT(3, 1); /* A0 SBOX 'unclaimed address' bug */ + + if (ctl && mr->org == MC_ORG_GBOX && mr_mch() != 16) + ctl &= ~(uint64_t) PUT_BIT(6, 1); /* B0 GBOX 'Invalid Channel' (SKU 3 & 4) */ +#endif + + if (mr->qflg & MCU_CTL_64) + mr->wq(num, mr->ofs + MCU_CTRL, ctl); + else + mr->wl(num, mr->ofs + MCU_CTRL, ctl); +} + + +/* + * Un-core MC bank pre-scan + * Walk through all un-core MC sources to see if any events are pending. + * Stops on 1st match where STATUS has both VAL bit set. 
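+ *
+ * (Editor's note, added for clarity: the pre-scan below tests
+ * MCI_STATUS_VAL only; the EN bit is evaluated later in mcu_scan(),
+ * which is also where STATUS gets cleared.)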
On some BOXes, + * like GBOX, interrupt may be signalled without the EN bit being set. + * See HSD 4116374 for details. + */ + +static int +mcu_prescan(void) +{ + int i, j; + uint64_t status; + struct _mcu_rec * mr; + + for(i = 0; i < ARRAY_SIZE(mcu_src); i++) { + mr = mcu_src + i; + +#ifdef CONFIG_MK1OM + if (mr->org == MC_ORG_TBOX && !mr_txs()) + continue; +#endif + + for(j = 0; j < mr->num; j++) { + status = mr->rq(j, mr->ofs + MCU_STAT); + if (status & MCI_STATUS_VAL) + return 1; + } + } + + return 0; +} + + +/* + * Un-core MC bank scanner. + * Walks through all un-core MC sources for new events. + * If any found, then process them same way as core events. + */ + +static int +mcu_scan(void) +{ + MceInfo mc, uc; + int gone, seen; + int i, j; + struct _mcu_rec * mr; + + /* + * Walk list of known un-core MC sources + */ + gone = seen = 0; + memset(&uc, 0, sizeof(uc)); + for(i = 0; i < ARRAY_SIZE(mcu_src); i++) { + mr = mcu_src + i; + +#ifdef CONFIG_MK1OM + if (mr->org == MC_ORG_TBOX && !mr_txs()) + continue; +#endif + + for(j = 0; j < mr->num; j++) { + + /* + * Read status to see if we have something of interest. + * As per HSD 4116374 the status register is cleared + * after read, if it had valid content. + *TBD: Clear unconditionally? + */ + mc.status = mr->rq(j, mr->ofs + MCU_STAT); + if (mc.status & MCI_STATUS_VAL) + mr->wq(j, mr->ofs + MCU_STAT, 0); + else + continue; + + /* + * Bank had valid content (VAL bit set). + * Verify the event was subscribed to (EN bit set). + * If not, the event is ignored. + */ + if (! (mc.status & MCI_STATUS_EN)) + continue; + + /* + * Valid and enabled event, read remaining bank registers. + */ + seen++; + mcu_read(mr, j, &mc); + + /* + * Fill out blanks in the MceInfo record + */ + mc.org = mr->org; + mc.id = j; + mc.stamp = get_seconds(); + mc.flags = (mc.status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0; + + /* + * If any way to detect injected errors then this is + * the place to do so and indicate by MC_FLG_FALSE flag + */ + + if (mc.flags & MC_FLG_FATAL) { +#ifdef CONFIG_MK1OM +#if MC_VERBOSE + ee_printk("Uncore fatal MC: org %d, id %d, status %lx\n", mc.org, mc.id, mc.status); +#endif + + /* + * Log UC events in the eeprom. + */ + micras_mc_log(&mc); + mc.flags |= MC_FLG_LOG; + + /* + * Notify SMC that we've had a serious machine check error. + */ + micras_mc_ipmi(&mc, 1); +#endif + /* + * Remember 1st fatal (UC) event + */ + if (! gone++) + uc = mc; + } + + /* + * Notify host + */ + micras_mc_send(&mc, 1); + + /* + * Filter corrected errors. + */ + if (! (mc.flags & MC_FLG_FATAL)) { + uint64_t tsc, msk; + + tsc = rdtsc(); + msk = micras_mc_filter(&mc, tsc, 1); + if (msk) { +#if MC_VERBOSE + ee_printk("Uncore filter: org %d, id %d, ctrl %lx, mask %lx\n", mc.org, mc.id, mc.ctl, msk); +#endif + if (mr->qflg & MCU_CTL_64) + mr->wq(j, mr->ofs + MCU_CTRL, mc.ctl & ~msk); + else + mr->wl(j, mr->ofs + MCU_CTRL, (uint32_t)(mc.ctl & ~msk)); + } + } + + /* + * Any event post processing goes here. + * This would be things like cache line refresh and such. + * Actual algorithms are TBD. + */ + } + } + +#if RAS_HALT + if (gone) { + atomic_inc(&mce_entry); + panic("FATAL un-core machine check event:\n" + "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", + uc.org, uc.id, uc.ctl, uc.status, uc.addr, uc.misc); + } +#endif + + return seen; +} + + +/* + * NMI handler. + * + * Once we get control in 1st interrupt (NMI or regular), we'll + * use IPIs from the local APIC to force all active CPU's into + * our RAS NMI handler, similar to the core MC handler. 
+ * After that, the same logic as for the generic MC handler is + * applied to corral all CPU's through well defined rendez-vous + * points where only one cpu gets to run the un-core MC event + * scan while everybody else are sitting in a holding pen. + * If containment wasn't an issue we could simply let the BP + * run the scan without involving other CPUs at all. + */ + +#define SPINUNIT 50 +#define SERIAL_MCU 0 + +struct cpumask mcu_exc_mask; /* NMI recipients */ +static int mcu_cpu = -1; /* SBOX target CPU */ +#if MCU_NMI +static uint64_t mcu_redir; /* SBOX I/O-APIC redirection entry */ +static uint64_t mcu_old_redir; /* Restore value for redirection entry */ +#else +unsigned int mcu_eoi; /* 1st interrupt from local APIC */ +#endif +static atomic_t mcu_callin; /* Entry rendez-vous gate */ +static atomic_t mcu_leavin; /* Hold rendez-vous gate */ + + +static int +mcu_timed_out(int64_t * timeout) +{ + if (*timeout < SPINUNIT) + return 1; + + *timeout -= SPINUNIT; + touch_nmi_watchdog(); + ndelay(SPINUNIT); + + return 0; +} + + +static int +mcu_wait(void) +{ + int cpus, order; + int64_t timeout; + + cpus = num_online_cpus(); + timeout = 1 * NSEC_PER_SEC; /* 1 Second */ + + /* + * Flush all caches + */ + + /* + * 'Entry' rendez-vous point. + * Wait here until all CPUs has entered. + */ + order = atomic_inc_return(&mcu_callin); + while(atomic_read(&mcu_callin) != cpus) { + if (mcu_timed_out(&timeout)) { + /* + * Timout waiting for CPU enter rendez-vous + */ + return -1; + } + } + + /* + * 'Hold' rendez-vous point. + * All CPUs drop by here 'simultaneously'. + * The first CPU that 'enter'ed (order of 1) will + * fall thru while the others wait until their + * number number comes up in the 'leavin' counter + * (or if a timeout happens). This also has a + * serializing effect, where one CPU leaves this + * loop at a time. + */ + if (order == 1) { +#if SERIAL_MCU + atomic_set(&mcu_leavin, 1); +#endif + } + else { + while(atomic_read(&mcu_leavin) < order) { + if (mcu_timed_out(&timeout)) { + /* + * Timout waiting in CPU hold rendez-vous + */ + return -1; + } + } + } + + return order; +} + + +static int +mcu_go(int order) +{ + int ret; + int64_t timeout; + + ret = -1; + if (order < 0) + goto mcu_reset; + +#if SERIAL_MCU + /* + * If any 'per-CPU' activity is needed in isolation + * (one CPU at a time) then that code needs to go here. + */ + + atomic_inc(&mcu_leavin); /* Next CPU out of hold */ +#endif + + timeout = NSEC_PER_SEC; /* 1 Second */ + if (order == 1) { + int cpus; + + /* + * The first CPU that entered (order of 1) waits here + * for the others to leave the 'hold' loop in mca_wait() + * and enter the 'exit' rendez-vous loop below. + * Once they are there, it will run the uncore MCA bank + * scan while the others are parked in 'exit' loop below. + */ + cpus = num_online_cpus(); +#if SERIAL_MCU + while(atomic_read(&mcu_leavin) <= cpus) { + if (mcu_timed_out(&timeout)) { + /* + * Timout waiting for CPU exit rendez-vous + */ + goto mcu_reset; + } + } +#else + atomic_set(&mcu_leavin, cpus); +#endif + mcu_scan(); + ret = 0; + } + else { + /* + * Exit rendez-vous point. + */ + while(atomic_read(&mcu_leavin) != 0) { + if (mcu_timed_out(&timeout)) { + /* + * Timout waiting in CPU exit rendez-vous + */ + goto mcu_reset; + } + } + return 0; + } + + /* + * Reset rendez-vous counters, letting all CPUs + * leave this function 'simultaneously'. 
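+ *
+ * (Editor's note, added for clarity: clearing mcu_leavin is what
+ * releases the CPUs still spinning in the exit loop above, and
+ * mcu_callin must be zeroed as well so the next un-core event starts
+ * with a fresh entry gate.)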
+ */ +mcu_reset: + atomic_set(&mcu_callin, 0); + atomic_set(&mcu_leavin, 0); + return ret; +} + + +/* + * NMI exception handler + * Uncertain if all cpumask_* functions implies barriers, + * so erroring on the safe side explicit barriers is used. + */ + +#if BEAM_TEST +static int +mcu_nmi(int cpu) +{ +#ifdef CONFIG_MK1OM + uint32_t mcg_status_lo, mcg_status_hi; +#endif + struct _mcu_rec * mr; + MceInfo mc; + int i, j; + + if (cpu != mcu_cpu) + return 0; + + if (! mcu_prescan()) + return 0; + + wbinvd(); + +#ifdef CONFIG_MK1OM + rdmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi); + wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo | MCG_STATUS_MCIP, mcg_status_hi); +#endif + + for(i = 0; i < ARRAY_SIZE(mcu_src); i++) { + mr = mcu_src + i; + +#ifdef CONFIG_MK1OM + if (mr->org == MC_ORG_TBOX && !mr_txs()) + continue; +#endif + + for(j = 0; j < mr->num; j++) { + mc.status = mr->rq(j, mr->ofs + MCU_STAT); + + if (! (mc.status & MCI_STATUS_VAL)) + continue; + + if (! (mc.status & MCI_STATUS_EN)) { + mr->wq(j, mr->ofs + MCU_STAT, 0); + continue; + } + + mcu_read(mr, j, &mc); + mr->wq(j, mr->ofs + MCU_STAT, 0); + + mc.org = mr->org; + mc.id = j; + mc.stamp = get_seconds(); + mc.flags = (mc.status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0; + + micras_mc_send(&mc, 1); + } + } + +#ifdef CONFIG_MK1OM + wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi); +#endif + return 1; + + /* + * Damn compiler options !!!!!! + * Don't want more changes than this routine, so + * added dummies to shut up gcc about unused code. + */ + i = mcu_wait(); + mcu_go(i); +} +#else + +static atomic_t mcu_entry; + +static int +mcu_nmi(int cpu) +{ +#ifdef CONFIG_MK1OM + uint32_t mcg_status_lo, mcg_status_hi; +#endif + int order, eoi; + + atomic_inc(&mcu_entry); + + /* + * Get MCA status from SBOX. + */ +#if 0 + /* + * If no source bits set, this was not an un-core MCA + * This would work if the SBOX_MCA_INT_STAT actually worked + * as described both in HAS and register specification. + * Unfortunately, it doesn't, as per tribal knowledge errata. + */ + uint32_t int_stat, int_en; + + int_en = mr_sbox_rl(0, SBOX_MCA_INT_EN); + int_stat = mr_sbox_rl(0, SBOX_MCA_INT_STAT); + if (! (int_en & int_stat)) { + atomic_dec(&mcu_entry); + return 0; + } +#else + /* + * Instead of having a single source of pending un-core MCA events, + * we now have to walk all BOXes to check if there is a valid event + * pending in one of them. That is much more expensive as we have + * to check this on all NMIs, including our own cascade NMIs used + * to corrall all CPUs in their rendezvouz point(s). We try to avoid + * this scan if there already is an un-core NMI in progress. + * We know that: + * un-core MCA NMIs are sent to just one CPU, mcu_cpu + * CPUs targeted in the cascade are in mcu_exc_mask + * non-zero atomic variable 'mcu_callin' tells cascade is in progress + */ + if (!cpumask_empty(&mcu_exc_mask)) + goto invited; + if (cpu != mcu_cpu) { + atomic_dec(&mcu_entry); + return 0; + } + + /* + * On CPU 0 and no un-core handling in progress! + * Then scan all BOXes for valid events pending, + * If there wasn't any, this is a false alarm and + * we'll re-connect MC lines and return. + */ + if (! mcu_prescan()) { + atomic_dec(&mcu_entry); + return 0; + } + +invited: +#endif + + /* + * Flush all caches. + * This is uncore so it should not be necessary to + * empty internal (L1) caches, doesn't harm either. + */ + wbinvd(); + + /* + * We do not want to be interrupted by a core MC + * exception while handling an NMI. 
We can block + * core MC events by setting the MCG_STATUS_MCIP. + * This is a MSR, so it has to be done on all CPUs. + * On KnC that is, KnF does not have that MSR. + */ +#ifdef CONFIG_MK1OM + rdmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi); + wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo | MCG_STATUS_MCIP, mcg_status_hi); +#endif + + /* + * Special for the SBOX NMI target CPU: + * - disconnect un-core MC lines from SBOX I/O-APIC, such + * that we don't get stacked NMIs in the Local APICs. + * - simulate a NMI broadcast by sending NMI to all _other_ + * active CPUs via IPIs. The SBOX could do a broadcast, + * but that will send NMIs to sleeping CPUs too, which + * we prefer to avoid if possible. + *TBD: should creating the mcu_exc_mask be protected by + * lock, similar to core events? Who can interfere? + */ + if (cpu == mcu_cpu) { + mr_sbox_wl(0, SBOX_MCA_INT_EN, 0); + cpumask_copy(&mcu_exc_mask, cpu_online_mask); + cpumask_clear_cpu(cpu, &mcu_exc_mask); + smp_wmb(); + // apic->send_IPI_mask(&mcu_exc_mask, NMI_VECTOR); + apic->send_IPI_allbutself(NMI_VECTOR); +#if !MCU_NMI + if (mcu_eoi) { + smp_rmb(); + cpumask_set_cpu(cpu, &mcc_exc_mask); + smp_wmb(); + mcu_eoi = 0; + } +#endif + } + + /* + * Corral all CPUs through the rendez-vous point maze. + * It guarantees that: + * - No CPU leaves mcu_wait() until all has entered. + * - One CPU leaves mcu_wait() at a time. + * - No CPU leaves mcu_go() until all has entered. + * - While one CPU is in transit between mcu_wait() + * and mcu_go(), all other CPUs are sitting in + * tight busy-wait loops in either function. + * - All CPUs leaves mcu_go() at the same time. + * If there is any 'per-cpu' activity that needs to be + * run in isolation, it must be placed between mcu_wait() + * and mcu_go(). + */ + order = mcu_wait(); + if (mcu_go(order)) { + /* + * Timeout waiting at one of the rendez-vous points. + * Scan the un-core MCA banks just in case. + */ + mcu_scan(); + } + + /* + * Special for the SBOX NMI target CPU: + * - reconnect un-core MC lines through to SBOX I/O-APIC. + * If new events already are pending, then this will + * result in a 'rising-edge' trigger to the I/O-APIC. + */ + if (cpu == mcu_cpu) + mr_sbox_wl(0, SBOX_MCA_INT_EN, mr_txs() ? 0x0fffff07 : 0xff07); + + /* + * If this CPU got its NMI from an IPI, then it must + * send an ACK to its local APIC (I think). + */ + smp_rmb(); + eoi = cpumask_test_and_clear_cpu(cpu, &mcu_exc_mask); + smp_wmb(); + if (eoi) + ack_APIC_irq(); + + /* + * Restore core MCG status and return 1 indicating to the + * kernel NMI handler we've handled it. + *TBD: reduce to one write per core instead of one per thread? + */ +#ifdef CONFIG_MK1OM + wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi); +#endif + atomic_dec(&mcu_entry); + return 1; +} +#endif + + +#if !MCU_NMI +/* + * MCA handler if using standard interrupts + * It's just a trampoline to convert a regular interrupt + * into an NMI, which is only needed if the I/O-APIC can't + * generate and NMI. + * + *TBD: remove all this? It is not used on KnC, and the KnF's + * I've tested this on all have been OK sending NMIs. + */ + +static irqreturn_t +sbox_handler(int irq, void * tag) +{ + /* + * Convert this regular interrupt into an NMI. 
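+ *
+ * (Editor's note, added for clarity: recording this CPU in mcu_cpu lets
+ * the self-NMI raised below pass the "cpu != mcu_cpu" test in mcu_nmi(),
+ * and mcu_eoi flags that the NMI arrived via the local APIC and so
+ * needs an ack_APIC_irq() on the way out.)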
+ */ + mcu_cpu = smp_processor_id(); + mcu_eoi = 1; + apic->send_IPI_self(NMI_VECTOR); + return IRQ_HANDLED; +} +#endif + + +/* + * Reset all uncore MCA banks to defaults + */ + +void +box_reset(int arm) +{ + int i, j; + struct _mcu_rec * mr; + + for(i = 0; i < ARRAY_SIZE(mcu_src); i++) { + mr = mcu_src + i; + +#ifdef CONFIG_MK1OM + if (mr->org == MC_ORG_TBOX && !mr_txs()) + continue; +#endif + + for(j = 0; j < mr->num; j++) { + uint64_t status; + + /* + *TBD: Do we want to pick up existing MCA events or drop + * them because we don't know _when_ they occurred? + * Reporting them would require internal buffer because + * it's unlikely the SCIF MC session is up at this point. + * For now we just enter events into the system log. + */ + status = mr->rq(j, mr->ofs + MCU_STAT); + if (status & MCI_STATUS_VAL) { + MceInfo mc; + + mcu_read(mr, j, &mc); + printk("RAS.uncore: discard MC event:\n" + "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", + mr->org, j, mc.ctl, status, mc.addr, mc.misc); + } + + /* + * Reset MCA bank registers. + */ + mcu_reset(mr, j, arm); + } + } +} + + +/* + * Setup interrupt handlers by hooking into the SBOX's I/O-APIC. + * For now, we send an NMI to single CPU, and let it process the + * event. This may need to be expanded into a broadcast NMI similar + * to what the generic core MC event handler does in order to keep + * containment at high as we possibly can. + * + *TBD: code a dual rendez-vous mechanism on all active CPUs. + */ + +int __init +mcu_init(void) +{ +#if MC_VERBOSE + int i, j; +#endif + + if (mce_disabled) { + printk("RAS.uncore: disabled\n"); + } + else { + /* + * Clear rendez-vous counters + */ + atomic_set(&mcu_callin, 0); + atomic_set(&mcu_leavin, 0); + +#if MC_VERBOSE + /* + * For debug only: + * Record all SBOX I/O-APIC registers to kernel log + */ + printk("SBOX_APICIDR: %lx\n", mr_sbox_rl(0, SBOX_APICIDR)); + printk("SBOX_APICVER: %lx\n", mr_sbox_rl(0, SBOX_APICVER)); + printk("SBOX_APICAPR: %lx\n", mr_sbox_rl(0, SBOX_APICAPR)); + for(i = 0; i < 26 ; i++) + printk("APICCRT%d: %llx\n", i, mr_sbox_rq(0, SBOX_APICRT0 + (8 * i))); + for(i = 0; i < 8 ; i++) + printk("APICICR%d: %llx\n", i, mr_sbox_rq(0, SBOX_APICICR0 + (8 * i))); + printk("SBOX_MCA_INT_EN: %lx\n", mr_sbox_rl(0, SBOX_MCA_INT_EN)); + printk("SBOX_MCA_INT_STAT: %lx\n", mr_sbox_rl(0, SBOX_MCA_INT_STAT)); +#endif + + /* + * Disconnect un-core MC lines from SBOX I/O-APIC, setup the + * individual BOXes, and clear any un-core MC pending flags + * from SBOX I/O-APIC + */ + mr_sbox_wl(0, SBOX_MCA_INT_EN, 0); + box_reset(1); + mr_sbox_wl(0, SBOX_MCA_INT_STAT, 0); + + /* + * Setup the SBOX I/O-APIC. + * Un-core MC events are routed through a mask in register + * SBOX_MCA_INT_EN into I/O APIC redirection table entry #16. + * Ideally we want all uncore MC events to be handled similar + * to core MCAs, which means we'd like an NMI on all CPUs. + * On KnF the I/O-APIC may not trigger an NMI (PoC security) + * and on KnC where NMI delivery is possible, it appears not + * to be ideal to broadcast it to all CPUs because it could + * wake up cores put to sleep bu power management rules. + * See MCA HAS, SBOX HAS Vol 4, and A0 Vol 2 for details. 
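+ *
+ * (Editor's worked example, assuming the BP's APIC ID is 0: the NMI
+ * entry programmed below, PUT_BITS(10, 8, 4) | PUT_BITS(47, 32, 0),
+ * evaluates to 0x400 - delivery mode 100b (NMI), physical destination 0.)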
+ * + * The redirection table entry has the following format: + * 47:32 Destination ID field + * 17 Interrrupt set (testing: trigger an interrupt) + * 16 Interrupt mask (0=enable, 1=disable) + * 15 Trigger mode (0=edge, 1=level) + * 14 Remote IRR (0=inactive, 1=accepted) + * 13 Interrupt polarity (0=active_high, 1=active_low) + * 12 Delivery status (0=idle, 1=send_pending) + * 11 Destination mode (0=physical, 1=logical) + * 10:8 Delivery mode (0=fixed, low, SMI, rsvd, NMI, INIT, rsvd, ext) + * 7:0 Interrupt vector + * + * The I/O-APIC input is 'rising edge', so we'd need to select + * it to be edge triggered, active high. + */ +#if MCU_NMI + /* + * If event delivery by NMI is preferred, we want it delivered on + * the BP. There is already an NMI handler present, so we have to + * tap into the existing NMI handler for the event notifications. + * + * The bit-fiddling below says: + * NMI delivery | Destination CPU APIC ID + */ + mcu_cpu = 0; + mcu_redir = PUT_BITS(10, 8, 4) | PUT_BITS(47, 32, (uint64_t) cpu_data(mcu_cpu).apicid); + mcu_old_redir = mr_sbox_rq(0, SBOX_APICRT16); + mr_sbox_wq(0, SBOX_APICRT16, mcu_redir | PUT_BITS(16, 16, 1)); + mr_sbox_wq(0, SBOX_APICRT16, mcu_redir); +#else + /* + * If event delivery by regular interrupt is preferred, then all + * I/O-APIC setup will be handled by calling request_irq(16,..). + * There is no guarantee that the event will be sent to the BP + * (though it's more than likely) so we'll defer indentifying the + * event handling CPU (mcu_cpu) till we receive the callback from + * the interrupt handling sus-system. + * The sbox_handler() function just converts the callback into an + * NMI because the only way containment can be achieved is to be + * able to lock down the system completely, which is not realistic + * using regular interrupts. + */ + mcu_eoi = 0; + (void) request_irq(16, sbox_handler, IRQF_TRIGGER_HIGH, "un-core mce", (void *) 42); +#endif + + /* + * Finally, place hook in NMI handler in case there's + * an un-core event pending and connect un-core MC lines + * through to SBOX I/O-APIC. From this point onwards we + * can get uncore MC events at any time. + */ + mca_nmi = mcu_nmi; + mr_sbox_wl(0, SBOX_MCA_INT_EN, mr_txs() ? 0x0fffff07 : 0xff07); + +#if MC_VERBOSE + /* + * For debug only + * Record initial uncore MCA banks to kernel log. 
+ */ + printk("RAS.uncore: dumping all banks\n"); + + /* + * Dump all MCA registers we set to kernel log + */ + for(i = 0; i < ARRAY_SIZE(mcu_src); i++) { + char * boxname; + struct _mcu_rec * mr; + uint64_t ctl, stat, addr, misc; + + mr = mcu_src + i; +#ifdef CONFIG_MK1OM + if (mr->org == MC_ORG_TBOX && !mr_txs()) + continue; +#endif + switch(mr->org) { + case MC_ORG_SBOX: boxname = "SBOX"; break; + case MC_ORG_DBOX: boxname = "DBOX"; break; + case MC_ORG_GBOX: boxname = "GBOX"; break; + case MC_ORG_TBOX: boxname = "TBOX"; break; + default: boxname = "??"; /* Damn compiler */ + } + + for(j = 0; j < mr->num; j++) { + + if (mr->qflg & MCU_CTL_64) + ctl = mr->rq(j, mr->ofs + MCU_CTRL); + else + ctl = (uint64_t) mr->rl(j, mr->ofs + MCU_CTRL); + + stat = mr->rq(j, mr->ofs + MCU_STAT); + + if (mr->qflg & MCU_NO_ADDR) + addr = 0; + else { + if (mr->qflg & MCU_ADDR_32) + addr = (uint64_t) mr->rl(j, mr->ofs + MCU_ADDR); + else + addr = mr->rq(j, mr->ofs + MCU_ADDR); + } + + if (mr->qflg & MCU_NO_MISC) + misc = 0; + else { + if (mr->qflg & MCU_MISC_64) + misc = mr->rq(j, mr->ofs + MCU_MISC); + else + misc = (uint64_t) mr->rl(j, mr->ofs + MCU_MISC); + } + + printk("RAS.uncore: %s[%d] = { %llx, %llx, %llx, %llx }\n", + boxname, j, ctl, stat, addr, misc); + } + } + printk("RAS.uncore: MCA_INT_EN = %x\n", mr_sbox_rl(0, SBOX_MCA_INT_EN)); + printk("RAS.uncore: APICRT16 = %llx\n", mr_sbox_rq(0, SBOX_APICRT16)); +#endif + + printk("RAS.uncore: init complete\n"); + } + + return 0; +} + + +/* + * Cleanup for module unload. + * Clear/restore hooks in the SBOX's I/O-APIC. + */ + +int __exit +mcu_exit(void) +{ + if (! mce_disabled) { + + /* + * Disconnect uncore MC lines from SBOX I/O-APIC. + * No new uncore MC interrupts will be made. + */ + mr_sbox_wl(0, SBOX_MCA_INT_EN, 0); + + /* + * Disconnect exception handler. + */ +#if MCU_NMI + mcu_redir = 0; + mr_sbox_wq(0, SBOX_APICRT16, mcu_old_redir); +#else + mcu_eoi = 0; + free_irq(16, (void *) 42); +#endif + + /* + * Cut link from kernel's NMI handler and + * wait for everybody in handler to leave. + */ + mca_nmi = 0; + while(atomic_read(&mcu_entry)) + cpu_relax(); + mcu_cpu = -1; + + /* + * No more events will be received, clear + * MC reporting in all BOXes (just in case) + */ + box_reset(0); + } + + printk("RAS.uncore: exit complete\n"); + return 0; +} + diff --git a/ras/monahan.h b/ras/monahan.h new file mode 100644 index 0000000..4f3fd1f --- /dev/null +++ b/ras/monahan.h @@ -0,0 +1,201 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. 
Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * The Monahan GX processor implementation of the I2C unit does not support + * the hardware general call, 10-bit slave addressing or CBUS compatibility. + * Otherwise it is compliant with I2C spec version 2.1. + * + * This is the SBOX 'OverClock' bus controller, which for reference is + * mostly like the I2C controller on PXA270 with the above limitations. + */ + +#ifndef _MONAHAN_H_ +#define _MONAHAN_H_ 1 + +/* +** +** Layer 1 stuff +** +** Offsets and bit definitions for the Monahans I2C controller. +** This is equivalent to defines in 'i2c-pxa.c', but kept separate. +*/ + +/* + * Register locations (base SBOX register SBOX_OC_I2C_ICR) + */ +#define ICR_OFFSET 0x00 +#define ISR_OFFSET 0x04 +#define ISAR_OFFSET 0x08 +#define IDBR_OFFSET 0x0c +#define IBMR_OFFSET 0x10 + +/* + * I2C Control Register bits + */ +#define ICR_START 0x00000001 /* Start bit */ +#define ICR_STOP 0x00000002 /* Stop bit */ +#define ICR_ACKNAK 0x00000004 /* Send ACK(0) or NAK(1) */ +#define ICR_TB 0x00000008 /* Transfer byte bit */ +#define ICR_MA 0x00000010 /* Master abort */ +#define ICR_SCLE 0x00000020 /* Master clock enable */ +#define ICR_IUE 0x00000040 /* Unit enable */ +#define ICR_GCD 0x00000080 /* General call disable */ +#define ICR_ITEIE 0x00000100 /* Enable tx interrupts */ +#define ICR_DRFIE 0x00000200 /* Enable rx interrupts */ +#define ICR_BEIE 0x00000400 /* Enable bus error ints */ +#define ICR_SSDIE 0x00000800 /* Slave STOP detected int enable */ +#define ICR_ALDIE 0x00001000 /* Enable arbitration interrupt */ +#define ICR_SADIE 0x00002000 /* Slave address detected int enable */ +#define ICR_UR 0x00004000 /* Unit reset */ +#define ICR_MODE 0x00018000 /* Bus speed mode */ +#define ICR_RESERVED 0xfffe0000 /* Unused */ + +/* + * Bus speed control values + * High speed modes are not supported by controller. 
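+ *
+ * (Editor's note, added for clarity: the four values defined below
+ * populate the two-bit ICR_MODE field at bits 16:15; only the first
+ * two - standard and fast mode - are usable on this controller.)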
+ */ +#define ICR_STANDARD_MODE 0x00000000 /* 100k operation */ +#define ICR_FAST_MODE 0x00008000 /* 400k operation */ +#define ICR_HS_STANDARD_MODE 0x00010000 /* 3.4M/100k operation */ +#define ICR_HS_FAST_MODE 0x00018000 /* 3.4M/400k operation */ + +/* + * Shorthands + */ +#define ICR_ON (ICR_IUE | ICR_SCLE) /* Turn unit on */ +#define ICR_INIT_BITS (ICR_ITEIE | \ + ICR_DRFIE | \ + ICR_BEIE | \ + ICR_SADIE | \ + ICR_FAST_MODE | \ + ICR_ON) /* Init flags */ + +/* + * I2C Status Register bits + */ +#define ISR_RWM 0x00000001 /* Read(1)/write(0) mode */ +#define ISR_ACKNAK 0x00000002 /* Ack(0)/nak(1) sent or received */ +#define ISR_UB 0x00000004 /* Unit busy */ +#define ISR_IBB 0x00000008 /* Bus busy */ +#define ISR_SSD 0x00000010 /* Slave stop detected */ +#define ISR_ALD 0x00000020 /* Arbitration loss detected */ +#define ISR_ITE 0x00000040 /* Tx buffer empty */ +#define ISR_IRF 0x00000080 /* Rx buffer full */ +#define ISR_GCAD 0x00000100 /* General call address detected */ +#define ISR_SAD 0x00000200 /* Slave address detected */ +#define ISR_BED 0x00000400 /* Bus error no ACK/NAK */ +#define ISR_RESERVED 0xfffff800 /* Unused */ + +#define ISR_INTS (ISR_SSD | \ + ISR_ALD | \ + ISR_ITE | \ + ISR_IRF | \ + ISR_SAD | \ + ISR_BED) /* Interrupt flags */ +/* + * I2C Slave Address Register bits + */ +#define ISAR_SLADDR 0x0000007f /* 7-bit address for slave-receive mode */ +#define ISAR_RESERVED 0xffffff80 /* Unused */ + +/* + * I2C Data Buffer Register bits + */ +#define IDBR_DATA 0x000000ff /* 8-bit data buffer */ +#define IDBR_RESERVED 0xffffff00 /* Unused */ + +/* + * I2C Bus Monitor Register bits + */ +#define IBMR_SDA 0x00000001 /* State of SDA pin */ +#define IBMR_SCL 0x00000002 /* State of SCL pin */ +#define IBMR_RESERVED 0xfffffffc /* Unused */ + + +/* +** +** Layer 2 stuff +** +*/ + +/* + * Bus speed selections + */ +#define I2C_STANDARD ICR_STANDARD_MODE +#define I2C_FAST ICR_FAST_MODE +#define I2C_HS_STANDARD ICR_HS_STANDARD_MODE +#define I2C_HS_FAST ICR_HS_FAST_MODE + +/* + * Command types + */ +#define I2C_INVALID -1 /* Internal, not to be used */ +#define I2C_WRITE 0 /* Next transfer will be outgoing */ +#define I2C_READ 1 /* Next transfer will be incoming */ +#define I2C_NOP 2 /* Idle state */ + +/* + * Return codes + */ +#define XFER_SUCCESS 0 /* All OK */ +#define INCOMPLETE_XFER -1 /* Basic timeout */ +#define TX_CONTROLLER_ERROR -2 /* Requires reset */ +#define TX_NAK -3 /* NAK, master to send a stop */ +#define RX_SEVERE_ERROR -4 /* Requires reset */ +#define RX_END_WITHOUT_STOP -5 /* Deprecated */ +#define RX_BIZARRE_ERROR -6 /* Doesn't require reset */ + + +/* +** +** Layer 3 stuff +** +*/ + +/* + * Frequency selections + */ +#define FREQ_MAX -3 /* As fast as possible */ +#define FREQ_400K -2 /* 400 kHz */ +#define FREQ_100K -1 /* 100 kHz */ +#define FREQ_AUTO 0 /* Default speed */ + +/* + * Return codes: standard kernel codes used + * EBUSY, ENODEV, ENXIO, EINVAL, EIO + */ + +#endif /* Recursion block */ diff --git a/trace_capture/Kbuild b/trace_capture/Kbuild new file mode 100644 index 0000000..bc12e70 --- /dev/null +++ b/trace_capture/Kbuild @@ -0,0 +1 @@ +obj-m := trace_capture.o diff --git a/trace_capture/Makefile b/trace_capture/Makefile new file mode 100644 index 0000000..199953a --- /dev/null +++ b/trace_capture/Makefile @@ -0,0 +1,34 @@ +# +# Trace Capture module +# + +export ARCH = l1om + +KERNELDIR = $(CURDIR)/../../mic_linux +KBUILD := $(MAKE) -C $(KERNELDIR) ARCH=$(ARCH) M=$(CURDIR) + +ifneq ($(DESTDIR),) +INSTALL_MOD_PATH = $(DESTDIR) +endif + +ifeq 
($(shell \which x86_64-$(ARCH)-linux-gcc 2>/dev/null),) +export PATH := $(PATH):$(CURDIR)/../cross/bin +endif + +.PHONY: default modules install modules_install clean + +default: modules tests + +modules: + +$(KBUILD) $@ + +install: modules_install + +modules_install: + +$(KBUILD) INSTALL_MOD_PATH=$(DESTDIR) modules_install + +clean: + +$(KBUILD) clean + +tests: + echo no tests diff --git a/trace_capture/docapture.c b/trace_capture/docapture.c new file mode 100644 index 0000000..587bebe --- /dev/null +++ b/trace_capture/docapture.c @@ -0,0 +1,70 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include /* open */ +#include /* exit */ +#include /* ioctl */ + +#include "trace_capture.h" + +void +ioctl_start_capture(int file_desc, long trigger) +{ + ioctl(file_desc, MICTC_START_CAPTURE, trigger); +} + +int +main (int argc, char *argv[]) +{ + int file_desc; + long trigger = 1; + + if ((file_desc = open(MICTC_FILE_NAME, 0)) < 0) { + printf("Can't open device file: %s\n", MICTC_FILE_NAME); + exit(-1); + } + + if (argc == 2) { + trigger = atoi(argv[1]); + printf("Trigger %ld\n", trigger); + } + + ioctl_start_capture(file_desc, trigger); + printf("Done.\n"); + + close(file_desc); +} diff --git a/trace_capture/tc_host.c b/trace_capture/tc_host.c new file mode 100644 index 0000000..5eecf7c --- /dev/null +++ b/trace_capture/tc_host.c @@ -0,0 +1,366 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include +#include +#include "../include/scif.h" +#include "trace_capture.h" + +#define BARRIER(epd, string) { \ + printf("%s\n", string); \ + if ((err = scif_send(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \ + printf("scif_send failed with err %d\n", errno); \ + fflush(stdout); \ + goto close; \ + } \ + if ((err = scif_recv(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \ + printf("scif_recv failed with err %d\n", errno); \ + fflush(stdout); \ + goto close; \ + } \ +} + +#if 0 +// These are common to the Host App +// and the MIC driver Trace Capture Feature +// COMMON DEFINES START HERE +enum TRACE_COMMAND { + TRACE_NOP = 100, + TRACE_DATA, + TRACE_HOST_READY, + TRACE_DONE, + TRACE_ERROR, + TRACE_PRINT, + TRACE_GET_FILE, + TRACE_PAGE_READY, + TRACE_REG_COMPLETE, + TRACE_MEM_COMPLETE, + TRACE_COMPLETE +}; + +#define TRACE_STATUS_OFFSET 8 +#define TRACE_SIZE_OFFSET 12 + +// Enable/Disable Memory Test. +// This MUST be enabled simultaneously on Host App as well. +#define MIC_TRACE_CAPTURE_MEMORY_TEST 0 + +#if MIC_TRACE_CAPTURE_MEMORY_TEST +#define TRACE_CHECKSUM_OFFSET 16 +#endif + +#define TRACE_TRIGGER_OFFSET 20 +#define TRACE_DATA_OFFSET 4096 + +// Types of Triggers - Refer to uOS Trace Capture Wiki for Usage +// Generic counter +#define TRACE_HOST_GENERIC_COUNTER 0x1 +// Async Flip counter +#define TRACE_HOST_FRAME_COUNTER 0x2 +// COMMON DEFINES END HERE +#endif + +// End points for SCIF +//static scif_epd_t mictc_epd_cmd; +static scif_epd_t mictc_epd_data; + +// SCIF ports - temp hack; move to scif.h +#define MICTC_SCIF_PORT_DATA 300 + +static volatile uint64_t *g_traceBufferStatusOffset = NULL; +static volatile uint64_t *g_traceBufferSizeOffset = NULL; +static volatile uint32_t *g_traceBufferDataOffset = NULL; +static volatile uint32_t *g_traceBufferTriggerOffset = NULL; + +// This is an array of trigger numbers. The value TRACE_EOL is ignored. 
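+// Filled in from the command line by main(); unused slots stay TRACE_EOL.
+// open_scif_channels() copies every slot into the shared XML buffer at
+// TRACE_TRIGGER_OFFSET so the card-side driver can decide whether the
+// trigger that fired should actually produce a capture; a trigger we did
+// not ask for comes back as TRACE_ABORTED and the host simply re-arms.
+// For illustration: running this tool with arguments "1 3" captures only
+// triggers 1 and 3, while running it with no arguments accepts everything.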
+static uint32_t g_traceTriggers[TRACE_TRIGGER_MAX]; + +static struct scif_portID portID_data; +static scif_epd_t mictc_newepd; + +static void *g_mictc_buffer_base; +static void *g_mictc_buffer_offset_xml; +static off_t g_mictc_buffer_offset_mem; + +FILE *fp; + +static +int open_scif_channels(void) +{ + int err; + struct pollfd spollfd; + int control_msg = 0; + long scif_offset_dst; + int timeout = 0; + int page_count = 0; + int i; + + if ((err = posix_memalign(&g_mictc_buffer_base, 0x1000, MICTC_MEM_BUFFER_SIZE))) { + fprintf(stderr, "posix_memalign failed failed with %d\n", err); + return 0; + } + // Data channel + if ((mictc_epd_data = scif_open()) == SCIF_OPEN_FAILED) { + fprintf(stderr, "scif_open failed with ENOMEM\n", errno); + return 0; + } + + if (scif_bind(mictc_epd_data, MICTC_SCIF_PORT_DATA) == -1) { + fprintf(stderr, "scif_bind failed with error %d\n", errno); + return 0; + } + + portID_data.node = 1; + portID_data.port = MICTC_SCIF_PORT_DATA; + + if (scif_listen(mictc_epd_data, 1) == -1) { + fprintf(stderr, "scif_listen failed with error %d\n", errno); + return 0; + } + + while (1) { + printf("scif_accept in poll mode until a connect request is found\n"); + err = 1; + while (err) { + spollfd.fd = scif_get_fd(mictc_epd_data); + spollfd.events = POLLIN; + spollfd.revents = 0; + if ((err = poll(&spollfd, 1, -1)) < 0) { + printf("poll failed with err %d\n", errno); + } + if (((err = scif_accept(mictc_epd_data, &portID_data, &mictc_newepd, 0)) < 0) && (errno != EAGAIN)) { + printf("scif_accept failed with err %d\n", errno); + return 0; + } + } + + printf("scif_accept from port %d complete\n", portID_data.port); + + if ((g_mictc_buffer_offset_mem = scif_register(mictc_newepd, g_mictc_buffer_base, MICTC_MEM_BUFFER_SIZE, 0, // suggested_offset, + SCIF_PROT_READ | SCIF_PROT_WRITE, 0)) < 0) { + fprintf(stderr, "scif_register failed with err %d\n", errno); + return 0; + } + + printf("After scif_register, g_mictc_buffer_offset_mem = %llx\n", + (unsigned long long)g_mictc_buffer_offset_mem); + fflush(stdout); + + // printf("Before scif_send\n"); + // fflush(stdout); + + BARRIER(mictc_newepd, "before barrier"); + + if ((err = + scif_send(mictc_newepd, &g_mictc_buffer_offset_mem, sizeof(g_mictc_buffer_offset_mem), + SCIF_SEND_BLOCK)) <= 0) { + printf("scif_send failed with err %d\n", errno); + fflush(stdout); + goto close; + } + // BARRIER(mictc_newepd, "scif_send"); + + // printf("scif_offset = %lx\n", scif_offset); + // fflush(stdout); + + printf("Before scif_recv\n"); + fflush(stdout); + + if ((err = scif_recv(mictc_newepd, &scif_offset_dst, sizeof(scif_offset_dst), SCIF_RECV_BLOCK)) <= 0) { + printf("scif_recv failed with err %d\n", errno); + fflush(stdout); + goto close; + } + printf("scif_offset_dst = %lx\n", scif_offset_dst); + + printf("Before scif_mmap\n"); + + if ((g_mictc_buffer_offset_xml = scif_mmap(0, // physical address + MICTC_XML_BUFFER_SIZE, // length + SCIF_PROT_READ | SCIF_PROT_WRITE, // protection + 0, // flags + mictc_newepd, // endpoint + scif_offset_dst) // offset + ) == (void *)-1) { + fprintf(stderr, "scif_mmap failed with err %d\n", errno); + return 0; + } + + g_traceBufferStatusOffset = (uint64_t *) (g_mictc_buffer_offset_xml + TRACE_STATUS_OFFSET); + g_traceBufferSizeOffset = (uint64_t *) (g_mictc_buffer_offset_xml + TRACE_SIZE_OFFSET); + g_traceBufferDataOffset = (uint32_t *) (g_mictc_buffer_offset_xml + TRACE_DATA_OFFSET); + g_traceBufferTriggerOffset = (uint32_t *) (g_mictc_buffer_offset_xml + TRACE_TRIGGER_OFFSET); + + for (i = 0; i < TRACE_TRIGGER_MAX; i++) 
{ + *g_traceBufferTriggerOffset = g_traceTriggers[i]; + g_traceBufferTriggerOffset++; + } + + *g_traceBufferStatusOffset = TRACE_HOST_READY; + + printf("Before fopen\n"); + + if ((fp = fopen("cpu.xml", "w")) == NULL) { + fprintf(stderr, "Cannot open file cpu.xml.\n"); + } + + printf("Waiting for TRACE_REG_COMPLETE or TRACE_ABORTED"); + fflush(stdout); + + while (*g_traceBufferStatusOffset != TRACE_REG_COMPLETE) { + printf("."); + fflush(stdout); + sleep(1); + if (timeout++ >= 200) { + // Hmmm, something is hung up. Save everything in the buffer ignoring length. + printf("Punt!\n"); + fprintf(fp, "%s\n", (char *)g_traceBufferDataOffset); + *g_traceBufferStatusOffset = TRACE_GET_FILE; + fclose(fp); + sleep(5); + goto close; // and quit + } + // If this happens the current trigger was not one we want -- reset and wait. + if (*g_traceBufferStatusOffset == TRACE_ABORTED) { + printf("\nAborted trace\n"); + fflush(stdout); + goto close2; + } + } + printf("\n"); + + { + int j; + + asm volatile ("lfence" ::: "memory"); + j = *g_traceBufferSizeOffset; + fprintf(fp, "%*s\n", j, (char *)g_traceBufferDataOffset); + } + *g_traceBufferStatusOffset = TRACE_GET_FILE; + fclose(fp); + sleep(5); + + // Memory dump + + if ((fp = fopen("mem.dat", "w")) == NULL) { + fprintf(stderr, "Cannot open file mem.dat.\n"); + } + + printf("Waiting for memory pages\n"); + fflush(stdout); + + timeout = 0; + + { + long i = 0; + + while (*g_traceBufferStatusOffset != TRACE_MEM_COMPLETE) { + //printf("status %d\n", *g_traceBufferStatusOffset); + + if (*g_traceBufferStatusOffset == TRACE_PAGE_READY) { + printf(" %ld", i++); + fflush(stdout); + asm volatile ("lfence" ::: "memory"); + + if (fwrite(g_mictc_buffer_base, *g_traceBufferSizeOffset, 1, fp) != 1) { + fprintf(stderr, "\nCannot write file mem.dat. error = %d\n", ferror(fp)); + return 0; + } + *g_traceBufferStatusOffset = TRACE_HOST_READY; // Get next page + timeout = 0; + } else { + // printf("."); + // fflush(stdout); + usleep(10000); + + if (timeout++ >= 2000) { + // Hmmm, something is hung up. Just close and quit. + printf("Punt!\n"); + fclose(fp); + sleep(5); + goto close; // and quit + } + } + } + } + close1: + printf("\nClosing memory dump file.\n"); + fflush(stdout); + fclose(fp); + *g_traceBufferStatusOffset = TRACE_COMPLETE; // File is closed; tell driver we are done. + printf("Done.\n"); + fflush(stdout); + close2: + sleep(2); + scif_munmap(g_mictc_buffer_offset_xml, MICTC_XML_BUFFER_SIZE); + scif_unregister(mictc_newepd, (off_t) g_mictc_buffer_base, MICTC_MEM_BUFFER_SIZE); + scif_close(mictc_newepd); + } // while (1) + close: + scif_munmap(g_mictc_buffer_offset_xml, MICTC_XML_BUFFER_SIZE); + scif_close(mictc_newepd); + scif_close(mictc_epd_data); + free(g_mictc_buffer_base); + return 1; +} + +int main(int argc, char *argv[]) +{ + int i; + + for (i = 0; i < TRACE_TRIGGER_MAX; i++) { + g_traceTriggers[i] = TRACE_EOL; + } + + if (argc >= 2) { + for (i = 1; i < argc; i++) { + if (i > TRACE_TRIGGER_MAX) break; + + g_traceTriggers[i - 1] = atoi(argv[i]); + printf("Trigger %d\n", g_traceTriggers[i - 1]); + } + } else { + printf("No triggers -- accept everything\n"); + } + + if (!open_scif_channels()) + exit(1); + + exit(0); +} diff --git a/trace_capture/tc_memcvt.c b/trace_capture/tc_memcvt.c new file mode 100644 index 0000000..38efce1 --- /dev/null +++ b/trace_capture/tc_memcvt.c @@ -0,0 +1,85 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include +#include +#include "../include/scif.h" + +// Use 2MB for KNF and 4MB for KNC. +#define MICTC_XML_BUFFER_SIZE (2 * 1024 * 1024) + +// Memory transfer window. 1GB +#define MICTC_MEM_BUFFER_SIZE (1 * 1024UL * 1024UL * 1024UL) + +FILE *ip; +FILE *op; + + +int main(void) +{ + long srcPhysAddr = 0; + uint32_t page_buf[4096/4]; + long i = 0; + int size; + char dest[64]; + + if ((ip = fopen("mem.dat", "r")) == NULL) { + fprintf(stderr, "Cannot open file mem.dat.\n"); + } + + if ((op = fopen("memfmt.txt", "w")) == NULL) { + fprintf(stderr, "Cannot open file memfmt.txt.\n"); + } + + while (! feof(ip)) { + fread(page_buf, sizeof(page_buf), 1, ip); // check for error + + size = sprintf(dest, "origin %lx\n", srcPhysAddr); + fwrite(dest, size, 1, op); + + for (i = 0; i < 4096/4; i++) { + size = sprintf(dest, "%x\n", page_buf[i]); + fwrite(dest, size, 1, op); + } + + srcPhysAddr += 4096; + } + fclose(ip); + fclose(op); +} diff --git a/trace_capture/trace_capture.c b/trace_capture/trace_capture.c new file mode 100644 index 0000000..5cf1bfe --- /dev/null +++ b/trace_capture/trace_capture.c @@ -0,0 +1,2031 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Trace Capture Driver + * + * Contains code to handle trace_capture syscall, stop all cpus + * and dump their state, then dump all physical memeory. + */ + +#include "trace_capture.h" + +//#define DEBUG + +int always_false = 0; + +#define BARRIER(epd, string) { \ + printk("%s\n", string); \ + if ((err = scif_send(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \ + pr_crit("%s:%s:%d scif_send failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err); \ + goto close; \ + } \ + if ((err = scif_recv(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \ + pr_crit("%s:%s:%d scif_recv failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err); \ + goto close; \ + } \ +} + +/* SPU privileged gates (per specification) */ +#define SPU_SPBA_OFFSET 0x1000 /* offset of Privileged gates in SPU MMIO */ +#define SPU_XQ_SIZE 0x040 +#define SPU_XQ_BASE 0x080 +#define SPU_XQ_INDEX 0x0C0 +#define SPU_CR 0x100 +#define SPU_CONTROL 0x100 +#define SPU_SAMPLER_BASE 0x140 +#define SPU_ABORT 0x180 +#define SPU_ABORT_STATUS 0x1C0 +#define SPU_FLUSH 0x200 +#define SPU_FLUSH_STATUS 0x240 +#define SPU_INVALPG_4K 0x280 +#define SPU_INVALPG_64K 0x2C0 +#define SPU_INVALPG_2M 0x300 +#define SPU_EMPTY 0x340 +#define SPU_ACTIVE 0x340 +#define SPU_FULL 0x380 +#define SPU_SOFT_RESET 0x3C0 +#define SPU_PMU_EVENT_SEL 0x400 +#define SPU_CONTROL2 0x440 +#define SPU_CONTROL3 0x480 + +#define SPU_MEM_BW_LIMIT 0x4C0 // This is 64 bit register + +#define SPU_TCU_CREDITS 0x700 +#define SPU_FER 0x800 +#define SPU_ALT_FER 0x840 +#define SPU_MATCH_ACTION 0x880 +#define SPU_INVAL 0xB00 +#define SPU_COUNTER0_SET 0x500 +#define SPU_COUNTER1_SET 0x540 +#define SPU_COUNTER2_SET 0x580 +#define SPU_COUNTER3_SET 0x5C0 +#define SPU_COUNTER4_SET 0x600 +#define SPU_COUNTER5_SET 0x640 +#define SPU_COUNTER6_SET 0x680 +#define SPU_COUNTER7_SET 0x6C0 + +#define CBOX_SPU_PA_MSR 0x0000017E +#define CBOX_SPU_SAMPLER_BIND_MSR 0x0000017F + +#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */ +#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ +#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ +#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ + +// MSR's defined in the trace file sent during 
REQs +// Are these all valid for L1OM?? +#define P6_CR_TSC 0x10 +#define X86_CR_APICBASE 0x1b +#define MIC_CR_SPUBASE 0x1c +#define IA32_CR_MISC 0x1a0 +#define WMT_CR_LASTBRANCH_0 0x1db +#define WMT_CR_LASTBRANCH_1 0x1dc +#define X86_CR_MTRRphysMask0 0x201 +#define X86_CR_MTRRphysMask1 0x203 +#define X86_CR_MTRRphysMask2 0x205 +#define X86_CR_MTRRphysMask3 0x207 +#define X86_CR_MTRRphysMask4 0x209 +#define X86_CR_MTRRphysMask5 0x20b +#define X86_CR_MTRRphysMask6 0x20d +#define X86_CR_MTRRphysMask7 0x20f +#define IA32_CR_PAT 0x277 +#define IA32_MTRR_DEF_TYPE 0x2ff +#define VMX_MSR_BASE 0x480 +#define VMX_MSR_BASE_PLUS_1 0x481 +#define VMX_MSR_BASE_PLUS_2 0x482 +#define VMX_MSR_BASE_PLUS_3 0x483 +#define VMX_MSR_BASE_PLUS_4 0x484 +#define VMX_MSR_BASE_PLUS_5 0x485 +#define VMX_MSR_BASE_PLUS_6 0x486 +#define VMX_MSR_BASE_PLUS_7 0x487 +#define VMX_MSR_BASE_PLUS_8 0x488 +#define VMX_MSR_BASE_PLUS_9 0x489 +#define TIME 0x4711 +#define PINFO 0x4712 +#define X86_CR_MTRRdefType 0x2ff +#define X86_CR_MTRRcap 0xfe +#define X86_CR_MTRRphysBase0 0x200 +#define X86_CR_MTRRphysBase1 0x202 +#define X86_CR_MTRRphysBase2 0x204 +#define X86_CR_MTRRphysBase3 0x206 +#define X86_CR_MTRRphysBase4 0x208 +#define X86_CR_MTRRphysBase5 0x20a +#define X86_CR_MTRRphysBase6 0x20c +#define X86_CR_MTRRphysBase7 0x20e +#define X86_CR_MTRRfix64K_00000 0x250 +#define X86_CR_MTRRfix16K_80000 0x258 +#define X86_CR_MTRRfix16K_A0000 0x259 +#define X86_CR_MTRRfix4K_C0000 0x268 +#define X86_CR_MTRRfix4K_C8000 0x269 +#define X86_CR_MTRRfix4K_D0000 0x26a +#define X86_CR_MTRRfix4K_D8000 0x26b +#define X86_CR_MTRRfix4K_E0000 0x26c +#define X86_CR_MTRRfix4K_E8000 0x26d +#define X86_CR_MTRRfix4K_F0000 0x26e +#define X86_CR_MTRRfix4K_F8000 0x26f +#define P5_MC_ADDR 0x0 +#define P5_MC_TYPE 0x1 +#define MSR_TR1 0x2 +#define MSR_TR2 0x4 +#define MSR_TR3 0x5 +#define MSR_TR4 0x6 +#define MSR_TR5 0x7 +#define MSR_TR6 0x8 +#define MSR_TR7 0x9 +#define MSR_TR9 0xb +#define MSR_TR10 0xc +#define MSR_TR11 0xd +#define MSR_TR12 0xe +#define IA32_APIC_BASE 0x1b +#define IA32_TIME_STAMP_COUNTER 0x10 +#define IA32_PerfCntr0 0x20 +#define IA32_PerfCntr1 0x21 +#define IA32_PerfCntr2 0x22 +#define IA32_PerfCntr3 0x23 +#define PerfFilteredCntr0 0x24 +#define PerfFilteredCntr1 0x25 +#define PerfFilteredCntr2 0x26 +#define PerfFilteredCntr3 0x27 +#define IA32_PerfEvtSel0 0x28 +#define IA32_PerfEvtSel1 0x29 +#define IA32_PerfEvtSel2 0x2a +#define IA32_PerfEvtSel3 0x2b +#define PerfFilterMask 0x2c +#define IA32_PERF_GLOBAL_STATUS 0x2d +#define IA32_PERF_GLOBAL_OVF_CONTROL 0x2e +#define IA32_PERF_GLOBAL_CTRL 0x2f +#define IA32_MCG_CTL 0x17b +#define IA32_MC0_CTRL 0x400 +#define IA32_MC0_STAT 0x401 +#define IA32_MC0_ADDR 0x402 +#define IA32_MC0_MISC 0x403 +#define IA32_MC1_CTRL 0x404 +#define IA32_MC1_STAT 0x405 +#define IA32_MC1_ADDR 0x406 +#define IA32_MC1_MISC 0x407 +#define STAR 0xc0000081 +#define LSTAR 0xc0000082 +#define SYSCALL_FLAG_MASK 0xc0000084 +#define X86_PAT 0x277 +#define SPU_BASE 0x1C + +// Kernel virtual address to physical page at 0xfee03000 +// This is created by an ioremap outside of interrupt context. 
+static uint8_t *spu_addr; + +struct mictc_seg { + struct desc_struct desc; + char zero[8]; + u16 selector; + uint64_t base; +}; + +struct mictc_tss { + tss_desc desc; + u16 selector; + uint64_t base; +}; + +struct mictc_segment_reg +{ + struct mictc_seg cs; + struct mictc_seg ds; + struct mictc_seg es; + struct mictc_seg ss; + struct mictc_seg fs; + struct mictc_seg gs; + struct mictc_tss ldtr; + struct mictc_tss tr; +}; + +#define MAX_SEG_REG 8 + +static char *SegRegNames[MAX_SEG_REG] = {"CS","DS","ES","SS", "FS","GS","LDTR","TR"}; + +//static struct i387_fxsave_struct fpu; + +struct mictc_trace +{ + struct mictc_segment_reg segment; + struct vpustate_struct vpustate; + struct i387_fxsave_struct fpu; +}; + +struct mictc_trace *trace; + +// fxsave definition copied from fpu.c +//#define mictc_fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define mictc_fxsave(addr) __asm __volatile("fxsave (%0)" : "=a" (addr) : [fx] "a" (addr)) + + +// Spinlock to serialize access in IPI handler +static DEFINE_SPINLOCK(mictc_lock); + +// Used to count the cpus waiting +static atomic_t cpus_stopped = ATOMIC_INIT(0); + +// Used to count the cpus released +static atomic_t cpus_released = ATOMIC_INIT(0); + +// End points for SCIF +//static scif_epd_t mictc_endp_cmd; +static scif_epd_t mictc_endp_data; + +// SCIF ports - temp hack; move to scif.h +#define MICTC_SCIF_PORT_DATA 300 + +// Used to prevent concurent access into the same device . +static int Device_Open = 0; + +#define PS_BUF_SIZE 150 +//static char print_string_buf[PS_BUF_SIZE] = ""; + +#define print_str(fmt, ...) \ +{ \ + snprintf(print_string_buf, PS_BUF_SIZE, fmt, ##__VA_ARGS__); \ + print_string(print_string_buf); \ +} + +//#define printk(fmt, ...) print_str(fmt, ##__VA_ARGS__) +//#undef pr_crit +//#define pr_crit(fmt, ...) print_str(fmt, ##__VA_ARGS__) + +// Interrupts off / on +#define cli __asm (" cli\n") +#define sti __asm (" sti\n") + +// Debug code to display low 16 bits of eflags register. +#define print_eflags \ + {unsigned long kernel_eflags; \ + raw_local_save_flags(kernel_eflags); \ + printk("%s:%d eflags %lx\n", __FUNCTION__, __LINE__, kernel_eflags); \ + } + + +// Find another definition of this in some .h file +static __inline void +mictc_cpuid(u_int ax, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); +} + +static inline +uint32_t get_dr(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! 
*/ + + switch (regno) { + case 0: + asm("mov %%db0, %0" :"=r" (val)); + break; + case 1: + asm("mov %%db1, %0" :"=r" (val)); + break; + case 2: + asm("mov %%db2, %0" :"=r" (val)); + break; + case 3: + asm("mov %%db3, %0" :"=r" (val)); + break; + case 4: + asm("mov %%db4, %0" :"=r" (val)); + break; + case 5: + asm("mov %%db5, %0" :"=r" (val)); + break; + case 6: + asm("mov %%db6, %0" :"=r" (val)); + break; + case 7: + asm("mov %%db7, %0" :"=r" (val)); + break; + default: + BUG(); + } + return val; +} + + +static inline void mictc_store_ldt(u16 *dtr) +{ + asm volatile("sldt %0":"=m" (*dtr)); +} + + +static inline void mictc_store_tr(u16 *dtr) +{ + asm volatile("str %0":"=m" (*dtr)); +} + + +static inline void read_gdt_entry(struct desc_struct *gdt, int entry, + void *desc, int type) +{ + unsigned int size; + switch (type) { + case DESC_TSS: + size = sizeof(tss_desc); + break; + case DESC_LDT: + size = sizeof(ldt_desc); + break; + default: + size = sizeof(struct desc_struct); + break; + } + memcpy(desc, &gdt[entry], size); +#if 0 // Helpful for debug + { u64 *p = (u64 *)&gdt[entry]; + printk("GDT[entry] = %p %llx %llx\n", &gdt[entry], p[0], p[1]); + } +#endif +} + + +static inline void __get_tss_desc(unsigned cpu, unsigned int entry, void *dest) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + read_gdt_entry(d, entry, dest, DESC_TSS); +} + +#define get_tss_desc(cpu, addr) __get_tss_desc(cpu, GDT_ENTRY_TSS, addr) + + +static inline void __get_seg_desc(unsigned cpu, unsigned int entry, void *dest) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + + read_gdt_entry(d, entry, dest, 0); +} + +#define get_seg_desc(cpu, seg, addr) __get_seg_desc(cpu, ((seg & 0xffff) >> 3), addr) + +// Redefine rdmsr to work like BSD. + +//#undef rdmsr +//#define rdmsr(msr) tc_msr((msr)) + +static inline +uint64_t tc_rdmsr(uint32_t msrid) +{ + uint32_t lower, upper; + rdmsr(msrid, lower, upper); + return (uint64_t)upper << 32 | lower; +} + +// Number of Retries before it is assumed that the Host will not respond +#define TRACE_CAPTURE_TIMEOUT 50000000 + +static void *g_traceBufferAllocated; + +// Global variable used by initiator to wait for everyone to complete trace captures +//static volatile u32 g_smpTraceCaptureWait; + +// Global variable to keep track of how much data we are writing to the shared buffer +// with the Host. +static volatile u64 g_sizeXferred = 0; + +static s64 g_triggerFound = -1; + +static volatile u64 *g_traceBufferStatusOffset = NULL; +static volatile u64 *g_traceBufferSizeOffset = NULL; +static volatile u32 *g_traceBufferDataOffset = 0; +static volatile u32 *g_traceBufferTriggerOffset = NULL; + +// This is an array of trigger numbers. The value TRACE_EOL is ignored. +static u32 g_traceTriggers[TRACE_TRIGGER_MAX]; +static u32 g_traceCurrentTrigger; + +static long scif_offset_xml; +//static long scif_offset_xml_dst; +static long scif_offset_mem; +static long scif_offset_dst; + +#if MIC_TRACE_CAPTURE_MEMORY_TEST +static volatile u64 *g_traceBufferChecksumOffset = NULL; + +// The maximum size allowed for a DMA transfer is 1MB - 4K. The size of this array +// is 1MB to allow this to be used as the dst memory while dumping entire GDDR +// For Debug purposes only. +static u32 g_dstMemoryDump[4096/sizeof(u32)] __attribute__ ((aligned(4096))); +#endif + +#define TRACE_SPRINTF(...) 
\ + (g_sizeXferred += sprintf(((char*)g_traceBufferDataOffset + g_sizeXferred), __VA_ARGS__)) + +#define ADD_SPU_REG_TO_HEADER(x) \ + TRACE_SPRINTF("\t\t\t\t\n\t\t\t\t\t%s\n\t\t\t\t\n", (x), #x) + +#define ADD_MSR_TO_HEADER(x) \ + TRACE_SPRINTF("\t\t\t\t\n", (x)) + +#define TRACE_SPRINTF_MSR(x) \ + TRACE_SPRINTF("\t\t\t\t0x%llx\n", (x), tc_rdmsr((x))) + +#define TRACE_SPRINTF_SPU(x) \ + TRACE_SPRINTF("\t\t\t\t0x%llx\n", (x), *(volatile u64*)((u8*)spu_addr + (x))) + +#define TRACE_SPRINTF_VECTOR(x, vpu) \ + PrintVector((u8*)&(vpu), (x)) + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_trace_capture_prep_SPU_header +// +// DESCRIPTION: +// Perform all the tasks related to preparing the SPU Trace Header +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_trace_capture_prep_SPU_header(void) +{ + TRACE_SPRINTF("\t\t\t\n"); + ADD_SPU_REG_TO_HEADER(SPU_XQ_SIZE); + ADD_SPU_REG_TO_HEADER(SPU_XQ_BASE); + ADD_SPU_REG_TO_HEADER(SPU_XQ_INDEX); + ADD_SPU_REG_TO_HEADER(SPU_CONTROL); + ADD_SPU_REG_TO_HEADER(SPU_SAMPLER_BASE); + ADD_SPU_REG_TO_HEADER(SPU_PMU_EVENT_SEL); + ADD_SPU_REG_TO_HEADER(SPU_CONTROL2); + ADD_SPU_REG_TO_HEADER(SPU_CONTROL3); + TRACE_SPRINTF("\t\t\t\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_trace_capture_prep_cpuid_header +// +// DESCRIPTION: +// Perform all the tasks related to preparing the CPUID Trace Header +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_trace_capture_prep_cpuid_header(void) +{ + u_int regs[4]; + int i =0; + TRACE_SPRINTF("\t\t\t\n"); + for (i = 0; i < 0x4; i++) + { + mictc_cpuid(i, regs); + TRACE_SPRINTF("\t\t\t\t0x%x-0x%x-0x%x-0x%x\n", + i, regs[0], regs[1], regs[2], regs[3]); + } + TRACE_SPRINTF("\t\t\t\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_trace_capture_prep_msr_header +// +// DESCRIPTION: +// Perform all the tasks related to preparing the MSR Trace Header +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_trace_capture_prep_msr_header(void) +{ + TRACE_SPRINTF("\t\t\t\n"); + ADD_MSR_TO_HEADER(P6_CR_TSC); + ADD_MSR_TO_HEADER(X86_CR_APICBASE); + ADD_MSR_TO_HEADER(CBOX_SPU_PA_MSR); + ADD_MSR_TO_HEADER(SPU_BASE); + ADD_MSR_TO_HEADER(CBOX_SPU_SAMPLER_BIND_MSR); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask0); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask1); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask2); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask3); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask4); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask5); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask6); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask7); + ADD_MSR_TO_HEADER(MSR_EFER); + ADD_MSR_TO_HEADER(MSR_SF_MASK); + ADD_MSR_TO_HEADER(MSR_FSBASE); + ADD_MSR_TO_HEADER(MSR_GSBASE); + ADD_MSR_TO_HEADER(X86_CR_MTRRdefType); + ADD_MSR_TO_HEADER(X86_CR_MTRRcap); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase2); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase0); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase1); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase3); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase4); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase5); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase6); + ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase7); + ADD_MSR_TO_HEADER(STAR); + ADD_MSR_TO_HEADER(LSTAR); + ADD_MSR_TO_HEADER(MSR_KGSBASE); + + // The following MSR's are currently ifdef'd out + // because LarrySim barfs on these. + // We might need these later. 
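+  // Note that the header only names the MSRs the trace will contain; the
+  // values themselves are emitted per CPU by mictc_capture_MSR() via
+  // TRACE_SPRINTF_MSR(), so the two lists should be kept in sync.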
+#if 0 + ADD_MSR_TO_HEADER(X86_CR_MTRRfix64K_00000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix16K_80000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix16K_A0000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_C0000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_C8000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_D0000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_D8000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_E0000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_E8000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_F0000); + ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_F8000); + ADD_MSR_TO_HEADER(P5_MC_ADDR); + ADD_MSR_TO_HEADER(P5_MC_TYPE); + ADD_MSR_TO_HEADER(MSR_TR1); + ADD_MSR_TO_HEADER(MSR_TR2); + ADD_MSR_TO_HEADER(MSR_TR3); + ADD_MSR_TO_HEADER(MSR_TR4); + ADD_MSR_TO_HEADER(MSR_TR5); + ADD_MSR_TO_HEADER(MSR_TR6); + ADD_MSR_TO_HEADER(MSR_TR7); + ADD_MSR_TO_HEADER(MSR_TR9); + ADD_MSR_TO_HEADER(MSR_TR10); + ADD_MSR_TO_HEADER(MSR_TR11); + ADD_MSR_TO_HEADER(MSR_TR12); + ADD_MSR_TO_HEADER(IA32_APIC_BASE); + ADD_MSR_TO_HEADER(IA32_TIME_STAMP_COUNTER); + ADD_MSR_TO_HEADER(IA32_PerfCntr0); + ADD_MSR_TO_HEADER(IA32_PerfCntr1); + ADD_MSR_TO_HEADER(IA32_PerfCntr2); + ADD_MSR_TO_HEADER(IA32_PerfCntr3); + ADD_MSR_TO_HEADER(PerfFilteredCntr0); + ADD_MSR_TO_HEADER(PerfFilteredCntr1); + ADD_MSR_TO_HEADER(PerfFilteredCntr2); + ADD_MSR_TO_HEADER(PerfFilteredCntr3); + ADD_MSR_TO_HEADER(IA32_PerfEvtSel0); + ADD_MSR_TO_HEADER(IA32_PerfEvtSel1); + ADD_MSR_TO_HEADER(IA32_PerfEvtSel2); + ADD_MSR_TO_HEADER(IA32_PerfEvtSel3); + ADD_MSR_TO_HEADER(PerfFilterMask); + ADD_MSR_TO_HEADER(IA32_PERF_GLOBAL_STATUS); + ADD_MSR_TO_HEADER(IA32_PERF_GLOBAL_OVF_CONTROL); + ADD_MSR_TO_HEADER(IA32_PERF_GLOBAL_CTRL); + ADD_MSR_TO_HEADER(IA32_MCG_CTL); + ADD_MSR_TO_HEADER(IA32_MC0_CTRL); + ADD_MSR_TO_HEADER(IA32_MC0_STAT); + ADD_MSR_TO_HEADER(IA32_MC0_ADDR); + ADD_MSR_TO_HEADER(IA32_MC0_MISC); + ADD_MSR_TO_HEADER(IA32_MC1_CTRL); + ADD_MSR_TO_HEADER(IA32_MC1_STAT); + ADD_MSR_TO_HEADER(IA32_MC1_ADDR); + ADD_MSR_TO_HEADER(IA32_MC1_MISC); + ADD_MSR_TO_HEADER(SYSCALL_FLAG_MASK); + ADD_MSR_TO_HEADER(X86_PAT); +#endif + TRACE_SPRINTF("\t\t\t\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_prep_header +// +// DESCRIPTION: +// Perform all the tasks related to preparing the Trace Header +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_prep_header(void) +{ + int i; + + TRACE_SPRINTF("\n"); + TRACE_SPRINTF("\n"); + TRACE_SPRINTF("\n"); + TRACE_SPRINTF("\t
\n"); + TRACE_SPRINTF("\t\t1.0\n"); + TRACE_SPRINTF("\t\tNov 19 2009\n"); + TRACE_SPRINTF("\t\t1.1\n"); + TRACE_SPRINTF("\t\tOct 21 2009\n"); + TRACE_SPRINTF("\t\tarchlib\n"); + TRACE_SPRINTF("\t\tWarnings! This is based on the state available in archlib.\n"); + TRACE_SPRINTF("\t\t This state dump is primarily good for capturing frequently used architectural register state.\n"); + TRACE_SPRINTF("\t\t Support for CPUId, MSRs, APIC, and x87 state is currently incomplete.\n"); + TRACE_SPRINTF("\t\t There is no support for state not specifically modeled in archlib.\n"); + TRACE_SPRINTF("\t\t Have also noticed inconsistencies in the final value of the RFLAGS reg.\n"); + if (g_triggerFound != -1) + { + TRACE_SPRINTF("\t\t This capture is generated for HOST BASED TRIGGER # %lld.\n", g_triggerFound); + g_triggerFound = -1; + } + TRACE_SPRINTF("\t
\n"); + TRACE_SPRINTF("\t\n"); + TRACE_SPRINTF("\t\t%d\n", num_online_cpus()); + TRACE_SPRINTF("\n"); + + for (i = 0; i < num_online_cpus(); i++) + { + TRACE_SPRINTF("\t\t\n", i); +// SPU is not supported in Linux + if (always_false) mictc_trace_capture_prep_SPU_header(); + mictc_trace_capture_prep_cpuid_header(); + mictc_trace_capture_prep_msr_header(); + TRACE_SPRINTF("\t\t\n"); + } + + TRACE_SPRINTF("\t\n"); + TRACE_SPRINTF("\t\n"); + TRACE_SPRINTF("\t\t\n"); + TRACE_SPRINTF("\t\n"); + TRACE_SPRINTF("\t\n"); + TRACE_SPRINTF("\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_general_purpose_reg +// +// DESCRIPTION: +// Capture all general purpose registers. +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_general_purpose_reg(struct pt_regs *regs) +{ + // printk("starting reg dump regs=%llx\n", (uint64_t)regs); + + if (!regs) { + printk("Null pointer found. cpu %d %s\n", smp_processor_id(), current->comm); + return; + } + + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->ax); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->bx); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->cx); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->dx); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->bp); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->sp); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->si); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->di); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r8); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r9); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r10); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r11); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r12); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r13); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r14); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->r15); +// In cases where a CPU is halted and is woken up from halt by the trace capture IPI +// we want to report the RIP as the one pointing to the halt instruction itself +// and not the one on the trap frame. This is to avoid the condition where the simulator-run +// for these halted CPUs ends up running extra cycles (before going back idle) +// which would not happen under actual conditions. Problem reported by Jason S. +//// if(regs->tf_rip == (register_t)ExitIdle) +//// TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->ip-1); +//// else + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->ip); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", regs->flags); + TRACE_SPRINTF("\t\t\t\n"); + + // printk("ending reg dump\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_segment_reg +// +// DESCRIPTION: +// Capture all segment registers. +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_segment_reg(struct mictc_segment_reg *segment, struct pt_regs *regs) +{ + int i, v; + struct desc_ptr gdtr; + struct desc_ptr idtr; + struct mictc_seg *segreg; + +// printk("Segment registers on cpu %d\n", smp_processor_id()); + + // This is only useful during initial development. + if (!regs) { + printk("Null pointer found. 
cpu %d %s\n", smp_processor_id(), current->comm); + return; + } + + segment->cs.selector = (u16)regs->cs; + segment->ss.selector = (u16)regs->ss; +#if 0 + if (ISPL(regs->tf_cs) == SEL_KPL && curthread->td_pcb->pcb_ds == 0x0) { + // Specifically required for kernel IDLE thread + segment->ds = 0x10; + segment->es = 0x10; + segment->fs = 0x10; + segment->gs = 0x10; + } else { +#endif + asm("movl %%ds,%0" : "=r" (v)); segment->ds.selector = v; + asm("movl %%es,%0" : "=r" (v)); segment->es.selector = v; + segment->fs.selector = current->thread.fs; + segment->gs.selector = current->thread.gs; +// } + mictc_store_tr(&(segment->tr.selector)); + get_tss_desc(smp_processor_id(), &(segment->tr.desc)); + store_gdt(&gdtr); + store_idt(&idtr); + mictc_store_ldt(&(segment->ldtr.selector)); + // LDT is not used, so zeros will be printed. + + TRACE_SPRINTF("\t\t\t\n"); + segreg = (struct mictc_seg *)&(segment->cs); + + for(i=0; i < MAX_SEG_REG; i++) { + if (strcmp(SegRegNames[i], "GS") == 0) { + segreg->base = tc_rdmsr(MSR_KGSBASE); + } + if (strcmp(SegRegNames[i], "FS") == 0) { + segreg->base = tc_rdmsr(MSR_FSBASE); + } + + // Fill in the segment descriptor for cs to gs + if (i <= 5) { + get_seg_desc(smp_processor_id(), segreg->selector, &(segreg->desc)); + } + + TRACE_SPRINTF("\t\t\t\t\n",SegRegNames[i]); + if (i > 5) { // LDT and TSS + struct mictc_tss *segreg1 =(struct mictc_tss *)segreg; + + TRACE_SPRINTF("\t\t\t\t\t0x%llx\n", ((uint64_t)segreg1->desc.base3 << 32) | (uint64_t)((segreg1->desc.base2 << 24) | (segreg1->desc.base1 << 16) | segreg1->desc.base0)); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", (segreg1->desc.limit1 << 16) | segreg1->desc.limit0); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg1->selector); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg1->desc.g); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", 0); // double word of base and limit + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", 0); + TRACE_SPRINTF("\t\t\t\t\t0x0\n");//AVL bit not populated in the gdt[] array + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg1->desc.p); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg1->desc.dpl); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg1->desc.type & 0x10 ? 1 : 0); //The S bit (descriptor type) is clubbed along with the ssd_type element. + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", (segreg1->desc.type & 0xf)); + } else { + if (segreg->base) { + TRACE_SPRINTF("\t\t\t\t\t0x%llx\n", segreg->base); + } else { + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", (segreg->desc.base2 << 24) | (segreg->desc.base1 << 16) |segreg->desc.base0); + } + if (segreg->desc.l) segreg->desc.a = 0; + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", (segreg->desc.limit << 16) | segreg->desc.limit0); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->selector); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->desc.g); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->desc.a & 1); // double word of base and limit + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->desc.l); + TRACE_SPRINTF("\t\t\t\t\t0x0\n");//AVL bit not populated in the gdt[] array + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->desc.p); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->desc.dpl); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", segreg->desc.type & 0x10 ? 1 : 0); //The S bit (descriptor type) is clubbed along with the ssd_type element. 
+ TRACE_SPRINTF("\t\t\t\t\t0x%x\n", (segreg->desc.type & 0xf)); + } + TRACE_SPRINTF("\t\t\t\t\n"); + segreg++; + } + + TRACE_SPRINTF("\t\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t\t0x%lx\n", gdtr.address); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", gdtr.size); + TRACE_SPRINTF("\t\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t\t0x%lx\n", idtr.address); + TRACE_SPRINTF("\t\t\t\t\t0x%x\n", idtr.size); + TRACE_SPRINTF("\t\t\t\t\n"); + + TRACE_SPRINTF("\t\t\t\n"); + +// printk("End of mictc_capture_segment_reg\n"); + +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_debug_reg +// +// DESCRIPTION: +// Capture all debug registers. +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_debug_reg(void) +{ + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(0)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(1)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(2)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(3)); +// These don't exist. +// TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(4)); +// TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(5)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(6)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", get_dr(7)); + TRACE_SPRINTF("\t\t\t\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_control_reg +// +// DESCRIPTION: +// Capture all control registers. +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_control_reg(void) +{ + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", (read_cr0()) & 0xffffffff); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", read_cr2()); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", read_cr3()); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", (read_cr4()) & 0xffffffff); + TRACE_SPRINTF("\t\t\t\t0x%lx\n", read_cr8()); + TRACE_SPRINTF("\t\t\t\n"); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_SPU_reg +// +// DESCRIPTION: +// Capture all SPU registers. 
+// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_SPU_reg(void) +{ +#if 0 + // FIXME - The SPU is not setup currently in Linux + + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF_SPU(SPU_XQ_SIZE); + TRACE_SPRINTF_SPU(SPU_XQ_BASE); + TRACE_SPRINTF_SPU(SPU_XQ_INDEX); + TRACE_SPRINTF_SPU(SPU_CONTROL); + TRACE_SPRINTF_SPU(SPU_SAMPLER_BASE); + TRACE_SPRINTF_SPU(SPU_PMU_EVENT_SEL); + TRACE_SPRINTF_SPU(SPU_CONTROL2); + TRACE_SPRINTF_SPU(SPU_CONTROL3); + TRACE_SPRINTF("\t\t\t\n"); +#endif +} + + +//------------------------------------------------------------------------------ +// FUNCTION: PrintVector +// +// DESCRIPTION: +// Prints _m512 vectors +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +PrintVector(u8 *res_mem, int reg_num) +{ + TRACE_SPRINTF("\t\t\t\t0x" + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x" + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x" + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x" + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + reg_num, + res_mem[63], res_mem[62], res_mem[61], res_mem[60], res_mem[59], res_mem[58], res_mem[57], res_mem[56], + res_mem[55], res_mem[54], res_mem[53], res_mem[52], res_mem[51], res_mem[50], res_mem[49], res_mem[48], + res_mem[47], res_mem[46], res_mem[45], res_mem[44], res_mem[43], res_mem[42], res_mem[41], res_mem[40], + res_mem[39], res_mem[38], res_mem[37], res_mem[36], res_mem[35], res_mem[34], res_mem[33], res_mem[32], + res_mem[31], res_mem[30], res_mem[29], res_mem[28], res_mem[27], res_mem[26], res_mem[25], res_mem[24], + res_mem[23], res_mem[22], res_mem[21], res_mem[20], res_mem[19], res_mem[18], res_mem[17], res_mem[16], + res_mem[15], res_mem[14], res_mem[13], res_mem[12], res_mem[11], res_mem[10], res_mem[9], res_mem[8], + res_mem[7], res_mem[6], res_mem[5], res_mem[4], res_mem[3], res_mem[2], res_mem[1], res_mem[0]); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: PrintFPRegister +// +// DESCRIPTION: +// Prints 10 byte FP register contents +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +PrintFPRegister(u8 *res_mem, int reg_num) +{ + TRACE_SPRINTF("\t\t\t\t0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + reg_num, + res_mem[9], + res_mem[8], + res_mem[7], + res_mem[6], + res_mem[5], + res_mem[4], + res_mem[3], + res_mem[2], + res_mem[1], + res_mem[0]); +} + + +// VPU Instructions + +#ifdef CONFIG_ML1OM +#define VSTORED_DISP32_EAX(v, disp32) " vstored %%v" #v "," #disp32 "(%%rax)\n" + +#define VKSTORE_DISP32_EAX(k, disp32) \ + " vkmov %%k" #k ",%%ebx\n" \ + " movw %%bx, " #disp32 "(%%rax)\n" + +#define STVXCSR_DISP32_EAX(disp32) " stvxcsr " #disp32 "(%%rax)\n" + +#else +// For K1OM +#define VSTORED_DISP32_EAX(v, disp32) " vpackstorelps %%zmm" #v "," #disp32 "(%%rax)\n" + +#define VKSTORE_DISP32_EAX(k, disp32) \ + " kmov %%k" #k ",%%ebx\n" \ + " movw %%bx, " #disp32 "(%%rax)\n" + +#define STVXCSR_DISP32_EAX(disp32) " stmxcsr " #disp32 "(%%rax)\n" +#endif + +static inline void save_vpu(struct vpustate_struct *vpustate) +{ + asm volatile( + VSTORED_DISP32_EAX(0, 0x00) + VSTORED_DISP32_EAX(1, 0x40) + VSTORED_DISP32_EAX(2, 0x80) + VSTORED_DISP32_EAX(3, 0xc0) + VSTORED_DISP32_EAX(4, 0x100) + VSTORED_DISP32_EAX(5, 0x140) + VSTORED_DISP32_EAX(6, 0x180) + VSTORED_DISP32_EAX(7, 0x1c0) + VSTORED_DISP32_EAX(8, 0x200) + VSTORED_DISP32_EAX(9, 0x240) + VSTORED_DISP32_EAX(10, 0x280) + VSTORED_DISP32_EAX(11, 
0x2c0) + VSTORED_DISP32_EAX(12, 0x300) + VSTORED_DISP32_EAX(13, 0x340) + VSTORED_DISP32_EAX(14, 0x380) + VSTORED_DISP32_EAX(15, 0x3c0) + VSTORED_DISP32_EAX(16, 0x400) + VSTORED_DISP32_EAX(17, 0x440) + VSTORED_DISP32_EAX(18, 0x480) + VSTORED_DISP32_EAX(19, 0x4c0) + VSTORED_DISP32_EAX(20, 0x500) + VSTORED_DISP32_EAX(21, 0x540) + VSTORED_DISP32_EAX(22, 0x580) + VSTORED_DISP32_EAX(23, 0x5c0) + VSTORED_DISP32_EAX(24, 0x600) + VSTORED_DISP32_EAX(25, 0x640) + VSTORED_DISP32_EAX(26, 0x680) + VSTORED_DISP32_EAX(27, 0x6c0) + VSTORED_DISP32_EAX(28, 0x700) + VSTORED_DISP32_EAX(29, 0x740) + VSTORED_DISP32_EAX(30, 0x780) + VSTORED_DISP32_EAX(31, 0x7c0) + VKSTORE_DISP32_EAX(0, 0x800) + VKSTORE_DISP32_EAX(1, 0x802) + VKSTORE_DISP32_EAX(2, 0x804) + VKSTORE_DISP32_EAX(3, 0x806) + VKSTORE_DISP32_EAX(4, 0x808) + VKSTORE_DISP32_EAX(5, 0x80a) + VKSTORE_DISP32_EAX(6, 0x80c) + VKSTORE_DISP32_EAX(7, 0x80e) + STVXCSR_DISP32_EAX(0x810) + : "=m" (vpustate) : [fx] "a" (vpustate) : "ebx" + ); +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_vector_reg +// +// DESCRIPTION: +// Capture all vector registers. +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_vector_reg(struct vpustate_struct *vpustate) +{ + // printk("vpustate = %p\n", vpustate); + + save_vpu(vpustate); + + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[0]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[1]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[2]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[3]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[4]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[5]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[6]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->k[7]); + TRACE_SPRINTF_VECTOR(0, vpustate->vector_space[0]); + TRACE_SPRINTF_VECTOR(1, vpustate->vector_space[16]); + TRACE_SPRINTF_VECTOR(2, vpustate->vector_space[32]); + TRACE_SPRINTF_VECTOR(3, vpustate->vector_space[48]); + TRACE_SPRINTF_VECTOR(4, vpustate->vector_space[64]); + TRACE_SPRINTF_VECTOR(5, vpustate->vector_space[80]); + TRACE_SPRINTF_VECTOR(6, vpustate->vector_space[96]); + TRACE_SPRINTF_VECTOR(7, vpustate->vector_space[112]); + TRACE_SPRINTF_VECTOR(8, vpustate->vector_space[128]); + TRACE_SPRINTF_VECTOR(9, vpustate->vector_space[144]); + TRACE_SPRINTF_VECTOR(10, vpustate->vector_space[160]); + TRACE_SPRINTF_VECTOR(11, vpustate->vector_space[176]); + TRACE_SPRINTF_VECTOR(12, vpustate->vector_space[192]); + TRACE_SPRINTF_VECTOR(13, vpustate->vector_space[208]); + TRACE_SPRINTF_VECTOR(14, vpustate->vector_space[224]); + TRACE_SPRINTF_VECTOR(15, vpustate->vector_space[240]); + TRACE_SPRINTF_VECTOR(16, vpustate->vector_space[256]); + TRACE_SPRINTF_VECTOR(17, vpustate->vector_space[272]); + TRACE_SPRINTF_VECTOR(18, vpustate->vector_space[288]); + TRACE_SPRINTF_VECTOR(19, vpustate->vector_space[304]); + TRACE_SPRINTF_VECTOR(20, vpustate->vector_space[320]); + TRACE_SPRINTF_VECTOR(21, vpustate->vector_space[336]); + TRACE_SPRINTF_VECTOR(22, vpustate->vector_space[352]); + TRACE_SPRINTF_VECTOR(23, vpustate->vector_space[368]); + TRACE_SPRINTF_VECTOR(24, vpustate->vector_space[384]); + TRACE_SPRINTF_VECTOR(25, vpustate->vector_space[400]); + TRACE_SPRINTF_VECTOR(26, vpustate->vector_space[416]); + TRACE_SPRINTF_VECTOR(27, vpustate->vector_space[432]); + TRACE_SPRINTF_VECTOR(28, vpustate->vector_space[448]); + TRACE_SPRINTF_VECTOR(29, vpustate->vector_space[464]); + TRACE_SPRINTF_VECTOR(30, 
vpustate->vector_space[480]); + TRACE_SPRINTF_VECTOR(31, vpustate->vector_space[496]); + TRACE_SPRINTF("\t\t\t\t0x%x\n", vpustate->vxcsr); + TRACE_SPRINTF("\t\t\t\n"); +} + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_FPU_reg +// +// DESCRIPTION: +// Capture all FPU registers. +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_FPU_reg(struct i387_fxsave_struct *fpu) +{ + +/* + Get FPU contents from the registers instead of the PCB. + fxsave on L1OM saves only the x87 FPU registers and not the SSE2 and MMX registers. + For format of the data below refer Intel 64 and IA-32 Arch. SDM Vol 2A Instr Set Ref A-M + tables 3-59 & 3-60. +*/ + mictc_fxsave(fpu); + + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF("\t\t\t\t0x%x\n", fpu->cwd); + TRACE_SPRINTF("\t\t\t\t0x%x\n", fpu->swd); + TRACE_SPRINTF("\t\t\t\t0x%x\n", (fpu->twd)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", (fpu->fcs & 0xffff)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", fpu->fop); + TRACE_SPRINTF("\t\t\t\t0x%x\n", (fpu->fos & 0xffff)); + TRACE_SPRINTF("\t\t\t\t0x%x\n", fpu->fip); + TRACE_SPRINTF("\t\t\t\t0x%x\n", (fpu->foo)); + PrintFPRegister((u8 *)&(fpu->st_space[0]), 0); + PrintFPRegister((u8 *)&(fpu->st_space[4]), 1); + PrintFPRegister((u8 *)&(fpu->st_space[8]), 2); + PrintFPRegister((u8 *)&(fpu->st_space[12]), 3); + PrintFPRegister((u8 *)&(fpu->st_space[16]), 4); + PrintFPRegister((u8 *)&(fpu->st_space[20]), 5); + PrintFPRegister((u8 *)&(fpu->st_space[24]), 6); + PrintFPRegister((u8 *)&(fpu->st_space[28]), 7); + TRACE_SPRINTF("\t\t\t\n"); + +#if 0 + printk("00 %08x %08x\n", ((u32*)fpu)[0], ((u32*)fpu)[1]); + printk("08 %08x %08x\n", ((u32*)fpu)[2], ((u32*)fpu)[3]); + printk("10 %08x %08x\n", ((u32*)fpu)[4], ((u32*)fpu)[5]); + printk("18 %08x %08x\n", ((u32*)fpu)[6], ((u32*)fpu)[7]); + printk("20 %08x %08x\n", ((u32*)fpu)[8], ((u32*)fpu)[9]); + printk("28 %08x %08x\n", ((u32*)fpu)[10], ((u32*)fpu)[11]); + printk("30 %08x %08x\n", ((u32*)fpu)[12], ((u32*)fpu)[13]); + printk("38 %08x %08x\n", ((u32*)fpu)[14], ((u32*)fpu)[15]); + printk("40 %08x %08x\n", ((u32*)fpu)[16], ((u32*)fpu)[17]); + printk("48 %08x %08x\n", ((u32*)fpu)[18], ((u32*)fpu)[19]); + printk("50 %08x %08x\n", ((u32*)fpu)[20], ((u32*)fpu)[21]); + printk("58 %08x %08x\n", ((u32*)fpu)[22], ((u32*)fpu)[23]); + printk("60 %08x %08x\n", ((u32*)fpu)[24], ((u32*)fpu)[25]); + printk("68 %08x %08x\n", ((u32*)fpu)[26], ((u32*)fpu)[27]); + printk("70 %08x %08x\n", ((u32*)fpu)[28], ((u32*)fpu)[29]); + printk("78 %08x %08x\n", ((u32*)fpu)[30], ((u32*)fpu)[31]); + printk("80 %08x %08x\n", ((u32*)fpu)[32], ((u32*)fpu)[33]); + printk("88 %08x %08x\n", ((u32*)fpu)[34], ((u32*)fpu)[35]); + printk("90 %08x %08x\n", ((u32*)fpu)[36], ((u32*)fpu)[37]); + printk("98 %08x %08x\n", ((u32*)fpu)[38], ((u32*)fpu)[39]); +#endif +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_MSR +// +// DESCRIPTION: +// Capture all MSR +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_capture_MSR(void) +{ + // u32 me_cpu = PCPU_GET(cpuid); +#if 0 + //msr->msrMIC_CR_SPUBASE = tc_rdmsr(MIC_CR_SPUBASE); + //msr->msrIA32_CR_MISC = tc_rdmsr(IA32_CR_MISC); + //msr->msrWMT_CR_LASTBRANCH_0 = tc_rdmsr(WMT_CR_LASTBRANCH_0); + //msr->msrWMT_CR_LASTBRANCH_1 = tc_rdmsr(WMT_CR_LASTBRANCH_1); + msr->msrVMX_MSR_BASE = tc_rdmsr(VMX_MSR_BASE); + msr->msrVMX_MSR_BASE_PLUS_1 = tc_rdmsr(VMX_MSR_BASE_PLUS_1); + msr->msrVMX_MSR_BASE_PLUS_2 = 
tc_rdmsr(VMX_MSR_BASE_PLUS_2); + msr->msrVMX_MSR_BASE_PLUS_3 = tc_rdmsr(VMX_MSR_BASE_PLUS_3); + msr->msrVMX_MSR_BASE_PLUS_4 = tc_rdmsr(VMX_MSR_BASE_PLUS_4); + msr->msrVMX_MSR_BASE_PLUS_5 = tc_rdmsr(VMX_MSR_BASE_PLUS_5); + msr->msrVMX_MSR_BASE_PLUS_6 = tc_rdmsr(VMX_MSR_BASE_PLUS_6); + msr->msrVMX_MSR_BASE_PLUS_7 = tc_rdmsr(VMX_MSR_BASE_PLUS_7); + msr->msrVMX_MSR_BASE_PLUS_8 = tc_rdmsr(VMX_MSR_BASE_PLUS_8); + msr->msrVMX_MSR_BASE_PLUS_9 = tc_rdmsr(VMX_MSR_BASE_PLUS_9); + msr->msrTIME = tc_rdmsr(TIME); + msr->msrPINFO = tc_rdmsr(PINFO); +#endif + TRACE_SPRINTF("\t\t\t\n"); + TRACE_SPRINTF_MSR(P6_CR_TSC); + TRACE_SPRINTF_MSR(X86_CR_APICBASE); + TRACE_SPRINTF_MSR(CBOX_SPU_PA_MSR); + // This is being added since it is included in the ITP dump as well. + TRACE_SPRINTF("\t\t\t\t0x%llx\n", SPU_BASE, (tc_rdmsr(CBOX_SPU_PA_MSR) & 0x7fffffffffffffff) + 0x1000); + TRACE_SPRINTF_MSR(CBOX_SPU_SAMPLER_BIND_MSR); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask0); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask1); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask2); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask3); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask4); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask5); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask6); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask7); + TRACE_SPRINTF_MSR(MSR_EFER & ~0x800); // Force bit 11 to 0 + TRACE_SPRINTF_MSR(MSR_SF_MASK); + TRACE_SPRINTF_MSR(MSR_FSBASE); + TRACE_SPRINTF_MSR(MSR_GSBASE); + TRACE_SPRINTF_MSR(X86_CR_MTRRcap); + TRACE_SPRINTF_MSR(X86_CR_MTRRdefType); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase2); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase0); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase1); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase3); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase4); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase5); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase6); + TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase7); + TRACE_SPRINTF_MSR(STAR); + TRACE_SPRINTF_MSR(LSTAR); + + // MSR_KGSBASE needs some special handling. + // On Silicon when a thread transitions from Ring 3->Ring 0 the + // first instruction it executes is swapgs which swaps the value + // of the current GSBase (which could be 0x0) with the value in + // MSR_KGSBASE to get to the per cpu data structure and onwards to the kernel stack. + // On Silicon, when the same thread transitions from Ring 0->Ring 3 MSR_KGSBASE gets + // the right value as a result of another swapgs on the way back. + // Where Trace Capture differs from Silicon is that we take a snapshot while executing + // in Ring 0 (when MSR_KGSBASE could be 0x0) but the first instruction + // which executes on LarrySim is a Ring 3 instruction. + // On the first syscall in LarrySim when it executes a swapgs it sees a MSR_KGSBASE value of 0x0. + // LarrySim cannot get to the kernel stack and we correctly hit a double fault (Bang!). + // The correct fix is to ensure that LarrySim sees a correct value of + // MSR_KGSBASE when it is provided a snapshot. +//FIXME +// TRACE_SPRINTF("\t\t\t\t0x%lx\n", MSR_KGSBASE, &__pcpu[me_cpu]); + + // The following MSR's are currently ifdef'd out + // because LarrySim barfs on these. + // We might need these later. 
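+  // This disabled list mirrors the one in mictc_trace_capture_prep_msr_header();
+  // if an MSR is re-enabled here it should also be re-enabled in the header.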
+#if 0 + TRACE_SPRINTF_MSR(X86_CR_MTRRfix64K_00000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix16K_80000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix16K_A0000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_C0000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_C8000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_D0000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_D8000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_E0000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_E8000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_F0000); + TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_F8000); + TRACE_SPRINTF_MSR(P5_MC_ADDR); + TRACE_SPRINTF_MSR(P5_MC_TYPE); + TRACE_SPRINTF_MSR(MSR_TR1); + TRACE_SPRINTF_MSR(MSR_TR2); + TRACE_SPRINTF_MSR(MSR_TR3); + TRACE_SPRINTF_MSR(MSR_TR4); + TRACE_SPRINTF_MSR(MSR_TR5); + TRACE_SPRINTF_MSR(MSR_TR6); + TRACE_SPRINTF_MSR(MSR_TR7); + TRACE_SPRINTF_MSR(MSR_TR9); + TRACE_SPRINTF_MSR(MSR_TR10); + TRACE_SPRINTF_MSR(MSR_TR11); + TRACE_SPRINTF_MSR(MSR_TR12); + TRACE_SPRINTF_MSR(IA32_APIC_BASE); + TRACE_SPRINTF_MSR(IA32_TIME_STAMP_COUNTER); + TRACE_SPRINTF_MSR(IA32_PerfCntr0); + TRACE_SPRINTF_MSR(IA32_PerfCntr1); + TRACE_SPRINTF_MSR(IA32_PerfCntr2); + TRACE_SPRINTF_MSR(IA32_PerfCntr3); + TRACE_SPRINTF_MSR(PerfFilteredCntr0); + TRACE_SPRINTF_MSR(PerfFilteredCntr1); + TRACE_SPRINTF_MSR(PerfFilteredCntr2); + TRACE_SPRINTF_MSR(PerfFilteredCntr3); + TRACE_SPRINTF_MSR(IA32_PerfEvtSel0); + TRACE_SPRINTF_MSR(IA32_PerfEvtSel1); + TRACE_SPRINTF_MSR(IA32_PerfEvtSel2); + TRACE_SPRINTF_MSR(IA32_PerfEvtSel3); + TRACE_SPRINTF_MSR(PerfFilterMask); + TRACE_SPRINTF_MSR(IA32_PERF_GLOBAL_STATUS); + TRACE_SPRINTF_MSR(IA32_PERF_GLOBAL_OVF_CONTROL); + TRACE_SPRINTF_MSR(IA32_PERF_GLOBAL_CTRL); + TRACE_SPRINTF_MSR(IA32_MCG_CTL); + TRACE_SPRINTF_MSR(IA32_MC0_CTRL); + TRACE_SPRINTF_MSR(IA32_MC0_STAT); + TRACE_SPRINTF_MSR(IA32_MC0_ADDR); + TRACE_SPRINTF_MSR(IA32_MC0_MISC); + TRACE_SPRINTF_MSR(IA32_MC1_CTRL); + TRACE_SPRINTF_MSR(IA32_MC1_STAT); + TRACE_SPRINTF_MSR(IA32_MC1_ADDR); + TRACE_SPRINTF_MSR(IA32_MC1_MISC); + TRACE_SPRINTF_MSR(SYSCALL_FLAG_MASK); + TRACE_SPRINTF_MSR(X86_PAT); +#endif + TRACE_SPRINTF("\t\t\t\n"); +} + + +//u64 rdtsccount = 0, dmasetuptime = 0, dmacomptime=0, hostacktime=0; + +#if MIC_TRACE_CAPTURE_MEMORY_TEST +// Local function to count the number of bytes in a U32 +// This is only used for the memory test. +static U32 AddBytes(U32 add) +{ + U32 sum = 0x0; + for (int i=0; i < sizeof(U32); i++) + { + sum += (add & 0xFF); + add = (add >> 8); + } + return sum; +} +#endif + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_capture_memory +// +// DESCRIPTION: +// Trace Capture IPI Handler +// +// PARAMETERS: None +// +// RETURNS: None +// +// TODOS: +// +static int +mictc_capture_memory(void) +{ + long err; + long i; + long delay_count; + long total_transfered = 0; + + g_sizeXferred = 0; + + // Transfer a full buffer. 
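+  // Physical memory is pushed to the host in MICTC_MEM_BUFFER_SIZE chunks:
+  // each chunk is copied into the host's registered window with scif_writeto(),
+  // its length is published through the size word of the shared XML buffer,
+  // and scif_fence_signal() writes TRACE_PAGE_READY into the status word once
+  // the transfer completes.  The host side (tc_host.c) appends the chunk to
+  // mem.dat and sets the status back to TRACE_HOST_READY, releasing the next
+  // chunk.  Rough per-chunk handshake:
+  //   card: scif_writeto() ; *size = len ; status = TRACE_PAGE_READY
+  //   host: fwrite(chunk)  ; status = TRACE_HOST_READY
+  // After the final chunk the card sets TRACE_MEM_COMPLETE and spins until
+  // the host acknowledges with TRACE_COMPLETE.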
+ for (i = 0; total_transfered < (max_pfn << PAGE_SHIFT); i++) { + printk("before scif_writeto, i = %ld\n", i); + + // Transfer any remainder + if ((max_pfn << PAGE_SHIFT) - total_transfered < MICTC_MEM_BUFFER_SIZE) { + long remainder = ((uint64_t)max_pfn << PAGE_SHIFT) % MICTC_MEM_BUFFER_SIZE; + + printk("Writing %ld bytes, max_pfn = %ld\n", remainder, max_pfn); + + if ((err = scif_writeto(mictc_endp_data, scif_offset_mem + (i * MICTC_MEM_BUFFER_SIZE), + remainder, scif_offset_dst, 0)) < 0) { + pr_crit("%s:%s:%d scif_writeto failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, err); + return 1; + } + total_transfered += remainder; + g_sizeXferred = remainder; + } else { + if ((err = scif_writeto(mictc_endp_data, scif_offset_mem + (i * MICTC_MEM_BUFFER_SIZE), + MICTC_MEM_BUFFER_SIZE, scif_offset_dst, 0)) < 0) { + pr_crit("%s:%s:%d scif_writeto failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, err); + return 1; + } + total_transfered += MICTC_MEM_BUFFER_SIZE; + g_sizeXferred = MICTC_MEM_BUFFER_SIZE; + } + *g_traceBufferSizeOffset = g_sizeXferred; + printk("before fence\n"); + err = scif_fence_signal(mictc_endp_data, (off_t)scif_offset_xml + TRACE_STATUS_OFFSET, + TRACE_PAGE_READY, 0, 0, SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL); + + if (err < 0) { + printk("scif_fence_signal failed. err = %ld\n", err); + return 1; + } + printk("TRACE_PAGE_READY %lld bytes\n", g_sizeXferred); + g_sizeXferred = 0; + + delay_count = 0; + printk("waiting for TRACE_HOST_READY\n"); + + while (*g_traceBufferStatusOffset != TRACE_HOST_READY) { + cpu_relax(); + delay_count++; + if (delay_count == TRACE_CAPTURE_TIMEOUT) { + printk("Memory Dump Timeout. Host did not update @physAddr 0x%lx\n", i << PAGE_SHIFT); + return -EBUSY; + } + } + } + *g_traceBufferSizeOffset = 0; + *g_traceBufferStatusOffset = TRACE_MEM_COMPLETE; + + delay_count = 0; + + while (*g_traceBufferStatusOffset != TRACE_COMPLETE) { + cpu_relax(); + delay_count++; + if (delay_count == TRACE_CAPTURE_TIMEOUT) { + printk("Trace completion timeout.\n"); + return -EBUSY; + } + } + + return 0; +} + + +//------------------------------------------------------------------------------ +// FUNCTION: mictc_trace_capture +// +// DESCRIPTION: +// Perform all the tasks related to Trace Capture +// for a particular Hardware Thread. +// The tasks currently include: +// General purpose registers +// Segment registers +// Debug registers +// Control registers +// VPU registers +// MSRs +// +// Note: The SPU is not setup in Linux. +// +// PARAMETERS: regs - pointer to the task's registers +// +// RETURNS: None +// +// TODOS: +// +static void +mictc_trace_capture(struct pt_regs *regs) +{ + long delay_count; + +// printk("Entering mictc_trace_capture on cpu %d, for process = %s\n", smp_processor_id(), current->comm); + + // Logic to let threads in one by one in order + + while (atomic_read(&cpus_stopped) != smp_processor_id()) { + cpu_relax(); +//STH touch_nmi_watchdog(); + } + + if (smp_processor_id() == 0) + { + // CPU0 is responsible for preparing the + // Trace Capture Header. 
+ mictc_prep_header(); + } + + TRACE_SPRINTF("\t\t\n", smp_processor_id()); + mictc_capture_general_purpose_reg(regs); + mictc_capture_segment_reg(&(trace->segment), regs); + mictc_capture_debug_reg(); + mictc_capture_control_reg(); + mictc_capture_vector_reg(&(trace->vpustate)); + +//STH touch_nmi_watchdog(); // Just to be safe + + // The SPU is not setup currently in Linux + if (always_false) mictc_capture_SPU_reg(); + + mictc_capture_FPU_reg(&(trace->fpu)); + mictc_capture_MSR(); + +// printk("In mictc_trace_capture on cpu %d, after MSRs\n", smp_processor_id()); + + TRACE_SPRINTF("\t\t\n"); + + // Each core should flush their caches + // as the initiator is going to take a memory + // dump soon after. + // Not required since DMA should snoop the caches. + //wbinvd(); + +// printk("In mictc_trace_capture on cpu %d, before check for last cpu\n", smp_processor_id()); + + if (smp_processor_id() == (num_online_cpus() - 1)) + { + // The last CPU is responsible for preparing the + // Trace Capture Trailer. + TRACE_SPRINTF("\t\n"); + + TRACE_SPRINTF("
\n"); + + // Update the size as the Host App needs this information. + *g_traceBufferSizeOffset = g_sizeXferred; + + g_sizeXferred = 0; + + // Update the status for the Host App. The CPU register state has been written by all + // the hardware threads. The host app polls for this status. + *g_traceBufferStatusOffset = TRACE_REG_COMPLETE; + + printk("Completed Arch Dump. Now Beginning Memory Dump. Be patient (~1 min is ETA)..\n"); + + delay_count = 0; + + while (*g_traceBufferStatusOffset != TRACE_GET_FILE) + { + cpu_relax(); + delay_count++; + if (delay_count == TRACE_CAPTURE_TIMEOUT) + { + printk("Arch Dump Timeout. Host did not update status.\n"); + break; + } + } + printk("%s out of wait loop.\n", __FUNCTION__); + } + +// printk("Exiting mictc_trace_capture on cpu %d\n", smp_processor_id()); +} + + +// Starting point for trace_capture. +static void +mictc_start_capture(void) +{ + long ret; + long err; + struct scif_portID portID_data; + int control_msg = 0; + int i; + int found_it = 0; + + spin_lock(&mictc_lock); + printk("Starting tracecapture on cpu %d. Taking lock.\n", smp_processor_id()); + + if (!(g_traceBufferAllocated = kmalloc(MICTC_XML_BUFFER_SIZE, GFP_KERNEL))) { + pr_crit("%s:%s:%d kmalloc failed failed with ENOMEM\n", __FILE__, __FUNCTION__, __LINE__); + goto done0; + } + + pr_crit("%s:%s:%d kmalloc returned %llx\n", __FILE__, __FUNCTION__, __LINE__, (uint64_t)g_traceBufferAllocated); + + g_traceBufferStatusOffset = (u64*)((u64)g_traceBufferAllocated + TRACE_STATUS_OFFSET); + g_traceBufferSizeOffset = (u64*)((u64)g_traceBufferAllocated + TRACE_SIZE_OFFSET); + g_traceBufferDataOffset = (u32*)((u64)g_traceBufferAllocated + TRACE_DATA_OFFSET); + g_traceBufferTriggerOffset = (u32*)((u64)g_traceBufferAllocated + TRACE_TRIGGER_OFFSET); + + *g_traceBufferStatusOffset = TRACE_DATA; +#if MIC_TRACE_CAPTURE_MEMORY_TEST + g_traceBufferChecksumOffset = (u64*)((u64)g_traceBufferAllocated + TRACE_CHECKSUM_OFFSET); +#endif + + if (!(trace = (struct mictc_trace *)kmalloc(sizeof(struct mictc_trace), GFP_KERNEL))) { + pr_crit("%s:%s:%d kmalloc failed failed with ENOMEM\n", __FILE__, __FUNCTION__, __LINE__); + goto done1a; + } + + pr_crit("%s:%s:%d kmalloc returned %llx\n", __FILE__, __FUNCTION__, __LINE__, (uint64_t)trace); + + memset(trace, 0, sizeof(struct mictc_trace)); + + pr_crit("g_traceBufferStatusOffset %llx\n", (uint64_t)g_traceBufferStatusOffset); + pr_crit("g_traceBufferSizeOffset %llx\n", (uint64_t)g_traceBufferSizeOffset); + pr_crit("g_traceBufferDataOffset %llx\n", (uint64_t)g_traceBufferDataOffset); + + // Data channel + if (!(mictc_endp_data = scif_open())) { + pr_crit("%s:%s:%d scif_open failed with ENOMEM\n", __FILE__, __FUNCTION__, __LINE__); + return; + } + + if ((ret = scif_bind(mictc_endp_data, MICTC_SCIF_PORT_DATA)) < 0) { + pr_crit("%s:%s:%d scif_bind failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, ret); + goto done1; + } + + portID_data.node = 0; + portID_data.port = MICTC_SCIF_PORT_DATA; + + if ((ret = scif_connect(mictc_endp_data, &portID_data)) < 0) { + pr_crit("%s:%s:%d scif_connect failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, ret); + goto done1; + } + + if ((ret = (long)scif_register(mictc_endp_data, + g_traceBufferAllocated, + MICTC_XML_BUFFER_SIZE, + 0, // suggested_offset, + SCIF_PROT_READ | SCIF_PROT_WRITE, + SCIF_MAP_KERNEL)) < 0) { + if (ret > -300) { + pr_crit("%s:%s:%d scif_register failed failed with %ld\n", __FILE__, __FUNCTION__, __LINE__, ret); + goto done2; + } + } + scif_offset_xml = ret; + pr_crit("%s:%s:%d scif_register 
scif_offset_xml = %lx\n", __FILE__, __FUNCTION__, __LINE__, scif_offset_xml); + + // Register all of physical memory. + if ((ret = (long)scif_register(mictc_endp_data, + __va(0), // Physical page 0 + max_pfn << PAGE_SHIFT, + 0, // suggested_offset, + SCIF_PROT_READ | SCIF_PROT_WRITE, + SCIF_MAP_KERNEL)) < 0) { + if (ret > -300) { + pr_crit("%s:%s:%d scif_register failed failed with %ld\n", __FILE__, __FUNCTION__, __LINE__, ret); + goto done2; + } + } + scif_offset_mem = ret; + pr_crit("%s:%s:%d scif_register scif_offset_mem = %lx\n", __FILE__, __FUNCTION__, __LINE__, scif_offset_mem); + + BARRIER(mictc_endp_data, "before barrier"); + + if ((err = scif_recv(mictc_endp_data, &scif_offset_dst, sizeof(scif_offset_dst), SCIF_RECV_BLOCK)) <= 0) { + pr_crit("%s:%s:%d scif_recv failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err); + goto close; + } + +// g_traceBufferDataOffset = (u32 *)ret; +// pr_crit("%s:%s:%d scif_register ret %lx\n", __FILE__, __FUNCTION__, __LINE__, scif_offset); + + if ((err = scif_send(mictc_endp_data, &scif_offset_xml, sizeof(scif_offset_xml), SCIF_SEND_BLOCK)) <= 0) { + pr_crit("%s:%s:%d scif_send failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err); + goto close; + } + + while (*g_traceBufferStatusOffset != TRACE_HOST_READY) + { + msleep(100); + touch_nmi_watchdog(); + } + + // Get trigger data. + for (i = 0; i < TRACE_TRIGGER_MAX; i++) { + g_traceTriggers[i] = *g_traceBufferTriggerOffset; + printk("Found trace trigger %d\n", g_traceTriggers[i]); + g_traceBufferTriggerOffset++; + + if (g_traceTriggers[i] == TRACE_EOL) break; + } + + // Is the trigger data empty? If so, accept everything. + if (g_traceTriggers[0] == TRACE_EOL) { + printk("Trace trigger data is empty.\n"); + found_it = 1; + } else if (g_traceTriggers[0] == TRACE_IGNORE) { + printk("Ignoring current trace."); + } else { + // See if g_traceCurrentTrigger is in the trigger data. + // If not, abort this trace. + for (i = 0; i < TRACE_TRIGGER_MAX; i++) { + if (g_traceTriggers[i] == TRACE_EOL) break; + + if (g_traceTriggers[i] == g_traceCurrentTrigger) { + found_it = 1; + printk("Matched trace trigger %d\n", g_traceTriggers[i]); + break; + } + } + } + + if (!found_it) { + // Abort this trace + printk("Trace trigger did not match -- aborting.\n"); + *g_traceBufferStatusOffset = TRACE_ABORTED; + goto done3; + } + + if (always_false) { + // Mmap memory at 0xfee03000 physical. + spu_addr = ioremap(0xfee03000, 0x1000); + if (! spu_addr) { + pr_crit("%s ioremap failed.\n", __FUNCTION__); + goto done3; + } + printk("CPU ioremap %p\n", spu_addr); + } + + cli; // Interrupts off + atomic_set(&cpus_stopped, 0); + atomic_set(&cpus_released, 0); + // Send IPI to capture all other cpus. + apic->send_IPI_allbutself(NMI_VECTOR); + mictc_trace_capture(task_pt_regs(current)); + atomic_inc(&cpus_stopped); + + pr_debug("start_capture: Entering wait loop until lock count %d >= %d on cpu %d\n", atomic_read(&cpus_stopped), num_online_cpus() - 1, smp_processor_id()); + + { int ctr = 0; + // Wait for every other CPU to finish its trace capture tasks. + while (atomic_read(&cpus_stopped) < num_online_cpus()) { + cpu_relax(); +//STH touch_nmi_watchdog(); + if (ctr++ > 1000000) { + ctr = 0; + printk("%s:%d *** waiting loop cpus_stopped = %d\n", __FUNCTION__, __LINE__, atomic_read(&cpus_stopped)); + } + } + } + + printk("%s out of wait loop.\n", __FUNCTION__); + + // Get a memory dump here before exiting. + err = mictc_capture_memory(); + + printk("Completed Memory Dump.\n"); +// printk("Completed Memory Dump. 
DMASetuptime = %ld , DMATime = %ld, HostAckTime = %ld\n", dmasetuptime, dmacomptime, hostacktime); + + // Now release all cores. + atomic_set(&cpus_stopped, num_online_cpus() + 1); + + // Wait for every other CPU to be released + while (atomic_read(&cpus_released) < num_online_cpus() - 1) { + // msleep(2000); + cpu_relax(); + touch_nmi_watchdog(); + } + sti; // Interrupts on + + // FIXME This cleanup probably needs to be checked. + close: + if (always_false) { + iounmap(spu_addr); + } + done3: +// scif_unregister(mictc_endp_data, scif_offset, MICTC_XML_BUFFER_SIZE); + done2: + done1: + scif_close(mictc_endp_data); + kfree(trace); + done1a: + kfree(g_traceBufferAllocated); + spin_unlock(&mictc_lock); + done0: + printk("Ending tracecapture on cpu %d. Releasing lock.\n", smp_processor_id()); +} +EXPORT_SYMBOL(mictc_start_capture); + + +/* + * mictc_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + */ +int +mictc_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + // Interrupts are off. + + // printk("Entering mictc_handle_exception on cpu %d pid: %d, name: %s\n", smp_processor_id(), current->pid, current->comm); + + mictc_trace_capture(regs); + atomic_inc(&cpus_stopped); + pr_debug("handler: Entering wait loop until lock count %d >= %d on cpu %d\n", atomic_read(&cpus_stopped), num_online_cpus() - 1, smp_processor_id()); + // Wait for every other CPU to finish its Trace Capture Tasks. + // This test is for num_online_cpus+1 to hold all threads that are + // in interrupt context so that the main thread can dump memory. + while (atomic_read(&cpus_stopped) < num_online_cpus() + 1) { + cpu_relax(); +//STH touch_nmi_watchdog(); + } + + atomic_inc(&cpus_released); + + printk("Exiting mictc_handle_exception on cpu %d %s\n", smp_processor_id(), current->comm); + return 1; +} + + +static int __mictc_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; +#if 0 + switch (cmd) { + case DIE_NMI: + if (atomic_read(&kgdb_active) != -1) { + /* KGDB CPU roundup */ + kgdb_nmicallback(smp_processor_id(), regs); + was_in_debug_nmi[smp_processor_id()] = 1; + touch_nmi_watchdog(); + return NOTIFY_STOP; + } + return NOTIFY_DONE; + + case DIE_NMIUNKNOWN: + if (was_in_debug_nmi[smp_processor_id()]) { + was_in_debug_nmi[smp_processor_id()] = 0; + return NOTIFY_STOP; + } + return NOTIFY_DONE; + + case DIE_DEBUG: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; + /* fall through */ + default: + if (user_mode(regs)) + return NOTIFY_DONE; + } +#endif + if (cmd == DIE_NMI) { + if (mictc_handle_exception(args->trapnr, args->signr, cmd, regs)) { + touch_nmi_watchdog(); + return NOTIFY_STOP; + } + } else { + touch_nmi_watchdog(); + return NOTIFY_DONE; + } + + /* Must touch watchdog before return to normal operation */ + touch_nmi_watchdog(); + return NOTIFY_STOP; +} + + +static int +mictc_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __mictc_notify(ptr, cmd); + local_irq_restore(flags); + + return ret; +} + + +/* + * This function is called whenever a process tries to do an ioctl on our + * device file. 
We get two extra parameters (additional to the inode and file + * structures, which all device functions get): the number of the ioctl called + * and the parameter given to the ioctl function. + * + * If the ioctl is write or read/write (meaning output is returned to the + * calling process), the ioctl call returns the output of this function. + * + */ +long device_ioctl( + struct file *file, /* ditto */ + unsigned int ioctl_num, /* number and param for ioctl */ + unsigned long ioctl_param) +{ + // Switch according to the ioctl called + switch (ioctl_num) { + case MICTC_START_CAPTURE: + + // ioctl_param contains the trace trigger number. + // Save it to check against the g_traceTrigger array. + g_traceCurrentTrigger = (u32)ioctl_param; + printk("IOCTL trace trigger %ld\n", ioctl_param); + mictc_start_capture(); + break; + default: + printk("Invalid ioctl.\n"); + return -ENXIO; + } + return 0; +} + + +/* + * This is called whenever a process attempts to open the device file + */ +static int device_open(struct inode *inode, struct file *file) +{ +#ifdef DEBUG + printk(KERN_INFO "device_open(%p)\n", file); +#endif + + /* + * We don't want to talk to two processes at the same time + */ + if (Device_Open) + return -EBUSY; + + Device_Open++; + try_module_get(THIS_MODULE); + return 0; +} + +static int device_release(struct inode *inode, struct file *file) +{ +#ifdef DEBUG + printk(KERN_INFO "device_release(%p,%p)\n", inode, file); +#endif + + /* + * We're now ready for our next caller + */ + Device_Open--; + + module_put(THIS_MODULE); + return 0; +} + + +/* + * This structure will hold the functions to be called + * when a process does something to the device we + * created. Since a pointer to this structure is kept in + * the devices table, it can't be local to + * init_module. NULL is for unimplemented functions. + */ +struct file_operations Fops = { + // .read = device_read, + // .write = device_write, + .unlocked_ioctl = device_ioctl, + .open = device_open, + .release = device_release, /* a.k.a. close */ +}; + +static struct notifier_block mictc_notifier = { + .notifier_call = mictc_notify, + .priority = 0x7fffffff /* we need to be notified first */ +}; + + +/* + * mictc_init - Register our notifier + * + */ +static +int mictc_init(void) +{ + int ret_val; + /* + * Register the character device (atleast try) + */ + ret_val = register_chrdev(MICTC_MAJOR_NUM, MICTC_DEVICE_NAME, &Fops); + + /* + * Negative values signify an error + */ + if (ret_val < 0) { + printk(KERN_ALERT "%s failed with %d\n", + "Sorry, registering the character device ", ret_val); + return ret_val; + } + + printk(KERN_INFO "%s The major device number is %d.\n", + "Registeration is a success", MICTC_MAJOR_NUM); + printk(KERN_INFO "To use trace capture you'll have to create a device file:\n"); + printk(KERN_INFO "mknod %s c %d 0\n", MICTC_FILE_NAME, MICTC_MAJOR_NUM); + + return register_die_notifier(&mictc_notifier); + +} + + +static +void mictc_exit(void) +{ + return; +} + +module_init(mictc_init); +module_exit(mictc_exit); + +MODULE_AUTHOR("Intel Corp. 2011 (sth " __DATE__ ") ver " TC_VER); +MODULE_DESCRIPTION("Trace Capture module for K1OM"); +MODULE_LICENSE("GPL"); diff --git a/trace_capture/trace_capture.h b/trace_capture/trace_capture.h new file mode 100644 index 0000000..b793dff --- /dev/null +++ b/trace_capture/trace_capture.h @@ -0,0 +1,245 @@ +/* + * Copyright 2010-2017 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +/* + * Trace Capture module common declarations + * + * Contains configuration, constants and function prototypes + * for the Trace Capture module. + */ + +#ifndef _MICTC_H_ +#define _MICTC_H_ 1 + +#include +#include +#include +#include +//#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include // for get_user and put_user +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __SCIF_H__ +#include +#endif + +/* + * Version info: M.NP + */ + +#define TC_MAJOR "0" +#define TC_MINOR "1" +#define TC_PATCH "a" +#define TC_VER TC_MAJOR "." TC_MINOR TC_PATCH + +// These are common to the Host App +// and the MIC driver Trace Capture Feature +// COMMON DEFINES START HERE +enum TRACE_COMMAND +{ + TRACE_NOP = 100, + TRACE_DATA, + TRACE_HOST_READY, + TRACE_DONE, + TRACE_ERROR, + TRACE_PRINT, + TRACE_GET_FILE, + TRACE_PAGE_READY, + TRACE_REG_COMPLETE, + TRACE_MEM_COMPLETE, + TRACE_COMPLETE, + TRACE_ABORTED +}; + +// IOCTL +#define MICTC_MAJOR_NUM 's' +#define MICTC_DEVICE_NAME "trace_capture" +#define MICTC_FILE_NAME "/dev/trace_capture" + +#define MICTC_START_CAPTURE _IOW(MICTC_MAJOR_NUM, 0xff, int) + +// Use 2MB for KNF and 4MB for K1OM (auto-detected). +#define MICTC_XML_BUFFER_SIZE (2 * 1024UL * 1024UL) + +#define MICTC_MEM_BUFFER_SIZE (1 * 1024UL * 1024UL * 1024UL) + +// Shared memory constants +#define TRACE_STATUS_OFFSET 8 +#define TRACE_SIZE_OFFSET 16 + +// Enable/Disable Memory Test. +// This MUST be enabled simultaneously on Host App as well. 
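+// When enabled, the card and the Host App presumably exchange a simple byte-sum
+// checksum of each transferred chunk through TRACE_CHECKSUM_OFFSET below (see the
+// AddBytes() helper in trace_capture.c), so each memory chunk can be verified.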
+#define MIC_TRACE_CAPTURE_MEMORY_TEST 0 + +#if MIC_TRACE_CAPTURE_MEMORY_TEST +#define TRACE_CHECKSUM_OFFSET 24 +#endif + +#define TRACE_TRIGGER_MAX 10 +#define TRACE_TRIGGER_OFFSET 28 +#define TRACE_DATA_OFFSET 4096 + +// Used to indicate the end of the list for trace triggers. +#define TRACE_EOL 0xffffffff +// Used for trace counts to indicate that the driver should ignore current trace. +// Only meaningful when it is first in the list of trace triggers -- the entries +// after it are ignored. Trace counts supersede trace triggers. +#define TRACE_IGNORE 0xfffffffe + +// Types of Triggers - Refer to uOS Trace Capture Wiki for Usage +// Generic counter +#define TRACE_HOST_GENERIC_COUNTER 0x1 +// Async Flip counter +#define TRACE_HOST_FRAME_COUNTER 0x2 +// COMMON DEFINES END HERE + +// MSR's defined in the trace file sent during REQs +// Are these all valid for L1OM?? +#define P6_CR_TSC 0x10 +#define X86_CR_APICBASE 0x1b +#define MIC_CR_SPUBASE 0x1c +#define IA32_CR_MISC 0x1a0 +#define WMT_CR_LASTBRANCH_0 0x1db +#define WMT_CR_LASTBRANCH_1 0x1dc +#define X86_CR_MTRRphysMask0 0x201 +#define X86_CR_MTRRphysMask1 0x203 +#define X86_CR_MTRRphysMask2 0x205 +#define X86_CR_MTRRphysMask3 0x207 +#define X86_CR_MTRRphysMask4 0x209 +#define X86_CR_MTRRphysMask5 0x20b +#define X86_CR_MTRRphysMask6 0x20d +#define X86_CR_MTRRphysMask7 0x20f +#define IA32_CR_PAT 0x277 +#define IA32_MTRR_DEF_TYPE 0x2ff +#define VMX_MSR_BASE 0x480 +#define VMX_MSR_BASE_PLUS_1 0x481 +#define VMX_MSR_BASE_PLUS_2 0x482 +#define VMX_MSR_BASE_PLUS_3 0x483 +#define VMX_MSR_BASE_PLUS_4 0x484 +#define VMX_MSR_BASE_PLUS_5 0x485 +#define VMX_MSR_BASE_PLUS_6 0x486 +#define VMX_MSR_BASE_PLUS_7 0x487 +#define VMX_MSR_BASE_PLUS_8 0x488 +#define VMX_MSR_BASE_PLUS_9 0x489 +#define TIME 0x4711 +#define PINFO 0x4712 +#define X86_CR_MTRRdefType 0x2ff +#define X86_CR_MTRRcap 0xfe +#define X86_CR_MTRRphysBase0 0x200 +#define X86_CR_MTRRphysBase1 0x202 +#define X86_CR_MTRRphysBase2 0x204 +#define X86_CR_MTRRphysBase3 0x206 +#define X86_CR_MTRRphysBase4 0x208 +#define X86_CR_MTRRphysBase5 0x20a +#define X86_CR_MTRRphysBase6 0x20c +#define X86_CR_MTRRphysBase7 0x20e +#define X86_CR_MTRRfix64K_00000 0x250 +#define X86_CR_MTRRfix16K_80000 0x258 +#define X86_CR_MTRRfix16K_A0000 0x259 +#define X86_CR_MTRRfix4K_C0000 0x268 +#define X86_CR_MTRRfix4K_C8000 0x269 +#define X86_CR_MTRRfix4K_D0000 0x26a +#define X86_CR_MTRRfix4K_D8000 0x26b +#define X86_CR_MTRRfix4K_E0000 0x26c +#define X86_CR_MTRRfix4K_E8000 0x26d +#define X86_CR_MTRRfix4K_F0000 0x26e +#define X86_CR_MTRRfix4K_F8000 0x26f +#define P5_MC_ADDR 0x0 +#define P5_MC_TYPE 0x1 +#define MSR_TR1 0x2 +#define MSR_TR2 0x4 +#define MSR_TR3 0x5 +#define MSR_TR4 0x6 +#define MSR_TR5 0x7 +#define MSR_TR6 0x8 +#define MSR_TR7 0x9 +#define MSR_TR9 0xb +#define MSR_TR10 0xc +#define MSR_TR11 0xd +#define MSR_TR12 0xe +#define IA32_APIC_BASE 0x1b +#define IA32_TIME_STAMP_COUNTER 0x10 +#define IA32_PerfCntr0 0x20 +#define IA32_PerfCntr1 0x21 +#define IA32_PerfCntr2 0x22 +#define IA32_PerfCntr3 0x23 +#define PerfFilteredCntr0 0x24 +#define PerfFilteredCntr1 0x25 +#define PerfFilteredCntr2 0x26 +#define PerfFilteredCntr3 0x27 +#define IA32_PerfEvtSel0 0x28 +#define IA32_PerfEvtSel1 0x29 +#define IA32_PerfEvtSel2 0x2a +#define IA32_PerfEvtSel3 0x2b +#define PerfFilterMask 0x2c +#define IA32_PERF_GLOBAL_STATUS 0x2d +#define IA32_PERF_GLOBAL_OVF_CONTROL 0x2e +#define IA32_PERF_GLOBAL_CTRL 0x2f +#define IA32_MCG_CTL 0x17b +#define IA32_MC0_CTRL 0x400 +#define IA32_MC0_STAT 0x401 +#define IA32_MC0_ADDR 0x402 +#define 
IA32_MC0_MISC 0x403 +#define IA32_MC1_CTRL 0x404 +#define IA32_MC1_STAT 0x405 +#define IA32_MC1_ADDR 0x406 +#define IA32_MC1_MISC 0x407 +#define STAR 0xc0000081 +#define LSTAR 0xc0000082 +#define SYSCALL_FLAG_MASK 0xc0000084 +#define X86_PAT 0x277 +#define SPU_BASE 0x1C + + +#endif /* Recursion block */ diff --git a/udev-mic.rules b/udev-mic.rules new file mode 100644 index 0000000..3930e21 --- /dev/null +++ b/udev-mic.rules @@ -0,0 +1,9 @@ +# do not edit this file, it will be overwritten on update +# initramfs:default + +# MIC SCIF +KERNEL=="scif", ACTION=="add", NAME="mic/%k",MODE="0666", RUN+="/bin/chmod og+x /dev/mic" +KERNEL=="ctrl", ACTION=="add", NAME="mic/%k", MODE="0666" + +# Bring up network interfaces manually on rhel7 after module reload +KERNEL=="mic*", SUBSYSTEM=="net", RUN+="/bin/sh -c '/bin/grep 7. /etc/redhat-release && /sbin/ifup %k'" diff --git a/vcons/Kbuild b/vcons/Kbuild new file mode 100644 index 0000000..ffaf350 --- /dev/null +++ b/vcons/Kbuild @@ -0,0 +1,3 @@ +michvc-objs := hvc_mic.o + +obj-m := michvc.o diff --git a/vcons/hvc_console.h b/vcons/hvc_console.h new file mode 100644 index 0000000..54381eb --- /dev/null +++ b/vcons/hvc_console.h @@ -0,0 +1,119 @@ +/* + * hvc_console.h + * Copyright (C) 2005 IBM Corporation + * + * Author(s): + * Ryan S. Arnold + * + * hvc_console header information: + * moved here from arch/powerpc/include/asm/hvconsole.h + * and drivers/char/hvc_console.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef HVC_CONSOLE_H +#define HVC_CONSOLE_H +#include +#include +#include + +/* + * This is the max number of console adapters that can/will be found as + * console devices on first stage console init. Any number beyond this range + * can't be used as a console device but is still a valid tty device. + */ +#define MAX_NR_HVC_CONSOLES 16 + +/* + * The Linux TTY code does not support dynamic addition of tty derived devices + * so we need to know how many tty devices we might need when space is allocated + * for the tty device. Since this driver supports hotplug of vty adapters we + * need to make sure we have enough allocated. + */ +#define HVC_ALLOC_TTY_ADAPTERS 8 + +struct hvc_struct { + spinlock_t lock; + int index; + struct tty_struct *tty; + int count; + int do_wakeup; + char *outbuf; + int outbuf_size; + int n_outbuf; + uint32_t vtermno; + const struct hv_ops *ops; + int irq_requested; + int data; + struct winsize ws; + struct work_struct tty_resize; + struct list_head next; + struct kref kref; /* ref count & hvc_struct lifetime */ +}; + +/* implemented by a low level driver */ +struct hv_ops { + int (*get_chars)(uint32_t vtermno, char *buf, int count); + int (*put_chars)(uint32_t vtermno, const char *buf, int count); + + /* Callbacks for notification. 
Called in open, close and hangup */ + int (*notifier_add)(struct hvc_struct *hp, int irq); + void (*notifier_del)(struct hvc_struct *hp, int irq); + void (*notifier_hangup)(struct hvc_struct *hp, int irq); +}; + +/* Register a vterm and a slot index for use as a console (console_init) */ +extern int hvc_instantiate(uint32_t vtermno, int index, + const struct hv_ops *ops); + +/* register a vterm for hvc tty operation (module_init or hotplug add) */ +extern struct hvc_struct * hvc_alloc(uint32_t vtermno, int data, + const struct hv_ops *ops, int outbuf_size); +/* remove a vterm from hvc tty operation (module_exit or hotplug remove) */ +extern int hvc_remove(struct hvc_struct *hp); + +/* data available */ +int hvc_poll(struct hvc_struct *hp); +void hvc_kick(void); + +/* Resize hvc tty terminal window */ +extern void __hvc_resize(struct hvc_struct *hp, struct winsize ws); + +static inline void hvc_resize(struct hvc_struct *hp, struct winsize ws) +{ + unsigned long flags; + + spin_lock_irqsave(&hp->lock, flags); + __hvc_resize(hp, ws); + spin_unlock_irqrestore(&hp->lock, flags); +} + +/* default notifier for irq based notification */ +extern int notifier_add_irq(struct hvc_struct *hp, int data); +extern void notifier_del_irq(struct hvc_struct *hp, int data); +extern void notifier_hangup_irq(struct hvc_struct *hp, int data); + + +#if defined(CONFIG_XMON) && defined(CONFIG_SMP) +#include +#else +static inline int cpus_are_in_xmon(void) +{ + return 0; +} +#endif + +#endif // HVC_CONSOLE_H diff --git a/vcons/hvc_mic.c b/vcons/hvc_mic.c new file mode 100644 index 0000000..21640c0 --- /dev/null +++ b/vcons/hvc_mic.c @@ -0,0 +1,341 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#include "hvc_console.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIC_COOKIE 0xc0c0 +#define MIC_KNC 1 + +static long vcons_hdr_addr; +static struct micscif_rb mic_out_buf; +static struct micscif_rb mic_in_buf; + +struct vcons_info { + struct vcons_buf *hdr; + struct vcons_mic_header *mic_hdr; + char *vcons_op_buf; + char *vcons_ip_buf; +}; + +static struct vcons_info vcons_info; +static int dbg = 0; + +/* Receive data from the host (mic i/p buffer) */ +static int hvc_mic_get_chars(uint32_t vt, char *buf, int count) +{ + int ret, len, get_count; + + len = micscif_rb_count(&mic_in_buf, count); + get_count = min(len, count); + ret = micscif_rb_get_next(&mic_in_buf, buf, get_count); + if (ret == get_count) + micscif_rb_update_read_ptr(&mic_in_buf); + + return ret; +} + +/* Send data to the host (mic o/p buffer) */ +static int hvc_mic_put_chars(uint32_t vt, const char *buf, int count) +{ + int ret; + int put_count; + volatile int *host_status = + (volatile int *)&vcons_info.mic_hdr->host_status; + + put_count = min(micscif_rb_space(&mic_out_buf), count); + if (put_count) { + ret = micscif_rb_write(&mic_out_buf, (void *)buf, put_count); + BUG_ON(ret); + micscif_rb_commit(&mic_out_buf); + } else if (*host_status != MIC_VCONS_HOST_OPEN) + return count; + return put_count; +} + + +static irqreturn_t hvc_mic_handle_interrupt(int irq, void *dev_id) +{ + struct hvc_struct *hp = (struct hvc_struct *)dev_id; + if (hvc_poll(hp)) { + hvc_kick(); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static int hvc_mic_notifier_add_irq(struct hvc_struct *hp, int irq) +{ + int ret = request_irq(get_sbox_irq(HVC_SBOX_INT_IDX), + hvc_mic_handle_interrupt, IRQF_DISABLED, + "hvc intr", hp); + if (ret) { + printk("Unable to register interrupt\n"); + return ret; + } + hp->irq_requested = 1; + return 0; +} + +static void hvc_mic_notifier_del_irq(struct hvc_struct *hp, int irq) +{ + if (hp->irq_requested) { + free_irq(get_sbox_irq(HVC_SBOX_INT_IDX), hp); + hp->irq_requested = 0; + } +} + +static void hvc_mic_notifier_hangup_irq(struct hvc_struct *hp, int irq) +{ + hvc_mic_notifier_del_irq(hp, irq); +} + +static const struct hv_ops hvc_mic_ops = { + .get_chars = hvc_mic_get_chars, + .put_chars = hvc_mic_put_chars, + .notifier_add = hvc_mic_notifier_add_irq, + .notifier_del = hvc_mic_notifier_del_irq, + .notifier_hangup = hvc_mic_notifier_hangup_irq, +}; + +static void dump_vcons_hdr(struct vcons_buf *hdr) +{ + printk(KERN_ERR "host_magic\t%x\n", readl(&hdr->host_magic)); + printk(KERN_ERR "mic_magic\t%x\n", readl(&hdr->mic_magic)); + printk(KERN_ERR "o_buf_dma_addr\t%x\n", readl(&hdr->o_buf_dma_addr)); + printk(KERN_ERR "o_wr\t%x\n", readl(&hdr->o_wr)); + printk(KERN_ERR "o_size\t%x\n", readl(&hdr->o_size)); + printk(KERN_ERR "i_hdr_addr\t%lx\n", readq(&hdr->i_hdr_addr)); + printk(KERN_ERR "i_buf_addr\t%lx\n", readq(&hdr->i_buf_addr)); + printk(KERN_ERR "i_rd\t%x\n", readl(&hdr->i_rd)); +} + +static int mic_cons_init(void) +{ + int rc; + + if ((rc = hvc_instantiate(MIC_COOKIE, 0, &hvc_mic_ops))) + printk(KERN_ERR "error instantiating hvc console\n"); + + return rc; +} + +static struct hvc_struct *hp; +static int __init hvc_mic_init(void) +{ + struct 
vcons_buf *hdr = NULL; + struct vcons_buf tmp_hdr; + int err = 0; + char *hvc_buf; + u8 card_type=0; + uint16_t host_rb_ver, mic_rb_ver; + +#if defined(CONFIG_MK1OM) + card_type = MIC_KNC; +#endif + hvc_buf = (char *)get_zeroed_page(GFP_KERNEL); + if (!hvc_buf) { + printk(KERN_ERR "unable to allocate vcons buffer\n"); + return -ENOMEM; + } + if (card_type == MIC_KNC) { + vcons_info.vcons_ip_buf = hvc_buf; + vcons_info.mic_hdr = (struct vcons_mic_header *)kzalloc(sizeof(struct vcons_mic_header), GFP_KERNEL); + if (!vcons_info.mic_hdr) { + free_page((unsigned long)hvc_buf); + printk(KERN_ERR "unable to allocate vcons header\n"); + return -ENOMEM; + } + } else { + vcons_info.vcons_ip_buf = hvc_buf + PAGE_SIZE/2; + vcons_info.mic_hdr = (struct vcons_mic_header *)hvc_buf; + } + + vcons_info.hdr = hdr = ioremap_nocache(vcons_hdr_addr, + sizeof(struct vcons_buf)); + if (!hdr) { + printk(KERN_ERR "unable to map vcons header\n"); + err = -ENOMEM; + goto error; + } + + if (dbg) + dump_vcons_hdr(hdr); + + if (readl(&hdr->host_magic) != MIC_HOST_VCONS_READY) { + printk(KERN_ERR "host not ready, giving up\n"); + err = -ENODEV; + goto error; + } + + host_rb_ver = readw(&hdr->host_rb_ver); + mic_rb_ver = micscif_rb_get_version(); + writew(mic_rb_ver, &hdr->mic_rb_ver); + if (host_rb_ver != mic_rb_ver) { + printk(KERN_ERR "Card and host ring buffer versions mismatch."); + printk(KERN_ERR "Card ver: %d, Host ver: %d \n", mic_rb_ver, + host_rb_ver); + writel(MIC_VCONS_RB_VER_ERR, &hdr->mic_magic); + err = -ENXIO; + goto error; + } + memcpy_fromio(&tmp_hdr, hdr, sizeof(struct vcons_buf)); + + if (!(vcons_info.vcons_op_buf = ioremap_nocache(tmp_hdr.o_buf_dma_addr, + tmp_hdr.o_size))) { + printk(KERN_ERR "unable to map vcons output buffer\n"); + err = -ENOMEM; + goto error; + } + + tmp_hdr.i_hdr_addr = virt_to_phys(vcons_info.mic_hdr); + tmp_hdr.i_buf_addr = virt_to_phys(vcons_info.vcons_ip_buf); + + if (card_type == MIC_KNC) + tmp_hdr.i_size = PAGE_SIZE; + else + tmp_hdr.i_size = PAGE_SIZE/2; + + micscif_rb_init(&mic_out_buf, (volatile uint32_t *)&vcons_info.mic_hdr->o_rd, + (volatile uint32_t *)&hdr->o_wr, + (volatile uint32_t *)vcons_info.vcons_op_buf, + tmp_hdr.o_size); + + micscif_rb_init(&mic_in_buf, + (volatile uint32_t *)&hdr->i_rd, + (volatile uint32_t *)&vcons_info.mic_hdr->i_wr, + (volatile uint32_t *)vcons_info.vcons_ip_buf, + tmp_hdr.i_size); + + mic_cons_init(); + hp = hvc_alloc(MIC_COOKIE, 2, &hvc_mic_ops, 128); + + if (IS_ERR(hp)) { + printk(KERN_ERR "unable to allocate hvc console\n"); + err = PTR_ERR(hp); + } else { + writeq(tmp_hdr.i_hdr_addr, &hdr->i_hdr_addr); + writeq(tmp_hdr.i_buf_addr, &hdr->i_buf_addr); + writel(tmp_hdr.i_size, &hdr->i_size); + writel(MIC_VCONS_READY, &hdr->mic_magic); + if (dbg) + dump_vcons_hdr(hdr); + + return 0; + } +error: + if (hdr) + iounmap(hdr); + if (vcons_info.vcons_op_buf) + iounmap(vcons_info.vcons_op_buf); +#if defined(CONFIG_MK1OM) + free_page((unsigned long)vcons_info.vcons_ip_buf); + kfree(vcons_info.mic_hdr); +#else + free_page((unsigned long)vcons_info.mic_hdr); +#endif + return err; +} + +static void __exit hvc_mic_exit(void) +{ + char buf[8]; + int ret, len; + + writel(0, &vcons_info.hdr->mic_magic); + + do { + len = micscif_rb_count(&mic_in_buf, sizeof(buf)); + ret = micscif_rb_get_next(&mic_in_buf, buf, + min(len, (int)sizeof(buf))); + } while (ret > 0); + + iounmap(vcons_info.hdr); + iounmap(vcons_info.vcons_op_buf); +#if defined(CONFIG_MK1OM) + free_page((unsigned long)vcons_info.vcons_ip_buf); + kfree(vcons_info.mic_hdr); +#else + 
free_page((unsigned long)vcons_info.mic_hdr); +#endif + if (hp) + hvc_remove(hp); +} + +MODULE_PARM_DESC(vcons_hdr_addr, "mic address of vcons hdr"); +module_param(vcons_hdr_addr, long, S_IRUGO); +module_param(dbg, int, S_IRUGO); +MODULE_LICENSE("GPL"); +module_init(hvc_mic_init); +module_exit(hvc_mic_exit); + diff --git a/virtio/Kbuild b/virtio/Kbuild new file mode 100644 index 0000000..a0033e5 --- /dev/null +++ b/virtio/Kbuild @@ -0,0 +1,2 @@ +obj-m += mic_virtblk.o + diff --git a/virtio/mic_virtblk.c b/virtio/mic_virtblk.c new file mode 100644 index 0000000..356b48f --- /dev/null +++ b/virtio/mic_virtblk.c @@ -0,0 +1,862 @@ +/* + virtio block device adapted for MIC. + copied from drivers/block/virtio_blk.c of Linux kernel + It is initially commited by + Rusty Russell 2007-10-21 18:03:38 + with SHA1 ID, e467cde238184d1b0923db2cd61ae1c5a6dc15aa + + drivers/block/virtio_blk.c of Linux kernel does not have copyright notice. + + * For adapting to MIC + * (C) Copyright 2012 Intel Corporation + * Author: Caz Yokoyama + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + */ +//#define DEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mic_common.h" +#include "mic/micveth_dma.h" +#include "mic/micscif_intr.h" +#include "mic/mic_virtio.h" + +#define SBOX_MMIO_LENGTH (64 * 1024) + +#define PART_BITS 4 + +#define VIRTQUEUE_LENGTH 128 +#define MIC_VRING_ALIGN PAGE_SIZE + +#define INTERRUPT_ID_FOR_VIRTBLK 3 + +extern int get_sbox_irq(int index); + +static int major, index = 0; +static long virtio_addr = 0; +static mic_data_t virtblk_mic_data; + +struct virtio_blk +{ + spinlock_t lock; + + struct virtio_device *vdev; + struct virtqueue *vq; + + /* The disk structure for the kernel. */ + struct gendisk *disk; + + /* Request tracking. */ + struct list_head reqs; + + mempool_t *pool; + + /* virtual address of blk_config */ + void __iomem *ioaddr; + + /* What host tells us, plus 2 for header & tailer. */ + unsigned int sg_elems; + + /* sbox va */ + u8 *sbox; + + /* Scatterlist: can be too big for stack. */ + struct scatterlist sg[/*sg_elems*/]; +}; + +struct virtblk_req +{ + struct list_head list; + struct request *req; + struct virtio_blk_outhdr out_hdr; + struct virtio_scsi_inhdr in_hdr; + u8 status; +}; + +#define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) + +/* The following vring_virtqueue and to_vvq() are copied from virtio_ring.c. Please name sure you have the same structure + as in virtio_ring.c. The reason why they are copied is that I don't want to change virtio_ring.c which is a symbolic link. +*/ +struct vring_virtqueue +{ + struct virtqueue vq; + + /* Actual memory layout for this queue */ + struct vring vring; + + /* Other side has made a mess, don't try any more. */ + bool broken; + + /* Host supports indirect buffers */ + bool indirect; + + /* Number of free buffers */ + unsigned int num_free; + /* Head of free buffer list. */ + unsigned int free_head; + /* Number we've added since last sync. 
*/ + unsigned int num_added; + + /* Last used index we've seen. */ + u16 last_used_idx; + + /* How to notify other side. FIXME: commonalize hcalls! */ + void (*notify)(struct virtqueue *vq); + +#ifdef DEBUG + /* They're supposed to lock for us. */ + unsigned int in_use; +#endif + + struct _mic_ctx_t *mic_ctx; + /* Tokens for callbacks. */ + void *data[]; +}; + +#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) + +static void blk_done(struct virtqueue *vq) +{ + struct virtio_blk *vblk = vq->vdev->priv; + struct virtblk_req *vbr; + unsigned int len; + unsigned long flags; + + spin_lock_irqsave(&vblk->lock, flags); + while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { + int error; + + switch (vbr->status) { + case VIRTIO_BLK_S_OK: + error = 0; + break; + case VIRTIO_BLK_S_UNSUPP: + error = -ENOTTY; + break; + default: + error = -EIO; + break; + } + + if (blk_pc_request(vbr->req)) { + vbr->req->resid_len = vbr->in_hdr.residual; + vbr->req->sense_len = vbr->in_hdr.sense_len; + vbr->req->errors = vbr->in_hdr.errors; + } + + __blk_end_request_all(vbr->req, error); + list_del(&vbr->list); + mempool_free(vbr, vblk->pool); + } + /* In case queue is stopped waiting for more buffers. */ + blk_start_queue(vblk->disk->queue); + spin_unlock_irqrestore(&vblk->lock, flags); +} + +static bool do_req(struct request_queue *q, struct virtio_blk *vblk, + struct request *req) +{ + unsigned long num, out = 0, in = 0; + struct virtblk_req *vbr; + + vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); + if (!vbr) + /* When another request finishes we'll try again. */ + return false; + + vbr->req = req; + + if (req->cmd_flags & REQ_FLUSH) { + vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + } else { + switch (req->cmd_type) { + case REQ_TYPE_FS: + vbr->out_hdr.type = 0; + vbr->out_hdr.sector = blk_rq_pos(vbr->req); + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + break; + case REQ_TYPE_BLOCK_PC: + vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + break; + case REQ_TYPE_SPECIAL: + vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID; + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + break; + default: + /* We don't put anything else in the queue. */ + BUG(); + } + } + + sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); + + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information before the normal inhdr. 
+ */ + if (blk_pc_request(vbr->req)) + sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); + + num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); + + if (blk_pc_request(vbr->req)) { + sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96); + sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, + sizeof(vbr->in_hdr)); + } + + sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, + sizeof(vbr->status)); + + if (num) { + if (rq_data_dir(vbr->req) == WRITE) { + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + out += num; + } else { + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + in += num; + } + } + + if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) { + mempool_free(vbr, vblk->pool); + return false; + } + + list_add_tail(&vbr->list, &vblk->reqs); + return true; +} + +static void do_virtblk_request(struct request_queue *q) +{ + struct virtio_blk *vblk = q->queuedata; + struct request *req; + unsigned int issued = 0; + + while ((req = blk_peek_request(q)) != NULL) { + BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); + + /* If this request fails, stop queue and wait for something to + finish to restart it. */ + if (!do_req(q, vblk, req)) { + blk_stop_queue(q); + break; + } + blk_start_request(req); + issued++; + } + + if (issued) + virtqueue_kick(vblk->vq); +} + +static int +set_capacity_from_host(struct virtio_blk *vblk) +{ + struct virtio_device *vdev = vblk->vdev; + u64 cap; + + /* Host must always specify the capacity. */ + vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), + &cap, sizeof(cap)); + if (cap == 0) { + printk(KERN_ERR "Have you set virtblk file?\n"); + return -ENXIO; + } + + /* If capacity is too big, truncate with warning. */ + if ((sector_t)cap != cap) { + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", + (unsigned long long)cap); + cap = (sector_t)-1; + } + set_capacity(vblk->disk, cap); + + return 0; +} + +static int +virtblk_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + struct virtio_blk *vblk = disk->private_data; + + return set_capacity_from_host(vblk); +} + +static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long data) +{ + struct gendisk *disk = bdev->bd_disk; + struct virtio_blk *vblk = disk->private_data; + + /* + * Only allow the generic SCSI ioctls if the host can support it. 
+ */ + if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) + return -ENOTTY; + + return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, + (void __user *)data); +} + +/* We provide getgeo only to please some old bootloader/partitioning tools */ +static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + struct virtio_blk *vblk = bd->bd_disk->private_data; + struct virtio_blk_geometry vgeo; + int err; + + /* see if the host passed in geometry config */ + err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY, + offsetof(struct virtio_blk_config, geometry), + &vgeo); + + if (!err) { + geo->heads = vgeo.heads; + geo->sectors = vgeo.sectors; + geo->cylinders = vgeo.cylinders; + } else { + /* some standard values, similar to sd */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + } + return 0; +} + +static const struct block_device_operations virtblk_fops = { + .open = virtblk_open, + .ioctl = virtblk_ioctl, + .owner = THIS_MODULE, + .getgeo = virtblk_getgeo, +}; + +static int index_to_minor(int index) +{ + return index << PART_BITS; +} + +static inline bool more_used(const struct vring_virtqueue *vq) +{ + return vq->last_used_idx != vq->vring.used->idx; +} + +static irqreturn_t +mic_virtblk_intr_handler(int irq, void *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!more_used(vq)) { + pr_debug("virtqueue interrupt with no work for %p\n", vq); + goto _exit_; + } + + if (unlikely(vq->broken)) + goto _exit_; + + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); + if (vq->vq.callback) + vq->vq.callback(&vq->vq); + + _exit_: + return IRQ_HANDLED; +} + +static int __devinit virtblk_probe(struct virtio_device *vdev) +{ + struct virtio_blk *vblk; + struct request_queue *q; + int err; + u32 v, blk_size, sg_elems, opt_io_size; + u16 min_io_size; + u8 physical_block_exp, alignment_offset; + struct board_info *bd_info = virtblk_mic_data.dd_bi[0]; + struct vb_shared *vb_shared; + + if (index_to_minor(index) >= 1 << MINORBITS) + return -ENOSPC; + + vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared; + vdev->features[0] = readl(&vb_shared->host_features); + + /* We need to know how many segments before we allocate. */ + err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX, + offsetof(struct virtio_blk_config, seg_max), + &sg_elems); + if (err) + sg_elems = 1; + + /* We need an extra sg elements at head and tail. */ + sg_elems += 2; + vdev->priv = vblk = kmalloc(sizeof(*vblk) + + sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL); + if (!vblk) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&vblk->reqs); + spin_lock_init(&vblk->lock); + vblk->vdev = vdev; + vblk->sg_elems = sg_elems; + sg_init_table(vblk->sg, vblk->sg_elems); + + /* map sbox */ + vblk->sbox = ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH); + if (!vblk->sbox) { + printk(KERN_ERR "%s: NULL SBOX ptr\n", __func__); + err = -ENOMEM; + goto out_free_vblk; + } + + /* We expect one virtqueue, for output. 
*/ + vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); + if (IS_ERR(vblk->vq)) { + err = PTR_ERR(vblk->vq); + goto out_unmap_sbox; + } + + if ((err = request_irq(get_sbox_irq(VIRTIO_SBOX_INT_IDX), + mic_virtblk_intr_handler, IRQF_DISABLED, + "virtio intr", vblk->vq))) { + printk(KERN_ERR "%s: can't register interrupt: %d\n", __func__, err); + goto out_free_vq; + } + + vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); + if (!vblk->pool) { + err = -ENOMEM; + goto out_free_irq; + } + + /* FIXME: How many partitions? How long is a piece of string? */ + vblk->disk = alloc_disk(1 << PART_BITS); + if (!vblk->disk) { + err = -ENOMEM; + goto out_mempool; + } + + q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); + if (!q) { + err = -ENOMEM; + goto out_put_disk; + } + + q->queuedata = vblk; + + if (index < 26) { + sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); + } else if (index < (26 + 1) * 26) { + sprintf(vblk->disk->disk_name, "vd%c%c", + 'a' + index / 26 - 1, 'a' + index % 26); + } else { + const unsigned int m1 = (index / 26 - 1) / 26 - 1; + const unsigned int m2 = (index / 26 - 1) % 26; + const unsigned int m3 = index % 26; + sprintf(vblk->disk->disk_name, "vd%c%c%c", + 'a' + m1, 'a' + m2, 'a' + m3); + } + + vblk->disk->major = major; + vblk->disk->first_minor = index_to_minor(index); + vblk->disk->private_data = vblk; + vblk->disk->fops = &virtblk_fops; + vblk->disk->driverfs_dev = NULL; // There is no parent device. + index++; + + /* configure queue flush support */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) + blk_queue_flush(q, REQ_FLUSH); + + /* If disk is read-only in the host, the guest should obey */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) { + if (vdev->config->get_features(vdev) & (1U << VIRTIO_BLK_F_RO)) { + set_disk_ro(vblk->disk, 1); + } + } + + err = set_capacity_from_host(vblk); + if (err) + goto out_put_disk; + + /* We can handle whatever the host told us to handle. */ + blk_queue_max_segments(q, vblk->sg_elems-2); + + /* No need to bounce any requests */ + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + + /* No real sector limit. */ + blk_queue_max_hw_sectors(q, -1U); + + /* Host can optionally specify maximum segment size and number of + * segments. 
*/ + err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX, + offsetof(struct virtio_blk_config, size_max), + &v); + if (!err) + blk_queue_max_segment_size(q, v); + else + blk_queue_max_segment_size(q, -1U); + + /* Host can optionally specify the block size of the device */ + err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, + offsetof(struct virtio_blk_config, blk_size), + &blk_size); + if (!err) + blk_queue_logical_block_size(q, blk_size); + else + blk_size = queue_logical_block_size(q); + + /* Use topology information if available */ + err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, + offsetof(struct virtio_blk_config, physical_block_exp), + &physical_block_exp); + if (!err && physical_block_exp) + blk_queue_physical_block_size(q, + blk_size * (1 << physical_block_exp)); + + err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, + offsetof(struct virtio_blk_config, alignment_offset), + &alignment_offset); + if (!err && alignment_offset) + blk_queue_alignment_offset(q, blk_size * alignment_offset); + + err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, + offsetof(struct virtio_blk_config, min_io_size), + &min_io_size); + if (!err && min_io_size) + blk_queue_io_min(q, blk_size * min_io_size); + + err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, + offsetof(struct virtio_blk_config, opt_io_size), + &opt_io_size); + if (!err && opt_io_size) + blk_queue_io_opt(q, blk_size * opt_io_size); + + add_disk(vblk->disk); + return 0; + +out_put_disk: + put_disk(vblk->disk); +out_mempool: + mempool_destroy(vblk->pool); +out_free_irq: + free_irq(get_sbox_irq(VIRTIO_SBOX_INT_IDX), vblk->vq); +out_free_vq: + vdev->config->del_vqs(vdev); +out_unmap_sbox: + iounmap(vblk->sbox); +out_free_vblk: + kfree(vblk); +out: + return err; +} + +static void __devexit virtblk_remove(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + + /* Nothing should be pending. */ + BUG_ON(!list_empty(&vblk->reqs)); + + free_irq(get_sbox_irq(VIRTIO_SBOX_INT_IDX), vblk->vq); + + /* Stop all the virtqueues. */ + vdev->config->reset(vdev); + + del_gendisk(vblk->disk); + blk_cleanup_queue(vblk->disk->queue); + put_disk(vblk->disk); + mempool_destroy(vblk->pool); + vdev->config->del_vqs(vdev); + iounmap(vblk->sbox); + kfree(vblk); +} + +/* config->get_features() implementation */ +static u32 virtblk_get_features(struct virtio_device *vdev) +{ + /* When someone needs more than 32 feature bits, we'll need to + * steal a bit to indicate that the rest are somewhere else. */ + struct board_info *bd_info = virtblk_mic_data.dd_bi[0]; + struct vb_shared *vb_shared; + + vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared; + return readl(&vb_shared->host_features); +} + +/* virtio config->finalize_features() implementation */ +static void virtblk_finalize_features(struct virtio_device *vdev) +{ + struct board_info *bd_info = virtblk_mic_data.dd_bi[0]; + struct vb_shared *vb_shared; + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* We only support 32 feature bits. 
*/ + BUILD_BUG_ON(ARRAY_SIZE(vdev->features) != 1); + + vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared; + writel(vdev->features[0], &vb_shared->client_features); +} + +/* config->get() implementation */ +static void virtblk_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct board_info *bd_info = virtblk_mic_data.dd_bi[0]; + struct vb_shared *vb_shared; + void *ioaddr; + u8 *ptr = buf; + int i; + + vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared; + ioaddr = (void *)&vb_shared->blk_config + offset; + for (i = 0; i < len; i++) + ptr[i] = readb(ioaddr + i); +} + +static void virtblk_reset(struct virtio_device *vdev) +{ +} + +/* the notify function used when creating a virt queue */ +static void virtblk_notify(struct virtqueue *vq) +{ + const int doorbell = 2; + struct virtio_blk *vblk = vq->vdev->priv; + uint32_t db_reg; + + /* Ring host doorbell interrupt */ + db_reg = readl(vblk->sbox + (SBOX_SDBIC0 + (4 * doorbell))) + | SBOX_SDBIC0_DBREQ_BIT; + writel(db_reg, vblk->sbox + (SBOX_SDBIC0 + (4 * doorbell))); +} + +/* the config->del_vqs() implementation */ +static void virtblk_del_vqs(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + unsigned long size; + + size = PAGE_ALIGN(vring_size(VIRTQUEUE_LENGTH, MIC_VRING_ALIGN)); + free_pages_exact(vblk->vq->priv, size); + + vring_del_virtqueue(vblk->vq); + vblk->vq = NULL; +} + +/* the config->find_vqs() implementation */ +static int virtblk_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + struct virtio_blk *vblk = vdev->priv; + struct virtqueue *vq; + int err; + unsigned long size; + void *queue; /* the virtual address of the ring queue */ + struct vring_virtqueue *vvq; + struct vring *vring; + struct board_info *bd_info = virtblk_mic_data.dd_bi[0]; + + BUG_ON(nvqs != 1); + BUG_ON(vblk == NULL); + + size = PAGE_ALIGN(vring_size(VIRTQUEUE_LENGTH, MIC_VRING_ALIGN)); + queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); + if (queue == NULL) { + err = -ENOMEM; + goto out_info; + } + + /* create the vring */ + vq = vring_new_virtqueue(VIRTQUEUE_LENGTH, MIC_VRING_ALIGN, + vdev, queue, virtblk_notify, callbacks[0], names[0]); + if (vq == NULL) { + err = -ENOMEM; + goto out_activate_queue; + } + vq->priv = queue; + + vqs[0] = vblk->vq = vq; + + vvq = to_vvq(vq); + vring = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared->vring; + writel(vvq->vring.num, &vring->num); + writeq(virt_to_phys(vvq->vring.desc), &vring->desc); + writeq(virt_to_phys(vvq->vring.avail), &vring->avail); + writeq(virt_to_phys(vvq->vring.used), &vring->used); + + return 0; + +out_activate_queue: + free_pages_exact(queue, size); +out_info: + return err; +} + +static struct virtio_config_ops virtio_blk_config_ops = { + .get = virtblk_get, + // .set = vp_set, + // .get_status = vp_get_status, + // .set_status = vp_set_status, + .reset = virtblk_reset, + .find_vqs = virtblk_find_vqs, + .del_vqs = virtblk_del_vqs, + .get_features = virtblk_get_features, + .finalize_features = virtblk_finalize_features, +}; + +static unsigned int features[] = { + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, + VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY +}; + +/* + * virtio_blk causes spurious section mismatch warning by + * simultaneously referring to a __devinit and a __devexit function. + * Use __refdata to avoid this warning. 
+ */ +static struct virtio_driver __refdata virtio_blk = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, +}; + +struct class block_class = { + .name = "block", +}; + +static struct device_type disk_type = { + .name = "disk", + /* + .groups = disk_attr_groups, + .release = disk_release, + .devnode = block_devnode, + */ +}; + +static int __init init(void) +{ + bd_info_t *bd_info; + struct virtio_device *vdev; + struct mic_virtblk *mic_virtblk; + int ret; + struct vb_shared *vb_shared; + +#ifdef CONFIG_ML1OM + printk(KERN_ERR "virtio block device is not available on KNF\n"); + ret = -ENODEV; + goto error_return; +#endif + major = register_blkdev(0, "virtblk"); + if (major < 0) { + ret = major; + goto error_return; + } + + bd_info = kmalloc(sizeof(bd_info_t), GFP_KERNEL); + if (bd_info == NULL) { + ret = -ENOMEM; + goto error_return; + } + memset(bd_info, 0, sizeof(*bd_info)); + virtblk_mic_data.dd_numdevs = 1; + index = 0; + virtblk_mic_data.dd_bi[0] = bd_info; + bd_info->bi_ctx.bi_id = 0; + + mic_virtblk = kmalloc(sizeof(*mic_virtblk), GFP_KERNEL); + if (mic_virtblk == NULL) { + ret = -ENOMEM; + goto free_bd_info; + } + memset(mic_virtblk, 0, sizeof(*mic_virtblk)); + bd_info->bi_virtio = (void *)mic_virtblk; + + if (virtio_addr == 0) { + printk(KERN_ERR "virtio address is not passed from host\n"); + return -ENODEV; + goto free_mic_virtblk; + } + vb_shared = ioremap_nocache(virtio_addr, sizeof(*vb_shared)); + if (vb_shared == NULL) { + ret = -ENODEV; + goto free_mic_virtblk; + } + vb_shared->update = true; + mic_virtblk->vb_shared = vb_shared; + + vdev = kmalloc(sizeof(*vdev), GFP_KERNEL); + if (vdev == NULL) { + ret = -ENOMEM; + goto free_mic_virtblk; + } + memset(vdev, 0, sizeof(*vdev)); + vdev->config = &virtio_blk_config_ops; + INIT_LIST_HEAD(&vdev->vqs); + vdev->dev.driver = &virtio_blk.driver; + vdev->dev.class = &block_class; + vdev->dev.type = &disk_type; + device_initialize(&vdev->dev); + mic_virtblk->vdev = (void *)vdev; + + return virtblk_probe(vdev); + + free_mic_virtblk: + kfree(bd_info->bi_virtio); + free_bd_info: + kfree(bd_info); + error_return: + return ret; +} + +static void __exit fini(void) +{ + bd_info_t *bd_info = virtblk_mic_data.dd_bi[0]; + struct mic_virtblk *mic_virtblk = (struct mic_virtblk *)bd_info->bi_virtio; + + unregister_blkdev(major, "virtblk"); + virtblk_remove(mic_virtblk->vdev); + iounmap(mic_virtblk->vb_shared); + kfree(mic_virtblk->vdev); + kfree(bd_info->bi_virtio); + kfree(bd_info); +} +module_init(init); +module_exit(fini); + +MODULE_DESCRIPTION("Virtio block driver"); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(virtio_addr, "address of virtio related structure"); +module_param(virtio_addr, long, S_IRUGO); diff --git a/vnet/Kbuild b/vnet/Kbuild new file mode 100644 index 0000000..492d0ca --- /dev/null +++ b/vnet/Kbuild @@ -0,0 +1,3 @@ +obj-m += intel_micveth.o + +intel_micveth-objs := micveth.o micveth_param.o micveth_dma.o diff --git a/vnet/mic.h b/vnet/mic.h new file mode 100644 index 0000000..bfd5e81 --- /dev/null +++ b/vnet/mic.h @@ -0,0 +1,108 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#ifndef MICDLDR_H +#define MICDLDR_H + +#define MIC_DECONS_DISABLE 0 +#define MIC_DECONS_ENABLE 1 + +typedef struct mic_upload { + int up_brdnum; + int up_uossize; + char *up_uosbuf; + int up_dcons; + int up_uoslog; + int up_uosreserve; +} mic_upload_t; + +typedef struct mic_sys_config { + int sc_numCards; +} mic_sys_config_t; + +#define UOS_NOT_BOOTED 0 +#define UOS_BOOTING 1 +#define UOS_BOOT_FAILED 2 +#define UOS_BOOT_SUCCEED 3 +#define UOS_RUNNING 4 +#define UOS_WEDGED 5 +#define UOS_UNKNOWN 6 + +#define PCI_VENDOR_INTEL 0x8086 + +#define PCI_SPEED_GEN1 1 +#define PCI_SPEED_GEN2 2 + +#define GDDR_VENDOR_SAMSUNG 1 +#define GDDR_VENDOR_QIMONDA 2 +#define GDDR_VENDOR_HYNIX 6 + +#define GDDR_DENSITY_512MB 0 +#define GDDR_DENSITY_1GB 1 + +typedef struct mic_brd_config { + int bc_brdnum; + struct { + char step[4]; + int freqMhz; + int vid; + int uvolts; + } bc_core; + struct { + unsigned short vendor; + unsigned short device; + unsigned int class; + char capableSpeed; + char capableWidth; + char currentSpeed; + char currentWidth; + } bc_pcie; + struct { + char vendor; + char density; + char fifoDepth; + short freq; // MT/sec + int size; // Mbytes + } bc_gddr; + int bc_uOSstate; +} mic_brd_config_t; + +#define MIC_UPLOAD_UOS _IOWR('l', 1, struct mic_upload) +#define MIC_RESET_UOS _IOWR('l', 2, int) +#define MIC_SYS_CONFIG _IOWR('l', 3, struct mic_sys_config) +#define MIC_BRD_CONFIG _IOWR('l', 4, struct mic_brd_config) + +#endif // MICDLDR_H + diff --git a/vnet/micveth.c b/vnet/micveth.c new file mode 100644 index 0000000..5ad96a9 --- /dev/null +++ b/vnet/micveth.c @@ -0,0 +1,869 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include + +#include "mic/micveth.h" + +#define PWR_MGMT_NO_POLL_AFTER_LINKS_UP 1 + +/* #define HOST */ +#define SBOX_MMIO_LENGTH (64 * 1024) + +/* Host - Card link initialization rotocol + * Card comes up and writes MICVETH_LINK_UP_MAGIC to scratch 14 & 15 + * Host detects that the card side interface is up and writes the + * 1) address of the tx/rx descriptor ring buffer to scratch 14 & 15 + * 2) last 2 octets of the MAC address (allows the host to identify + * the board number based on its mac address) + */ + +/* Host - Card descriptor queue/ring buffer (from the perspective of the host) + * + * There is a transmit and a receive queue. Each queue entry has + * a physical address and a length. + * + * Packet transmission + * The host adds a queue entry with the physical address of the skb and its + * length and updates the write pointer. The receive side on the card sees the + * new entry, allocates a new skb, maps the host's skb, copies it to a locally + * allocated skb and updates the read pointer. The host side later frees up skbs + * starting from a cached read pointer upto the read pointer + * + * Packet reception + * The host "posts" skbs to the rx queue. The transmit routine on the card + * copies its local skb to the host skb, updates the write pointer and frees + * its local skb + */ + +/* Vnet interrupts are now functional (with vnet=dma module parameter). In the + main flow of the driver all polling in the interrupt mode has been + eliminated. However, polling is still happening in clientpoll() routine which + tracks if the link is up or down. This can also be replaced by an interrupt + driven mechanism which will be done in the future. Apart from this, only + limited testing has been done in the interrupt mode, especially with respect + to sharing the interrupt with scif. Therefore, for now the default mode of + operation is still left as poll in micstart. 
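+
+   Editor's recap of the card-side link bring-up described above (a sketch of
+   code found elsewhere in this file, not additional code):
+
+     writel(MICVETH_LINK_UP_MAGIC, vi_sbox + SBOX_SCRATCH14);
+     writel(MICVETH_LINK_UP_MAGIC, vi_sbox + SBOX_SCRATCH15);
+         ... host overwrites both scratch registers ...
+     scratch14 = readl(vi_sbox + SBOX_SCRATCH14);   - low 32 bits of ring address
+     scratch15 = readl(vi_sbox + SBOX_SCRATCH15);   - high bits plus 2 MAC octets
+     ring_ptr  = ioremap_nocache(ring_phys, sizeof(veth_ring_t));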
+*/ + +#define SBOX_SDBIC0_DBREQ_BIT 0x80000000 + + +#ifdef HOST +#else +struct skb_node { + struct list_head list; + struct sk_buff *skb; +}; + +/* List of skbs to be transmitted - global for now assumes KN* has a single interface */ +struct list_head skb_list; +LIST_HEAD(skb_list); +#endif + +static void _micveth_process_descriptors(micveth_info_t *veth_info); + +#ifdef HOST +#else +static int micveth_xmit_enqueue(struct sk_buff *skb, struct net_device *dev, micveth_info_t *veth_info); +static int micveth_xmit_dequeue(struct net_device *dev, micveth_info_t *veth_info); +static struct sk_buff *dequeue_skb(micveth_info_t *veth_info); +static void micvnet_tx_dequeue_handler(struct work_struct *work); + +int micveth_start(mic_ctx_t *mic_ctx); +void micveth_stop(mic_ctx_t *mic_ctx); +static int micveth_start_dev(struct net_device *dev); +static int micveth_stop_dev(struct net_device *dev); +#endif + +static void micveth_clientpoll(struct work_struct *work); +static void micveth_poll(struct work_struct *work); +static irqreturn_t micvnet_host_intr_handler(int irq, void *cookie); +static void micvnet_intr_bh_handler(struct work_struct *work); +static void micveth_send_intr(micveth_info_t *veth_info); +int get_sbox_irq(int index); + +#ifdef HOST +#else +static mic_ctx_t mic_ctx_g; +#endif + +micveth_t micveth; + +static int +micveth_set_address(struct net_device *dev, void *p) +{ + struct sockaddr *sa = p; + + if (!is_valid_ether_addr(sa->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN); + return 0; +} + +static void +micveth_multicast_list(struct net_device *dev) +{ +} + +#ifdef HOST +#else +/* Enqueues an skb for transmission. This is necessary because micveth_xmit is called in + interrupt context and we cannot call ioremap_nocache from interrupt context. 
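+
+   Editor's outline of the resulting split (a recap of the code below, not
+   additional code):
+     micveth_xmit (atomic context): kmalloc(GFP_ATOMIC), add the skb to
+       skb_list under vi_txlock, queue_work(vi_wq, vi_txws);
+     micvnet_tx_dequeue_handler (workqueue): dequeue_skb(), then
+       ioremap_nocache(), memcpy() and iounmap() into the host-posted buffer.
+   Calls that may not run in interrupt context are therefore confined to the
+   workqueue side.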
*/ +static int +micveth_xmit_enqueue(struct sk_buff *skb, struct net_device *dev, micveth_info_t *veth_info) +{ + struct skb_node *new_node = kmalloc(sizeof(*new_node), GFP_ATOMIC); + + if (!new_node) + return ENOMEM; + new_node->skb = skb; + spin_lock(&veth_info->vi_txlock); + list_add_tail(&new_node->list, &skb_list); + spin_unlock(&veth_info->vi_txlock); + return 0; +} + +/* Dequeues a skb enqueued by micveth_xmit_enqueue */ +static struct sk_buff * +dequeue_skb(micveth_info_t *veth_info) +{ + struct sk_buff *skb = NULL; + struct skb_node *skb_node = NULL; + + spin_lock_bh(&veth_info->vi_txlock); + if (!list_empty(&skb_list)) + { + skb_node = list_entry(skb_list.next, struct skb_node , list); + list_del(&skb_node->list); + skb = skb_node->skb; + } + spin_unlock_bh(&veth_info->vi_txlock); + + if (skb_node) + kfree(skb_node); + return skb; +} + +/* Transmits skbs that have been enqueued by the by micveth_xmit_enqueue */ +static int +micveth_xmit_dequeue(struct net_device *dev, micveth_info_t *veth_info) +{ + veth_ring_t *ring; + ring_queue_t *tx_queue; + ring_desc_t *desc; + int next_tail; + void *dst; + struct sk_buff *skb; + + while ((skb = dequeue_skb(veth_info))) { + ring = veth_info->ring_ptr; + tx_queue = &ring->r_rx; + + next_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length; + if (next_tail == tx_queue->rq_head) { + printk(KERN_WARNING "dropping packet\n"); + /* queue_full situation - just drop the packet and let the stack retry */ + return 1; + } + + desc = &tx_queue->rq_descs[tx_queue->rq_tail]; + dst = ioremap_nocache(desc->rd_phys, skb->len); + if (!dst) { + tx_queue->rq_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length; + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + continue; + } + desc->rd_length = skb->len; + desc->rd_valid = 1; + memcpy(dst, skb->data, skb->len); + /* + * Need a write memory barrier between copying the skb data to + * the buffer and updating the tail pointer. NOT an smp_wmb(), + * because this memory barrier needs to be done even if there is + * a single CPU in the system. + * + * No need for the serializing request (Si bug workaround in + * KNF), since the buffer exists in host memory. If the buffer + * lives in card memory, and this code is running on the host, we + * would need extra barriers and a "serializing request" on any write. 
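+	 *
+	 * Editor's illustration of the pairing (a sketch, not additional code):
+	 *
+	 *   card (producer, this loop)              host (consumer)
+	 *     memcpy(dst, skb->data, skb->len);       wait until rq_head != rq_tail;
+	 *     wmb();                                  read rq_descs[rq_head] and data;
+	 *     rq_tail = (rq_tail + 1) % rq_length;    advance rq_head;
+	 *
+	 * Without the barrier the host could observe the advanced tail before the
+	 * copied packet bytes are visible in the descriptor's buffer.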
+ */ + wmb(); + tx_queue->rq_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length; + iounmap(dst); + dev_kfree_skb(skb); + + if (mic_vnet_mode == VNET_MODE_INTR) { + micveth_send_intr(veth_info); + } + } + + return 0; +} + +static void +micvnet_tx_dequeue_handler(struct work_struct *work) +{ + micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_txws); + struct net_device *dev_veth = veth_info->vi_netdev; + + micveth_xmit_dequeue(dev_veth, veth_info); +} +#endif + +#ifdef HOST +#else // card +/* Transmit callback */ +static int +micveth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + micveth_info_t *veth_info; + + if (be16_to_cpu(skb->protocol) == ETH_P_IPV6) { + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + veth_info = &micveth.lv_info[0]; + if (veth_info->vi_state == VETH_STATE_LINKUP) { + if (micveth_xmit_enqueue(skb, dev, veth_info)) { + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + } + } else { + dev_kfree_skb(skb); + } + + /* Reuse the interrupt workqueue to also queue tx dequeue tasks */ + queue_work(veth_info->vi_wq, &veth_info->vi_txws); + + return NETDEV_TX_OK; +} +#endif + +static int +micveth_change_mtu(struct net_device *dev, int new_mtu) +{ + dev->mtu = new_mtu; + return 0; +} + + +/* Start callback */ +static int +micveth_start_dev(struct net_device *dev) +{ + micveth_info_t *veth_info = dev->ml_priv; + + micveth_start(veth_info->mic_ctx); + return 0; +} + +/* Stop callback */ +static int +micveth_stop_dev(struct net_device *dev) +{ + micveth_info_t *veth_info = dev->ml_priv; + + micveth_stop(veth_info->mic_ctx); + return 0; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28) +static const struct net_device_ops veth_netdev_ops = { + .ndo_open = micveth_start_dev, + .ndo_stop = micveth_stop_dev, + .ndo_start_xmit = micveth_xmit, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_multicast_list = micveth_multicast_list, + .ndo_set_mac_address = micveth_set_address, + .ndo_change_mtu = micveth_change_mtu, +}; +#endif + +static void +micveth_setup(struct net_device *dev) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28) + dev->hard_start_xmit = micveth_xmit; + dev->set_multicast_list = micveth_multicast_list; + dev->set_mac_address = micveth_set_address; +#endif + ether_setup(dev); + + /* Initialize the device structure. */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28) + dev->netdev_ops = &veth_netdev_ops; +#endif + dev->destructor = free_netdev; + + /* Fill in device structure with ethernet-generic values. 
*/ + dev->mtu = (MICVETH_MAX_PACKET_SIZE); + dev->tx_queue_len = 0; + dev->flags &= ~IFF_MULTICAST; + random_ether_addr(dev->dev_addr); +} + +static int +micveth_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + return 0; +} + +static struct rtnl_link_ops micveth_link_ops __read_mostly = { + .kind = "micveth", + .setup = micveth_setup, + .validate = micveth_validate, +}; + +static int +micveth_probe_int(micveth_info_t *veth_info, mic_ctx_t *mic_ctx) +{ + struct net_device *dev_veth; + int err = 0; + + veth_info->vi_sbox = ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH); + veth_info->vi_scratch14 = (uint32_t *)(veth_info->vi_sbox + SBOX_SCRATCH14); + veth_info->vi_scratch15 = (uint32_t *)(veth_info->vi_sbox + SBOX_SCRATCH14); + writel(0x55, veth_info->vi_sbox + SBOX_DCR); + + veth_info->mic_ctx = mic_ctx; + mic_ctx->bi_vethinfo = (void *)veth_info; + + spin_lock_init(&veth_info->vi_txlock); + spin_lock_init(&veth_info->vi_rxlock); + + if (mic_vnet_mode == VNET_MODE_POLL) + INIT_DELAYED_WORK(&veth_info->vi_poll, micveth_poll); + + snprintf(veth_info->vi_wqname, sizeof(veth_info->vi_wqname), + "VNET INTR %d", 0); + veth_info->vi_wq = create_singlethread_workqueue(veth_info->vi_wqname); + INIT_WORK(&veth_info->vi_txws, micvnet_tx_dequeue_handler); + + if (mic_vnet_mode == VNET_MODE_INTR) { + if ((err = request_irq(get_sbox_irq(VNET_SBOX_INT_IDX), + micvnet_host_intr_handler, IRQF_DISABLED, + "micveth intr", veth_info))) { + printk(KERN_ERR "%s: interrupt registration failed\n", __func__); + return err; + } + INIT_WORK(&veth_info->vi_bh, micvnet_intr_bh_handler); + } + + // Set the current sk_buff allocation size + veth_info->vi_skb_mtu = MICVETH_MAX_PACKET_SIZE + 32; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) + if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", micveth_setup)) == NULL) { +#else + if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", NET_NAME_UNKNOWN, micveth_setup)) == NULL) { +#endif + return -ENOMEM; + } + + veth_info->vi_netdev = dev_veth; + dev_veth->ml_priv = veth_info; + dev_veth->rtnl_link_ops = &micveth_link_ops; + + if ((err = register_netdev(dev_veth)) < 0) { + printk("register netdev failed %d\n", err); + free_netdev(dev_veth); + return err; + } + + veth_info->vi_state = VETH_STATE_INITIALIZED; + + /* Inform host after completing initialization */ + printk("%s: writing magic to SC14 and SC15\n", __FUNCTION__); + writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH14); + writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH15); + + return 0; +} + +void +micveth_remove_int(mic_ctx_t *mic_ctx) +{ + micveth_stop(mic_ctx); +} + +static int __init +micveth_create_int(int num_bds, struct device *dev) +{ + int bd; + int err = 0; + + printk("micveth_init(%d)\n", num_bds); + + micveth.lv_num_interfaces = num_bds; + micveth.lv_num_clients = num_bds; + micveth.lv_active_clients = 0; + micveth.lv_num_links_remaining = num_bds; + + if ((err = rtnl_link_register(&micveth_link_ops))) { + printk(KERN_ERR "%s: rtnl_link_register failed!\n", __func__); + return err; + } + + // Allocate space for the control of each device in the system. + micveth.lv_info = kmalloc(sizeof(micveth_info_t) * num_bds, GFP_KERNEL); + if (!micveth.lv_info) { + printk(KERN_ERR "%s: micveth_info alloc failed!\n", __func__); + return -ENOMEM; + } + + // Initialize state mutex. 
Overloaded use for several fields. + mutex_init(&micveth.lv_state_mutex); + + // Setup of timer for probeing active mic clients. When the total active board + // count is zero the poll is not running. + micveth.lv_pollstate = CLIENT_POLL_STOPPED; + INIT_DELAYED_WORK(&micveth.lv_poll, micveth_clientpoll); + init_waitqueue_head(&micveth.lv_wq); + + // Init each of the existing boards. + for (bd = 0; bd < num_bds; bd++) { +#ifdef HOST + micveth_probe_int(&micveth.lv_info[bd], &mic_data.dd_bi[bd]->bi_ctx); +#else + micveth_probe_int(&micveth.lv_info[bd], &mic_ctx_g); +#endif + } + + return err; +} + +static void +micveth_exit_int(void) +{ + micveth_info_t *veth_info = &micveth.lv_info[0]; +#ifdef HOST +#endif + micveth_stop(veth_info->mic_ctx); + + destroy_workqueue(veth_info->vi_wq); + rtnl_link_unregister(&micveth_link_ops); + +#ifdef HOST +#else // card + iounmap((void *)veth_info->ring_ptr); + iounmap(veth_info->vi_sbox); +#endif + + kfree(micveth.lv_info); +} + +/* Card side - tell the host that the interface is up */ +static int +micveth_start_int(mic_ctx_t *mic_ctx) +{ + micveth_info_t *veth_info = &micveth.lv_info[mic_ctx->bi_id]; + + // Eventuall (very soon) most of the descriptor allocation for a board will be done here + if (veth_info->vi_state != VETH_STATE_INITIALIZED) + return 0; + + mutex_lock(&micveth.lv_state_mutex); + + if (micveth.lv_pollstate == CLIENT_POLL_STOPPED) { + schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY)); + micveth.lv_pollstate = CLIENT_POLL_RUNNING; + } + + micveth.lv_active_clients++; + mutex_unlock(&micveth.lv_state_mutex); + + veth_info->vi_state = VETH_STATE_LINKDOWN; + + return 0; +} + +/* Card side - tell the host that the interface is down */ +static void +micveth_stop_int(mic_ctx_t *mic_ctx) +{ + micveth_info_t *veth_info = (micveth_info_t *)(mic_ctx->bi_vethinfo); + + if (veth_info->vi_state == VETH_STATE_INITIALIZED) + return; + + mutex_lock(&micveth.lv_state_mutex); + micveth.lv_active_clients--; + veth_info->vi_state = VETH_STATE_INITIALIZED; + + if (micveth.lv_active_clients) { + mutex_unlock(&micveth.lv_state_mutex); + return; + } + + micveth.lv_num_links_remaining = micveth.lv_num_clients; + +#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP + micveth.lv_pollstate = CLIENT_POLL_STOPPED; + mutex_unlock(&micveth.lv_state_mutex); +#else + micveth.lv_pollstate = CLIENT_POLL_STOPPING; + mutex_unlock(&micveth.lv_state_mutex); + wait_event(micveth.lv_wq, micveth.lv_pollstate == CLIENT_POLL_STOPPED); +#endif + +#ifdef HOST +#else // card + writel(MICVETH_LINK_DOWN_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH14); + writel(MICVETH_LINK_DOWN_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH15); +#endif +} + +#ifdef HOST +#else // card +/* Link detection */ +static void +micveth_clientpoll(struct work_struct *work) +{ + micveth_info_t *veth_info; + mic_ctx_t *mic_ctx; + uint32_t scratch14; + uint32_t scratch15; + struct net_device *dev_veth; + veth_info = &micveth.lv_info[0]; + dev_veth = veth_info->vi_netdev; + mic_ctx = veth_info->mic_ctx; + mutex_lock(&micveth.lv_state_mutex); + + if (micveth.lv_pollstate == CLIENT_POLL_STOPPING) { + micveth.lv_pollstate = CLIENT_POLL_STOPPED; + mutex_unlock(&micveth.lv_state_mutex); + wake_up(&micveth.lv_wq); + return; + } + + if (veth_info->vi_state == VETH_STATE_LINKUP) { + scratch14 = readl(veth_info->vi_sbox + SBOX_SCRATCH14); + scratch15 = readl(veth_info->vi_sbox + SBOX_SCRATCH15); + + if ((MICVETH_LINK_DOWN_MAGIC == scratch14) && + (MICVETH_LINK_DOWN_MAGIC == scratch15)) { + veth_info->vi_state = 
VETH_STATE_LINKDOWN; + } + } else { + scratch14 = readl(veth_info->vi_sbox + SBOX_SCRATCH14); + scratch15 = readl(veth_info->vi_sbox + SBOX_SCRATCH15); + + if ((MICVETH_LINK_UP_MAGIC != scratch14) && + (MICVETH_LINK_UP_MAGIC != scratch15)) { + printk("micveth_clientpoll(): SC14 and SC15 changed from MAGIC, I got the RB addresses!\n"); + writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH14); + writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH15); + dev_veth->dev_addr[4] = (scratch15 >> 24) & 0xff; + dev_veth->dev_addr[5] = (scratch15 >> 16) & 0xff; + veth_info->vi_ring.phys = ((uint64_t)(scratch15 & 0xffff) << 32) | scratch14; + veth_info->vi_ring.phys |= (1ULL << 39); + veth_info->vi_ring.length = sizeof(veth_ring_t); + veth_info->ring_ptr = ioremap_nocache(veth_info->vi_ring.phys, veth_info->vi_ring.length); + BUG_ON(veth_info->ring_ptr == NULL); + + printk("micveth_clientpoll(): VETH_STATE_LINKUP\n"); + veth_info->vi_state = VETH_STATE_LINKUP; + if (mic_vnet_mode == VNET_MODE_POLL) { + printk("micveth_clientpoll(): poll for work now !!\n"); + schedule_delayed_work(&veth_info->vi_poll, msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY)); + } + + micveth.lv_num_links_remaining--; + } + } + mutex_unlock(&micveth.lv_state_mutex); + +#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP + if (micveth.lv_num_links_remaining) +#endif + schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY)); +} +#endif +extern struct sk_buff *jsp_dbg1; + +#ifdef HOST +#else // card +static irqreturn_t +micvnet_host_intr_handler(int irq, void *cookie) +{ + micveth_info_t *veth_info = cookie; + queue_work(veth_info->vi_wq, &veth_info->vi_bh); + return IRQ_HANDLED; +} + +/* Ring host doorbell 3 interrupt */ +static void +micveth_send_intr(micveth_info_t *veth_info) +{ + uint32_t db_reg; + + // Ring host doorbell 3 interrupt + db_reg = readl(veth_info->vi_sbox + SBOX_SDBIC3) | SBOX_SDBIC0_DBREQ_BIT; + writel(db_reg, veth_info->vi_sbox + SBOX_SDBIC3); +} + +static void +_micveth_process_descriptors(micveth_info_t *veth_info) +{ + veth_ring_t *ring = veth_info->ring_ptr; + ring_queue_t *rx_queue = &ring->r_tx; + ring_desc_t desc; + struct sk_buff *skb; + void *pkt; + int receive_skb = 0; + int err; + + if (veth_info->vi_state != VETH_STATE_LINKUP) { + return; + } + + spin_lock(&veth_info->vi_rxlock); + + while (rx_queue->rq_head != rx_queue->rq_tail) { + desc = rx_queue->rq_descs[rx_queue->rq_head]; + + veth_info->vi_netdev->stats.rx_packets++; + veth_info->vi_netdev->stats.rx_bytes += desc.rd_length; + + pkt = ioremap_nocache(desc.rd_phys, desc.rd_length); + if (pkt == NULL) { + veth_info->vi_netdev->stats.rx_dropped++; + goto update_ring; + } + + /* handle jumbo frame */ + if (desc.rd_length > ETH_DATA_LEN) + skb = dev_alloc_skb(veth_info->vi_skb_mtu); + else + skb = dev_alloc_skb(ETH_DATA_LEN + 32); + if (skb == NULL) { + veth_info->vi_netdev->stats.rx_dropped++; + iounmap(pkt); + goto update_ring; + } + + memcpy(skb_put(skb,desc.rd_length), pkt, desc.rd_length); + iounmap(pkt); + skb->dev = veth_info->vi_netdev; + skb->protocol = eth_type_trans(skb, skb->dev); + skb->ip_summed = CHECKSUM_NONE; + local_bh_disable(); + err = netif_receive_skb(skb); + err = err; + local_bh_enable(); + /* + * Need a general memory barrier between copying the data from + * the buffer and updating the head pointer. It's the general + * mb() because we're ordering the read of the data with the write. 
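+		 *
+		 * Editor's illustration (a sketch, not additional code): the host
+		 * must not see the advanced rq_head until the packet bytes have been
+		 * read out of the shared buffer, i.e.
+		 *
+		 *   memcpy(skb_put(skb, desc.rd_length), pkt, desc.rd_length);
+		 *   mb();
+		 *   rq_head = (rq_head + 1) % rq_length;   - slot may now be reused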
+ * + * No need for the serializing request (Si bug workaround in + * KNF), since the buffer exists in host memory. If the buffer + * lives in card memory, and this code is running on the host, we + * would need extra barriers and a "serializing request" on any write. + */ + mb(); +update_ring: + rx_queue->rq_head = (rx_queue->rq_head + 1) % rx_queue->rq_length; + receive_skb++; + } + + /* Send intr to TX so that pending SKB's can be freed */ + if (receive_skb && mic_vnet_mode == VNET_MODE_INTR) { + micveth_send_intr(veth_info); + } + + spin_unlock(&veth_info->vi_rxlock); + + if (mic_vnet_mode == VNET_MODE_POLL) { + schedule_delayed_work(&veth_info->vi_poll, msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY)); + } +} + +static void +micvnet_intr_bh_handler(struct work_struct *work) +{ + micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_bh); + _micveth_process_descriptors(veth_info); +} + +static void +micveth_poll(struct work_struct *work) +{ + micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_poll.work); + + _micveth_process_descriptors(veth_info); +} + +#endif + +#ifdef HOST +#else // card +static int __init +micveth_module_init_int(void) +{ + mic_ctx_t *mic_ctx = &mic_ctx_g; + int ret = 0; + + printk("micveth_probe()\n"); + memset(mic_ctx, 0, sizeof(*mic_ctx)); + mic_ctx->bi_id = 0; + + if ((ret = micveth_init(NULL))) + return ret; + if ((ret = micveth_init_legacy(1, NULL))) + return ret; + + return 0; +} + +static void __exit +micveth_module_exit_int(void) +{ + micveth_exit(); +} +#endif + +/* + VNET driver public API. These are simply wrappers which either invoke the old + interrupt/poll mode functions or the new DMA mode functions. These are temporary and + will be phased out with the old interrupt/poll mode so only the DMA mode will be around + eventually. 
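+
+   Editor's summary of the dispatch performed by these wrappers (a recap of
+   the code below, not additional code):
+
+     mic_vnet_mode == VNET_MODE_DMA      -> micvnet_*  (micveth_dma.c)
+     VNET_MODE_INTR and VNET_MODE_POLL   -> the legacy micveth_*_int paths
+
+   e.g. micveth_start() calls micvnet_start() in DMA mode and
+   micveth_start_int() otherwise.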
+ */ +int __init +micveth_init(struct device *dev) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + return micvnet_init(dev); + /* Intr/poll modes use micveth_init_legacy */ + return 0; +} + +int __init +micveth_init_legacy(int num_bds, struct device *dev) +{ + if (mic_vnet_mode != VNET_MODE_DMA) + return micveth_create_int(num_bds, dev); + /* DMA mode uses micveth_create */ + return 0; +} + +void +micveth_exit(void) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_exit(); + else + micveth_exit_int(); +} + +int +micveth_probe(mic_ctx_t *mic_ctx) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + return micvnet_probe(mic_ctx); + /* No support for micveth_probe in legacy intr/poll modes */ + return 0; +} + +void +micveth_remove(mic_ctx_t *mic_ctx) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_remove(mic_ctx); + /* No support for micveth_remove in legacy intr/poll modes */ +} + +int +micveth_start(mic_ctx_t *mic_ctx) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + return micvnet_start(mic_ctx); + else + return micveth_start_int(mic_ctx); +} + +void +micveth_stop(mic_ctx_t *mic_ctx) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_stop(mic_ctx); + else + micveth_stop_int(mic_ctx); +} + +static int __init +micveth_module_init(void) +{ + printk("vnet: mode: %s, buffers: %d\n", + mic_vnet_modes[mic_vnet_mode], vnet_num_buffers); + + if (mic_vnet_mode == VNET_MODE_DMA) + return micvnet_module_init(); + else + return micveth_module_init_int(); +} + +static void __exit +micveth_module_exit(void) +{ + if (mic_vnet_mode == VNET_MODE_DMA) + micvnet_module_exit(); + else + micveth_module_exit_int(); +} + +#ifdef HOST +#else // card +module_init(micveth_module_init); +module_exit(micveth_module_exit); + +MODULE_LICENSE("GPL"); +#endif diff --git a/vnet/micveth_dma.c b/vnet/micveth_dma.c new file mode 100644 index 0000000..c62675b --- /dev/null +++ b/vnet/micveth_dma.c @@ -0,0 +1,1642 @@ + +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. 
Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. + */ + +#include +#include +#include +#include +#include +#include "mic_common.h" +#include "mic/micveth_dma.h" +#include "mic/mic_macaddr.h" + +/* TODO: Clean up shutdown, let DMA's drain */ + +#ifndef HOST +#define SBOX_SDBIC0_DBREQ_BIT 0x80000000 +#define SBOX_MMIO_LENGTH (64 * 1024) +#endif +#define STOP_WAIT_TIMEOUT (4 * HZ) + +#ifndef HOST +static mic_ctx_t mic_ctx_g; +#endif + +struct micvnet micvnet; + + +static void micvnet_send_intr(struct micvnet_info *vnet_info); +static int micvnet_init_msg_rings(struct micvnet_info *vnet_info); +static int micvnet_init_rx_skb_send_msg(struct micvnet_info *vnet_info); +static void micvnet_send_add_dma_buffer_messages(struct micvnet_info *vnet_info); +static void micvnet_stop_ws(struct work_struct *work); +static void micvnet_start_ws(struct work_struct *work); +int get_sbox_irq(int index); + +static __always_inline mic_ctx_t * +vnet_to_ctx(struct micvnet_info *vnet_info) +{ + return vnet_info->mic_ctx; +} + +static __always_inline void +micvnet_wake_queue(struct micvnet_info *vnet_info) +{ + if (atomic_read(&vnet_info->vi_state) == MICVNET_STATE_LINKUP) + netif_wake_queue(vnet_info->vi_netdev); +} + +static __always_inline void +micvnet_dec_cnt_tx_pending(struct micvnet_info *vnet_info) +{ + if (atomic_dec_and_test(&vnet_info->cnt_tx_pending) && + (atomic_read(&vnet_info->vi_state) == MICVNET_STATE_LINK_DOWN)) + wake_up_interruptible(&vnet_info->stop_waitq); +} + + +/*********************************************************** + Pre-allocated "list" of objects which are allocated and deallocated in FIFO + sequence. Allows reservation of memory at init time to prevent mem allocation + failures at run time. */ +static int +list_obj_list_init(int num_obj, size_t obj_size, struct obj_list *list) +{ + list->size = num_obj + 1; + list->obj_size = obj_size; + list->head = list->tail = 0; + + if (!(list->buf = kmalloc(list->size * list->obj_size, GFP_KERNEL))) { + printk(KERN_ERR "%s: list alloc failed\n", __func__); + return -ENOMEM; + } + return 0; +} + +static void +list_obj_list_deinit(struct obj_list *list) +{ + if (list->buf) { + kfree(list->buf); + list->buf = NULL; + } +} + +static void * +list_obj_alloc(struct obj_list *list) +{ + char *obj; + + /* Remove bug_on() here to handle VNET OOO messages. In OOO conditions + * requests to allocate more objects than list->size are possible. 
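+	 *
+	 * Editor's note on the invariant (an illustration, not additional code):
+	 * the obj_list is a ring of size num_obj + 1 with one slot always kept
+	 * empty, so e.g. with num_obj = 3 (size = 4):
+	 *
+	 *   empty: head == tail
+	 *   full : (head + 1) % size == tail   - at most 3 objects outstanding
+	 *
+	 * which is why the check below refuses the allocation rather than letting
+	 * head catch up with tail.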
*/ + if (((list->head + 1) % list->size) == list->tail) { + printk(KERN_ERR "%s: BUG: no free objects in obj list\n", __func__); + return NULL; + } + + obj = list->buf + list->head * list->obj_size; + wmb(); + list->head = (list->head + 1) % list->size; + + return obj; +} + +void +list_obj_free(struct obj_list *list) +{ + /* Remove bug_on() here to handle VNET OOO messages */ + if (list->tail == list->head) { + printk(KERN_ERR "%s: BUG: free too many list objects\n", __func__); + return; + } + + list->tail = (list->tail + 1) % list->size; +} + +/*********************************************************** + * Vnet message functions + */ +#ifdef HOST +static void +micvnet_msg_rb_init(struct micvnet_msg_rb *rb) +{ + rb->head = rb->tail = 0; + rb->size = MICVNET_MSG_RB_SIZE; + rb->prev_head = rb->prev_tail = rb->size - 1; +} + +static void +micvnet_reset_msg_rings(struct micvnet_info *vnet_info) +{ + micvnet_msg_rb_init(vnet_info->vi_qp.tx); + micvnet_msg_rb_init(vnet_info->vi_qp.rx); +} +#endif + +static void +micvnet_msg_rb_write_msg(struct micvnet_info *vnet_info, struct micvnet_msg *msg) +{ + struct micvnet_msg_rb *rb = vnet_info->vi_qp.tx; + + /* The condition below should never occur under normal conditions + because the VNET message ring buffer size is at least 1 greater than + the maximum total number of outstanding messages possible in the + system. However, all bets are off if VNET OOO messages are + seen. Therefore remove the previous bug_on() here and busy wait. */ + while (((rb->head + 1) % rb->size) == rb->tail) + cpu_relax(); + + if (!(rb->head == (rb->prev_head + 1) % rb->size)) + printk(KERN_ERR "BUG: head not equal to prev_head + 1:\n \ + head %d prev_head %d\n", rb->head, rb->prev_head); + + smp_mb(); +#ifdef HOST + rb->buf[rb->head] = *msg; +#else + memcpy_toio(&rb->buf[rb->head], msg, sizeof(*msg)); +#endif + smp_mb(); + serializing_request(&rb->buf[rb->head]); + + rb->prev_head = rb->head; + rb->head = (rb->head + 1) % rb->size; +#ifndef HOST + rb->head = rb->head; +#endif + smp_mb(); + serializing_request(&rb->head); +} + +static int +micvnet_msg_rb_read_msg(struct micvnet_info *vnet_info, struct micvnet_msg *msg) +{ + struct micvnet_msg_rb *rb = vnet_info->vi_qp.rx; + + if (rb->tail == rb->head) + return 1; + + if (!(rb->tail == (rb->prev_tail + 1) % rb->size)) + printk(KERN_ERR "BUG: tail not equal to prev_tail + 1:\n \ + tail %d prev_tail %d\n", rb->tail, rb->prev_tail); + + smp_mb(); +#ifdef HOST + *msg = rb->buf[rb->tail]; +#else + memcpy_fromio(msg, &rb->buf[rb->tail], sizeof(*msg)); +#endif + smp_mb(); + serializing_request(&rb->buf[rb->tail]); + + rb->prev_tail = rb->tail; + rb->tail = (rb->tail + 1) % rb->size; +#ifndef HOST + rb->tail = rb->tail; +#endif + smp_mb(); + serializing_request(&rb->tail); + + return 0; +} + +void +micvnet_msg_send_msg(struct micvnet_info *vnet_info, struct micvnet_msg *msg) +{ + micvnet_msg_rb_write_msg(vnet_info, msg); +#ifdef HOST + if (micpm_get_reference(vnet_to_ctx(vnet_info), true)) + return; +#endif + micvnet_send_intr(vnet_info); +#ifdef HOST + micpm_put_reference(vnet_to_ctx(vnet_info)); +#endif +} + +static void +micvnet_msg_send_add_dma_buffer_msg(struct micvnet_info *vnet_info, + struct rx_node *rnode) +{ + struct micvnet_msg msg; + struct micvnet_msg_add_dma_buffer + *body = &msg.body.micvnet_msg_add_dma_buffer; + + msg.msg_id = MICVNET_MSG_ADD_DMA_BUFFER; + body->buf_phys = rnode->phys; + body->buf_size = rnode->size; + micvnet_msg_send_msg(vnet_info, &msg); +} + +static void +micvnet_msg_recv_add_dma_buffer(struct 
micvnet_info *vnet_info, + struct micvnet_msg_add_dma_buffer *msg) +{ + struct dma_node *dnode; + + /* Remove bug_on() here to handle VNET OOO messages */ + if (!(dnode = list_obj_alloc(&vnet_info->dnode_list))) + return; + + dnode->phys = msg->buf_phys; + dnode->size = msg->buf_size; + + spin_lock(&vnet_info->vi_rxlock); + list_add_tail(&dnode->list, &vnet_info->vi_dma_buf); + spin_unlock(&vnet_info->vi_rxlock); + + atomic_inc(&vnet_info->cnt_dma_buf_avail); + micvnet_wake_queue(vnet_info); +} + +static void +micvnet_msg_send_dma_complete_msg(struct micvnet_info *vnet_info, + struct sched_node *snode) +{ + struct micvnet_msg msg; + struct micvnet_msg_dma_complete + *body = &msg.body.micvnet_msg_dma_complete; + + msg.msg_id = MICVNET_MSG_DMA_COMPLETE; + body->dst_phys = snode->dst_phys; + body->size = snode->skb->len; + body->dma_offset = snode->dma_offset; + micvnet_msg_send_msg(vnet_info, &msg); +} + +/* Handle an unexpected out-of-order message */ +static int +micvnet_msg_handle_ooo_msg(struct micvnet_info *vnet_info, + struct micvnet_msg_dma_complete *msg) +{ + struct micvnet_msg_rb *rb = vnet_info->vi_qp.rx; + struct rx_node *rnode; + struct list_head *pos, *tmpl; + bool found = false; + + rnode = list_entry((&vnet_info->vi_rx_skb)->next, struct rx_node, list); + + /* Normal operation */ + if (rnode->phys == msg->dst_phys + && msg->size <= (rnode->size - 3 * DMA_ALIGNMENT) + && msg->dma_offset < 2 * DMA_ALIGNMENT) + return 0; + + /* Flag that weird stuff's going on */ + printk(KERN_ERR "BUG: Unexpected vnet dma_complete message parameters:\n \ + rnode->phys %p, msg->dst_phys %p\n \ + rnode->size %lld, msg->size %lld, msg->dma_offset %lld\n \ + rx rb head %d tail %d size %d\n", + (char *) rnode->phys, (char *) msg->dst_phys, + rnode->size, msg->size, msg->dma_offset, + rb->head, rb->tail, rb->size); + + /* if message is received in order but with incorrect parameters + (size/dma_offset), drop it, but re-add the rnode at the back of the + rx_skb list, as well as at tx, similar to what is done below for ooo + case. */ + if (rnode->phys == msg->dst_phys) { + list_del(&rnode->list); + list_add_tail(&rnode->list, &vnet_info->vi_rx_skb); + micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode); + vnet_info->vi_netdev->stats.rx_dropped++; + return 1; + } + + /* Start of OOO message processing. First check if the message has + * really been received OOO. If it is completely unknown to us we just + * drop it and go on. */ + list_for_each(pos, &vnet_info->vi_rx_skb) { + rnode = list_entry(pos, struct rx_node, list); + if (rnode->phys == msg->dst_phys) { + found = true; + break; + } + } + + if (!found) { + vnet_info->vi_netdev->stats.rx_dropped++; + return 1; + } + + vnet_info->vi_netdev->stats.rx_errors++; + + /* Skip all the rnode's till we find the one we are looking for. Rather + * than free rnode skb's and reallocate them, and therby risk allocation + * failures, we simply delete the rnode's from their current position on + * the rnode list and re-add them at back of the list, as well as add + * them back at tx. 
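+	 *
+	 * Editor's worked example (an illustration, not additional code): if
+	 * vi_rx_skb currently holds A -> B -> C and the dma_complete message
+	 * names C, then A and B are moved to the back of the list and
+	 * re-announced to the peer via micvnet_msg_send_add_dma_buffer_msg(),
+	 * leaving C at the head so the normal completion path can treat it as
+	 * the in-order buffer.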
*/ + list_for_each_safe(pos, tmpl, &vnet_info->vi_rx_skb) { + rnode = list_entry(pos, struct rx_node, list); + if (rnode->phys == msg->dst_phys) + break; + + list_del(&rnode->list); + list_add_tail(&rnode->list, &vnet_info->vi_rx_skb); + micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode); + } + + return 0; +} + +static void +micvnet_msg_recv_dma_complete(struct micvnet_info *vnet_info, + struct micvnet_msg_dma_complete *msg) +{ + struct rx_node *rnode; + struct sk_buff *skb; + + vnet_info->vi_netdev->stats.rx_packets++; + + if (micvnet_msg_handle_ooo_msg(vnet_info, msg)) + return; + + rnode = list_entry((&vnet_info->vi_rx_skb)->next, struct rx_node, list); + /* Our OOO message handling guarantees that rnode->phys == msg->dst_phys */ + + vnet_info->vi_netdev->stats.rx_bytes += msg->size; + list_del(&rnode->list); + + spin_lock_bh(&vnet_info->vi_txlock); + if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP) { + spin_unlock_bh(&vnet_info->vi_txlock); + goto skip_adding_new_buffers; + } + atomic_inc(&vnet_info->cnt_tx_pending); + spin_unlock_bh(&vnet_info->vi_txlock); + + /* OOM handling: check if a new SKB can be allocated. If not, we will re-add the + old SKB to TX and not give it to the network stack, i.e. drop it */ + if (micvnet_init_rx_skb_send_msg(vnet_info)) { + list_add_tail(&rnode->list, &vnet_info->vi_rx_skb); + micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode); + micvnet_dec_cnt_tx_pending(vnet_info); + vnet_info->vi_netdev->stats.rx_dropped++; + return; + } + micvnet_dec_cnt_tx_pending(vnet_info); + +skip_adding_new_buffers: + skb = rnode->skb; + skb_reserve(skb, msg->dma_offset); + skb_put(skb, msg->size); + skb->dev = vnet_info->vi_netdev; + skb->protocol = eth_type_trans(skb, skb->dev); + skb->ip_summed = CHECKSUM_NONE; + + local_bh_disable(); + netif_receive_skb(skb); + local_bh_enable(); + +#ifdef HOST + mic_ctx_unmap_single(vnet_to_ctx(vnet_info), rnode->phys, rnode->size); +#endif + kfree(rnode); +} + +static void +micvnet_msg_send_link_down_msg(struct work_struct *work) +{ + struct micvnet_info *vnet_info + = container_of(work, struct micvnet_info, vi_ws_link_down); + struct micvnet_msg msg; + msg.msg_id = MICVNET_MSG_LINK_DOWN; + micvnet_msg_send_msg(vnet_info, &msg); +} + +static void +micvnet_msg_recv_msg_link_down(struct micvnet_info *vnet_info) +{ + atomic_set(&vnet_info->vi_state, MICVNET_STATE_BEGIN_UNINIT); + + if (vnet_info->link_down_initiator) + wake_up_interruptible(&vnet_info->stop_waitq); + else + schedule_work(&vnet_info->vi_ws_stop); +} + +static void +micvnet_msg_send_link_up_msg(struct micvnet_info *vnet_info) +{ + struct micvnet_msg msg; + struct micvnet_msg_link_up + *body = &msg.body.micvnet_msg_link_up; + + msg.msg_id = MICVNET_MSG_LINK_UP; + body->vnet_driver_version = VNET_DRIVER_VERSION; + micvnet_msg_send_msg(vnet_info, &msg); +} + +static void +micvnet_msg_recv_msg_link_up(struct micvnet_info *vnet_info, + struct micvnet_msg_link_up *msg) +{ + if (msg->vnet_driver_version != VNET_DRIVER_VERSION) { + printk(KERN_ERR "%s: Error: vnet driver version mismatch: " + "expected %d actual %lld\n" + "Ensure that host and card modules are " + "from the same build.\n", + __func__, VNET_DRIVER_VERSION, + msg->vnet_driver_version); + return; + } +#ifdef HOST + schedule_work(&vnet_info->vi_ws_start); +#else + micvnet_send_add_dma_buffer_messages(vnet_info); +#endif +} + +static void +micvnet_msg_process_messages(struct micvnet_info *vnet_info) +{ + struct micvnet_msg msg; + +#ifdef HOST + micpm_get_reference(vnet_to_ctx(vnet_info), true); 
+#endif + while (!micvnet_msg_rb_read_msg(vnet_info, &msg)) { + switch(msg.msg_id) { + case MICVNET_MSG_ADD_DMA_BUFFER: + micvnet_msg_recv_add_dma_buffer + (vnet_info, + &msg.body.micvnet_msg_add_dma_buffer); + break; + + case MICVNET_MSG_DMA_COMPLETE: + micvnet_msg_recv_dma_complete + (vnet_info, + &msg.body.micvnet_msg_dma_complete); + break; + + case MICVNET_MSG_LINK_DOWN: + micvnet_msg_recv_msg_link_down(vnet_info); + break; + + case MICVNET_MSG_LINK_UP: + micvnet_msg_recv_msg_link_up(vnet_info, + &msg.body.micvnet_msg_link_up); + break; + + default: + printk(KERN_ERR "BUG: unknown vnet msg id: %lld\n", msg.msg_id); + break; + } + } +#ifdef HOST + micpm_put_reference(vnet_to_ctx(vnet_info)); +#endif +} + +/*********************************************************** + * Interrupts + */ +#ifdef HOST +static int +micvnet_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell) +{ + struct micvnet_info *vnet_info; + vnet_info = mic_ctx->bi_vethinfo; + + queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_bh); + return 0; +} +#else +static irqreturn_t +micvnet_host_intr_handler(int irq, void *data) +{ + struct micvnet_info *vnet_info = data; + queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_bh); + return IRQ_HANDLED; +} +#endif + +static void +micvnet_intr_bh_handler(struct work_struct *work) +{ + struct micvnet_info *vnet_info + = container_of(work, struct micvnet_info, vi_ws_bh); + + micvnet_msg_process_messages(vnet_info); +} + +#ifdef HOST +static void +micvnet_send_intr(struct micvnet_info *vnet_info) +{ + mic_ctx_t *mic_ctx = vnet_info->mic_ctx; + mic_send_vnet_intr(mic_ctx); +} +#else +/* Ring host doorbell 3 interrupt */ +static void +micvnet_send_intr(struct micvnet_info *vnet_info) +{ + uint32_t db_reg; + + /* Ring host doorbell 3 interrupt */ + db_reg = readl(vnet_info->vi_sbox + SBOX_SDBIC3) + | SBOX_SDBIC0_DBREQ_BIT; + writel(db_reg, vnet_info->vi_sbox + SBOX_SDBIC3); +} +#endif + +/*********************************************************** + * Net device ops and rtnl link ops + */ +/* + Do nothing in ndo_open and ndo_stop. There are two reasons for this: + 1. Since host and card side drivers are driver pairs, if ifconfig up or + ifconfig down occurs on one side this needs to be communicated to the other + side other side otherwise in the current implementation this can bring down + the system. Ignoring ifconfig up or down avoids this issue. + 2. For now, micvnet_init is called before the dma can be initialized. However, + as soon as micvnet_init has been called and netdev has been created, the OS + can invoke .ndo_open, which however requires the DMA to have been + initialized. But DMA can not be initialized until later (at present after + the card has booted). + Therefore we ourselves call micvnet_start and micvnet_stop at appropriate + times when we are ready for them. The only consequence is all packets till + micvnet_start has been invoked will be dropped in ndo_start_xmit. + */ + +/* Start callback */ +static int +micvnet_start_dev(struct net_device *dev) +{ + struct micvnet_info *vnet_info = dev->ml_priv; + + /* Stop the queue till the state becomes LINKUP. The queue will be started when + dma buffers are added in micvnet_msg_recv_add_dma_buffer(). Not doing this + results in packets getting dropped till state is LINKUP. 
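+
+	   Editor's sketch of that wake-up path (a recap, not additional code):
+	     peer sends MICVNET_MSG_ADD_DMA_BUFFER
+	       -> micvnet_msg_recv_add_dma_buffer(): cnt_dma_buf_avail++,
+	       -> micvnet_wake_queue(): netif_wake_queue() once vi_state is
+	          MICVNET_STATE_LINKUP.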
*/ + if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP) + netif_stop_queue(vnet_info->vi_netdev); + + return 0; +} + +/* Stop callback */ +static int +micvnet_stop_dev(struct net_device *dev) +{ + return 0; +} + +static void +micvnet_dma_cb_bh(struct work_struct *work) +{ + struct micvnet_info + *vnet_info = container_of(work, struct micvnet_info, vi_ws_dmacb); + struct sched_node *snode; + + if (!atomic_read(&vnet_info->cnt_dma_complete)) + return; + + do { + spin_lock_bh(&vnet_info->vi_txlock); + snode = list_entry((&vnet_info->vi_sched_skb)->next, + struct sched_node, list); + list_del(&snode->list); + spin_unlock_bh(&vnet_info->vi_txlock); + + micvnet_msg_send_dma_complete_msg(vnet_info, snode); + + micvnet_dec_cnt_tx_pending(vnet_info); +#ifdef HOST + mic_ctx_unmap_single(vnet_to_ctx(vnet_info), + snode->dma_src_phys, snode->dma_size); + micpm_put_reference(vnet_to_ctx(vnet_info)); +#endif + kfree_skb(snode->skb); + kfree(snode); + + } while (!atomic_dec_and_test(&vnet_info->cnt_dma_complete)); +} + +static void +micvnet_dma_completion_callback(uint64_t data) +{ + struct micvnet_info *vnet_info = (struct micvnet_info *) data; + + atomic_inc(&vnet_info->cnt_dma_complete); + + queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_dmacb); +} + +static int +micvnet_do_dma(struct micvnet_info *vnet_info, struct sched_node *snode) +{ + uint64_t dma_src, dma_dst; + int ret = 0; + + dma_src = snode->dma_src_phys; + dma_dst = ALIGN(snode->dst_phys, DMA_ALIGNMENT); + snode->dma_offset = (snode->skb->data - snode->skb_data_aligned) + + (dma_dst - snode->dst_phys); + if ((ret = request_dma_channel(vnet_info->dma_chan))) + goto err_exit; + + ret = do_dma(vnet_info->dma_chan, + DO_DMA_INTR, + dma_src, + dma_dst, + snode->dma_size, + &vnet_info->dma_cb); + + free_dma_channel(vnet_info->dma_chan); + +err_exit: + return ret; +} + +static int +micvnet_schedule_dma(struct micvnet_info *vnet_info) +{ + struct tx_node *tnode; + struct sched_node *snode; + struct dma_node *dnode; + struct sk_buff *skb; + int ret = 0; + /* tnode */ + spin_lock_bh(&vnet_info->vi_txlock); + BUG_ON(list_empty(&vnet_info->vi_tx_skb)); + tnode = list_entry((&vnet_info->vi_tx_skb)->next, + struct tx_node, list); + list_del(&tnode->list); + spin_unlock_bh(&vnet_info->vi_txlock); + skb = tnode->skb; + kfree(tnode); + +#ifdef HOST + if ((ret = micpm_get_reference(vnet_to_ctx(vnet_info), true))) + goto err_exit_no_dec_node_refcnt; +#endif + + /* dnode */ + spin_lock(&vnet_info->vi_rxlock); + BUG_ON(list_empty(&vnet_info->vi_dma_buf)); + dnode = list_entry((&vnet_info->vi_dma_buf)->next, + struct dma_node, list); + spin_unlock(&vnet_info->vi_rxlock); + if (dnode->size < skb->len + 3 * DMA_ALIGNMENT) { + ret = -ENOMEM; + goto err_exit; + } + + /* snode */ + if (!(snode = kmalloc(sizeof(*snode), GFP_KERNEL))) { + ret = -ENOMEM; + goto err_exit; + } + snode->skb = skb; + snode->dst_phys = dnode->phys; + snode->skb_data_aligned + = (unsigned char *) ((uint64_t) skb->data & ~(DMA_ALIGNMENT - 1)); + snode->dma_size + = ALIGN((skb->len + (skb->data - snode->skb_data_aligned)), + DMA_ALIGNMENT); +#ifdef HOST + snode->dma_src_phys = mic_ctx_map_single(vnet_to_ctx(vnet_info), + snode->skb_data_aligned, + snode->dma_size); + if (mic_map_error(snode->dma_src_phys)) { + kfree(snode); + ret = -ENOMEM; + goto err_exit; + } +#else + snode->dma_src_phys = virt_to_phys(snode->skb_data_aligned); +#endif + + if ((ret = micvnet_do_dma(vnet_info, snode))) { +#ifdef HOST + mic_ctx_unmap_single(vnet_to_ctx(vnet_info), + snode->dma_src_phys, 
snode->dma_size); +#endif + kfree(snode); + goto err_exit; + } + + /* Update snode/dnode lists only after all operations have successfully + completed and no further errors are possible */ + spin_lock_bh(&vnet_info->vi_txlock); + list_add_tail(&snode->list, &vnet_info->vi_sched_skb); + spin_unlock_bh(&vnet_info->vi_txlock); + + spin_lock(&vnet_info->vi_rxlock); + list_del(&dnode->list); + spin_unlock(&vnet_info->vi_rxlock); + list_obj_free(&vnet_info->dnode_list); + + vnet_info->vi_netdev->stats.tx_packets++; + vnet_info->vi_netdev->stats.tx_bytes += skb->len; + + return ret; + +err_exit: +#ifdef HOST + micpm_put_reference(vnet_to_ctx(vnet_info)); +err_exit_no_dec_node_refcnt: +#endif + micvnet_dec_cnt_tx_pending(vnet_info); + atomic_inc(&vnet_info->cnt_dma_buf_avail); + micvnet_wake_queue(vnet_info); + skb->dev->stats.tx_dropped++; + kfree_skb(skb); + return ret; +} + +static void +micvnet_schedule_dmas(struct work_struct *work) +{ + struct micvnet_info *vnet_info + = container_of(work, struct micvnet_info, vi_ws_tx); + volatile bool tx_skb_list_empty; + while (1) { + spin_lock_bh(&vnet_info->vi_txlock); + tx_skb_list_empty = list_empty(&vnet_info->vi_tx_skb); + spin_unlock_bh(&vnet_info->vi_txlock); + if (tx_skb_list_empty) + break; + + micvnet_schedule_dma(vnet_info); + } +} + +int +micvnet_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct micvnet_info *vnet_info = (struct micvnet_info*)dev->ml_priv; + struct tx_node *tnode; + if (!vnet_info || !atomic_read(&vnet_info->cnt_dma_buf_avail)){ + goto err_exit; + } + + if (!(tnode = kmalloc(sizeof(*tnode), GFP_ATOMIC))) + goto err_exit; + tnode->skb = skb; + + spin_lock(&vnet_info->vi_txlock); + if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP) + goto err_exit_unlock; + list_add_tail(&tnode->list, &vnet_info->vi_tx_skb); + atomic_inc(&vnet_info->cnt_tx_pending); + spin_unlock(&vnet_info->vi_txlock); + + queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_tx); + + if (atomic_dec_and_test(&vnet_info->cnt_dma_buf_avail)) + netif_stop_queue(vnet_info->vi_netdev); + + return NETDEV_TX_OK; + +err_exit_unlock: + kfree(tnode); + spin_unlock(&vnet_info->vi_txlock); +err_exit: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) +static void +micvnet_multicast_list(struct net_device *dev) +{ +} +#endif + +static int +micvnet_set_address(struct net_device *dev, void *p) +{ + struct sockaddr *sa = p; + + if (!is_valid_ether_addr(sa->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN); + return 0; +} + +#define MIN_MTU 68 +#define MAX_MTU MICVNET_MAX_MTU + +static int +micvnet_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < MIN_MTU || new_mtu > MAX_MTU) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +union serial { + uint32_t regs[3]; + char string[13]; +}; + +void +mic_get_serial_from_dbox(struct micvnet_info *vni, char *serialnum) +{ + union serial serial; +#ifdef HOST + serial.regs[0] = DBOX_READ(vni->mic_ctx->mmio.va, DBOX_SWF1X0); + serial.regs[1] = DBOX_READ(vni->mic_ctx->mmio.va, DBOX_SWF1X1); + serial.regs[2] = DBOX_READ(vni->mic_ctx->mmio.va, DBOX_SWF1X2); +#else + serial.regs[0] = readl(vni->vi_dbox + DBOX_SWF1X0); + serial.regs[1] = readl(vni->vi_dbox + DBOX_SWF1X1); + serial.regs[2] = readl(vni->vi_dbox + DBOX_SWF1X2); +#endif + serial.string[12] = '\0'; + strcpy(serialnum, serial.string); +} + +int +micvnet_setmac_from_serial(struct net_device *dev) +{ + struct micvnet_info *vni = (struct 
micvnet_info *)dev->ml_priv; + char serialnum[17]; + int err; + + mic_get_serial_from_dbox(vni, serialnum); +#ifdef HOST + err = mic_get_mac_from_serial(serialnum, dev->dev_addr, 1); +#else + err = mic_get_mac_from_serial(serialnum, dev->dev_addr, 0); +#endif + return err; +} + +static const struct net_device_ops micvnet_netdev_ops = { + .ndo_open = micvnet_start_dev, + .ndo_stop = micvnet_stop_dev, + .ndo_start_xmit = micvnet_xmit, + .ndo_validate_addr = eth_validate_addr, +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) + .ndo_set_multicast_list = micvnet_multicast_list, +#endif + .ndo_set_mac_address = micvnet_set_address, + .ndo_change_mtu = micvnet_change_mtu, +}; + +static void +micvnet_setup(struct net_device *dev) +{ + ether_setup(dev); + + /* Initialize the device structure. */ + dev->netdev_ops = &micvnet_netdev_ops; + dev->destructor = free_netdev; + + /* Fill in device structure with ethernet-generic values. */ + dev->mtu = MICVNET_MAX_MTU; + dev->flags &= ~IFF_MULTICAST; +} + +static struct rtnl_link_ops micvnet_link_ops __read_mostly = { + .kind = "micvnet", + .setup = micvnet_setup, +}; + +/*********************************************************** + * Vnet init/deinit + */ +static int +micvnet_init_hw_regs(struct micvnet_info *vnet_info) +{ +#ifdef HOST + mic_ctx_t *mic_ctx = vnet_info->mic_ctx; + + vnet_info->vi_pdev = mic_ctx->bi_pdev; + vnet_info->vi_sbox = (uint8_t *)((unsigned long) mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS); + vnet_info->vi_scratch14 + = (uint32_t *)((unsigned long)mic_ctx->mmio.va + + HOST_SBOX_BASE_ADDRESS + SBOX_SCRATCH14); +#else + vnet_info->vi_sbox = ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH); + vnet_info->vi_dbox = ioremap_nocache(DBOX_BASE, SBOX_MMIO_LENGTH); + if (!vnet_info->vi_sbox) { + printk(KERN_ERR "%s: NULL SBOX ptr\n", __func__); + return -ENOMEM; + } + vnet_info->vi_scratch14 + = (uint32_t *)(vnet_info->vi_sbox + SBOX_SCRATCH14); +#endif + return 0; +} + +static void +micvnet_deinit_hw_regs(struct micvnet_info *vnet_info) +{ +#ifndef HOST + iounmap(vnet_info->vi_sbox); + iounmap(vnet_info->vi_dbox); +#endif +} + +static int +micvnet_init_interrupts(struct micvnet_info *vnet_info) +{ + mic_ctx_t *mic_ctx = vnet_info->mic_ctx; + int ret = 0; + + spin_lock_init(&vnet_info->vi_txlock); + spin_lock_init(&vnet_info->vi_rxlock); + + snprintf(vnet_info->vi_wqname, sizeof(vnet_info->vi_wqname), + "VNET WQ %d", mic_ctx->bi_id); + + if (!(vnet_info->vi_wq = + __mic_create_singlethread_workqueue(vnet_info->vi_wqname))) { + printk(KERN_ERR "%s: create_singlethread_workqueue\n", __func__); + return -ENOMEM; + } + init_waitqueue_head(&vnet_info->stop_waitq); + + INIT_WORK(&vnet_info->vi_ws_bh, micvnet_intr_bh_handler); + INIT_WORK(&vnet_info->vi_ws_tx, micvnet_schedule_dmas); + INIT_WORK(&vnet_info->vi_ws_dmacb, micvnet_dma_cb_bh); + INIT_WORK(&vnet_info->vi_ws_link_down, micvnet_msg_send_link_down_msg); + INIT_WORK(&vnet_info->vi_ws_stop, micvnet_stop_ws); + INIT_WORK(&vnet_info->vi_ws_start, micvnet_start_ws); +#ifdef HOST + if ((ret = mic_reg_irqhandler(mic_ctx, 3, "Host DoorBell 3", + micvnet_host_doorbell_intr_handler))) { +#else + if ((ret = request_irq(get_sbox_irq(VNET_SBOX_INT_IDX), + micvnet_host_intr_handler, IRQF_DISABLED, + "vnet intr", vnet_info))) { +#endif + printk(KERN_ERR "%s: interrupt registration failed\n", __func__); + goto err_exit_destroy_workqueue; + } + return 0; + +err_exit_destroy_workqueue: + destroy_workqueue(vnet_info->vi_wq); + return ret; +} + +static void +micvnet_deinit_interrupts(struct micvnet_info 
*vnet_info) +{ +#ifdef HOST + mic_unreg_irqhandler(vnet_info->mic_ctx, 3, "Host DoorBell 3"); +#else + free_irq(get_sbox_irq(VNET_SBOX_INT_IDX), vnet_info); +#endif + destroy_workqueue(vnet_info->vi_wq); +} + + +static int +micvnet_init_netdev(struct micvnet_info *vnet_info) +{ + struct net_device *dev_vnet; + int ret = 0; + + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)) + if ((dev_vnet = (struct net_device *)alloc_netdev(sizeof(struct micvnet_info), "mic%d", + NET_NAME_UNKNOWN, micvnet_setup)) == NULL) { +#else + if ((dev_vnet = (struct net_device *)alloc_netdev(sizeof(struct micvnet_info), "mic%d", + micvnet_setup)) == NULL) { +#endif + printk(KERN_ERR "%s: alloc_netdev failed\n", __func__); + return -ENOMEM; + } + + vnet_info->vi_netdev = dev_vnet; + dev_vnet->ml_priv = vnet_info; + + if (micvnet_setmac_from_serial(dev_vnet)) + random_ether_addr(dev_vnet->dev_addr); + + dev_vnet->rtnl_link_ops = &micvnet_link_ops; + + if ((ret = register_netdev(dev_vnet)) < 0) { + printk(KERN_ERR "%s: register_netdev failed %d\n", __func__, ret); + free_netdev(dev_vnet); + return ret; + } + + return 0; +} + +static int +micvnet_init_msg_rings(struct micvnet_info *vnet_info) +{ +#ifdef HOST + vnet_info->vi_qp.tx = &vnet_info->vi_rp.rb_tx; + vnet_info->vi_qp.rx = &vnet_info->vi_rp.rb_rx; + micvnet_reset_msg_rings(vnet_info); + + vnet_info->vi_rp_phys = mic_ctx_map_single(vnet_to_ctx(vnet_info), + &vnet_info->vi_rp, + sizeof(vnet_info->vi_rp)); + if (mic_map_error(vnet_info->vi_rp_phys)) { + printk(KERN_ERR "%s: mic_map_error failed\n", __func__); + return -ENOMEM; + } +#else + if (!(vnet_info->vi_rp_phys = vnet_addr)) { + printk(KERN_ERR "%s: null vnet_addr\n", __func__); + return -ENOMEM; + } + vnet_info->ring_ptr + = ioremap_nocache(vnet_info->vi_rp_phys, + sizeof(struct micvnet_msg_ring_pair)); + if (!vnet_info->ring_ptr) { + printk(KERN_ERR "%s: NULL ring ptr\n", __func__); + return -ENOMEM; + } + vnet_info->vi_qp.tx = &vnet_info->ring_ptr->rb_rx; + vnet_info->vi_qp.rx = &vnet_info->ring_ptr->rb_tx; +#endif + return 0; +} + +static void +micvnet_deinit_msg_rings(struct micvnet_info *vnet_info) +{ +#ifdef HOST + mic_ctx_unmap_single(vnet_to_ctx(vnet_info), + vnet_info->vi_rp_phys, sizeof(vnet_info->vi_rp)); +#else + iounmap(vnet_info->ring_ptr); +#endif +} + +static int +micvnet_init_lists(struct micvnet_info *vnet_info) +{ + int ret; + if ((ret = list_obj_list_init(VNET_MAX_SKBS, sizeof(struct dma_node), + &vnet_info->dnode_list))) + return ret; + + INIT_LIST_HEAD(&vnet_info->vi_rx_skb); + INIT_LIST_HEAD(&vnet_info->vi_dma_buf); + INIT_LIST_HEAD(&vnet_info->vi_tx_skb); + INIT_LIST_HEAD(&vnet_info->vi_sched_skb); + return 0; +} + +static void +micvnet_deinit_lists(struct micvnet_info *vnet_info) +{ + struct list_head *pos, *tmpq; + struct rx_node *rnode; + struct tx_node *tnode; + struct dma_node *dnode; + struct sched_node *snode; + + list_for_each_safe(pos, tmpq, &vnet_info->vi_rx_skb) { + rnode = list_entry(pos, struct rx_node, list); + list_del(&rnode->list); +#ifdef HOST + mic_ctx_unmap_single(vnet_to_ctx(vnet_info), + rnode->phys, rnode->size); +#endif + kfree_skb(rnode->skb); + kfree(rnode); + } + + list_for_each_safe(pos, tmpq, &vnet_info->vi_dma_buf) { + dnode = list_entry(pos, struct dma_node, list); + list_del(&dnode->list); + list_obj_free(&vnet_info->dnode_list); + } + + list_for_each_safe(pos, tmpq, &vnet_info->vi_tx_skb) { + tnode = list_entry(pos, struct tx_node, list); + list_del(&tnode->list); + kfree_skb(tnode->skb); + kfree(tnode); + } + + list_for_each_safe(pos, tmpq, 
&vnet_info->vi_sched_skb) { + snode = list_entry(pos, struct sched_node, list); + list_del(&snode->list); +#ifdef HOST + mic_ctx_unmap_single(vnet_to_ctx(vnet_info), snode->dma_src_phys, + snode->dma_size); + micpm_put_reference(vnet_to_ctx(vnet_info)); +#endif + kfree_skb(snode->skb); + kfree(snode); + } + + list_obj_list_deinit(&vnet_info->dnode_list); +} +static int +micvnet_init_dma(struct micvnet_info *vnet_info) +{ + mic_ctx_t *mic_ctx = vnet_info->mic_ctx; + int ret; + + /* Note: open_dma_device must use mic_ctx->dma_handle since that is + used in the isr */ +#ifdef HOST + if (micpm_get_reference(mic_ctx, true) != 0) { + printk(KERN_ERR "%s: micpm_get_reference failed\n", __func__); + return -ENODEV; + } + + if ((ret = open_dma_device(mic_ctx->bi_id + 1, + mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS, + &mic_ctx->dma_handle))) { + printk(KERN_ERR "%s: open_dma_device failed\n", __func__); + micpm_put_reference(mic_ctx); + return ret; + } + micpm_put_reference(mic_ctx); +#else + if ((ret = open_dma_device(0, 0, &mic_ctx->dma_handle))) { + printk(KERN_ERR "%s: open_dma_device failed\n", __func__); + return ret; + } +#endif + + vnet_info->dma_handle = mic_ctx->dma_handle; + + if ((ret = allocate_dma_channel(vnet_info->dma_handle, + &vnet_info->dma_chan))) { + printk(KERN_ERR "%s: allocate_dma_channel failed\n", __func__); + goto err_exit_close_dma; + } + free_dma_channel(vnet_info->dma_chan); + vnet_info->dma_cb.dma_completion_func = micvnet_dma_completion_callback; + vnet_info->dma_cb.cb_cookie = (uint64_t) vnet_info; + atomic_set(&vnet_info->cnt_dma_complete, 0); + atomic_set(&vnet_info->cnt_dma_buf_avail, 0); + vnet_info->link_down_initiator = false; + atomic_set(&vnet_info->cnt_tx_pending, 0); + return 0; + +err_exit_close_dma: + close_dma_device(mic_ctx->bi_id + 1, &vnet_info->dma_handle); + return ret; +} + +static void +micvnet_deinit_dma(struct micvnet_info *vnet_info) +{ + mic_ctx_t *mic_ctx = vnet_info->mic_ctx; + + close_dma_device(mic_ctx->bi_id + 1, &vnet_info->dma_handle); +} +static int +micvnet_alloc_rx_node(struct micvnet_info *vnet_info, struct rx_node **node) +{ + struct rx_node *rnode; + + if (!(rnode = kmalloc(sizeof(*rnode), GFP_KERNEL))) + return -ENOMEM; + + rnode->size = vnet_info->vi_netdev->mtu + 3 * DMA_ALIGNMENT + ETH_HLEN; + + if (!(rnode->skb = dev_alloc_skb(rnode->size))) { + kfree(rnode); + return -ENOMEM; + } + +#ifdef HOST + rnode->phys = mic_ctx_map_single(vnet_to_ctx(vnet_info), + rnode->skb->data, rnode->size); + if (mic_map_error(rnode->phys)) { + kfree_skb(rnode->skb); + kfree(rnode); + return -ENOMEM; + } +#else + rnode->phys = virt_to_phys(rnode->skb->data); +#endif + + *node = rnode; + + return 0; +} + +static int +micvnet_init_rx_skb_send_msg(struct micvnet_info *vnet_info) +{ + struct rx_node *rnode; + int ret = 0; + + if (unlikely(ret = micvnet_alloc_rx_node(vnet_info, &rnode))) + return ret; + + list_add_tail(&rnode->list, &vnet_info->vi_rx_skb); + + micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode); + + return 0; +} + +static int +micvnet_init_rx_skbs(struct micvnet_info *vnet_info) +{ + struct rx_node *rnode; + int i, ret = 0; + + + if ( (vnet_num_buffers <= 0) || (vnet_num_buffers > VNET_MAX_SKBS) ) + vnet_num_buffers = VNET_MAX_SKBS; + + for (i = 0; i < vnet_num_buffers; i++) { + if (unlikely(ret = micvnet_alloc_rx_node(vnet_info, &rnode))) + return ret; + + list_add_tail(&rnode->list, &vnet_info->vi_rx_skb); + } + + return ret; +} + +static void +micvnet_send_add_dma_buffer_messages(struct micvnet_info *vnet_info) +{ + struct rx_node 
*rnode; + struct list_head *pos; + + list_for_each(pos, &vnet_info->vi_rx_skb) { + rnode = list_entry(pos, struct rx_node, list); + micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode); + } +} + +static void +micvnet_initiate_link_down(struct micvnet_info *vnet_info) +{ + int ret; + netif_tx_disable(vnet_info->vi_netdev); + spin_lock_bh(&vnet_info->vi_txlock); + atomic_set(&vnet_info->vi_state, MICVNET_STATE_LINK_DOWN); + spin_unlock_bh(&vnet_info->vi_txlock); + + /* This wait precludes this function to be called from the context of + * the vnet wq thread */ + ret = wait_event_interruptible_timeout( + vnet_info->stop_waitq, + (atomic_read(&vnet_info->cnt_tx_pending) == 0), + STOP_WAIT_TIMEOUT); + if (!ret) + printk(KERN_ERR "%s timeout waiting for Tx dma buffers to drain\n", __func__); + /* To avoid introducing a lock in micvnet_msg_send_msg() send the + * LINK_DOWN message from vnet wq thread context. LINK_DOWN will be the + * LAST message sent. */ + queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_link_down); +} + +static void +micvnet_stop_deinit(struct micvnet_info *vnet_info) +{ + flush_workqueue(vnet_info->vi_wq); + atomic_set(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED); + + micvnet_deinit_dma(vnet_info); + micvnet_deinit_lists(vnet_info); +#ifdef HOST + micvnet_reset_msg_rings(vnet_info); +#endif + atomic_dec(&micvnet.lv_active_clients); +} + +int +micvnet_probe(mic_ctx_t *mic_ctx) +{ + struct micvnet_info *vnet_info; + int ret = 0; + + mic_ctx->bi_vethinfo = NULL; + + if (!micvnet.created) + return 1; + + if (!(vnet_info = kzalloc(sizeof(struct micvnet_info), GFP_KERNEL))) { + printk(KERN_ERR "%s: vnet_info alloc failed\n", __func__); + return -ENOMEM; + } + + mic_ctx->bi_vethinfo = vnet_info; + vnet_info->mic_ctx = mic_ctx; + if ((ret = micvnet_init_hw_regs(vnet_info))) + goto err_exit_free_vnet_info; + if ((ret = micvnet_init_msg_rings(vnet_info))) + goto err_exit_deinit_hw_regs; + if ((ret = micvnet_init_interrupts(vnet_info))) + goto err_exit_deinit_msg_rings; + if ((ret = micvnet_init_netdev(vnet_info))) + goto err_exit_deinit_interrupts; + + atomic_set(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED); + return 0; + +err_exit_deinit_interrupts: + micvnet_deinit_interrupts(vnet_info); +err_exit_deinit_msg_rings: + micvnet_deinit_msg_rings(vnet_info); +err_exit_deinit_hw_regs: + micvnet_deinit_hw_regs(vnet_info); +err_exit_free_vnet_info: + kfree(vnet_info); + + return ret; +} + +void +micvnet_remove(mic_ctx_t *mic_ctx) +{ + struct micvnet_info + *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo; + + if (!vnet_info) + return; + + micvnet_stop(mic_ctx); + + vnet_info->vi_netdev->ml_priv = NULL; + + micvnet_deinit_interrupts(vnet_info); + micvnet_deinit_msg_rings(vnet_info); + micvnet_deinit_hw_regs(vnet_info); + + mic_ctx->bi_vethinfo = NULL; + + kfree(vnet_info); +} + +int +micvnet_execute_start(struct micvnet_info *vnet_info) +{ + int ret = 0; + + if (!vnet_info) { + printk(KERN_ERR "%s: vnet_info is NULL\n", __func__); + return 1; + } + + if (atomic_cmpxchg(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED, + MICVNET_STATE_TRANSITIONING) != MICVNET_STATE_UNINITIALIZED) { + printk(KERN_ERR "%s: wrong vnet state %d\n", __func__, + atomic_read(&vnet_info->vi_state)); + return 1; + } + + if ((ret = micvnet_init_lists(vnet_info))) + goto err_exit; + if ((ret = micvnet_init_dma(vnet_info))) + goto err_exit_deinit_lists; + if ((ret = micvnet_init_rx_skbs(vnet_info))) { + printk(KERN_ERR "%s: micvnet_init_rx_skbs failed\n", __func__); + goto err_exit_deinit_dma; + } + 
+ memset(&vnet_info->vi_netdev->stats, 0, sizeof(vnet_info->vi_netdev->stats)); + atomic_inc(&micvnet.lv_active_clients); + atomic_set(&vnet_info->vi_state, MICVNET_STATE_LINKUP); + + micvnet_msg_send_link_up_msg(vnet_info); +#ifdef HOST + micvnet_send_add_dma_buffer_messages(vnet_info); +#else + writel(MICVNET_CARD_UP_MAGIC, vnet_info->vi_scratch14); + /* Card adds DMA buffers to host after receiving MICVNET_MSG_LINK_UP */ +#endif + return 0; + +err_exit_deinit_dma: + micvnet_deinit_dma(vnet_info); +err_exit_deinit_lists: + /* RX SKB's are deallocated in micvnet_deinit_lists() */ + micvnet_deinit_lists(vnet_info); +err_exit: + atomic_set(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED); + return ret; +} + +static void +micvnet_start_ws(struct work_struct *work) +{ + struct micvnet_info *vnet_info + = container_of(work, struct micvnet_info, vi_ws_start); + + micvnet_execute_start(vnet_info); +} + +int micvnet_start(mic_ctx_t *mic_ctx) +{ +#ifndef HOST + struct micvnet_info *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo; + micvnet_execute_start(vnet_info); +#endif + return 0; +} + +void +micvnet_execute_stop(struct micvnet_info *vnet_info) +{ + int ret; + if (!vnet_info) + return; + + switch(atomic_read(&vnet_info->vi_state)) { + case MICVNET_STATE_LINKUP: + case MICVNET_STATE_BEGIN_UNINIT: + break; + default: + return; + } + +#ifdef HOST + if ((micpm_get_reference(vnet_to_ctx(vnet_info), true)) != 0) + goto exit; +#endif + micvnet_initiate_link_down(vnet_info); + if (vnet_info->link_down_initiator && !(vnet_info->mic_ctx->state == MIC_SHUTDOWN && vnet_info->mic_ctx->sdbic1)){ + ret = wait_event_interruptible_timeout( + vnet_info->stop_waitq, + (atomic_read(&vnet_info->vi_state) == MICVNET_STATE_BEGIN_UNINIT), + STOP_WAIT_TIMEOUT); + if (!ret) + printk(KERN_ERR "%s: timeout waiting for link down message response\n", __func__); + } + +#ifdef HOST + micpm_put_reference(vnet_to_ctx(vnet_info)); +exit: +#endif + micvnet_stop_deinit(vnet_info); +} + +void +micvnet_stop(mic_ctx_t *mic_ctx) +{ + struct micvnet_info *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo; + + vnet_info->link_down_initiator = true; + micvnet_execute_stop(vnet_info); +} + +static void +micvnet_stop_ws(struct work_struct *work) +{ + struct micvnet_info *vnet_info + = container_of(work, struct micvnet_info, vi_ws_stop); + + vnet_info->link_down_initiator = false; + micvnet_execute_stop(vnet_info); +} + +#if !defined(WINDOWS) && defined(HOST) +static ssize_t +show_vnet(struct device *dev, struct device_attribute *attr, char *buf); +DEVICE_ATTR(vnet, S_IRUGO, show_vnet, NULL); + +static ssize_t +show_vnet(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "Number of active vnet clients: %d\n", + atomic_read(&micvnet.lv_active_clients)); +} +#endif + +int +micvnet_init(struct device *dev) +{ + int ret = 0; + + micvnet.created = 0; + atomic_set(&micvnet.lv_active_clients, 0); + + if ((ret = rtnl_link_register(&micvnet_link_ops))) { + printk(KERN_ERR "%s: rtnl_link_register failed\n", __func__); + return ret; + } + +#ifdef HOST + if ((ret = device_create_file(dev, &dev_attr_vnet))) { + printk(KERN_ERR "%s: device_create_file failed\n", __func__); + rtnl_link_unregister(&micvnet_link_ops); + return ret; + } +#endif + micvnet.created = 1; + return 0; +} + +void +micvnet_exit(void) +{ + rtnl_link_unregister(&micvnet_link_ops); +} + +#ifndef HOST +static void __exit +_micvnet_module_exit(void) +{ + mic_ctx_t *mic_ctx = &mic_ctx_g; + + micvnet_stop(mic_ctx); + 
micvnet_remove(mic_ctx); + micvnet_exit(); +} + +static int +micvnet_reboot(struct notifier_block *notifier, unsigned long unused1, void *unused2) +{ + /* Calling _micvnet_module_exit() here will hang the uOS during shutdown in NFS + * root case */ + return NOTIFY_OK; +} + +static struct notifier_block micvnet_reboot_notifier = { + .notifier_call = micvnet_reboot, + .priority = 0, +}; + +void __exit +micvnet_module_exit(void) +{ + unregister_reboot_notifier(&micvnet_reboot_notifier); + _micvnet_module_exit(); +} + +int __init +micvnet_module_init(void) +{ + mic_ctx_t *mic_ctx = &mic_ctx_g; + int ret = 0; + + if ((ret = register_reboot_notifier(&micvnet_reboot_notifier))) { + printk(KERN_ERR "register_reboot_notifier failed: error %d\n", ret); + goto err_exit; + } + + memset(mic_ctx, 0, sizeof(*mic_ctx)); + mic_ctx->bi_id = 0; + + if ((ret = micvnet_init(NULL))) + goto err_exit_unregister_reboot_notifier; + if ((ret = micvnet_probe(mic_ctx))) + goto err_exit_micvnet_exit; + if ((ret = micvnet_start(mic_ctx))) + goto err_exit_micvnet_remove; + + return 0; + +err_exit_micvnet_remove: + micvnet_remove(mic_ctx); +err_exit_micvnet_exit: + micvnet_exit(); +err_exit_unregister_reboot_notifier: + unregister_reboot_notifier(&micvnet_reboot_notifier); +err_exit: + printk(KERN_ERR "%s failed: error %d\n", __func__, ret); + return ret; +} + +#ifdef STANDALONE_VNET_DMA +module_init(micvnet_module_init); +module_exit(micvnet_module_exit); +#endif + +MODULE_LICENSE("GPL"); +#endif diff --git a/vnet/micveth_param.c b/vnet/micveth_param.c new file mode 100644 index 0000000..449deed --- /dev/null +++ b/vnet/micveth_param.c @@ -0,0 +1,95 @@ +/* + * Copyright 2010-2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Disclaimer: The codes contained in these modules may be specific to + * the Intel Software Development Platform codenamed Knights Ferry, + * and the Intel product codenamed Knights Corner, and are not backward + * compatible with other Intel products. Additionally, Intel will NOT + * support the codes or instruction set in future products. + * + * Intel offers no warranty of any kind regarding the code. This code is + * licensed on an "AS IS" basis and Intel is not obligated to provide + * any support, assistance, installation, training, or other services + * of any kind. Intel is also not obligated to provide any updates, + * enhancements or extensions. Intel specifically disclaims any warranty + * of merchantability, non-infringement, fitness for any particular + * purpose, and any other warranty. + * + * Further, Intel disclaims all liability of any kind, including but + * not limited to liability for infringement of any proprietary rights, + * relating to the use of the code, even if Intel is notified of the + * possibility of such liability. Except as expressly stated in an Intel + * license agreement provided with this code and agreed upon with Intel, + * no license, express or implied, by estoppel or otherwise, to any + * intellectual property rights is granted herein. 
+ */ + +#include +#include +#include +#include + +#include "mic/micveth.h" + +#define __VNET_MODE(u, l) #l , +char *mic_vnet_modes[] = { VNET_MODES }; +#undef __VNET_MODE + +/* + *KAA: not sure when this API changed, could have been in 35. + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) +#define GRRR const +#else +#define GRRR /* As nothing */ +#endif + +static int param_set_vnetmode(const char *val, GRRR struct kernel_param *kp) +{ + int i; + for (i = 0; i < sizeof(mic_vnet_modes) / sizeof(char *); i++) + if (!strcmp(val, mic_vnet_modes[i])) { + mic_vnet_mode = i; + return 0; + } + return -EINVAL; +} + +static int param_get_vnetmode(char *buffer, GRRR struct kernel_param *kp) +{ + return sprintf(buffer, "%s", mic_vnet_modes[mic_vnet_mode]); +} + +#define param_check_vnetmode(name, p) __param_check(name, p, int) + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) +struct kernel_param_ops param_ops_vnetmode = { + .set = param_set_vnetmode, + .get = param_get_vnetmode, +}; +#endif /* Kernel > 2.6.36 */ + +int mic_vnet_mode = VNET_MODE_DMA; +module_param_named(vnet, mic_vnet_mode, vnetmode, 0400); +#define __VNET_MODE(u, l) " " #l +MODULE_PARM_DESC(vnet, "Vnet operating mode, one of:" VNET_MODES); +#undef __VNET_MODE + +int vnet_num_buffers = VNET_MAX_SKBS; +module_param(vnet_num_buffers, int, 0400); +MODULE_PARM_DESC(vnet_num_buffers, "Number of buffers used by the VNET driver"); + +ulong vnet_addr = 0; +module_param(vnet_addr, ulong, 0400); +MODULE_PARM_DESC(vnet_addr, "Vnet driver host ring address"); + + -- 2.20.1
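
[Editor's note, appended after the patch; not part of the original commit.] The tail of the diff (vnet/micveth_param.c) shows a custom-typed module parameter: the string given as vnet= is matched against the mic_vnet_modes table, stored as an integer index, and read back as text through a paired getter registered via struct kernel_param_ops. The sketch below illustrates the same technique against the post-2.6.36 kernel API only; every name in it (demo_mode, demo_modes, the "poll/intr/dma" mode list, the module itself) is hypothetical and not taken from the driver, and it assumes only module_param_cb(), struct kernel_param_ops, sysfs_streq() and ARRAY_SIZE() as provided by the kernel.

    /* Minimal sketch: a string-valued module parameter validated against a
     * fixed list and stored as an int index.  Hypothetical names throughout. */
    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/moduleparam.h>
    #include <linux/string.h>

    static const char *demo_modes[] = { "poll", "intr", "dma" };
    static int demo_mode = 2;                       /* default: "dma" */

    static int demo_mode_set(const char *val, const struct kernel_param *kp)
    {
            int i;

            /* Accept only a known mode name; sysfs_streq ignores a trailing '\n'. */
            for (i = 0; i < ARRAY_SIZE(demo_modes); i++) {
                    if (sysfs_streq(val, demo_modes[i])) {
                            *(int *)kp->arg = i;
                            return 0;
                    }
            }
            return -EINVAL;
    }

    static int demo_mode_get(char *buffer, const struct kernel_param *kp)
    {
            /* Report the symbolic name rather than the stored integer. */
            return sprintf(buffer, "%s", demo_modes[*(int *)kp->arg]);
    }

    static const struct kernel_param_ops demo_mode_ops = {
            .set = demo_mode_set,
            .get = demo_mode_get,
    };
    module_param_cb(mode, &demo_mode_ops, &demo_mode, 0400);
    MODULE_PARM_DESC(mode, "Operating mode, one of: poll intr dma");

    MODULE_LICENSE("GPL");

If such a module were built as demo.ko (a hypothetical name), it could be loaded with "insmod demo.ko mode=intr", and root could read the symbolic value back from /sys/module/demo/parameters/mode. The driver in the patch achieves the same effect for its vnet= parameter, with the extra GRRR/param_check_vnetmode plumbing needed to keep the pre-2.6.36 prototype variants compiling.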